Source file src/pkg/compress/lzw/reader.go
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package lzw implements the Lempel-Ziv-Welch compressed data format,
6 // described in T. A. Welch, ``A Technique for High-Performance Data
7 // Compression'', Computer, 17(6) (June 1984), pp 8-19.
8 //
9 // In particular, it implements LZW as used by the GIF, TIFF and PDF file
10 // formats, which means variable-width codes up to 12 bits and the first
11 // two non-literal codes are a clear code and an EOF code.
12 package lzw
13
14 // TODO(nigeltao): check that TIFF and PDF use LZW in the same way as GIF,
15 // modulo LSB/MSB packing order.
16
17 import (
18 "bufio"
19 "errors"
20 "fmt"
21 "io"
22 )
23
24 // Order specifies the bit ordering in an LZW data stream.
25 type Order int
26
27 const (
28 // LSB means Least Significant Bits first, as used in the GIF file format.
29 LSB Order = iota
30 // MSB means Most Significant Bits first, as used in the TIFF and PDF
31 // file formats.
32 MSB
33 )
34
35 const (
36 maxWidth = 12
37 decoderInvalidCode = 0xffff
38 flushBuffer = 1 << maxWidth
39 )
40
41 // decoder is the state from which the readXxx method converts a byte
42 // stream into a code stream.
43 type decoder struct {
44 r io.ByteReader
45 bits uint32
46 nBits uint
47 width uint
48 read func(*decoder) (uint16, error) // readLSB or readMSB
49 litWidth int // width in bits of literal codes
50 err error
51
52 // The first 1<<litWidth codes are literal codes.
53 // The next two codes mean clear and EOF.
54 // Other valid codes are in the range [lo, hi] where lo := clear + 2,
55 // with the upper bound incrementing on each code seen.
56 // overflow is the code at which hi overflows the code width.
57 // last is the most recently seen code, or decoderInvalidCode.
58 clear, eof, hi, overflow, last uint16
59
60 // Each code c in [lo, hi] expands to two or more bytes. For c != hi:
61 // suffix[c] is the last of these bytes.
62 // prefix[c] is the code for all but the last byte.
63 // This code can either be a literal code or another code in [lo, c).
64 // The c == hi case is a special case.
65 suffix [1 << maxWidth]uint8
66 prefix [1 << maxWidth]uint16
67
68 // output is the temporary output buffer.
69 // Literal codes are accumulated from the start of the buffer.
70 // Non-literal codes decode to a sequence of suffixes that are first
71 // written right-to-left from the end of the buffer before being copied
72 // to the start of the buffer.
73 // It is flushed when it contains >= 1<<maxWidth bytes,
74 // so that there is always room to decode an entire code.
75 output [2 * 1 << maxWidth]byte
76 o int // write index into output
77 toRead []byte // bytes to return from Read
78 }
79
80 // readLSB returns the next code for "Least Significant Bits first" data.
81 func (d *decoder) readLSB() (uint16, error) {
82 for d.nBits < d.width {
83 x, err := d.r.ReadByte()
84 if err != nil {
85 return 0, err
86 }
87 d.bits |= uint32(x) << d.nBits
88 d.nBits += 8
89 }
90 code := uint16(d.bits & (1<<d.width - 1))
91 d.bits >>= d.width
92 d.nBits -= d.width
93 return code, nil
94 }
95
96 // readMSB returns the next code for "Most Significant Bits first" data.
97 func (d *decoder) readMSB() (uint16, error) {
98 for d.nBits < d.width {
99 x, err := d.r.ReadByte()
100 if err != nil {
101 return 0, err
102 }
103 d.bits |= uint32(x) << (24 - d.nBits)
104 d.nBits += 8
105 }
106 code := uint16(d.bits >> (32 - d.width))
107 d.bits <<= d.width
108 d.nBits -= d.width
109 return code, nil
110 }
111
112 func (d *decoder) Read(b []byte) (int, error) {
113 for {
114 if len(d.toRead) > 0 {
115 n := copy(b, d.toRead)
116 d.toRead = d.toRead[n:]
117 return n, nil
118 }
119 if d.err != nil {
120 return 0, d.err
121 }
122 d.decode()
123 }
124 panic("unreachable")
125 }
126
127 // decode decompresses bytes from r and leaves them in d.toRead.
128 // read specifies how to decode bytes into codes.
129 // litWidth is the width in bits of literal codes.
130 func (d *decoder) decode() {
131 // Loop over the code stream, converting codes into decompressed bytes.
132 for {
133 code, err := d.read(d)
134 if err != nil {
135 if err == io.EOF {
136 err = io.ErrUnexpectedEOF
137 }
138 d.err = err
139 return
140 }
141 switch {
142 case code < d.clear:
143 // We have a literal code.
144 d.output[d.o] = uint8(code)
145 d.o++
146 if d.last != decoderInvalidCode {
147 // Save what the hi code expands to.
148 d.suffix[d.hi] = uint8(code)
149 d.prefix[d.hi] = d.last
150 }
151 case code == d.clear:
152 d.width = 1 + uint(d.litWidth)
153 d.hi = d.eof
154 d.overflow = 1 << d.width
155 d.last = decoderInvalidCode
156 continue
157 case code == d.eof:
158 d.flush()
159 d.err = io.EOF
160 return
161 case code <= d.hi:
162 c, i := code, len(d.output)-1
163 if code == d.hi {
164 // code == hi is a special case which expands to the last expansion
165 // followed by the head of the last expansion. To find the head, we walk
166 // the prefix chain until we find a literal code.
167 c = d.last
168 for c >= d.clear {
169 c = d.prefix[c]
170 }
171 d.output[i] = uint8(c)
172 i--
173 c = d.last
174 }
175 // Copy the suffix chain into output and then write that to w.
176 for c >= d.clear {
177 d.output[i] = d.suffix[c]
178 i--
179 c = d.prefix[c]
180 }
181 d.output[i] = uint8(c)
182 d.o += copy(d.output[d.o:], d.output[i:])
183 if d.last != decoderInvalidCode {
184 // Save what the hi code expands to.
185 d.suffix[d.hi] = uint8(c)
186 d.prefix[d.hi] = d.last
187 }
188 default:
189 d.err = errors.New("lzw: invalid code")
190 return
191 }
192 d.last, d.hi = code, d.hi+1
193 if d.hi >= d.overflow {
194 if d.width == maxWidth {
195 d.last = decoderInvalidCode
196 } else {
197 d.width++
198 d.overflow <<= 1
199 }
200 }
201 if d.o >= flushBuffer {
202 d.flush()
203 return
204 }
205 }
206 panic("unreachable")
207 }
208
209 func (d *decoder) flush() {
210 d.toRead = d.output[:d.o]
211 d.o = 0
212 }
213
214 var errClosed = errors.New("compress/lzw: reader/writer is closed")
215
216 func (d *decoder) Close() error {
217 d.err = errClosed // in case any Reads come along
218 return nil
219 }
220
221 // NewReader creates a new io.ReadCloser that satisfies reads by decompressing
222 // the data read from r.
223 // It is the caller's responsibility to call Close on the ReadCloser when
224 // finished reading.
225 // The number of bits to use for literal codes, litWidth, must be in the
226 // range [2,8] and is typically 8.
227 func NewReader(r io.Reader, order Order, litWidth int) io.ReadCloser {
228 d := new(decoder)
229 switch order {
230 case LSB:
231 d.read = (*decoder).readLSB
232 case MSB:
233 d.read = (*decoder).readMSB
234 default:
235 d.err = errors.New("lzw: unknown order")
236 return d
237 }
238 if litWidth < 2 || 8 < litWidth {
239 d.err = fmt.Errorf("lzw: litWidth %d out of range", litWidth)
240 return d
241 }
242 if br, ok := r.(io.ByteReader); ok {
243 d.r = br
244 } else {
245 d.r = bufio.NewReader(r)
246 }
247 d.litWidth = litWidth
248 d.width = 1 + uint(litWidth)
249 d.clear = uint16(1) << uint(litWidth)
250 d.eof, d.hi = d.clear+1, d.clear+1
251 d.overflow = uint16(1) << d.width
252 d.last = decoderInvalidCode
253
254 return d
255 }