Source file src/pkg/compress/flate/inflate.go
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package flate implements the DEFLATE compressed data format, described in
6 // RFC 1951. The gzip and zlib packages implement access to DEFLATE-based file
7 // formats.
8 package flate
9
10 import (
11 "bufio"
12 "io"
13 "strconv"
14 )
15
16 const (
17 maxCodeLen = 16 // max length of Huffman code
18 maxHist = 32768 // max history required
19 maxLit = 286
20 maxDist = 32
21 numCodes = 19 // number of codes in Huffman meta-code
22 )
23
24 // A CorruptInputError reports the presence of corrupt input at a given offset.
25 type CorruptInputError int64
26
27 func (e CorruptInputError) Error() string {
28 return "flate: corrupt input before offset " + strconv.FormatInt(int64(e), 10)
29 }
30
31 // An InternalError reports an error in the flate code itself.
32 type InternalError string
33
34 func (e InternalError) Error() string { return "flate: internal error: " + string(e) }
35
36 // A ReadError reports an error encountered while reading input.
37 type ReadError struct {
38 Offset int64 // byte offset where error occurred
39 Err error // error returned by underlying Read
40 }
41
42 func (e *ReadError) Error() string {
43 return "flate: read error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error()
44 }
45
46 // A WriteError reports an error encountered while writing output.
47 type WriteError struct {
48 Offset int64 // byte offset where error occurred
49 Err error // error returned by underlying Write
50 }
51
52 func (e *WriteError) Error() string {
53 return "flate: write error at offset " + strconv.FormatInt(e.Offset, 10) + ": " + e.Err.Error()
54 }
55
56 // Huffman decoder is based on
57 // J. Brian Connell, ``A Huffman-Shannon-Fano Code,''
58 // Proceedings of the IEEE, 61(7) (July 1973), pp 1046-1047.
59 type huffmanDecoder struct {
60 // min, max code length
61 min, max int
62
63 // limit[i] = largest code word of length i
64 // Given code v of length n,
65 // need more bits if v > limit[n].
66 limit [maxCodeLen + 1]int
67
68 // base[i] = smallest code word of length i - seq number
69 base [maxCodeLen + 1]int
70
71 // codes[seq number] = output code.
72 // Given code v of length n, value is
73 // codes[v - base[n]].
74 codes []int
75 }
76
77 // Initialize Huffman decoding tables from array of code lengths.
78 func (h *huffmanDecoder) init(bits []int) bool {
79 // Count number of codes of each length,
80 // compute min and max length.
81 var count [maxCodeLen + 1]int
82 var min, max int
83 for _, n := range bits {
84 if n == 0 {
85 continue
86 }
87 if min == 0 || n < min {
88 min = n
89 }
90 if n > max {
91 max = n
92 }
93 count[n]++
94 }
95 if max == 0 {
96 return false
97 }
98
99 h.min = min
100 h.max = max
101
102 // For each code range, compute
103 // nextcode (first code of that length),
104 // limit (last code of that length), and
105 // base (offset from first code to sequence number).
106 code := 0
107 seq := 0
108 var nextcode [maxCodeLen]int
109 for i := min; i <= max; i++ {
110 n := count[i]
111 nextcode[i] = code
112 h.base[i] = code - seq
113 code += n
114 seq += n
115 h.limit[i] = code - 1
116 code <<= 1
117 }
118
119 // Make array mapping sequence numbers to codes.
120 if len(h.codes) < len(bits) {
121 h.codes = make([]int, len(bits))
122 }
123 for i, n := range bits {
124 if n == 0 {
125 continue
126 }
127 code := nextcode[n]
128 nextcode[n]++
129 seq := code - h.base[n]
130 h.codes[seq] = i
131 }
132 return true
133 }
134
135 // Hard-coded Huffman tables for DEFLATE algorithm.
136 // See RFC 1951, section 3.2.6.
137 var fixedHuffmanDecoder = huffmanDecoder{
138 7, 9,
139 [maxCodeLen + 1]int{7: 23, 199, 511},
140 [maxCodeLen + 1]int{7: 0, 24, 224},
141 []int{
142 // length 7: 256-279
143 256, 257, 258, 259, 260, 261, 262,
144 263, 264, 265, 266, 267, 268, 269,
145 270, 271, 272, 273, 274, 275, 276,
146 277, 278, 279,
147
148 // length 8: 0-143
149 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
150 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
151 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
152 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
153 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
154 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
155 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
156 72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
157 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
158 92, 93, 94, 95, 96, 97, 98, 99, 100,
159 101, 102, 103, 104, 105, 106, 107, 108,
160 109, 110, 111, 112, 113, 114, 115, 116,
161 117, 118, 119, 120, 121, 122, 123, 124,
162 125, 126, 127, 128, 129, 130, 131, 132,
163 133, 134, 135, 136, 137, 138, 139, 140,
164 141, 142, 143,
165
166 // length 8: 280-287
167 280, 281, 282, 283, 284, 285, 286, 287,
168
169 // length 9: 144-255
170 144, 145, 146, 147, 148, 149, 150, 151,
171 152, 153, 154, 155, 156, 157, 158, 159,
172 160, 161, 162, 163, 164, 165, 166, 167,
173 168, 169, 170, 171, 172, 173, 174, 175,
174 176, 177, 178, 179, 180, 181, 182, 183,
175 184, 185, 186, 187, 188, 189, 190, 191,
176 192, 193, 194, 195, 196, 197, 198, 199,
177 200, 201, 202, 203, 204, 205, 206, 207,
178 208, 209, 210, 211, 212, 213, 214, 215,
179 216, 217, 218, 219, 220, 221, 222, 223,
180 224, 225, 226, 227, 228, 229, 230, 231,
181 232, 233, 234, 235, 236, 237, 238, 239,
182 240, 241, 242, 243, 244, 245, 246, 247,
183 248, 249, 250, 251, 252, 253, 254, 255,
184 },
185 }
186
187 // The actual read interface needed by NewReader.
188 // If the passed in io.Reader does not also have ReadByte,
189 // the NewReader will introduce its own buffering.
190 type Reader interface {
191 io.Reader
192 ReadByte() (c byte, err error)
193 }
194
195 // Decompress state.
196 type decompressor struct {
197 // Input source.
198 r Reader
199 roffset int64
200 woffset int64
201
202 // Input bits, in top of b.
203 b uint32
204 nb uint
205
206 // Huffman decoders for literal/length, distance.
207 h1, h2 huffmanDecoder
208
209 // Length arrays used to define Huffman codes.
210 bits [maxLit + maxDist]int
211 codebits [numCodes]int
212
213 // Output history, buffer.
214 hist [maxHist]byte
215 hp int // current output position in buffer
216 hw int // have written hist[0:hw] already
217 hfull bool // buffer has filled at least once
218
219 // Temporary buffer (avoids repeated allocation).
220 buf [4]byte
221
222 // Next step in the decompression,
223 // and decompression state.
224 step func(*decompressor)
225 final bool
226 err error
227 toRead []byte
228 hl, hd *huffmanDecoder
229 copyLen int
230 copyDist int
231 }
232
233 func (f *decompressor) nextBlock() {
234 if f.final {
235 if f.hw != f.hp {
236 f.flush((*decompressor).nextBlock)
237 return
238 }
239 f.err = io.EOF
240 return
241 }
242 for f.nb < 1+2 {
243 if f.err = f.moreBits(); f.err != nil {
244 return
245 }
246 }
247 f.final = f.b&1 == 1
248 f.b >>= 1
249 typ := f.b & 3
250 f.b >>= 2
251 f.nb -= 1 + 2
252 switch typ {
253 case 0:
254 f.dataBlock()
255 case 1:
256 // compressed, fixed Huffman tables
257 f.hl = &fixedHuffmanDecoder
258 f.hd = nil
259 f.huffmanBlock()
260 case 2:
261 // compressed, dynamic Huffman tables
262 if f.err = f.readHuffman(); f.err != nil {
263 break
264 }
265 f.hl = &f.h1
266 f.hd = &f.h2
267 f.huffmanBlock()
268 default:
269 // 3 is reserved.
270 f.err = CorruptInputError(f.roffset)
271 }
272 }
273
274 func (f *decompressor) Read(b []byte) (int, error) {
275 for {
276 if len(f.toRead) > 0 {
277 n := copy(b, f.toRead)
278 f.toRead = f.toRead[n:]
279 return n, nil
280 }
281 if f.err != nil {
282 return 0, f.err
283 }
284 f.step(f)
285 }
286 panic("unreachable")
287 }
288
289 func (f *decompressor) Close() error {
290 if f.err == io.EOF {
291 return nil
292 }
293 return f.err
294 }
295
296 // RFC 1951 section 3.2.7.
297 // Compression with dynamic Huffman codes
298
299 var codeOrder = [...]int{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
300
301 func (f *decompressor) readHuffman() error {
302 // HLIT[5], HDIST[5], HCLEN[4].
303 for f.nb < 5+5+4 {
304 if err := f.moreBits(); err != nil {
305 return err
306 }
307 }
308 nlit := int(f.b&0x1F) + 257
309 f.b >>= 5
310 ndist := int(f.b&0x1F) + 1
311 f.b >>= 5
312 nclen := int(f.b&0xF) + 4
313 f.b >>= 4
314 f.nb -= 5 + 5 + 4
315
316 // (HCLEN+4)*3 bits: code lengths in the magic codeOrder order.
317 for i := 0; i < nclen; i++ {
318 for f.nb < 3 {
319 if err := f.moreBits(); err != nil {
320 return err
321 }
322 }
323 f.codebits[codeOrder[i]] = int(f.b & 0x7)
324 f.b >>= 3
325 f.nb -= 3
326 }
327 for i := nclen; i < len(codeOrder); i++ {
328 f.codebits[codeOrder[i]] = 0
329 }
330 if !f.h1.init(f.codebits[0:]) {
331 return CorruptInputError(f.roffset)
332 }
333
334 // HLIT + 257 code lengths, HDIST + 1 code lengths,
335 // using the code length Huffman code.
336 for i, n := 0, nlit+ndist; i < n; {
337 x, err := f.huffSym(&f.h1)
338 if err != nil {
339 return err
340 }
341 if x < 16 {
342 // Actual length.
343 f.bits[i] = x
344 i++
345 continue
346 }
347 // Repeat previous length or zero.
348 var rep int
349 var nb uint
350 var b int
351 switch x {
352 default:
353 return InternalError("unexpected length code")
354 case 16:
355 rep = 3
356 nb = 2
357 if i == 0 {
358 return CorruptInputError(f.roffset)
359 }
360 b = f.bits[i-1]
361 case 17:
362 rep = 3
363 nb = 3
364 b = 0
365 case 18:
366 rep = 11
367 nb = 7
368 b = 0
369 }
370 for f.nb < nb {
371 if err := f.moreBits(); err != nil {
372 return err
373 }
374 }
375 rep += int(f.b & uint32(1<<nb-1))
376 f.b >>= nb
377 f.nb -= nb
378 if i+rep > n {
379 return CorruptInputError(f.roffset)
380 }
381 for j := 0; j < rep; j++ {
382 f.bits[i] = b
383 i++
384 }
385 }
386
387 if !f.h1.init(f.bits[0:nlit]) || !f.h2.init(f.bits[nlit:nlit+ndist]) {
388 return CorruptInputError(f.roffset)
389 }
390
391 return nil
392 }
393
394 // Decode a single Huffman block from f.
395 // hl and hd are the Huffman states for the lit/length values
396 // and the distance values, respectively. If hd == nil, using the
397 // fixed distance encoding associated with fixed Huffman blocks.
398 func (f *decompressor) huffmanBlock() {
399 for {
400 v, err := f.huffSym(f.hl)
401 if err != nil {
402 f.err = err
403 return
404 }
405 var n uint // number of bits extra
406 var length int
407 switch {
408 case v < 256:
409 f.hist[f.hp] = byte(v)
410 f.hp++
411 if f.hp == len(f.hist) {
412 // After the flush, continue this loop.
413 f.flush((*decompressor).huffmanBlock)
414 return
415 }
416 continue
417 case v == 256:
418 // Done with huffman block; read next block.
419 f.step = (*decompressor).nextBlock
420 return
421 // otherwise, reference to older data
422 case v < 265:
423 length = v - (257 - 3)
424 n = 0
425 case v < 269:
426 length = v*2 - (265*2 - 11)
427 n = 1
428 case v < 273:
429 length = v*4 - (269*4 - 19)
430 n = 2
431 case v < 277:
432 length = v*8 - (273*8 - 35)
433 n = 3
434 case v < 281:
435 length = v*16 - (277*16 - 67)
436 n = 4
437 case v < 285:
438 length = v*32 - (281*32 - 131)
439 n = 5
440 default:
441 length = 258
442 n = 0
443 }
444 if n > 0 {
445 for f.nb < n {
446 if err = f.moreBits(); err != nil {
447 f.err = err
448 return
449 }
450 }
451 length += int(f.b & uint32(1<<n-1))
452 f.b >>= n
453 f.nb -= n
454 }
455
456 var dist int
457 if f.hd == nil {
458 for f.nb < 5 {
459 if err = f.moreBits(); err != nil {
460 f.err = err
461 return
462 }
463 }
464 dist = int(reverseByte[(f.b&0x1F)<<3])
465 f.b >>= 5
466 f.nb -= 5
467 } else {
468 if dist, err = f.huffSym(f.hd); err != nil {
469 f.err = err
470 return
471 }
472 }
473
474 switch {
475 case dist < 4:
476 dist++
477 case dist >= 30:
478 f.err = CorruptInputError(f.roffset)
479 return
480 default:
481 nb := uint(dist-2) >> 1
482 // have 1 bit in bottom of dist, need nb more.
483 extra := (dist & 1) << nb
484 for f.nb < nb {
485 if err = f.moreBits(); err != nil {
486 f.err = err
487 return
488 }
489 }
490 extra |= int(f.b & uint32(1<<nb-1))
491 f.b >>= nb
492 f.nb -= nb
493 dist = 1<<(nb+1) + 1 + extra
494 }
495
496 // Copy history[-dist:-dist+length] into output.
497 if dist > len(f.hist) {
498 f.err = InternalError("bad history distance")
499 return
500 }
501
502 // No check on length; encoding can be prescient.
503 if !f.hfull && dist > f.hp {
504 f.err = CorruptInputError(f.roffset)
505 return
506 }
507
508 p := f.hp - dist
509 if p < 0 {
510 p += len(f.hist)
511 }
512 for i := 0; i < length; i++ {
513 f.hist[f.hp] = f.hist[p]
514 f.hp++
515 p++
516 if f.hp == len(f.hist) {
517 // After flush continue copying out of history.
518 f.copyLen = length - (i + 1)
519 f.copyDist = dist
520 f.flush((*decompressor).copyHuff)
521 return
522 }
523 if p == len(f.hist) {
524 p = 0
525 }
526 }
527 }
528 panic("unreached")
529 }
530
531 func (f *decompressor) copyHuff() {
532 length := f.copyLen
533 dist := f.copyDist
534 p := f.hp - dist
535 if p < 0 {
536 p += len(f.hist)
537 }
538 for i := 0; i < length; i++ {
539 f.hist[f.hp] = f.hist[p]
540 f.hp++
541 p++
542 if f.hp == len(f.hist) {
543 f.copyLen = length - (i + 1)
544 f.flush((*decompressor).copyHuff)
545 return
546 }
547 if p == len(f.hist) {
548 p = 0
549 }
550 }
551
552 // Continue processing Huffman block.
553 f.huffmanBlock()
554 }
555
556 // Copy a single uncompressed data block from input to output.
557 func (f *decompressor) dataBlock() {
558 // Uncompressed.
559 // Discard current half-byte.
560 f.nb = 0
561 f.b = 0
562
563 // Length then ones-complement of length.
564 nr, err := io.ReadFull(f.r, f.buf[0:4])
565 f.roffset += int64(nr)
566 if err != nil {
567 f.err = &ReadError{f.roffset, err}
568 return
569 }
570 n := int(f.buf[0]) | int(f.buf[1])<<8
571 nn := int(f.buf[2]) | int(f.buf[3])<<8
572 if uint16(nn) != uint16(^n) {
573 f.err = CorruptInputError(f.roffset)
574 return
575 }
576
577 if n == 0 {
578 // 0-length block means sync
579 f.flush((*decompressor).nextBlock)
580 return
581 }
582
583 f.copyLen = n
584 f.copyData()
585 }
586
587 func (f *decompressor) copyData() {
588 // Read f.dataLen bytes into history,
589 // pausing for reads as history fills.
590 n := f.copyLen
591 for n > 0 {
592 m := len(f.hist) - f.hp
593 if m > n {
594 m = n
595 }
596 m, err := io.ReadFull(f.r, f.hist[f.hp:f.hp+m])
597 f.roffset += int64(m)
598 if err != nil {
599 f.err = &ReadError{f.roffset, err}
600 return
601 }
602 n -= m
603 f.hp += m
604 if f.hp == len(f.hist) {
605 f.copyLen = n
606 f.flush((*decompressor).copyData)
607 return
608 }
609 }
610 f.step = (*decompressor).nextBlock
611 }
612
613 func (f *decompressor) setDict(dict []byte) {
614 if len(dict) > len(f.hist) {
615 // Will only remember the tail.
616 dict = dict[len(dict)-len(f.hist):]
617 }
618
619 f.hp = copy(f.hist[:], dict)
620 if f.hp == len(f.hist) {
621 f.hp = 0
622 f.hfull = true
623 }
624 f.hw = f.hp
625 }
626
627 func (f *decompressor) moreBits() error {
628 c, err := f.r.ReadByte()
629 if err != nil {
630 if err == io.EOF {
631 err = io.ErrUnexpectedEOF
632 }
633 return err
634 }
635 f.roffset++
636 f.b |= uint32(c) << f.nb
637 f.nb += 8
638 return nil
639 }
640
641 // Read the next Huffman-encoded symbol from f according to h.
642 func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
643 for n := uint(h.min); n <= uint(h.max); n++ {
644 lim := h.limit[n]
645 if lim == -1 {
646 continue
647 }
648 for f.nb < n {
649 if err := f.moreBits(); err != nil {
650 return 0, err
651 }
652 }
653 v := int(f.b & uint32(1<<n-1))
654 v <<= 16 - n
655 v = int(reverseByte[v>>8]) | int(reverseByte[v&0xFF])<<8 // reverse bits
656 if v <= lim {
657 f.b >>= n
658 f.nb -= n
659 return h.codes[v-h.base[n]], nil
660 }
661 }
662 return 0, CorruptInputError(f.roffset)
663 }
664
665 // Flush any buffered output to the underlying writer.
666 func (f *decompressor) flush(step func(*decompressor)) {
667 f.toRead = f.hist[f.hw:f.hp]
668 f.woffset += int64(f.hp - f.hw)
669 f.hw = f.hp
670 if f.hp == len(f.hist) {
671 f.hp = 0
672 f.hw = 0
673 f.hfull = true
674 }
675 f.step = step
676 }
677
678 func makeReader(r io.Reader) Reader {
679 if rr, ok := r.(Reader); ok {
680 return rr
681 }
682 return bufio.NewReader(r)
683 }
684
685 // NewReader returns a new ReadCloser that can be used
686 // to read the uncompressed version of r. It is the caller's
687 // responsibility to call Close on the ReadCloser when
688 // finished reading.
689 func NewReader(r io.Reader) io.ReadCloser {
690 var f decompressor
691 f.r = makeReader(r)
692 f.step = (*decompressor).nextBlock
693 return &f
694 }
695
696 // NewReaderDict is like NewReader but initializes the reader
697 // with a preset dictionary. The returned Reader behaves as if
698 // the uncompressed data stream started with the given dictionary,
699 // which has already been read. NewReaderDict is typically used
700 // to read data compressed by NewWriterDict.
701 func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser {
702 var f decompressor
703 f.setDict(dict)
704 f.r = makeReader(r)
705 f.step = (*decompressor).nextBlock
706 return &f
707 }