Source file src/pkg/compress/bzip2/bzip2.go
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package bzip2 implements bzip2 decompression.
6 package bzip2
7
8 import "io"
9
10 // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
11 // of guessing: http://en.wikipedia.org/wiki/Bzip2
12 // The source code to pyflate was useful for debugging:
13 // http://www.paul.sladen.org/projects/pyflate
14
15 // A StructuralError is returned when the bzip2 data is found to be
16 // syntactically invalid.
17 type StructuralError string
18
19 func (s StructuralError) Error() string {
20 return "bzip2 data invalid: " + string(s)
21 }
22
23 // A reader decompresses bzip2 compressed data.
24 type reader struct {
25 br bitReader
26 setupDone bool // true if we have parsed the bzip2 header.
27 blockSize int // blockSize in bytes, i.e. 900 * 1024.
28 eof bool
29 buf []byte // stores Burrows-Wheeler transformed data.
30 c [256]uint // the `C' array for the inverse BWT.
31 tt []uint32 // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
32 tPos uint32 // Index of the next output byte in tt.
33
34 preRLE []uint32 // contains the RLE data still to be processed.
35 preRLEUsed int // number of entries of preRLE used.
36 lastByte int // the last byte value seen.
37 byteRepeats uint // the number of repeats of lastByte seen.
38 repeats uint // the number of copies of lastByte to output.
39 }
40
41 // NewReader returns an io.Reader which decompresses bzip2 data from r.
42 func NewReader(r io.Reader) io.Reader {
43 bz2 := new(reader)
44 bz2.br = newBitReader(r)
45 return bz2
46 }
47
48 const bzip2FileMagic = 0x425a // "BZ"
49 const bzip2BlockMagic = 0x314159265359
50 const bzip2FinalMagic = 0x177245385090
51
52 // setup parses the bzip2 header.
53 func (bz2 *reader) setup() error {
54 br := &bz2.br
55
56 magic := br.ReadBits(16)
57 if magic != bzip2FileMagic {
58 return StructuralError("bad magic value")
59 }
60
61 t := br.ReadBits(8)
62 if t != 'h' {
63 return StructuralError("non-Huffman entropy encoding")
64 }
65
66 level := br.ReadBits(8)
67 if level < '1' || level > '9' {
68 return StructuralError("invalid compression level")
69 }
70
71 bz2.blockSize = 100 * 1024 * (int(level) - '0')
72 bz2.tt = make([]uint32, bz2.blockSize)
73 return nil
74 }
75
76 func (bz2 *reader) Read(buf []byte) (n int, err error) {
77 if bz2.eof {
78 return 0, io.EOF
79 }
80
81 if !bz2.setupDone {
82 err = bz2.setup()
83 brErr := bz2.br.Err()
84 if brErr != nil {
85 err = brErr
86 }
87 if err != nil {
88 return 0, err
89 }
90 bz2.setupDone = true
91 }
92
93 n, err = bz2.read(buf)
94 brErr := bz2.br.Err()
95 if brErr != nil {
96 err = brErr
97 }
98 return
99 }
100
101 func (bz2 *reader) read(buf []byte) (n int, err error) {
102 // bzip2 is a block based compressor, except that it has a run-length
103 // preprocessing step. The block based nature means that we can
104 // preallocate fixed-size buffers and reuse them. However, the RLE
105 // preprocessing would require allocating huge buffers to store the
106 // maximum expansion. Thus we process blocks all at once, except for
107 // the RLE which we decompress as required.
108
109 for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
110 // We have RLE data pending.
111
112 // The run-length encoding works like this:
113 // Any sequence of four equal bytes is followed by a length
114 // byte which contains the number of repeats of that byte to
115 // include. (The number of repeats can be zero.) Because we are
116 // decompressing on-demand our state is kept in the reader
117 // object.
118
119 if bz2.repeats > 0 {
120 buf[n] = byte(bz2.lastByte)
121 n++
122 bz2.repeats--
123 if bz2.repeats == 0 {
124 bz2.lastByte = -1
125 }
126 continue
127 }
128
129 bz2.tPos = bz2.preRLE[bz2.tPos]
130 b := byte(bz2.tPos)
131 bz2.tPos >>= 8
132 bz2.preRLEUsed++
133
134 if bz2.byteRepeats == 3 {
135 bz2.repeats = uint(b)
136 bz2.byteRepeats = 0
137 continue
138 }
139
140 if bz2.lastByte == int(b) {
141 bz2.byteRepeats++
142 } else {
143 bz2.byteRepeats = 0
144 }
145 bz2.lastByte = int(b)
146
147 buf[n] = b
148 n++
149 }
150
151 if n > 0 {
152 return
153 }
154
155 // No RLE data is pending so we need to read a block.
156
157 br := &bz2.br
158 magic := br.ReadBits64(48)
159 if magic == bzip2FinalMagic {
160 br.ReadBits64(32) // ignored CRC
161 bz2.eof = true
162 return 0, io.EOF
163 } else if magic != bzip2BlockMagic {
164 return 0, StructuralError("bad magic value found")
165 }
166
167 err = bz2.readBlock()
168 if err != nil {
169 return 0, err
170 }
171
172 return bz2.read(buf)
173 }
174
175 // readBlock reads a bzip2 block. The magic number should already have been consumed.
176 func (bz2 *reader) readBlock() (err error) {
177 br := &bz2.br
178 br.ReadBits64(32) // skip checksum. TODO: check it if we can figure out what it is.
179 randomized := br.ReadBits(1)
180 if randomized != 0 {
181 return StructuralError("deprecated randomized files")
182 }
183 origPtr := uint(br.ReadBits(24))
184
185 // If not every byte value is used in the block (i.e., it's text) then
186 // the symbol set is reduced. The symbols used are stored as a
187 // two-level, 16x16 bitmap.
188 symbolRangeUsedBitmap := br.ReadBits(16)
189 symbolPresent := make([]bool, 256)
190 numSymbols := 0
191 for symRange := uint(0); symRange < 16; symRange++ {
192 if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
193 bits := br.ReadBits(16)
194 for symbol := uint(0); symbol < 16; symbol++ {
195 if bits&(1<<(15-symbol)) != 0 {
196 symbolPresent[16*symRange+symbol] = true
197 numSymbols++
198 }
199 }
200 }
201 }
202
203 // A block uses between two and six different Huffman trees.
204 numHuffmanTrees := br.ReadBits(3)
205 if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
206 return StructuralError("invalid number of Huffman trees")
207 }
208
209 // The Huffman tree can switch every 50 symbols so there's a list of
210 // tree indexes telling us which tree to use for each 50 symbol block.
211 numSelectors := br.ReadBits(15)
212 treeIndexes := make([]uint8, numSelectors)
213
214 // The tree indexes are move-to-front transformed and stored as unary
215 // numbers.
216 mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
217 for i := range treeIndexes {
218 c := 0
219 for {
220 inc := br.ReadBits(1)
221 if inc == 0 {
222 break
223 }
224 c++
225 }
226 if c >= numHuffmanTrees {
227 return StructuralError("tree index too large")
228 }
229 treeIndexes[i] = uint8(mtfTreeDecoder.Decode(c))
230 }
231
232 // The list of symbols for the move-to-front transform is taken from
233 // the previously decoded symbol bitmap.
234 symbols := make([]byte, numSymbols)
235 nextSymbol := 0
236 for i := 0; i < 256; i++ {
237 if symbolPresent[i] {
238 symbols[nextSymbol] = byte(i)
239 nextSymbol++
240 }
241 }
242 mtf := newMTFDecoder(symbols)
243
244 numSymbols += 2 // to account for RUNA and RUNB symbols
245 huffmanTrees := make([]huffmanTree, numHuffmanTrees)
246
247 // Now we decode the arrays of code-lengths for each tree.
248 lengths := make([]uint8, numSymbols)
249 for i := 0; i < numHuffmanTrees; i++ {
250 // The code lengths are delta encoded from a 5-bit base value.
251 length := br.ReadBits(5)
252 for j := 0; j < numSymbols; j++ {
253 for {
254 if !br.ReadBit() {
255 break
256 }
257 if br.ReadBit() {
258 length--
259 } else {
260 length++
261 }
262 }
263 if length < 0 || length > 20 {
264 return StructuralError("Huffman length out of range")
265 }
266 lengths[j] = uint8(length)
267 }
268 huffmanTrees[i], err = newHuffmanTree(lengths)
269 if err != nil {
270 return err
271 }
272 }
273
274 selectorIndex := 1 // the next tree index to use
275 currentHuffmanTree := huffmanTrees[treeIndexes[0]]
276 bufIndex := 0 // indexes bz2.buf, the output buffer.
277 // The output of the move-to-front transform is run-length encoded and
278 // we merge the decoding into the Huffman parsing loop. These two
279 // variables accumulate the repeat count. See the Wikipedia page for
280 // details.
281 repeat := 0
282 repeat_power := 0
283
284 // The `C' array (used by the inverse BWT) needs to be zero initialized.
285 for i := range bz2.c {
286 bz2.c[i] = 0
287 }
288
289 decoded := 0 // counts the number of symbols decoded by the current tree.
290 for {
291 if decoded == 50 {
292 currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
293 selectorIndex++
294 decoded = 0
295 }
296
297 v := currentHuffmanTree.Decode(br)
298 decoded++
299
300 if v < 2 {
301 // This is either the RUNA or RUNB symbol.
302 if repeat == 0 {
303 repeat_power = 1
304 }
305 repeat += repeat_power << v
306 repeat_power <<= 1
307
308 // This limit of 2 million comes from the bzip2 source
309 // code. It prevents repeat from overflowing.
310 if repeat > 2*1024*1024 {
311 return StructuralError("repeat count too large")
312 }
313 continue
314 }
315
316 if repeat > 0 {
317 // We have decoded a complete run-length so we need to
318 // replicate the last output symbol.
319 for i := 0; i < repeat; i++ {
320 b := byte(mtf.First())
321 bz2.tt[bufIndex] = uint32(b)
322 bz2.c[b]++
323 bufIndex++
324 }
325 repeat = 0
326 }
327
328 if int(v) == numSymbols-1 {
329 // This is the EOF symbol. Because it's always at the
330 // end of the move-to-front list, and never gets moved
331 // to the front, it has this unique value.
332 break
333 }
334
335 // Since two metasymbols (RUNA and RUNB) have values 0 and 1,
336 // one would expect |v-2| to be passed to the MTF decoder.
337 // However, the front of the MTF list is never referenced as 0,
338 // it's always referenced with a run-length of 1. Thus 0
339 // doesn't need to be encoded and we have |v-1| in the next
340 // line.
341 b := byte(mtf.Decode(int(v - 1)))
342 bz2.tt[bufIndex] = uint32(b)
343 bz2.c[b]++
344 bufIndex++
345 }
346
347 if origPtr >= uint(bufIndex) {
348 return StructuralError("origPtr out of bounds")
349 }
350
351 // We have completed the entropy decoding. Now we can perform the
352 // inverse BWT and setup the RLE buffer.
353 bz2.preRLE = bz2.tt[:bufIndex]
354 bz2.preRLEUsed = 0
355 bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
356 bz2.lastByte = -1
357 bz2.byteRepeats = 0
358 bz2.repeats = 0
359
360 return nil
361 }
362
363 // inverseBWT implements the inverse Burrows-Wheeler transform as described in
364 // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
365 // In that document, origPtr is called `I' and c is the `C' array after the
366 // first pass over the data. It's an argument here because we merge the first
367 // pass with the Huffman decoding.
368 //
369 // This also implements the `single array' method from the bzip2 source code
370 // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
371 // index of the next byte in the top 24-bits. The index of the first byte is
372 // returned.
373 func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
374 sum := uint(0)
375 for i := 0; i < 256; i++ {
376 sum += c[i]
377 c[i] = sum - c[i]
378 }
379
380 for i := range tt {
381 b := tt[i] & 0xff
382 tt[c[b]] |= uint32(i) << 8
383 c[b]++
384 }
385
386 return tt[origPtr] >> 8
387 }