src/pkg/compress/bzip2/bzip2.go - The Go Programming Language

Golang

Source file src/pkg/compress/bzip2/bzip2.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package bzip2 implements bzip2 decompression.
     6	package bzip2
     7	
     8	import "io"
     9	
    10	// There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
    11	// of guessing: http://en.wikipedia.org/wiki/Bzip2
    12	// The source code to pyflate was useful for debugging:
    13	// http://www.paul.sladen.org/projects/pyflate
    14	
    15	// A StructuralError is returned when the bzip2 data is found to be
    16	// syntactically invalid.
    17	type StructuralError string
    18	
    19	func (s StructuralError) Error() string {
    20		return "bzip2 data invalid: " + string(s)
    21	}
    22	
    23	// A reader decompresses bzip2 compressed data.
    24	type reader struct {
    25		br        bitReader
    26		setupDone bool // true if we have parsed the bzip2 header.
    27		blockSize int  // blockSize in bytes, i.e. 900 * 1024.
    28		eof       bool
    29		buf       []byte    // stores Burrows-Wheeler transformed data.
    30		c         [256]uint // the `C' array for the inverse BWT.
    31		tt        []uint32  // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
    32		tPos      uint32    // Index of the next output byte in tt.
    33	
    34		preRLE      []uint32 // contains the RLE data still to be processed.
    35		preRLEUsed  int      // number of entries of preRLE used.
    36		lastByte    int      // the last byte value seen.
    37		byteRepeats uint     // the number of repeats of lastByte seen.
    38		repeats     uint     // the number of copies of lastByte to output.
    39	}
    40	
    41	// NewReader returns an io.Reader which decompresses bzip2 data from r.
    42	func NewReader(r io.Reader) io.Reader {
    43		bz2 := new(reader)
    44		bz2.br = newBitReader(r)
    45		return bz2
    46	}
    47	
    48	const bzip2FileMagic = 0x425a // "BZ"
    49	const bzip2BlockMagic = 0x314159265359
    50	const bzip2FinalMagic = 0x177245385090
    51	
    52	// setup parses the bzip2 header.
    53	func (bz2 *reader) setup() error {
    54		br := &bz2.br
    55	
    56		magic := br.ReadBits(16)
    57		if magic != bzip2FileMagic {
    58			return StructuralError("bad magic value")
    59		}
    60	
    61		t := br.ReadBits(8)
    62		if t != 'h' {
    63			return StructuralError("non-Huffman entropy encoding")
    64		}
    65	
    66		level := br.ReadBits(8)
    67		if level < '1' || level > '9' {
    68			return StructuralError("invalid compression level")
    69		}
    70	
    71		bz2.blockSize = 100 * 1024 * (int(level) - '0')
    72		bz2.tt = make([]uint32, bz2.blockSize)
    73		return nil
    74	}
    75	
    76	func (bz2 *reader) Read(buf []byte) (n int, err error) {
    77		if bz2.eof {
    78			return 0, io.EOF
    79		}
    80	
    81		if !bz2.setupDone {
    82			err = bz2.setup()
    83			brErr := bz2.br.Err()
    84			if brErr != nil {
    85				err = brErr
    86			}
    87			if err != nil {
    88				return 0, err
    89			}
    90			bz2.setupDone = true
    91		}
    92	
    93		n, err = bz2.read(buf)
    94		brErr := bz2.br.Err()
    95		if brErr != nil {
    96			err = brErr
    97		}
    98		return
    99	}
   100	
   101	func (bz2 *reader) read(buf []byte) (n int, err error) {
   102		// bzip2 is a block based compressor, except that it has a run-length
   103		// preprocessing step. The block based nature means that we can
   104		// preallocate fixed-size buffers and reuse them. However, the RLE
   105		// preprocessing would require allocating huge buffers to store the
   106		// maximum expansion. Thus we process blocks all at once, except for
   107		// the RLE which we decompress as required.
   108	
   109		for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
   110			// We have RLE data pending.
   111	
   112			// The run-length encoding works like this:
   113			// Any sequence of four equal bytes is followed by a length
   114			// byte which contains the number of repeats of that byte to
   115			// include. (The number of repeats can be zero.) Because we are
   116			// decompressing on-demand our state is kept in the reader
   117			// object.
   118	
   119			if bz2.repeats > 0 {
   120				buf[n] = byte(bz2.lastByte)
   121				n++
   122				bz2.repeats--
   123				if bz2.repeats == 0 {
   124					bz2.lastByte = -1
   125				}
   126				continue
   127			}
   128	
   129			bz2.tPos = bz2.preRLE[bz2.tPos]
   130			b := byte(bz2.tPos)
   131			bz2.tPos >>= 8
   132			bz2.preRLEUsed++
   133	
   134			if bz2.byteRepeats == 3 {
   135				bz2.repeats = uint(b)
   136				bz2.byteRepeats = 0
   137				continue
   138			}
   139	
   140			if bz2.lastByte == int(b) {
   141				bz2.byteRepeats++
   142			} else {
   143				bz2.byteRepeats = 0
   144			}
   145			bz2.lastByte = int(b)
   146	
   147			buf[n] = b
   148			n++
   149		}
   150	
   151		if n > 0 {
   152			return
   153		}
   154	
   155		// No RLE data is pending so we need to read a block.
   156	
   157		br := &bz2.br
   158		magic := br.ReadBits64(48)
   159		if magic == bzip2FinalMagic {
   160			br.ReadBits64(32) // ignored CRC
   161			bz2.eof = true
   162			return 0, io.EOF
   163		} else if magic != bzip2BlockMagic {
   164			return 0, StructuralError("bad magic value found")
   165		}
   166	
   167		err = bz2.readBlock()
   168		if err != nil {
   169			return 0, err
   170		}
   171	
   172		return bz2.read(buf)
   173	}
   174	
   175	// readBlock reads a bzip2 block. The magic number should already have been consumed.
   176	func (bz2 *reader) readBlock() (err error) {
   177		br := &bz2.br
   178		br.ReadBits64(32) // skip checksum. TODO: check it if we can figure out what it is.
   179		randomized := br.ReadBits(1)
   180		if randomized != 0 {
   181			return StructuralError("deprecated randomized files")
   182		}
   183		origPtr := uint(br.ReadBits(24))
   184	
   185		// If not every byte value is used in the block (i.e., it's text) then
   186		// the symbol set is reduced. The symbols used are stored as a
   187		// two-level, 16x16 bitmap.
   188		symbolRangeUsedBitmap := br.ReadBits(16)
   189		symbolPresent := make([]bool, 256)
   190		numSymbols := 0
   191		for symRange := uint(0); symRange < 16; symRange++ {
   192			if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
   193				bits := br.ReadBits(16)
   194				for symbol := uint(0); symbol < 16; symbol++ {
   195					if bits&(1<<(15-symbol)) != 0 {
   196						symbolPresent[16*symRange+symbol] = true
   197						numSymbols++
   198					}
   199				}
   200			}
   201		}
   202	
   203		// A block uses between two and six different Huffman trees.
   204		numHuffmanTrees := br.ReadBits(3)
   205		if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
   206			return StructuralError("invalid number of Huffman trees")
   207		}
   208	
   209		// The Huffman tree can switch every 50 symbols so there's a list of
   210		// tree indexes telling us which tree to use for each 50 symbol block.
   211		numSelectors := br.ReadBits(15)
   212		treeIndexes := make([]uint8, numSelectors)
   213	
   214		// The tree indexes are move-to-front transformed and stored as unary
   215		// numbers.
   216		mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
   217		for i := range treeIndexes {
   218			c := 0
   219			for {
   220				inc := br.ReadBits(1)
   221				if inc == 0 {
   222					break
   223				}
   224				c++
   225			}
   226			if c >= numHuffmanTrees {
   227				return StructuralError("tree index too large")
   228			}
   229			treeIndexes[i] = uint8(mtfTreeDecoder.Decode(c))
   230		}
   231	
   232		// The list of symbols for the move-to-front transform is taken from
   233		// the previously decoded symbol bitmap.
   234		symbols := make([]byte, numSymbols)
   235		nextSymbol := 0
   236		for i := 0; i < 256; i++ {
   237			if symbolPresent[i] {
   238				symbols[nextSymbol] = byte(i)
   239				nextSymbol++
   240			}
   241		}
   242		mtf := newMTFDecoder(symbols)
   243	
   244		numSymbols += 2 // to account for RUNA and RUNB symbols
   245		huffmanTrees := make([]huffmanTree, numHuffmanTrees)
   246	
   247		// Now we decode the arrays of code-lengths for each tree.
   248		lengths := make([]uint8, numSymbols)
   249		for i := 0; i < numHuffmanTrees; i++ {
   250			// The code lengths are delta encoded from a 5-bit base value.
   251			length := br.ReadBits(5)
   252			for j := 0; j < numSymbols; j++ {
   253				for {
   254					if !br.ReadBit() {
   255						break
   256					}
   257					if br.ReadBit() {
   258						length--
   259					} else {
   260						length++
   261					}
   262				}
   263				if length < 0 || length > 20 {
   264					return StructuralError("Huffman length out of range")
   265				}
   266				lengths[j] = uint8(length)
   267			}
   268			huffmanTrees[i], err = newHuffmanTree(lengths)
   269			if err != nil {
   270				return err
   271			}
   272		}
   273	
   274		selectorIndex := 1 // the next tree index to use
   275		currentHuffmanTree := huffmanTrees[treeIndexes[0]]
   276		bufIndex := 0 // indexes bz2.buf, the output buffer.
   277		// The output of the move-to-front transform is run-length encoded and
   278		// we merge the decoding into the Huffman parsing loop. These two
   279		// variables accumulate the repeat count. See the Wikipedia page for
   280		// details.
   281		repeat := 0
   282		repeat_power := 0
   283	
   284		// The `C' array (used by the inverse BWT) needs to be zero initialized.
   285		for i := range bz2.c {
   286			bz2.c[i] = 0
   287		}
   288	
   289		decoded := 0 // counts the number of symbols decoded by the current tree.
   290		for {
   291			if decoded == 50 {
   292				currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
   293				selectorIndex++
   294				decoded = 0
   295			}
   296	
   297			v := currentHuffmanTree.Decode(br)
   298			decoded++
   299	
   300			if v < 2 {
   301				// This is either the RUNA or RUNB symbol.
   302				if repeat == 0 {
   303					repeat_power = 1
   304				}
   305				repeat += repeat_power << v
   306				repeat_power <<= 1
   307	
   308				// This limit of 2 million comes from the bzip2 source
   309				// code. It prevents repeat from overflowing.
   310				if repeat > 2*1024*1024 {
   311					return StructuralError("repeat count too large")
   312				}
   313				continue
   314			}
   315	
   316			if repeat > 0 {
   317				// We have decoded a complete run-length so we need to
   318				// replicate the last output symbol.
   319				for i := 0; i < repeat; i++ {
   320					b := byte(mtf.First())
   321					bz2.tt[bufIndex] = uint32(b)
   322					bz2.c[b]++
   323					bufIndex++
   324				}
   325				repeat = 0
   326			}
   327	
   328			if int(v) == numSymbols-1 {
   329				// This is the EOF symbol. Because it's always at the
   330				// end of the move-to-front list, and never gets moved
   331				// to the front, it has this unique value.
   332				break
   333			}
   334	
   335			// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
   336			// one would expect |v-2| to be passed to the MTF decoder.
   337			// However, the front of the MTF list is never referenced as 0,
   338			// it's always referenced with a run-length of 1. Thus 0
   339			// doesn't need to be encoded and we have |v-1| in the next
   340			// line.
   341			b := byte(mtf.Decode(int(v - 1)))
   342			bz2.tt[bufIndex] = uint32(b)
   343			bz2.c[b]++
   344			bufIndex++
   345		}
   346	
   347		if origPtr >= uint(bufIndex) {
   348			return StructuralError("origPtr out of bounds")
   349		}
   350	
   351		// We have completed the entropy decoding. Now we can perform the
   352		// inverse BWT and setup the RLE buffer.
   353		bz2.preRLE = bz2.tt[:bufIndex]
   354		bz2.preRLEUsed = 0
   355		bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
   356		bz2.lastByte = -1
   357		bz2.byteRepeats = 0
   358		bz2.repeats = 0
   359	
   360		return nil
   361	}
   362	
   363	// inverseBWT implements the inverse Burrows-Wheeler transform as described in
   364	// http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
   365	// In that document, origPtr is called `I' and c is the `C' array after the
   366	// first pass over the data. It's an argument here because we merge the first
   367	// pass with the Huffman decoding.
   368	//
   369	// This also implements the `single array' method from the bzip2 source code
   370	// which leaves the output, still shuffled, in the bottom 8 bits of tt with the
   371	// index of the next byte in the top 24-bits. The index of the first byte is
   372	// returned.
   373	func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
   374		sum := uint(0)
   375		for i := 0; i < 256; i++ {
   376			sum += c[i]
   377			c[i] = sum - c[i]
   378		}
   379	
   380		for i := range tt {
   381			b := tt[i] & 0xff
   382			tt[c[b]] |= uint32(i) << 8
   383			c[b]++
   384		}
   385	
   386		return tt[origPtr] >> 8
   387	}