src/pkg/compress/flate/huffman_bit_writer.go - The Go Programming Language

Golang

Source file src/pkg/compress/flate/huffman_bit_writer.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package flate
     6	
     7	import (
     8		"io"
     9		"math"
    10	)
    11	
    12	const (
    13		// The largest offset code.
    14		offsetCodeCount = 30
    15	
    16		// The special code used to mark the end of a block.
    17		endBlockMarker = 256
    18	
    19		// The first length code.
    20		lengthCodesStart = 257
    21	
    22		// The number of codegen codes.
    23		codegenCodeCount = 19
    24		badCode          = 255
    25	)
    26	
    27	// The number of extra bits needed by length code X - LENGTH_CODES_START.
    28	var lengthExtraBits = []int8{
    29		/* 257 */ 0, 0, 0,
    30		/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
    31		/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
    32		/* 280 */ 4, 5, 5, 5, 5, 0,
    33	}
    34	
    35	// The length indicated by length code X - LENGTH_CODES_START.
    36	var lengthBase = []uint32{
    37		0, 1, 2, 3, 4, 5, 6, 7, 8, 10,
    38		12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
    39		64, 80, 96, 112, 128, 160, 192, 224, 255,
    40	}
    41	
    42	// offset code word extra bits.
    43	var offsetExtraBits = []int8{
    44		0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
    45		4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
    46		9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
    47		/* extended window */
    48		14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
    49	}
    50	
    51	var offsetBase = []uint32{
    52		/* normal deflate */
    53		0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
    54		0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
    55		0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
    56		0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
    57		0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
    58		0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
    59	
    60		/* extended window */
    61		0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
    62		0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
    63		0x100000, 0x180000, 0x200000, 0x300000,
    64	}
    65	
    66	// The odd order in which the codegen code sizes are written.
    67	var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
    68	
    69	type huffmanBitWriter struct {
    70		w io.Writer
    71		// Data waiting to be written is bytes[0:nbytes]
    72		// and then the low nbits of bits.
    73		bits            uint32
    74		nbits           uint32
    75		bytes           [64]byte
    76		nbytes          int
    77		literalFreq     []int32
    78		offsetFreq      []int32
    79		codegen         []uint8
    80		codegenFreq     []int32
    81		literalEncoding *huffmanEncoder
    82		offsetEncoding  *huffmanEncoder
    83		codegenEncoding *huffmanEncoder
    84		err             error
    85	}
    86	
    87	func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
    88		return &huffmanBitWriter{
    89			w:               w,
    90			literalFreq:     make([]int32, maxLit),
    91			offsetFreq:      make([]int32, offsetCodeCount),
    92			codegen:         make([]uint8, maxLit+offsetCodeCount+1),
    93			codegenFreq:     make([]int32, codegenCodeCount),
    94			literalEncoding: newHuffmanEncoder(maxLit),
    95			offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
    96			codegenEncoding: newHuffmanEncoder(codegenCodeCount),
    97		}
    98	}
    99	
   100	func (w *huffmanBitWriter) flushBits() {
   101		if w.err != nil {
   102			w.nbits = 0
   103			return
   104		}
   105		bits := w.bits
   106		w.bits >>= 16
   107		w.nbits -= 16
   108		n := w.nbytes
   109		w.bytes[n] = byte(bits)
   110		w.bytes[n+1] = byte(bits >> 8)
   111		if n += 2; n >= len(w.bytes) {
   112			_, w.err = w.w.Write(w.bytes[0:])
   113			n = 0
   114		}
   115		w.nbytes = n
   116	}
   117	
   118	func (w *huffmanBitWriter) flush() {
   119		if w.err != nil {
   120			w.nbits = 0
   121			return
   122		}
   123		n := w.nbytes
   124		if w.nbits > 8 {
   125			w.bytes[n] = byte(w.bits)
   126			w.bits >>= 8
   127			w.nbits -= 8
   128			n++
   129		}
   130		if w.nbits > 0 {
   131			w.bytes[n] = byte(w.bits)
   132			w.nbits = 0
   133			n++
   134		}
   135		w.bits = 0
   136		_, w.err = w.w.Write(w.bytes[0:n])
   137		w.nbytes = 0
   138	}
   139	
   140	func (w *huffmanBitWriter) writeBits(b, nb int32) {
   141		w.bits |= uint32(b) << w.nbits
   142		if w.nbits += uint32(nb); w.nbits >= 16 {
   143			w.flushBits()
   144		}
   145	}
   146	
   147	func (w *huffmanBitWriter) writeBytes(bytes []byte) {
   148		if w.err != nil {
   149			return
   150		}
   151		n := w.nbytes
   152		if w.nbits == 8 {
   153			w.bytes[n] = byte(w.bits)
   154			w.nbits = 0
   155			n++
   156		}
   157		if w.nbits != 0 {
   158			w.err = InternalError("writeBytes with unfinished bits")
   159			return
   160		}
   161		if n != 0 {
   162			_, w.err = w.w.Write(w.bytes[0:n])
   163			if w.err != nil {
   164				return
   165			}
   166		}
   167		w.nbytes = 0
   168		_, w.err = w.w.Write(bytes)
   169	}
   170	
   171	// RFC 1951 3.2.7 specifies a special run-length encoding for specifying
   172	// the literal and offset lengths arrays (which are concatenated into a single
   173	// array).  This method generates that run-length encoding.
   174	//
   175	// The result is written into the codegen array, and the frequencies
   176	// of each code is written into the codegenFreq array.
   177	// Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
   178	// information.  Code badCode is an end marker
   179	//
   180	//  numLiterals      The number of literals in literalEncoding
   181	//  numOffsets       The number of offsets in offsetEncoding
   182	func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int) {
   183		for i := range w.codegenFreq {
   184			w.codegenFreq[i] = 0
   185		}
   186		// Note that we are using codegen both as a temporary variable for holding
   187		// a copy of the frequencies, and as the place where we put the result.
   188		// This is fine because the output is always shorter than the input used
   189		// so far.
   190		codegen := w.codegen // cache
   191		// Copy the concatenated code sizes to codegen.  Put a marker at the end.
   192		copy(codegen[0:numLiterals], w.literalEncoding.codeBits)
   193		copy(codegen[numLiterals:numLiterals+numOffsets], w.offsetEncoding.codeBits)
   194		codegen[numLiterals+numOffsets] = badCode
   195	
   196		size := codegen[0]
   197		count := 1
   198		outIndex := 0
   199		for inIndex := 1; size != badCode; inIndex++ {
   200			// INVARIANT: We have seen "count" copies of size that have not yet
   201			// had output generated for them.
   202			nextSize := codegen[inIndex]
   203			if nextSize == size {
   204				count++
   205				continue
   206			}
   207			// We need to generate codegen indicating "count" of size.
   208			if size != 0 {
   209				codegen[outIndex] = size
   210				outIndex++
   211				w.codegenFreq[size]++
   212				count--
   213				for count >= 3 {
   214					n := 6
   215					if n > count {
   216						n = count
   217					}
   218					codegen[outIndex] = 16
   219					outIndex++
   220					codegen[outIndex] = uint8(n - 3)
   221					outIndex++
   222					w.codegenFreq[16]++
   223					count -= n
   224				}
   225			} else {
   226				for count >= 11 {
   227					n := 138
   228					if n > count {
   229						n = count
   230					}
   231					codegen[outIndex] = 18
   232					outIndex++
   233					codegen[outIndex] = uint8(n - 11)
   234					outIndex++
   235					w.codegenFreq[18]++
   236					count -= n
   237				}
   238				if count >= 3 {
   239					// count >= 3 && count <= 10
   240					codegen[outIndex] = 17
   241					outIndex++
   242					codegen[outIndex] = uint8(count - 3)
   243					outIndex++
   244					w.codegenFreq[17]++
   245					count = 0
   246				}
   247			}
   248			count--
   249			for ; count >= 0; count-- {
   250				codegen[outIndex] = size
   251				outIndex++
   252				w.codegenFreq[size]++
   253			}
   254			// Set up invariant for next time through the loop.
   255			size = nextSize
   256			count = 1
   257		}
   258		// Marker indicating the end of the codegen.
   259		codegen[outIndex] = badCode
   260	}
   261	
   262	func (w *huffmanBitWriter) writeCode(code *huffmanEncoder, literal uint32) {
   263		if w.err != nil {
   264			return
   265		}
   266		w.writeBits(int32(code.code[literal]), int32(code.codeBits[literal]))
   267	}
   268	
   269	// Write the header of a dynamic Huffman block to the output stream.
   270	//
   271	//  numLiterals  The number of literals specified in codegen
   272	//  numOffsets   The number of offsets specified in codegen
   273	//  numCodegens  The number of codegens used in codegen
   274	func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) {
   275		if w.err != nil {
   276			return
   277		}
   278		var firstBits int32 = 4
   279		if isEof {
   280			firstBits = 5
   281		}
   282		w.writeBits(firstBits, 3)
   283		w.writeBits(int32(numLiterals-257), 5)
   284		w.writeBits(int32(numOffsets-1), 5)
   285		w.writeBits(int32(numCodegens-4), 4)
   286	
   287		for i := 0; i < numCodegens; i++ {
   288			value := w.codegenEncoding.codeBits[codegenOrder[i]]
   289			w.writeBits(int32(value), 3)
   290		}
   291	
   292		i := 0
   293		for {
   294			var codeWord int = int(w.codegen[i])
   295			i++
   296			if codeWord == badCode {
   297				break
   298			}
   299			// The low byte contains the actual code to generate.
   300			w.writeCode(w.codegenEncoding, uint32(codeWord))
   301	
   302			switch codeWord {
   303			case 16:
   304				w.writeBits(int32(w.codegen[i]), 2)
   305				i++
   306				break
   307			case 17:
   308				w.writeBits(int32(w.codegen[i]), 3)
   309				i++
   310				break
   311			case 18:
   312				w.writeBits(int32(w.codegen[i]), 7)
   313				i++
   314				break
   315			}
   316		}
   317	}
   318	
   319	func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
   320		if w.err != nil {
   321			return
   322		}
   323		var flag int32
   324		if isEof {
   325			flag = 1
   326		}
   327		w.writeBits(flag, 3)
   328		w.flush()
   329		w.writeBits(int32(length), 16)
   330		w.writeBits(int32(^uint16(length)), 16)
   331	}
   332	
   333	func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
   334		if w.err != nil {
   335			return
   336		}
   337		// Indicate that we are a fixed Huffman block
   338		var value int32 = 2
   339		if isEof {
   340			value = 3
   341		}
   342		w.writeBits(value, 3)
   343	}
   344	
   345	func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
   346		if w.err != nil {
   347			return
   348		}
   349		for i := range w.literalFreq {
   350			w.literalFreq[i] = 0
   351		}
   352		for i := range w.offsetFreq {
   353			w.offsetFreq[i] = 0
   354		}
   355	
   356		n := len(tokens)
   357		tokens = tokens[0 : n+1]
   358		tokens[n] = endBlockMarker
   359	
   360		for _, t := range tokens {
   361			switch t.typ() {
   362			case literalType:
   363				w.literalFreq[t.literal()]++
   364			case matchType:
   365				length := t.length()
   366				offset := t.offset()
   367				w.literalFreq[lengthCodesStart+lengthCode(length)]++
   368				w.offsetFreq[offsetCode(offset)]++
   369			}
   370		}
   371	
   372		// get the number of literals
   373		numLiterals := len(w.literalFreq)
   374		for w.literalFreq[numLiterals-1] == 0 {
   375			numLiterals--
   376		}
   377		// get the number of offsets
   378		numOffsets := len(w.offsetFreq)
   379		for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 {
   380			numOffsets--
   381		}
   382		if numOffsets == 0 {
   383			// We haven't found a single match. If we want to go with the dynamic encoding,
   384			// we should count at least one offset to be sure that the offset huffman tree could be encoded.
   385			w.offsetFreq[0] = 1
   386			numOffsets = 1
   387		}
   388	
   389		w.literalEncoding.generate(w.literalFreq, 15)
   390		w.offsetEncoding.generate(w.offsetFreq, 15)
   391	
   392		storedBytes := 0
   393		if input != nil {
   394			storedBytes = len(input)
   395		}
   396		var extraBits int64
   397		var storedSize int64 = math.MaxInt64
   398		if storedBytes <= maxStoreBlockSize && input != nil {
   399			storedSize = int64((storedBytes + 5) * 8)
   400			// We only bother calculating the costs of the extra bits required by
   401			// the length of offset fields (which will be the same for both fixed
   402			// and dynamic encoding), if we need to compare those two encodings
   403			// against stored encoding.
   404			for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ {
   405				// First eight length codes have extra size = 0.
   406				extraBits += int64(w.literalFreq[lengthCode]) * int64(lengthExtraBits[lengthCode-lengthCodesStart])
   407			}
   408			for offsetCode := 4; offsetCode < numOffsets; offsetCode++ {
   409				// First four offset codes have extra size = 0.
   410				extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode])
   411			}
   412		}
   413	
   414		// Figure out smallest code.
   415		// Fixed Huffman baseline.
   416		var size = int64(3) +
   417			fixedLiteralEncoding.bitLength(w.literalFreq) +
   418			fixedOffsetEncoding.bitLength(w.offsetFreq) +
   419			extraBits
   420		var literalEncoding = fixedLiteralEncoding
   421		var offsetEncoding = fixedOffsetEncoding
   422	
   423		// Dynamic Huffman?
   424		var numCodegens int
   425	
   426		// Generate codegen and codegenFrequencies, which indicates how to encode
   427		// the literalEncoding and the offsetEncoding.
   428		w.generateCodegen(numLiterals, numOffsets)
   429		w.codegenEncoding.generate(w.codegenFreq, 7)
   430		numCodegens = len(w.codegenFreq)
   431		for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
   432			numCodegens--
   433		}
   434		dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
   435			w.codegenEncoding.bitLength(w.codegenFreq) +
   436			int64(extraBits) +
   437			int64(w.codegenFreq[16]*2) +
   438			int64(w.codegenFreq[17]*3) +
   439			int64(w.codegenFreq[18]*7)
   440		dynamicSize := dynamicHeader +
   441			w.literalEncoding.bitLength(w.literalFreq) +
   442			w.offsetEncoding.bitLength(w.offsetFreq)
   443	
   444		if dynamicSize < size {
   445			size = dynamicSize
   446			literalEncoding = w.literalEncoding
   447			offsetEncoding = w.offsetEncoding
   448		}
   449	
   450		// Stored bytes?
   451		if storedSize < size {
   452			w.writeStoredHeader(storedBytes, eof)
   453			w.writeBytes(input[0:storedBytes])
   454			return
   455		}
   456	
   457		// Huffman.
   458		if literalEncoding == fixedLiteralEncoding {
   459			w.writeFixedHeader(eof)
   460		} else {
   461			w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
   462		}
   463		for _, t := range tokens {
   464			switch t.typ() {
   465			case literalType:
   466				w.writeCode(literalEncoding, t.literal())
   467				break
   468			case matchType:
   469				// Write the length
   470				length := t.length()
   471				lengthCode := lengthCode(length)
   472				w.writeCode(literalEncoding, lengthCode+lengthCodesStart)
   473				extraLengthBits := int32(lengthExtraBits[lengthCode])
   474				if extraLengthBits > 0 {
   475					extraLength := int32(length - lengthBase[lengthCode])
   476					w.writeBits(extraLength, extraLengthBits)
   477				}
   478				// Write the offset
   479				offset := t.offset()
   480				offsetCode := offsetCode(offset)
   481				w.writeCode(offsetEncoding, offsetCode)
   482				extraOffsetBits := int32(offsetExtraBits[offsetCode])
   483				if extraOffsetBits > 0 {
   484					extraOffset := int32(offset - offsetBase[offsetCode])
   485					w.writeBits(extraOffset, extraOffsetBits)
   486				}
   487				break
   488			default:
   489				panic("unknown token type: " + string(t))
   490			}
   491		}
   492	}