src/pkg/compress/bzip2/huffman.go - The Go Programming Language

Golang

Source file src/pkg/compress/bzip2/huffman.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package bzip2
     6	
     7	import "sort"
     8	
     9	// A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
    10	// symbol.
    11	type huffmanTree struct {
    12		// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
    13		// root of the tree and nextNode contains the index of the next element
    14		// of nodes to use when the tree is being constructed.
    15		nodes    []huffmanNode
    16		nextNode int
    17	}
    18	
    19	// A huffmanNode is a node in the tree. left and right contain indexes into the
    20	// nodes slice of the tree. If left or right is invalidNodeValue then the child
    21	// is a left node and its value is in leftValue/rightValue.
    22	//
    23	// The symbols are uint16s because bzip2 encodes not only MTF indexes in the
    24	// tree, but also two magic values for run-length encoding and an EOF symbol.
    25	// Thus there are more than 256 possible symbols.
    26	type huffmanNode struct {
    27		left, right           uint16
    28		leftValue, rightValue uint16
    29	}
    30	
    31	// invalidNodeValue is an invalid index which marks a leaf node in the tree.
    32	const invalidNodeValue = 0xffff
    33	
    34	// Decode reads bits from the given bitReader and navigates the tree until a
    35	// symbol is found.
    36	func (t huffmanTree) Decode(br *bitReader) (v uint16) {
    37		nodeIndex := uint16(0) // node 0 is the root of the tree.
    38	
    39		for {
    40			node := &t.nodes[nodeIndex]
    41			bit := br.ReadBit()
    42			// bzip2 encodes left as a true bit.
    43			if bit {
    44				// left
    45				if node.left == invalidNodeValue {
    46					return node.leftValue
    47				}
    48				nodeIndex = node.left
    49			} else {
    50				// right
    51				if node.right == invalidNodeValue {
    52					return node.rightValue
    53				}
    54				nodeIndex = node.right
    55			}
    56		}
    57	
    58		panic("unreachable")
    59	}
    60	
    61	// newHuffmanTree builds a Huffman tree from a slice containing the code
    62	// lengths of each symbol. The maximum code length is 32 bits.
    63	func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
    64		// There are many possible trees that assign the same code length to
    65		// each symbol (consider reflecting a tree down the middle, for
    66		// example). Since the code length assignments determine the
    67		// efficiency of the tree, each of these trees is equally good. In
    68		// order to minimize the amount of information needed to build a tree
    69		// bzip2 uses a canonical tree so that it can be reconstructed given
    70		// only the code length assignments.
    71	
    72		if len(lengths) < 2 {
    73			panic("newHuffmanTree: too few symbols")
    74		}
    75	
    76		var t huffmanTree
    77	
    78		// First we sort the code length assignments by ascending code length,
    79		// using the symbol value to break ties.
    80		pairs := huffmanSymbolLengthPairs(make([]huffmanSymbolLengthPair, len(lengths)))
    81		for i, length := range lengths {
    82			pairs[i].value = uint16(i)
    83			pairs[i].length = length
    84		}
    85	
    86		sort.Sort(pairs)
    87	
    88		// Now we assign codes to the symbols, starting with the longest code.
    89		// We keep the codes packed into a uint32, at the most-significant end.
    90		// So branches are taken from the MSB downwards. This makes it easy to
    91		// sort them later.
    92		code := uint32(0)
    93		length := uint8(32)
    94	
    95		codes := huffmanCodes(make([]huffmanCode, len(lengths)))
    96		for i := len(pairs) - 1; i >= 0; i-- {
    97			if length > pairs[i].length {
    98				// If the code length decreases we shift in order to
    99				// zero any bits beyond the end of the code.
   100				length >>= 32 - pairs[i].length
   101				length <<= 32 - pairs[i].length
   102				length = pairs[i].length
   103			}
   104			codes[i].code = code
   105			codes[i].codeLen = length
   106			codes[i].value = pairs[i].value
   107			// We need to 'increment' the code, which means treating |code|
   108			// like a |length| bit number.
   109			code += 1 << (32 - length)
   110		}
   111	
   112		// Now we can sort by the code so that the left half of each branch are
   113		// grouped together, recursively.
   114		sort.Sort(codes)
   115	
   116		t.nodes = make([]huffmanNode, len(codes))
   117		_, err := buildHuffmanNode(&t, codes, 0)
   118		return t, err
   119	}
   120	
   121	// huffmanSymbolLengthPair contains a symbol and its code length.
   122	type huffmanSymbolLengthPair struct {
   123		value  uint16
   124		length uint8
   125	}
   126	
   127	// huffmanSymbolLengthPair is used to provide an interface for sorting.
   128	type huffmanSymbolLengthPairs []huffmanSymbolLengthPair
   129	
   130	func (h huffmanSymbolLengthPairs) Len() int {
   131		return len(h)
   132	}
   133	
   134	func (h huffmanSymbolLengthPairs) Less(i, j int) bool {
   135		if h[i].length < h[j].length {
   136			return true
   137		}
   138		if h[i].length > h[j].length {
   139			return false
   140		}
   141		if h[i].value < h[j].value {
   142			return true
   143		}
   144		return false
   145	}
   146	
   147	func (h huffmanSymbolLengthPairs) Swap(i, j int) {
   148		h[i], h[j] = h[j], h[i]
   149	}
   150	
   151	// huffmanCode contains a symbol, its code and code length.
   152	type huffmanCode struct {
   153		code    uint32
   154		codeLen uint8
   155		value   uint16
   156	}
   157	
   158	// huffmanCodes is used to provide an interface for sorting.
   159	type huffmanCodes []huffmanCode
   160	
   161	func (n huffmanCodes) Len() int {
   162		return len(n)
   163	}
   164	
   165	func (n huffmanCodes) Less(i, j int) bool {
   166		return n[i].code < n[j].code
   167	}
   168	
   169	func (n huffmanCodes) Swap(i, j int) {
   170		n[i], n[j] = n[j], n[i]
   171	}
   172	
   173	// buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
   174	// the Huffman tree at the given level. It returns the index of the newly
   175	// constructed node.
   176	func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
   177		test := uint32(1) << (31 - level)
   178	
   179		// We have to search the list of codes to find the divide between the left and right sides.
   180		firstRightIndex := len(codes)
   181		for i, code := range codes {
   182			if code.code&test != 0 {
   183				firstRightIndex = i
   184				break
   185			}
   186		}
   187	
   188		left := codes[:firstRightIndex]
   189		right := codes[firstRightIndex:]
   190	
   191		if len(left) == 0 || len(right) == 0 {
   192			return 0, StructuralError("superfluous level in Huffman tree")
   193		}
   194	
   195		nodeIndex = uint16(t.nextNode)
   196		node := &t.nodes[t.nextNode]
   197		t.nextNode++
   198	
   199		if len(left) == 1 {
   200			// leaf node
   201			node.left = invalidNodeValue
   202			node.leftValue = left[0].value
   203		} else {
   204			node.left, err = buildHuffmanNode(t, left, level+1)
   205		}
   206	
   207		if err != nil {
   208			return
   209		}
   210	
   211		if len(right) == 1 {
   212			// leaf node
   213			node.right = invalidNodeValue
   214			node.rightValue = right[0].value
   215		} else {
   216			node.right, err = buildHuffmanNode(t, right, level+1)
   217		}
   218	
   219		return
   220	}