src/pkg/index/suffixarray/suffixarray.go - The Go Programming Language

Golang

Source file src/pkg/index/suffixarray/suffixarray.go

     1	// Copyright 2010 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package suffixarray implements substring search in logarithmic time using
     6	// an in-memory suffix array.
     7	//
     8	// Example use:
     9	//
    10	//	// create index for some data
    11	//	index := suffixarray.New(data)
    12	//
    13	//	// lookup byte slice s
    14	//	offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
    15	//	offsets2 := index.Lookup(s, 3)  // the list of at most 3 indices where s occurs in data
    16	//
    17	package suffixarray
    18	
    19	import (
    20		"bytes"
    21		"encoding/binary"
    22		"io"
    23		"regexp"
    24		"sort"
    25	)
    26	
    27	// Index implements a suffix array for fast substring search.
    28	type Index struct {
    29		data []byte
    30		sa   []int // suffix array for data; len(sa) == len(data)
    31	}
    32	
    33	// New creates a new Index for data.
    34	// Index creation time is O(N*log(N)) for N = len(data).
    35	func New(data []byte) *Index {
    36		return &Index{data, qsufsort(data)}
    37	}
    38	
    39	// writeInt writes an int x to w using buf to buffer the write.
    40	func writeInt(w io.Writer, buf []byte, x int) error {
    41		binary.PutVarint(buf, int64(x))
    42		_, err := w.Write(buf[0:binary.MaxVarintLen64])
    43		return err
    44	}
    45	
    46	// readInt reads an int x from r using buf to buffer the read and returns x.
    47	func readInt(r io.Reader, buf []byte) (int, error) {
    48		_, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error
    49		x, _ := binary.Varint(buf)
    50		return int(x), err
    51	}
    52	
    53	// writeSlice writes data[:n] to w and returns n.
    54	// It uses buf to buffer the write.
    55	func writeSlice(w io.Writer, buf []byte, data []int) (n int, err error) {
    56		// encode as many elements as fit into buf
    57		p := binary.MaxVarintLen64
    58		for ; n < len(data) && p+binary.MaxVarintLen64 <= len(buf); n++ {
    59			p += binary.PutUvarint(buf[p:], uint64(data[n]))
    60		}
    61	
    62		// update buffer size
    63		binary.PutVarint(buf, int64(p))
    64	
    65		// write buffer
    66		_, err = w.Write(buf[0:p])
    67		return
    68	}
    69	
    70	// readSlice reads data[:n] from r and returns n.
    71	// It uses buf to buffer the read.
    72	func readSlice(r io.Reader, buf []byte, data []int) (n int, err error) {
    73		// read buffer size
    74		var size int
    75		size, err = readInt(r, buf)
    76		if err != nil {
    77			return
    78		}
    79	
    80		// read buffer w/o the size
    81		if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil {
    82			return
    83		}
    84	
    85		// decode as many elements as present in buf
    86		for p := binary.MaxVarintLen64; p < size; n++ {
    87			x, w := binary.Uvarint(buf[p:])
    88			data[n] = int(x)
    89			p += w
    90		}
    91	
    92		return
    93	}
    94	
    95	const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore
    96	
    97	// Read reads the index from r into x; x must not be nil.
    98	func (x *Index) Read(r io.Reader) error {
    99		// buffer for all reads
   100		buf := make([]byte, bufSize)
   101	
   102		// read length
   103		n, err := readInt(r, buf)
   104		if err != nil {
   105			return err
   106		}
   107	
   108		// allocate space
   109		if 2*n < cap(x.data) || cap(x.data) < n {
   110			// new data is significantly smaller or larger then
   111			// existing buffers - allocate new ones
   112			x.data = make([]byte, n)
   113			x.sa = make([]int, n)
   114		} else {
   115			// re-use existing buffers
   116			x.data = x.data[0:n]
   117			x.sa = x.sa[0:n]
   118		}
   119	
   120		// read data
   121		if _, err := io.ReadFull(r, x.data); err != nil {
   122			return err
   123		}
   124	
   125		// read index
   126		for sa := x.sa; len(sa) > 0; {
   127			n, err := readSlice(r, buf, sa)
   128			if err != nil {
   129				return err
   130			}
   131			sa = sa[n:]
   132		}
   133		return nil
   134	}
   135	
   136	// Write writes the index x to w.
   137	func (x *Index) Write(w io.Writer) error {
   138		// buffer for all writes
   139		buf := make([]byte, bufSize)
   140	
   141		// write length
   142		if err := writeInt(w, buf, len(x.data)); err != nil {
   143			return err
   144		}
   145	
   146		// write data
   147		if _, err := w.Write(x.data); err != nil {
   148			return err
   149		}
   150	
   151		// write index
   152		for sa := x.sa; len(sa) > 0; {
   153			n, err := writeSlice(w, buf, sa)
   154			if err != nil {
   155				return err
   156			}
   157			sa = sa[n:]
   158		}
   159		return nil
   160	}
   161	
   162	// Bytes returns the data over which the index was created.
   163	// It must not be modified.
   164	//
   165	func (x *Index) Bytes() []byte {
   166		return x.data
   167	}
   168	
   169	func (x *Index) at(i int) []byte {
   170		return x.data[x.sa[i]:]
   171	}
   172	
   173	// lookupAll returns a slice into the matching region of the index.
   174	// The runtime is O(log(N)*len(s)).
   175	func (x *Index) lookupAll(s []byte) []int {
   176		// find matching suffix index range [i:j]
   177		// find the first index where s would be the prefix
   178		i := sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
   179		// starting at i, find the first index at which s is not a prefix
   180		j := i + sort.Search(len(x.sa)-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
   181		return x.sa[i:j]
   182	}
   183	
   184	// Lookup returns an unsorted list of at most n indices where the byte string s
   185	// occurs in the indexed data. If n < 0, all occurrences are returned.
   186	// The result is nil if s is empty, s is not found, or n == 0.
   187	// Lookup time is O(log(N)*len(s) + len(result)) where N is the
   188	// size of the indexed data.
   189	//
   190	func (x *Index) Lookup(s []byte, n int) (result []int) {
   191		if len(s) > 0 && n != 0 {
   192			matches := x.lookupAll(s)
   193			if n < 0 || len(matches) < n {
   194				n = len(matches)
   195			}
   196			// 0 <= n <= len(matches)
   197			if n > 0 {
   198				result = make([]int, n)
   199				copy(result, matches)
   200			}
   201		}
   202		return
   203	}
   204	
   205	// FindAllIndex returns a sorted list of non-overlapping matches of the
   206	// regular expression r, where a match is a pair of indices specifying
   207	// the matched slice of x.Bytes(). If n < 0, all matches are returned
   208	// in successive order. Otherwise, at most n matches are returned and
   209	// they may not be successive. The result is nil if there are no matches,
   210	// or if n == 0.
   211	//
   212	func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) {
   213		// a non-empty literal prefix is used to determine possible
   214		// match start indices with Lookup
   215		prefix, complete := r.LiteralPrefix()
   216		lit := []byte(prefix)
   217	
   218		// worst-case scenario: no literal prefix
   219		if prefix == "" {
   220			return r.FindAllIndex(x.data, n)
   221		}
   222	
   223		// if regexp is a literal just use Lookup and convert its
   224		// result into match pairs
   225		if complete {
   226			// Lookup returns indices that may belong to overlapping matches.
   227			// After eliminating them, we may end up with fewer than n matches.
   228			// If we don't have enough at the end, redo the search with an
   229			// increased value n1, but only if Lookup returned all the requested
   230			// indices in the first place (if it returned fewer than that then
   231			// there cannot be more).
   232			for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
   233				indices := x.Lookup(lit, n1)
   234				if len(indices) == 0 {
   235					return
   236				}
   237				sort.Ints(indices)
   238				pairs := make([]int, 2*len(indices))
   239				result = make([][]int, len(indices))
   240				count := 0
   241				prev := 0
   242				for _, i := range indices {
   243					if count == n {
   244						break
   245					}
   246					// ignore indices leading to overlapping matches
   247					if prev <= i {
   248						j := 2 * count
   249						pairs[j+0] = i
   250						pairs[j+1] = i + len(lit)
   251						result[count] = pairs[j : j+2]
   252						count++
   253						prev = i + len(lit)
   254					}
   255				}
   256				result = result[0:count]
   257				if len(result) >= n || len(indices) != n1 {
   258					// found all matches or there's no chance to find more
   259					// (n and n1 can be negative)
   260					break
   261				}
   262			}
   263			if len(result) == 0 {
   264				result = nil
   265			}
   266			return
   267		}
   268	
   269		// regexp has a non-empty literal prefix; Lookup(lit) computes
   270		// the indices of possible complete matches; use these as starting
   271		// points for anchored searches
   272		// (regexp "^" matches beginning of input, not beginning of line)
   273		r = regexp.MustCompile("^" + r.String()) // compiles because r compiled
   274	
   275		// same comment about Lookup applies here as in the loop above
   276		for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
   277			indices := x.Lookup(lit, n1)
   278			if len(indices) == 0 {
   279				return
   280			}
   281			sort.Ints(indices)
   282			result = result[0:0]
   283			prev := 0
   284			for _, i := range indices {
   285				if len(result) == n {
   286					break
   287				}
   288				m := r.FindIndex(x.data[i:]) // anchored search - will not run off
   289				// ignore indices leading to overlapping matches
   290				if m != nil && prev <= i {
   291					m[0] = i // correct m
   292					m[1] += i
   293					result = append(result, m)
   294					prev = m[1]
   295				}
   296			}
   297			if len(result) >= n || len(indices) != n1 {
   298				// found all matches or there's no chance to find more
   299				// (n and n1 can be negative)
   300				break
   301			}
   302		}
   303		if len(result) == 0 {
   304			result = nil
   305		}
   306		return
   307	}