Source file src/pkg/index/suffixarray/suffixarray.go
1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package suffixarray implements substring search in logarithmic time using
6 // an in-memory suffix array.
7 //
8 // Example use:
9 //
10 // // create index for some data
11 // index := suffixarray.New(data)
12 //
13 // // lookup byte slice s
14 // offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
15 // offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data
16 //
17 package suffixarray
18
19 import (
20 "bytes"
21 "encoding/binary"
22 "io"
23 "regexp"
24 "sort"
25 )
26
27 // Index implements a suffix array for fast substring search.
28 type Index struct {
29 data []byte
30 sa []int // suffix array for data; len(sa) == len(data)
31 }
32
33 // New creates a new Index for data.
34 // Index creation time is O(N*log(N)) for N = len(data).
35 func New(data []byte) *Index {
36 return &Index{data, qsufsort(data)}
37 }
38
39 // writeInt writes an int x to w using buf to buffer the write.
40 func writeInt(w io.Writer, buf []byte, x int) error {
41 binary.PutVarint(buf, int64(x))
42 _, err := w.Write(buf[0:binary.MaxVarintLen64])
43 return err
44 }
45
46 // readInt reads an int x from r using buf to buffer the read and returns x.
47 func readInt(r io.Reader, buf []byte) (int, error) {
48 _, err := io.ReadFull(r, buf[0:binary.MaxVarintLen64]) // ok to continue with error
49 x, _ := binary.Varint(buf)
50 return int(x), err
51 }
52
53 // writeSlice writes data[:n] to w and returns n.
54 // It uses buf to buffer the write.
55 func writeSlice(w io.Writer, buf []byte, data []int) (n int, err error) {
56 // encode as many elements as fit into buf
57 p := binary.MaxVarintLen64
58 for ; n < len(data) && p+binary.MaxVarintLen64 <= len(buf); n++ {
59 p += binary.PutUvarint(buf[p:], uint64(data[n]))
60 }
61
62 // update buffer size
63 binary.PutVarint(buf, int64(p))
64
65 // write buffer
66 _, err = w.Write(buf[0:p])
67 return
68 }
69
70 // readSlice reads data[:n] from r and returns n.
71 // It uses buf to buffer the read.
72 func readSlice(r io.Reader, buf []byte, data []int) (n int, err error) {
73 // read buffer size
74 var size int
75 size, err = readInt(r, buf)
76 if err != nil {
77 return
78 }
79
80 // read buffer w/o the size
81 if _, err = io.ReadFull(r, buf[binary.MaxVarintLen64:size]); err != nil {
82 return
83 }
84
85 // decode as many elements as present in buf
86 for p := binary.MaxVarintLen64; p < size; n++ {
87 x, w := binary.Uvarint(buf[p:])
88 data[n] = int(x)
89 p += w
90 }
91
92 return
93 }
94
95 const bufSize = 16 << 10 // reasonable for BenchmarkSaveRestore
96
97 // Read reads the index from r into x; x must not be nil.
98 func (x *Index) Read(r io.Reader) error {
99 // buffer for all reads
100 buf := make([]byte, bufSize)
101
102 // read length
103 n, err := readInt(r, buf)
104 if err != nil {
105 return err
106 }
107
108 // allocate space
109 if 2*n < cap(x.data) || cap(x.data) < n {
110 // new data is significantly smaller or larger then
111 // existing buffers - allocate new ones
112 x.data = make([]byte, n)
113 x.sa = make([]int, n)
114 } else {
115 // re-use existing buffers
116 x.data = x.data[0:n]
117 x.sa = x.sa[0:n]
118 }
119
120 // read data
121 if _, err := io.ReadFull(r, x.data); err != nil {
122 return err
123 }
124
125 // read index
126 for sa := x.sa; len(sa) > 0; {
127 n, err := readSlice(r, buf, sa)
128 if err != nil {
129 return err
130 }
131 sa = sa[n:]
132 }
133 return nil
134 }
135
136 // Write writes the index x to w.
137 func (x *Index) Write(w io.Writer) error {
138 // buffer for all writes
139 buf := make([]byte, bufSize)
140
141 // write length
142 if err := writeInt(w, buf, len(x.data)); err != nil {
143 return err
144 }
145
146 // write data
147 if _, err := w.Write(x.data); err != nil {
148 return err
149 }
150
151 // write index
152 for sa := x.sa; len(sa) > 0; {
153 n, err := writeSlice(w, buf, sa)
154 if err != nil {
155 return err
156 }
157 sa = sa[n:]
158 }
159 return nil
160 }
161
162 // Bytes returns the data over which the index was created.
163 // It must not be modified.
164 //
165 func (x *Index) Bytes() []byte {
166 return x.data
167 }
168
169 func (x *Index) at(i int) []byte {
170 return x.data[x.sa[i]:]
171 }
172
173 // lookupAll returns a slice into the matching region of the index.
174 // The runtime is O(log(N)*len(s)).
175 func (x *Index) lookupAll(s []byte) []int {
176 // find matching suffix index range [i:j]
177 // find the first index where s would be the prefix
178 i := sort.Search(len(x.sa), func(i int) bool { return bytes.Compare(x.at(i), s) >= 0 })
179 // starting at i, find the first index at which s is not a prefix
180 j := i + sort.Search(len(x.sa)-i, func(j int) bool { return !bytes.HasPrefix(x.at(j+i), s) })
181 return x.sa[i:j]
182 }
183
184 // Lookup returns an unsorted list of at most n indices where the byte string s
185 // occurs in the indexed data. If n < 0, all occurrences are returned.
186 // The result is nil if s is empty, s is not found, or n == 0.
187 // Lookup time is O(log(N)*len(s) + len(result)) where N is the
188 // size of the indexed data.
189 //
190 func (x *Index) Lookup(s []byte, n int) (result []int) {
191 if len(s) > 0 && n != 0 {
192 matches := x.lookupAll(s)
193 if n < 0 || len(matches) < n {
194 n = len(matches)
195 }
196 // 0 <= n <= len(matches)
197 if n > 0 {
198 result = make([]int, n)
199 copy(result, matches)
200 }
201 }
202 return
203 }
204
205 // FindAllIndex returns a sorted list of non-overlapping matches of the
206 // regular expression r, where a match is a pair of indices specifying
207 // the matched slice of x.Bytes(). If n < 0, all matches are returned
208 // in successive order. Otherwise, at most n matches are returned and
209 // they may not be successive. The result is nil if there are no matches,
210 // or if n == 0.
211 //
212 func (x *Index) FindAllIndex(r *regexp.Regexp, n int) (result [][]int) {
213 // a non-empty literal prefix is used to determine possible
214 // match start indices with Lookup
215 prefix, complete := r.LiteralPrefix()
216 lit := []byte(prefix)
217
218 // worst-case scenario: no literal prefix
219 if prefix == "" {
220 return r.FindAllIndex(x.data, n)
221 }
222
223 // if regexp is a literal just use Lookup and convert its
224 // result into match pairs
225 if complete {
226 // Lookup returns indices that may belong to overlapping matches.
227 // After eliminating them, we may end up with fewer than n matches.
228 // If we don't have enough at the end, redo the search with an
229 // increased value n1, but only if Lookup returned all the requested
230 // indices in the first place (if it returned fewer than that then
231 // there cannot be more).
232 for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
233 indices := x.Lookup(lit, n1)
234 if len(indices) == 0 {
235 return
236 }
237 sort.Ints(indices)
238 pairs := make([]int, 2*len(indices))
239 result = make([][]int, len(indices))
240 count := 0
241 prev := 0
242 for _, i := range indices {
243 if count == n {
244 break
245 }
246 // ignore indices leading to overlapping matches
247 if prev <= i {
248 j := 2 * count
249 pairs[j+0] = i
250 pairs[j+1] = i + len(lit)
251 result[count] = pairs[j : j+2]
252 count++
253 prev = i + len(lit)
254 }
255 }
256 result = result[0:count]
257 if len(result) >= n || len(indices) != n1 {
258 // found all matches or there's no chance to find more
259 // (n and n1 can be negative)
260 break
261 }
262 }
263 if len(result) == 0 {
264 result = nil
265 }
266 return
267 }
268
269 // regexp has a non-empty literal prefix; Lookup(lit) computes
270 // the indices of possible complete matches; use these as starting
271 // points for anchored searches
272 // (regexp "^" matches beginning of input, not beginning of line)
273 r = regexp.MustCompile("^" + r.String()) // compiles because r compiled
274
275 // same comment about Lookup applies here as in the loop above
276 for n1 := n; ; n1 += 2 * (n - len(result)) /* overflow ok */ {
277 indices := x.Lookup(lit, n1)
278 if len(indices) == 0 {
279 return
280 }
281 sort.Ints(indices)
282 result = result[0:0]
283 prev := 0
284 for _, i := range indices {
285 if len(result) == n {
286 break
287 }
288 m := r.FindIndex(x.data[i:]) // anchored search - will not run off
289 // ignore indices leading to overlapping matches
290 if m != nil && prev <= i {
291 m[0] = i // correct m
292 m[1] += i
293 result = append(result, m)
294 prev = m[1]
295 }
296 }
297 if len(result) >= n || len(indices) != n1 {
298 // found all matches or there's no chance to find more
299 // (n and n1 can be negative)
300 break
301 }
302 }
303 if len(result) == 0 {
304 result = nil
305 }
306 return
307 }