Source file src/pkg/regexp/regexp.go
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package regexp implements regular expression search.
6 //
7 // The syntax of the regular expressions accepted is the same
8 // general syntax used by Perl, Python, and other languages.
9 // More precisely, it is the syntax accepted by RE2 and described at
10 // http://code.google.com/p/re2/wiki/Syntax, except for \C.
11 //
12 // All characters are UTF-8-encoded code points.
13 //
14 // There are 16 methods of Regexp that match a regular expression and identify
15 // the matched text. Their names are matched by this regular expression:
16 //
17 // Find(All)?(String)?(Submatch)?(Index)?
18 //
19 // If 'All' is present, the routine matches successive non-overlapping
20 // matches of the entire expression. Empty matches abutting a preceding
21 // match are ignored. The return value is a slice containing the successive
22 // return values of the corresponding non-'All' routine. These routines take
23 // an extra integer argument, n; if n >= 0, the function returns at most n
24 // matches/submatches.
25 //
26 // If 'String' is present, the argument is a string; otherwise it is a slice
27 // of bytes; return values are adjusted as appropriate.
28 //
29 // If 'Submatch' is present, the return value is a slice identifying the
30 // successive submatches of the expression. Submatches are matches of
31 // parenthesized subexpressions within the regular expression, numbered from
32 // left to right in order of opening parenthesis. Submatch 0 is the match of
33 // the entire expression, submatch 1 the match of the first parenthesized
34 // subexpression, and so on.
35 //
36 // If 'Index' is present, matches and submatches are identified by byte index
37 // pairs within the input string: result[2*n:2*n+1] identifies the indexes of
38 // the nth submatch. The pair for n==0 identifies the match of the entire
39 // expression. If 'Index' is not present, the match is identified by the
40 // text of the match/submatch. If an index is negative, it means that
41 // subexpression did not match any string in the input.
42 //
43 // There is also a subset of the methods that can be applied to text read
44 // from a RuneReader:
45 //
46 // MatchReader, FindReaderIndex, FindReaderSubmatchIndex
47 //
48 // This set may grow. Note that regular expression matches may need to
49 // examine text beyond the text returned by a match, so the methods that
50 // match text from a RuneReader may read arbitrarily far into the input
51 // before returning.
52 //
53 // (There are a few other methods that do not match this pattern.)
54 //
55 package regexp
56
57 import (
58 "bytes"
59 "io"
60 "regexp/syntax"
61 "strconv"
62 "strings"
63 "sync"
64 "unicode"
65 "unicode/utf8"
66 )
67
68 var debug = false
69
70 // Regexp is the representation of a compiled regular expression.
71 // The public interface is entirely through methods.
72 // A Regexp is safe for concurrent use by multiple goroutines.
73 type Regexp struct {
74 // read-only after Compile
75 expr string // as passed to Compile
76 prog *syntax.Prog // compiled program
77 prefix string // required prefix in unanchored matches
78 prefixBytes []byte // prefix, as a []byte
79 prefixComplete bool // prefix is the entire regexp
80 prefixRune rune // first rune in prefix
81 cond syntax.EmptyOp // empty-width conditions required at start of match
82 numSubexp int
83 subexpNames []string
84 longest bool
85
86 // cache of machines for running regexp
87 mu sync.Mutex
88 machine []*machine
89 }
90
91 // String returns the source text used to compile the regular expression.
92 func (re *Regexp) String() string {
93 return re.expr
94 }
95
96 // Compile parses a regular expression and returns, if successful,
97 // a Regexp object that can be used to match against text.
98 //
99 // When matching against text, the regexp returns a match that
100 // begins as early as possible in the input (leftmost), and among those
101 // it chooses the one that a backtracking search would have found first.
102 // This so-called leftmost-first matching is the same semantics
103 // that Perl, Python, and other implementations use, although this
104 // package implements it without the expense of backtracking.
105 // For POSIX leftmost-longest matching, see CompilePOSIX.
106 func Compile(expr string) (*Regexp, error) {
107 return compile(expr, syntax.Perl, false)
108 }
109
110 // CompilePOSIX is like Compile but restricts the regular expression
111 // to POSIX ERE (egrep) syntax and changes the match semantics to
112 // leftmost-longest.
113 //
114 // That is, when matching against text, the regexp returns a match that
115 // begins as early as possible in the input (leftmost), and among those
116 // it chooses a match that is as long as possible.
117 // This so-called leftmost-longest matching is the same semantics
118 // that early regular expression implementations used and that POSIX
119 // specifies.
120 //
121 // However, there can be multiple leftmost-longest matches, with different
122 // submatch choices, and here this package diverges from POSIX.
123 // Among the possible leftmost-longest matches, this package chooses
124 // the one that a backtracking search would have found first, while POSIX
125 // specifies that the match be chosen to maximize the length of the first
126 // subexpression, then the second, and so on from left to right.
127 // The POSIX rule is computationally prohibitive and not even well-defined.
128 // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
129 func CompilePOSIX(expr string) (*Regexp, error) {
130 return compile(expr, syntax.POSIX, true)
131 }
132
133 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
134 re, err := syntax.Parse(expr, mode)
135 if err != nil {
136 return nil, err
137 }
138 maxCap := re.MaxCap()
139 capNames := re.CapNames()
140
141 re = re.Simplify()
142 prog, err := syntax.Compile(re)
143 if err != nil {
144 return nil, err
145 }
146 regexp := &Regexp{
147 expr: expr,
148 prog: prog,
149 numSubexp: maxCap,
150 subexpNames: capNames,
151 cond: prog.StartCond(),
152 longest: longest,
153 }
154 regexp.prefix, regexp.prefixComplete = prog.Prefix()
155 if regexp.prefix != "" {
156 // TODO(rsc): Remove this allocation by adding
157 // IndexString to package bytes.
158 regexp.prefixBytes = []byte(regexp.prefix)
159 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
160 }
161 return regexp, nil
162 }
163
164 // get returns a machine to use for matching re.
165 // It uses the re's machine cache if possible, to avoid
166 // unnecessary allocation.
167 func (re *Regexp) get() *machine {
168 re.mu.Lock()
169 if n := len(re.machine); n > 0 {
170 z := re.machine[n-1]
171 re.machine = re.machine[:n-1]
172 re.mu.Unlock()
173 return z
174 }
175 re.mu.Unlock()
176 z := progMachine(re.prog)
177 z.re = re
178 return z
179 }
180
181 // put returns a machine to the re's machine cache.
182 // There is no attempt to limit the size of the cache, so it will
183 // grow to the maximum number of simultaneous matches
184 // run using re. (The cache empties when re gets garbage collected.)
185 func (re *Regexp) put(z *machine) {
186 re.mu.Lock()
187 re.machine = append(re.machine, z)
188 re.mu.Unlock()
189 }
190
191 // MustCompile is like Compile but panics if the expression cannot be parsed.
192 // It simplifies safe initialization of global variables holding compiled regular
193 // expressions.
194 func MustCompile(str string) *Regexp {
195 regexp, error := Compile(str)
196 if error != nil {
197 panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
198 }
199 return regexp
200 }
201
202 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
203 // It simplifies safe initialization of global variables holding compiled regular
204 // expressions.
205 func MustCompilePOSIX(str string) *Regexp {
206 regexp, error := CompilePOSIX(str)
207 if error != nil {
208 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
209 }
210 return regexp
211 }
212
213 func quote(s string) string {
214 if strconv.CanBackquote(s) {
215 return "`" + s + "`"
216 }
217 return strconv.Quote(s)
218 }
219
220 // NumSubexp returns the number of parenthesized subexpressions in this Regexp.
221 func (re *Regexp) NumSubexp() int {
222 return re.numSubexp
223 }
224
225 // SubexpNames returns the names of the parenthesized subexpressions
226 // in this Regexp. The name for the first sub-expression is names[1],
227 // so that if m is a match slice, the name for m[i] is SubexpNames()[i].
228 // Since the Regexp as a whole cannot be named, names[0] is always
229 // the empty string. The slice should not be modified.
230 func (re *Regexp) SubexpNames() []string {
231 return re.subexpNames
232 }
233
234 const endOfText rune = -1
235
236 // input abstracts different representations of the input text. It provides
237 // one-character lookahead.
238 type input interface {
239 step(pos int) (r rune, width int) // advance one rune
240 canCheckPrefix() bool // can we look ahead without losing info?
241 hasPrefix(re *Regexp) bool
242 index(re *Regexp, pos int) int
243 context(pos int) syntax.EmptyOp
244 }
245
246 // inputString scans a string.
247 type inputString struct {
248 str string
249 }
250
251 func (i *inputString) step(pos int) (rune, int) {
252 if pos < len(i.str) {
253 c := i.str[pos]
254 if c < utf8.RuneSelf {
255 return rune(c), 1
256 }
257 return utf8.DecodeRuneInString(i.str[pos:])
258 }
259 return endOfText, 0
260 }
261
262 func (i *inputString) canCheckPrefix() bool {
263 return true
264 }
265
266 func (i *inputString) hasPrefix(re *Regexp) bool {
267 return strings.HasPrefix(i.str, re.prefix)
268 }
269
270 func (i *inputString) index(re *Regexp, pos int) int {
271 return strings.Index(i.str[pos:], re.prefix)
272 }
273
274 func (i *inputString) context(pos int) syntax.EmptyOp {
275 r1, r2 := endOfText, endOfText
276 if pos > 0 && pos <= len(i.str) {
277 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
278 }
279 if pos < len(i.str) {
280 r2, _ = utf8.DecodeRuneInString(i.str[pos:])
281 }
282 return syntax.EmptyOpContext(r1, r2)
283 }
284
285 // inputBytes scans a byte slice.
286 type inputBytes struct {
287 str []byte
288 }
289
290 func (i *inputBytes) step(pos int) (rune, int) {
291 if pos < len(i.str) {
292 c := i.str[pos]
293 if c < utf8.RuneSelf {
294 return rune(c), 1
295 }
296 return utf8.DecodeRune(i.str[pos:])
297 }
298 return endOfText, 0
299 }
300
301 func (i *inputBytes) canCheckPrefix() bool {
302 return true
303 }
304
305 func (i *inputBytes) hasPrefix(re *Regexp) bool {
306 return bytes.HasPrefix(i.str, re.prefixBytes)
307 }
308
309 func (i *inputBytes) index(re *Regexp, pos int) int {
310 return bytes.Index(i.str[pos:], re.prefixBytes)
311 }
312
313 func (i *inputBytes) context(pos int) syntax.EmptyOp {
314 r1, r2 := endOfText, endOfText
315 if pos > 0 && pos <= len(i.str) {
316 r1, _ = utf8.DecodeLastRune(i.str[:pos])
317 }
318 if pos < len(i.str) {
319 r2, _ = utf8.DecodeRune(i.str[pos:])
320 }
321 return syntax.EmptyOpContext(r1, r2)
322 }
323
324 // inputReader scans a RuneReader.
325 type inputReader struct {
326 r io.RuneReader
327 atEOT bool
328 pos int
329 }
330
331 func (i *inputReader) step(pos int) (rune, int) {
332 if !i.atEOT && pos != i.pos {
333 return endOfText, 0
334
335 }
336 r, w, err := i.r.ReadRune()
337 if err != nil {
338 i.atEOT = true
339 return endOfText, 0
340 }
341 i.pos += w
342 return r, w
343 }
344
345 func (i *inputReader) canCheckPrefix() bool {
346 return false
347 }
348
349 func (i *inputReader) hasPrefix(re *Regexp) bool {
350 return false
351 }
352
353 func (i *inputReader) index(re *Regexp, pos int) int {
354 return -1
355 }
356
357 func (i *inputReader) context(pos int) syntax.EmptyOp {
358 return 0
359 }
360
361 // LiteralPrefix returns a literal string that must begin any match
362 // of the regular expression re. It returns the boolean true if the
363 // literal string comprises the entire regular expression.
364 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
365 return re.prefix, re.prefixComplete
366 }
367
368 // MatchReader returns whether the Regexp matches the text read by the
369 // RuneReader. The return value is a boolean: true for match, false for no
370 // match.
371 func (re *Regexp) MatchReader(r io.RuneReader) bool {
372 return re.doExecute(r, nil, "", 0, 0) != nil
373 }
374
375 // MatchString returns whether the Regexp matches the string s.
376 // The return value is a boolean: true for match, false for no match.
377 func (re *Regexp) MatchString(s string) bool {
378 return re.doExecute(nil, nil, s, 0, 0) != nil
379 }
380
381 // Match returns whether the Regexp matches the byte slice b.
382 // The return value is a boolean: true for match, false for no match.
383 func (re *Regexp) Match(b []byte) bool {
384 return re.doExecute(nil, b, "", 0, 0) != nil
385 }
386
387 // MatchReader checks whether a textual regular expression matches the text
388 // read by the RuneReader. More complicated queries need to use Compile and
389 // the full Regexp interface.
390 func MatchReader(pattern string, r io.RuneReader) (matched bool, error error) {
391 re, err := Compile(pattern)
392 if err != nil {
393 return false, err
394 }
395 return re.MatchReader(r), nil
396 }
397
398 // MatchString checks whether a textual regular expression
399 // matches a string. More complicated queries need
400 // to use Compile and the full Regexp interface.
401 func MatchString(pattern string, s string) (matched bool, error error) {
402 re, err := Compile(pattern)
403 if err != nil {
404 return false, err
405 }
406 return re.MatchString(s), nil
407 }
408
409 // Match checks whether a textual regular expression
410 // matches a byte slice. More complicated queries need
411 // to use Compile and the full Regexp interface.
412 func Match(pattern string, b []byte) (matched bool, error error) {
413 re, err := Compile(pattern)
414 if err != nil {
415 return false, err
416 }
417 return re.Match(b), nil
418 }
419
420 // ReplaceAllString returns a copy of src, replacing matches of the Regexp
421 // with the replacement string repl. Inside repl, $ signs are interpreted as
422 // in Expand, so for instance $1 represents the text of the first submatch.
423 func (re *Regexp) ReplaceAllString(src, repl string) string {
424 n := 2
425 if strings.Index(repl, "$") >= 0 {
426 n = 2 * (re.numSubexp + 1)
427 }
428 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
429 return re.expand(dst, repl, nil, src, match)
430 })
431 return string(b)
432 }
433
434 // ReplaceAllStringLiteral returns a copy of src, replacing matches of the Regexp
435 // with the replacement string repl. The replacement repl is substituted directly,
436 // without using Expand.
437 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
438 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
439 return append(dst, repl...)
440 }))
441 }
442
443 // ReplaceAllStringFunc returns a copy of src in which all matches of the
444 // Regexp have been replaced by the return value of of function repl applied
445 // to the matched substring. The replacement returned by repl is substituted
446 // directly, without using Expand.
447 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
448 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
449 return append(dst, repl(src[match[0]:match[1]])...)
450 })
451 return string(b)
452 }
453
454 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
455 lastMatchEnd := 0 // end position of the most recent match
456 searchPos := 0 // position where we next look for a match
457 var buf []byte
458 var endPos int
459 if bsrc != nil {
460 endPos = len(bsrc)
461 } else {
462 endPos = len(src)
463 }
464 for searchPos <= endPos {
465 a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
466 if len(a) == 0 {
467 break // no more matches
468 }
469
470 // Copy the unmatched characters before this match.
471 if bsrc != nil {
472 buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
473 } else {
474 buf = append(buf, src[lastMatchEnd:a[0]]...)
475 }
476
477 // Now insert a copy of the replacement string, but not for a
478 // match of the empty string immediately after another match.
479 // (Otherwise, we get double replacement for patterns that
480 // match both empty and nonempty strings.)
481 if a[1] > lastMatchEnd || a[0] == 0 {
482 buf = repl(buf, a)
483 }
484 lastMatchEnd = a[1]
485
486 // Advance past this match; always advance at least one character.
487 var width int
488 if bsrc != nil {
489 _, width = utf8.DecodeRune(bsrc[searchPos:])
490 } else {
491 _, width = utf8.DecodeRuneInString(src[searchPos:])
492 }
493 if searchPos+width > a[1] {
494 searchPos += width
495 } else if searchPos+1 > a[1] {
496 // This clause is only needed at the end of the input
497 // string. In that case, DecodeRuneInString returns width=0.
498 searchPos++
499 } else {
500 searchPos = a[1]
501 }
502 }
503
504 // Copy the unmatched characters after the last match.
505 if bsrc != nil {
506 buf = append(buf, bsrc[lastMatchEnd:]...)
507 } else {
508 buf = append(buf, src[lastMatchEnd:]...)
509 }
510
511 return buf
512 }
513
514 // ReplaceAll returns a copy of src, replacing matches of the Regexp
515 // with the replacement string repl. Inside repl, $ signs are interpreted as
516 // in Expand, so for instance $1 represents the text of the first submatch.
517 func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
518 n := 2
519 if bytes.IndexByte(repl, '$') >= 0 {
520 n = 2 * (re.numSubexp + 1)
521 }
522 srepl := ""
523 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
524 if len(srepl) != len(repl) {
525 srepl = string(repl)
526 }
527 return re.expand(dst, srepl, src, "", match)
528 })
529 return b
530 }
531
532 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
533 // with the replacement bytes repl. The replacement repl is substituted directly,
534 // without using Expand.
535 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
536 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
537 return append(dst, repl...)
538 })
539 }
540
541 // ReplaceAllFunc returns a copy of src in which all matches of the
542 // Regexp have been replaced by the return value of of function repl applied
543 // to the matched byte slice. The replacement returned by repl is substituted
544 // directly, without using Expand.
545 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
546 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
547 return append(dst, repl(src[match[0]:match[1]])...)
548 })
549 }
550
551 var specialBytes = []byte(`\.+*?()|[]{}^$`)
552
553 func special(b byte) bool {
554 return bytes.IndexByte(specialBytes, b) >= 0
555 }
556
557 // QuoteMeta returns a string that quotes all regular expression metacharacters
558 // inside the argument text; the returned string is a regular expression matching
559 // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
560 func QuoteMeta(s string) string {
561 b := make([]byte, 2*len(s))
562
563 // A byte loop is correct because all metacharacters are ASCII.
564 j := 0
565 for i := 0; i < len(s); i++ {
566 if special(s[i]) {
567 b[j] = '\\'
568 j++
569 }
570 b[j] = s[i]
571 j++
572 }
573 return string(b[0:j])
574 }
575
576 // The number of capture values in the program may correspond
577 // to fewer capturing expressions than are in the regexp.
578 // For example, "(a){0}" turns into an empty program, so the
579 // maximum capture in the program is 0 but we need to return
580 // an expression for \1. Pad appends -1s to the slice a as needed.
581 func (re *Regexp) pad(a []int) []int {
582 if a == nil {
583 // No match.
584 return nil
585 }
586 n := (1 + re.numSubexp) * 2
587 for len(a) < n {
588 a = append(a, -1)
589 }
590 return a
591 }
592
593 // Find matches in slice b if b is non-nil, otherwise find matches in string s.
594 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
595 var end int
596 if b == nil {
597 end = len(s)
598 } else {
599 end = len(b)
600 }
601
602 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
603 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
604 if len(matches) == 0 {
605 break
606 }
607
608 accept := true
609 if matches[1] == pos {
610 // We've found an empty match.
611 if matches[0] == prevMatchEnd {
612 // We don't allow an empty match right
613 // after a previous match, so ignore it.
614 accept = false
615 }
616 var width int
617 // TODO: use step()
618 if b == nil {
619 _, width = utf8.DecodeRuneInString(s[pos:end])
620 } else {
621 _, width = utf8.DecodeRune(b[pos:end])
622 }
623 if width > 0 {
624 pos += width
625 } else {
626 pos = end + 1
627 }
628 } else {
629 pos = matches[1]
630 }
631 prevMatchEnd = matches[1]
632
633 if accept {
634 deliver(re.pad(matches))
635 i++
636 }
637 }
638 }
639
640 // Find returns a slice holding the text of the leftmost match in b of the regular expression.
641 // A return value of nil indicates no match.
642 func (re *Regexp) Find(b []byte) []byte {
643 a := re.doExecute(nil, b, "", 0, 2)
644 if a == nil {
645 return nil
646 }
647 return b[a[0]:a[1]]
648 }
649
650 // FindIndex returns a two-element slice of integers defining the location of
651 // the leftmost match in b of the regular expression. The match itself is at
652 // b[loc[0]:loc[1]].
653 // A return value of nil indicates no match.
654 func (re *Regexp) FindIndex(b []byte) (loc []int) {
655 a := re.doExecute(nil, b, "", 0, 2)
656 if a == nil {
657 return nil
658 }
659 return a[0:2]
660 }
661
662 // FindString returns a string holding the text of the leftmost match in s of the regular
663 // expression. If there is no match, the return value is an empty string,
664 // but it will also be empty if the regular expression successfully matches
665 // an empty string. Use FindStringIndex or FindStringSubmatch if it is
666 // necessary to distinguish these cases.
667 func (re *Regexp) FindString(s string) string {
668 a := re.doExecute(nil, nil, s, 0, 2)
669 if a == nil {
670 return ""
671 }
672 return s[a[0]:a[1]]
673 }
674
675 // FindStringIndex returns a two-element slice of integers defining the
676 // location of the leftmost match in s of the regular expression. The match
677 // itself is at s[loc[0]:loc[1]].
678 // A return value of nil indicates no match.
679 func (re *Regexp) FindStringIndex(s string) (loc []int) {
680 a := re.doExecute(nil, nil, s, 0, 2)
681 if a == nil {
682 return nil
683 }
684 return a[0:2]
685 }
686
687 // FindReaderIndex returns a two-element slice of integers defining the
688 // location of the leftmost match of the regular expression in text read from
689 // the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return
690 // value of nil indicates no match.
691 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
692 a := re.doExecute(r, nil, "", 0, 2)
693 if a == nil {
694 return nil
695 }
696 return a[0:2]
697 }
698
699 // FindSubmatch returns a slice of slices holding the text of the leftmost
700 // match of the regular expression in b and the matches, if any, of its
701 // subexpressions, as defined by the 'Submatch' descriptions in the package
702 // comment.
703 // A return value of nil indicates no match.
704 func (re *Regexp) FindSubmatch(b []byte) [][]byte {
705 a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
706 if a == nil {
707 return nil
708 }
709 ret := make([][]byte, 1+re.numSubexp)
710 for i := range ret {
711 if 2*i < len(a) && a[2*i] >= 0 {
712 ret[i] = b[a[2*i]:a[2*i+1]]
713 }
714 }
715 return ret
716 }
717
718 // Expand appends template to dst and returns the result; during the
719 // append, Expand replaces variables in the template with corresponding
720 // matches drawn from src. The match slice should have been returned by
721 // FindSubmatchIndex.
722 //
723 // In the template, a variable is denoted by a substring of the form
724 // $name or ${name}, where name is a non-empty sequence of letters,
725 // digits, and underscores. A purely numeric name like $1 refers to
726 // the submatch with the corresponding index; other names refer to
727 // capturing parentheses named with the (?P<name>...) syntax. A
728 // reference to an out of range or unmatched index or a name that is not
729 // present in the regular expression is replaced with an empty string.
730 //
731 // In the $name form, name is taken to be as long as possible: $1x is
732 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
733 //
734 // To insert a literal $ in the output, use $$ in the template.
735 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
736 return re.expand(dst, string(template), src, "", match)
737 }
738
739 // ExpandString is like Expand but the template and source are strings.
740 // It appends to and returns a byte slice in order to give the calling
741 // code control over allocation.
742 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
743 return re.expand(dst, template, nil, src, match)
744 }
745
746 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
747 for len(template) > 0 {
748 i := strings.Index(template, "$")
749 if i < 0 {
750 break
751 }
752 dst = append(dst, template[:i]...)
753 template = template[i:]
754 if len(template) > 1 && template[1] == '$' {
755 // Treat $$ as $.
756 dst = append(dst, '$')
757 template = template[2:]
758 continue
759 }
760 name, num, rest, ok := extract(template)
761 if !ok {
762 // Malformed; treat $ as raw text.
763 dst = append(dst, '$')
764 template = template[1:]
765 continue
766 }
767 template = rest
768 if num >= 0 {
769 if 2*num+1 < len(match) {
770 if bsrc != nil {
771 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
772 } else {
773 dst = append(dst, src[match[2*num]:match[2*num+1]]...)
774 }
775 }
776 } else {
777 for i, namei := range re.subexpNames {
778 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
779 if bsrc != nil {
780 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
781 } else {
782 dst = append(dst, src[match[2*i]:match[2*i+1]]...)
783 }
784 break
785 }
786 }
787 }
788 }
789 dst = append(dst, template...)
790 return dst
791 }
792
793 // extract returns the name from a leading "$name" or "${name}" in str.
794 // If it is a number, extract returns num set to that number; otherwise num = -1.
795 func extract(str string) (name string, num int, rest string, ok bool) {
796 if len(str) < 2 || str[0] != '$' {
797 return
798 }
799 brace := false
800 if str[1] == '{' {
801 brace = true
802 str = str[2:]
803 } else {
804 str = str[1:]
805 }
806 i := 0
807 for i < len(str) {
808 rune, size := utf8.DecodeRuneInString(str[i:])
809 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
810 break
811 }
812 i += size
813 }
814 if i == 0 {
815 // empty name is not okay
816 return
817 }
818 name = str[:i]
819 if brace {
820 if i >= len(str) || str[i] != '}' {
821 // missing closing brace
822 return
823 }
824 i++
825 }
826
827 // Parse number.
828 num = 0
829 for i := 0; i < len(name); i++ {
830 if name[i] < '0' || '9' < name[i] || num >= 1e8 {
831 num = -1
832 break
833 }
834 num = num*10 + int(name[i]) - '0'
835 }
836 // Disallow leading zeros.
837 if name[0] == '0' && len(name) > 1 {
838 num = -1
839 }
840
841 rest = str[i:]
842 ok = true
843 return
844 }
845
846 // FindSubmatchIndex returns a slice holding the index pairs identifying the
847 // leftmost match of the regular expression in b and the matches, if any, of
848 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
849 // in the package comment.
850 // A return value of nil indicates no match.
851 func (re *Regexp) FindSubmatchIndex(b []byte) []int {
852 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
853 }
854
855 // FindStringSubmatch returns a slice of strings holding the text of the
856 // leftmost match of the regular expression in s and the matches, if any, of
857 // its subexpressions, as defined by the 'Submatch' description in the
858 // package comment.
859 // A return value of nil indicates no match.
860 func (re *Regexp) FindStringSubmatch(s string) []string {
861 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
862 if a == nil {
863 return nil
864 }
865 ret := make([]string, 1+re.numSubexp)
866 for i := range ret {
867 if 2*i < len(a) && a[2*i] >= 0 {
868 ret[i] = s[a[2*i]:a[2*i+1]]
869 }
870 }
871 return ret
872 }
873
874 // FindStringSubmatchIndex returns a slice holding the index pairs
875 // identifying the leftmost match of the regular expression in s and the
876 // matches, if any, of its subexpressions, as defined by the 'Submatch' and
877 // 'Index' descriptions in the package comment.
878 // A return value of nil indicates no match.
879 func (re *Regexp) FindStringSubmatchIndex(s string) []int {
880 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
881 }
882
883 // FindReaderSubmatchIndex returns a slice holding the index pairs
884 // identifying the leftmost match of the regular expression of text read by
885 // the RuneReader, and the matches, if any, of its subexpressions, as defined
886 // by the 'Submatch' and 'Index' descriptions in the package comment. A
887 // return value of nil indicates no match.
888 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
889 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
890 }
891
892 const startSize = 10 // The size at which to start a slice in the 'All' routines.
893
894 // FindAll is the 'All' version of Find; it returns a slice of all successive
895 // matches of the expression, as defined by the 'All' description in the
896 // package comment.
897 // A return value of nil indicates no match.
898 func (re *Regexp) FindAll(b []byte, n int) [][]byte {
899 if n < 0 {
900 n = len(b) + 1
901 }
902 result := make([][]byte, 0, startSize)
903 re.allMatches("", b, n, func(match []int) {
904 result = append(result, b[match[0]:match[1]])
905 })
906 if len(result) == 0 {
907 return nil
908 }
909 return result
910 }
911
912 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
913 // successive matches of the expression, as defined by the 'All' description
914 // in the package comment.
915 // A return value of nil indicates no match.
916 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
917 if n < 0 {
918 n = len(b) + 1
919 }
920 result := make([][]int, 0, startSize)
921 re.allMatches("", b, n, func(match []int) {
922 result = append(result, match[0:2])
923 })
924 if len(result) == 0 {
925 return nil
926 }
927 return result
928 }
929
930 // FindAllString is the 'All' version of FindString; it returns a slice of all
931 // successive matches of the expression, as defined by the 'All' description
932 // in the package comment.
933 // A return value of nil indicates no match.
934 func (re *Regexp) FindAllString(s string, n int) []string {
935 if n < 0 {
936 n = len(s) + 1
937 }
938 result := make([]string, 0, startSize)
939 re.allMatches(s, nil, n, func(match []int) {
940 result = append(result, s[match[0]:match[1]])
941 })
942 if len(result) == 0 {
943 return nil
944 }
945 return result
946 }
947
948 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
949 // slice of all successive matches of the expression, as defined by the 'All'
950 // description in the package comment.
951 // A return value of nil indicates no match.
952 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
953 if n < 0 {
954 n = len(s) + 1
955 }
956 result := make([][]int, 0, startSize)
957 re.allMatches(s, nil, n, func(match []int) {
958 result = append(result, match[0:2])
959 })
960 if len(result) == 0 {
961 return nil
962 }
963 return result
964 }
965
966 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
967 // of all successive matches of the expression, as defined by the 'All'
968 // description in the package comment.
969 // A return value of nil indicates no match.
970 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
971 if n < 0 {
972 n = len(b) + 1
973 }
974 result := make([][][]byte, 0, startSize)
975 re.allMatches("", b, n, func(match []int) {
976 slice := make([][]byte, len(match)/2)
977 for j := range slice {
978 if match[2*j] >= 0 {
979 slice[j] = b[match[2*j]:match[2*j+1]]
980 }
981 }
982 result = append(result, slice)
983 })
984 if len(result) == 0 {
985 return nil
986 }
987 return result
988 }
989
990 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
991 // a slice of all successive matches of the expression, as defined by the
992 // 'All' description in the package comment.
993 // A return value of nil indicates no match.
994 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
995 if n < 0 {
996 n = len(b) + 1
997 }
998 result := make([][]int, 0, startSize)
999 re.allMatches("", b, n, func(match []int) {
1000 result = append(result, match)
1001 })
1002 if len(result) == 0 {
1003 return nil
1004 }
1005 return result
1006 }
1007
1008 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
1009 // returns a slice of all successive matches of the expression, as defined by
1010 // the 'All' description in the package comment.
1011 // A return value of nil indicates no match.
1012 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
1013 if n < 0 {
1014 n = len(s) + 1
1015 }
1016 result := make([][]string, 0, startSize)
1017 re.allMatches(s, nil, n, func(match []int) {
1018 slice := make([]string, len(match)/2)
1019 for j := range slice {
1020 if match[2*j] >= 0 {
1021 slice[j] = s[match[2*j]:match[2*j+1]]
1022 }
1023 }
1024 result = append(result, slice)
1025 })
1026 if len(result) == 0 {
1027 return nil
1028 }
1029 return result
1030 }
1031
1032 // FindAllStringSubmatchIndex is the 'All' version of
1033 // FindStringSubmatchIndex; it returns a slice of all successive matches of
1034 // the expression, as defined by the 'All' description in the package
1035 // comment.
1036 // A return value of nil indicates no match.
1037 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
1038 if n < 0 {
1039 n = len(s) + 1
1040 }
1041 result := make([][]int, 0, startSize)
1042 re.allMatches(s, nil, n, func(match []int) {
1043 result = append(result, match)
1044 })
1045 if len(result) == 0 {
1046 return nil
1047 }
1048 return result
1049 }