src/pkg/regexp/regexp.go - The Go Programming Language

Golang

previous page next page
Source file src/pkg/regexp/regexp.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package regexp implements regular expression search.
     6	//
     7	// The syntax of the regular expressions accepted is the same
     8	// general syntax used by Perl, Python, and other languages.
     9	// More precisely, it is the syntax accepted by RE2 and described at
    10	// http://code.google.com/p/re2/wiki/Syntax, except for \C.
    11	//
    12	// All characters are UTF-8-encoded code points.
    13	//
    14	// There are 16 methods of Regexp that match a regular expression and identify
    15	// the matched text.  Their names are matched by this regular expression:
    16	//
    17	//	Find(All)?(String)?(Submatch)?(Index)?
    18	//
    19	// If 'All' is present, the routine matches successive non-overlapping
    20	// matches of the entire expression.  Empty matches abutting a preceding
    21	// match are ignored.  The return value is a slice containing the successive
    22	// return values of the corresponding non-'All' routine.  These routines take
    23	// an extra integer argument, n; if n >= 0, the function returns at most n
    24	// matches/submatches.
    25	//
    26	// If 'String' is present, the argument is a string; otherwise it is a slice
    27	// of bytes; return values are adjusted as appropriate.
    28	//
    29	// If 'Submatch' is present, the return value is a slice identifying the
    30	// successive submatches of the expression.  Submatches are matches of
    31	// parenthesized subexpressions within the regular expression, numbered from
    32	// left to right in order of opening parenthesis.  Submatch 0 is the match of
    33	// the entire expression, submatch 1 the match of the first parenthesized
    34	// subexpression, and so on.
    35	//
    36	// If 'Index' is present, matches and submatches are identified by byte index
    37	// pairs within the input string: result[2*n:2*n+1] identifies the indexes of
    38	// the nth submatch.  The pair for n==0 identifies the match of the entire
    39	// expression.  If 'Index' is not present, the match is identified by the
    40	// text of the match/submatch.  If an index is negative, it means that
    41	// subexpression did not match any string in the input.
    42	//
    43	// There is also a subset of the methods that can be applied to text read
    44	// from a RuneReader:
    45	//
    46	//	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
    47	//
    48	// This set may grow.  Note that regular expression matches may need to
    49	// examine text beyond the text returned by a match, so the methods that
    50	// match text from a RuneReader may read arbitrarily far into the input
    51	// before returning.
    52	//
    53	// (There are a few other methods that do not match this pattern.)
    54	//
    55	package regexp
    56	
    57	import (
    58		"bytes"
    59		"io"
    60		"regexp/syntax"
    61		"strconv"
    62		"strings"
    63		"sync"
    64		"unicode"
    65		"unicode/utf8"
    66	)
    67	
    68	var debug = false
    69	
    70	// Regexp is the representation of a compiled regular expression.
    71	// The public interface is entirely through methods.
    72	// A Regexp is safe for concurrent use by multiple goroutines.
    73	type Regexp struct {
    74		// read-only after Compile
    75		expr           string         // as passed to Compile
    76		prog           *syntax.Prog   // compiled program
    77		prefix         string         // required prefix in unanchored matches
    78		prefixBytes    []byte         // prefix, as a []byte
    79		prefixComplete bool           // prefix is the entire regexp
    80		prefixRune     rune           // first rune in prefix
    81		cond           syntax.EmptyOp // empty-width conditions required at start of match
    82		numSubexp      int
    83		subexpNames    []string
    84		longest        bool
    85	
    86		// cache of machines for running regexp
    87		mu      sync.Mutex
    88		machine []*machine
    89	}
    90	
    91	// String returns the source text used to compile the regular expression.
    92	func (re *Regexp) String() string {
    93		return re.expr
    94	}
    95	
    96	// Compile parses a regular expression and returns, if successful,
    97	// a Regexp object that can be used to match against text.
    98	//
    99	// When matching against text, the regexp returns a match that
   100	// begins as early as possible in the input (leftmost), and among those
   101	// it chooses the one that a backtracking search would have found first.
   102	// This so-called leftmost-first matching is the same semantics
   103	// that Perl, Python, and other implementations use, although this
   104	// package implements it without the expense of backtracking.
   105	// For POSIX leftmost-longest matching, see CompilePOSIX.
   106	func Compile(expr string) (*Regexp, error) {
   107		return compile(expr, syntax.Perl, false)
   108	}
   109	
   110	// CompilePOSIX is like Compile but restricts the regular expression
   111	// to POSIX ERE (egrep) syntax and changes the match semantics to
   112	// leftmost-longest.
   113	//
   114	// That is, when matching against text, the regexp returns a match that
   115	// begins as early as possible in the input (leftmost), and among those
   116	// it chooses a match that is as long as possible.
   117	// This so-called leftmost-longest matching is the same semantics
   118	// that early regular expression implementations used and that POSIX
   119	// specifies.
   120	//
   121	// However, there can be multiple leftmost-longest matches, with different
   122	// submatch choices, and here this package diverges from POSIX.
   123	// Among the possible leftmost-longest matches, this package chooses
   124	// the one that a backtracking search would have found first, while POSIX
   125	// specifies that the match be chosen to maximize the length of the first
   126	// subexpression, then the second, and so on from left to right.
   127	// The POSIX rule is computationally prohibitive and not even well-defined.
   128	// See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
   129	func CompilePOSIX(expr string) (*Regexp, error) {
   130		return compile(expr, syntax.POSIX, true)
   131	}
   132	
   133	func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
   134		re, err := syntax.Parse(expr, mode)
   135		if err != nil {
   136			return nil, err
   137		}
   138		maxCap := re.MaxCap()
   139		capNames := re.CapNames()
   140	
   141		re = re.Simplify()
   142		prog, err := syntax.Compile(re)
   143		if err != nil {
   144			return nil, err
   145		}
   146		regexp := &Regexp{
   147			expr:        expr,
   148			prog:        prog,
   149			numSubexp:   maxCap,
   150			subexpNames: capNames,
   151			cond:        prog.StartCond(),
   152			longest:     longest,
   153		}
   154		regexp.prefix, regexp.prefixComplete = prog.Prefix()
   155		if regexp.prefix != "" {
   156			// TODO(rsc): Remove this allocation by adding
   157			// IndexString to package bytes.
   158			regexp.prefixBytes = []byte(regexp.prefix)
   159			regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
   160		}
   161		return regexp, nil
   162	}
   163	
   164	// get returns a machine to use for matching re.
   165	// It uses the re's machine cache if possible, to avoid
   166	// unnecessary allocation.
   167	func (re *Regexp) get() *machine {
   168		re.mu.Lock()
   169		if n := len(re.machine); n > 0 {
   170			z := re.machine[n-1]
   171			re.machine = re.machine[:n-1]
   172			re.mu.Unlock()
   173			return z
   174		}
   175		re.mu.Unlock()
   176		z := progMachine(re.prog)
   177		z.re = re
   178		return z
   179	}
   180	
   181	// put returns a machine to the re's machine cache.
   182	// There is no attempt to limit the size of the cache, so it will
   183	// grow to the maximum number of simultaneous matches
   184	// run using re.  (The cache empties when re gets garbage collected.)
   185	func (re *Regexp) put(z *machine) {
   186		re.mu.Lock()
   187		re.machine = append(re.machine, z)
   188		re.mu.Unlock()
   189	}
   190	
   191	// MustCompile is like Compile but panics if the expression cannot be parsed.
   192	// It simplifies safe initialization of global variables holding compiled regular
   193	// expressions.
   194	func MustCompile(str string) *Regexp {
   195		regexp, error := Compile(str)
   196		if error != nil {
   197			panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
   198		}
   199		return regexp
   200	}
   201	
   202	// MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
   203	// It simplifies safe initialization of global variables holding compiled regular
   204	// expressions.
   205	func MustCompilePOSIX(str string) *Regexp {
   206		regexp, error := CompilePOSIX(str)
   207		if error != nil {
   208			panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
   209		}
   210		return regexp
   211	}
   212	
   213	func quote(s string) string {
   214		if strconv.CanBackquote(s) {
   215			return "`" + s + "`"
   216		}
   217		return strconv.Quote(s)
   218	}
   219	
   220	// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
   221	func (re *Regexp) NumSubexp() int {
   222		return re.numSubexp
   223	}
   224	
   225	// SubexpNames returns the names of the parenthesized subexpressions
   226	// in this Regexp.  The name for the first sub-expression is names[1],
   227	// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
   228	// Since the Regexp as a whole cannot be named, names[0] is always
   229	// the empty string.  The slice should not be modified.
   230	func (re *Regexp) SubexpNames() []string {
   231		return re.subexpNames
   232	}
   233	
   234	const endOfText rune = -1
   235	
   236	// input abstracts different representations of the input text. It provides
   237	// one-character lookahead.
   238	type input interface {
   239		step(pos int) (r rune, width int) // advance one rune
   240		canCheckPrefix() bool             // can we look ahead without losing info?
   241		hasPrefix(re *Regexp) bool
   242		index(re *Regexp, pos int) int
   243		context(pos int) syntax.EmptyOp
   244	}
   245	
   246	// inputString scans a string.
   247	type inputString struct {
   248		str string
   249	}
   250	
   251	func (i *inputString) step(pos int) (rune, int) {
   252		if pos < len(i.str) {
   253			c := i.str[pos]
   254			if c < utf8.RuneSelf {
   255				return rune(c), 1
   256			}
   257			return utf8.DecodeRuneInString(i.str[pos:])
   258		}
   259		return endOfText, 0
   260	}
   261	
   262	func (i *inputString) canCheckPrefix() bool {
   263		return true
   264	}
   265	
   266	func (i *inputString) hasPrefix(re *Regexp) bool {
   267		return strings.HasPrefix(i.str, re.prefix)
   268	}
   269	
   270	func (i *inputString) index(re *Regexp, pos int) int {
   271		return strings.Index(i.str[pos:], re.prefix)
   272	}
   273	
   274	func (i *inputString) context(pos int) syntax.EmptyOp {
   275		r1, r2 := endOfText, endOfText
   276		if pos > 0 && pos <= len(i.str) {
   277			r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
   278		}
   279		if pos < len(i.str) {
   280			r2, _ = utf8.DecodeRuneInString(i.str[pos:])
   281		}
   282		return syntax.EmptyOpContext(r1, r2)
   283	}
   284	
   285	// inputBytes scans a byte slice.
   286	type inputBytes struct {
   287		str []byte
   288	}
   289	
   290	func (i *inputBytes) step(pos int) (rune, int) {
   291		if pos < len(i.str) {
   292			c := i.str[pos]
   293			if c < utf8.RuneSelf {
   294				return rune(c), 1
   295			}
   296			return utf8.DecodeRune(i.str[pos:])
   297		}
   298		return endOfText, 0
   299	}
   300	
   301	func (i *inputBytes) canCheckPrefix() bool {
   302		return true
   303	}
   304	
   305	func (i *inputBytes) hasPrefix(re *Regexp) bool {
   306		return bytes.HasPrefix(i.str, re.prefixBytes)
   307	}
   308	
   309	func (i *inputBytes) index(re *Regexp, pos int) int {
   310		return bytes.Index(i.str[pos:], re.prefixBytes)
   311	}
   312	
   313	func (i *inputBytes) context(pos int) syntax.EmptyOp {
   314		r1, r2 := endOfText, endOfText
   315		if pos > 0 && pos <= len(i.str) {
   316			r1, _ = utf8.DecodeLastRune(i.str[:pos])
   317		}
   318		if pos < len(i.str) {
   319			r2, _ = utf8.DecodeRune(i.str[pos:])
   320		}
   321		return syntax.EmptyOpContext(r1, r2)
   322	}
   323	
   324	// inputReader scans a RuneReader.
   325	type inputReader struct {
   326		r     io.RuneReader
   327		atEOT bool
   328		pos   int
   329	}
   330	
   331	func (i *inputReader) step(pos int) (rune, int) {
   332		if !i.atEOT && pos != i.pos {
   333			return endOfText, 0
   334	
   335		}
   336		r, w, err := i.r.ReadRune()
   337		if err != nil {
   338			i.atEOT = true
   339			return endOfText, 0
   340		}
   341		i.pos += w
   342		return r, w
   343	}
   344	
   345	func (i *inputReader) canCheckPrefix() bool {
   346		return false
   347	}
   348	
   349	func (i *inputReader) hasPrefix(re *Regexp) bool {
   350		return false
   351	}
   352	
   353	func (i *inputReader) index(re *Regexp, pos int) int {
   354		return -1
   355	}
   356	
   357	func (i *inputReader) context(pos int) syntax.EmptyOp {
   358		return 0
   359	}
   360	
   361	// LiteralPrefix returns a literal string that must begin any match
   362	// of the regular expression re.  It returns the boolean true if the
   363	// literal string comprises the entire regular expression.
   364	func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
   365		return re.prefix, re.prefixComplete
   366	}
   367	
   368	// MatchReader returns whether the Regexp matches the text read by the
   369	// RuneReader.  The return value is a boolean: true for match, false for no
   370	// match.
   371	func (re *Regexp) MatchReader(r io.RuneReader) bool {
   372		return re.doExecute(r, nil, "", 0, 0) != nil
   373	}
   374	
   375	// MatchString returns whether the Regexp matches the string s.
   376	// The return value is a boolean: true for match, false for no match.
   377	func (re *Regexp) MatchString(s string) bool {
   378		return re.doExecute(nil, nil, s, 0, 0) != nil
   379	}
   380	
   381	// Match returns whether the Regexp matches the byte slice b.
   382	// The return value is a boolean: true for match, false for no match.
   383	func (re *Regexp) Match(b []byte) bool {
   384		return re.doExecute(nil, b, "", 0, 0) != nil
   385	}
   386	
   387	// MatchReader checks whether a textual regular expression matches the text
   388	// read by the RuneReader.  More complicated queries need to use Compile and
   389	// the full Regexp interface.
   390	func MatchReader(pattern string, r io.RuneReader) (matched bool, error error) {
   391		re, err := Compile(pattern)
   392		if err != nil {
   393			return false, err
   394		}
   395		return re.MatchReader(r), nil
   396	}
   397	
   398	// MatchString checks whether a textual regular expression
   399	// matches a string.  More complicated queries need
   400	// to use Compile and the full Regexp interface.
   401	func MatchString(pattern string, s string) (matched bool, error error) {
   402		re, err := Compile(pattern)
   403		if err != nil {
   404			return false, err
   405		}
   406		return re.MatchString(s), nil
   407	}
   408	
   409	// Match checks whether a textual regular expression
   410	// matches a byte slice.  More complicated queries need
   411	// to use Compile and the full Regexp interface.
   412	func Match(pattern string, b []byte) (matched bool, error error) {
   413		re, err := Compile(pattern)
   414		if err != nil {
   415			return false, err
   416		}
   417		return re.Match(b), nil
   418	}
   419	
   420	// ReplaceAllString returns a copy of src, replacing matches of the Regexp
   421	// with the replacement string repl.  Inside repl, $ signs are interpreted as
   422	// in Expand, so for instance $1 represents the text of the first submatch.
   423	func (re *Regexp) ReplaceAllString(src, repl string) string {
   424		n := 2
   425		if strings.Index(repl, "$") >= 0 {
   426			n = 2 * (re.numSubexp + 1)
   427		}
   428		b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
   429			return re.expand(dst, repl, nil, src, match)
   430		})
   431		return string(b)
   432	}
   433	
   434	// ReplaceAllStringLiteral returns a copy of src, replacing matches of the Regexp
   435	// with the replacement string repl.  The replacement repl is substituted directly,
   436	// without using Expand.
   437	func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
   438		return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
   439			return append(dst, repl...)
   440		}))
   441	}
   442	
   443	// ReplaceAllStringFunc returns a copy of src in which all matches of the
   444	// Regexp have been replaced by the return value of of function repl applied
   445	// to the matched substring.  The replacement returned by repl is substituted
   446	// directly, without using Expand.
   447	func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
   448		b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
   449			return append(dst, repl(src[match[0]:match[1]])...)
   450		})
   451		return string(b)
   452	}
   453	
   454	func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
   455		lastMatchEnd := 0 // end position of the most recent match
   456		searchPos := 0    // position where we next look for a match
   457		var buf []byte
   458		var endPos int
   459		if bsrc != nil {
   460			endPos = len(bsrc)
   461		} else {
   462			endPos = len(src)
   463		}
   464		for searchPos <= endPos {
   465			a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
   466			if len(a) == 0 {
   467				break // no more matches
   468			}
   469	
   470			// Copy the unmatched characters before this match.
   471			if bsrc != nil {
   472				buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
   473			} else {
   474				buf = append(buf, src[lastMatchEnd:a[0]]...)
   475			}
   476	
   477			// Now insert a copy of the replacement string, but not for a
   478			// match of the empty string immediately after another match.
   479			// (Otherwise, we get double replacement for patterns that
   480			// match both empty and nonempty strings.)
   481			if a[1] > lastMatchEnd || a[0] == 0 {
   482				buf = repl(buf, a)
   483			}
   484			lastMatchEnd = a[1]
   485	
   486			// Advance past this match; always advance at least one character.
   487			var width int
   488			if bsrc != nil {
   489				_, width = utf8.DecodeRune(bsrc[searchPos:])
   490			} else {
   491				_, width = utf8.DecodeRuneInString(src[searchPos:])
   492			}
   493			if searchPos+width > a[1] {
   494				searchPos += width
   495			} else if searchPos+1 > a[1] {
   496				// This clause is only needed at the end of the input
   497				// string.  In that case, DecodeRuneInString returns width=0.
   498				searchPos++
   499			} else {
   500				searchPos = a[1]
   501			}
   502		}
   503	
   504		// Copy the unmatched characters after the last match.
   505		if bsrc != nil {
   506			buf = append(buf, bsrc[lastMatchEnd:]...)
   507		} else {
   508			buf = append(buf, src[lastMatchEnd:]...)
   509		}
   510	
   511		return buf
   512	}
   513	
   514	// ReplaceAll returns a copy of src, replacing matches of the Regexp
   515	// with the replacement string repl.  Inside repl, $ signs are interpreted as
   516	// in Expand, so for instance $1 represents the text of the first submatch.
   517	func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
   518		n := 2
   519		if bytes.IndexByte(repl, '$') >= 0 {
   520			n = 2 * (re.numSubexp + 1)
   521		}
   522		srepl := ""
   523		b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
   524			if len(srepl) != len(repl) {
   525				srepl = string(repl)
   526			}
   527			return re.expand(dst, srepl, src, "", match)
   528		})
   529		return b
   530	}
   531	
   532	// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
   533	// with the replacement bytes repl.  The replacement repl is substituted directly,
   534	// without using Expand.
   535	func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
   536		return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
   537			return append(dst, repl...)
   538		})
   539	}
   540	
   541	// ReplaceAllFunc returns a copy of src in which all matches of the
   542	// Regexp have been replaced by the return value of of function repl applied
   543	// to the matched byte slice.  The replacement returned by repl is substituted
   544	// directly, without using Expand.
   545	func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
   546		return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
   547			return append(dst, repl(src[match[0]:match[1]])...)
   548		})
   549	}
   550	
   551	var specialBytes = []byte(`\.+*?()|[]{}^$`)
   552	
   553	func special(b byte) bool {
   554		return bytes.IndexByte(specialBytes, b) >= 0
   555	}
   556	
   557	// QuoteMeta returns a string that quotes all regular expression metacharacters
   558	// inside the argument text; the returned string is a regular expression matching
   559	// the literal text.  For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
   560	func QuoteMeta(s string) string {
   561		b := make([]byte, 2*len(s))
   562	
   563		// A byte loop is correct because all metacharacters are ASCII.
   564		j := 0
   565		for i := 0; i < len(s); i++ {
   566			if special(s[i]) {
   567				b[j] = '\\'
   568				j++
   569			}
   570			b[j] = s[i]
   571			j++
   572		}
   573		return string(b[0:j])
   574	}
   575	
   576	// The number of capture values in the program may correspond
   577	// to fewer capturing expressions than are in the regexp.
   578	// For example, "(a){0}" turns into an empty program, so the
   579	// maximum capture in the program is 0 but we need to return
   580	// an expression for \1.  Pad appends -1s to the slice a as needed.
   581	func (re *Regexp) pad(a []int) []int {
   582		if a == nil {
   583			// No match.
   584			return nil
   585		}
   586		n := (1 + re.numSubexp) * 2
   587		for len(a) < n {
   588			a = append(a, -1)
   589		}
   590		return a
   591	}
   592	
   593	// Find matches in slice b if b is non-nil, otherwise find matches in string s.
   594	func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
   595		var end int
   596		if b == nil {
   597			end = len(s)
   598		} else {
   599			end = len(b)
   600		}
   601	
   602		for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
   603			matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
   604			if len(matches) == 0 {
   605				break
   606			}
   607	
   608			accept := true
   609			if matches[1] == pos {
   610				// We've found an empty match.
   611				if matches[0] == prevMatchEnd {
   612					// We don't allow an empty match right
   613					// after a previous match, so ignore it.
   614					accept = false
   615				}
   616				var width int
   617				// TODO: use step()
   618				if b == nil {
   619					_, width = utf8.DecodeRuneInString(s[pos:end])
   620				} else {
   621					_, width = utf8.DecodeRune(b[pos:end])
   622				}
   623				if width > 0 {
   624					pos += width
   625				} else {
   626					pos = end + 1
   627				}
   628			} else {
   629				pos = matches[1]
   630			}
   631			prevMatchEnd = matches[1]
   632	
   633			if accept {
   634				deliver(re.pad(matches))
   635				i++
   636			}
   637		}
   638	}
   639	
   640	// Find returns a slice holding the text of the leftmost match in b of the regular expression.
   641	// A return value of nil indicates no match.
   642	func (re *Regexp) Find(b []byte) []byte {
   643		a := re.doExecute(nil, b, "", 0, 2)
   644		if a == nil {
   645			return nil
   646		}
   647		return b[a[0]:a[1]]
   648	}
   649	
   650	// FindIndex returns a two-element slice of integers defining the location of
   651	// the leftmost match in b of the regular expression.  The match itself is at
   652	// b[loc[0]:loc[1]].
   653	// A return value of nil indicates no match.
   654	func (re *Regexp) FindIndex(b []byte) (loc []int) {
   655		a := re.doExecute(nil, b, "", 0, 2)
   656		if a == nil {
   657			return nil
   658		}
   659		return a[0:2]
   660	}
   661	
   662	// FindString returns a string holding the text of the leftmost match in s of the regular
   663	// expression.  If there is no match, the return value is an empty string,
   664	// but it will also be empty if the regular expression successfully matches
   665	// an empty string.  Use FindStringIndex or FindStringSubmatch if it is
   666	// necessary to distinguish these cases.
   667	func (re *Regexp) FindString(s string) string {
   668		a := re.doExecute(nil, nil, s, 0, 2)
   669		if a == nil {
   670			return ""
   671		}
   672		return s[a[0]:a[1]]
   673	}
   674	
   675	// FindStringIndex returns a two-element slice of integers defining the
   676	// location of the leftmost match in s of the regular expression.  The match
   677	// itself is at s[loc[0]:loc[1]].
   678	// A return value of nil indicates no match.
   679	func (re *Regexp) FindStringIndex(s string) (loc []int) {
   680		a := re.doExecute(nil, nil, s, 0, 2)
   681		if a == nil {
   682			return nil
   683		}
   684		return a[0:2]
   685	}
   686	
   687	// FindReaderIndex returns a two-element slice of integers defining the
   688	// location of the leftmost match of the regular expression in text read from
   689	// the RuneReader.  The match itself is at s[loc[0]:loc[1]].  A return
   690	// value of nil indicates no match.
   691	func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
   692		a := re.doExecute(r, nil, "", 0, 2)
   693		if a == nil {
   694			return nil
   695		}
   696		return a[0:2]
   697	}
   698	
   699	// FindSubmatch returns a slice of slices holding the text of the leftmost
   700	// match of the regular expression in b and the matches, if any, of its
   701	// subexpressions, as defined by the 'Submatch' descriptions in the package
   702	// comment.
   703	// A return value of nil indicates no match.
   704	func (re *Regexp) FindSubmatch(b []byte) [][]byte {
   705		a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
   706		if a == nil {
   707			return nil
   708		}
   709		ret := make([][]byte, 1+re.numSubexp)
   710		for i := range ret {
   711			if 2*i < len(a) && a[2*i] >= 0 {
   712				ret[i] = b[a[2*i]:a[2*i+1]]
   713			}
   714		}
   715		return ret
   716	}
   717	
   718	// Expand appends template to dst and returns the result; during the
   719	// append, Expand replaces variables in the template with corresponding
   720	// matches drawn from src.  The match slice should have been returned by
   721	// FindSubmatchIndex.
   722	// 
   723	// In the template, a variable is denoted by a substring of the form
   724	// $name or ${name}, where name is a non-empty sequence of letters,
   725	// digits, and underscores.  A purely numeric name like $1 refers to
   726	// the submatch with the corresponding index; other names refer to
   727	// capturing parentheses named with the (?P<name>...) syntax.  A
   728	// reference to an out of range or unmatched index or a name that is not
   729	// present in the regular expression is replaced with an empty string.
   730	// 
   731	// In the $name form, name is taken to be as long as possible: $1x is
   732	// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
   733	// 
   734	// To insert a literal $ in the output, use $$ in the template.
   735	func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
   736		return re.expand(dst, string(template), src, "", match)
   737	}
   738	
   739	// ExpandString is like Expand but the template and source are strings.
   740	// It appends to and returns a byte slice in order to give the calling
   741	// code control over allocation.
   742	func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
   743		return re.expand(dst, template, nil, src, match)
   744	}
   745	
   746	func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
   747		for len(template) > 0 {
   748			i := strings.Index(template, "$")
   749			if i < 0 {
   750				break
   751			}
   752			dst = append(dst, template[:i]...)
   753			template = template[i:]
   754			if len(template) > 1 && template[1] == '$' {
   755				// Treat $$ as $.
   756				dst = append(dst, '$')
   757				template = template[2:]
   758				continue
   759			}
   760			name, num, rest, ok := extract(template)
   761			if !ok {
   762				// Malformed; treat $ as raw text.
   763				dst = append(dst, '$')
   764				template = template[1:]
   765				continue
   766			}
   767			template = rest
   768			if num >= 0 {
   769				if 2*num+1 < len(match) {
   770					if bsrc != nil {
   771						dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
   772					} else {
   773						dst = append(dst, src[match[2*num]:match[2*num+1]]...)
   774					}
   775				}
   776			} else {
   777				for i, namei := range re.subexpNames {
   778					if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
   779						if bsrc != nil {
   780							dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
   781						} else {
   782							dst = append(dst, src[match[2*i]:match[2*i+1]]...)
   783						}
   784						break
   785					}
   786				}
   787			}
   788		}
   789		dst = append(dst, template...)
   790		return dst
   791	}
   792	
   793	// extract returns the name from a leading "$name" or "${name}" in str.
   794	// If it is a number, extract returns num set to that number; otherwise num = -1.
   795	func extract(str string) (name string, num int, rest string, ok bool) {
   796		if len(str) < 2 || str[0] != '$' {
   797			return
   798		}
   799		brace := false
   800		if str[1] == '{' {
   801			brace = true
   802			str = str[2:]
   803		} else {
   804			str = str[1:]
   805		}
   806		i := 0
   807		for i < len(str) {
   808			rune, size := utf8.DecodeRuneInString(str[i:])
   809			if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
   810				break
   811			}
   812			i += size
   813		}
   814		if i == 0 {
   815			// empty name is not okay
   816			return
   817		}
   818		name = str[:i]
   819		if brace {
   820			if i >= len(str) || str[i] != '}' {
   821				// missing closing brace
   822				return
   823			}
   824			i++
   825		}
   826	
   827		// Parse number.
   828		num = 0
   829		for i := 0; i < len(name); i++ {
   830			if name[i] < '0' || '9' < name[i] || num >= 1e8 {
   831				num = -1
   832				break
   833			}
   834			num = num*10 + int(name[i]) - '0'
   835		}
   836		// Disallow leading zeros.
   837		if name[0] == '0' && len(name) > 1 {
   838			num = -1
   839		}
   840	
   841		rest = str[i:]
   842		ok = true
   843		return
   844	}
   845	
   846	// FindSubmatchIndex returns a slice holding the index pairs identifying the
   847	// leftmost match of the regular expression in b and the matches, if any, of
   848	// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
   849	// in the package comment.
   850	// A return value of nil indicates no match.
   851	func (re *Regexp) FindSubmatchIndex(b []byte) []int {
   852		return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
   853	}
   854	
   855	// FindStringSubmatch returns a slice of strings holding the text of the
   856	// leftmost match of the regular expression in s and the matches, if any, of
   857	// its subexpressions, as defined by the 'Submatch' description in the
   858	// package comment.
   859	// A return value of nil indicates no match.
   860	func (re *Regexp) FindStringSubmatch(s string) []string {
   861		a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
   862		if a == nil {
   863			return nil
   864		}
   865		ret := make([]string, 1+re.numSubexp)
   866		for i := range ret {
   867			if 2*i < len(a) && a[2*i] >= 0 {
   868				ret[i] = s[a[2*i]:a[2*i+1]]
   869			}
   870		}
   871		return ret
   872	}
   873	
   874	// FindStringSubmatchIndex returns a slice holding the index pairs
   875	// identifying the leftmost match of the regular expression in s and the
   876	// matches, if any, of its subexpressions, as defined by the 'Submatch' and
   877	// 'Index' descriptions in the package comment.
   878	// A return value of nil indicates no match.
   879	func (re *Regexp) FindStringSubmatchIndex(s string) []int {
   880		return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
   881	}
   882	
   883	// FindReaderSubmatchIndex returns a slice holding the index pairs
   884	// identifying the leftmost match of the regular expression of text read by
   885	// the RuneReader, and the matches, if any, of its subexpressions, as defined
   886	// by the 'Submatch' and 'Index' descriptions in the package comment.  A
   887	// return value of nil indicates no match.
   888	func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
   889		return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
   890	}
   891	
   892	const startSize = 10 // The size at which to start a slice in the 'All' routines.
   893	
   894	// FindAll is the 'All' version of Find; it returns a slice of all successive
   895	// matches of the expression, as defined by the 'All' description in the
   896	// package comment.
   897	// A return value of nil indicates no match.
   898	func (re *Regexp) FindAll(b []byte, n int) [][]byte {
   899		if n < 0 {
   900			n = len(b) + 1
   901		}
   902		result := make([][]byte, 0, startSize)
   903		re.allMatches("", b, n, func(match []int) {
   904			result = append(result, b[match[0]:match[1]])
   905		})
   906		if len(result) == 0 {
   907			return nil
   908		}
   909		return result
   910	}
   911	
   912	// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
   913	// successive matches of the expression, as defined by the 'All' description
   914	// in the package comment.
   915	// A return value of nil indicates no match.
   916	func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
   917		if n < 0 {
   918			n = len(b) + 1
   919		}
   920		result := make([][]int, 0, startSize)
   921		re.allMatches("", b, n, func(match []int) {
   922			result = append(result, match[0:2])
   923		})
   924		if len(result) == 0 {
   925			return nil
   926		}
   927		return result
   928	}
   929	
   930	// FindAllString is the 'All' version of FindString; it returns a slice of all
   931	// successive matches of the expression, as defined by the 'All' description
   932	// in the package comment.
   933	// A return value of nil indicates no match.
   934	func (re *Regexp) FindAllString(s string, n int) []string {
   935		if n < 0 {
   936			n = len(s) + 1
   937		}
   938		result := make([]string, 0, startSize)
   939		re.allMatches(s, nil, n, func(match []int) {
   940			result = append(result, s[match[0]:match[1]])
   941		})
   942		if len(result) == 0 {
   943			return nil
   944		}
   945		return result
   946	}
   947	
   948	// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
   949	// slice of all successive matches of the expression, as defined by the 'All'
   950	// description in the package comment.
   951	// A return value of nil indicates no match.
   952	func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
   953		if n < 0 {
   954			n = len(s) + 1
   955		}
   956		result := make([][]int, 0, startSize)
   957		re.allMatches(s, nil, n, func(match []int) {
   958			result = append(result, match[0:2])
   959		})
   960		if len(result) == 0 {
   961			return nil
   962		}
   963		return result
   964	}
   965	
   966	// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
   967	// of all successive matches of the expression, as defined by the 'All'
   968	// description in the package comment.
   969	// A return value of nil indicates no match.
   970	func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
   971		if n < 0 {
   972			n = len(b) + 1
   973		}
   974		result := make([][][]byte, 0, startSize)
   975		re.allMatches("", b, n, func(match []int) {
   976			slice := make([][]byte, len(match)/2)
   977			for j := range slice {
   978				if match[2*j] >= 0 {
   979					slice[j] = b[match[2*j]:match[2*j+1]]
   980				}
   981			}
   982			result = append(result, slice)
   983		})
   984		if len(result) == 0 {
   985			return nil
   986		}
   987		return result
   988	}
   989	
   990	// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
   991	// a slice of all successive matches of the expression, as defined by the
   992	// 'All' description in the package comment.
   993	// A return value of nil indicates no match.
   994	func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
   995		if n < 0 {
   996			n = len(b) + 1
   997		}
   998		result := make([][]int, 0, startSize)
   999		re.allMatches("", b, n, func(match []int) {
  1000			result = append(result, match)
  1001		})
  1002		if len(result) == 0 {
  1003			return nil
  1004		}
  1005		return result
  1006	}
  1007	
  1008	// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
  1009	// returns a slice of all successive matches of the expression, as defined by
  1010	// the 'All' description in the package comment.
  1011	// A return value of nil indicates no match.
  1012	func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
  1013		if n < 0 {
  1014			n = len(s) + 1
  1015		}
  1016		result := make([][]string, 0, startSize)
  1017		re.allMatches(s, nil, n, func(match []int) {
  1018			slice := make([]string, len(match)/2)
  1019			for j := range slice {
  1020				if match[2*j] >= 0 {
  1021					slice[j] = s[match[2*j]:match[2*j+1]]
  1022				}
  1023			}
  1024			result = append(result, slice)
  1025		})
  1026		if len(result) == 0 {
  1027			return nil
  1028		}
  1029		return result
  1030	}
  1031	
  1032	// FindAllStringSubmatchIndex is the 'All' version of
  1033	// FindStringSubmatchIndex; it returns a slice of all successive matches of
  1034	// the expression, as defined by the 'All' description in the package
  1035	// comment.
  1036	// A return value of nil indicates no match.
  1037	func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
  1038		if n < 0 {
  1039			n = len(s) + 1
  1040		}
  1041		result := make([][]int, 0, startSize)
  1042		re.allMatches(s, nil, n, func(match []int) {
  1043			result = append(result, match)
  1044		})
  1045		if len(result) == 0 {
  1046			return nil
  1047		}
  1048		return result
  1049	}
previous page start next page