src/pkg/regexp/syntax/parse.go - The Go Programming Language

Golang

Source file src/pkg/regexp/syntax/parse.go

     1	// Copyright 2011 The Go Authors.  All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package syntax parses regular expressions into parse trees and compiles
     6	// parse trees into programs. Most clients of regular expressions will use
     7	// the facilities of package regexp (such as Compile and Match) instead of
     8	// this package.
     9	package syntax
    10	
    11	import (
    12		"sort"
    13		"strings"
    14		"unicode"
    15		"unicode/utf8"
    16	)
    17	
    18	// An Error describes a failure to parse a regular expression
    19	// and gives the offending expression.
    20	type Error struct {
    21		Code ErrorCode
    22		Expr string
    23	}
    24	
    25	func (e *Error) Error() string {
    26		return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
    27	}
    28	
    29	// An ErrorCode describes a failure to parse a regular expression.
    30	type ErrorCode string
    31	
    32	const (
    33		// Unexpected error
    34		ErrInternalError ErrorCode = "regexp/syntax: internal error"
    35	
    36		// Parse errors
    37		ErrInvalidCharClass      ErrorCode = "invalid character class"
    38		ErrInvalidCharRange      ErrorCode = "invalid character class range"
    39		ErrInvalidEscape         ErrorCode = "invalid escape sequence"
    40		ErrInvalidNamedCapture   ErrorCode = "invalid named capture"
    41		ErrInvalidPerlOp         ErrorCode = "invalid or unsupported Perl syntax"
    42		ErrInvalidRepeatOp       ErrorCode = "invalid nested repetition operator"
    43		ErrInvalidRepeatSize     ErrorCode = "invalid repeat count"
    44		ErrInvalidUTF8           ErrorCode = "invalid UTF-8"
    45		ErrMissingBracket        ErrorCode = "missing closing ]"
    46		ErrMissingParen          ErrorCode = "missing closing )"
    47		ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
    48		ErrTrailingBackslash     ErrorCode = "trailing backslash at end of expression"
    49	)
    50	
    51	func (e ErrorCode) String() string {
    52		return string(e)
    53	}
    54	
    55	// Flags control the behavior of the parser and record information about regexp context.
    56	type Flags uint16
    57	
    58	const (
    59		FoldCase      Flags = 1 << iota // case-insensitive match
    60		Literal                         // treat pattern as literal string
    61		ClassNL                         // allow character classes like [^a-z] and [[:space:]] to match newline
    62		DotNL                           // allow . to match newline
    63		OneLine                         // treat ^ and $ as only matching at beginning and end of text
    64		NonGreedy                       // make repetition operators default to non-greedy
    65		PerlX                           // allow Perl extensions
    66		UnicodeGroups                   // allow \p{Han}, \P{Han} for Unicode group and negation
    67		WasDollar                       // regexp OpEndText was $, not \z
    68		Simple                          // regexp contains no counted repetition
    69	
    70		MatchNL = ClassNL | DotNL
    71	
    72		Perl        = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
    73		POSIX Flags = 0                                         // POSIX syntax
    74	)
    75	
    76	// Pseudo-ops for parsing stack.
    77	const (
    78		opLeftParen = opPseudo + iota
    79		opVerticalBar
    80	)
    81	
    82	type parser struct {
    83		flags       Flags     // parse mode flags
    84		stack       []*Regexp // stack of parsed expressions
    85		free        *Regexp
    86		numCap      int // number of capturing groups seen
    87		wholeRegexp string
    88		tmpClass    []rune // temporary char class work space
    89	}
    90	
    91	func (p *parser) newRegexp(op Op) *Regexp {
    92		re := p.free
    93		if re != nil {
    94			p.free = re.Sub0[0]
    95			*re = Regexp{}
    96		} else {
    97			re = new(Regexp)
    98		}
    99		re.Op = op
   100		return re
   101	}
   102	
   103	func (p *parser) reuse(re *Regexp) {
   104		re.Sub0[0] = p.free
   105		p.free = re
   106	}
   107	
   108	// Parse stack manipulation.
   109	
   110	// push pushes the regexp re onto the parse stack and returns the regexp.
   111	func (p *parser) push(re *Regexp) *Regexp {
   112		if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
   113			// Single rune.
   114			if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
   115				return nil
   116			}
   117			re.Op = OpLiteral
   118			re.Rune = re.Rune[:1]
   119			re.Flags = p.flags &^ FoldCase
   120		} else if re.Op == OpCharClass && len(re.Rune) == 4 &&
   121			re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
   122			unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
   123			unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
   124			re.Op == OpCharClass && len(re.Rune) == 2 &&
   125				re.Rune[0]+1 == re.Rune[1] &&
   126				unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
   127				unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
   128			// Case-insensitive rune like [Aa] or [Δδ].
   129			if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
   130				return nil
   131			}
   132	
   133			// Rewrite as (case-insensitive) literal.
   134			re.Op = OpLiteral
   135			re.Rune = re.Rune[:1]
   136			re.Flags = p.flags | FoldCase
   137		} else {
   138			// Incremental concatenation.
   139			p.maybeConcat(-1, 0)
   140		}
   141	
   142		p.stack = append(p.stack, re)
   143		return re
   144	}
   145	
   146	// maybeConcat implements incremental concatenation
   147	// of literal runes into string nodes.  The parser calls this
   148	// before each push, so only the top fragment of the stack
   149	// might need processing.  Since this is called before a push,
   150	// the topmost literal is no longer subject to operators like *
   151	// (Otherwise ab* would turn into (ab)*.)
   152	// If r >= 0 and there's a node left over, maybeConcat uses it
   153	// to push r with the given flags.
   154	// maybeConcat reports whether r was pushed.
   155	func (p *parser) maybeConcat(r rune, flags Flags) bool {
   156		n := len(p.stack)
   157		if n < 2 {
   158			return false
   159		}
   160	
   161		re1 := p.stack[n-1]
   162		re2 := p.stack[n-2]
   163		if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
   164			return false
   165		}
   166	
   167		// Push re1 into re2.
   168		re2.Rune = append(re2.Rune, re1.Rune...)
   169	
   170		// Reuse re1 if possible.
   171		if r >= 0 {
   172			re1.Rune = re1.Rune0[:1]
   173			re1.Rune[0] = r
   174			re1.Flags = flags
   175			return true
   176		}
   177	
   178		p.stack = p.stack[:n-1]
   179		p.reuse(re1)
   180		return false // did not push r
   181	}
   182	
   183	// newLiteral returns a new OpLiteral Regexp with the given flags
   184	func (p *parser) newLiteral(r rune, flags Flags) *Regexp {
   185		re := p.newRegexp(OpLiteral)
   186		re.Flags = flags
   187		if flags&FoldCase != 0 {
   188			r = minFoldRune(r)
   189		}
   190		re.Rune0[0] = r
   191		re.Rune = re.Rune0[:1]
   192		return re
   193	}
   194	
   195	// minFoldRune returns the minimum rune fold-equivalent to r.
   196	func minFoldRune(r rune) rune {
   197		if r < minFold || r > maxFold {
   198			return r
   199		}
   200		min := r
   201		r0 := r
   202		for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
   203			if min > r {
   204				min = r
   205			}
   206		}
   207		return min
   208	}
   209	
   210	// literal pushes a literal regexp for the rune r on the stack
   211	// and returns that regexp.
   212	func (p *parser) literal(r rune) {
   213		p.push(p.newLiteral(r, p.flags))
   214	}
   215	
   216	// op pushes a regexp with the given op onto the stack
   217	// and returns that regexp.
   218	func (p *parser) op(op Op) *Regexp {
   219		re := p.newRegexp(op)
   220		re.Flags = p.flags
   221		return p.push(re)
   222	}
   223	
   224	// repeat replaces the top stack element with itself repeated according to op, min, max.
   225	// before is the regexp suffix starting at the repetition operator.
   226	// after is the regexp suffix following after the repetition operator.
   227	// repeat returns an updated 'after' and an error, if any.
   228	func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
   229		flags := p.flags
   230		if p.flags&PerlX != 0 {
   231			if len(after) > 0 && after[0] == '?' {
   232				after = after[1:]
   233				flags ^= NonGreedy
   234			}
   235			if lastRepeat != "" {
   236				// In Perl it is not allowed to stack repetition operators:
   237				// a** is a syntax error, not a doubled star, and a++ means
   238				// something else entirely, which we don't support!
   239				return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
   240			}
   241		}
   242		n := len(p.stack)
   243		if n == 0 {
   244			return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
   245		}
   246		sub := p.stack[n-1]
   247		if sub.Op >= opPseudo {
   248			return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
   249		}
   250		re := p.newRegexp(op)
   251		re.Min = min
   252		re.Max = max
   253		re.Flags = flags
   254		re.Sub = re.Sub0[:1]
   255		re.Sub[0] = sub
   256		p.stack[n-1] = re
   257		return after, nil
   258	}
   259	
   260	// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
   261	func (p *parser) concat() *Regexp {
   262		p.maybeConcat(-1, 0)
   263	
   264		// Scan down to find pseudo-operator | or (.
   265		i := len(p.stack)
   266		for i > 0 && p.stack[i-1].Op < opPseudo {
   267			i--
   268		}
   269		subs := p.stack[i:]
   270		p.stack = p.stack[:i]
   271	
   272		// Empty concatenation is special case.
   273		if len(subs) == 0 {
   274			return p.push(p.newRegexp(OpEmptyMatch))
   275		}
   276	
   277		return p.push(p.collapse(subs, OpConcat))
   278	}
   279	
   280	// alternate replaces the top of the stack (above the topmost '(') with its alternation.
   281	func (p *parser) alternate() *Regexp {
   282		// Scan down to find pseudo-operator (.
   283		// There are no | above (.
   284		i := len(p.stack)
   285		for i > 0 && p.stack[i-1].Op < opPseudo {
   286			i--
   287		}
   288		subs := p.stack[i:]
   289		p.stack = p.stack[:i]
   290	
   291		// Make sure top class is clean.
   292		// All the others already are (see swapVerticalBar).
   293		if len(subs) > 0 {
   294			cleanAlt(subs[len(subs)-1])
   295		}
   296	
   297		// Empty alternate is special case
   298		// (shouldn't happen but easy to handle).
   299		if len(subs) == 0 {
   300			return p.push(p.newRegexp(OpNoMatch))
   301		}
   302	
   303		return p.push(p.collapse(subs, OpAlternate))
   304	}
   305	
   306	// cleanAlt cleans re for eventual inclusion in an alternation.
   307	func cleanAlt(re *Regexp) {
   308		switch re.Op {
   309		case OpCharClass:
   310			re.Rune = cleanClass(&re.Rune)
   311			if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
   312				re.Rune = nil
   313				re.Op = OpAnyChar
   314				return
   315			}
   316			if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
   317				re.Rune = nil
   318				re.Op = OpAnyCharNotNL
   319				return
   320			}
   321			if cap(re.Rune)-len(re.Rune) > 100 {
   322				// re.Rune will not grow any more.
   323				// Make a copy or inline to reclaim storage.
   324				re.Rune = append(re.Rune0[:0], re.Rune...)
   325			}
   326		}
   327	}
   328	
   329	// collapse returns the result of applying op to sub.
   330	// If sub contains op nodes, they all get hoisted up
   331	// so that there is never a concat of a concat or an
   332	// alternate of an alternate.
   333	func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
   334		if len(subs) == 1 {
   335			return subs[0]
   336		}
   337		re := p.newRegexp(op)
   338		re.Sub = re.Sub0[:0]
   339		for _, sub := range subs {
   340			if sub.Op == op {
   341				re.Sub = append(re.Sub, sub.Sub...)
   342				p.reuse(sub)
   343			} else {
   344				re.Sub = append(re.Sub, sub)
   345			}
   346		}
   347		if op == OpAlternate {
   348			re.Sub = p.factor(re.Sub, re.Flags)
   349			if len(re.Sub) == 1 {
   350				old := re
   351				re = re.Sub[0]
   352				p.reuse(old)
   353			}
   354		}
   355		return re
   356	}
   357	
   358	// factor factors common prefixes from the alternation list sub.
   359	// It returns a replacement list that reuses the same storage and
   360	// frees (passes to p.reuse) any removed *Regexps.
   361	//
   362	// For example,
   363	//     ABC|ABD|AEF|BCX|BCY
   364	// simplifies by literal prefix extraction to
   365	//     A(B(C|D)|EF)|BC(X|Y)
   366	// which simplifies by character class introduction to
   367	//     A(B[CD]|EF)|BC[XY]
   368	//
   369	func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
   370		if len(sub) < 2 {
   371			return sub
   372		}
   373	
   374		// Round 1: Factor out common literal prefixes.
   375		var str []rune
   376		var strflags Flags
   377		start := 0
   378		out := sub[:0]
   379		for i := 0; i <= len(sub); i++ {
   380			// Invariant: the Regexps that were in sub[0:start] have been
   381			// used or marked for reuse, and the slice space has been reused
   382			// for out (len(out) <= start).
   383			//
   384			// Invariant: sub[start:i] consists of regexps that all begin
   385			// with str as modified by strflags.
   386			var istr []rune
   387			var iflags Flags
   388			if i < len(sub) {
   389				istr, iflags = p.leadingString(sub[i])
   390				if iflags == strflags {
   391					same := 0
   392					for same < len(str) && same < len(istr) && str[same] == istr[same] {
   393						same++
   394					}
   395					if same > 0 {
   396						// Matches at least one rune in current range.
   397						// Keep going around.
   398						str = str[:same]
   399						continue
   400					}
   401				}
   402			}
   403	
   404			// Found end of a run with common leading literal string:
   405			// sub[start:i] all begin with str[0:len(str)], but sub[i]
   406			// does not even begin with str[0].
   407			//
   408			// Factor out common string and append factored expression to out.
   409			if i == start {
   410				// Nothing to do - run of length 0.
   411			} else if i == start+1 {
   412				// Just one: don't bother factoring.
   413				out = append(out, sub[start])
   414			} else {
   415				// Construct factored form: prefix(suffix1|suffix2|...)
   416				prefix := p.newRegexp(OpLiteral)
   417				prefix.Flags = strflags
   418				prefix.Rune = append(prefix.Rune[:0], str...)
   419	
   420				for j := start; j < i; j++ {
   421					sub[j] = p.removeLeadingString(sub[j], len(str))
   422				}
   423				suffix := p.collapse(sub[start:i], OpAlternate) // recurse
   424	
   425				re := p.newRegexp(OpConcat)
   426				re.Sub = append(re.Sub[:0], prefix, suffix)
   427				out = append(out, re)
   428			}
   429	
   430			// Prepare for next iteration.
   431			start = i
   432			str = istr
   433			strflags = iflags
   434		}
   435		sub = out
   436	
   437		// Round 2: Factor out common complex prefixes,
   438		// just the first piece of each concatenation,
   439		// whatever it is.  This is good enough a lot of the time.
   440		start = 0
   441		out = sub[:0]
   442		var first *Regexp
   443		for i := 0; i <= len(sub); i++ {
   444			// Invariant: the Regexps that were in sub[0:start] have been
   445			// used or marked for reuse, and the slice space has been reused
   446			// for out (len(out) <= start).
   447			//
   448			// Invariant: sub[start:i] consists of regexps that all begin with ifirst.
   449			var ifirst *Regexp
   450			if i < len(sub) {
   451				ifirst = p.leadingRegexp(sub[i])
   452				if first != nil && first.Equal(ifirst) {
   453					continue
   454				}
   455			}
   456	
   457			// Found end of a run with common leading regexp:
   458			// sub[start:i] all begin with first but sub[i] does not.
   459			//
   460			// Factor out common regexp and append factored expression to out.
   461			if i == start {
   462				// Nothing to do - run of length 0.
   463			} else if i == start+1 {
   464				// Just one: don't bother factoring.
   465				out = append(out, sub[start])
   466			} else {
   467				// Construct factored form: prefix(suffix1|suffix2|...)
   468				prefix := first
   469				for j := start; j < i; j++ {
   470					reuse := j != start // prefix came from sub[start] 
   471					sub[j] = p.removeLeadingRegexp(sub[j], reuse)
   472				}
   473				suffix := p.collapse(sub[start:i], OpAlternate) // recurse
   474	
   475				re := p.newRegexp(OpConcat)
   476				re.Sub = append(re.Sub[:0], prefix, suffix)
   477				out = append(out, re)
   478			}
   479	
   480			// Prepare for next iteration.
   481			start = i
   482			first = ifirst
   483		}
   484		sub = out
   485	
   486		// Round 3: Collapse runs of single literals into character classes.
   487		start = 0
   488		out = sub[:0]
   489		for i := 0; i <= len(sub); i++ {
   490			// Invariant: the Regexps that were in sub[0:start] have been
   491			// used or marked for reuse, and the slice space has been reused
   492			// for out (len(out) <= start).
   493			//
   494			// Invariant: sub[start:i] consists of regexps that are either
   495			// literal runes or character classes.
   496			if i < len(sub) && isCharClass(sub[i]) {
   497				continue
   498			}
   499	
   500			// sub[i] is not a char or char class;
   501			// emit char class for sub[start:i]...
   502			if i == start {
   503				// Nothing to do - run of length 0.
   504			} else if i == start+1 {
   505				out = append(out, sub[start])
   506			} else {
   507				// Make new char class.
   508				// Start with most complex regexp in sub[start].
   509				max := start
   510				for j := start + 1; j < i; j++ {
   511					if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
   512						max = j
   513					}
   514				}
   515				sub[start], sub[max] = sub[max], sub[start]
   516	
   517				for j := start + 1; j < i; j++ {
   518					mergeCharClass(sub[start], sub[j])
   519					p.reuse(sub[j])
   520				}
   521				cleanAlt(sub[start])
   522				out = append(out, sub[start])
   523			}
   524	
   525			// ... and then emit sub[i].
   526			if i < len(sub) {
   527				out = append(out, sub[i])
   528			}
   529			start = i + 1
   530		}
   531		sub = out
   532	
   533		// Round 4: Collapse runs of empty matches into a single empty match.
   534		start = 0
   535		out = sub[:0]
   536		for i := range sub {
   537			if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
   538				continue
   539			}
   540			out = append(out, sub[i])
   541		}
   542		sub = out
   543	
   544		return sub
   545	}
   546	
   547	// leadingString returns the leading literal string that re begins with.
   548	// The string refers to storage in re or its children.
   549	func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
   550		if re.Op == OpConcat && len(re.Sub) > 0 {
   551			re = re.Sub[0]
   552		}
   553		if re.Op != OpLiteral {
   554			return nil, 0
   555		}
   556		return re.Rune, re.Flags & FoldCase
   557	}
   558	
   559	// removeLeadingString removes the first n leading runes
   560	// from the beginning of re.  It returns the replacement for re.
   561	func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
   562		if re.Op == OpConcat && len(re.Sub) > 0 {
   563			// Removing a leading string in a concatenation
   564			// might simplify the concatenation.
   565			sub := re.Sub[0]
   566			sub = p.removeLeadingString(sub, n)
   567			re.Sub[0] = sub
   568			if sub.Op == OpEmptyMatch {
   569				p.reuse(sub)
   570				switch len(re.Sub) {
   571				case 0, 1:
   572					// Impossible but handle.
   573					re.Op = OpEmptyMatch
   574					re.Sub = nil
   575				case 2:
   576					old := re
   577					re = re.Sub[1]
   578					p.reuse(old)
   579				default:
   580					copy(re.Sub, re.Sub[1:])
   581					re.Sub = re.Sub[:len(re.Sub)-1]
   582				}
   583			}
   584			return re
   585		}
   586	
   587		if re.Op == OpLiteral {
   588			re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
   589			if len(re.Rune) == 0 {
   590				re.Op = OpEmptyMatch
   591			}
   592		}
   593		return re
   594	}
   595	
   596	// leadingRegexp returns the leading regexp that re begins with.
   597	// The regexp refers to storage in re or its children.
   598	func (p *parser) leadingRegexp(re *Regexp) *Regexp {
   599		if re.Op == OpEmptyMatch {
   600			return nil
   601		}
   602		if re.Op == OpConcat && len(re.Sub) > 0 {
   603			sub := re.Sub[0]
   604			if sub.Op == OpEmptyMatch {
   605				return nil
   606			}
   607			return sub
   608		}
   609		return re
   610	}
   611	
   612	// removeLeadingRegexp removes the leading regexp in re.
   613	// It returns the replacement for re.
   614	// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
   615	func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
   616		if re.Op == OpConcat && len(re.Sub) > 0 {
   617			if reuse {
   618				p.reuse(re.Sub[0])
   619			}
   620			re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
   621			switch len(re.Sub) {
   622			case 0:
   623				re.Op = OpEmptyMatch
   624				re.Sub = nil
   625			case 1:
   626				old := re
   627				re = re.Sub[0]
   628				p.reuse(old)
   629			}
   630			return re
   631		}
   632		if reuse {
   633			p.reuse(re)
   634		}
   635		return p.newRegexp(OpEmptyMatch)
   636	}
   637	
   638	func literalRegexp(s string, flags Flags) *Regexp {
   639		re := &Regexp{Op: OpLiteral}
   640		re.Flags = flags
   641		re.Rune = re.Rune0[:0] // use local storage for small strings
   642		for _, c := range s {
   643			if len(re.Rune) >= cap(re.Rune) {
   644				// string is too long to fit in Rune0.  let Go handle it
   645				re.Rune = []rune(s)
   646				break
   647			}
   648			re.Rune = append(re.Rune, c)
   649		}
   650		return re
   651	}
   652	
   653	// Parsing.
   654	
   655	// Parse parses a regular expression string s, controlled by the specified
   656	// Flags, and returns a regular expression parse tree. The syntax is
   657	// described in the top-level comment for package regexp.
   658	func Parse(s string, flags Flags) (*Regexp, error) {
   659		if flags&Literal != 0 {
   660			// Trivial parser for literal string.
   661			if err := checkUTF8(s); err != nil {
   662				return nil, err
   663			}
   664			return literalRegexp(s, flags), nil
   665		}
   666	
   667		// Otherwise, must do real work.
   668		var (
   669			p          parser
   670			err        error
   671			c          rune
   672			op         Op
   673			lastRepeat string
   674			min, max   int
   675		)
   676		p.flags = flags
   677		p.wholeRegexp = s
   678		t := s
   679		for t != "" {
   680			repeat := ""
   681		BigSwitch:
   682			switch t[0] {
   683			default:
   684				if c, t, err = nextRune(t); err != nil {
   685					return nil, err
   686				}
   687				p.literal(c)
   688	
   689			case '(':
   690				if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
   691					// Flag changes and non-capturing groups.
   692					if t, err = p.parsePerlFlags(t); err != nil {
   693						return nil, err
   694					}
   695					break
   696				}
   697				p.numCap++
   698				p.op(opLeftParen).Cap = p.numCap
   699				t = t[1:]
   700			case '|':
   701				if err = p.parseVerticalBar(); err != nil {
   702					return nil, err
   703				}
   704				t = t[1:]
   705			case ')':
   706				if err = p.parseRightParen(); err != nil {
   707					return nil, err
   708				}
   709				t = t[1:]
   710			case '^':
   711				if p.flags&OneLine != 0 {
   712					p.op(OpBeginText)
   713				} else {
   714					p.op(OpBeginLine)
   715				}
   716				t = t[1:]
   717			case '$':
   718				if p.flags&OneLine != 0 {
   719					p.op(OpEndText).Flags |= WasDollar
   720				} else {
   721					p.op(OpEndLine)
   722				}
   723				t = t[1:]
   724			case '.':
   725				if p.flags&DotNL != 0 {
   726					p.op(OpAnyChar)
   727				} else {
   728					p.op(OpAnyCharNotNL)
   729				}
   730				t = t[1:]
   731			case '[':
   732				if t, err = p.parseClass(t); err != nil {
   733					return nil, err
   734				}
   735			case '*', '+', '?':
   736				before := t
   737				switch t[0] {
   738				case '*':
   739					op = OpStar
   740				case '+':
   741					op = OpPlus
   742				case '?':
   743					op = OpQuest
   744				}
   745				after := t[1:]
   746				if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
   747					return nil, err
   748				}
   749				repeat = before
   750				t = after
   751			case '{':
   752				op = OpRepeat
   753				before := t
   754				min, max, after, ok := p.parseRepeat(t)
   755				if !ok {
   756					// If the repeat cannot be parsed, { is a literal.
   757					p.literal('{')
   758					t = t[1:]
   759					break
   760				}
   761				if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
   762					// Numbers were too big, or max is present and min > max.
   763					return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
   764				}
   765				if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
   766					return nil, err
   767				}
   768				repeat = before
   769				t = after
   770			case '\\':
   771				if p.flags&PerlX != 0 && len(t) >= 2 {
   772					switch t[1] {
   773					case 'A':
   774						p.op(OpBeginText)
   775						t = t[2:]
   776						break BigSwitch
   777					case 'b':
   778						p.op(OpWordBoundary)
   779						t = t[2:]
   780						break BigSwitch
   781					case 'B':
   782						p.op(OpNoWordBoundary)
   783						t = t[2:]
   784						break BigSwitch
   785					case 'C':
   786						// any byte; not supported
   787						return nil, &Error{ErrInvalidEscape, t[:2]}
   788					case 'Q':
   789						// \Q ... \E: the ... is always literals
   790						var lit string
   791						if i := strings.Index(t, `\E`); i < 0 {
   792							lit = t[2:]
   793							t = ""
   794						} else {
   795							lit = t[2:i]
   796							t = t[i+2:]
   797						}
   798						p.push(literalRegexp(lit, p.flags))
   799						break BigSwitch
   800					case 'z':
   801						p.op(OpEndText)
   802						t = t[2:]
   803						break BigSwitch
   804					}
   805				}
   806	
   807				re := p.newRegexp(OpCharClass)
   808				re.Flags = p.flags
   809	
   810				// Look for Unicode character group like \p{Han}
   811				if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
   812					r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
   813					if err != nil {
   814						return nil, err
   815					}
   816					if r != nil {
   817						re.Rune = r
   818						t = rest
   819						p.push(re)
   820						break BigSwitch
   821					}
   822				}
   823	
   824				// Perl character class escape.
   825				if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
   826					re.Rune = r
   827					t = rest
   828					p.push(re)
   829					break BigSwitch
   830				}
   831				p.reuse(re)
   832	
   833				// Ordinary single-character escape.
   834				if c, t, err = p.parseEscape(t); err != nil {
   835					return nil, err
   836				}
   837				p.literal(c)
   838			}
   839			lastRepeat = repeat
   840		}
   841	
   842		p.concat()
   843		if p.swapVerticalBar() {
   844			// pop vertical bar
   845			p.stack = p.stack[:len(p.stack)-1]
   846		}
   847		p.alternate()
   848	
   849		n := len(p.stack)
   850		if n != 1 {
   851			return nil, &Error{ErrMissingParen, s}
   852		}
   853		return p.stack[0], nil
   854	}
   855	
   856	// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
   857	// If s is not of that form, it returns ok == false.
   858	// If s has the right form but the values are too big, it returns min == -1, ok == true.
   859	func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
   860		if s == "" || s[0] != '{' {
   861			return
   862		}
   863		s = s[1:]
   864		var ok1 bool
   865		if min, s, ok1 = p.parseInt(s); !ok1 {
   866			return
   867		}
   868		if s == "" {
   869			return
   870		}
   871		if s[0] != ',' {
   872			max = min
   873		} else {
   874			s = s[1:]
   875			if s == "" {
   876				return
   877			}
   878			if s[0] == '}' {
   879				max = -1
   880			} else if max, s, ok1 = p.parseInt(s); !ok1 {
   881				return
   882			} else if max < 0 {
   883				// parseInt found too big a number
   884				min = -1
   885			}
   886		}
   887		if s == "" || s[0] != '}' {
   888			return
   889		}
   890		rest = s[1:]
   891		ok = true
   892		return
   893	}
   894	
   895	// parsePerlFlags parses a Perl flag setting or non-capturing group or both,
   896	// like (?i) or (?: or (?i:.  It removes the prefix from s and updates the parse state.
   897	// The caller must have ensured that s begins with "(?".
   898	func (p *parser) parsePerlFlags(s string) (rest string, err error) {
   899		t := s
   900	
   901		// Check for named captures, first introduced in Python's regexp library.
   902		// As usual, there are three slightly different syntaxes:
   903		//
   904		//   (?P<name>expr)   the original, introduced by Python
   905		//   (?<name>expr)    the .NET alteration, adopted by Perl 5.10
   906		//   (?'name'expr)    another .NET alteration, adopted by Perl 5.10
   907		//
   908		// Perl 5.10 gave in and implemented the Python version too,
   909		// but they claim that the last two are the preferred forms.
   910		// PCRE and languages based on it (specifically, PHP and Ruby)
   911		// support all three as well.  EcmaScript 4 uses only the Python form.
   912		//
   913		// In both the open source world (via Code Search) and the
   914		// Google source tree, (?P<expr>name) is the dominant form,
   915		// so that's the one we implement.  One is enough.
   916		if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
   917			// Pull out name.
   918			end := strings.IndexRune(t, '>')
   919			if end < 0 {
   920				if err = checkUTF8(t); err != nil {
   921					return "", err
   922				}
   923				return "", &Error{ErrInvalidNamedCapture, s}
   924			}
   925	
   926			capture := t[:end+1] // "(?P<name>"
   927			name := t[4:end]     // "name"
   928			if err = checkUTF8(name); err != nil {
   929				return "", err
   930			}
   931			if !isValidCaptureName(name) {
   932				return "", &Error{ErrInvalidNamedCapture, capture}
   933			}
   934	
   935			// Like ordinary capture, but named.
   936			p.numCap++
   937			re := p.op(opLeftParen)
   938			re.Cap = p.numCap
   939			re.Name = name
   940			return t[end+1:], nil
   941		}
   942	
   943		// Non-capturing group.  Might also twiddle Perl flags.
   944		var c rune
   945		t = t[2:] // skip (?
   946		flags := p.flags
   947		sign := +1
   948		sawFlag := false
   949	Loop:
   950		for t != "" {
   951			if c, t, err = nextRune(t); err != nil {
   952				return "", err
   953			}
   954			switch c {
   955			default:
   956				break Loop
   957	
   958			// Flags.
   959			case 'i':
   960				flags |= FoldCase
   961				sawFlag = true
   962			case 'm':
   963				flags &^= OneLine
   964				sawFlag = true
   965			case 's':
   966				flags |= DotNL
   967				sawFlag = true
   968			case 'U':
   969				flags |= NonGreedy
   970				sawFlag = true
   971	
   972			// Switch to negation.
   973			case '-':
   974				if sign < 0 {
   975					break Loop
   976				}
   977				sign = -1
   978				// Invert flags so that | above turn into &^ and vice versa.
   979				// We'll invert flags again before using it below.
   980				flags = ^flags
   981				sawFlag = false
   982	
   983			// End of flags, starting group or not.
   984			case ':', ')':
   985				if sign < 0 {
   986					if !sawFlag {
   987						break Loop
   988					}
   989					flags = ^flags
   990				}
   991				if c == ':' {
   992					// Open new group
   993					p.op(opLeftParen)
   994				}
   995				p.flags = flags
   996				return t, nil
   997			}
   998		}
   999	
  1000		return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
  1001	}
  1002	
  1003	// isValidCaptureName reports whether name
  1004	// is a valid capture name: [A-Za-z0-9_]+.
  1005	// PCRE limits names to 32 bytes.
  1006	// Python rejects names starting with digits.
  1007	// We don't enforce either of those.
  1008	func isValidCaptureName(name string) bool {
  1009		if name == "" {
  1010			return false
  1011		}
  1012		for _, c := range name {
  1013			if c != '_' && !isalnum(c) {
  1014				return false
  1015			}
  1016		}
  1017		return true
  1018	}
  1019	
  1020	// parseInt parses a decimal integer.
  1021	func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
  1022		if s == "" || s[0] < '0' || '9' < s[0] {
  1023			return
  1024		}
  1025		// Disallow leading zeros.
  1026		if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
  1027			return
  1028		}
  1029		t := s
  1030		for s != "" && '0' <= s[0] && s[0] <= '9' {
  1031			s = s[1:]
  1032		}
  1033		rest = s
  1034		ok = true
  1035		// Have digits, compute value.
  1036		t = t[:len(t)-len(s)]
  1037		for i := 0; i < len(t); i++ {
  1038			// Avoid overflow.
  1039			if n >= 1e8 {
  1040				n = -1
  1041				break
  1042			}
  1043			n = n*10 + int(t[i]) - '0'
  1044		}
  1045		return
  1046	}
  1047	
  1048	// can this be represented as a character class?
  1049	// single-rune literal string, char class, ., and .|\n.
  1050	func isCharClass(re *Regexp) bool {
  1051		return re.Op == OpLiteral && len(re.Rune) == 1 ||
  1052			re.Op == OpCharClass ||
  1053			re.Op == OpAnyCharNotNL ||
  1054			re.Op == OpAnyChar
  1055	}
  1056	
  1057	// does re match r?
  1058	func matchRune(re *Regexp, r rune) bool {
  1059		switch re.Op {
  1060		case OpLiteral:
  1061			return len(re.Rune) == 1 && re.Rune[0] == r
  1062		case OpCharClass:
  1063			for i := 0; i < len(re.Rune); i += 2 {
  1064				if re.Rune[i] <= r && r <= re.Rune[i+1] {
  1065					return true
  1066				}
  1067			}
  1068			return false
  1069		case OpAnyCharNotNL:
  1070			return r != '\n'
  1071		case OpAnyChar:
  1072			return true
  1073		}
  1074		return false
  1075	}
  1076	
  1077	// parseVerticalBar handles a | in the input.
  1078	func (p *parser) parseVerticalBar() error {
  1079		p.concat()
  1080	
  1081		// The concatenation we just parsed is on top of the stack.
  1082		// If it sits above an opVerticalBar, swap it below
  1083		// (things below an opVerticalBar become an alternation).
  1084		// Otherwise, push a new vertical bar.
  1085		if !p.swapVerticalBar() {
  1086			p.op(opVerticalBar)
  1087		}
  1088	
  1089		return nil
  1090	}
  1091	
  1092	// mergeCharClass makes dst = dst|src.
  1093	// The caller must ensure that dst.Op >= src.Op,
  1094	// to reduce the amount of copying.
  1095	func mergeCharClass(dst, src *Regexp) {
  1096		switch dst.Op {
  1097		case OpAnyChar:
  1098			// src doesn't add anything.
  1099		case OpAnyCharNotNL:
  1100			// src might add \n
  1101			if matchRune(src, '\n') {
  1102				dst.Op = OpAnyChar
  1103			}
  1104		case OpCharClass:
  1105			// src is simpler, so either literal or char class
  1106			if src.Op == OpLiteral {
  1107				dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
  1108			} else {
  1109				dst.Rune = appendClass(dst.Rune, src.Rune)
  1110			}
  1111		case OpLiteral:
  1112			// both literal
  1113			if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
  1114				break
  1115			}
  1116			dst.Op = OpCharClass
  1117			dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
  1118			dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
  1119		}
  1120	}
  1121	
  1122	// If the top of the stack is an element followed by an opVerticalBar
  1123	// swapVerticalBar swaps the two and returns true.
  1124	// Otherwise it returns false.
  1125	func (p *parser) swapVerticalBar() bool {
  1126		// If above and below vertical bar are literal or char class,
  1127		// can merge into a single char class.
  1128		n := len(p.stack)
  1129		if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
  1130			re1 := p.stack[n-1]
  1131			re3 := p.stack[n-3]
  1132			// Make re3 the more complex of the two.
  1133			if re1.Op > re3.Op {
  1134				re1, re3 = re3, re1
  1135				p.stack[n-3] = re3
  1136			}
  1137			mergeCharClass(re3, re1)
  1138			p.reuse(re1)
  1139			p.stack = p.stack[:n-1]
  1140			return true
  1141		}
  1142	
  1143		if n >= 2 {
  1144			re1 := p.stack[n-1]
  1145			re2 := p.stack[n-2]
  1146			if re2.Op == opVerticalBar {
  1147				if n >= 3 {
  1148					// Now out of reach.
  1149					// Clean opportunistically.
  1150					cleanAlt(p.stack[n-3])
  1151				}
  1152				p.stack[n-2] = re1
  1153				p.stack[n-1] = re2
  1154				return true
  1155			}
  1156		}
  1157		return false
  1158	}
  1159	
  1160	// parseRightParen handles a ) in the input.
  1161	func (p *parser) parseRightParen() error {
  1162		p.concat()
  1163		if p.swapVerticalBar() {
  1164			// pop vertical bar
  1165			p.stack = p.stack[:len(p.stack)-1]
  1166		}
  1167		p.alternate()
  1168	
  1169		n := len(p.stack)
  1170		if n < 2 {
  1171			return &Error{ErrInternalError, ""}
  1172		}
  1173		re1 := p.stack[n-1]
  1174		re2 := p.stack[n-2]
  1175		p.stack = p.stack[:n-2]
  1176		if re2.Op != opLeftParen {
  1177			return &Error{ErrMissingParen, p.wholeRegexp}
  1178		}
  1179		// Restore flags at time of paren.
  1180		p.flags = re2.Flags
  1181		if re2.Cap == 0 {
  1182			// Just for grouping.
  1183			p.push(re1)
  1184		} else {
  1185			re2.Op = OpCapture
  1186			re2.Sub = re2.Sub0[:1]
  1187			re2.Sub[0] = re1
  1188			p.push(re2)
  1189		}
  1190		return nil
  1191	}
  1192	
  1193	// parseEscape parses an escape sequence at the beginning of s
  1194	// and returns the rune.
  1195	func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
  1196		t := s[1:]
  1197		if t == "" {
  1198			return 0, "", &Error{ErrTrailingBackslash, ""}
  1199		}
  1200		c, t, err := nextRune(t)
  1201		if err != nil {
  1202			return 0, "", err
  1203		}
  1204	
  1205	Switch:
  1206		switch c {
  1207		default:
  1208			if c < utf8.RuneSelf && !isalnum(c) {
  1209				// Escaped non-word characters are always themselves.
  1210				// PCRE is not quite so rigorous: it accepts things like
  1211				// \q, but we don't.  We once rejected \_, but too many
  1212				// programs and people insist on using it, so allow \_.
  1213				return c, t, nil
  1214			}
  1215	
  1216		// Octal escapes.
  1217		case '1', '2', '3', '4', '5', '6', '7':
  1218			// Single non-zero digit is a backreference; not supported
  1219			if t == "" || t[0] < '0' || t[0] > '7' {
  1220				break
  1221			}
  1222			fallthrough
  1223		case '0':
  1224			// Consume up to three octal digits; already have one.
  1225			r = c - '0'
  1226			for i := 1; i < 3; i++ {
  1227				if t == "" || t[0] < '0' || t[0] > '7' {
  1228					break
  1229				}
  1230				r = r*8 + rune(t[0]) - '0'
  1231				t = t[1:]
  1232			}
  1233			return r, t, nil
  1234	
  1235		// Hexadecimal escapes.
  1236		case 'x':
  1237			if t == "" {
  1238				break
  1239			}
  1240			if c, t, err = nextRune(t); err != nil {
  1241				return 0, "", err
  1242			}
  1243			if c == '{' {
  1244				// Any number of digits in braces.
  1245				// Perl accepts any text at all; it ignores all text
  1246				// after the first non-hex digit.  We require only hex digits,
  1247				// and at least one.
  1248				nhex := 0
  1249				r = 0
  1250				for {
  1251					if t == "" {
  1252						break Switch
  1253					}
  1254					if c, t, err = nextRune(t); err != nil {
  1255						return 0, "", err
  1256					}
  1257					if c == '}' {
  1258						break
  1259					}
  1260					v := unhex(c)
  1261					if v < 0 {
  1262						break Switch
  1263					}
  1264					r = r*16 + v
  1265					if r > unicode.MaxRune {
  1266						break Switch
  1267					}
  1268					nhex++
  1269				}
  1270				if nhex == 0 {
  1271					break Switch
  1272				}
  1273				return r, t, nil
  1274			}
  1275	
  1276			// Easy case: two hex digits.
  1277			x := unhex(c)
  1278			if c, t, err = nextRune(t); err != nil {
  1279				return 0, "", err
  1280			}
  1281			y := unhex(c)
  1282			if x < 0 || y < 0 {
  1283				break
  1284			}
  1285			return x*16 + y, t, nil
  1286	
  1287		// C escapes.  There is no case 'b', to avoid misparsing
  1288		// the Perl word-boundary \b as the C backspace \b
  1289		// when in POSIX mode.  In Perl, /\b/ means word-boundary
  1290		// but /[\b]/ means backspace.  We don't support that.
  1291		// If you want a backspace, embed a literal backspace
  1292		// character or use \x08.
  1293		case 'a':
  1294			return '\a', t, err
  1295		case 'f':
  1296			return '\f', t, err
  1297		case 'n':
  1298			return '\n', t, err
  1299		case 'r':
  1300			return '\r', t, err
  1301		case 't':
  1302			return '\t', t, err
  1303		case 'v':
  1304			return '\v', t, err
  1305		}
  1306		return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
  1307	}
  1308	
  1309	// parseClassChar parses a character class character at the beginning of s
  1310	// and returns it.
  1311	func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
  1312		if s == "" {
  1313			return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
  1314		}
  1315	
  1316		// Allow regular escape sequences even though
  1317		// many need not be escaped in this context.
  1318		if s[0] == '\\' {
  1319			return p.parseEscape(s)
  1320		}
  1321	
  1322		return nextRune(s)
  1323	}
  1324	
  1325	type charGroup struct {
  1326		sign  int
  1327		class []rune
  1328	}
  1329	
  1330	// parsePerlClassEscape parses a leading Perl character class escape like \d
  1331	// from the beginning of s.  If one is present, it appends the characters to r
  1332	// and returns the new slice r and the remainder of the string.
  1333	func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
  1334		if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
  1335			return
  1336		}
  1337		g := perlGroup[s[0:2]]
  1338		if g.sign == 0 {
  1339			return
  1340		}
  1341		return p.appendGroup(r, g), s[2:]
  1342	}
  1343	
  1344	// parseNamedClass parses a leading POSIX named character class like [:alnum:]
  1345	// from the beginning of s.  If one is present, it appends the characters to r
  1346	// and returns the new slice r and the remainder of the string.
  1347	func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
  1348		if len(s) < 2 || s[0] != '[' || s[1] != ':' {
  1349			return
  1350		}
  1351	
  1352		i := strings.Index(s[2:], ":]")
  1353		if i < 0 {
  1354			return
  1355		}
  1356		i += 2
  1357		name, s := s[0:i+2], s[i+2:]
  1358		g := posixGroup[name]
  1359		if g.sign == 0 {
  1360			return nil, "", &Error{ErrInvalidCharRange, name}
  1361		}
  1362		return p.appendGroup(r, g), s, nil
  1363	}
  1364	
  1365	func (p *parser) appendGroup(r []rune, g charGroup) []rune {
  1366		if p.flags&FoldCase == 0 {
  1367			if g.sign < 0 {
  1368				r = appendNegatedClass(r, g.class)
  1369			} else {
  1370				r = appendClass(r, g.class)
  1371			}
  1372		} else {
  1373			tmp := p.tmpClass[:0]
  1374			tmp = appendFoldedClass(tmp, g.class)
  1375			p.tmpClass = tmp
  1376			tmp = cleanClass(&p.tmpClass)
  1377			if g.sign < 0 {
  1378				r = appendNegatedClass(r, tmp)
  1379			} else {
  1380				r = appendClass(r, tmp)
  1381			}
  1382		}
  1383		return r
  1384	}
  1385	
  1386	var anyTable = &unicode.RangeTable{
  1387		R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
  1388		R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
  1389	}
  1390	
  1391	// unicodeTable returns the unicode.RangeTable identified by name
  1392	// and the table of additional fold-equivalent code points.
  1393	func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
  1394		// Special case: "Any" means any.
  1395		if name == "Any" {
  1396			return anyTable, anyTable
  1397		}
  1398		if t := unicode.Categories[name]; t != nil {
  1399			return t, unicode.FoldCategory[name]
  1400		}
  1401		if t := unicode.Scripts[name]; t != nil {
  1402			return t, unicode.FoldScript[name]
  1403		}
  1404		return nil, nil
  1405	}
  1406	
  1407	// parseUnicodeClass parses a leading Unicode character class like \p{Han}
  1408	// from the beginning of s.  If one is present, it appends the characters to r
  1409	// and returns the new slice r and the remainder of the string.
  1410	func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
  1411		if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
  1412			return
  1413		}
  1414	
  1415		// Committed to parse or return error.
  1416		sign := +1
  1417		if s[1] == 'P' {
  1418			sign = -1
  1419		}
  1420		t := s[2:]
  1421		c, t, err := nextRune(t)
  1422		if err != nil {
  1423			return
  1424		}
  1425		var seq, name string
  1426		if c != '{' {
  1427			// Single-letter name.
  1428			seq = s[:len(s)-len(t)]
  1429			name = seq[2:]
  1430		} else {
  1431			// Name is in braces.
  1432			end := strings.IndexRune(s, '}')
  1433			if end < 0 {
  1434				if err = checkUTF8(s); err != nil {
  1435					return
  1436				}
  1437				return nil, "", &Error{ErrInvalidCharRange, s}
  1438			}
  1439			seq, t = s[:end+1], s[end+1:]
  1440			name = s[3:end]
  1441			if err = checkUTF8(name); err != nil {
  1442				return
  1443			}
  1444		}
  1445	
  1446		// Group can have leading negation too.  \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
  1447		if name != "" && name[0] == '^' {
  1448			sign = -sign
  1449			name = name[1:]
  1450		}
  1451	
  1452		tab, fold := unicodeTable(name)
  1453		if tab == nil {
  1454			return nil, "", &Error{ErrInvalidCharRange, seq}
  1455		}
  1456	
  1457		if p.flags&FoldCase == 0 || fold == nil {
  1458			if sign > 0 {
  1459				r = appendTable(r, tab)
  1460			} else {
  1461				r = appendNegatedTable(r, tab)
  1462			}
  1463		} else {
  1464			// Merge and clean tab and fold in a temporary buffer.
  1465			// This is necessary for the negative case and just tidy
  1466			// for the positive case.
  1467			tmp := p.tmpClass[:0]
  1468			tmp = appendTable(tmp, tab)
  1469			tmp = appendTable(tmp, fold)
  1470			p.tmpClass = tmp
  1471			tmp = cleanClass(&p.tmpClass)
  1472			if sign > 0 {
  1473				r = appendClass(r, tmp)
  1474			} else {
  1475				r = appendNegatedClass(r, tmp)
  1476			}
  1477		}
  1478		return r, t, nil
  1479	}
  1480	
  1481	// parseClass parses a character class at the beginning of s
  1482	// and pushes it onto the parse stack.
  1483	func (p *parser) parseClass(s string) (rest string, err error) {
  1484		t := s[1:] // chop [
  1485		re := p.newRegexp(OpCharClass)
  1486		re.Flags = p.flags
  1487		re.Rune = re.Rune0[:0]
  1488	
  1489		sign := +1
  1490		if t != "" && t[0] == '^' {
  1491			sign = -1
  1492			t = t[1:]
  1493	
  1494			// If character class does not match \n, add it here,
  1495			// so that negation later will do the right thing.
  1496			if p.flags&ClassNL == 0 {
  1497				re.Rune = append(re.Rune, '\n', '\n')
  1498			}
  1499		}
  1500	
  1501		class := re.Rune
  1502		first := true // ] and - are okay as first char in class
  1503		for t == "" || t[0] != ']' || first {
  1504			// POSIX: - is only okay unescaped as first or last in class.
  1505			// Perl: - is okay anywhere.
  1506			if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
  1507				_, size := utf8.DecodeRuneInString(t[1:])
  1508				return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
  1509			}
  1510			first = false
  1511	
  1512			// Look for POSIX [:alnum:] etc.
  1513			if len(t) > 2 && t[0] == '[' && t[1] == ':' {
  1514				nclass, nt, err := p.parseNamedClass(t, class)
  1515				if err != nil {
  1516					return "", err
  1517				}
  1518				if nclass != nil {
  1519					class, t = nclass, nt
  1520					continue
  1521				}
  1522			}
  1523	
  1524			// Look for Unicode character group like \p{Han}.
  1525			nclass, nt, err := p.parseUnicodeClass(t, class)
  1526			if err != nil {
  1527				return "", err
  1528			}
  1529			if nclass != nil {
  1530				class, t = nclass, nt
  1531				continue
  1532			}
  1533	
  1534			// Look for Perl character class symbols (extension).
  1535			if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
  1536				class, t = nclass, nt
  1537				continue
  1538			}
  1539	
  1540			// Single character or simple range.
  1541			rng := t
  1542			var lo, hi rune
  1543			if lo, t, err = p.parseClassChar(t, s); err != nil {
  1544				return "", err
  1545			}
  1546			hi = lo
  1547			// [a-] means (a|-) so check for final ].
  1548			if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
  1549				t = t[1:]
  1550				if hi, t, err = p.parseClassChar(t, s); err != nil {
  1551					return "", err
  1552				}
  1553				if hi < lo {
  1554					rng = rng[:len(rng)-len(t)]
  1555					return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
  1556				}
  1557			}
  1558			if p.flags&FoldCase == 0 {
  1559				class = appendRange(class, lo, hi)
  1560			} else {
  1561				class = appendFoldedRange(class, lo, hi)
  1562			}
  1563		}
  1564		t = t[1:] // chop ]
  1565	
  1566		// Use &re.Rune instead of &class to avoid allocation.
  1567		re.Rune = class
  1568		class = cleanClass(&re.Rune)
  1569		if sign < 0 {
  1570			class = negateClass(class)
  1571		}
  1572		re.Rune = class
  1573		p.push(re)
  1574		return t, nil
  1575	}
  1576	
  1577	// cleanClass sorts the ranges (pairs of elements of r),
  1578	// merges them, and eliminates duplicates.
  1579	func cleanClass(rp *[]rune) []rune {
  1580	
  1581		// Sort by lo increasing, hi decreasing to break ties.
  1582		sort.Sort(ranges{rp})
  1583	
  1584		r := *rp
  1585		if len(r) < 2 {
  1586			return r
  1587		}
  1588	
  1589		// Merge abutting, overlapping.
  1590		w := 2 // write index
  1591		for i := 2; i < len(r); i += 2 {
  1592			lo, hi := r[i], r[i+1]
  1593			if lo <= r[w-1]+1 {
  1594				// merge with previous range
  1595				if hi > r[w-1] {
  1596					r[w-1] = hi
  1597				}
  1598				continue
  1599			}
  1600			// new disjoint range
  1601			r[w] = lo
  1602			r[w+1] = hi
  1603			w += 2
  1604		}
  1605	
  1606		return r[:w]
  1607	}
  1608	
  1609	// appendLiteral returns the result of appending the literal x to the class r.
  1610	func appendLiteral(r []rune, x rune, flags Flags) []rune {
  1611		if flags&FoldCase != 0 {
  1612			return appendFoldedRange(r, x, x)
  1613		}
  1614		return appendRange(r, x, x)
  1615	}
  1616	
  1617	// appendRange returns the result of appending the range lo-hi to the class r.
  1618	func appendRange(r []rune, lo, hi rune) []rune {
  1619		// Expand last range or next to last range if it overlaps or abuts.
  1620		// Checking two ranges helps when appending case-folded
  1621		// alphabets, so that one range can be expanding A-Z and the
  1622		// other expanding a-z.
  1623		n := len(r)
  1624		for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
  1625			if n >= i {
  1626				rlo, rhi := r[n-i], r[n-i+1]
  1627				if lo <= rhi+1 && rlo <= hi+1 {
  1628					if lo < rlo {
  1629						r[n-i] = lo
  1630					}
  1631					if hi > rhi {
  1632						r[n-i+1] = hi
  1633					}
  1634					return r
  1635				}
  1636			}
  1637		}
  1638	
  1639		return append(r, lo, hi)
  1640	}
  1641	
  1642	const (
  1643		// minimum and maximum runes involved in folding.
  1644		// checked during test.
  1645		minFold = 0x0041
  1646		maxFold = 0x1044f
  1647	)
  1648	
  1649	// appendFoldedRange returns the result of appending the range lo-hi
  1650	// and its case folding-equivalent runes to the class r.
  1651	func appendFoldedRange(r []rune, lo, hi rune) []rune {
  1652		// Optimizations.
  1653		if lo <= minFold && hi >= maxFold {
  1654			// Range is full: folding can't add more.
  1655			return appendRange(r, lo, hi)
  1656		}
  1657		if hi < minFold || lo > maxFold {
  1658			// Range is outside folding possibilities.
  1659			return appendRange(r, lo, hi)
  1660		}
  1661		if lo < minFold {
  1662			// [lo, minFold-1] needs no folding.
  1663			r = appendRange(r, lo, minFold-1)
  1664			lo = minFold
  1665		}
  1666		if hi > maxFold {
  1667			// [maxFold+1, hi] needs no folding.
  1668			r = appendRange(r, maxFold+1, hi)
  1669			hi = maxFold
  1670		}
  1671	
  1672		// Brute force.  Depend on appendRange to coalesce ranges on the fly.
  1673		for c := lo; c <= hi; c++ {
  1674			r = appendRange(r, c, c)
  1675			f := unicode.SimpleFold(c)
  1676			for f != c {
  1677				r = appendRange(r, f, f)
  1678				f = unicode.SimpleFold(f)
  1679			}
  1680		}
  1681		return r
  1682	}
  1683	
  1684	// appendClass returns the result of appending the class x to the class r.
  1685	// It assume x is clean.
  1686	func appendClass(r []rune, x []rune) []rune {
  1687		for i := 0; i < len(x); i += 2 {
  1688			r = appendRange(r, x[i], x[i+1])
  1689		}
  1690		return r
  1691	}
  1692	
  1693	// appendFolded returns the result of appending the case folding of the class x to the class r.
  1694	func appendFoldedClass(r []rune, x []rune) []rune {
  1695		for i := 0; i < len(x); i += 2 {
  1696			r = appendFoldedRange(r, x[i], x[i+1])
  1697		}
  1698		return r
  1699	}
  1700	
  1701	// appendNegatedClass returns the result of appending the negation of the class x to the class r.
  1702	// It assumes x is clean.
  1703	func appendNegatedClass(r []rune, x []rune) []rune {
  1704		nextLo := '\u0000'
  1705		for i := 0; i < len(x); i += 2 {
  1706			lo, hi := x[i], x[i+1]
  1707			if nextLo <= lo-1 {
  1708				r = appendRange(r, nextLo, lo-1)
  1709			}
  1710			nextLo = hi + 1
  1711		}
  1712		if nextLo <= unicode.MaxRune {
  1713			r = appendRange(r, nextLo, unicode.MaxRune)
  1714		}
  1715		return r
  1716	}
  1717	
  1718	// appendTable returns the result of appending x to the class r.
  1719	func appendTable(r []rune, x *unicode.RangeTable) []rune {
  1720		for _, xr := range x.R16 {
  1721			lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
  1722			if stride == 1 {
  1723				r = appendRange(r, lo, hi)
  1724				continue
  1725			}
  1726			for c := lo; c <= hi; c += stride {
  1727				r = appendRange(r, c, c)
  1728			}
  1729		}
  1730		for _, xr := range x.R32 {
  1731			lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
  1732			if stride == 1 {
  1733				r = appendRange(r, lo, hi)
  1734				continue
  1735			}
  1736			for c := lo; c <= hi; c += stride {
  1737				r = appendRange(r, c, c)
  1738			}
  1739		}
  1740		return r
  1741	}
  1742	
  1743	// appendNegatedTable returns the result of appending the negation of x to the class r.
  1744	func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
  1745		nextLo := '\u0000' // lo end of next class to add
  1746		for _, xr := range x.R16 {
  1747			lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
  1748			if stride == 1 {
  1749				if nextLo <= lo-1 {
  1750					r = appendRange(r, nextLo, lo-1)
  1751				}
  1752				nextLo = hi + 1
  1753				continue
  1754			}
  1755			for c := lo; c <= hi; c += stride {
  1756				if nextLo <= c-1 {
  1757					r = appendRange(r, nextLo, c-1)
  1758				}
  1759				nextLo = c + 1
  1760			}
  1761		}
  1762		for _, xr := range x.R32 {
  1763			lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
  1764			if stride == 1 {
  1765				if nextLo <= lo-1 {
  1766					r = appendRange(r, nextLo, lo-1)
  1767				}
  1768				nextLo = hi + 1
  1769				continue
  1770			}
  1771			for c := lo; c <= hi; c += stride {
  1772				if nextLo <= c-1 {
  1773					r = appendRange(r, nextLo, c-1)
  1774				}
  1775				nextLo = c + 1
  1776			}
  1777		}
  1778		if nextLo <= unicode.MaxRune {
  1779			r = appendRange(r, nextLo, unicode.MaxRune)
  1780		}
  1781		return r
  1782	}
  1783	
  1784	// negateClass overwrites r and returns r's negation.
  1785	// It assumes the class r is already clean.
  1786	func negateClass(r []rune) []rune {
  1787		nextLo := '\u0000' // lo end of next class to add
  1788		w := 0             // write index
  1789		for i := 0; i < len(r); i += 2 {
  1790			lo, hi := r[i], r[i+1]
  1791			if nextLo <= lo-1 {
  1792				r[w] = nextLo
  1793				r[w+1] = lo - 1
  1794				w += 2
  1795			}
  1796			nextLo = hi + 1
  1797		}
  1798		r = r[:w]
  1799		if nextLo <= unicode.MaxRune {
  1800			// It's possible for the negation to have one more
  1801			// range - this one - than the original class, so use append.
  1802			r = append(r, nextLo, unicode.MaxRune)
  1803		}
  1804		return r
  1805	}
  1806	
  1807	// ranges implements sort.Interface on a []rune.
  1808	// The choice of receiver type definition is strange
  1809	// but avoids an allocation since we already have
  1810	// a *[]rune.
  1811	type ranges struct {
  1812		p *[]rune
  1813	}
  1814	
  1815	func (ra ranges) Less(i, j int) bool {
  1816		p := *ra.p
  1817		i *= 2
  1818		j *= 2
  1819		return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
  1820	}
  1821	
  1822	func (ra ranges) Len() int {
  1823		return len(*ra.p) / 2
  1824	}
  1825	
  1826	func (ra ranges) Swap(i, j int) {
  1827		p := *ra.p
  1828		i *= 2
  1829		j *= 2
  1830		p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
  1831	}
  1832	
  1833	func checkUTF8(s string) error {
  1834		for s != "" {
  1835			rune, size := utf8.DecodeRuneInString(s)
  1836			if rune == utf8.RuneError && size == 1 {
  1837				return &Error{Code: ErrInvalidUTF8, Expr: s}
  1838			}
  1839			s = s[size:]
  1840		}
  1841		return nil
  1842	}
  1843	
  1844	func nextRune(s string) (c rune, t string, err error) {
  1845		c, size := utf8.DecodeRuneInString(s)
  1846		if c == utf8.RuneError && size == 1 {
  1847			return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
  1848		}
  1849		return c, s[size:], nil
  1850	}
  1851	
  1852	func isalnum(c rune) bool {
  1853		return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
  1854	}
  1855	
  1856	func unhex(c rune) rune {
  1857		if '0' <= c && c <= '9' {
  1858			return c - '0'
  1859		}
  1860		if 'a' <= c && c <= 'f' {
  1861			return c - 'a' + 10
  1862		}
  1863		if 'A' <= c && c <= 'F' {
  1864			return c - 'A' + 10
  1865		}
  1866		return -1
  1867	}