Source file src/pkg/regexp/syntax/parse.go
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package syntax parses regular expressions into parse trees and compiles
6 // parse trees into programs. Most clients of regular expressions will use
7 // the facilities of package regexp (such as Compile and Match) instead of
8 // this package.
9 package syntax
10
11 import (
12 "sort"
13 "strings"
14 "unicode"
15 "unicode/utf8"
16 )
17
18 // An Error describes a failure to parse a regular expression
19 // and gives the offending expression.
20 type Error struct {
21 Code ErrorCode
22 Expr string
23 }
24
25 func (e *Error) Error() string {
26 return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
27 }
28
29 // An ErrorCode describes a failure to parse a regular expression.
30 type ErrorCode string
31
32 const (
33 // Unexpected error
34 ErrInternalError ErrorCode = "regexp/syntax: internal error"
35
36 // Parse errors
37 ErrInvalidCharClass ErrorCode = "invalid character class"
38 ErrInvalidCharRange ErrorCode = "invalid character class range"
39 ErrInvalidEscape ErrorCode = "invalid escape sequence"
40 ErrInvalidNamedCapture ErrorCode = "invalid named capture"
41 ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
42 ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
43 ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
44 ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
45 ErrMissingBracket ErrorCode = "missing closing ]"
46 ErrMissingParen ErrorCode = "missing closing )"
47 ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
48 ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
49 )
50
51 func (e ErrorCode) String() string {
52 return string(e)
53 }
54
55 // Flags control the behavior of the parser and record information about regexp context.
56 type Flags uint16
57
58 const (
59 FoldCase Flags = 1 << iota // case-insensitive match
60 Literal // treat pattern as literal string
61 ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline
62 DotNL // allow . to match newline
63 OneLine // treat ^ and $ as only matching at beginning and end of text
64 NonGreedy // make repetition operators default to non-greedy
65 PerlX // allow Perl extensions
66 UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation
67 WasDollar // regexp OpEndText was $, not \z
68 Simple // regexp contains no counted repetition
69
70 MatchNL = ClassNL | DotNL
71
72 Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
73 POSIX Flags = 0 // POSIX syntax
74 )
75
76 // Pseudo-ops for parsing stack.
77 const (
78 opLeftParen = opPseudo + iota
79 opVerticalBar
80 )
81
82 type parser struct {
83 flags Flags // parse mode flags
84 stack []*Regexp // stack of parsed expressions
85 free *Regexp
86 numCap int // number of capturing groups seen
87 wholeRegexp string
88 tmpClass []rune // temporary char class work space
89 }
90
91 func (p *parser) newRegexp(op Op) *Regexp {
92 re := p.free
93 if re != nil {
94 p.free = re.Sub0[0]
95 *re = Regexp{}
96 } else {
97 re = new(Regexp)
98 }
99 re.Op = op
100 return re
101 }
102
103 func (p *parser) reuse(re *Regexp) {
104 re.Sub0[0] = p.free
105 p.free = re
106 }
107
108 // Parse stack manipulation.
109
110 // push pushes the regexp re onto the parse stack and returns the regexp.
111 func (p *parser) push(re *Regexp) *Regexp {
112 if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
113 // Single rune.
114 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
115 return nil
116 }
117 re.Op = OpLiteral
118 re.Rune = re.Rune[:1]
119 re.Flags = p.flags &^ FoldCase
120 } else if re.Op == OpCharClass && len(re.Rune) == 4 &&
121 re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] &&
122 unicode.SimpleFold(re.Rune[0]) == re.Rune[2] &&
123 unicode.SimpleFold(re.Rune[2]) == re.Rune[0] ||
124 re.Op == OpCharClass && len(re.Rune) == 2 &&
125 re.Rune[0]+1 == re.Rune[1] &&
126 unicode.SimpleFold(re.Rune[0]) == re.Rune[1] &&
127 unicode.SimpleFold(re.Rune[1]) == re.Rune[0] {
128 // Case-insensitive rune like [Aa] or [Δδ].
129 if p.maybeConcat(re.Rune[0], p.flags|FoldCase) {
130 return nil
131 }
132
133 // Rewrite as (case-insensitive) literal.
134 re.Op = OpLiteral
135 re.Rune = re.Rune[:1]
136 re.Flags = p.flags | FoldCase
137 } else {
138 // Incremental concatenation.
139 p.maybeConcat(-1, 0)
140 }
141
142 p.stack = append(p.stack, re)
143 return re
144 }
145
146 // maybeConcat implements incremental concatenation
147 // of literal runes into string nodes. The parser calls this
148 // before each push, so only the top fragment of the stack
149 // might need processing. Since this is called before a push,
150 // the topmost literal is no longer subject to operators like *
151 // (Otherwise ab* would turn into (ab)*.)
152 // If r >= 0 and there's a node left over, maybeConcat uses it
153 // to push r with the given flags.
154 // maybeConcat reports whether r was pushed.
155 func (p *parser) maybeConcat(r rune, flags Flags) bool {
156 n := len(p.stack)
157 if n < 2 {
158 return false
159 }
160
161 re1 := p.stack[n-1]
162 re2 := p.stack[n-2]
163 if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase {
164 return false
165 }
166
167 // Push re1 into re2.
168 re2.Rune = append(re2.Rune, re1.Rune...)
169
170 // Reuse re1 if possible.
171 if r >= 0 {
172 re1.Rune = re1.Rune0[:1]
173 re1.Rune[0] = r
174 re1.Flags = flags
175 return true
176 }
177
178 p.stack = p.stack[:n-1]
179 p.reuse(re1)
180 return false // did not push r
181 }
182
183 // newLiteral returns a new OpLiteral Regexp with the given flags
184 func (p *parser) newLiteral(r rune, flags Flags) *Regexp {
185 re := p.newRegexp(OpLiteral)
186 re.Flags = flags
187 if flags&FoldCase != 0 {
188 r = minFoldRune(r)
189 }
190 re.Rune0[0] = r
191 re.Rune = re.Rune0[:1]
192 return re
193 }
194
195 // minFoldRune returns the minimum rune fold-equivalent to r.
196 func minFoldRune(r rune) rune {
197 if r < minFold || r > maxFold {
198 return r
199 }
200 min := r
201 r0 := r
202 for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) {
203 if min > r {
204 min = r
205 }
206 }
207 return min
208 }
209
210 // literal pushes a literal regexp for the rune r on the stack
211 // and returns that regexp.
212 func (p *parser) literal(r rune) {
213 p.push(p.newLiteral(r, p.flags))
214 }
215
216 // op pushes a regexp with the given op onto the stack
217 // and returns that regexp.
218 func (p *parser) op(op Op) *Regexp {
219 re := p.newRegexp(op)
220 re.Flags = p.flags
221 return p.push(re)
222 }
223
224 // repeat replaces the top stack element with itself repeated according to op, min, max.
225 // before is the regexp suffix starting at the repetition operator.
226 // after is the regexp suffix following after the repetition operator.
227 // repeat returns an updated 'after' and an error, if any.
228 func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) {
229 flags := p.flags
230 if p.flags&PerlX != 0 {
231 if len(after) > 0 && after[0] == '?' {
232 after = after[1:]
233 flags ^= NonGreedy
234 }
235 if lastRepeat != "" {
236 // In Perl it is not allowed to stack repetition operators:
237 // a** is a syntax error, not a doubled star, and a++ means
238 // something else entirely, which we don't support!
239 return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]}
240 }
241 }
242 n := len(p.stack)
243 if n == 0 {
244 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
245 }
246 sub := p.stack[n-1]
247 if sub.Op >= opPseudo {
248 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]}
249 }
250 re := p.newRegexp(op)
251 re.Min = min
252 re.Max = max
253 re.Flags = flags
254 re.Sub = re.Sub0[:1]
255 re.Sub[0] = sub
256 p.stack[n-1] = re
257 return after, nil
258 }
259
260 // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
261 func (p *parser) concat() *Regexp {
262 p.maybeConcat(-1, 0)
263
264 // Scan down to find pseudo-operator | or (.
265 i := len(p.stack)
266 for i > 0 && p.stack[i-1].Op < opPseudo {
267 i--
268 }
269 subs := p.stack[i:]
270 p.stack = p.stack[:i]
271
272 // Empty concatenation is special case.
273 if len(subs) == 0 {
274 return p.push(p.newRegexp(OpEmptyMatch))
275 }
276
277 return p.push(p.collapse(subs, OpConcat))
278 }
279
280 // alternate replaces the top of the stack (above the topmost '(') with its alternation.
281 func (p *parser) alternate() *Regexp {
282 // Scan down to find pseudo-operator (.
283 // There are no | above (.
284 i := len(p.stack)
285 for i > 0 && p.stack[i-1].Op < opPseudo {
286 i--
287 }
288 subs := p.stack[i:]
289 p.stack = p.stack[:i]
290
291 // Make sure top class is clean.
292 // All the others already are (see swapVerticalBar).
293 if len(subs) > 0 {
294 cleanAlt(subs[len(subs)-1])
295 }
296
297 // Empty alternate is special case
298 // (shouldn't happen but easy to handle).
299 if len(subs) == 0 {
300 return p.push(p.newRegexp(OpNoMatch))
301 }
302
303 return p.push(p.collapse(subs, OpAlternate))
304 }
305
306 // cleanAlt cleans re for eventual inclusion in an alternation.
307 func cleanAlt(re *Regexp) {
308 switch re.Op {
309 case OpCharClass:
310 re.Rune = cleanClass(&re.Rune)
311 if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune {
312 re.Rune = nil
313 re.Op = OpAnyChar
314 return
315 }
316 if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune {
317 re.Rune = nil
318 re.Op = OpAnyCharNotNL
319 return
320 }
321 if cap(re.Rune)-len(re.Rune) > 100 {
322 // re.Rune will not grow any more.
323 // Make a copy or inline to reclaim storage.
324 re.Rune = append(re.Rune0[:0], re.Rune...)
325 }
326 }
327 }
328
329 // collapse returns the result of applying op to sub.
330 // If sub contains op nodes, they all get hoisted up
331 // so that there is never a concat of a concat or an
332 // alternate of an alternate.
333 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
334 if len(subs) == 1 {
335 return subs[0]
336 }
337 re := p.newRegexp(op)
338 re.Sub = re.Sub0[:0]
339 for _, sub := range subs {
340 if sub.Op == op {
341 re.Sub = append(re.Sub, sub.Sub...)
342 p.reuse(sub)
343 } else {
344 re.Sub = append(re.Sub, sub)
345 }
346 }
347 if op == OpAlternate {
348 re.Sub = p.factor(re.Sub, re.Flags)
349 if len(re.Sub) == 1 {
350 old := re
351 re = re.Sub[0]
352 p.reuse(old)
353 }
354 }
355 return re
356 }
357
358 // factor factors common prefixes from the alternation list sub.
359 // It returns a replacement list that reuses the same storage and
360 // frees (passes to p.reuse) any removed *Regexps.
361 //
362 // For example,
363 // ABC|ABD|AEF|BCX|BCY
364 // simplifies by literal prefix extraction to
365 // A(B(C|D)|EF)|BC(X|Y)
366 // which simplifies by character class introduction to
367 // A(B[CD]|EF)|BC[XY]
368 //
369 func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
370 if len(sub) < 2 {
371 return sub
372 }
373
374 // Round 1: Factor out common literal prefixes.
375 var str []rune
376 var strflags Flags
377 start := 0
378 out := sub[:0]
379 for i := 0; i <= len(sub); i++ {
380 // Invariant: the Regexps that were in sub[0:start] have been
381 // used or marked for reuse, and the slice space has been reused
382 // for out (len(out) <= start).
383 //
384 // Invariant: sub[start:i] consists of regexps that all begin
385 // with str as modified by strflags.
386 var istr []rune
387 var iflags Flags
388 if i < len(sub) {
389 istr, iflags = p.leadingString(sub[i])
390 if iflags == strflags {
391 same := 0
392 for same < len(str) && same < len(istr) && str[same] == istr[same] {
393 same++
394 }
395 if same > 0 {
396 // Matches at least one rune in current range.
397 // Keep going around.
398 str = str[:same]
399 continue
400 }
401 }
402 }
403
404 // Found end of a run with common leading literal string:
405 // sub[start:i] all begin with str[0:len(str)], but sub[i]
406 // does not even begin with str[0].
407 //
408 // Factor out common string and append factored expression to out.
409 if i == start {
410 // Nothing to do - run of length 0.
411 } else if i == start+1 {
412 // Just one: don't bother factoring.
413 out = append(out, sub[start])
414 } else {
415 // Construct factored form: prefix(suffix1|suffix2|...)
416 prefix := p.newRegexp(OpLiteral)
417 prefix.Flags = strflags
418 prefix.Rune = append(prefix.Rune[:0], str...)
419
420 for j := start; j < i; j++ {
421 sub[j] = p.removeLeadingString(sub[j], len(str))
422 }
423 suffix := p.collapse(sub[start:i], OpAlternate) // recurse
424
425 re := p.newRegexp(OpConcat)
426 re.Sub = append(re.Sub[:0], prefix, suffix)
427 out = append(out, re)
428 }
429
430 // Prepare for next iteration.
431 start = i
432 str = istr
433 strflags = iflags
434 }
435 sub = out
436
437 // Round 2: Factor out common complex prefixes,
438 // just the first piece of each concatenation,
439 // whatever it is. This is good enough a lot of the time.
440 start = 0
441 out = sub[:0]
442 var first *Regexp
443 for i := 0; i <= len(sub); i++ {
444 // Invariant: the Regexps that were in sub[0:start] have been
445 // used or marked for reuse, and the slice space has been reused
446 // for out (len(out) <= start).
447 //
448 // Invariant: sub[start:i] consists of regexps that all begin with ifirst.
449 var ifirst *Regexp
450 if i < len(sub) {
451 ifirst = p.leadingRegexp(sub[i])
452 if first != nil && first.Equal(ifirst) {
453 continue
454 }
455 }
456
457 // Found end of a run with common leading regexp:
458 // sub[start:i] all begin with first but sub[i] does not.
459 //
460 // Factor out common regexp and append factored expression to out.
461 if i == start {
462 // Nothing to do - run of length 0.
463 } else if i == start+1 {
464 // Just one: don't bother factoring.
465 out = append(out, sub[start])
466 } else {
467 // Construct factored form: prefix(suffix1|suffix2|...)
468 prefix := first
469 for j := start; j < i; j++ {
470 reuse := j != start // prefix came from sub[start]
471 sub[j] = p.removeLeadingRegexp(sub[j], reuse)
472 }
473 suffix := p.collapse(sub[start:i], OpAlternate) // recurse
474
475 re := p.newRegexp(OpConcat)
476 re.Sub = append(re.Sub[:0], prefix, suffix)
477 out = append(out, re)
478 }
479
480 // Prepare for next iteration.
481 start = i
482 first = ifirst
483 }
484 sub = out
485
486 // Round 3: Collapse runs of single literals into character classes.
487 start = 0
488 out = sub[:0]
489 for i := 0; i <= len(sub); i++ {
490 // Invariant: the Regexps that were in sub[0:start] have been
491 // used or marked for reuse, and the slice space has been reused
492 // for out (len(out) <= start).
493 //
494 // Invariant: sub[start:i] consists of regexps that are either
495 // literal runes or character classes.
496 if i < len(sub) && isCharClass(sub[i]) {
497 continue
498 }
499
500 // sub[i] is not a char or char class;
501 // emit char class for sub[start:i]...
502 if i == start {
503 // Nothing to do - run of length 0.
504 } else if i == start+1 {
505 out = append(out, sub[start])
506 } else {
507 // Make new char class.
508 // Start with most complex regexp in sub[start].
509 max := start
510 for j := start + 1; j < i; j++ {
511 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
512 max = j
513 }
514 }
515 sub[start], sub[max] = sub[max], sub[start]
516
517 for j := start + 1; j < i; j++ {
518 mergeCharClass(sub[start], sub[j])
519 p.reuse(sub[j])
520 }
521 cleanAlt(sub[start])
522 out = append(out, sub[start])
523 }
524
525 // ... and then emit sub[i].
526 if i < len(sub) {
527 out = append(out, sub[i])
528 }
529 start = i + 1
530 }
531 sub = out
532
533 // Round 4: Collapse runs of empty matches into a single empty match.
534 start = 0
535 out = sub[:0]
536 for i := range sub {
537 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
538 continue
539 }
540 out = append(out, sub[i])
541 }
542 sub = out
543
544 return sub
545 }
546
547 // leadingString returns the leading literal string that re begins with.
548 // The string refers to storage in re or its children.
549 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) {
550 if re.Op == OpConcat && len(re.Sub) > 0 {
551 re = re.Sub[0]
552 }
553 if re.Op != OpLiteral {
554 return nil, 0
555 }
556 return re.Rune, re.Flags & FoldCase
557 }
558
559 // removeLeadingString removes the first n leading runes
560 // from the beginning of re. It returns the replacement for re.
561 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
562 if re.Op == OpConcat && len(re.Sub) > 0 {
563 // Removing a leading string in a concatenation
564 // might simplify the concatenation.
565 sub := re.Sub[0]
566 sub = p.removeLeadingString(sub, n)
567 re.Sub[0] = sub
568 if sub.Op == OpEmptyMatch {
569 p.reuse(sub)
570 switch len(re.Sub) {
571 case 0, 1:
572 // Impossible but handle.
573 re.Op = OpEmptyMatch
574 re.Sub = nil
575 case 2:
576 old := re
577 re = re.Sub[1]
578 p.reuse(old)
579 default:
580 copy(re.Sub, re.Sub[1:])
581 re.Sub = re.Sub[:len(re.Sub)-1]
582 }
583 }
584 return re
585 }
586
587 if re.Op == OpLiteral {
588 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])]
589 if len(re.Rune) == 0 {
590 re.Op = OpEmptyMatch
591 }
592 }
593 return re
594 }
595
596 // leadingRegexp returns the leading regexp that re begins with.
597 // The regexp refers to storage in re or its children.
598 func (p *parser) leadingRegexp(re *Regexp) *Regexp {
599 if re.Op == OpEmptyMatch {
600 return nil
601 }
602 if re.Op == OpConcat && len(re.Sub) > 0 {
603 sub := re.Sub[0]
604 if sub.Op == OpEmptyMatch {
605 return nil
606 }
607 return sub
608 }
609 return re
610 }
611
612 // removeLeadingRegexp removes the leading regexp in re.
613 // It returns the replacement for re.
614 // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
615 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
616 if re.Op == OpConcat && len(re.Sub) > 0 {
617 if reuse {
618 p.reuse(re.Sub[0])
619 }
620 re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
621 switch len(re.Sub) {
622 case 0:
623 re.Op = OpEmptyMatch
624 re.Sub = nil
625 case 1:
626 old := re
627 re = re.Sub[0]
628 p.reuse(old)
629 }
630 return re
631 }
632 if reuse {
633 p.reuse(re)
634 }
635 return p.newRegexp(OpEmptyMatch)
636 }
637
638 func literalRegexp(s string, flags Flags) *Regexp {
639 re := &Regexp{Op: OpLiteral}
640 re.Flags = flags
641 re.Rune = re.Rune0[:0] // use local storage for small strings
642 for _, c := range s {
643 if len(re.Rune) >= cap(re.Rune) {
644 // string is too long to fit in Rune0. let Go handle it
645 re.Rune = []rune(s)
646 break
647 }
648 re.Rune = append(re.Rune, c)
649 }
650 return re
651 }
652
653 // Parsing.
654
655 // Parse parses a regular expression string s, controlled by the specified
656 // Flags, and returns a regular expression parse tree. The syntax is
657 // described in the top-level comment for package regexp.
658 func Parse(s string, flags Flags) (*Regexp, error) {
659 if flags&Literal != 0 {
660 // Trivial parser for literal string.
661 if err := checkUTF8(s); err != nil {
662 return nil, err
663 }
664 return literalRegexp(s, flags), nil
665 }
666
667 // Otherwise, must do real work.
668 var (
669 p parser
670 err error
671 c rune
672 op Op
673 lastRepeat string
674 min, max int
675 )
676 p.flags = flags
677 p.wholeRegexp = s
678 t := s
679 for t != "" {
680 repeat := ""
681 BigSwitch:
682 switch t[0] {
683 default:
684 if c, t, err = nextRune(t); err != nil {
685 return nil, err
686 }
687 p.literal(c)
688
689 case '(':
690 if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' {
691 // Flag changes and non-capturing groups.
692 if t, err = p.parsePerlFlags(t); err != nil {
693 return nil, err
694 }
695 break
696 }
697 p.numCap++
698 p.op(opLeftParen).Cap = p.numCap
699 t = t[1:]
700 case '|':
701 if err = p.parseVerticalBar(); err != nil {
702 return nil, err
703 }
704 t = t[1:]
705 case ')':
706 if err = p.parseRightParen(); err != nil {
707 return nil, err
708 }
709 t = t[1:]
710 case '^':
711 if p.flags&OneLine != 0 {
712 p.op(OpBeginText)
713 } else {
714 p.op(OpBeginLine)
715 }
716 t = t[1:]
717 case '$':
718 if p.flags&OneLine != 0 {
719 p.op(OpEndText).Flags |= WasDollar
720 } else {
721 p.op(OpEndLine)
722 }
723 t = t[1:]
724 case '.':
725 if p.flags&DotNL != 0 {
726 p.op(OpAnyChar)
727 } else {
728 p.op(OpAnyCharNotNL)
729 }
730 t = t[1:]
731 case '[':
732 if t, err = p.parseClass(t); err != nil {
733 return nil, err
734 }
735 case '*', '+', '?':
736 before := t
737 switch t[0] {
738 case '*':
739 op = OpStar
740 case '+':
741 op = OpPlus
742 case '?':
743 op = OpQuest
744 }
745 after := t[1:]
746 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
747 return nil, err
748 }
749 repeat = before
750 t = after
751 case '{':
752 op = OpRepeat
753 before := t
754 min, max, after, ok := p.parseRepeat(t)
755 if !ok {
756 // If the repeat cannot be parsed, { is a literal.
757 p.literal('{')
758 t = t[1:]
759 break
760 }
761 if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
762 // Numbers were too big, or max is present and min > max.
763 return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]}
764 }
765 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil {
766 return nil, err
767 }
768 repeat = before
769 t = after
770 case '\\':
771 if p.flags&PerlX != 0 && len(t) >= 2 {
772 switch t[1] {
773 case 'A':
774 p.op(OpBeginText)
775 t = t[2:]
776 break BigSwitch
777 case 'b':
778 p.op(OpWordBoundary)
779 t = t[2:]
780 break BigSwitch
781 case 'B':
782 p.op(OpNoWordBoundary)
783 t = t[2:]
784 break BigSwitch
785 case 'C':
786 // any byte; not supported
787 return nil, &Error{ErrInvalidEscape, t[:2]}
788 case 'Q':
789 // \Q ... \E: the ... is always literals
790 var lit string
791 if i := strings.Index(t, `\E`); i < 0 {
792 lit = t[2:]
793 t = ""
794 } else {
795 lit = t[2:i]
796 t = t[i+2:]
797 }
798 p.push(literalRegexp(lit, p.flags))
799 break BigSwitch
800 case 'z':
801 p.op(OpEndText)
802 t = t[2:]
803 break BigSwitch
804 }
805 }
806
807 re := p.newRegexp(OpCharClass)
808 re.Flags = p.flags
809
810 // Look for Unicode character group like \p{Han}
811 if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') {
812 r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0])
813 if err != nil {
814 return nil, err
815 }
816 if r != nil {
817 re.Rune = r
818 t = rest
819 p.push(re)
820 break BigSwitch
821 }
822 }
823
824 // Perl character class escape.
825 if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil {
826 re.Rune = r
827 t = rest
828 p.push(re)
829 break BigSwitch
830 }
831 p.reuse(re)
832
833 // Ordinary single-character escape.
834 if c, t, err = p.parseEscape(t); err != nil {
835 return nil, err
836 }
837 p.literal(c)
838 }
839 lastRepeat = repeat
840 }
841
842 p.concat()
843 if p.swapVerticalBar() {
844 // pop vertical bar
845 p.stack = p.stack[:len(p.stack)-1]
846 }
847 p.alternate()
848
849 n := len(p.stack)
850 if n != 1 {
851 return nil, &Error{ErrMissingParen, s}
852 }
853 return p.stack[0], nil
854 }
855
856 // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
857 // If s is not of that form, it returns ok == false.
858 // If s has the right form but the values are too big, it returns min == -1, ok == true.
859 func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
860 if s == "" || s[0] != '{' {
861 return
862 }
863 s = s[1:]
864 var ok1 bool
865 if min, s, ok1 = p.parseInt(s); !ok1 {
866 return
867 }
868 if s == "" {
869 return
870 }
871 if s[0] != ',' {
872 max = min
873 } else {
874 s = s[1:]
875 if s == "" {
876 return
877 }
878 if s[0] == '}' {
879 max = -1
880 } else if max, s, ok1 = p.parseInt(s); !ok1 {
881 return
882 } else if max < 0 {
883 // parseInt found too big a number
884 min = -1
885 }
886 }
887 if s == "" || s[0] != '}' {
888 return
889 }
890 rest = s[1:]
891 ok = true
892 return
893 }
894
895 // parsePerlFlags parses a Perl flag setting or non-capturing group or both,
896 // like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state.
897 // The caller must have ensured that s begins with "(?".
898 func (p *parser) parsePerlFlags(s string) (rest string, err error) {
899 t := s
900
901 // Check for named captures, first introduced in Python's regexp library.
902 // As usual, there are three slightly different syntaxes:
903 //
904 // (?P<name>expr) the original, introduced by Python
905 // (?<name>expr) the .NET alteration, adopted by Perl 5.10
906 // (?'name'expr) another .NET alteration, adopted by Perl 5.10
907 //
908 // Perl 5.10 gave in and implemented the Python version too,
909 // but they claim that the last two are the preferred forms.
910 // PCRE and languages based on it (specifically, PHP and Ruby)
911 // support all three as well. EcmaScript 4 uses only the Python form.
912 //
913 // In both the open source world (via Code Search) and the
914 // Google source tree, (?P<expr>name) is the dominant form,
915 // so that's the one we implement. One is enough.
916 if len(t) > 4 && t[2] == 'P' && t[3] == '<' {
917 // Pull out name.
918 end := strings.IndexRune(t, '>')
919 if end < 0 {
920 if err = checkUTF8(t); err != nil {
921 return "", err
922 }
923 return "", &Error{ErrInvalidNamedCapture, s}
924 }
925
926 capture := t[:end+1] // "(?P<name>"
927 name := t[4:end] // "name"
928 if err = checkUTF8(name); err != nil {
929 return "", err
930 }
931 if !isValidCaptureName(name) {
932 return "", &Error{ErrInvalidNamedCapture, capture}
933 }
934
935 // Like ordinary capture, but named.
936 p.numCap++
937 re := p.op(opLeftParen)
938 re.Cap = p.numCap
939 re.Name = name
940 return t[end+1:], nil
941 }
942
943 // Non-capturing group. Might also twiddle Perl flags.
944 var c rune
945 t = t[2:] // skip (?
946 flags := p.flags
947 sign := +1
948 sawFlag := false
949 Loop:
950 for t != "" {
951 if c, t, err = nextRune(t); err != nil {
952 return "", err
953 }
954 switch c {
955 default:
956 break Loop
957
958 // Flags.
959 case 'i':
960 flags |= FoldCase
961 sawFlag = true
962 case 'm':
963 flags &^= OneLine
964 sawFlag = true
965 case 's':
966 flags |= DotNL
967 sawFlag = true
968 case 'U':
969 flags |= NonGreedy
970 sawFlag = true
971
972 // Switch to negation.
973 case '-':
974 if sign < 0 {
975 break Loop
976 }
977 sign = -1
978 // Invert flags so that | above turn into &^ and vice versa.
979 // We'll invert flags again before using it below.
980 flags = ^flags
981 sawFlag = false
982
983 // End of flags, starting group or not.
984 case ':', ')':
985 if sign < 0 {
986 if !sawFlag {
987 break Loop
988 }
989 flags = ^flags
990 }
991 if c == ':' {
992 // Open new group
993 p.op(opLeftParen)
994 }
995 p.flags = flags
996 return t, nil
997 }
998 }
999
1000 return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]}
1001 }
1002
1003 // isValidCaptureName reports whether name
1004 // is a valid capture name: [A-Za-z0-9_]+.
1005 // PCRE limits names to 32 bytes.
1006 // Python rejects names starting with digits.
1007 // We don't enforce either of those.
1008 func isValidCaptureName(name string) bool {
1009 if name == "" {
1010 return false
1011 }
1012 for _, c := range name {
1013 if c != '_' && !isalnum(c) {
1014 return false
1015 }
1016 }
1017 return true
1018 }
1019
1020 // parseInt parses a decimal integer.
1021 func (p *parser) parseInt(s string) (n int, rest string, ok bool) {
1022 if s == "" || s[0] < '0' || '9' < s[0] {
1023 return
1024 }
1025 // Disallow leading zeros.
1026 if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
1027 return
1028 }
1029 t := s
1030 for s != "" && '0' <= s[0] && s[0] <= '9' {
1031 s = s[1:]
1032 }
1033 rest = s
1034 ok = true
1035 // Have digits, compute value.
1036 t = t[:len(t)-len(s)]
1037 for i := 0; i < len(t); i++ {
1038 // Avoid overflow.
1039 if n >= 1e8 {
1040 n = -1
1041 break
1042 }
1043 n = n*10 + int(t[i]) - '0'
1044 }
1045 return
1046 }
1047
1048 // can this be represented as a character class?
1049 // single-rune literal string, char class, ., and .|\n.
1050 func isCharClass(re *Regexp) bool {
1051 return re.Op == OpLiteral && len(re.Rune) == 1 ||
1052 re.Op == OpCharClass ||
1053 re.Op == OpAnyCharNotNL ||
1054 re.Op == OpAnyChar
1055 }
1056
1057 // does re match r?
1058 func matchRune(re *Regexp, r rune) bool {
1059 switch re.Op {
1060 case OpLiteral:
1061 return len(re.Rune) == 1 && re.Rune[0] == r
1062 case OpCharClass:
1063 for i := 0; i < len(re.Rune); i += 2 {
1064 if re.Rune[i] <= r && r <= re.Rune[i+1] {
1065 return true
1066 }
1067 }
1068 return false
1069 case OpAnyCharNotNL:
1070 return r != '\n'
1071 case OpAnyChar:
1072 return true
1073 }
1074 return false
1075 }
1076
1077 // parseVerticalBar handles a | in the input.
1078 func (p *parser) parseVerticalBar() error {
1079 p.concat()
1080
1081 // The concatenation we just parsed is on top of the stack.
1082 // If it sits above an opVerticalBar, swap it below
1083 // (things below an opVerticalBar become an alternation).
1084 // Otherwise, push a new vertical bar.
1085 if !p.swapVerticalBar() {
1086 p.op(opVerticalBar)
1087 }
1088
1089 return nil
1090 }
1091
1092 // mergeCharClass makes dst = dst|src.
1093 // The caller must ensure that dst.Op >= src.Op,
1094 // to reduce the amount of copying.
1095 func mergeCharClass(dst, src *Regexp) {
1096 switch dst.Op {
1097 case OpAnyChar:
1098 // src doesn't add anything.
1099 case OpAnyCharNotNL:
1100 // src might add \n
1101 if matchRune(src, '\n') {
1102 dst.Op = OpAnyChar
1103 }
1104 case OpCharClass:
1105 // src is simpler, so either literal or char class
1106 if src.Op == OpLiteral {
1107 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1108 } else {
1109 dst.Rune = appendClass(dst.Rune, src.Rune)
1110 }
1111 case OpLiteral:
1112 // both literal
1113 if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags {
1114 break
1115 }
1116 dst.Op = OpCharClass
1117 dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags)
1118 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags)
1119 }
1120 }
1121
1122 // If the top of the stack is an element followed by an opVerticalBar
1123 // swapVerticalBar swaps the two and returns true.
1124 // Otherwise it returns false.
1125 func (p *parser) swapVerticalBar() bool {
1126 // If above and below vertical bar are literal or char class,
1127 // can merge into a single char class.
1128 n := len(p.stack)
1129 if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) {
1130 re1 := p.stack[n-1]
1131 re3 := p.stack[n-3]
1132 // Make re3 the more complex of the two.
1133 if re1.Op > re3.Op {
1134 re1, re3 = re3, re1
1135 p.stack[n-3] = re3
1136 }
1137 mergeCharClass(re3, re1)
1138 p.reuse(re1)
1139 p.stack = p.stack[:n-1]
1140 return true
1141 }
1142
1143 if n >= 2 {
1144 re1 := p.stack[n-1]
1145 re2 := p.stack[n-2]
1146 if re2.Op == opVerticalBar {
1147 if n >= 3 {
1148 // Now out of reach.
1149 // Clean opportunistically.
1150 cleanAlt(p.stack[n-3])
1151 }
1152 p.stack[n-2] = re1
1153 p.stack[n-1] = re2
1154 return true
1155 }
1156 }
1157 return false
1158 }
1159
1160 // parseRightParen handles a ) in the input.
1161 func (p *parser) parseRightParen() error {
1162 p.concat()
1163 if p.swapVerticalBar() {
1164 // pop vertical bar
1165 p.stack = p.stack[:len(p.stack)-1]
1166 }
1167 p.alternate()
1168
1169 n := len(p.stack)
1170 if n < 2 {
1171 return &Error{ErrInternalError, ""}
1172 }
1173 re1 := p.stack[n-1]
1174 re2 := p.stack[n-2]
1175 p.stack = p.stack[:n-2]
1176 if re2.Op != opLeftParen {
1177 return &Error{ErrMissingParen, p.wholeRegexp}
1178 }
1179 // Restore flags at time of paren.
1180 p.flags = re2.Flags
1181 if re2.Cap == 0 {
1182 // Just for grouping.
1183 p.push(re1)
1184 } else {
1185 re2.Op = OpCapture
1186 re2.Sub = re2.Sub0[:1]
1187 re2.Sub[0] = re1
1188 p.push(re2)
1189 }
1190 return nil
1191 }
1192
1193 // parseEscape parses an escape sequence at the beginning of s
1194 // and returns the rune.
1195 func (p *parser) parseEscape(s string) (r rune, rest string, err error) {
1196 t := s[1:]
1197 if t == "" {
1198 return 0, "", &Error{ErrTrailingBackslash, ""}
1199 }
1200 c, t, err := nextRune(t)
1201 if err != nil {
1202 return 0, "", err
1203 }
1204
1205 Switch:
1206 switch c {
1207 default:
1208 if c < utf8.RuneSelf && !isalnum(c) {
1209 // Escaped non-word characters are always themselves.
1210 // PCRE is not quite so rigorous: it accepts things like
1211 // \q, but we don't. We once rejected \_, but too many
1212 // programs and people insist on using it, so allow \_.
1213 return c, t, nil
1214 }
1215
1216 // Octal escapes.
1217 case '1', '2', '3', '4', '5', '6', '7':
1218 // Single non-zero digit is a backreference; not supported
1219 if t == "" || t[0] < '0' || t[0] > '7' {
1220 break
1221 }
1222 fallthrough
1223 case '0':
1224 // Consume up to three octal digits; already have one.
1225 r = c - '0'
1226 for i := 1; i < 3; i++ {
1227 if t == "" || t[0] < '0' || t[0] > '7' {
1228 break
1229 }
1230 r = r*8 + rune(t[0]) - '0'
1231 t = t[1:]
1232 }
1233 return r, t, nil
1234
1235 // Hexadecimal escapes.
1236 case 'x':
1237 if t == "" {
1238 break
1239 }
1240 if c, t, err = nextRune(t); err != nil {
1241 return 0, "", err
1242 }
1243 if c == '{' {
1244 // Any number of digits in braces.
1245 // Perl accepts any text at all; it ignores all text
1246 // after the first non-hex digit. We require only hex digits,
1247 // and at least one.
1248 nhex := 0
1249 r = 0
1250 for {
1251 if t == "" {
1252 break Switch
1253 }
1254 if c, t, err = nextRune(t); err != nil {
1255 return 0, "", err
1256 }
1257 if c == '}' {
1258 break
1259 }
1260 v := unhex(c)
1261 if v < 0 {
1262 break Switch
1263 }
1264 r = r*16 + v
1265 if r > unicode.MaxRune {
1266 break Switch
1267 }
1268 nhex++
1269 }
1270 if nhex == 0 {
1271 break Switch
1272 }
1273 return r, t, nil
1274 }
1275
1276 // Easy case: two hex digits.
1277 x := unhex(c)
1278 if c, t, err = nextRune(t); err != nil {
1279 return 0, "", err
1280 }
1281 y := unhex(c)
1282 if x < 0 || y < 0 {
1283 break
1284 }
1285 return x*16 + y, t, nil
1286
1287 // C escapes. There is no case 'b', to avoid misparsing
1288 // the Perl word-boundary \b as the C backspace \b
1289 // when in POSIX mode. In Perl, /\b/ means word-boundary
1290 // but /[\b]/ means backspace. We don't support that.
1291 // If you want a backspace, embed a literal backspace
1292 // character or use \x08.
1293 case 'a':
1294 return '\a', t, err
1295 case 'f':
1296 return '\f', t, err
1297 case 'n':
1298 return '\n', t, err
1299 case 'r':
1300 return '\r', t, err
1301 case 't':
1302 return '\t', t, err
1303 case 'v':
1304 return '\v', t, err
1305 }
1306 return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]}
1307 }
1308
1309 // parseClassChar parses a character class character at the beginning of s
1310 // and returns it.
1311 func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) {
1312 if s == "" {
1313 return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
1314 }
1315
1316 // Allow regular escape sequences even though
1317 // many need not be escaped in this context.
1318 if s[0] == '\\' {
1319 return p.parseEscape(s)
1320 }
1321
1322 return nextRune(s)
1323 }
1324
1325 type charGroup struct {
1326 sign int
1327 class []rune
1328 }
1329
1330 // parsePerlClassEscape parses a leading Perl character class escape like \d
1331 // from the beginning of s. If one is present, it appends the characters to r
1332 // and returns the new slice r and the remainder of the string.
1333 func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) {
1334 if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' {
1335 return
1336 }
1337 g := perlGroup[s[0:2]]
1338 if g.sign == 0 {
1339 return
1340 }
1341 return p.appendGroup(r, g), s[2:]
1342 }
1343
1344 // parseNamedClass parses a leading POSIX named character class like [:alnum:]
1345 // from the beginning of s. If one is present, it appends the characters to r
1346 // and returns the new slice r and the remainder of the string.
1347 func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) {
1348 if len(s) < 2 || s[0] != '[' || s[1] != ':' {
1349 return
1350 }
1351
1352 i := strings.Index(s[2:], ":]")
1353 if i < 0 {
1354 return
1355 }
1356 i += 2
1357 name, s := s[0:i+2], s[i+2:]
1358 g := posixGroup[name]
1359 if g.sign == 0 {
1360 return nil, "", &Error{ErrInvalidCharRange, name}
1361 }
1362 return p.appendGroup(r, g), s, nil
1363 }
1364
1365 func (p *parser) appendGroup(r []rune, g charGroup) []rune {
1366 if p.flags&FoldCase == 0 {
1367 if g.sign < 0 {
1368 r = appendNegatedClass(r, g.class)
1369 } else {
1370 r = appendClass(r, g.class)
1371 }
1372 } else {
1373 tmp := p.tmpClass[:0]
1374 tmp = appendFoldedClass(tmp, g.class)
1375 p.tmpClass = tmp
1376 tmp = cleanClass(&p.tmpClass)
1377 if g.sign < 0 {
1378 r = appendNegatedClass(r, tmp)
1379 } else {
1380 r = appendClass(r, tmp)
1381 }
1382 }
1383 return r
1384 }
1385
1386 var anyTable = &unicode.RangeTable{
1387 R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
1388 R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
1389 }
1390
1391 // unicodeTable returns the unicode.RangeTable identified by name
1392 // and the table of additional fold-equivalent code points.
1393 func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
1394 // Special case: "Any" means any.
1395 if name == "Any" {
1396 return anyTable, anyTable
1397 }
1398 if t := unicode.Categories[name]; t != nil {
1399 return t, unicode.FoldCategory[name]
1400 }
1401 if t := unicode.Scripts[name]; t != nil {
1402 return t, unicode.FoldScript[name]
1403 }
1404 return nil, nil
1405 }
1406
1407 // parseUnicodeClass parses a leading Unicode character class like \p{Han}
1408 // from the beginning of s. If one is present, it appends the characters to r
1409 // and returns the new slice r and the remainder of the string.
1410 func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) {
1411 if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' {
1412 return
1413 }
1414
1415 // Committed to parse or return error.
1416 sign := +1
1417 if s[1] == 'P' {
1418 sign = -1
1419 }
1420 t := s[2:]
1421 c, t, err := nextRune(t)
1422 if err != nil {
1423 return
1424 }
1425 var seq, name string
1426 if c != '{' {
1427 // Single-letter name.
1428 seq = s[:len(s)-len(t)]
1429 name = seq[2:]
1430 } else {
1431 // Name is in braces.
1432 end := strings.IndexRune(s, '}')
1433 if end < 0 {
1434 if err = checkUTF8(s); err != nil {
1435 return
1436 }
1437 return nil, "", &Error{ErrInvalidCharRange, s}
1438 }
1439 seq, t = s[:end+1], s[end+1:]
1440 name = s[3:end]
1441 if err = checkUTF8(name); err != nil {
1442 return
1443 }
1444 }
1445
1446 // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}.
1447 if name != "" && name[0] == '^' {
1448 sign = -sign
1449 name = name[1:]
1450 }
1451
1452 tab, fold := unicodeTable(name)
1453 if tab == nil {
1454 return nil, "", &Error{ErrInvalidCharRange, seq}
1455 }
1456
1457 if p.flags&FoldCase == 0 || fold == nil {
1458 if sign > 0 {
1459 r = appendTable(r, tab)
1460 } else {
1461 r = appendNegatedTable(r, tab)
1462 }
1463 } else {
1464 // Merge and clean tab and fold in a temporary buffer.
1465 // This is necessary for the negative case and just tidy
1466 // for the positive case.
1467 tmp := p.tmpClass[:0]
1468 tmp = appendTable(tmp, tab)
1469 tmp = appendTable(tmp, fold)
1470 p.tmpClass = tmp
1471 tmp = cleanClass(&p.tmpClass)
1472 if sign > 0 {
1473 r = appendClass(r, tmp)
1474 } else {
1475 r = appendNegatedClass(r, tmp)
1476 }
1477 }
1478 return r, t, nil
1479 }
1480
1481 // parseClass parses a character class at the beginning of s
1482 // and pushes it onto the parse stack.
1483 func (p *parser) parseClass(s string) (rest string, err error) {
1484 t := s[1:] // chop [
1485 re := p.newRegexp(OpCharClass)
1486 re.Flags = p.flags
1487 re.Rune = re.Rune0[:0]
1488
1489 sign := +1
1490 if t != "" && t[0] == '^' {
1491 sign = -1
1492 t = t[1:]
1493
1494 // If character class does not match \n, add it here,
1495 // so that negation later will do the right thing.
1496 if p.flags&ClassNL == 0 {
1497 re.Rune = append(re.Rune, '\n', '\n')
1498 }
1499 }
1500
1501 class := re.Rune
1502 first := true // ] and - are okay as first char in class
1503 for t == "" || t[0] != ']' || first {
1504 // POSIX: - is only okay unescaped as first or last in class.
1505 // Perl: - is okay anywhere.
1506 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
1507 _, size := utf8.DecodeRuneInString(t[1:])
1508 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
1509 }
1510 first = false
1511
1512 // Look for POSIX [:alnum:] etc.
1513 if len(t) > 2 && t[0] == '[' && t[1] == ':' {
1514 nclass, nt, err := p.parseNamedClass(t, class)
1515 if err != nil {
1516 return "", err
1517 }
1518 if nclass != nil {
1519 class, t = nclass, nt
1520 continue
1521 }
1522 }
1523
1524 // Look for Unicode character group like \p{Han}.
1525 nclass, nt, err := p.parseUnicodeClass(t, class)
1526 if err != nil {
1527 return "", err
1528 }
1529 if nclass != nil {
1530 class, t = nclass, nt
1531 continue
1532 }
1533
1534 // Look for Perl character class symbols (extension).
1535 if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil {
1536 class, t = nclass, nt
1537 continue
1538 }
1539
1540 // Single character or simple range.
1541 rng := t
1542 var lo, hi rune
1543 if lo, t, err = p.parseClassChar(t, s); err != nil {
1544 return "", err
1545 }
1546 hi = lo
1547 // [a-] means (a|-) so check for final ].
1548 if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
1549 t = t[1:]
1550 if hi, t, err = p.parseClassChar(t, s); err != nil {
1551 return "", err
1552 }
1553 if hi < lo {
1554 rng = rng[:len(rng)-len(t)]
1555 return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
1556 }
1557 }
1558 if p.flags&FoldCase == 0 {
1559 class = appendRange(class, lo, hi)
1560 } else {
1561 class = appendFoldedRange(class, lo, hi)
1562 }
1563 }
1564 t = t[1:] // chop ]
1565
1566 // Use &re.Rune instead of &class to avoid allocation.
1567 re.Rune = class
1568 class = cleanClass(&re.Rune)
1569 if sign < 0 {
1570 class = negateClass(class)
1571 }
1572 re.Rune = class
1573 p.push(re)
1574 return t, nil
1575 }
1576
1577 // cleanClass sorts the ranges (pairs of elements of r),
1578 // merges them, and eliminates duplicates.
1579 func cleanClass(rp *[]rune) []rune {
1580
1581 // Sort by lo increasing, hi decreasing to break ties.
1582 sort.Sort(ranges{rp})
1583
1584 r := *rp
1585 if len(r) < 2 {
1586 return r
1587 }
1588
1589 // Merge abutting, overlapping.
1590 w := 2 // write index
1591 for i := 2; i < len(r); i += 2 {
1592 lo, hi := r[i], r[i+1]
1593 if lo <= r[w-1]+1 {
1594 // merge with previous range
1595 if hi > r[w-1] {
1596 r[w-1] = hi
1597 }
1598 continue
1599 }
1600 // new disjoint range
1601 r[w] = lo
1602 r[w+1] = hi
1603 w += 2
1604 }
1605
1606 return r[:w]
1607 }
1608
1609 // appendLiteral returns the result of appending the literal x to the class r.
1610 func appendLiteral(r []rune, x rune, flags Flags) []rune {
1611 if flags&FoldCase != 0 {
1612 return appendFoldedRange(r, x, x)
1613 }
1614 return appendRange(r, x, x)
1615 }
1616
1617 // appendRange returns the result of appending the range lo-hi to the class r.
1618 func appendRange(r []rune, lo, hi rune) []rune {
1619 // Expand last range or next to last range if it overlaps or abuts.
1620 // Checking two ranges helps when appending case-folded
1621 // alphabets, so that one range can be expanding A-Z and the
1622 // other expanding a-z.
1623 n := len(r)
1624 for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
1625 if n >= i {
1626 rlo, rhi := r[n-i], r[n-i+1]
1627 if lo <= rhi+1 && rlo <= hi+1 {
1628 if lo < rlo {
1629 r[n-i] = lo
1630 }
1631 if hi > rhi {
1632 r[n-i+1] = hi
1633 }
1634 return r
1635 }
1636 }
1637 }
1638
1639 return append(r, lo, hi)
1640 }
1641
1642 const (
1643 // minimum and maximum runes involved in folding.
1644 // checked during test.
1645 minFold = 0x0041
1646 maxFold = 0x1044f
1647 )
1648
1649 // appendFoldedRange returns the result of appending the range lo-hi
1650 // and its case folding-equivalent runes to the class r.
1651 func appendFoldedRange(r []rune, lo, hi rune) []rune {
1652 // Optimizations.
1653 if lo <= minFold && hi >= maxFold {
1654 // Range is full: folding can't add more.
1655 return appendRange(r, lo, hi)
1656 }
1657 if hi < minFold || lo > maxFold {
1658 // Range is outside folding possibilities.
1659 return appendRange(r, lo, hi)
1660 }
1661 if lo < minFold {
1662 // [lo, minFold-1] needs no folding.
1663 r = appendRange(r, lo, minFold-1)
1664 lo = minFold
1665 }
1666 if hi > maxFold {
1667 // [maxFold+1, hi] needs no folding.
1668 r = appendRange(r, maxFold+1, hi)
1669 hi = maxFold
1670 }
1671
1672 // Brute force. Depend on appendRange to coalesce ranges on the fly.
1673 for c := lo; c <= hi; c++ {
1674 r = appendRange(r, c, c)
1675 f := unicode.SimpleFold(c)
1676 for f != c {
1677 r = appendRange(r, f, f)
1678 f = unicode.SimpleFold(f)
1679 }
1680 }
1681 return r
1682 }
1683
1684 // appendClass returns the result of appending the class x to the class r.
1685 // It assume x is clean.
1686 func appendClass(r []rune, x []rune) []rune {
1687 for i := 0; i < len(x); i += 2 {
1688 r = appendRange(r, x[i], x[i+1])
1689 }
1690 return r
1691 }
1692
1693 // appendFolded returns the result of appending the case folding of the class x to the class r.
1694 func appendFoldedClass(r []rune, x []rune) []rune {
1695 for i := 0; i < len(x); i += 2 {
1696 r = appendFoldedRange(r, x[i], x[i+1])
1697 }
1698 return r
1699 }
1700
1701 // appendNegatedClass returns the result of appending the negation of the class x to the class r.
1702 // It assumes x is clean.
1703 func appendNegatedClass(r []rune, x []rune) []rune {
1704 nextLo := '\u0000'
1705 for i := 0; i < len(x); i += 2 {
1706 lo, hi := x[i], x[i+1]
1707 if nextLo <= lo-1 {
1708 r = appendRange(r, nextLo, lo-1)
1709 }
1710 nextLo = hi + 1
1711 }
1712 if nextLo <= unicode.MaxRune {
1713 r = appendRange(r, nextLo, unicode.MaxRune)
1714 }
1715 return r
1716 }
1717
1718 // appendTable returns the result of appending x to the class r.
1719 func appendTable(r []rune, x *unicode.RangeTable) []rune {
1720 for _, xr := range x.R16 {
1721 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1722 if stride == 1 {
1723 r = appendRange(r, lo, hi)
1724 continue
1725 }
1726 for c := lo; c <= hi; c += stride {
1727 r = appendRange(r, c, c)
1728 }
1729 }
1730 for _, xr := range x.R32 {
1731 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1732 if stride == 1 {
1733 r = appendRange(r, lo, hi)
1734 continue
1735 }
1736 for c := lo; c <= hi; c += stride {
1737 r = appendRange(r, c, c)
1738 }
1739 }
1740 return r
1741 }
1742
1743 // appendNegatedTable returns the result of appending the negation of x to the class r.
1744 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune {
1745 nextLo := '\u0000' // lo end of next class to add
1746 for _, xr := range x.R16 {
1747 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1748 if stride == 1 {
1749 if nextLo <= lo-1 {
1750 r = appendRange(r, nextLo, lo-1)
1751 }
1752 nextLo = hi + 1
1753 continue
1754 }
1755 for c := lo; c <= hi; c += stride {
1756 if nextLo <= c-1 {
1757 r = appendRange(r, nextLo, c-1)
1758 }
1759 nextLo = c + 1
1760 }
1761 }
1762 for _, xr := range x.R32 {
1763 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride)
1764 if stride == 1 {
1765 if nextLo <= lo-1 {
1766 r = appendRange(r, nextLo, lo-1)
1767 }
1768 nextLo = hi + 1
1769 continue
1770 }
1771 for c := lo; c <= hi; c += stride {
1772 if nextLo <= c-1 {
1773 r = appendRange(r, nextLo, c-1)
1774 }
1775 nextLo = c + 1
1776 }
1777 }
1778 if nextLo <= unicode.MaxRune {
1779 r = appendRange(r, nextLo, unicode.MaxRune)
1780 }
1781 return r
1782 }
1783
1784 // negateClass overwrites r and returns r's negation.
1785 // It assumes the class r is already clean.
1786 func negateClass(r []rune) []rune {
1787 nextLo := '\u0000' // lo end of next class to add
1788 w := 0 // write index
1789 for i := 0; i < len(r); i += 2 {
1790 lo, hi := r[i], r[i+1]
1791 if nextLo <= lo-1 {
1792 r[w] = nextLo
1793 r[w+1] = lo - 1
1794 w += 2
1795 }
1796 nextLo = hi + 1
1797 }
1798 r = r[:w]
1799 if nextLo <= unicode.MaxRune {
1800 // It's possible for the negation to have one more
1801 // range - this one - than the original class, so use append.
1802 r = append(r, nextLo, unicode.MaxRune)
1803 }
1804 return r
1805 }
1806
1807 // ranges implements sort.Interface on a []rune.
1808 // The choice of receiver type definition is strange
1809 // but avoids an allocation since we already have
1810 // a *[]rune.
1811 type ranges struct {
1812 p *[]rune
1813 }
1814
1815 func (ra ranges) Less(i, j int) bool {
1816 p := *ra.p
1817 i *= 2
1818 j *= 2
1819 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
1820 }
1821
1822 func (ra ranges) Len() int {
1823 return len(*ra.p) / 2
1824 }
1825
1826 func (ra ranges) Swap(i, j int) {
1827 p := *ra.p
1828 i *= 2
1829 j *= 2
1830 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
1831 }
1832
1833 func checkUTF8(s string) error {
1834 for s != "" {
1835 rune, size := utf8.DecodeRuneInString(s)
1836 if rune == utf8.RuneError && size == 1 {
1837 return &Error{Code: ErrInvalidUTF8, Expr: s}
1838 }
1839 s = s[size:]
1840 }
1841 return nil
1842 }
1843
1844 func nextRune(s string) (c rune, t string, err error) {
1845 c, size := utf8.DecodeRuneInString(s)
1846 if c == utf8.RuneError && size == 1 {
1847 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
1848 }
1849 return c, s[size:], nil
1850 }
1851
1852 func isalnum(c rune) bool {
1853 return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
1854 }
1855
1856 func unhex(c rune) rune {
1857 if '0' <= c && c <= '9' {
1858 return c - '0'
1859 }
1860 if 'a' <= c && c <= 'f' {
1861 return c - 'a' + 10
1862 }
1863 if 'A' <= c && c <= 'F' {
1864 return c - 'A' + 10
1865 }
1866 return -1
1867 }