Source file src/pkg/regexp/syntax/parse.go
1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package syntax parses regular expressions into parse trees and compiles 6 // parse trees into programs. Most clients of regular expressions will use 7 // the facilities of package regexp (such as Compile and Match) instead of 8 // this package. 9 package syntax 10 11 import ( 12 "sort" 13 "strings" 14 "unicode" 15 "unicode/utf8" 16 ) 17 18 // An Error describes a failure to parse a regular expression 19 // and gives the offending expression. 20 type Error struct { 21 Code ErrorCode 22 Expr string 23 } 24 25 func (e *Error) Error() string { 26 return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" 27 } 28 29 // An ErrorCode describes a failure to parse a regular expression. 30 type ErrorCode string 31 32 const ( 33 // Unexpected error 34 ErrInternalError ErrorCode = "regexp/syntax: internal error" 35 36 // Parse errors 37 ErrInvalidCharClass ErrorCode = "invalid character class" 38 ErrInvalidCharRange ErrorCode = "invalid character class range" 39 ErrInvalidEscape ErrorCode = "invalid escape sequence" 40 ErrInvalidNamedCapture ErrorCode = "invalid named capture" 41 ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" 42 ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" 43 ErrInvalidRepeatSize ErrorCode = "invalid repeat count" 44 ErrInvalidUTF8 ErrorCode = "invalid UTF-8" 45 ErrMissingBracket ErrorCode = "missing closing ]" 46 ErrMissingParen ErrorCode = "missing closing )" 47 ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" 48 ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" 49 ) 50 51 func (e ErrorCode) String() string { 52 return string(e) 53 } 54 55 // Flags control the behavior of the parser and record information about regexp context. 56 type Flags uint16 57 58 const ( 59 FoldCase Flags = 1 << iota // case-insensitive match 60 Literal // treat pattern as literal string 61 ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline 62 DotNL // allow . to match newline 63 OneLine // treat ^ and $ as only matching at beginning and end of text 64 NonGreedy // make repetition operators default to non-greedy 65 PerlX // allow Perl extensions 66 UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation 67 WasDollar // regexp OpEndText was $, not \z 68 Simple // regexp contains no counted repetition 69 70 MatchNL = ClassNL | DotNL 71 72 Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible 73 POSIX Flags = 0 // POSIX syntax 74 ) 75 76 // Pseudo-ops for parsing stack. 77 const ( 78 opLeftParen = opPseudo + iota 79 opVerticalBar 80 ) 81 82 type parser struct { 83 flags Flags // parse mode flags 84 stack []*Regexp // stack of parsed expressions 85 free *Regexp 86 numCap int // number of capturing groups seen 87 wholeRegexp string 88 tmpClass []rune // temporary char class work space 89 } 90 91 func (p *parser) newRegexp(op Op) *Regexp { 92 re := p.free 93 if re != nil { 94 p.free = re.Sub0[0] 95 *re = Regexp{} 96 } else { 97 re = new(Regexp) 98 } 99 re.Op = op 100 return re 101 } 102 103 func (p *parser) reuse(re *Regexp) { 104 re.Sub0[0] = p.free 105 p.free = re 106 } 107 108 // Parse stack manipulation. 109 110 // push pushes the regexp re onto the parse stack and returns the regexp. 111 func (p *parser) push(re *Regexp) *Regexp { 112 if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { 113 // Single rune. 114 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { 115 return nil 116 } 117 re.Op = OpLiteral 118 re.Rune = re.Rune[:1] 119 re.Flags = p.flags &^ FoldCase 120 } else if re.Op == OpCharClass && len(re.Rune) == 4 && 121 re.Rune[0] == re.Rune[1] && re.Rune[2] == re.Rune[3] && 122 unicode.SimpleFold(re.Rune[0]) == re.Rune[2] && 123 unicode.SimpleFold(re.Rune[2]) == re.Rune[0] || 124 re.Op == OpCharClass && len(re.Rune) == 2 && 125 re.Rune[0]+1 == re.Rune[1] && 126 unicode.SimpleFold(re.Rune[0]) == re.Rune[1] && 127 unicode.SimpleFold(re.Rune[1]) == re.Rune[0] { 128 // Case-insensitive rune like [Aa] or [Δδ]. 129 if p.maybeConcat(re.Rune[0], p.flags|FoldCase) { 130 return nil 131 } 132 133 // Rewrite as (case-insensitive) literal. 134 re.Op = OpLiteral 135 re.Rune = re.Rune[:1] 136 re.Flags = p.flags | FoldCase 137 } else { 138 // Incremental concatenation. 139 p.maybeConcat(-1, 0) 140 } 141 142 p.stack = append(p.stack, re) 143 return re 144 } 145 146 // maybeConcat implements incremental concatenation 147 // of literal runes into string nodes. The parser calls this 148 // before each push, so only the top fragment of the stack 149 // might need processing. Since this is called before a push, 150 // the topmost literal is no longer subject to operators like * 151 // (Otherwise ab* would turn into (ab)*.) 152 // If r >= 0 and there's a node left over, maybeConcat uses it 153 // to push r with the given flags. 154 // maybeConcat reports whether r was pushed. 155 func (p *parser) maybeConcat(r rune, flags Flags) bool { 156 n := len(p.stack) 157 if n < 2 { 158 return false 159 } 160 161 re1 := p.stack[n-1] 162 re2 := p.stack[n-2] 163 if re1.Op != OpLiteral || re2.Op != OpLiteral || re1.Flags&FoldCase != re2.Flags&FoldCase { 164 return false 165 } 166 167 // Push re1 into re2. 168 re2.Rune = append(re2.Rune, re1.Rune...) 169 170 // Reuse re1 if possible. 171 if r >= 0 { 172 re1.Rune = re1.Rune0[:1] 173 re1.Rune[0] = r 174 re1.Flags = flags 175 return true 176 } 177 178 p.stack = p.stack[:n-1] 179 p.reuse(re1) 180 return false // did not push r 181 } 182 183 // newLiteral returns a new OpLiteral Regexp with the given flags 184 func (p *parser) newLiteral(r rune, flags Flags) *Regexp { 185 re := p.newRegexp(OpLiteral) 186 re.Flags = flags 187 if flags&FoldCase != 0 { 188 r = minFoldRune(r) 189 } 190 re.Rune0[0] = r 191 re.Rune = re.Rune0[:1] 192 return re 193 } 194 195 // minFoldRune returns the minimum rune fold-equivalent to r. 196 func minFoldRune(r rune) rune { 197 if r < minFold || r > maxFold { 198 return r 199 } 200 min := r 201 r0 := r 202 for r = unicode.SimpleFold(r); r != r0; r = unicode.SimpleFold(r) { 203 if min > r { 204 min = r 205 } 206 } 207 return min 208 } 209 210 // literal pushes a literal regexp for the rune r on the stack 211 // and returns that regexp. 212 func (p *parser) literal(r rune) { 213 p.push(p.newLiteral(r, p.flags)) 214 } 215 216 // op pushes a regexp with the given op onto the stack 217 // and returns that regexp. 218 func (p *parser) op(op Op) *Regexp { 219 re := p.newRegexp(op) 220 re.Flags = p.flags 221 return p.push(re) 222 } 223 224 // repeat replaces the top stack element with itself repeated according to op, min, max. 225 // before is the regexp suffix starting at the repetition operator. 226 // after is the regexp suffix following after the repetition operator. 227 // repeat returns an updated 'after' and an error, if any. 228 func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) (string, error) { 229 flags := p.flags 230 if p.flags&PerlX != 0 { 231 if len(after) > 0 && after[0] == '?' { 232 after = after[1:] 233 flags ^= NonGreedy 234 } 235 if lastRepeat != "" { 236 // In Perl it is not allowed to stack repetition operators: 237 // a** is a syntax error, not a doubled star, and a++ means 238 // something else entirely, which we don't support! 239 return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(after)]} 240 } 241 } 242 n := len(p.stack) 243 if n == 0 { 244 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} 245 } 246 sub := p.stack[n-1] 247 if sub.Op >= opPseudo { 248 return "", &Error{ErrMissingRepeatArgument, before[:len(before)-len(after)]} 249 } 250 re := p.newRegexp(op) 251 re.Min = min 252 re.Max = max 253 re.Flags = flags 254 re.Sub = re.Sub0[:1] 255 re.Sub[0] = sub 256 p.stack[n-1] = re 257 return after, nil 258 } 259 260 // concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. 261 func (p *parser) concat() *Regexp { 262 p.maybeConcat(-1, 0) 263 264 // Scan down to find pseudo-operator | or (. 265 i := len(p.stack) 266 for i > 0 && p.stack[i-1].Op < opPseudo { 267 i-- 268 } 269 subs := p.stack[i:] 270 p.stack = p.stack[:i] 271 272 // Empty concatenation is special case. 273 if len(subs) == 0 { 274 return p.push(p.newRegexp(OpEmptyMatch)) 275 } 276 277 return p.push(p.collapse(subs, OpConcat)) 278 } 279 280 // alternate replaces the top of the stack (above the topmost '(') with its alternation. 281 func (p *parser) alternate() *Regexp { 282 // Scan down to find pseudo-operator (. 283 // There are no | above (. 284 i := len(p.stack) 285 for i > 0 && p.stack[i-1].Op < opPseudo { 286 i-- 287 } 288 subs := p.stack[i:] 289 p.stack = p.stack[:i] 290 291 // Make sure top class is clean. 292 // All the others already are (see swapVerticalBar). 293 if len(subs) > 0 { 294 cleanAlt(subs[len(subs)-1]) 295 } 296 297 // Empty alternate is special case 298 // (shouldn't happen but easy to handle). 299 if len(subs) == 0 { 300 return p.push(p.newRegexp(OpNoMatch)) 301 } 302 303 return p.push(p.collapse(subs, OpAlternate)) 304 } 305 306 // cleanAlt cleans re for eventual inclusion in an alternation. 307 func cleanAlt(re *Regexp) { 308 switch re.Op { 309 case OpCharClass: 310 re.Rune = cleanClass(&re.Rune) 311 if len(re.Rune) == 2 && re.Rune[0] == 0 && re.Rune[1] == unicode.MaxRune { 312 re.Rune = nil 313 re.Op = OpAnyChar 314 return 315 } 316 if len(re.Rune) == 4 && re.Rune[0] == 0 && re.Rune[1] == '\n'-1 && re.Rune[2] == '\n'+1 && re.Rune[3] == unicode.MaxRune { 317 re.Rune = nil 318 re.Op = OpAnyCharNotNL 319 return 320 } 321 if cap(re.Rune)-len(re.Rune) > 100 { 322 // re.Rune will not grow any more. 323 // Make a copy or inline to reclaim storage. 324 re.Rune = append(re.Rune0[:0], re.Rune...) 325 } 326 } 327 } 328 329 // collapse returns the result of applying op to sub. 330 // If sub contains op nodes, they all get hoisted up 331 // so that there is never a concat of a concat or an 332 // alternate of an alternate. 333 func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { 334 if len(subs) == 1 { 335 return subs[0] 336 } 337 re := p.newRegexp(op) 338 re.Sub = re.Sub0[:0] 339 for _, sub := range subs { 340 if sub.Op == op { 341 re.Sub = append(re.Sub, sub.Sub...) 342 p.reuse(sub) 343 } else { 344 re.Sub = append(re.Sub, sub) 345 } 346 } 347 if op == OpAlternate { 348 re.Sub = p.factor(re.Sub, re.Flags) 349 if len(re.Sub) == 1 { 350 old := re 351 re = re.Sub[0] 352 p.reuse(old) 353 } 354 } 355 return re 356 } 357 358 // factor factors common prefixes from the alternation list sub. 359 // It returns a replacement list that reuses the same storage and 360 // frees (passes to p.reuse) any removed *Regexps. 361 // 362 // For example, 363 // ABC|ABD|AEF|BCX|BCY 364 // simplifies by literal prefix extraction to 365 // A(B(C|D)|EF)|BC(X|Y) 366 // which simplifies by character class introduction to 367 // A(B[CD]|EF)|BC[XY] 368 // 369 func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { 370 if len(sub) < 2 { 371 return sub 372 } 373 374 // Round 1: Factor out common literal prefixes. 375 var str []rune 376 var strflags Flags 377 start := 0 378 out := sub[:0] 379 for i := 0; i <= len(sub); i++ { 380 // Invariant: the Regexps that were in sub[0:start] have been 381 // used or marked for reuse, and the slice space has been reused 382 // for out (len(out) <= start). 383 // 384 // Invariant: sub[start:i] consists of regexps that all begin 385 // with str as modified by strflags. 386 var istr []rune 387 var iflags Flags 388 if i < len(sub) { 389 istr, iflags = p.leadingString(sub[i]) 390 if iflags == strflags { 391 same := 0 392 for same < len(str) && same < len(istr) && str[same] == istr[same] { 393 same++ 394 } 395 if same > 0 { 396 // Matches at least one rune in current range. 397 // Keep going around. 398 str = str[:same] 399 continue 400 } 401 } 402 } 403 404 // Found end of a run with common leading literal string: 405 // sub[start:i] all begin with str[0:len(str)], but sub[i] 406 // does not even begin with str[0]. 407 // 408 // Factor out common string and append factored expression to out. 409 if i == start { 410 // Nothing to do - run of length 0. 411 } else if i == start+1 { 412 // Just one: don't bother factoring. 413 out = append(out, sub[start]) 414 } else { 415 // Construct factored form: prefix(suffix1|suffix2|...) 416 prefix := p.newRegexp(OpLiteral) 417 prefix.Flags = strflags 418 prefix.Rune = append(prefix.Rune[:0], str...) 419 420 for j := start; j < i; j++ { 421 sub[j] = p.removeLeadingString(sub[j], len(str)) 422 } 423 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 424 425 re := p.newRegexp(OpConcat) 426 re.Sub = append(re.Sub[:0], prefix, suffix) 427 out = append(out, re) 428 } 429 430 // Prepare for next iteration. 431 start = i 432 str = istr 433 strflags = iflags 434 } 435 sub = out 436 437 // Round 2: Factor out common complex prefixes, 438 // just the first piece of each concatenation, 439 // whatever it is. This is good enough a lot of the time. 440 start = 0 441 out = sub[:0] 442 var first *Regexp 443 for i := 0; i <= len(sub); i++ { 444 // Invariant: the Regexps that were in sub[0:start] have been 445 // used or marked for reuse, and the slice space has been reused 446 // for out (len(out) <= start). 447 // 448 // Invariant: sub[start:i] consists of regexps that all begin with ifirst. 449 var ifirst *Regexp 450 if i < len(sub) { 451 ifirst = p.leadingRegexp(sub[i]) 452 if first != nil && first.Equal(ifirst) { 453 continue 454 } 455 } 456 457 // Found end of a run with common leading regexp: 458 // sub[start:i] all begin with first but sub[i] does not. 459 // 460 // Factor out common regexp and append factored expression to out. 461 if i == start { 462 // Nothing to do - run of length 0. 463 } else if i == start+1 { 464 // Just one: don't bother factoring. 465 out = append(out, sub[start]) 466 } else { 467 // Construct factored form: prefix(suffix1|suffix2|...) 468 prefix := first 469 for j := start; j < i; j++ { 470 reuse := j != start // prefix came from sub[start] 471 sub[j] = p.removeLeadingRegexp(sub[j], reuse) 472 } 473 suffix := p.collapse(sub[start:i], OpAlternate) // recurse 474 475 re := p.newRegexp(OpConcat) 476 re.Sub = append(re.Sub[:0], prefix, suffix) 477 out = append(out, re) 478 } 479 480 // Prepare for next iteration. 481 start = i 482 first = ifirst 483 } 484 sub = out 485 486 // Round 3: Collapse runs of single literals into character classes. 487 start = 0 488 out = sub[:0] 489 for i := 0; i <= len(sub); i++ { 490 // Invariant: the Regexps that were in sub[0:start] have been 491 // used or marked for reuse, and the slice space has been reused 492 // for out (len(out) <= start). 493 // 494 // Invariant: sub[start:i] consists of regexps that are either 495 // literal runes or character classes. 496 if i < len(sub) && isCharClass(sub[i]) { 497 continue 498 } 499 500 // sub[i] is not a char or char class; 501 // emit char class for sub[start:i]... 502 if i == start { 503 // Nothing to do - run of length 0. 504 } else if i == start+1 { 505 out = append(out, sub[start]) 506 } else { 507 // Make new char class. 508 // Start with most complex regexp in sub[start]. 509 max := start 510 for j := start + 1; j < i; j++ { 511 if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) { 512 max = j 513 } 514 } 515 sub[start], sub[max] = sub[max], sub[start] 516 517 for j := start + 1; j < i; j++ { 518 mergeCharClass(sub[start], sub[j]) 519 p.reuse(sub[j]) 520 } 521 cleanAlt(sub[start]) 522 out = append(out, sub[start]) 523 } 524 525 // ... and then emit sub[i]. 526 if i < len(sub) { 527 out = append(out, sub[i]) 528 } 529 start = i + 1 530 } 531 sub = out 532 533 // Round 4: Collapse runs of empty matches into a single empty match. 534 start = 0 535 out = sub[:0] 536 for i := range sub { 537 if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch { 538 continue 539 } 540 out = append(out, sub[i]) 541 } 542 sub = out 543 544 return sub 545 } 546 547 // leadingString returns the leading literal string that re begins with. 548 // The string refers to storage in re or its children. 549 func (p *parser) leadingString(re *Regexp) ([]rune, Flags) { 550 if re.Op == OpConcat && len(re.Sub) > 0 { 551 re = re.Sub[0] 552 } 553 if re.Op != OpLiteral { 554 return nil, 0 555 } 556 return re.Rune, re.Flags & FoldCase 557 } 558 559 // removeLeadingString removes the first n leading runes 560 // from the beginning of re. It returns the replacement for re. 561 func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp { 562 if re.Op == OpConcat && len(re.Sub) > 0 { 563 // Removing a leading string in a concatenation 564 // might simplify the concatenation. 565 sub := re.Sub[0] 566 sub = p.removeLeadingString(sub, n) 567 re.Sub[0] = sub 568 if sub.Op == OpEmptyMatch { 569 p.reuse(sub) 570 switch len(re.Sub) { 571 case 0, 1: 572 // Impossible but handle. 573 re.Op = OpEmptyMatch 574 re.Sub = nil 575 case 2: 576 old := re 577 re = re.Sub[1] 578 p.reuse(old) 579 default: 580 copy(re.Sub, re.Sub[1:]) 581 re.Sub = re.Sub[:len(re.Sub)-1] 582 } 583 } 584 return re 585 } 586 587 if re.Op == OpLiteral { 588 re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] 589 if len(re.Rune) == 0 { 590 re.Op = OpEmptyMatch 591 } 592 } 593 return re 594 } 595 596 // leadingRegexp returns the leading regexp that re begins with. 597 // The regexp refers to storage in re or its children. 598 func (p *parser) leadingRegexp(re *Regexp) *Regexp { 599 if re.Op == OpEmptyMatch { 600 return nil 601 } 602 if re.Op == OpConcat && len(re.Sub) > 0 { 603 sub := re.Sub[0] 604 if sub.Op == OpEmptyMatch { 605 return nil 606 } 607 return sub 608 } 609 return re 610 } 611 612 // removeLeadingRegexp removes the leading regexp in re. 613 // It returns the replacement for re. 614 // If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse. 615 func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { 616 if re.Op == OpConcat && len(re.Sub) > 0 { 617 if reuse { 618 p.reuse(re.Sub[0]) 619 } 620 re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])] 621 switch len(re.Sub) { 622 case 0: 623 re.Op = OpEmptyMatch 624 re.Sub = nil 625 case 1: 626 old := re 627 re = re.Sub[0] 628 p.reuse(old) 629 } 630 return re 631 } 632 if reuse { 633 p.reuse(re) 634 } 635 return p.newRegexp(OpEmptyMatch) 636 } 637 638 func literalRegexp(s string, flags Flags) *Regexp { 639 re := &Regexp{Op: OpLiteral} 640 re.Flags = flags 641 re.Rune = re.Rune0[:0] // use local storage for small strings 642 for _, c := range s { 643 if len(re.Rune) >= cap(re.Rune) { 644 // string is too long to fit in Rune0. let Go handle it 645 re.Rune = []rune(s) 646 break 647 } 648 re.Rune = append(re.Rune, c) 649 } 650 return re 651 } 652 653 // Parsing. 654 655 // Parse parses a regular expression string s, controlled by the specified 656 // Flags, and returns a regular expression parse tree. The syntax is 657 // described in the top-level comment for package regexp. 658 func Parse(s string, flags Flags) (*Regexp, error) { 659 if flags&Literal != 0 { 660 // Trivial parser for literal string. 661 if err := checkUTF8(s); err != nil { 662 return nil, err 663 } 664 return literalRegexp(s, flags), nil 665 } 666 667 // Otherwise, must do real work. 668 var ( 669 p parser 670 err error 671 c rune 672 op Op 673 lastRepeat string 674 min, max int 675 ) 676 p.flags = flags 677 p.wholeRegexp = s 678 t := s 679 for t != "" { 680 repeat := "" 681 BigSwitch: 682 switch t[0] { 683 default: 684 if c, t, err = nextRune(t); err != nil { 685 return nil, err 686 } 687 p.literal(c) 688 689 case '(': 690 if p.flags&PerlX != 0 && len(t) >= 2 && t[1] == '?' { 691 // Flag changes and non-capturing groups. 692 if t, err = p.parsePerlFlags(t); err != nil { 693 return nil, err 694 } 695 break 696 } 697 p.numCap++ 698 p.op(opLeftParen).Cap = p.numCap 699 t = t[1:] 700 case '|': 701 if err = p.parseVerticalBar(); err != nil { 702 return nil, err 703 } 704 t = t[1:] 705 case ')': 706 if err = p.parseRightParen(); err != nil { 707 return nil, err 708 } 709 t = t[1:] 710 case '^': 711 if p.flags&OneLine != 0 { 712 p.op(OpBeginText) 713 } else { 714 p.op(OpBeginLine) 715 } 716 t = t[1:] 717 case '$': 718 if p.flags&OneLine != 0 { 719 p.op(OpEndText).Flags |= WasDollar 720 } else { 721 p.op(OpEndLine) 722 } 723 t = t[1:] 724 case '.': 725 if p.flags&DotNL != 0 { 726 p.op(OpAnyChar) 727 } else { 728 p.op(OpAnyCharNotNL) 729 } 730 t = t[1:] 731 case '[': 732 if t, err = p.parseClass(t); err != nil { 733 return nil, err 734 } 735 case '*', '+', '?': 736 before := t 737 switch t[0] { 738 case '*': 739 op = OpStar 740 case '+': 741 op = OpPlus 742 case '?': 743 op = OpQuest 744 } 745 after := t[1:] 746 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil { 747 return nil, err 748 } 749 repeat = before 750 t = after 751 case '{': 752 op = OpRepeat 753 before := t 754 min, max, after, ok := p.parseRepeat(t) 755 if !ok { 756 // If the repeat cannot be parsed, { is a literal. 757 p.literal('{') 758 t = t[1:] 759 break 760 } 761 if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max { 762 // Numbers were too big, or max is present and min > max. 763 return nil, &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} 764 } 765 if after, err = p.repeat(op, min, max, before, after, lastRepeat); err != nil { 766 return nil, err 767 } 768 repeat = before 769 t = after 770 case '\\': 771 if p.flags&PerlX != 0 && len(t) >= 2 { 772 switch t[1] { 773 case 'A': 774 p.op(OpBeginText) 775 t = t[2:] 776 break BigSwitch 777 case 'b': 778 p.op(OpWordBoundary) 779 t = t[2:] 780 break BigSwitch 781 case 'B': 782 p.op(OpNoWordBoundary) 783 t = t[2:] 784 break BigSwitch 785 case 'C': 786 // any byte; not supported 787 return nil, &Error{ErrInvalidEscape, t[:2]} 788 case 'Q': 789 // \Q ... \E: the ... is always literals 790 var lit string 791 if i := strings.Index(t, `\E`); i < 0 { 792 lit = t[2:] 793 t = "" 794 } else { 795 lit = t[2:i] 796 t = t[i+2:] 797 } 798 p.push(literalRegexp(lit, p.flags)) 799 break BigSwitch 800 case 'z': 801 p.op(OpEndText) 802 t = t[2:] 803 break BigSwitch 804 } 805 } 806 807 re := p.newRegexp(OpCharClass) 808 re.Flags = p.flags 809 810 // Look for Unicode character group like \p{Han} 811 if len(t) >= 2 && (t[1] == 'p' || t[1] == 'P') { 812 r, rest, err := p.parseUnicodeClass(t, re.Rune0[:0]) 813 if err != nil { 814 return nil, err 815 } 816 if r != nil { 817 re.Rune = r 818 t = rest 819 p.push(re) 820 break BigSwitch 821 } 822 } 823 824 // Perl character class escape. 825 if r, rest := p.parsePerlClassEscape(t, re.Rune0[:0]); r != nil { 826 re.Rune = r 827 t = rest 828 p.push(re) 829 break BigSwitch 830 } 831 p.reuse(re) 832 833 // Ordinary single-character escape. 834 if c, t, err = p.parseEscape(t); err != nil { 835 return nil, err 836 } 837 p.literal(c) 838 } 839 lastRepeat = repeat 840 } 841 842 p.concat() 843 if p.swapVerticalBar() { 844 // pop vertical bar 845 p.stack = p.stack[:len(p.stack)-1] 846 } 847 p.alternate() 848 849 n := len(p.stack) 850 if n != 1 { 851 return nil, &Error{ErrMissingParen, s} 852 } 853 return p.stack[0], nil 854 } 855 856 // parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}. 857 // If s is not of that form, it returns ok == false. 858 // If s has the right form but the values are too big, it returns min == -1, ok == true. 859 func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) { 860 if s == "" || s[0] != '{' { 861 return 862 } 863 s = s[1:] 864 var ok1 bool 865 if min, s, ok1 = p.parseInt(s); !ok1 { 866 return 867 } 868 if s == "" { 869 return 870 } 871 if s[0] != ',' { 872 max = min 873 } else { 874 s = s[1:] 875 if s == "" { 876 return 877 } 878 if s[0] == '}' { 879 max = -1 880 } else if max, s, ok1 = p.parseInt(s); !ok1 { 881 return 882 } else if max < 0 { 883 // parseInt found too big a number 884 min = -1 885 } 886 } 887 if s == "" || s[0] != '}' { 888 return 889 } 890 rest = s[1:] 891 ok = true 892 return 893 } 894 895 // parsePerlFlags parses a Perl flag setting or non-capturing group or both, 896 // like (?i) or (?: or (?i:. It removes the prefix from s and updates the parse state. 897 // The caller must have ensured that s begins with "(?". 898 func (p *parser) parsePerlFlags(s string) (rest string, err error) { 899 t := s 900 901 // Check for named captures, first introduced in Python's regexp library. 902 // As usual, there are three slightly different syntaxes: 903 // 904 // (?P<name>expr) the original, introduced by Python 905 // (?<name>expr) the .NET alteration, adopted by Perl 5.10 906 // (?'name'expr) another .NET alteration, adopted by Perl 5.10 907 // 908 // Perl 5.10 gave in and implemented the Python version too, 909 // but they claim that the last two are the preferred forms. 910 // PCRE and languages based on it (specifically, PHP and Ruby) 911 // support all three as well. EcmaScript 4 uses only the Python form. 912 // 913 // In both the open source world (via Code Search) and the 914 // Google source tree, (?P<expr>name) is the dominant form, 915 // so that's the one we implement. One is enough. 916 if len(t) > 4 && t[2] == 'P' && t[3] == '<' { 917 // Pull out name. 918 end := strings.IndexRune(t, '>') 919 if end < 0 { 920 if err = checkUTF8(t); err != nil { 921 return "", err 922 } 923 return "", &Error{ErrInvalidNamedCapture, s} 924 } 925 926 capture := t[:end+1] // "(?P<name>" 927 name := t[4:end] // "name" 928 if err = checkUTF8(name); err != nil { 929 return "", err 930 } 931 if !isValidCaptureName(name) { 932 return "", &Error{ErrInvalidNamedCapture, capture} 933 } 934 935 // Like ordinary capture, but named. 936 p.numCap++ 937 re := p.op(opLeftParen) 938 re.Cap = p.numCap 939 re.Name = name 940 return t[end+1:], nil 941 } 942 943 // Non-capturing group. Might also twiddle Perl flags. 944 var c rune 945 t = t[2:] // skip (? 946 flags := p.flags 947 sign := +1 948 sawFlag := false 949 Loop: 950 for t != "" { 951 if c, t, err = nextRune(t); err != nil { 952 return "", err 953 } 954 switch c { 955 default: 956 break Loop 957 958 // Flags. 959 case 'i': 960 flags |= FoldCase 961 sawFlag = true 962 case 'm': 963 flags &^= OneLine 964 sawFlag = true 965 case 's': 966 flags |= DotNL 967 sawFlag = true 968 case 'U': 969 flags |= NonGreedy 970 sawFlag = true 971 972 // Switch to negation. 973 case '-': 974 if sign < 0 { 975 break Loop 976 } 977 sign = -1 978 // Invert flags so that | above turn into &^ and vice versa. 979 // We'll invert flags again before using it below. 980 flags = ^flags 981 sawFlag = false 982 983 // End of flags, starting group or not. 984 case ':', ')': 985 if sign < 0 { 986 if !sawFlag { 987 break Loop 988 } 989 flags = ^flags 990 } 991 if c == ':' { 992 // Open new group 993 p.op(opLeftParen) 994 } 995 p.flags = flags 996 return t, nil 997 } 998 } 999 1000 return "", &Error{ErrInvalidPerlOp, s[:len(s)-len(t)]} 1001 } 1002 1003 // isValidCaptureName reports whether name 1004 // is a valid capture name: [A-Za-z0-9_]+. 1005 // PCRE limits names to 32 bytes. 1006 // Python rejects names starting with digits. 1007 // We don't enforce either of those. 1008 func isValidCaptureName(name string) bool { 1009 if name == "" { 1010 return false 1011 } 1012 for _, c := range name { 1013 if c != '_' && !isalnum(c) { 1014 return false 1015 } 1016 } 1017 return true 1018 } 1019 1020 // parseInt parses a decimal integer. 1021 func (p *parser) parseInt(s string) (n int, rest string, ok bool) { 1022 if s == "" || s[0] < '0' || '9' < s[0] { 1023 return 1024 } 1025 // Disallow leading zeros. 1026 if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' { 1027 return 1028 } 1029 t := s 1030 for s != "" && '0' <= s[0] && s[0] <= '9' { 1031 s = s[1:] 1032 } 1033 rest = s 1034 ok = true 1035 // Have digits, compute value. 1036 t = t[:len(t)-len(s)] 1037 for i := 0; i < len(t); i++ { 1038 // Avoid overflow. 1039 if n >= 1e8 { 1040 n = -1 1041 break 1042 } 1043 n = n*10 + int(t[i]) - '0' 1044 } 1045 return 1046 } 1047 1048 // can this be represented as a character class? 1049 // single-rune literal string, char class, ., and .|\n. 1050 func isCharClass(re *Regexp) bool { 1051 return re.Op == OpLiteral && len(re.Rune) == 1 || 1052 re.Op == OpCharClass || 1053 re.Op == OpAnyCharNotNL || 1054 re.Op == OpAnyChar 1055 } 1056 1057 // does re match r? 1058 func matchRune(re *Regexp, r rune) bool { 1059 switch re.Op { 1060 case OpLiteral: 1061 return len(re.Rune) == 1 && re.Rune[0] == r 1062 case OpCharClass: 1063 for i := 0; i < len(re.Rune); i += 2 { 1064 if re.Rune[i] <= r && r <= re.Rune[i+1] { 1065 return true 1066 } 1067 } 1068 return false 1069 case OpAnyCharNotNL: 1070 return r != '\n' 1071 case OpAnyChar: 1072 return true 1073 } 1074 return false 1075 } 1076 1077 // parseVerticalBar handles a | in the input. 1078 func (p *parser) parseVerticalBar() error { 1079 p.concat() 1080 1081 // The concatenation we just parsed is on top of the stack. 1082 // If it sits above an opVerticalBar, swap it below 1083 // (things below an opVerticalBar become an alternation). 1084 // Otherwise, push a new vertical bar. 1085 if !p.swapVerticalBar() { 1086 p.op(opVerticalBar) 1087 } 1088 1089 return nil 1090 } 1091 1092 // mergeCharClass makes dst = dst|src. 1093 // The caller must ensure that dst.Op >= src.Op, 1094 // to reduce the amount of copying. 1095 func mergeCharClass(dst, src *Regexp) { 1096 switch dst.Op { 1097 case OpAnyChar: 1098 // src doesn't add anything. 1099 case OpAnyCharNotNL: 1100 // src might add \n 1101 if matchRune(src, '\n') { 1102 dst.Op = OpAnyChar 1103 } 1104 case OpCharClass: 1105 // src is simpler, so either literal or char class 1106 if src.Op == OpLiteral { 1107 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) 1108 } else { 1109 dst.Rune = appendClass(dst.Rune, src.Rune) 1110 } 1111 case OpLiteral: 1112 // both literal 1113 if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags { 1114 break 1115 } 1116 dst.Op = OpCharClass 1117 dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags) 1118 dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) 1119 } 1120 } 1121 1122 // If the top of the stack is an element followed by an opVerticalBar 1123 // swapVerticalBar swaps the two and returns true. 1124 // Otherwise it returns false. 1125 func (p *parser) swapVerticalBar() bool { 1126 // If above and below vertical bar are literal or char class, 1127 // can merge into a single char class. 1128 n := len(p.stack) 1129 if n >= 3 && p.stack[n-2].Op == opVerticalBar && isCharClass(p.stack[n-1]) && isCharClass(p.stack[n-3]) { 1130 re1 := p.stack[n-1] 1131 re3 := p.stack[n-3] 1132 // Make re3 the more complex of the two. 1133 if re1.Op > re3.Op { 1134 re1, re3 = re3, re1 1135 p.stack[n-3] = re3 1136 } 1137 mergeCharClass(re3, re1) 1138 p.reuse(re1) 1139 p.stack = p.stack[:n-1] 1140 return true 1141 } 1142 1143 if n >= 2 { 1144 re1 := p.stack[n-1] 1145 re2 := p.stack[n-2] 1146 if re2.Op == opVerticalBar { 1147 if n >= 3 { 1148 // Now out of reach. 1149 // Clean opportunistically. 1150 cleanAlt(p.stack[n-3]) 1151 } 1152 p.stack[n-2] = re1 1153 p.stack[n-1] = re2 1154 return true 1155 } 1156 } 1157 return false 1158 } 1159 1160 // parseRightParen handles a ) in the input. 1161 func (p *parser) parseRightParen() error { 1162 p.concat() 1163 if p.swapVerticalBar() { 1164 // pop vertical bar 1165 p.stack = p.stack[:len(p.stack)-1] 1166 } 1167 p.alternate() 1168 1169 n := len(p.stack) 1170 if n < 2 { 1171 return &Error{ErrInternalError, ""} 1172 } 1173 re1 := p.stack[n-1] 1174 re2 := p.stack[n-2] 1175 p.stack = p.stack[:n-2] 1176 if re2.Op != opLeftParen { 1177 return &Error{ErrMissingParen, p.wholeRegexp} 1178 } 1179 // Restore flags at time of paren. 1180 p.flags = re2.Flags 1181 if re2.Cap == 0 { 1182 // Just for grouping. 1183 p.push(re1) 1184 } else { 1185 re2.Op = OpCapture 1186 re2.Sub = re2.Sub0[:1] 1187 re2.Sub[0] = re1 1188 p.push(re2) 1189 } 1190 return nil 1191 } 1192 1193 // parseEscape parses an escape sequence at the beginning of s 1194 // and returns the rune. 1195 func (p *parser) parseEscape(s string) (r rune, rest string, err error) { 1196 t := s[1:] 1197 if t == "" { 1198 return 0, "", &Error{ErrTrailingBackslash, ""} 1199 } 1200 c, t, err := nextRune(t) 1201 if err != nil { 1202 return 0, "", err 1203 } 1204 1205 Switch: 1206 switch c { 1207 default: 1208 if c < utf8.RuneSelf && !isalnum(c) { 1209 // Escaped non-word characters are always themselves. 1210 // PCRE is not quite so rigorous: it accepts things like 1211 // \q, but we don't. We once rejected \_, but too many 1212 // programs and people insist on using it, so allow \_. 1213 return c, t, nil 1214 } 1215 1216 // Octal escapes. 1217 case '1', '2', '3', '4', '5', '6', '7': 1218 // Single non-zero digit is a backreference; not supported 1219 if t == "" || t[0] < '0' || t[0] > '7' { 1220 break 1221 } 1222 fallthrough 1223 case '0': 1224 // Consume up to three octal digits; already have one. 1225 r = c - '0' 1226 for i := 1; i < 3; i++ { 1227 if t == "" || t[0] < '0' || t[0] > '7' { 1228 break 1229 } 1230 r = r*8 + rune(t[0]) - '0' 1231 t = t[1:] 1232 } 1233 return r, t, nil 1234 1235 // Hexadecimal escapes. 1236 case 'x': 1237 if t == "" { 1238 break 1239 } 1240 if c, t, err = nextRune(t); err != nil { 1241 return 0, "", err 1242 } 1243 if c == '{' { 1244 // Any number of digits in braces. 1245 // Perl accepts any text at all; it ignores all text 1246 // after the first non-hex digit. We require only hex digits, 1247 // and at least one. 1248 nhex := 0 1249 r = 0 1250 for { 1251 if t == "" { 1252 break Switch 1253 } 1254 if c, t, err = nextRune(t); err != nil { 1255 return 0, "", err 1256 } 1257 if c == '}' { 1258 break 1259 } 1260 v := unhex(c) 1261 if v < 0 { 1262 break Switch 1263 } 1264 r = r*16 + v 1265 if r > unicode.MaxRune { 1266 break Switch 1267 } 1268 nhex++ 1269 } 1270 if nhex == 0 { 1271 break Switch 1272 } 1273 return r, t, nil 1274 } 1275 1276 // Easy case: two hex digits. 1277 x := unhex(c) 1278 if c, t, err = nextRune(t); err != nil { 1279 return 0, "", err 1280 } 1281 y := unhex(c) 1282 if x < 0 || y < 0 { 1283 break 1284 } 1285 return x*16 + y, t, nil 1286 1287 // C escapes. There is no case 'b', to avoid misparsing 1288 // the Perl word-boundary \b as the C backspace \b 1289 // when in POSIX mode. In Perl, /\b/ means word-boundary 1290 // but /[\b]/ means backspace. We don't support that. 1291 // If you want a backspace, embed a literal backspace 1292 // character or use \x08. 1293 case 'a': 1294 return '\a', t, err 1295 case 'f': 1296 return '\f', t, err 1297 case 'n': 1298 return '\n', t, err 1299 case 'r': 1300 return '\r', t, err 1301 case 't': 1302 return '\t', t, err 1303 case 'v': 1304 return '\v', t, err 1305 } 1306 return 0, "", &Error{ErrInvalidEscape, s[:len(s)-len(t)]} 1307 } 1308 1309 // parseClassChar parses a character class character at the beginning of s 1310 // and returns it. 1311 func (p *parser) parseClassChar(s, wholeClass string) (r rune, rest string, err error) { 1312 if s == "" { 1313 return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} 1314 } 1315 1316 // Allow regular escape sequences even though 1317 // many need not be escaped in this context. 1318 if s[0] == '\\' { 1319 return p.parseEscape(s) 1320 } 1321 1322 return nextRune(s) 1323 } 1324 1325 type charGroup struct { 1326 sign int 1327 class []rune 1328 } 1329 1330 // parsePerlClassEscape parses a leading Perl character class escape like \d 1331 // from the beginning of s. If one is present, it appends the characters to r 1332 // and returns the new slice r and the remainder of the string. 1333 func (p *parser) parsePerlClassEscape(s string, r []rune) (out []rune, rest string) { 1334 if p.flags&PerlX == 0 || len(s) < 2 || s[0] != '\\' { 1335 return 1336 } 1337 g := perlGroup[s[0:2]] 1338 if g.sign == 0 { 1339 return 1340 } 1341 return p.appendGroup(r, g), s[2:] 1342 } 1343 1344 // parseNamedClass parses a leading POSIX named character class like [:alnum:] 1345 // from the beginning of s. If one is present, it appends the characters to r 1346 // and returns the new slice r and the remainder of the string. 1347 func (p *parser) parseNamedClass(s string, r []rune) (out []rune, rest string, err error) { 1348 if len(s) < 2 || s[0] != '[' || s[1] != ':' { 1349 return 1350 } 1351 1352 i := strings.Index(s[2:], ":]") 1353 if i < 0 { 1354 return 1355 } 1356 i += 2 1357 name, s := s[0:i+2], s[i+2:] 1358 g := posixGroup[name] 1359 if g.sign == 0 { 1360 return nil, "", &Error{ErrInvalidCharRange, name} 1361 } 1362 return p.appendGroup(r, g), s, nil 1363 } 1364 1365 func (p *parser) appendGroup(r []rune, g charGroup) []rune { 1366 if p.flags&FoldCase == 0 { 1367 if g.sign < 0 { 1368 r = appendNegatedClass(r, g.class) 1369 } else { 1370 r = appendClass(r, g.class) 1371 } 1372 } else { 1373 tmp := p.tmpClass[:0] 1374 tmp = appendFoldedClass(tmp, g.class) 1375 p.tmpClass = tmp 1376 tmp = cleanClass(&p.tmpClass) 1377 if g.sign < 0 { 1378 r = appendNegatedClass(r, tmp) 1379 } else { 1380 r = appendClass(r, tmp) 1381 } 1382 } 1383 return r 1384 } 1385 1386 var anyTable = &unicode.RangeTable{ 1387 R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}}, 1388 R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}}, 1389 } 1390 1391 // unicodeTable returns the unicode.RangeTable identified by name 1392 // and the table of additional fold-equivalent code points. 1393 func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) { 1394 // Special case: "Any" means any. 1395 if name == "Any" { 1396 return anyTable, anyTable 1397 } 1398 if t := unicode.Categories[name]; t != nil { 1399 return t, unicode.FoldCategory[name] 1400 } 1401 if t := unicode.Scripts[name]; t != nil { 1402 return t, unicode.FoldScript[name] 1403 } 1404 return nil, nil 1405 } 1406 1407 // parseUnicodeClass parses a leading Unicode character class like \p{Han} 1408 // from the beginning of s. If one is present, it appends the characters to r 1409 // and returns the new slice r and the remainder of the string. 1410 func (p *parser) parseUnicodeClass(s string, r []rune) (out []rune, rest string, err error) { 1411 if p.flags&UnicodeGroups == 0 || len(s) < 2 || s[0] != '\\' || s[1] != 'p' && s[1] != 'P' { 1412 return 1413 } 1414 1415 // Committed to parse or return error. 1416 sign := +1 1417 if s[1] == 'P' { 1418 sign = -1 1419 } 1420 t := s[2:] 1421 c, t, err := nextRune(t) 1422 if err != nil { 1423 return 1424 } 1425 var seq, name string 1426 if c != '{' { 1427 // Single-letter name. 1428 seq = s[:len(s)-len(t)] 1429 name = seq[2:] 1430 } else { 1431 // Name is in braces. 1432 end := strings.IndexRune(s, '}') 1433 if end < 0 { 1434 if err = checkUTF8(s); err != nil { 1435 return 1436 } 1437 return nil, "", &Error{ErrInvalidCharRange, s} 1438 } 1439 seq, t = s[:end+1], s[end+1:] 1440 name = s[3:end] 1441 if err = checkUTF8(name); err != nil { 1442 return 1443 } 1444 } 1445 1446 // Group can have leading negation too. \p{^Han} == \P{Han}, \P{^Han} == \p{Han}. 1447 if name != "" && name[0] == '^' { 1448 sign = -sign 1449 name = name[1:] 1450 } 1451 1452 tab, fold := unicodeTable(name) 1453 if tab == nil { 1454 return nil, "", &Error{ErrInvalidCharRange, seq} 1455 } 1456 1457 if p.flags&FoldCase == 0 || fold == nil { 1458 if sign > 0 { 1459 r = appendTable(r, tab) 1460 } else { 1461 r = appendNegatedTable(r, tab) 1462 } 1463 } else { 1464 // Merge and clean tab and fold in a temporary buffer. 1465 // This is necessary for the negative case and just tidy 1466 // for the positive case. 1467 tmp := p.tmpClass[:0] 1468 tmp = appendTable(tmp, tab) 1469 tmp = appendTable(tmp, fold) 1470 p.tmpClass = tmp 1471 tmp = cleanClass(&p.tmpClass) 1472 if sign > 0 { 1473 r = appendClass(r, tmp) 1474 } else { 1475 r = appendNegatedClass(r, tmp) 1476 } 1477 } 1478 return r, t, nil 1479 } 1480 1481 // parseClass parses a character class at the beginning of s 1482 // and pushes it onto the parse stack. 1483 func (p *parser) parseClass(s string) (rest string, err error) { 1484 t := s[1:] // chop [ 1485 re := p.newRegexp(OpCharClass) 1486 re.Flags = p.flags 1487 re.Rune = re.Rune0[:0] 1488 1489 sign := +1 1490 if t != "" && t[0] == '^' { 1491 sign = -1 1492 t = t[1:] 1493 1494 // If character class does not match \n, add it here, 1495 // so that negation later will do the right thing. 1496 if p.flags&ClassNL == 0 { 1497 re.Rune = append(re.Rune, '\n', '\n') 1498 } 1499 } 1500 1501 class := re.Rune 1502 first := true // ] and - are okay as first char in class 1503 for t == "" || t[0] != ']' || first { 1504 // POSIX: - is only okay unescaped as first or last in class. 1505 // Perl: - is okay anywhere. 1506 if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { 1507 _, size := utf8.DecodeRuneInString(t[1:]) 1508 return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} 1509 } 1510 first = false 1511 1512 // Look for POSIX [:alnum:] etc. 1513 if len(t) > 2 && t[0] == '[' && t[1] == ':' { 1514 nclass, nt, err := p.parseNamedClass(t, class) 1515 if err != nil { 1516 return "", err 1517 } 1518 if nclass != nil { 1519 class, t = nclass, nt 1520 continue 1521 } 1522 } 1523 1524 // Look for Unicode character group like \p{Han}. 1525 nclass, nt, err := p.parseUnicodeClass(t, class) 1526 if err != nil { 1527 return "", err 1528 } 1529 if nclass != nil { 1530 class, t = nclass, nt 1531 continue 1532 } 1533 1534 // Look for Perl character class symbols (extension). 1535 if nclass, nt := p.parsePerlClassEscape(t, class); nclass != nil { 1536 class, t = nclass, nt 1537 continue 1538 } 1539 1540 // Single character or simple range. 1541 rng := t 1542 var lo, hi rune 1543 if lo, t, err = p.parseClassChar(t, s); err != nil { 1544 return "", err 1545 } 1546 hi = lo 1547 // [a-] means (a|-) so check for final ]. 1548 if len(t) >= 2 && t[0] == '-' && t[1] != ']' { 1549 t = t[1:] 1550 if hi, t, err = p.parseClassChar(t, s); err != nil { 1551 return "", err 1552 } 1553 if hi < lo { 1554 rng = rng[:len(rng)-len(t)] 1555 return "", &Error{Code: ErrInvalidCharRange, Expr: rng} 1556 } 1557 } 1558 if p.flags&FoldCase == 0 { 1559 class = appendRange(class, lo, hi) 1560 } else { 1561 class = appendFoldedRange(class, lo, hi) 1562 } 1563 } 1564 t = t[1:] // chop ] 1565 1566 // Use &re.Rune instead of &class to avoid allocation. 1567 re.Rune = class 1568 class = cleanClass(&re.Rune) 1569 if sign < 0 { 1570 class = negateClass(class) 1571 } 1572 re.Rune = class 1573 p.push(re) 1574 return t, nil 1575 } 1576 1577 // cleanClass sorts the ranges (pairs of elements of r), 1578 // merges them, and eliminates duplicates. 1579 func cleanClass(rp *[]rune) []rune { 1580 1581 // Sort by lo increasing, hi decreasing to break ties. 1582 sort.Sort(ranges{rp}) 1583 1584 r := *rp 1585 if len(r) < 2 { 1586 return r 1587 } 1588 1589 // Merge abutting, overlapping. 1590 w := 2 // write index 1591 for i := 2; i < len(r); i += 2 { 1592 lo, hi := r[i], r[i+1] 1593 if lo <= r[w-1]+1 { 1594 // merge with previous range 1595 if hi > r[w-1] { 1596 r[w-1] = hi 1597 } 1598 continue 1599 } 1600 // new disjoint range 1601 r[w] = lo 1602 r[w+1] = hi 1603 w += 2 1604 } 1605 1606 return r[:w] 1607 } 1608 1609 // appendLiteral returns the result of appending the literal x to the class r. 1610 func appendLiteral(r []rune, x rune, flags Flags) []rune { 1611 if flags&FoldCase != 0 { 1612 return appendFoldedRange(r, x, x) 1613 } 1614 return appendRange(r, x, x) 1615 } 1616 1617 // appendRange returns the result of appending the range lo-hi to the class r. 1618 func appendRange(r []rune, lo, hi rune) []rune { 1619 // Expand last range or next to last range if it overlaps or abuts. 1620 // Checking two ranges helps when appending case-folded 1621 // alphabets, so that one range can be expanding A-Z and the 1622 // other expanding a-z. 1623 n := len(r) 1624 for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4 1625 if n >= i { 1626 rlo, rhi := r[n-i], r[n-i+1] 1627 if lo <= rhi+1 && rlo <= hi+1 { 1628 if lo < rlo { 1629 r[n-i] = lo 1630 } 1631 if hi > rhi { 1632 r[n-i+1] = hi 1633 } 1634 return r 1635 } 1636 } 1637 } 1638 1639 return append(r, lo, hi) 1640 } 1641 1642 const ( 1643 // minimum and maximum runes involved in folding. 1644 // checked during test. 1645 minFold = 0x0041 1646 maxFold = 0x1044f 1647 ) 1648 1649 // appendFoldedRange returns the result of appending the range lo-hi 1650 // and its case folding-equivalent runes to the class r. 1651 func appendFoldedRange(r []rune, lo, hi rune) []rune { 1652 // Optimizations. 1653 if lo <= minFold && hi >= maxFold { 1654 // Range is full: folding can't add more. 1655 return appendRange(r, lo, hi) 1656 } 1657 if hi < minFold || lo > maxFold { 1658 // Range is outside folding possibilities. 1659 return appendRange(r, lo, hi) 1660 } 1661 if lo < minFold { 1662 // [lo, minFold-1] needs no folding. 1663 r = appendRange(r, lo, minFold-1) 1664 lo = minFold 1665 } 1666 if hi > maxFold { 1667 // [maxFold+1, hi] needs no folding. 1668 r = appendRange(r, maxFold+1, hi) 1669 hi = maxFold 1670 } 1671 1672 // Brute force. Depend on appendRange to coalesce ranges on the fly. 1673 for c := lo; c <= hi; c++ { 1674 r = appendRange(r, c, c) 1675 f := unicode.SimpleFold(c) 1676 for f != c { 1677 r = appendRange(r, f, f) 1678 f = unicode.SimpleFold(f) 1679 } 1680 } 1681 return r 1682 } 1683 1684 // appendClass returns the result of appending the class x to the class r. 1685 // It assume x is clean. 1686 func appendClass(r []rune, x []rune) []rune { 1687 for i := 0; i < len(x); i += 2 { 1688 r = appendRange(r, x[i], x[i+1]) 1689 } 1690 return r 1691 } 1692 1693 // appendFolded returns the result of appending the case folding of the class x to the class r. 1694 func appendFoldedClass(r []rune, x []rune) []rune { 1695 for i := 0; i < len(x); i += 2 { 1696 r = appendFoldedRange(r, x[i], x[i+1]) 1697 } 1698 return r 1699 } 1700 1701 // appendNegatedClass returns the result of appending the negation of the class x to the class r. 1702 // It assumes x is clean. 1703 func appendNegatedClass(r []rune, x []rune) []rune { 1704 nextLo := '\u0000' 1705 for i := 0; i < len(x); i += 2 { 1706 lo, hi := x[i], x[i+1] 1707 if nextLo <= lo-1 { 1708 r = appendRange(r, nextLo, lo-1) 1709 } 1710 nextLo = hi + 1 1711 } 1712 if nextLo <= unicode.MaxRune { 1713 r = appendRange(r, nextLo, unicode.MaxRune) 1714 } 1715 return r 1716 } 1717 1718 // appendTable returns the result of appending x to the class r. 1719 func appendTable(r []rune, x *unicode.RangeTable) []rune { 1720 for _, xr := range x.R16 { 1721 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1722 if stride == 1 { 1723 r = appendRange(r, lo, hi) 1724 continue 1725 } 1726 for c := lo; c <= hi; c += stride { 1727 r = appendRange(r, c, c) 1728 } 1729 } 1730 for _, xr := range x.R32 { 1731 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1732 if stride == 1 { 1733 r = appendRange(r, lo, hi) 1734 continue 1735 } 1736 for c := lo; c <= hi; c += stride { 1737 r = appendRange(r, c, c) 1738 } 1739 } 1740 return r 1741 } 1742 1743 // appendNegatedTable returns the result of appending the negation of x to the class r. 1744 func appendNegatedTable(r []rune, x *unicode.RangeTable) []rune { 1745 nextLo := '\u0000' // lo end of next class to add 1746 for _, xr := range x.R16 { 1747 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1748 if stride == 1 { 1749 if nextLo <= lo-1 { 1750 r = appendRange(r, nextLo, lo-1) 1751 } 1752 nextLo = hi + 1 1753 continue 1754 } 1755 for c := lo; c <= hi; c += stride { 1756 if nextLo <= c-1 { 1757 r = appendRange(r, nextLo, c-1) 1758 } 1759 nextLo = c + 1 1760 } 1761 } 1762 for _, xr := range x.R32 { 1763 lo, hi, stride := rune(xr.Lo), rune(xr.Hi), rune(xr.Stride) 1764 if stride == 1 { 1765 if nextLo <= lo-1 { 1766 r = appendRange(r, nextLo, lo-1) 1767 } 1768 nextLo = hi + 1 1769 continue 1770 } 1771 for c := lo; c <= hi; c += stride { 1772 if nextLo <= c-1 { 1773 r = appendRange(r, nextLo, c-1) 1774 } 1775 nextLo = c + 1 1776 } 1777 } 1778 if nextLo <= unicode.MaxRune { 1779 r = appendRange(r, nextLo, unicode.MaxRune) 1780 } 1781 return r 1782 } 1783 1784 // negateClass overwrites r and returns r's negation. 1785 // It assumes the class r is already clean. 1786 func negateClass(r []rune) []rune { 1787 nextLo := '\u0000' // lo end of next class to add 1788 w := 0 // write index 1789 for i := 0; i < len(r); i += 2 { 1790 lo, hi := r[i], r[i+1] 1791 if nextLo <= lo-1 { 1792 r[w] = nextLo 1793 r[w+1] = lo - 1 1794 w += 2 1795 } 1796 nextLo = hi + 1 1797 } 1798 r = r[:w] 1799 if nextLo <= unicode.MaxRune { 1800 // It's possible for the negation to have one more 1801 // range - this one - than the original class, so use append. 1802 r = append(r, nextLo, unicode.MaxRune) 1803 } 1804 return r 1805 } 1806 1807 // ranges implements sort.Interface on a []rune. 1808 // The choice of receiver type definition is strange 1809 // but avoids an allocation since we already have 1810 // a *[]rune. 1811 type ranges struct { 1812 p *[]rune 1813 } 1814 1815 func (ra ranges) Less(i, j int) bool { 1816 p := *ra.p 1817 i *= 2 1818 j *= 2 1819 return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] 1820 } 1821 1822 func (ra ranges) Len() int { 1823 return len(*ra.p) / 2 1824 } 1825 1826 func (ra ranges) Swap(i, j int) { 1827 p := *ra.p 1828 i *= 2 1829 j *= 2 1830 p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] 1831 } 1832 1833 func checkUTF8(s string) error { 1834 for s != "" { 1835 rune, size := utf8.DecodeRuneInString(s) 1836 if rune == utf8.RuneError && size == 1 { 1837 return &Error{Code: ErrInvalidUTF8, Expr: s} 1838 } 1839 s = s[size:] 1840 } 1841 return nil 1842 } 1843 1844 func nextRune(s string) (c rune, t string, err error) { 1845 c, size := utf8.DecodeRuneInString(s) 1846 if c == utf8.RuneError && size == 1 { 1847 return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} 1848 } 1849 return c, s[size:], nil 1850 } 1851 1852 func isalnum(c rune) bool { 1853 return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' 1854 } 1855 1856 func unhex(c rune) rune { 1857 if '0' <= c && c <= '9' { 1858 return c - '0' 1859 } 1860 if 'a' <= c && c <= 'f' { 1861 return c - 'a' + 10 1862 } 1863 if 'A' <= c && c <= 'F' { 1864 return c - 'A' + 10 1865 } 1866 return -1 1867 }