Source file src/pkg/regexp/regexp.go
1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package regexp implements regular expression search. 6 // 7 // The syntax of the regular expressions accepted is the same 8 // general syntax used by Perl, Python, and other languages. 9 // More precisely, it is the syntax accepted by RE2 and described at 10 // http://code.google.com/p/re2/wiki/Syntax, except for \C. 11 // 12 // All characters are UTF-8-encoded code points. 13 // 14 // There are 16 methods of Regexp that match a regular expression and identify 15 // the matched text. Their names are matched by this regular expression: 16 // 17 // Find(All)?(String)?(Submatch)?(Index)? 18 // 19 // If 'All' is present, the routine matches successive non-overlapping 20 // matches of the entire expression. Empty matches abutting a preceding 21 // match are ignored. The return value is a slice containing the successive 22 // return values of the corresponding non-'All' routine. These routines take 23 // an extra integer argument, n; if n >= 0, the function returns at most n 24 // matches/submatches. 25 // 26 // If 'String' is present, the argument is a string; otherwise it is a slice 27 // of bytes; return values are adjusted as appropriate. 28 // 29 // If 'Submatch' is present, the return value is a slice identifying the 30 // successive submatches of the expression. Submatches are matches of 31 // parenthesized subexpressions within the regular expression, numbered from 32 // left to right in order of opening parenthesis. Submatch 0 is the match of 33 // the entire expression, submatch 1 the match of the first parenthesized 34 // subexpression, and so on. 35 // 36 // If 'Index' is present, matches and submatches are identified by byte index 37 // pairs within the input string: result[2*n:2*n+1] identifies the indexes of 38 // the nth submatch. The pair for n==0 identifies the match of the entire 39 // expression. If 'Index' is not present, the match is identified by the 40 // text of the match/submatch. If an index is negative, it means that 41 // subexpression did not match any string in the input. 42 // 43 // There is also a subset of the methods that can be applied to text read 44 // from a RuneReader: 45 // 46 // MatchReader, FindReaderIndex, FindReaderSubmatchIndex 47 // 48 // This set may grow. Note that regular expression matches may need to 49 // examine text beyond the text returned by a match, so the methods that 50 // match text from a RuneReader may read arbitrarily far into the input 51 // before returning. 52 // 53 // (There are a few other methods that do not match this pattern.) 54 // 55 package regexp 56 57 import ( 58 "bytes" 59 "io" 60 "regexp/syntax" 61 "strconv" 62 "strings" 63 "sync" 64 "unicode" 65 "unicode/utf8" 66 ) 67 68 var debug = false 69 70 // Regexp is the representation of a compiled regular expression. 71 // The public interface is entirely through methods. 72 // A Regexp is safe for concurrent use by multiple goroutines. 73 type Regexp struct { 74 // read-only after Compile 75 expr string // as passed to Compile 76 prog *syntax.Prog // compiled program 77 prefix string // required prefix in unanchored matches 78 prefixBytes []byte // prefix, as a []byte 79 prefixComplete bool // prefix is the entire regexp 80 prefixRune rune // first rune in prefix 81 cond syntax.EmptyOp // empty-width conditions required at start of match 82 numSubexp int 83 subexpNames []string 84 longest bool 85 86 // cache of machines for running regexp 87 mu sync.Mutex 88 machine []*machine 89 } 90 91 // String returns the source text used to compile the regular expression. 92 func (re *Regexp) String() string { 93 return re.expr 94 } 95 96 // Compile parses a regular expression and returns, if successful, 97 // a Regexp object that can be used to match against text. 98 // 99 // When matching against text, the regexp returns a match that 100 // begins as early as possible in the input (leftmost), and among those 101 // it chooses the one that a backtracking search would have found first. 102 // This so-called leftmost-first matching is the same semantics 103 // that Perl, Python, and other implementations use, although this 104 // package implements it without the expense of backtracking. 105 // For POSIX leftmost-longest matching, see CompilePOSIX. 106 func Compile(expr string) (*Regexp, error) { 107 return compile(expr, syntax.Perl, false) 108 } 109 110 // CompilePOSIX is like Compile but restricts the regular expression 111 // to POSIX ERE (egrep) syntax and changes the match semantics to 112 // leftmost-longest. 113 // 114 // That is, when matching against text, the regexp returns a match that 115 // begins as early as possible in the input (leftmost), and among those 116 // it chooses a match that is as long as possible. 117 // This so-called leftmost-longest matching is the same semantics 118 // that early regular expression implementations used and that POSIX 119 // specifies. 120 // 121 // However, there can be multiple leftmost-longest matches, with different 122 // submatch choices, and here this package diverges from POSIX. 123 // Among the possible leftmost-longest matches, this package chooses 124 // the one that a backtracking search would have found first, while POSIX 125 // specifies that the match be chosen to maximize the length of the first 126 // subexpression, then the second, and so on from left to right. 127 // The POSIX rule is computationally prohibitive and not even well-defined. 128 // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details. 129 func CompilePOSIX(expr string) (*Regexp, error) { 130 return compile(expr, syntax.POSIX, true) 131 } 132 133 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 134 re, err := syntax.Parse(expr, mode) 135 if err != nil { 136 return nil, err 137 } 138 maxCap := re.MaxCap() 139 capNames := re.CapNames() 140 141 re = re.Simplify() 142 prog, err := syntax.Compile(re) 143 if err != nil { 144 return nil, err 145 } 146 regexp := &Regexp{ 147 expr: expr, 148 prog: prog, 149 numSubexp: maxCap, 150 subexpNames: capNames, 151 cond: prog.StartCond(), 152 longest: longest, 153 } 154 regexp.prefix, regexp.prefixComplete = prog.Prefix() 155 if regexp.prefix != "" { 156 // TODO(rsc): Remove this allocation by adding 157 // IndexString to package bytes. 158 regexp.prefixBytes = []byte(regexp.prefix) 159 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 160 } 161 return regexp, nil 162 } 163 164 // get returns a machine to use for matching re. 165 // It uses the re's machine cache if possible, to avoid 166 // unnecessary allocation. 167 func (re *Regexp) get() *machine { 168 re.mu.Lock() 169 if n := len(re.machine); n > 0 { 170 z := re.machine[n-1] 171 re.machine = re.machine[:n-1] 172 re.mu.Unlock() 173 return z 174 } 175 re.mu.Unlock() 176 z := progMachine(re.prog) 177 z.re = re 178 return z 179 } 180 181 // put returns a machine to the re's machine cache. 182 // There is no attempt to limit the size of the cache, so it will 183 // grow to the maximum number of simultaneous matches 184 // run using re. (The cache empties when re gets garbage collected.) 185 func (re *Regexp) put(z *machine) { 186 re.mu.Lock() 187 re.machine = append(re.machine, z) 188 re.mu.Unlock() 189 } 190 191 // MustCompile is like Compile but panics if the expression cannot be parsed. 192 // It simplifies safe initialization of global variables holding compiled regular 193 // expressions. 194 func MustCompile(str string) *Regexp { 195 regexp, error := Compile(str) 196 if error != nil { 197 panic(`regexp: Compile(` + quote(str) + `): ` + error.Error()) 198 } 199 return regexp 200 } 201 202 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. 203 // It simplifies safe initialization of global variables holding compiled regular 204 // expressions. 205 func MustCompilePOSIX(str string) *Regexp { 206 regexp, error := CompilePOSIX(str) 207 if error != nil { 208 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error()) 209 } 210 return regexp 211 } 212 213 func quote(s string) string { 214 if strconv.CanBackquote(s) { 215 return "`" + s + "`" 216 } 217 return strconv.Quote(s) 218 } 219 220 // NumSubexp returns the number of parenthesized subexpressions in this Regexp. 221 func (re *Regexp) NumSubexp() int { 222 return re.numSubexp 223 } 224 225 // SubexpNames returns the names of the parenthesized subexpressions 226 // in this Regexp. The name for the first sub-expression is names[1], 227 // so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 228 // Since the Regexp as a whole cannot be named, names[0] is always 229 // the empty string. The slice should not be modified. 230 func (re *Regexp) SubexpNames() []string { 231 return re.subexpNames 232 } 233 234 const endOfText rune = -1 235 236 // input abstracts different representations of the input text. It provides 237 // one-character lookahead. 238 type input interface { 239 step(pos int) (r rune, width int) // advance one rune 240 canCheckPrefix() bool // can we look ahead without losing info? 241 hasPrefix(re *Regexp) bool 242 index(re *Regexp, pos int) int 243 context(pos int) syntax.EmptyOp 244 } 245 246 // inputString scans a string. 247 type inputString struct { 248 str string 249 } 250 251 func (i *inputString) step(pos int) (rune, int) { 252 if pos < len(i.str) { 253 c := i.str[pos] 254 if c < utf8.RuneSelf { 255 return rune(c), 1 256 } 257 return utf8.DecodeRuneInString(i.str[pos:]) 258 } 259 return endOfText, 0 260 } 261 262 func (i *inputString) canCheckPrefix() bool { 263 return true 264 } 265 266 func (i *inputString) hasPrefix(re *Regexp) bool { 267 return strings.HasPrefix(i.str, re.prefix) 268 } 269 270 func (i *inputString) index(re *Regexp, pos int) int { 271 return strings.Index(i.str[pos:], re.prefix) 272 } 273 274 func (i *inputString) context(pos int) syntax.EmptyOp { 275 r1, r2 := endOfText, endOfText 276 if pos > 0 && pos <= len(i.str) { 277 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 278 } 279 if pos < len(i.str) { 280 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 281 } 282 return syntax.EmptyOpContext(r1, r2) 283 } 284 285 // inputBytes scans a byte slice. 286 type inputBytes struct { 287 str []byte 288 } 289 290 func (i *inputBytes) step(pos int) (rune, int) { 291 if pos < len(i.str) { 292 c := i.str[pos] 293 if c < utf8.RuneSelf { 294 return rune(c), 1 295 } 296 return utf8.DecodeRune(i.str[pos:]) 297 } 298 return endOfText, 0 299 } 300 301 func (i *inputBytes) canCheckPrefix() bool { 302 return true 303 } 304 305 func (i *inputBytes) hasPrefix(re *Regexp) bool { 306 return bytes.HasPrefix(i.str, re.prefixBytes) 307 } 308 309 func (i *inputBytes) index(re *Regexp, pos int) int { 310 return bytes.Index(i.str[pos:], re.prefixBytes) 311 } 312 313 func (i *inputBytes) context(pos int) syntax.EmptyOp { 314 r1, r2 := endOfText, endOfText 315 if pos > 0 && pos <= len(i.str) { 316 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 317 } 318 if pos < len(i.str) { 319 r2, _ = utf8.DecodeRune(i.str[pos:]) 320 } 321 return syntax.EmptyOpContext(r1, r2) 322 } 323 324 // inputReader scans a RuneReader. 325 type inputReader struct { 326 r io.RuneReader 327 atEOT bool 328 pos int 329 } 330 331 func (i *inputReader) step(pos int) (rune, int) { 332 if !i.atEOT && pos != i.pos { 333 return endOfText, 0 334 335 } 336 r, w, err := i.r.ReadRune() 337 if err != nil { 338 i.atEOT = true 339 return endOfText, 0 340 } 341 i.pos += w 342 return r, w 343 } 344 345 func (i *inputReader) canCheckPrefix() bool { 346 return false 347 } 348 349 func (i *inputReader) hasPrefix(re *Regexp) bool { 350 return false 351 } 352 353 func (i *inputReader) index(re *Regexp, pos int) int { 354 return -1 355 } 356 357 func (i *inputReader) context(pos int) syntax.EmptyOp { 358 return 0 359 } 360 361 // LiteralPrefix returns a literal string that must begin any match 362 // of the regular expression re. It returns the boolean true if the 363 // literal string comprises the entire regular expression. 364 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 365 return re.prefix, re.prefixComplete 366 } 367 368 // MatchReader returns whether the Regexp matches the text read by the 369 // RuneReader. The return value is a boolean: true for match, false for no 370 // match. 371 func (re *Regexp) MatchReader(r io.RuneReader) bool { 372 return re.doExecute(r, nil, "", 0, 0) != nil 373 } 374 375 // MatchString returns whether the Regexp matches the string s. 376 // The return value is a boolean: true for match, false for no match. 377 func (re *Regexp) MatchString(s string) bool { 378 return re.doExecute(nil, nil, s, 0, 0) != nil 379 } 380 381 // Match returns whether the Regexp matches the byte slice b. 382 // The return value is a boolean: true for match, false for no match. 383 func (re *Regexp) Match(b []byte) bool { 384 return re.doExecute(nil, b, "", 0, 0) != nil 385 } 386 387 // MatchReader checks whether a textual regular expression matches the text 388 // read by the RuneReader. More complicated queries need to use Compile and 389 // the full Regexp interface. 390 func MatchReader(pattern string, r io.RuneReader) (matched bool, error error) { 391 re, err := Compile(pattern) 392 if err != nil { 393 return false, err 394 } 395 return re.MatchReader(r), nil 396 } 397 398 // MatchString checks whether a textual regular expression 399 // matches a string. More complicated queries need 400 // to use Compile and the full Regexp interface. 401 func MatchString(pattern string, s string) (matched bool, error error) { 402 re, err := Compile(pattern) 403 if err != nil { 404 return false, err 405 } 406 return re.MatchString(s), nil 407 } 408 409 // Match checks whether a textual regular expression 410 // matches a byte slice. More complicated queries need 411 // to use Compile and the full Regexp interface. 412 func Match(pattern string, b []byte) (matched bool, error error) { 413 re, err := Compile(pattern) 414 if err != nil { 415 return false, err 416 } 417 return re.Match(b), nil 418 } 419 420 // ReplaceAllString returns a copy of src, replacing matches of the Regexp 421 // with the replacement string repl. Inside repl, $ signs are interpreted as 422 // in Expand, so for instance $1 represents the text of the first submatch. 423 func (re *Regexp) ReplaceAllString(src, repl string) string { 424 n := 2 425 if strings.Index(repl, "$") >= 0 { 426 n = 2 * (re.numSubexp + 1) 427 } 428 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { 429 return re.expand(dst, repl, nil, src, match) 430 }) 431 return string(b) 432 } 433 434 // ReplaceAllStringLiteral returns a copy of src, replacing matches of the Regexp 435 // with the replacement string repl. The replacement repl is substituted directly, 436 // without using Expand. 437 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { 438 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 439 return append(dst, repl...) 440 })) 441 } 442 443 // ReplaceAllStringFunc returns a copy of src in which all matches of the 444 // Regexp have been replaced by the return value of of function repl applied 445 // to the matched substring. The replacement returned by repl is substituted 446 // directly, without using Expand. 447 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 448 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 449 return append(dst, repl(src[match[0]:match[1]])...) 450 }) 451 return string(b) 452 } 453 454 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { 455 lastMatchEnd := 0 // end position of the most recent match 456 searchPos := 0 // position where we next look for a match 457 var buf []byte 458 var endPos int 459 if bsrc != nil { 460 endPos = len(bsrc) 461 } else { 462 endPos = len(src) 463 } 464 for searchPos <= endPos { 465 a := re.doExecute(nil, bsrc, src, searchPos, nmatch) 466 if len(a) == 0 { 467 break // no more matches 468 } 469 470 // Copy the unmatched characters before this match. 471 if bsrc != nil { 472 buf = append(buf, bsrc[lastMatchEnd:a[0]]...) 473 } else { 474 buf = append(buf, src[lastMatchEnd:a[0]]...) 475 } 476 477 // Now insert a copy of the replacement string, but not for a 478 // match of the empty string immediately after another match. 479 // (Otherwise, we get double replacement for patterns that 480 // match both empty and nonempty strings.) 481 if a[1] > lastMatchEnd || a[0] == 0 { 482 buf = repl(buf, a) 483 } 484 lastMatchEnd = a[1] 485 486 // Advance past this match; always advance at least one character. 487 var width int 488 if bsrc != nil { 489 _, width = utf8.DecodeRune(bsrc[searchPos:]) 490 } else { 491 _, width = utf8.DecodeRuneInString(src[searchPos:]) 492 } 493 if searchPos+width > a[1] { 494 searchPos += width 495 } else if searchPos+1 > a[1] { 496 // This clause is only needed at the end of the input 497 // string. In that case, DecodeRuneInString returns width=0. 498 searchPos++ 499 } else { 500 searchPos = a[1] 501 } 502 } 503 504 // Copy the unmatched characters after the last match. 505 if bsrc != nil { 506 buf = append(buf, bsrc[lastMatchEnd:]...) 507 } else { 508 buf = append(buf, src[lastMatchEnd:]...) 509 } 510 511 return buf 512 } 513 514 // ReplaceAll returns a copy of src, replacing matches of the Regexp 515 // with the replacement string repl. Inside repl, $ signs are interpreted as 516 // in Expand, so for instance $1 represents the text of the first submatch. 517 func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 518 n := 2 519 if bytes.IndexByte(repl, '$') >= 0 { 520 n = 2 * (re.numSubexp + 1) 521 } 522 srepl := "" 523 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { 524 if len(srepl) != len(repl) { 525 srepl = string(repl) 526 } 527 return re.expand(dst, srepl, src, "", match) 528 }) 529 return b 530 } 531 532 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp 533 // with the replacement bytes repl. The replacement repl is substituted directly, 534 // without using Expand. 535 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 536 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 537 return append(dst, repl...) 538 }) 539 } 540 541 // ReplaceAllFunc returns a copy of src in which all matches of the 542 // Regexp have been replaced by the return value of of function repl applied 543 // to the matched byte slice. The replacement returned by repl is substituted 544 // directly, without using Expand. 545 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 546 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 547 return append(dst, repl(src[match[0]:match[1]])...) 548 }) 549 } 550 551 var specialBytes = []byte(`\.+*?()|[]{}^$`) 552 553 func special(b byte) bool { 554 return bytes.IndexByte(specialBytes, b) >= 0 555 } 556 557 // QuoteMeta returns a string that quotes all regular expression metacharacters 558 // inside the argument text; the returned string is a regular expression matching 559 // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`. 560 func QuoteMeta(s string) string { 561 b := make([]byte, 2*len(s)) 562 563 // A byte loop is correct because all metacharacters are ASCII. 564 j := 0 565 for i := 0; i < len(s); i++ { 566 if special(s[i]) { 567 b[j] = '\\' 568 j++ 569 } 570 b[j] = s[i] 571 j++ 572 } 573 return string(b[0:j]) 574 } 575 576 // The number of capture values in the program may correspond 577 // to fewer capturing expressions than are in the regexp. 578 // For example, "(a){0}" turns into an empty program, so the 579 // maximum capture in the program is 0 but we need to return 580 // an expression for \1. Pad appends -1s to the slice a as needed. 581 func (re *Regexp) pad(a []int) []int { 582 if a == nil { 583 // No match. 584 return nil 585 } 586 n := (1 + re.numSubexp) * 2 587 for len(a) < n { 588 a = append(a, -1) 589 } 590 return a 591 } 592 593 // Find matches in slice b if b is non-nil, otherwise find matches in string s. 594 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 595 var end int 596 if b == nil { 597 end = len(s) 598 } else { 599 end = len(b) 600 } 601 602 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 603 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap) 604 if len(matches) == 0 { 605 break 606 } 607 608 accept := true 609 if matches[1] == pos { 610 // We've found an empty match. 611 if matches[0] == prevMatchEnd { 612 // We don't allow an empty match right 613 // after a previous match, so ignore it. 614 accept = false 615 } 616 var width int 617 // TODO: use step() 618 if b == nil { 619 _, width = utf8.DecodeRuneInString(s[pos:end]) 620 } else { 621 _, width = utf8.DecodeRune(b[pos:end]) 622 } 623 if width > 0 { 624 pos += width 625 } else { 626 pos = end + 1 627 } 628 } else { 629 pos = matches[1] 630 } 631 prevMatchEnd = matches[1] 632 633 if accept { 634 deliver(re.pad(matches)) 635 i++ 636 } 637 } 638 } 639 640 // Find returns a slice holding the text of the leftmost match in b of the regular expression. 641 // A return value of nil indicates no match. 642 func (re *Regexp) Find(b []byte) []byte { 643 a := re.doExecute(nil, b, "", 0, 2) 644 if a == nil { 645 return nil 646 } 647 return b[a[0]:a[1]] 648 } 649 650 // FindIndex returns a two-element slice of integers defining the location of 651 // the leftmost match in b of the regular expression. The match itself is at 652 // b[loc[0]:loc[1]]. 653 // A return value of nil indicates no match. 654 func (re *Regexp) FindIndex(b []byte) (loc []int) { 655 a := re.doExecute(nil, b, "", 0, 2) 656 if a == nil { 657 return nil 658 } 659 return a[0:2] 660 } 661 662 // FindString returns a string holding the text of the leftmost match in s of the regular 663 // expression. If there is no match, the return value is an empty string, 664 // but it will also be empty if the regular expression successfully matches 665 // an empty string. Use FindStringIndex or FindStringSubmatch if it is 666 // necessary to distinguish these cases. 667 func (re *Regexp) FindString(s string) string { 668 a := re.doExecute(nil, nil, s, 0, 2) 669 if a == nil { 670 return "" 671 } 672 return s[a[0]:a[1]] 673 } 674 675 // FindStringIndex returns a two-element slice of integers defining the 676 // location of the leftmost match in s of the regular expression. The match 677 // itself is at s[loc[0]:loc[1]]. 678 // A return value of nil indicates no match. 679 func (re *Regexp) FindStringIndex(s string) (loc []int) { 680 a := re.doExecute(nil, nil, s, 0, 2) 681 if a == nil { 682 return nil 683 } 684 return a[0:2] 685 } 686 687 // FindReaderIndex returns a two-element slice of integers defining the 688 // location of the leftmost match of the regular expression in text read from 689 // the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return 690 // value of nil indicates no match. 691 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 692 a := re.doExecute(r, nil, "", 0, 2) 693 if a == nil { 694 return nil 695 } 696 return a[0:2] 697 } 698 699 // FindSubmatch returns a slice of slices holding the text of the leftmost 700 // match of the regular expression in b and the matches, if any, of its 701 // subexpressions, as defined by the 'Submatch' descriptions in the package 702 // comment. 703 // A return value of nil indicates no match. 704 func (re *Regexp) FindSubmatch(b []byte) [][]byte { 705 a := re.doExecute(nil, b, "", 0, re.prog.NumCap) 706 if a == nil { 707 return nil 708 } 709 ret := make([][]byte, 1+re.numSubexp) 710 for i := range ret { 711 if 2*i < len(a) && a[2*i] >= 0 { 712 ret[i] = b[a[2*i]:a[2*i+1]] 713 } 714 } 715 return ret 716 } 717 718 // Expand appends template to dst and returns the result; during the 719 // append, Expand replaces variables in the template with corresponding 720 // matches drawn from src. The match slice should have been returned by 721 // FindSubmatchIndex. 722 // 723 // In the template, a variable is denoted by a substring of the form 724 // $name or ${name}, where name is a non-empty sequence of letters, 725 // digits, and underscores. A purely numeric name like $1 refers to 726 // the submatch with the corresponding index; other names refer to 727 // capturing parentheses named with the (?P<name>...) syntax. A 728 // reference to an out of range or unmatched index or a name that is not 729 // present in the regular expression is replaced with an empty string. 730 // 731 // In the $name form, name is taken to be as long as possible: $1x is 732 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 733 // 734 // To insert a literal $ in the output, use $$ in the template. 735 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 736 return re.expand(dst, string(template), src, "", match) 737 } 738 739 // ExpandString is like Expand but the template and source are strings. 740 // It appends to and returns a byte slice in order to give the calling 741 // code control over allocation. 742 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 743 return re.expand(dst, template, nil, src, match) 744 } 745 746 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 747 for len(template) > 0 { 748 i := strings.Index(template, "$") 749 if i < 0 { 750 break 751 } 752 dst = append(dst, template[:i]...) 753 template = template[i:] 754 if len(template) > 1 && template[1] == '$' { 755 // Treat $$ as $. 756 dst = append(dst, '$') 757 template = template[2:] 758 continue 759 } 760 name, num, rest, ok := extract(template) 761 if !ok { 762 // Malformed; treat $ as raw text. 763 dst = append(dst, '$') 764 template = template[1:] 765 continue 766 } 767 template = rest 768 if num >= 0 { 769 if 2*num+1 < len(match) { 770 if bsrc != nil { 771 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 772 } else { 773 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 774 } 775 } 776 } else { 777 for i, namei := range re.subexpNames { 778 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 779 if bsrc != nil { 780 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 781 } else { 782 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 783 } 784 break 785 } 786 } 787 } 788 } 789 dst = append(dst, template...) 790 return dst 791 } 792 793 // extract returns the name from a leading "$name" or "${name}" in str. 794 // If it is a number, extract returns num set to that number; otherwise num = -1. 795 func extract(str string) (name string, num int, rest string, ok bool) { 796 if len(str) < 2 || str[0] != '$' { 797 return 798 } 799 brace := false 800 if str[1] == '{' { 801 brace = true 802 str = str[2:] 803 } else { 804 str = str[1:] 805 } 806 i := 0 807 for i < len(str) { 808 rune, size := utf8.DecodeRuneInString(str[i:]) 809 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 810 break 811 } 812 i += size 813 } 814 if i == 0 { 815 // empty name is not okay 816 return 817 } 818 name = str[:i] 819 if brace { 820 if i >= len(str) || str[i] != '}' { 821 // missing closing brace 822 return 823 } 824 i++ 825 } 826 827 // Parse number. 828 num = 0 829 for i := 0; i < len(name); i++ { 830 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 831 num = -1 832 break 833 } 834 num = num*10 + int(name[i]) - '0' 835 } 836 // Disallow leading zeros. 837 if name[0] == '0' && len(name) > 1 { 838 num = -1 839 } 840 841 rest = str[i:] 842 ok = true 843 return 844 } 845 846 // FindSubmatchIndex returns a slice holding the index pairs identifying the 847 // leftmost match of the regular expression in b and the matches, if any, of 848 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 849 // in the package comment. 850 // A return value of nil indicates no match. 851 func (re *Regexp) FindSubmatchIndex(b []byte) []int { 852 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap)) 853 } 854 855 // FindStringSubmatch returns a slice of strings holding the text of the 856 // leftmost match of the regular expression in s and the matches, if any, of 857 // its subexpressions, as defined by the 'Submatch' description in the 858 // package comment. 859 // A return value of nil indicates no match. 860 func (re *Regexp) FindStringSubmatch(s string) []string { 861 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap) 862 if a == nil { 863 return nil 864 } 865 ret := make([]string, 1+re.numSubexp) 866 for i := range ret { 867 if 2*i < len(a) && a[2*i] >= 0 { 868 ret[i] = s[a[2*i]:a[2*i+1]] 869 } 870 } 871 return ret 872 } 873 874 // FindStringSubmatchIndex returns a slice holding the index pairs 875 // identifying the leftmost match of the regular expression in s and the 876 // matches, if any, of its subexpressions, as defined by the 'Submatch' and 877 // 'Index' descriptions in the package comment. 878 // A return value of nil indicates no match. 879 func (re *Regexp) FindStringSubmatchIndex(s string) []int { 880 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap)) 881 } 882 883 // FindReaderSubmatchIndex returns a slice holding the index pairs 884 // identifying the leftmost match of the regular expression of text read by 885 // the RuneReader, and the matches, if any, of its subexpressions, as defined 886 // by the 'Submatch' and 'Index' descriptions in the package comment. A 887 // return value of nil indicates no match. 888 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 889 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap)) 890 } 891 892 const startSize = 10 // The size at which to start a slice in the 'All' routines. 893 894 // FindAll is the 'All' version of Find; it returns a slice of all successive 895 // matches of the expression, as defined by the 'All' description in the 896 // package comment. 897 // A return value of nil indicates no match. 898 func (re *Regexp) FindAll(b []byte, n int) [][]byte { 899 if n < 0 { 900 n = len(b) + 1 901 } 902 result := make([][]byte, 0, startSize) 903 re.allMatches("", b, n, func(match []int) { 904 result = append(result, b[match[0]:match[1]]) 905 }) 906 if len(result) == 0 { 907 return nil 908 } 909 return result 910 } 911 912 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 913 // successive matches of the expression, as defined by the 'All' description 914 // in the package comment. 915 // A return value of nil indicates no match. 916 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 917 if n < 0 { 918 n = len(b) + 1 919 } 920 result := make([][]int, 0, startSize) 921 re.allMatches("", b, n, func(match []int) { 922 result = append(result, match[0:2]) 923 }) 924 if len(result) == 0 { 925 return nil 926 } 927 return result 928 } 929 930 // FindAllString is the 'All' version of FindString; it returns a slice of all 931 // successive matches of the expression, as defined by the 'All' description 932 // in the package comment. 933 // A return value of nil indicates no match. 934 func (re *Regexp) FindAllString(s string, n int) []string { 935 if n < 0 { 936 n = len(s) + 1 937 } 938 result := make([]string, 0, startSize) 939 re.allMatches(s, nil, n, func(match []int) { 940 result = append(result, s[match[0]:match[1]]) 941 }) 942 if len(result) == 0 { 943 return nil 944 } 945 return result 946 } 947 948 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 949 // slice of all successive matches of the expression, as defined by the 'All' 950 // description in the package comment. 951 // A return value of nil indicates no match. 952 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 953 if n < 0 { 954 n = len(s) + 1 955 } 956 result := make([][]int, 0, startSize) 957 re.allMatches(s, nil, n, func(match []int) { 958 result = append(result, match[0:2]) 959 }) 960 if len(result) == 0 { 961 return nil 962 } 963 return result 964 } 965 966 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 967 // of all successive matches of the expression, as defined by the 'All' 968 // description in the package comment. 969 // A return value of nil indicates no match. 970 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 971 if n < 0 { 972 n = len(b) + 1 973 } 974 result := make([][][]byte, 0, startSize) 975 re.allMatches("", b, n, func(match []int) { 976 slice := make([][]byte, len(match)/2) 977 for j := range slice { 978 if match[2*j] >= 0 { 979 slice[j] = b[match[2*j]:match[2*j+1]] 980 } 981 } 982 result = append(result, slice) 983 }) 984 if len(result) == 0 { 985 return nil 986 } 987 return result 988 } 989 990 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 991 // a slice of all successive matches of the expression, as defined by the 992 // 'All' description in the package comment. 993 // A return value of nil indicates no match. 994 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 995 if n < 0 { 996 n = len(b) + 1 997 } 998 result := make([][]int, 0, startSize) 999 re.allMatches("", b, n, func(match []int) { 1000 result = append(result, match) 1001 }) 1002 if len(result) == 0 { 1003 return nil 1004 } 1005 return result 1006 } 1007 1008 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1009 // returns a slice of all successive matches of the expression, as defined by 1010 // the 'All' description in the package comment. 1011 // A return value of nil indicates no match. 1012 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1013 if n < 0 { 1014 n = len(s) + 1 1015 } 1016 result := make([][]string, 0, startSize) 1017 re.allMatches(s, nil, n, func(match []int) { 1018 slice := make([]string, len(match)/2) 1019 for j := range slice { 1020 if match[2*j] >= 0 { 1021 slice[j] = s[match[2*j]:match[2*j+1]] 1022 } 1023 } 1024 result = append(result, slice) 1025 }) 1026 if len(result) == 0 { 1027 return nil 1028 } 1029 return result 1030 } 1031 1032 // FindAllStringSubmatchIndex is the 'All' version of 1033 // FindStringSubmatchIndex; it returns a slice of all successive matches of 1034 // the expression, as defined by the 'All' description in the package 1035 // comment. 1036 // A return value of nil indicates no match. 1037 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1038 if n < 0 { 1039 n = len(s) + 1 1040 } 1041 result := make([][]int, 0, startSize) 1042 re.allMatches(s, nil, n, func(match []int) { 1043 result = append(result, match) 1044 }) 1045 if len(result) == 0 { 1046 return nil 1047 } 1048 return result 1049 }