src/pkg/regexp/syntax/regexp.go - The Go Programming Language

Golang

Source file src/pkg/regexp/syntax/regexp.go

     1	// Copyright 2011 The Go Authors.  All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package syntax
     6	
     7	// Note to implementers:
     8	// In this package, re is always a *Regexp and r is always a rune.
     9	
    10	import (
    11		"bytes"
    12		"strconv"
    13		"strings"
    14		"unicode"
    15	)
    16	
    17	// A Regexp is a node in a regular expression syntax tree.
    18	type Regexp struct {
    19		Op       Op // operator
    20		Flags    Flags
    21		Sub      []*Regexp  // subexpressions, if any
    22		Sub0     [1]*Regexp // storage for short Sub
    23		Rune     []rune     // matched runes, for OpLiteral, OpCharClass
    24		Rune0    [2]rune    // storage for short Rune
    25		Min, Max int        // min, max for OpRepeat
    26		Cap      int        // capturing index, for OpCapture
    27		Name     string     // capturing name, for OpCapture
    28	}
    29	
    30	// An Op is a single regular expression operator.
    31	type Op uint8
    32	
    33	// Operators are listed in precedence order, tightest binding to weakest.
    34	// Character class operators are listed simplest to most complex
    35	// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
    36	
    37	const (
    38		OpNoMatch        Op = 1 + iota // matches no strings
    39		OpEmptyMatch                   // matches empty string
    40		OpLiteral                      // matches Runes sequence
    41		OpCharClass                    // matches Runes interpreted as range pair list
    42		OpAnyCharNotNL                 // matches any character
    43		OpAnyChar                      // matches any character
    44		OpBeginLine                    // matches empty string at beginning of line
    45		OpEndLine                      // matches empty string at end of line
    46		OpBeginText                    // matches empty string at beginning of text
    47		OpEndText                      // matches empty string at end of text
    48		OpWordBoundary                 // matches word boundary `\b`
    49		OpNoWordBoundary               // matches word non-boundary `\B`
    50		OpCapture                      // capturing subexpression with index Cap, optional name Name
    51		OpStar                         // matches Sub[0] zero or more times
    52		OpPlus                         // matches Sub[0] one or more times
    53		OpQuest                        // matches Sub[0] zero or one times
    54		OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
    55		OpConcat                       // matches concatenation of Subs
    56		OpAlternate                    // matches alternation of Subs
    57	)
    58	
    59	const opPseudo Op = 128 // where pseudo-ops start
    60	
    61	// Equal returns true if x and y have identical structure.
    62	func (x *Regexp) Equal(y *Regexp) bool {
    63		if x == nil || y == nil {
    64			return x == y
    65		}
    66		if x.Op != y.Op {
    67			return false
    68		}
    69		switch x.Op {
    70		case OpEndText:
    71			// The parse flags remember whether this is \z or \Z.
    72			if x.Flags&WasDollar != y.Flags&WasDollar {
    73				return false
    74			}
    75	
    76		case OpLiteral, OpCharClass:
    77			if len(x.Rune) != len(y.Rune) {
    78				return false
    79			}
    80			for i, r := range x.Rune {
    81				if r != y.Rune[i] {
    82					return false
    83				}
    84			}
    85	
    86		case OpAlternate, OpConcat:
    87			if len(x.Sub) != len(y.Sub) {
    88				return false
    89			}
    90			for i, sub := range x.Sub {
    91				if !sub.Equal(y.Sub[i]) {
    92					return false
    93				}
    94			}
    95	
    96		case OpStar, OpPlus, OpQuest:
    97			if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
    98				return false
    99			}
   100	
   101		case OpRepeat:
   102			if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
   103				return false
   104			}
   105	
   106		case OpCapture:
   107			if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
   108				return false
   109			}
   110		}
   111		return true
   112	}
   113	
   114	// writeRegexp writes the Perl syntax for the regular expression re to b.
   115	func writeRegexp(b *bytes.Buffer, re *Regexp) {
   116		switch re.Op {
   117		default:
   118			b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
   119		case OpNoMatch:
   120			b.WriteString(`[^\x00-\x{10FFFF}]`)
   121		case OpEmptyMatch:
   122			b.WriteString(`(?:)`)
   123		case OpLiteral:
   124			if re.Flags&FoldCase != 0 {
   125				b.WriteString(`(?i:`)
   126			}
   127			for _, r := range re.Rune {
   128				escape(b, r, false)
   129			}
   130			if re.Flags&FoldCase != 0 {
   131				b.WriteString(`)`)
   132			}
   133		case OpCharClass:
   134			if len(re.Rune)%2 != 0 {
   135				b.WriteString(`[invalid char class]`)
   136				break
   137			}
   138			b.WriteRune('[')
   139			if len(re.Rune) == 0 {
   140				b.WriteString(`^\x00-\x{10FFFF}`)
   141			} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
   142				// Contains 0 and MaxRune.  Probably a negated class.
   143				// Print the gaps.
   144				b.WriteRune('^')
   145				for i := 1; i < len(re.Rune)-1; i += 2 {
   146					lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
   147					escape(b, lo, lo == '-')
   148					if lo != hi {
   149						b.WriteRune('-')
   150						escape(b, hi, hi == '-')
   151					}
   152				}
   153			} else {
   154				for i := 0; i < len(re.Rune); i += 2 {
   155					lo, hi := re.Rune[i], re.Rune[i+1]
   156					escape(b, lo, lo == '-')
   157					if lo != hi {
   158						b.WriteRune('-')
   159						escape(b, hi, hi == '-')
   160					}
   161				}
   162			}
   163			b.WriteRune(']')
   164		case OpAnyCharNotNL:
   165			b.WriteString(`(?-s:.)`)
   166		case OpAnyChar:
   167			b.WriteString(`(?s:.)`)
   168		case OpBeginLine:
   169			b.WriteRune('^')
   170		case OpEndLine:
   171			b.WriteRune('$')
   172		case OpBeginText:
   173			b.WriteString(`\A`)
   174		case OpEndText:
   175			if re.Flags&WasDollar != 0 {
   176				b.WriteString(`(?-m:$)`)
   177			} else {
   178				b.WriteString(`\z`)
   179			}
   180		case OpWordBoundary:
   181			b.WriteString(`\b`)
   182		case OpNoWordBoundary:
   183			b.WriteString(`\B`)
   184		case OpCapture:
   185			if re.Name != "" {
   186				b.WriteString(`(?P<`)
   187				b.WriteString(re.Name)
   188				b.WriteRune('>')
   189			} else {
   190				b.WriteRune('(')
   191			}
   192			if re.Sub[0].Op != OpEmptyMatch {
   193				writeRegexp(b, re.Sub[0])
   194			}
   195			b.WriteRune(')')
   196		case OpStar, OpPlus, OpQuest, OpRepeat:
   197			if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
   198				b.WriteString(`(?:`)
   199				writeRegexp(b, sub)
   200				b.WriteString(`)`)
   201			} else {
   202				writeRegexp(b, sub)
   203			}
   204			switch re.Op {
   205			case OpStar:
   206				b.WriteRune('*')
   207			case OpPlus:
   208				b.WriteRune('+')
   209			case OpQuest:
   210				b.WriteRune('?')
   211			case OpRepeat:
   212				b.WriteRune('{')
   213				b.WriteString(strconv.Itoa(re.Min))
   214				if re.Max != re.Min {
   215					b.WriteRune(',')
   216					if re.Max >= 0 {
   217						b.WriteString(strconv.Itoa(re.Max))
   218					}
   219				}
   220				b.WriteRune('}')
   221			}
   222			if re.Flags&NonGreedy != 0 {
   223				b.WriteRune('?')
   224			}
   225		case OpConcat:
   226			for _, sub := range re.Sub {
   227				if sub.Op == OpAlternate {
   228					b.WriteString(`(?:`)
   229					writeRegexp(b, sub)
   230					b.WriteString(`)`)
   231				} else {
   232					writeRegexp(b, sub)
   233				}
   234			}
   235		case OpAlternate:
   236			for i, sub := range re.Sub {
   237				if i > 0 {
   238					b.WriteRune('|')
   239				}
   240				writeRegexp(b, sub)
   241			}
   242		}
   243	}
   244	
   245	func (re *Regexp) String() string {
   246		var b bytes.Buffer
   247		writeRegexp(&b, re)
   248		return b.String()
   249	}
   250	
   251	const meta = `\.+*?()|[]{}^$`
   252	
   253	func escape(b *bytes.Buffer, r rune, force bool) {
   254		if unicode.IsPrint(r) {
   255			if strings.IndexRune(meta, r) >= 0 || force {
   256				b.WriteRune('\\')
   257			}
   258			b.WriteRune(r)
   259			return
   260		}
   261	
   262		switch r {
   263		case '\a':
   264			b.WriteString(`\a`)
   265		case '\f':
   266			b.WriteString(`\f`)
   267		case '\n':
   268			b.WriteString(`\n`)
   269		case '\r':
   270			b.WriteString(`\r`)
   271		case '\t':
   272			b.WriteString(`\t`)
   273		case '\v':
   274			b.WriteString(`\v`)
   275		default:
   276			if r < 0x100 {
   277				b.WriteString(`\x`)
   278				s := strconv.FormatInt(int64(r), 16)
   279				if len(s) == 1 {
   280					b.WriteRune('0')
   281				}
   282				b.WriteString(s)
   283				break
   284			}
   285			b.WriteString(`\x{`)
   286			b.WriteString(strconv.FormatInt(int64(r), 16))
   287			b.WriteString(`}`)
   288		}
   289	}
   290	
   291	// MaxCap walks the regexp to find the maximum capture index.
   292	func (re *Regexp) MaxCap() int {
   293		m := 0
   294		if re.Op == OpCapture {
   295			m = re.Cap
   296		}
   297		for _, sub := range re.Sub {
   298			if n := sub.MaxCap(); m < n {
   299				m = n
   300			}
   301		}
   302		return m
   303	}
   304	
   305	// CapNames walks the regexp to find the names of capturing groups.
   306	func (re *Regexp) CapNames() []string {
   307		names := make([]string, re.MaxCap()+1)
   308		re.capNames(names)
   309		return names
   310	}
   311	
   312	func (re *Regexp) capNames(names []string) {
   313		if re.Op == OpCapture {
   314			names[re.Cap] = re.Name
   315		}
   316		for _, sub := range re.Sub {
   317			sub.capNames(names)
   318		}
   319	}