Source file src/pkg/regexp/syntax/regexp.go
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package syntax
6
7 // Note to implementers:
8 // In this package, re is always a *Regexp and r is always a rune.
9
10 import (
11 "bytes"
12 "strconv"
13 "strings"
14 "unicode"
15 )
16
17 // A Regexp is a node in a regular expression syntax tree.
18 type Regexp struct {
19 Op Op // operator
20 Flags Flags
21 Sub []*Regexp // subexpressions, if any
22 Sub0 [1]*Regexp // storage for short Sub
23 Rune []rune // matched runes, for OpLiteral, OpCharClass
24 Rune0 [2]rune // storage for short Rune
25 Min, Max int // min, max for OpRepeat
26 Cap int // capturing index, for OpCapture
27 Name string // capturing name, for OpCapture
28 }
29
30 // An Op is a single regular expression operator.
31 type Op uint8
32
33 // Operators are listed in precedence order, tightest binding to weakest.
34 // Character class operators are listed simplest to most complex
35 // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
36
37 const (
38 OpNoMatch Op = 1 + iota // matches no strings
39 OpEmptyMatch // matches empty string
40 OpLiteral // matches Runes sequence
41 OpCharClass // matches Runes interpreted as range pair list
42 OpAnyCharNotNL // matches any character
43 OpAnyChar // matches any character
44 OpBeginLine // matches empty string at beginning of line
45 OpEndLine // matches empty string at end of line
46 OpBeginText // matches empty string at beginning of text
47 OpEndText // matches empty string at end of text
48 OpWordBoundary // matches word boundary `\b`
49 OpNoWordBoundary // matches word non-boundary `\B`
50 OpCapture // capturing subexpression with index Cap, optional name Name
51 OpStar // matches Sub[0] zero or more times
52 OpPlus // matches Sub[0] one or more times
53 OpQuest // matches Sub[0] zero or one times
54 OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
55 OpConcat // matches concatenation of Subs
56 OpAlternate // matches alternation of Subs
57 )
58
59 const opPseudo Op = 128 // where pseudo-ops start
60
61 // Equal returns true if x and y have identical structure.
62 func (x *Regexp) Equal(y *Regexp) bool {
63 if x == nil || y == nil {
64 return x == y
65 }
66 if x.Op != y.Op {
67 return false
68 }
69 switch x.Op {
70 case OpEndText:
71 // The parse flags remember whether this is \z or \Z.
72 if x.Flags&WasDollar != y.Flags&WasDollar {
73 return false
74 }
75
76 case OpLiteral, OpCharClass:
77 if len(x.Rune) != len(y.Rune) {
78 return false
79 }
80 for i, r := range x.Rune {
81 if r != y.Rune[i] {
82 return false
83 }
84 }
85
86 case OpAlternate, OpConcat:
87 if len(x.Sub) != len(y.Sub) {
88 return false
89 }
90 for i, sub := range x.Sub {
91 if !sub.Equal(y.Sub[i]) {
92 return false
93 }
94 }
95
96 case OpStar, OpPlus, OpQuest:
97 if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
98 return false
99 }
100
101 case OpRepeat:
102 if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
103 return false
104 }
105
106 case OpCapture:
107 if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
108 return false
109 }
110 }
111 return true
112 }
113
114 // writeRegexp writes the Perl syntax for the regular expression re to b.
115 func writeRegexp(b *bytes.Buffer, re *Regexp) {
116 switch re.Op {
117 default:
118 b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
119 case OpNoMatch:
120 b.WriteString(`[^\x00-\x{10FFFF}]`)
121 case OpEmptyMatch:
122 b.WriteString(`(?:)`)
123 case OpLiteral:
124 if re.Flags&FoldCase != 0 {
125 b.WriteString(`(?i:`)
126 }
127 for _, r := range re.Rune {
128 escape(b, r, false)
129 }
130 if re.Flags&FoldCase != 0 {
131 b.WriteString(`)`)
132 }
133 case OpCharClass:
134 if len(re.Rune)%2 != 0 {
135 b.WriteString(`[invalid char class]`)
136 break
137 }
138 b.WriteRune('[')
139 if len(re.Rune) == 0 {
140 b.WriteString(`^\x00-\x{10FFFF}`)
141 } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
142 // Contains 0 and MaxRune. Probably a negated class.
143 // Print the gaps.
144 b.WriteRune('^')
145 for i := 1; i < len(re.Rune)-1; i += 2 {
146 lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
147 escape(b, lo, lo == '-')
148 if lo != hi {
149 b.WriteRune('-')
150 escape(b, hi, hi == '-')
151 }
152 }
153 } else {
154 for i := 0; i < len(re.Rune); i += 2 {
155 lo, hi := re.Rune[i], re.Rune[i+1]
156 escape(b, lo, lo == '-')
157 if lo != hi {
158 b.WriteRune('-')
159 escape(b, hi, hi == '-')
160 }
161 }
162 }
163 b.WriteRune(']')
164 case OpAnyCharNotNL:
165 b.WriteString(`(?-s:.)`)
166 case OpAnyChar:
167 b.WriteString(`(?s:.)`)
168 case OpBeginLine:
169 b.WriteRune('^')
170 case OpEndLine:
171 b.WriteRune('$')
172 case OpBeginText:
173 b.WriteString(`\A`)
174 case OpEndText:
175 if re.Flags&WasDollar != 0 {
176 b.WriteString(`(?-m:$)`)
177 } else {
178 b.WriteString(`\z`)
179 }
180 case OpWordBoundary:
181 b.WriteString(`\b`)
182 case OpNoWordBoundary:
183 b.WriteString(`\B`)
184 case OpCapture:
185 if re.Name != "" {
186 b.WriteString(`(?P<`)
187 b.WriteString(re.Name)
188 b.WriteRune('>')
189 } else {
190 b.WriteRune('(')
191 }
192 if re.Sub[0].Op != OpEmptyMatch {
193 writeRegexp(b, re.Sub[0])
194 }
195 b.WriteRune(')')
196 case OpStar, OpPlus, OpQuest, OpRepeat:
197 if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
198 b.WriteString(`(?:`)
199 writeRegexp(b, sub)
200 b.WriteString(`)`)
201 } else {
202 writeRegexp(b, sub)
203 }
204 switch re.Op {
205 case OpStar:
206 b.WriteRune('*')
207 case OpPlus:
208 b.WriteRune('+')
209 case OpQuest:
210 b.WriteRune('?')
211 case OpRepeat:
212 b.WriteRune('{')
213 b.WriteString(strconv.Itoa(re.Min))
214 if re.Max != re.Min {
215 b.WriteRune(',')
216 if re.Max >= 0 {
217 b.WriteString(strconv.Itoa(re.Max))
218 }
219 }
220 b.WriteRune('}')
221 }
222 if re.Flags&NonGreedy != 0 {
223 b.WriteRune('?')
224 }
225 case OpConcat:
226 for _, sub := range re.Sub {
227 if sub.Op == OpAlternate {
228 b.WriteString(`(?:`)
229 writeRegexp(b, sub)
230 b.WriteString(`)`)
231 } else {
232 writeRegexp(b, sub)
233 }
234 }
235 case OpAlternate:
236 for i, sub := range re.Sub {
237 if i > 0 {
238 b.WriteRune('|')
239 }
240 writeRegexp(b, sub)
241 }
242 }
243 }
244
245 func (re *Regexp) String() string {
246 var b bytes.Buffer
247 writeRegexp(&b, re)
248 return b.String()
249 }
250
251 const meta = `\.+*?()|[]{}^$`
252
253 func escape(b *bytes.Buffer, r rune, force bool) {
254 if unicode.IsPrint(r) {
255 if strings.IndexRune(meta, r) >= 0 || force {
256 b.WriteRune('\\')
257 }
258 b.WriteRune(r)
259 return
260 }
261
262 switch r {
263 case '\a':
264 b.WriteString(`\a`)
265 case '\f':
266 b.WriteString(`\f`)
267 case '\n':
268 b.WriteString(`\n`)
269 case '\r':
270 b.WriteString(`\r`)
271 case '\t':
272 b.WriteString(`\t`)
273 case '\v':
274 b.WriteString(`\v`)
275 default:
276 if r < 0x100 {
277 b.WriteString(`\x`)
278 s := strconv.FormatInt(int64(r), 16)
279 if len(s) == 1 {
280 b.WriteRune('0')
281 }
282 b.WriteString(s)
283 break
284 }
285 b.WriteString(`\x{`)
286 b.WriteString(strconv.FormatInt(int64(r), 16))
287 b.WriteString(`}`)
288 }
289 }
290
291 // MaxCap walks the regexp to find the maximum capture index.
292 func (re *Regexp) MaxCap() int {
293 m := 0
294 if re.Op == OpCapture {
295 m = re.Cap
296 }
297 for _, sub := range re.Sub {
298 if n := sub.MaxCap(); m < n {
299 m = n
300 }
301 }
302 return m
303 }
304
305 // CapNames walks the regexp to find the names of capturing groups.
306 func (re *Regexp) CapNames() []string {
307 names := make([]string, re.MaxCap()+1)
308 re.capNames(names)
309 return names
310 }
311
312 func (re *Regexp) capNames(names []string) {
313 if re.Op == OpCapture {
314 names[re.Cap] = re.Name
315 }
316 for _, sub := range re.Sub {
317 sub.capNames(names)
318 }
319 }