Source file src/pkg/regexp/syntax/prog.go
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package syntax
6
7 import (
8 "bytes"
9 "strconv"
10 "unicode"
11 )
12
13 // Compiled program.
14 // May not belong in this package, but convenient for now.
15
16 // A Prog is a compiled regular expression program.
17 type Prog struct {
18 Inst []Inst
19 Start int // index of start instruction
20 NumCap int // number of InstCapture insts in re
21 }
22
23 // An InstOp is an instruction opcode.
24 type InstOp uint8
25
26 const (
27 InstAlt InstOp = iota
28 InstAltMatch
29 InstCapture
30 InstEmptyWidth
31 InstMatch
32 InstFail
33 InstNop
34 InstRune
35 InstRune1
36 InstRuneAny
37 InstRuneAnyNotNL
38 )
39
40 // An EmptyOp specifies a kind or mixture of zero-width assertions.
41 type EmptyOp uint8
42
43 const (
44 EmptyBeginLine EmptyOp = 1 << iota
45 EmptyEndLine
46 EmptyBeginText
47 EmptyEndText
48 EmptyWordBoundary
49 EmptyNoWordBoundary
50 )
51
52 // EmptyOpContext returns the zero-width assertions
53 // satisfied at the position between the runes r1 and r2.
54 // Passing r1 == -1 indicates that the position is
55 // at the beginning of the text.
56 // Passing r2 == -1 indicates that the position is
57 // at the end of the text.
58 func EmptyOpContext(r1, r2 rune) EmptyOp {
59 var op EmptyOp
60 if r1 < 0 {
61 op |= EmptyBeginText | EmptyBeginLine
62 }
63 if r1 == '\n' {
64 op |= EmptyBeginLine
65 }
66 if r2 < 0 {
67 op |= EmptyEndText | EmptyEndLine
68 }
69 if r2 == '\n' {
70 op |= EmptyEndLine
71 }
72 if IsWordChar(r1) != IsWordChar(r2) {
73 op |= EmptyWordBoundary
74 } else {
75 op |= EmptyNoWordBoundary
76 }
77 return op
78 }
79
80 // IsWordChar reports whether r is consider a ``word character''
81 // during the evaluation of the \b and \B zero-width assertions.
82 // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
83 func IsWordChar(r rune) bool {
84 return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
85 }
86
87 // An Inst is a single instruction in a regular expression program.
88 type Inst struct {
89 Op InstOp
90 Out uint32 // all but InstMatch, InstFail
91 Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
92 Rune []rune
93 }
94
95 func (p *Prog) String() string {
96 var b bytes.Buffer
97 dumpProg(&b, p)
98 return b.String()
99 }
100
101 // skipNop follows any no-op or capturing instructions
102 // and returns the resulting pc.
103 func (p *Prog) skipNop(pc uint32) *Inst {
104 i := &p.Inst[pc]
105 for i.Op == InstNop || i.Op == InstCapture {
106 pc = i.Out
107 i = &p.Inst[pc]
108 }
109 return i
110 }
111
112 // op returns i.Op but merges all the Rune special cases into InstRune
113 func (i *Inst) op() InstOp {
114 op := i.Op
115 switch op {
116 case InstRune1, InstRuneAny, InstRuneAnyNotNL:
117 op = InstRune
118 }
119 return op
120 }
121
122 // Prefix returns a literal string that all matches for the
123 // regexp must start with. Complete is true if the prefix
124 // is the entire match.
125 func (p *Prog) Prefix() (prefix string, complete bool) {
126 i := p.skipNop(uint32(p.Start))
127
128 // Avoid allocation of buffer if prefix is empty.
129 if i.op() != InstRune || len(i.Rune) != 1 {
130 return "", i.Op == InstMatch
131 }
132
133 // Have prefix; gather characters.
134 var buf bytes.Buffer
135 for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
136 buf.WriteRune(i.Rune[0])
137 i = p.skipNop(i.Out)
138 }
139 return buf.String(), i.Op == InstMatch
140 }
141
142 // StartCond returns the leading empty-width conditions that must
143 // be true in any match. It returns ^EmptyOp(0) if no matches are possible.
144 func (p *Prog) StartCond() EmptyOp {
145 var flag EmptyOp
146 pc := uint32(p.Start)
147 i := &p.Inst[pc]
148 Loop:
149 for {
150 switch i.Op {
151 case InstEmptyWidth:
152 flag |= EmptyOp(i.Arg)
153 case InstFail:
154 return ^EmptyOp(0)
155 case InstCapture, InstNop:
156 // skip
157 default:
158 break Loop
159 }
160 pc = i.Out
161 i = &p.Inst[pc]
162 }
163 return flag
164 }
165
166 // MatchRune returns true if the instruction matches (and consumes) r.
167 // It should only be called when i.Op == InstRune.
168 func (i *Inst) MatchRune(r rune) bool {
169 rune := i.Rune
170
171 // Special case: single-rune slice is from literal string, not char class.
172 if len(rune) == 1 {
173 r0 := rune[0]
174 if r == r0 {
175 return true
176 }
177 if Flags(i.Arg)&FoldCase != 0 {
178 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
179 if r == r1 {
180 return true
181 }
182 }
183 }
184 return false
185 }
186
187 // Peek at the first few pairs.
188 // Should handle ASCII well.
189 for j := 0; j < len(rune) && j <= 8; j += 2 {
190 if r < rune[j] {
191 return false
192 }
193 if r <= rune[j+1] {
194 return true
195 }
196 }
197
198 // Otherwise binary search.
199 lo := 0
200 hi := len(rune) / 2
201 for lo < hi {
202 m := lo + (hi-lo)/2
203 if c := rune[2*m]; c <= r {
204 if r <= rune[2*m+1] {
205 return true
206 }
207 lo = m + 1
208 } else {
209 hi = m
210 }
211 }
212 return false
213 }
214
215 // As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
216 // Since we act on runes, it would be easy to support Unicode here.
217 func wordRune(r rune) bool {
218 return r == '_' ||
219 ('A' <= r && r <= 'Z') ||
220 ('a' <= r && r <= 'z') ||
221 ('0' <= r && r <= '9')
222 }
223
224 // MatchEmptyWidth returns true if the instruction matches
225 // an empty string between the runes before and after.
226 // It should only be called when i.Op == InstEmptyWidth.
227 func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
228 switch EmptyOp(i.Arg) {
229 case EmptyBeginLine:
230 return before == '\n' || before == -1
231 case EmptyEndLine:
232 return after == '\n' || after == -1
233 case EmptyBeginText:
234 return before == -1
235 case EmptyEndText:
236 return after == -1
237 case EmptyWordBoundary:
238 return wordRune(before) != wordRune(after)
239 case EmptyNoWordBoundary:
240 return wordRune(before) == wordRune(after)
241 }
242 panic("unknown empty width arg")
243 }
244
245 func (i *Inst) String() string {
246 var b bytes.Buffer
247 dumpInst(&b, i)
248 return b.String()
249 }
250
251 func bw(b *bytes.Buffer, args ...string) {
252 for _, s := range args {
253 b.WriteString(s)
254 }
255 }
256
257 func dumpProg(b *bytes.Buffer, p *Prog) {
258 for j := range p.Inst {
259 i := &p.Inst[j]
260 pc := strconv.Itoa(j)
261 if len(pc) < 3 {
262 b.WriteString(" "[len(pc):])
263 }
264 if j == p.Start {
265 pc += "*"
266 }
267 bw(b, pc, "\t")
268 dumpInst(b, i)
269 bw(b, "\n")
270 }
271 }
272
273 func u32(i uint32) string {
274 return strconv.FormatUint(uint64(i), 10)
275 }
276
277 func dumpInst(b *bytes.Buffer, i *Inst) {
278 switch i.Op {
279 case InstAlt:
280 bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
281 case InstAltMatch:
282 bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
283 case InstCapture:
284 bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
285 case InstEmptyWidth:
286 bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
287 case InstMatch:
288 bw(b, "match")
289 case InstFail:
290 bw(b, "fail")
291 case InstNop:
292 bw(b, "nop -> ", u32(i.Out))
293 case InstRune:
294 if i.Rune == nil {
295 // shouldn't happen
296 bw(b, "rune <nil>")
297 }
298 bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
299 if Flags(i.Arg)&FoldCase != 0 {
300 bw(b, "/i")
301 }
302 bw(b, " -> ", u32(i.Out))
303 case InstRune1:
304 bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
305 case InstRuneAny:
306 bw(b, "any -> ", u32(i.Out))
307 case InstRuneAnyNotNL:
308 bw(b, "anynotnl -> ", u32(i.Out))
309 }
310 }