Source file src/pkg/regexp/syntax/compile.go
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package syntax
6
7 import "unicode"
8
9 // A patchList is a list of instruction pointers that need to be filled in (patched).
10 // Because the pointers haven't been filled in yet, we can reuse their storage
11 // to hold the list. It's kind of sleazy, but works well in practice.
12 // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
13 //
14 // These aren't really pointers: they're integers, so we can reinterpret them
15 // this way without using package unsafe. A value l denotes
16 // p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1).
17 // l == 0 denotes the empty list, okay because we start every program
18 // with a fail instruction, so we'll never want to point at its output link.
19 type patchList uint32
20
21 func (l patchList) next(p *Prog) patchList {
22 i := &p.Inst[l>>1]
23 if l&1 == 0 {
24 return patchList(i.Out)
25 }
26 return patchList(i.Arg)
27 }
28
29 func (l patchList) patch(p *Prog, val uint32) {
30 for l != 0 {
31 i := &p.Inst[l>>1]
32 if l&1 == 0 {
33 l = patchList(i.Out)
34 i.Out = val
35 } else {
36 l = patchList(i.Arg)
37 i.Arg = val
38 }
39 }
40 }
41
42 func (l1 patchList) append(p *Prog, l2 patchList) patchList {
43 if l1 == 0 {
44 return l2
45 }
46 if l2 == 0 {
47 return l1
48 }
49
50 last := l1
51 for {
52 next := last.next(p)
53 if next == 0 {
54 break
55 }
56 last = next
57 }
58
59 i := &p.Inst[last>>1]
60 if last&1 == 0 {
61 i.Out = uint32(l2)
62 } else {
63 i.Arg = uint32(l2)
64 }
65 return l1
66 }
67
68 // A frag represents a compiled program fragment.
69 type frag struct {
70 i uint32 // index of first instruction
71 out patchList // where to record end instruction
72 }
73
74 type compiler struct {
75 p *Prog
76 }
77
78 // Compile compiles the regexp into a program to be executed.
79 // The regexp should have been simplified already (returned from re.Simplify).
80 func Compile(re *Regexp) (*Prog, error) {
81 var c compiler
82 c.init()
83 f := c.compile(re)
84 f.out.patch(c.p, c.inst(InstMatch).i)
85 c.p.Start = int(f.i)
86 return c.p, nil
87 }
88
89 func (c *compiler) init() {
90 c.p = new(Prog)
91 c.p.NumCap = 2 // implicit ( and ) for whole match $0
92 c.inst(InstFail)
93 }
94
95 var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
96 var anyRune = []rune{0, unicode.MaxRune}
97
98 func (c *compiler) compile(re *Regexp) frag {
99 switch re.Op {
100 case OpNoMatch:
101 return c.fail()
102 case OpEmptyMatch:
103 return c.nop()
104 case OpLiteral:
105 if len(re.Rune) == 0 {
106 return c.nop()
107 }
108 var f frag
109 for j := range re.Rune {
110 f1 := c.rune(re.Rune[j:j+1], re.Flags)
111 if j == 0 {
112 f = f1
113 } else {
114 f = c.cat(f, f1)
115 }
116 }
117 return f
118 case OpCharClass:
119 return c.rune(re.Rune, re.Flags)
120 case OpAnyCharNotNL:
121 return c.rune(anyRuneNotNL, 0)
122 case OpAnyChar:
123 return c.rune(anyRune, 0)
124 case OpBeginLine:
125 return c.empty(EmptyBeginLine)
126 case OpEndLine:
127 return c.empty(EmptyEndLine)
128 case OpBeginText:
129 return c.empty(EmptyBeginText)
130 case OpEndText:
131 return c.empty(EmptyEndText)
132 case OpWordBoundary:
133 return c.empty(EmptyWordBoundary)
134 case OpNoWordBoundary:
135 return c.empty(EmptyNoWordBoundary)
136 case OpCapture:
137 bra := c.cap(uint32(re.Cap << 1))
138 sub := c.compile(re.Sub[0])
139 ket := c.cap(uint32(re.Cap<<1 | 1))
140 return c.cat(c.cat(bra, sub), ket)
141 case OpStar:
142 return c.star(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
143 case OpPlus:
144 return c.plus(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
145 case OpQuest:
146 return c.quest(c.compile(re.Sub[0]), re.Flags&NonGreedy != 0)
147 case OpConcat:
148 if len(re.Sub) == 0 {
149 return c.nop()
150 }
151 var f frag
152 for i, sub := range re.Sub {
153 if i == 0 {
154 f = c.compile(sub)
155 } else {
156 f = c.cat(f, c.compile(sub))
157 }
158 }
159 return f
160 case OpAlternate:
161 var f frag
162 for _, sub := range re.Sub {
163 f = c.alt(f, c.compile(sub))
164 }
165 return f
166 }
167 panic("regexp: unhandled case in compile")
168 }
169
170 func (c *compiler) inst(op InstOp) frag {
171 // TODO: impose length limit
172 f := frag{i: uint32(len(c.p.Inst))}
173 c.p.Inst = append(c.p.Inst, Inst{Op: op})
174 return f
175 }
176
177 func (c *compiler) nop() frag {
178 f := c.inst(InstNop)
179 f.out = patchList(f.i << 1)
180 return f
181 }
182
183 func (c *compiler) fail() frag {
184 return frag{}
185 }
186
187 func (c *compiler) cap(arg uint32) frag {
188 f := c.inst(InstCapture)
189 f.out = patchList(f.i << 1)
190 c.p.Inst[f.i].Arg = arg
191
192 if c.p.NumCap < int(arg)+1 {
193 c.p.NumCap = int(arg) + 1
194 }
195 return f
196 }
197
198 func (c *compiler) cat(f1, f2 frag) frag {
199 // concat of failure is failure
200 if f1.i == 0 || f2.i == 0 {
201 return frag{}
202 }
203
204 // TODO: elide nop
205
206 f1.out.patch(c.p, f2.i)
207 return frag{f1.i, f2.out}
208 }
209
210 func (c *compiler) alt(f1, f2 frag) frag {
211 // alt of failure is other
212 if f1.i == 0 {
213 return f2
214 }
215 if f2.i == 0 {
216 return f1
217 }
218
219 f := c.inst(InstAlt)
220 i := &c.p.Inst[f.i]
221 i.Out = f1.i
222 i.Arg = f2.i
223 f.out = f1.out.append(c.p, f2.out)
224 return f
225 }
226
227 func (c *compiler) quest(f1 frag, nongreedy bool) frag {
228 f := c.inst(InstAlt)
229 i := &c.p.Inst[f.i]
230 if nongreedy {
231 i.Arg = f1.i
232 f.out = patchList(f.i << 1)
233 } else {
234 i.Out = f1.i
235 f.out = patchList(f.i<<1 | 1)
236 }
237 f.out = f.out.append(c.p, f1.out)
238 return f
239 }
240
241 func (c *compiler) star(f1 frag, nongreedy bool) frag {
242 f := c.inst(InstAlt)
243 i := &c.p.Inst[f.i]
244 if nongreedy {
245 i.Arg = f1.i
246 f.out = patchList(f.i << 1)
247 } else {
248 i.Out = f1.i
249 f.out = patchList(f.i<<1 | 1)
250 }
251 f1.out.patch(c.p, f.i)
252 return f
253 }
254
255 func (c *compiler) plus(f1 frag, nongreedy bool) frag {
256 return frag{f1.i, c.star(f1, nongreedy).out}
257 }
258
259 func (c *compiler) empty(op EmptyOp) frag {
260 f := c.inst(InstEmptyWidth)
261 c.p.Inst[f.i].Arg = uint32(op)
262 f.out = patchList(f.i << 1)
263 return f
264 }
265
266 func (c *compiler) rune(r []rune, flags Flags) frag {
267 f := c.inst(InstRune)
268 i := &c.p.Inst[f.i]
269 i.Rune = r
270 flags &= FoldCase // only relevant flag is FoldCase
271 if len(r) != 1 || unicode.SimpleFold(r[0]) == r[0] {
272 // and sometimes not even that
273 flags &^= FoldCase
274 }
275 i.Arg = uint32(flags)
276 f.out = patchList(f.i << 1)
277
278 // Special cases for exec machine.
279 switch {
280 case flags&FoldCase == 0 && (len(r) == 1 || len(r) == 2 && r[0] == r[1]):
281 i.Op = InstRune1
282 case len(r) == 2 && r[0] == 0 && r[1] == unicode.MaxRune:
283 i.Op = InstRuneAny
284 case len(r) == 4 && r[0] == 0 && r[1] == '\n'-1 && r[2] == '\n'+1 && r[3] == unicode.MaxRune:
285 i.Op = InstRuneAnyNotNL
286 }
287
288 return f
289 }