Source file src/pkg/text/template/parse/lex.go
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package parse
6
7 import (
8 "fmt"
9 "strings"
10 "unicode"
11 "unicode/utf8"
12 )
13
14 // item represents a token or text string returned from the scanner.
15 type item struct {
16 typ itemType
17 val string
18 }
19
20 func (i item) String() string {
21 switch {
22 case i.typ == itemEOF:
23 return "EOF"
24 case i.typ == itemError:
25 return i.val
26 case i.typ > itemKeyword:
27 return fmt.Sprintf("<%s>", i.val)
28 case len(i.val) > 10:
29 return fmt.Sprintf("%.10q...", i.val)
30 }
31 return fmt.Sprintf("%q", i.val)
32 }
33
// itemType identifies the type of lex items.
type itemType int

// The item types. The order matters: every type declared after
// itemKeyword is treated as a keyword (see item.String and
// lexIdentifier), so new non-keyword types must go before it.
const (
	itemError        itemType = iota // error occurred; value is text of error
	itemBool                         // boolean constant
	itemChar                         // printable ASCII character; grab bag for comma etc.
	itemCharConstant                 // character constant
	itemComplex                      // complex constant (1+2i); imaginary is just a number
	itemColonEquals                  // colon-equals (':=') introducing a declaration
	itemEOF                          // end of input
	itemField                        // alphanumeric identifier, starting with '.', possibly chained ('.x.y')
	itemIdentifier                   // alphanumeric identifier
	itemLeftDelim                    // left action delimiter
	itemNumber                       // simple number, including imaginary
	itemPipe                         // pipe symbol
	itemRawString                    // raw quoted string (includes quotes)
	itemRightDelim                   // right action delimiter
	itemString                       // quoted string (includes quotes)
	itemText                         // plain text
	itemVariable                     // variable starting with '$', such as '$' or '$1' or '$hello'.
	// Keywords appear after all the rest.
	itemKeyword  // used only to delimit the keywords
	itemDot      // the cursor, spelled '.'.
	itemDefine   // define keyword
	itemElse     // else keyword
	itemEnd      // end keyword
	itemIf       // if keyword
	itemRange    // range keyword
	itemTemplate // template keyword
	itemWith     // with keyword
)

// itemName maps each item type to a human-readable name so that the
// types pretty-print. itemKeyword is only a marker and has no name.
var itemName = map[itemType]string{
	itemError:        "error",
	itemBool:         "bool",
	itemChar:         "char",
	itemCharConstant: "charconst",
	itemComplex:      "complex",
	itemColonEquals:  ":=",
	itemEOF:          "EOF",
	itemField:        "field",
	itemIdentifier:   "identifier",
	itemLeftDelim:    "left delim",
	itemNumber:       "number",
	itemPipe:         "pipe",
	itemRawString:    "raw string",
	itemRightDelim:   "right delim",
	itemString:       "string",
	itemText:         "text", // was missing, so plain text printed as "item15"
	itemVariable:     "variable",
	// keywords
	itemDot:      ".",
	itemDefine:   "define",
	itemElse:     "else",
	itemIf:       "if",
	itemEnd:      "end",
	itemRange:    "range",
	itemTemplate: "template",
	itemWith:     "with",
}

// String returns the readable name of the item type, or "itemN" (N
// being the numeric value) for a type that has no entry in itemName.
func (i itemType) String() string {
	s := itemName[i]
	if s == "" {
		return fmt.Sprintf("item%d", int(i))
	}
	return s
}

// key maps the spelling of each keyword to its item type. A word that
// is absent yields the zero value, itemError, which is not greater
// than itemKeyword and so is never mistaken for a keyword.
var key = map[string]itemType{
	".":        itemDot,
	"define":   itemDefine,
	"else":     itemElse,
	"end":      itemEnd,
	"if":       itemIf,
	"range":    itemRange,
	"template": itemTemplate,
	"with":     itemWith,
}
114
115 const eof = -1
116
117 // stateFn represents the state of the scanner as a function that returns the next state.
118 type stateFn func(*lexer) stateFn
119
120 // lexer holds the state of the scanner.
121 type lexer struct {
122 name string // the name of the input; used only for error reports.
123 input string // the string being scanned.
124 leftDelim string // start of action.
125 rightDelim string // end of action.
126 state stateFn // the next lexing function to enter.
127 pos int // current position in the input.
128 start int // start position of this item.
129 width int // width of last rune read from input.
130 items chan item // channel of scanned items.
131 }
132
133 // next returns the next rune in the input.
134 func (l *lexer) next() (r rune) {
135 if l.pos >= len(l.input) {
136 l.width = 0
137 return eof
138 }
139 r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
140 l.pos += l.width
141 return r
142 }
143
144 // peek returns but does not consume the next rune in the input.
145 func (l *lexer) peek() rune {
146 r := l.next()
147 l.backup()
148 return r
149 }
150
151 // backup steps back one rune. Can only be called once per call of next.
152 func (l *lexer) backup() {
153 l.pos -= l.width
154 }
155
156 // emit passes an item back to the client.
157 func (l *lexer) emit(t itemType) {
158 l.items <- item{t, l.input[l.start:l.pos]}
159 l.start = l.pos
160 }
161
162 // ignore skips over the pending input before this point.
163 func (l *lexer) ignore() {
164 l.start = l.pos
165 }
166
167 // accept consumes the next rune if it's from the valid set.
168 func (l *lexer) accept(valid string) bool {
169 if strings.IndexRune(valid, l.next()) >= 0 {
170 return true
171 }
172 l.backup()
173 return false
174 }
175
176 // acceptRun consumes a run of runes from the valid set.
177 func (l *lexer) acceptRun(valid string) {
178 for strings.IndexRune(valid, l.next()) >= 0 {
179 }
180 l.backup()
181 }
182
183 // lineNumber reports which line we're on. Doing it this way
184 // means we don't have to worry about peek double counting.
185 func (l *lexer) lineNumber() int {
186 return 1 + strings.Count(l.input[:l.pos], "\n")
187 }
188
189 // error returns an error token and terminates the scan by passing
190 // back a nil pointer that will be the next state, terminating l.nextItem.
191 func (l *lexer) errorf(format string, args ...interface{}) stateFn {
192 l.items <- item{itemError, fmt.Sprintf(format, args...)}
193 return nil
194 }
195
196 // nextItem returns the next item from the input.
197 func (l *lexer) nextItem() item {
198 for {
199 select {
200 case item := <-l.items:
201 return item
202 default:
203 l.state = l.state(l)
204 }
205 }
206 panic("not reached")
207 }
208
209 // lex creates a new scanner for the input string.
210 func lex(name, input, left, right string) *lexer {
211 if left == "" {
212 left = leftDelim
213 }
214 if right == "" {
215 right = rightDelim
216 }
217 l := &lexer{
218 name: name,
219 input: input,
220 leftDelim: left,
221 rightDelim: right,
222 state: lexText,
223 items: make(chan item, 2), // Two items of buffering is sufficient for all state functions
224 }
225 return l
226 }
227
228 // state functions
229
// Default action delimiters.
const (
	leftDelim  = "{{"
	rightDelim = "}}"
)

// Markers that open and close a comment inside an action.
const (
	leftComment  = "/*"
	rightComment = "*/"
)
236
237 // lexText scans until an opening action delimiter, "{{".
238 func lexText(l *lexer) stateFn {
239 for {
240 if strings.HasPrefix(l.input[l.pos:], l.leftDelim) {
241 if l.pos > l.start {
242 l.emit(itemText)
243 }
244 return lexLeftDelim
245 }
246 if l.next() == eof {
247 break
248 }
249 }
250 // Correctly reached EOF.
251 if l.pos > l.start {
252 l.emit(itemText)
253 }
254 l.emit(itemEOF)
255 return nil
256 }
257
258 // lexLeftDelim scans the left delimiter, which is known to be present.
259 func lexLeftDelim(l *lexer) stateFn {
260 if strings.HasPrefix(l.input[l.pos:], l.leftDelim+leftComment) {
261 return lexComment
262 }
263 l.pos += len(l.leftDelim)
264 l.emit(itemLeftDelim)
265 return lexInsideAction
266 }
267
268 // lexComment scans a comment. The left comment marker is known to be present.
269 func lexComment(l *lexer) stateFn {
270 i := strings.Index(l.input[l.pos:], rightComment+l.rightDelim)
271 if i < 0 {
272 return l.errorf("unclosed comment")
273 }
274 l.pos += i + len(rightComment) + len(l.rightDelim)
275 l.ignore()
276 return lexText
277 }
278
279 // lexRightDelim scans the right delimiter, which is known to be present.
280 func lexRightDelim(l *lexer) stateFn {
281 l.pos += len(l.rightDelim)
282 l.emit(itemRightDelim)
283 return lexText
284 }
285
286 // lexInsideAction scans the elements inside action delimiters.
287 func lexInsideAction(l *lexer) stateFn {
288 // Either number, quoted string, or identifier.
289 // Spaces separate and are ignored.
290 // Pipe symbols separate and are emitted.
291 if strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
292 return lexRightDelim
293 }
294 switch r := l.next(); {
295 case r == eof || r == '\n':
296 return l.errorf("unclosed action")
297 case isSpace(r):
298 l.ignore()
299 case r == ':':
300 if l.next() != '=' {
301 return l.errorf("expected :=")
302 }
303 l.emit(itemColonEquals)
304 case r == '|':
305 l.emit(itemPipe)
306 case r == '"':
307 return lexQuote
308 case r == '`':
309 return lexRawQuote
310 case r == '$':
311 return lexIdentifier
312 case r == '\'':
313 return lexChar
314 case r == '.':
315 // special look-ahead for ".field" so we don't break l.backup().
316 if l.pos < len(l.input) {
317 r := l.input[l.pos]
318 if r < '0' || '9' < r {
319 return lexIdentifier // itemDot comes from the keyword table.
320 }
321 }
322 fallthrough // '.' can start a number.
323 case r == '+' || r == '-' || ('0' <= r && r <= '9'):
324 l.backup()
325 return lexNumber
326 case isAlphaNumeric(r):
327 l.backup()
328 return lexIdentifier
329 case r <= unicode.MaxASCII && unicode.IsPrint(r):
330 l.emit(itemChar)
331 return lexInsideAction
332 default:
333 return l.errorf("unrecognized character in action: %#U", r)
334 }
335 return lexInsideAction
336 }
337
338 // lexIdentifier scans an alphanumeric or field.
339 func lexIdentifier(l *lexer) stateFn {
340 Loop:
341 for {
342 switch r := l.next(); {
343 case isAlphaNumeric(r):
344 // absorb.
345 case r == '.' && (l.input[l.start] == '.' || l.input[l.start] == '$'):
346 // field chaining; absorb into one token.
347 default:
348 l.backup()
349 word := l.input[l.start:l.pos]
350 if !l.atTerminator() {
351 return l.errorf("unexpected character %+U", r)
352 }
353 switch {
354 case key[word] > itemKeyword:
355 l.emit(key[word])
356 case word[0] == '.':
357 l.emit(itemField)
358 case word[0] == '$':
359 l.emit(itemVariable)
360 case word == "true", word == "false":
361 l.emit(itemBool)
362 default:
363 l.emit(itemIdentifier)
364 }
365 break Loop
366 }
367 }
368 return lexInsideAction
369 }
370
371 // atTerminator reports whether the input is at valid termination character to
372 // appear after an identifier. Mostly to catch cases like "$x+2" not being
373 // acceptable without a space, in case we decide one day to implement
374 // arithmetic.
375 func (l *lexer) atTerminator() bool {
376 r := l.peek()
377 if isSpace(r) {
378 return true
379 }
380 switch r {
381 case eof, ',', '|', ':':
382 return true
383 }
384 // Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
385 // succeed but should fail) but only in extremely rare cases caused by willfully
386 // bad choice of delimiter.
387 if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
388 return true
389 }
390 return false
391 }
392
393 // lexChar scans a character constant. The initial quote is already
394 // scanned. Syntax checking is done by the parse.
395 func lexChar(l *lexer) stateFn {
396 Loop:
397 for {
398 switch l.next() {
399 case '\\':
400 if r := l.next(); r != eof && r != '\n' {
401 break
402 }
403 fallthrough
404 case eof, '\n':
405 return l.errorf("unterminated character constant")
406 case '\'':
407 break Loop
408 }
409 }
410 l.emit(itemCharConstant)
411 return lexInsideAction
412 }
413
414 // lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
415 // isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
416 // and "089" - but when it's wrong the input is invalid and the parser (via
417 // strconv) will notice.
418 func lexNumber(l *lexer) stateFn {
419 if !l.scanNumber() {
420 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
421 }
422 if sign := l.peek(); sign == '+' || sign == '-' {
423 // Complex: 1+2i. No spaces, must end in 'i'.
424 if !l.scanNumber() || l.input[l.pos-1] != 'i' {
425 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
426 }
427 l.emit(itemComplex)
428 } else {
429 l.emit(itemNumber)
430 }
431 return lexInsideAction
432 }
433
434 func (l *lexer) scanNumber() bool {
435 // Optional leading sign.
436 l.accept("+-")
437 // Is it hex?
438 digits := "0123456789"
439 if l.accept("0") && l.accept("xX") {
440 digits = "0123456789abcdefABCDEF"
441 }
442 l.acceptRun(digits)
443 if l.accept(".") {
444 l.acceptRun(digits)
445 }
446 if l.accept("eE") {
447 l.accept("+-")
448 l.acceptRun("0123456789")
449 }
450 // Is it imaginary?
451 l.accept("i")
452 // Next thing mustn't be alphanumeric.
453 if isAlphaNumeric(l.peek()) {
454 l.next()
455 return false
456 }
457 return true
458 }
459
460 // lexQuote scans a quoted string.
461 func lexQuote(l *lexer) stateFn {
462 Loop:
463 for {
464 switch l.next() {
465 case '\\':
466 if r := l.next(); r != eof && r != '\n' {
467 break
468 }
469 fallthrough
470 case eof, '\n':
471 return l.errorf("unterminated quoted string")
472 case '"':
473 break Loop
474 }
475 }
476 l.emit(itemString)
477 return lexInsideAction
478 }
479
480 // lexRawQuote scans a raw quoted string.
481 func lexRawQuote(l *lexer) stateFn {
482 Loop:
483 for {
484 switch l.next() {
485 case eof, '\n':
486 return l.errorf("unterminated raw quoted string")
487 case '`':
488 break Loop
489 }
490 }
491 l.emit(itemRawString)
492 return lexInsideAction
493 }
494
// isSpace reports whether r is one of the four characters the scanner
// treats as a space: blank, tab, newline or carriage return.
func isSpace(r rune) bool {
	return r == ' ' || r == '\t' || r == '\n' || r == '\r'
}
503
// isAlphaNumeric reports whether r can be part of an identifier: an
// underscore, a Unicode letter or a Unicode digit.
func isAlphaNumeric(r rune) bool {
	switch {
	case r == '_':
		return true
	case unicode.IsLetter(r):
		return true
	}
	return unicode.IsDigit(r)
}