Source file src/pkg/text/template/parse/lex.go
1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package parse 6 7 import ( 8 "fmt" 9 "strings" 10 "unicode" 11 "unicode/utf8" 12 ) 13 14 // item represents a token or text string returned from the scanner. 15 type item struct { 16 typ itemType 17 val string 18 } 19 20 func (i item) String() string { 21 switch { 22 case i.typ == itemEOF: 23 return "EOF" 24 case i.typ == itemError: 25 return i.val 26 case i.typ > itemKeyword: 27 return fmt.Sprintf("<%s>", i.val) 28 case len(i.val) > 10: 29 return fmt.Sprintf("%.10q...", i.val) 30 } 31 return fmt.Sprintf("%q", i.val) 32 } 33 34 // itemType identifies the type of lex items. 35 type itemType int 36 37 const ( 38 itemError itemType = iota // error occurred; value is text of error 39 itemBool // boolean constant 40 itemChar // printable ASCII character; grab bag for comma etc. 41 itemCharConstant // character constant 42 itemComplex // complex constant (1+2i); imaginary is just a number 43 itemColonEquals // colon-equals (':=') introducing a declaration 44 itemEOF 45 itemField // alphanumeric identifier, starting with '.', possibly chained ('.x.y') 46 itemIdentifier // alphanumeric identifier 47 itemLeftDelim // left action delimiter 48 itemNumber // simple number, including imaginary 49 itemPipe // pipe symbol 50 itemRawString // raw quoted string (includes quotes) 51 itemRightDelim // right action delimiter 52 itemString // quoted string (includes quotes) 53 itemText // plain text 54 itemVariable // variable starting with '$', such as '$' or '$1' or '$hello'. 55 // Keywords appear after all the rest. 56 itemKeyword // used only to delimit the keywords 57 itemDot // the cursor, spelled '.'. 
58 itemDefine // define keyword 59 itemElse // else keyword 60 itemEnd // end keyword 61 itemIf // if keyword 62 itemRange // range keyword 63 itemTemplate // template keyword 64 itemWith // with keyword 65 ) 66 67 // Make the types prettyprint. 68 var itemName = map[itemType]string{ 69 itemError: "error", 70 itemBool: "bool", 71 itemChar: "char", 72 itemCharConstant: "charconst", 73 itemComplex: "complex", 74 itemColonEquals: ":=", 75 itemEOF: "EOF", 76 itemField: "field", 77 itemIdentifier: "identifier", 78 itemLeftDelim: "left delim", 79 itemNumber: "number", 80 itemPipe: "pipe", 81 itemRawString: "raw string", 82 itemRightDelim: "right delim", 83 itemString: "string", 84 itemVariable: "variable", 85 // keywords 86 itemDot: ".", 87 itemDefine: "define", 88 itemElse: "else", 89 itemIf: "if", 90 itemEnd: "end", 91 itemRange: "range", 92 itemTemplate: "template", 93 itemWith: "with", 94 } 95 96 func (i itemType) String() string { 97 s := itemName[i] 98 if s == "" { 99 return fmt.Sprintf("item%d", int(i)) 100 } 101 return s 102 } 103 104 var key = map[string]itemType{ 105 ".": itemDot, 106 "define": itemDefine, 107 "else": itemElse, 108 "end": itemEnd, 109 "if": itemIf, 110 "range": itemRange, 111 "template": itemTemplate, 112 "with": itemWith, 113 } 114 115 const eof = -1 116 117 // stateFn represents the state of the scanner as a function that returns the next state. 118 type stateFn func(*lexer) stateFn 119 120 // lexer holds the state of the scanner. 121 type lexer struct { 122 name string // the name of the input; used only for error reports. 123 input string // the string being scanned. 124 leftDelim string // start of action. 125 rightDelim string // end of action. 126 state stateFn // the next lexing function to enter. 127 pos int // current position in the input. 128 start int // start position of this item. 129 width int // width of last rune read from input. 130 items chan item // channel of scanned items. 
131 } 132 133 // next returns the next rune in the input. 134 func (l *lexer) next() (r rune) { 135 if l.pos >= len(l.input) { 136 l.width = 0 137 return eof 138 } 139 r, l.width = utf8.DecodeRuneInString(l.input[l.pos:]) 140 l.pos += l.width 141 return r 142 } 143 144 // peek returns but does not consume the next rune in the input. 145 func (l *lexer) peek() rune { 146 r := l.next() 147 l.backup() 148 return r 149 } 150 151 // backup steps back one rune. Can only be called once per call of next. 152 func (l *lexer) backup() { 153 l.pos -= l.width 154 } 155 156 // emit passes an item back to the client. 157 func (l *lexer) emit(t itemType) { 158 l.items <- item{t, l.input[l.start:l.pos]} 159 l.start = l.pos 160 } 161 162 // ignore skips over the pending input before this point. 163 func (l *lexer) ignore() { 164 l.start = l.pos 165 } 166 167 // accept consumes the next rune if it's from the valid set. 168 func (l *lexer) accept(valid string) bool { 169 if strings.IndexRune(valid, l.next()) >= 0 { 170 return true 171 } 172 l.backup() 173 return false 174 } 175 176 // acceptRun consumes a run of runes from the valid set. 177 func (l *lexer) acceptRun(valid string) { 178 for strings.IndexRune(valid, l.next()) >= 0 { 179 } 180 l.backup() 181 } 182 183 // lineNumber reports which line we're on. Doing it this way 184 // means we don't have to worry about peek double counting. 185 func (l *lexer) lineNumber() int { 186 return 1 + strings.Count(l.input[:l.pos], "\n") 187 } 188 189 // error returns an error token and terminates the scan by passing 190 // back a nil pointer that will be the next state, terminating l.nextItem. 191 func (l *lexer) errorf(format string, args ...interface{}) stateFn { 192 l.items <- item{itemError, fmt.Sprintf(format, args...)} 193 return nil 194 } 195 196 // nextItem returns the next item from the input. 
197 func (l *lexer) nextItem() item { 198 for { 199 select { 200 case item := <-l.items: 201 return item 202 default: 203 l.state = l.state(l) 204 } 205 } 206 panic("not reached") 207 } 208 209 // lex creates a new scanner for the input string. 210 func lex(name, input, left, right string) *lexer { 211 if left == "" { 212 left = leftDelim 213 } 214 if right == "" { 215 right = rightDelim 216 } 217 l := &lexer{ 218 name: name, 219 input: input, 220 leftDelim: left, 221 rightDelim: right, 222 state: lexText, 223 items: make(chan item, 2), // Two items of buffering is sufficient for all state functions 224 } 225 return l 226 } 227 228 // state functions 229 230 const ( 231 leftDelim = "{{" 232 rightDelim = "}}" 233 leftComment = "/*" 234 rightComment = "*/" 235 ) 236 237 // lexText scans until an opening action delimiter, "{{". 238 func lexText(l *lexer) stateFn { 239 for { 240 if strings.HasPrefix(l.input[l.pos:], l.leftDelim) { 241 if l.pos > l.start { 242 l.emit(itemText) 243 } 244 return lexLeftDelim 245 } 246 if l.next() == eof { 247 break 248 } 249 } 250 // Correctly reached EOF. 251 if l.pos > l.start { 252 l.emit(itemText) 253 } 254 l.emit(itemEOF) 255 return nil 256 } 257 258 // lexLeftDelim scans the left delimiter, which is known to be present. 259 func lexLeftDelim(l *lexer) stateFn { 260 if strings.HasPrefix(l.input[l.pos:], l.leftDelim+leftComment) { 261 return lexComment 262 } 263 l.pos += len(l.leftDelim) 264 l.emit(itemLeftDelim) 265 return lexInsideAction 266 } 267 268 // lexComment scans a comment. The left comment marker is known to be present. 269 func lexComment(l *lexer) stateFn { 270 i := strings.Index(l.input[l.pos:], rightComment+l.rightDelim) 271 if i < 0 { 272 return l.errorf("unclosed comment") 273 } 274 l.pos += i + len(rightComment) + len(l.rightDelim) 275 l.ignore() 276 return lexText 277 } 278 279 // lexRightDelim scans the right delimiter, which is known to be present. 
280 func lexRightDelim(l *lexer) stateFn { 281 l.pos += len(l.rightDelim) 282 l.emit(itemRightDelim) 283 return lexText 284 } 285 286 // lexInsideAction scans the elements inside action delimiters. 287 func lexInsideAction(l *lexer) stateFn { 288 // Either number, quoted string, or identifier. 289 // Spaces separate and are ignored. 290 // Pipe symbols separate and are emitted. 291 if strings.HasPrefix(l.input[l.pos:], l.rightDelim) { 292 return lexRightDelim 293 } 294 switch r := l.next(); { 295 case r == eof || r == '\n': 296 return l.errorf("unclosed action") 297 case isSpace(r): 298 l.ignore() 299 case r == ':': 300 if l.next() != '=' { 301 return l.errorf("expected :=") 302 } 303 l.emit(itemColonEquals) 304 case r == '|': 305 l.emit(itemPipe) 306 case r == '"': 307 return lexQuote 308 case r == '`': 309 return lexRawQuote 310 case r == '$': 311 return lexIdentifier 312 case r == '\'': 313 return lexChar 314 case r == '.': 315 // special look-ahead for ".field" so we don't break l.backup(). 316 if l.pos < len(l.input) { 317 r := l.input[l.pos] 318 if r < '0' || '9' < r { 319 return lexIdentifier // itemDot comes from the keyword table. 320 } 321 } 322 fallthrough // '.' can start a number. 323 case r == '+' || r == '-' || ('0' <= r && r <= '9'): 324 l.backup() 325 return lexNumber 326 case isAlphaNumeric(r): 327 l.backup() 328 return lexIdentifier 329 case r <= unicode.MaxASCII && unicode.IsPrint(r): 330 l.emit(itemChar) 331 return lexInsideAction 332 default: 333 return l.errorf("unrecognized character in action: %#U", r) 334 } 335 return lexInsideAction 336 } 337 338 // lexIdentifier scans an alphanumeric or field. 339 func lexIdentifier(l *lexer) stateFn { 340 Loop: 341 for { 342 switch r := l.next(); { 343 case isAlphaNumeric(r): 344 // absorb. 345 case r == '.' && (l.input[l.start] == '.' || l.input[l.start] == '$'): 346 // field chaining; absorb into one token. 
347 default: 348 l.backup() 349 word := l.input[l.start:l.pos] 350 if !l.atTerminator() { 351 return l.errorf("unexpected character %+U", r) 352 } 353 switch { 354 case key[word] > itemKeyword: 355 l.emit(key[word]) 356 case word[0] == '.': 357 l.emit(itemField) 358 case word[0] == '$': 359 l.emit(itemVariable) 360 case word == "true", word == "false": 361 l.emit(itemBool) 362 default: 363 l.emit(itemIdentifier) 364 } 365 break Loop 366 } 367 } 368 return lexInsideAction 369 } 370 371 // atTerminator reports whether the input is at valid termination character to 372 // appear after an identifier. Mostly to catch cases like "$x+2" not being 373 // acceptable without a space, in case we decide one day to implement 374 // arithmetic. 375 func (l *lexer) atTerminator() bool { 376 r := l.peek() 377 if isSpace(r) { 378 return true 379 } 380 switch r { 381 case eof, ',', '|', ':': 382 return true 383 } 384 // Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will 385 // succeed but should fail) but only in extremely rare cases caused by willfully 386 // bad choice of delimiter. 387 if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r { 388 return true 389 } 390 return false 391 } 392 393 // lexChar scans a character constant. The initial quote is already 394 // scanned. Syntax checking is done by the parse. 395 func lexChar(l *lexer) stateFn { 396 Loop: 397 for { 398 switch l.next() { 399 case '\\': 400 if r := l.next(); r != eof && r != '\n' { 401 break 402 } 403 fallthrough 404 case eof, '\n': 405 return l.errorf("unterminated character constant") 406 case '\'': 407 break Loop 408 } 409 } 410 l.emit(itemCharConstant) 411 return lexInsideAction 412 } 413 414 // lexNumber scans a number: decimal, octal, hex, float, or imaginary. This 415 // isn't a perfect number scanner - for instance it accepts "." and "0x0.2" 416 // and "089" - but when it's wrong the input is invalid and the parser (via 417 // strconv) will notice. 
418 func lexNumber(l *lexer) stateFn { 419 if !l.scanNumber() { 420 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos]) 421 } 422 if sign := l.peek(); sign == '+' || sign == '-' { 423 // Complex: 1+2i. No spaces, must end in 'i'. 424 if !l.scanNumber() || l.input[l.pos-1] != 'i' { 425 return l.errorf("bad number syntax: %q", l.input[l.start:l.pos]) 426 } 427 l.emit(itemComplex) 428 } else { 429 l.emit(itemNumber) 430 } 431 return lexInsideAction 432 } 433 434 func (l *lexer) scanNumber() bool { 435 // Optional leading sign. 436 l.accept("+-") 437 // Is it hex? 438 digits := "0123456789" 439 if l.accept("0") && l.accept("xX") { 440 digits = "0123456789abcdefABCDEF" 441 } 442 l.acceptRun(digits) 443 if l.accept(".") { 444 l.acceptRun(digits) 445 } 446 if l.accept("eE") { 447 l.accept("+-") 448 l.acceptRun("0123456789") 449 } 450 // Is it imaginary? 451 l.accept("i") 452 // Next thing mustn't be alphanumeric. 453 if isAlphaNumeric(l.peek()) { 454 l.next() 455 return false 456 } 457 return true 458 } 459 460 // lexQuote scans a quoted string. 461 func lexQuote(l *lexer) stateFn { 462 Loop: 463 for { 464 switch l.next() { 465 case '\\': 466 if r := l.next(); r != eof && r != '\n' { 467 break 468 } 469 fallthrough 470 case eof, '\n': 471 return l.errorf("unterminated quoted string") 472 case '"': 473 break Loop 474 } 475 } 476 l.emit(itemString) 477 return lexInsideAction 478 } 479 480 // lexRawQuote scans a raw quoted string. 481 func lexRawQuote(l *lexer) stateFn { 482 Loop: 483 for { 484 switch l.next() { 485 case eof, '\n': 486 return l.errorf("unterminated raw quoted string") 487 case '`': 488 break Loop 489 } 490 } 491 l.emit(itemRawString) 492 return lexInsideAction 493 } 494 495 // isSpace reports whether r is a space character. 496 func isSpace(r rune) bool { 497 switch r { 498 case ' ', '\t', '\n', '\r': 499 return true 500 } 501 return false 502 } 503 504 // isAlphaNumeric reports whether r is an alphabetic, digit, or underscore. 
func isAlphaNumeric(r rune) bool {
	// The underscore is the one rune admitted that is neither a Unicode
	// letter nor a Unicode digit.
	switch {
	case r == '_':
		return true
	case unicode.IsLetter(r):
		return true
	}
	return unicode.IsDigit(r)
}