// Source file src/pkg/go/scanner/scanner.go
1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package scanner implements a scanner for Go source text. 6 // It takes a []byte as source which can then be tokenized 7 // through repeated calls to the Scan method. 8 // 9 package scanner 10 11 import ( 12 "bytes" 13 "fmt" 14 "go/token" 15 "path/filepath" 16 "strconv" 17 "unicode" 18 "unicode/utf8" 19 ) 20 21 // An ErrorHandler may be provided to Scanner.Init. If a syntax error is 22 // encountered and a handler was installed, the handler is called with a 23 // position and an error message. The position points to the beginning of 24 // the offending token. 25 // 26 type ErrorHandler func(pos token.Position, msg string) 27 28 // A Scanner holds the scanner's internal state while processing 29 // a given text. It can be allocated as part of another data 30 // structure but must be initialized via Init before use. 31 // 32 type Scanner struct { 33 // immutable state 34 file *token.File // source file handle 35 dir string // directory portion of file.Name() 36 src []byte // source 37 err ErrorHandler // error reporting; or nil 38 mode Mode // scanning mode 39 40 // scanning state 41 ch rune // current character 42 offset int // character offset 43 rdOffset int // reading offset (position after current character) 44 lineOffset int // current line offset 45 insertSemi bool // insert a semicolon before next newline 46 47 // public state - ok to modify 48 ErrorCount int // number of errors encountered 49 } 50 51 // Read the next Unicode char into s.ch. 52 // s.ch < 0 means end-of-file. 
53 // 54 func (s *Scanner) next() { 55 if s.rdOffset < len(s.src) { 56 s.offset = s.rdOffset 57 if s.ch == '\n' { 58 s.lineOffset = s.offset 59 s.file.AddLine(s.offset) 60 } 61 r, w := rune(s.src[s.rdOffset]), 1 62 switch { 63 case r == 0: 64 s.error(s.offset, "illegal character NUL") 65 case r >= 0x80: 66 // not ASCII 67 r, w = utf8.DecodeRune(s.src[s.rdOffset:]) 68 if r == utf8.RuneError && w == 1 { 69 s.error(s.offset, "illegal UTF-8 encoding") 70 } 71 } 72 s.rdOffset += w 73 s.ch = r 74 } else { 75 s.offset = len(s.src) 76 if s.ch == '\n' { 77 s.lineOffset = s.offset 78 s.file.AddLine(s.offset) 79 } 80 s.ch = -1 // eof 81 } 82 } 83 84 // A mode value is set of flags (or 0). 85 // They control scanner behavior. 86 // 87 type Mode uint 88 89 const ( 90 ScanComments Mode = 1 << iota // return comments as COMMENT tokens 91 dontInsertSemis // do not automatically insert semicolons - for testing only 92 ) 93 94 // Init prepares the scanner s to tokenize the text src by setting the 95 // scanner at the beginning of src. The scanner uses the file set file 96 // for position information and it adds line information for each line. 97 // It is ok to re-use the same file when re-scanning the same file as 98 // line information which is already present is ignored. Init causes a 99 // panic if the file size does not match the src size. 100 // 101 // Calls to Scan will invoke the error handler err if they encounter a 102 // syntax error and err is not nil. Also, for each error encountered, 103 // the Scanner field ErrorCount is incremented by one. The mode parameter 104 // determines how comments are handled. 105 // 106 // Note that Init may call err if there is an error in the first character 107 // of the file. 108 // 109 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) { 110 // Explicitly initialize all fields since a scanner may be reused. 
111 if file.Size() != len(src) { 112 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src))) 113 } 114 s.file = file 115 s.dir, _ = filepath.Split(file.Name()) 116 s.src = src 117 s.err = err 118 s.mode = mode 119 120 s.ch = ' ' 121 s.offset = 0 122 s.rdOffset = 0 123 s.lineOffset = 0 124 s.insertSemi = false 125 s.ErrorCount = 0 126 127 s.next() 128 } 129 130 func (s *Scanner) error(offs int, msg string) { 131 if s.err != nil { 132 s.err(s.file.Position(s.file.Pos(offs)), msg) 133 } 134 s.ErrorCount++ 135 } 136 137 var prefix = []byte("//line ") 138 139 func (s *Scanner) interpretLineComment(text []byte) { 140 if bytes.HasPrefix(text, prefix) { 141 // get filename and line number, if any 142 if i := bytes.LastIndex(text, []byte{':'}); i > 0 { 143 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 { 144 // valid //line filename:line comment; 145 filename := filepath.Clean(string(text[len(prefix):i])) 146 if !filepath.IsAbs(filename) { 147 // make filename relative to current directory 148 filename = filepath.Join(s.dir, filename) 149 } 150 // update scanner position 151 s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line 152 } 153 } 154 } 155 } 156 157 func (s *Scanner) scanComment() string { 158 // initial '/' already consumed; s.ch == '/' || s.ch == '*' 159 offs := s.offset - 1 // position of initial '/' 160 161 if s.ch == '/' { 162 //-style comment 163 s.next() 164 for s.ch != '\n' && s.ch >= 0 { 165 s.next() 166 } 167 if offs == s.lineOffset { 168 // comment starts at the beginning of the current line 169 s.interpretLineComment(s.src[offs:s.offset]) 170 } 171 goto exit 172 } 173 174 /*-style comment */ 175 s.next() 176 for s.ch >= 0 { 177 ch := s.ch 178 s.next() 179 if ch == '*' && s.ch == '/' { 180 s.next() 181 goto exit 182 } 183 } 184 185 s.error(offs, "comment not terminated") 186 187 exit: 188 return string(s.src[offs:s.offset]) 189 } 190 191 
func (s *Scanner) findLineEnd() bool { 192 // initial '/' already consumed 193 194 defer func(offs int) { 195 // reset scanner state to where it was upon calling findLineEnd 196 s.ch = '/' 197 s.offset = offs 198 s.rdOffset = offs + 1 199 s.next() // consume initial '/' again 200 }(s.offset - 1) 201 202 // read ahead until a newline, EOF, or non-comment token is found 203 for s.ch == '/' || s.ch == '*' { 204 if s.ch == '/' { 205 //-style comment always contains a newline 206 return true 207 } 208 /*-style comment: look for newline */ 209 s.next() 210 for s.ch >= 0 { 211 ch := s.ch 212 if ch == '\n' { 213 return true 214 } 215 s.next() 216 if ch == '*' && s.ch == '/' { 217 s.next() 218 break 219 } 220 } 221 s.skipWhitespace() // s.insertSemi is set 222 if s.ch < 0 || s.ch == '\n' { 223 return true 224 } 225 if s.ch != '/' { 226 // non-comment token 227 return false 228 } 229 s.next() // consume '/' 230 } 231 232 return false 233 } 234 235 func isLetter(ch rune) bool { 236 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch) 237 } 238 239 func isDigit(ch rune) bool { 240 return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch) 241 } 242 243 func (s *Scanner) scanIdentifier() string { 244 offs := s.offset 245 for isLetter(s.ch) || isDigit(s.ch) { 246 s.next() 247 } 248 return string(s.src[offs:s.offset]) 249 } 250 251 func digitVal(ch rune) int { 252 switch { 253 case '0' <= ch && ch <= '9': 254 return int(ch - '0') 255 case 'a' <= ch && ch <= 'f': 256 return int(ch - 'a' + 10) 257 case 'A' <= ch && ch <= 'F': 258 return int(ch - 'A' + 10) 259 } 260 return 16 // larger than any legal digit val 261 } 262 263 func (s *Scanner) scanMantissa(base int) { 264 for digitVal(s.ch) < base { 265 s.next() 266 } 267 } 268 269 func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) { 270 // digitVal(s.ch) < 10 271 offs := s.offset 272 tok := token.INT 273 274 if seenDecimalPoint { 275 offs-- 276 tok = 
token.FLOAT 277 s.scanMantissa(10) 278 goto exponent 279 } 280 281 if s.ch == '0' { 282 // int or float 283 offs := s.offset 284 s.next() 285 if s.ch == 'x' || s.ch == 'X' { 286 // hexadecimal int 287 s.next() 288 s.scanMantissa(16) 289 if s.offset-offs <= 2 { 290 // only scanned "0x" or "0X" 291 s.error(offs, "illegal hexadecimal number") 292 } 293 } else { 294 // octal int or float 295 seenDecimalDigit := false 296 s.scanMantissa(8) 297 if s.ch == '8' || s.ch == '9' { 298 // illegal octal int or float 299 seenDecimalDigit = true 300 s.scanMantissa(10) 301 } 302 if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' { 303 goto fraction 304 } 305 // octal int 306 if seenDecimalDigit { 307 s.error(offs, "illegal octal number") 308 } 309 } 310 goto exit 311 } 312 313 // decimal int or float 314 s.scanMantissa(10) 315 316 fraction: 317 if s.ch == '.' { 318 tok = token.FLOAT 319 s.next() 320 s.scanMantissa(10) 321 } 322 323 exponent: 324 if s.ch == 'e' || s.ch == 'E' { 325 tok = token.FLOAT 326 s.next() 327 if s.ch == '-' || s.ch == '+' { 328 s.next() 329 } 330 s.scanMantissa(10) 331 } 332 333 if s.ch == 'i' { 334 tok = token.IMAG 335 s.next() 336 } 337 338 exit: 339 return tok, string(s.src[offs:s.offset]) 340 } 341 342 func (s *Scanner) scanEscape(quote rune) { 343 offs := s.offset 344 345 var i, base, max uint32 346 switch s.ch { 347 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 348 s.next() 349 return 350 case '0', '1', '2', '3', '4', '5', '6', '7': 351 i, base, max = 3, 8, 255 352 case 'x': 353 s.next() 354 i, base, max = 2, 16, 255 355 case 'u': 356 s.next() 357 i, base, max = 4, 16, unicode.MaxRune 358 case 'U': 359 s.next() 360 i, base, max = 8, 16, unicode.MaxRune 361 default: 362 s.next() // always make progress 363 s.error(offs, "unknown escape sequence") 364 return 365 } 366 367 var x uint32 368 for ; i > 0 && s.ch != quote && s.ch >= 0; i-- { 369 d := uint32(digitVal(s.ch)) 370 if d >= base { 371 s.error(s.offset, "illegal character in escape 
sequence") 372 break 373 } 374 x = x*base + d 375 s.next() 376 } 377 // in case of an error, consume remaining chars 378 for ; i > 0 && s.ch != quote && s.ch >= 0; i-- { 379 s.next() 380 } 381 if x > max || 0xd800 <= x && x < 0xe000 { 382 s.error(offs, "escape sequence is invalid Unicode code point") 383 } 384 } 385 386 func (s *Scanner) scanChar() string { 387 // '\'' opening already consumed 388 offs := s.offset - 1 389 390 n := 0 391 for s.ch != '\'' { 392 ch := s.ch 393 n++ 394 s.next() 395 if ch == '\n' || ch < 0 { 396 s.error(offs, "character literal not terminated") 397 n = 1 398 break 399 } 400 if ch == '\\' { 401 s.scanEscape('\'') 402 } 403 } 404 405 s.next() 406 407 if n != 1 { 408 s.error(offs, "illegal character literal") 409 } 410 411 return string(s.src[offs:s.offset]) 412 } 413 414 func (s *Scanner) scanString() string { 415 // '"' opening already consumed 416 offs := s.offset - 1 417 418 for s.ch != '"' { 419 ch := s.ch 420 s.next() 421 if ch == '\n' || ch < 0 { 422 s.error(offs, "string not terminated") 423 break 424 } 425 if ch == '\\' { 426 s.scanEscape('"') 427 } 428 } 429 430 s.next() 431 432 return string(s.src[offs:s.offset]) 433 } 434 435 func stripCR(b []byte) []byte { 436 c := make([]byte, len(b)) 437 i := 0 438 for _, ch := range b { 439 if ch != '\r' { 440 c[i] = ch 441 i++ 442 } 443 } 444 return c[:i] 445 } 446 447 func (s *Scanner) scanRawString() string { 448 // '`' opening already consumed 449 offs := s.offset - 1 450 451 hasCR := false 452 for s.ch != '`' { 453 ch := s.ch 454 s.next() 455 if ch == '\r' { 456 hasCR = true 457 } 458 if ch < 0 { 459 s.error(offs, "string not terminated") 460 break 461 } 462 } 463 464 s.next() 465 466 lit := s.src[offs:s.offset] 467 if hasCR { 468 lit = stripCR(lit) 469 } 470 471 return string(lit) 472 } 473 474 func (s *Scanner) skipWhitespace() { 475 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' { 476 s.next() 477 } 478 } 479 480 // Helper functions for scanning 
multi-byte tokens such as >> += >>= . 481 // Different routines recognize different length tok_i based on matches 482 // of ch_i. If a token ends in '=', the result is tok1 or tok3 483 // respectively. Otherwise, the result is tok0 if there was no other 484 // matching character, or tok2 if the matching character was ch2. 485 486 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token { 487 if s.ch == '=' { 488 s.next() 489 return tok1 490 } 491 return tok0 492 } 493 494 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token { 495 if s.ch == '=' { 496 s.next() 497 return tok1 498 } 499 if s.ch == ch2 { 500 s.next() 501 return tok2 502 } 503 return tok0 504 } 505 506 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token { 507 if s.ch == '=' { 508 s.next() 509 return tok1 510 } 511 if s.ch == ch2 { 512 s.next() 513 if s.ch == '=' { 514 s.next() 515 return tok3 516 } 517 return tok2 518 } 519 return tok0 520 } 521 522 // Scan scans the next token and returns the token position, the token, 523 // and its literal string if applicable. The source end is indicated by 524 // token.EOF. 525 // 526 // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT, 527 // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string 528 // has the corresponding value. 529 // 530 // If the returned token is token.SEMICOLON, the corresponding 531 // literal string is ";" if the semicolon was present in the source, 532 // and "\n" if the semicolon was inserted because of a newline or 533 // at EOF. 534 // 535 // If the returned token is token.ILLEGAL, the literal string is the 536 // offending character. 537 // 538 // In all other cases, Scan returns an empty literal string. 539 // 540 // For more tolerant parsing, Scan will return a valid token if 541 // possible even if a syntax error was encountered. 
Thus, even 542 // if the resulting token sequence contains no illegal tokens, 543 // a client may not assume that no error occurred. Instead it 544 // must check the scanner's ErrorCount or the number of calls 545 // of the error handler, if there was one installed. 546 // 547 // Scan adds line information to the file added to the file 548 // set with Init. Token positions are relative to that file 549 // and thus relative to the file set. 550 // 551 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) { 552 scanAgain: 553 s.skipWhitespace() 554 555 // current token start 556 pos = s.file.Pos(s.offset) 557 558 // determine token value 559 insertSemi := false 560 switch ch := s.ch; { 561 case isLetter(ch): 562 lit = s.scanIdentifier() 563 tok = token.Lookup(lit) 564 switch tok { 565 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN: 566 insertSemi = true 567 } 568 case digitVal(ch) < 10: 569 insertSemi = true 570 tok, lit = s.scanNumber(false) 571 default: 572 s.next() // always make progress 573 switch ch { 574 case -1: 575 if s.insertSemi { 576 s.insertSemi = false // EOF consumed 577 return pos, token.SEMICOLON, "\n" 578 } 579 tok = token.EOF 580 case '\n': 581 // we only reach here if s.insertSemi was 582 // set in the first place and exited early 583 // from s.skipWhitespace() 584 s.insertSemi = false // newline consumed 585 return pos, token.SEMICOLON, "\n" 586 case '"': 587 insertSemi = true 588 tok = token.STRING 589 lit = s.scanString() 590 case '\'': 591 insertSemi = true 592 tok = token.CHAR 593 lit = s.scanChar() 594 case '`': 595 insertSemi = true 596 tok = token.STRING 597 lit = s.scanRawString() 598 case ':': 599 tok = s.switch2(token.COLON, token.DEFINE) 600 case '.': 601 if digitVal(s.ch) < 10 { 602 insertSemi = true 603 tok, lit = s.scanNumber(true) 604 } else if s.ch == '.' { 605 s.next() 606 if s.ch == '.' 
{ 607 s.next() 608 tok = token.ELLIPSIS 609 } 610 } else { 611 tok = token.PERIOD 612 } 613 case ',': 614 tok = token.COMMA 615 case ';': 616 tok = token.SEMICOLON 617 lit = ";" 618 case '(': 619 tok = token.LPAREN 620 case ')': 621 insertSemi = true 622 tok = token.RPAREN 623 case '[': 624 tok = token.LBRACK 625 case ']': 626 insertSemi = true 627 tok = token.RBRACK 628 case '{': 629 tok = token.LBRACE 630 case '}': 631 insertSemi = true 632 tok = token.RBRACE 633 case '+': 634 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC) 635 if tok == token.INC { 636 insertSemi = true 637 } 638 case '-': 639 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC) 640 if tok == token.DEC { 641 insertSemi = true 642 } 643 case '*': 644 tok = s.switch2(token.MUL, token.MUL_ASSIGN) 645 case '/': 646 if s.ch == '/' || s.ch == '*' { 647 // comment 648 if s.insertSemi && s.findLineEnd() { 649 // reset position to the beginning of the comment 650 s.ch = '/' 651 s.offset = s.file.Offset(pos) 652 s.rdOffset = s.offset + 1 653 s.insertSemi = false // newline consumed 654 return pos, token.SEMICOLON, "\n" 655 } 656 lit = s.scanComment() 657 if s.mode&ScanComments == 0 { 658 // skip comment 659 s.insertSemi = false // newline consumed 660 goto scanAgain 661 } 662 tok = token.COMMENT 663 } else { 664 tok = s.switch2(token.QUO, token.QUO_ASSIGN) 665 } 666 case '%': 667 tok = s.switch2(token.REM, token.REM_ASSIGN) 668 case '^': 669 tok = s.switch2(token.XOR, token.XOR_ASSIGN) 670 case '<': 671 if s.ch == '-' { 672 s.next() 673 tok = token.ARROW 674 } else { 675 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN) 676 } 677 case '>': 678 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN) 679 case '=': 680 tok = s.switch2(token.ASSIGN, token.EQL) 681 case '!': 682 tok = s.switch2(token.NOT, token.NEQ) 683 case '&': 684 if s.ch == '^' { 685 s.next() 686 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN) 687 } else { 688 tok = 
s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND) 689 } 690 case '|': 691 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR) 692 default: 693 s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch)) 694 insertSemi = s.insertSemi // preserve insertSemi info 695 tok = token.ILLEGAL 696 lit = string(ch) 697 } 698 } 699 if s.mode&dontInsertSemis == 0 { 700 s.insertSemi = insertSemi 701 } 702 703 return 704 }