Source file src/pkg/text/scanner/scanner.go
1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package scanner provides a scanner and tokenizer for UTF-8-encoded text. 6 // It takes an io.Reader providing the source, which then can be tokenized 7 // through repeated calls to the Scan function. For compatibility with 8 // existing tools, the NUL character is not allowed. 9 // 10 // By default, a Scanner skips white space and Go comments and recognizes all 11 // literals as defined by the Go language specification. It may be 12 // customized to recognize only a subset of those literals and to recognize 13 // different white space characters. 14 // 15 // Basic usage pattern: 16 // 17 // var s scanner.Scanner 18 // s.Init(src) 19 // tok := s.Scan() 20 // for tok != scanner.EOF { 21 // // do something with tok 22 // tok = s.Scan() 23 // } 24 // 25 package scanner 26 27 import ( 28 "bytes" 29 "fmt" 30 "io" 31 "os" 32 "unicode" 33 "unicode/utf8" 34 ) 35 36 // TODO(gri): Consider changing this to use the new (token) Position package. 37 38 // A source position is represented by a Position value. 39 // A position is valid if Line > 0. 40 type Position struct { 41 Filename string // filename, if any 42 Offset int // byte offset, starting at 0 43 Line int // line number, starting at 1 44 Column int // column number, starting at 1 (character count per line) 45 } 46 47 // IsValid returns true if the position is valid. 48 func (pos *Position) IsValid() bool { return pos.Line > 0 } 49 50 func (pos Position) String() string { 51 s := pos.Filename 52 if pos.IsValid() { 53 if s != "" { 54 s += ":" 55 } 56 s += fmt.Sprintf("%d:%d", pos.Line, pos.Column) 57 } 58 if s == "" { 59 s = "???" 60 } 61 return s 62 } 63 64 // Predefined mode bits to control recognition of tokens. 
// For instance,
// to configure a Scanner such that it only recognizes (Go) identifiers,
// integers, and skips comments, set the Scanner's Mode field to:
//
//	ScanIdents | ScanInts | SkipComments
//
const (
	ScanIdents     = 1 << -Ident
	ScanInts       = 1 << -Int
	ScanFloats     = 1 << -Float // includes Ints
	ScanChars      = 1 << -Char
	ScanStrings    = 1 << -String
	ScanRawStrings = 1 << -RawString
	ScanComments   = 1 << -Comment
	SkipComments   = 1 << -skipComment // if set with ScanComments, comments become white space
	GoTokens       = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
)

// The result of Scan is one of the following tokens or a Unicode character.
// Token values are negative so they cannot collide with a character value;
// the mode bit that enables token t is 1 << -t.
const (
	EOF = -(iota + 1)
	Ident
	Int
	Float
	Char
	String
	RawString
	Comment
	skipComment
)

// tokenString maps each exported token value to its printable name.
var tokenString = map[rune]string{
	EOF:       "EOF",
	Ident:     "Ident",
	Int:       "Int",
	Float:     "Float",
	Char:      "Char",
	String:    "String",
	RawString: "RawString",
	Comment:   "Comment",
}

// TokenString returns a printable string for a token or Unicode character.
// Known tokens yield their name; anything else is rendered as a quoted
// one-character string.
func TokenString(tok rune) string {
	name, known := tokenString[tok]
	if !known {
		name = fmt.Sprintf("%q", string(tok))
	}
	return name
}

// GoWhitespace is the default value for the Scanner's Whitespace field.
// Its value selects Go's white space characters.
const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '

const bufLen = 1024 // at least utf8.UTFMax

// A Scanner implements reading of Unicode characters and tokens from an io.Reader.
121 type Scanner struct { 122 // Input 123 src io.Reader 124 125 // Source buffer 126 srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next() 127 srcPos int // reading position (srcBuf index) 128 srcEnd int // source end (srcBuf index) 129 130 // Source position 131 srcBufOffset int // byte offset of srcBuf[0] in source 132 line int // line count 133 column int // character count 134 lastLineLen int // length of last line in characters (for correct column reporting) 135 lastCharLen int // length of last character in bytes 136 137 // Token text buffer 138 // Typically, token text is stored completely in srcBuf, but in general 139 // the token text's head may be buffered in tokBuf while the token text's 140 // tail is stored in srcBuf. 141 tokBuf bytes.Buffer // token text head that is not in srcBuf anymore 142 tokPos int // token text tail position (srcBuf index); valid if >= 0 143 tokEnd int // token text tail end (srcBuf index) 144 145 // One character look-ahead 146 ch rune // character before current srcPos 147 148 // Error is called for each error encountered. If no Error 149 // function is set, the error is reported to os.Stderr. 150 Error func(s *Scanner, msg string) 151 152 // ErrorCount is incremented by one for each error encountered. 153 ErrorCount int 154 155 // The Mode field controls which tokens are recognized. For instance, 156 // to recognize Ints, set the ScanInts bit in Mode. The field may be 157 // changed at any time. 158 Mode uint 159 160 // The Whitespace field controls which characters are recognized 161 // as white space. To recognize a character ch <= ' ' as white space, 162 // set the ch'th bit in Whitespace (the Scanner's behavior is undefined 163 // for values ch > ' '). The field may be changed at any time. 164 Whitespace uint64 165 166 // Start position of most recently scanned token; set by Scan. 167 // Calling Init or Next invalidates the position (Line == 0). 
168 // The Filename field is always left untouched by the Scanner. 169 // If an error is reported (via Error) and Position is invalid, 170 // the scanner is not inside a token. Call Pos to obtain an error 171 // position in that case. 172 Position 173 } 174 175 // Init initializes a Scanner with a new source and returns s. 176 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens, 177 // and Whitespace is set to GoWhitespace. 178 func (s *Scanner) Init(src io.Reader) *Scanner { 179 s.src = src 180 181 // initialize source buffer 182 // (the first call to next() will fill it by calling src.Read) 183 s.srcBuf[0] = utf8.RuneSelf // sentinel 184 s.srcPos = 0 185 s.srcEnd = 0 186 187 // initialize source position 188 s.srcBufOffset = 0 189 s.line = 1 190 s.column = 0 191 s.lastLineLen = 0 192 s.lastCharLen = 0 193 194 // initialize token text buffer 195 // (required for first call to next()). 196 s.tokPos = -1 197 198 // initialize one character look-ahead 199 s.ch = -1 // no char read yet 200 201 // initialize public fields 202 s.Error = nil 203 s.ErrorCount = 0 204 s.Mode = GoTokens 205 s.Whitespace = GoWhitespace 206 s.Line = 0 // invalidate token position 207 208 return s 209 } 210 211 // TODO(gri): The code for next() and the internal scanner state could benefit 212 // from a rethink. While next() is optimized for the common ASCII 213 // case, the "corrections" needed for proper position tracking undo 214 // some of the attempts for fast-path optimization. 215 216 // next reads and returns the next Unicode character. It is designed such 217 // that only a minimal amount of work needs to be done in the common ASCII 218 // case (one test to check for both ASCII and end-of-buffer, and one test 219 // to check for newlines). 
func (s *Scanner) next() rune {
	// Optimistically treat the byte at srcPos as a one-byte character.
	// The sentinel utf8.RuneSelf stored at srcBuf[srcEnd] makes the single
	// comparison below catch both non-ASCII input and the end of the
	// buffered data.
	ch, width := rune(s.srcBuf[s.srcPos]), 1

	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				s.tokPos = 0
				// s.tokEnd is set by Scan()
			}
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			// (an io.Reader must return io.EOF when it reaches
			// the end of what it is reading - simply returning
			// n == 0 will make this loop retry forever; but the
			// error is in the reader implementation in that case)
			i := s.srcEnd - s.srcPos // number of unread bytes carried over
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcPos = 0
			s.srcEnd = i + n
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				if s.srcEnd == 0 {
					// nothing buffered at all: report EOF
					// (this path is taken for any read error
					// that leaves the buffer empty, and such
					// an error is not passed to s.error)
					if s.lastCharLen > 0 {
						// previous character was not EOF
						s.column++
					}
					s.lastCharLen = 0
					return EOF
				}
				if err != io.EOF {
					s.error(err.Error())
				}
				// If err == EOF, we won't be getting more
				// bytes; break to avoid infinite loop. If
				// err is something else, we don't know if
				// we can get more bytes; thus also break.
				break
			}
		}
		// at least one byte
		ch = rune(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			if ch == utf8.RuneError && width == 1 {
				// advance for correct error position
				// (the bookkeeping is done here, before s.error is
				// called, and utf8.RuneError is returned to the caller)
				s.srcPos += width
				s.lastCharLen = width
				s.column++
				s.error("illegal UTF-8 encoding")
				return ch
			}
		}
	}

	// advance
	s.srcPos += width
	s.lastCharLen = width
	s.column++

	// special situations
	switch ch {
	case 0:
		// for compatibility with other tools
		// (the NUL is reported but still returned to the caller)
		s.error("illegal character NUL")
	case '\n':
		// start a new line; remember the length of the finished one
		// so that positions right after the newline can be reported
		s.line++
		s.lastLineLen = s.column
		s.column = 0
	}

	return ch
}

// Next reads and returns the next Unicode character.
// It returns EOF at the end of the source. It reports
// a read error by calling s.Error, if not nil; otherwise
// it prints an error message to os.Stderr. Next does not
// update the Scanner's Position field; use Pos() to
// get the current position.
func (s *Scanner) Next() rune {
	s.tokPos = -1 // don't collect token text
	s.Line = 0    // invalidate token position
	// The character to return is the current look-ahead; the look-ahead
	// is then refilled with the character that follows it.
	ch := s.Peek()
	s.ch = s.next()
	return ch
}

// Peek returns the next Unicode character in the source without advancing
// the scanner. It returns EOF if the scanner's position is at the last
// character of the source.
317 func (s *Scanner) Peek() rune { 318 if s.ch < 0 { 319 s.ch = s.next() 320 } 321 return s.ch 322 } 323 324 func (s *Scanner) error(msg string) { 325 s.ErrorCount++ 326 if s.Error != nil { 327 s.Error(s, msg) 328 return 329 } 330 pos := s.Position 331 if !pos.IsValid() { 332 pos = s.Pos() 333 } 334 fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg) 335 } 336 337 func (s *Scanner) scanIdentifier() rune { 338 ch := s.next() // read character after first '_' or letter 339 for ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) { 340 ch = s.next() 341 } 342 return ch 343 } 344 345 func digitVal(ch rune) int { 346 switch { 347 case '0' <= ch && ch <= '9': 348 return int(ch - '0') 349 case 'a' <= ch && ch <= 'f': 350 return int(ch - 'a' + 10) 351 case 'A' <= ch && ch <= 'F': 352 return int(ch - 'A' + 10) 353 } 354 return 16 // larger than any legal digit val 355 } 356 357 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' } 358 359 func (s *Scanner) scanMantissa(ch rune) rune { 360 for isDecimal(ch) { 361 ch = s.next() 362 } 363 return ch 364 } 365 366 func (s *Scanner) scanFraction(ch rune) rune { 367 if ch == '.' { 368 ch = s.scanMantissa(s.next()) 369 } 370 return ch 371 } 372 373 func (s *Scanner) scanExponent(ch rune) rune { 374 if ch == 'e' || ch == 'E' { 375 ch = s.next() 376 if ch == '-' || ch == '+' { 377 ch = s.next() 378 } 379 ch = s.scanMantissa(ch) 380 } 381 return ch 382 } 383 384 func (s *Scanner) scanNumber(ch rune) (rune, rune) { 385 // isDecimal(ch) 386 if ch == '0' { 387 // int or float 388 ch = s.next() 389 if ch == 'x' || ch == 'X' { 390 // hexadecimal int 391 ch = s.next() 392 for digitVal(ch) < 16 { 393 ch = s.next() 394 } 395 } else { 396 // octal int or float 397 seenDecimalDigit := false 398 for isDecimal(ch) { 399 if ch > '7' { 400 seenDecimalDigit = true 401 } 402 ch = s.next() 403 } 404 if s.Mode&ScanFloats != 0 && (ch == '.' 
|| ch == 'e' || ch == 'E') { 405 // float 406 ch = s.scanFraction(ch) 407 ch = s.scanExponent(ch) 408 return Float, ch 409 } 410 // octal int 411 if seenDecimalDigit { 412 s.error("illegal octal number") 413 } 414 } 415 return Int, ch 416 } 417 // decimal int or float 418 ch = s.scanMantissa(ch) 419 if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') { 420 // float 421 ch = s.scanFraction(ch) 422 ch = s.scanExponent(ch) 423 return Float, ch 424 } 425 return Int, ch 426 } 427 428 func (s *Scanner) scanDigits(ch rune, base, n int) rune { 429 for n > 0 && digitVal(ch) < base { 430 ch = s.next() 431 n-- 432 } 433 if n > 0 { 434 s.error("illegal char escape") 435 } 436 return ch 437 } 438 439 func (s *Scanner) scanEscape(quote rune) rune { 440 ch := s.next() // read character after '/' 441 switch ch { 442 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote: 443 // nothing to do 444 ch = s.next() 445 case '0', '1', '2', '3', '4', '5', '6', '7': 446 ch = s.scanDigits(ch, 8, 3) 447 case 'x': 448 ch = s.scanDigits(s.next(), 16, 2) 449 case 'u': 450 ch = s.scanDigits(s.next(), 16, 4) 451 case 'U': 452 ch = s.scanDigits(s.next(), 16, 8) 453 default: 454 s.error("illegal char escape") 455 } 456 return ch 457 } 458 459 func (s *Scanner) scanString(quote rune) (n int) { 460 ch := s.next() // read character after quote 461 for ch != quote { 462 if ch == '\n' || ch < 0 { 463 s.error("literal not terminated") 464 return 465 } 466 if ch == '\\' { 467 ch = s.scanEscape(quote) 468 } else { 469 ch = s.next() 470 } 471 n++ 472 } 473 return 474 } 475 476 func (s *Scanner) scanRawString() { 477 ch := s.next() // read character after '`' 478 for ch != '`' { 479 if ch < 0 { 480 s.error("literal not terminated") 481 return 482 } 483 ch = s.next() 484 } 485 } 486 487 func (s *Scanner) scanChar() { 488 if s.scanString('\'') != 1 { 489 s.error("illegal char literal") 490 } 491 } 492 493 func (s *Scanner) scanComment(ch rune) rune { 494 // ch == '/' || ch == '*' 495 if ch == '/' { 
496 // line comment 497 ch = s.next() // read character after "//" 498 for ch != '\n' && ch >= 0 { 499 ch = s.next() 500 } 501 return ch 502 } 503 504 // general comment 505 ch = s.next() // read character after "/*" 506 for { 507 if ch < 0 { 508 s.error("comment not terminated") 509 break 510 } 511 ch0 := ch 512 ch = s.next() 513 if ch0 == '*' && ch == '/' { 514 ch = s.next() 515 break 516 } 517 } 518 return ch 519 } 520 521 // Scan reads the next token or Unicode character from source and returns it. 522 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set. 523 // It returns EOF at the end of the source. It reports scanner errors (read and 524 // token errors) by calling s.Error, if not nil; otherwise it prints an error 525 // message to os.Stderr. 526 func (s *Scanner) Scan() rune { 527 ch := s.Peek() 528 529 // reset token text position 530 s.tokPos = -1 531 s.Line = 0 532 533 redo: 534 // skip white space 535 for s.Whitespace&(1<<uint(ch)) != 0 { 536 ch = s.next() 537 } 538 539 // start collecting token text 540 s.tokBuf.Reset() 541 s.tokPos = s.srcPos - s.lastCharLen 542 543 // set token position 544 // (this is a slightly optimized version of the code in Pos()) 545 s.Offset = s.srcBufOffset + s.tokPos 546 if s.column > 0 { 547 // common case: last character was not a '\n' 548 s.Line = s.line 549 s.Column = s.column 550 } else { 551 // last character was a '\n' 552 // (we cannot be at the beginning of the source 553 // since we have called next() at least once) 554 s.Line = s.line - 1 555 s.Column = s.lastLineLen 556 } 557 558 // determine token value 559 tok := ch 560 switch { 561 case unicode.IsLetter(ch) || ch == '_': 562 if s.Mode&ScanIdents != 0 { 563 tok = Ident 564 ch = s.scanIdentifier() 565 } else { 566 ch = s.next() 567 } 568 case isDecimal(ch): 569 if s.Mode&(ScanInts|ScanFloats) != 0 { 570 tok, ch = s.scanNumber(ch) 571 } else { 572 ch = s.next() 573 } 574 default: 575 switch ch { 576 case '"': 577 if s.Mode&ScanStrings 
!= 0 { 578 s.scanString('"') 579 tok = String 580 } 581 ch = s.next() 582 case '\'': 583 if s.Mode&ScanChars != 0 { 584 s.scanChar() 585 tok = Char 586 } 587 ch = s.next() 588 case '.': 589 ch = s.next() 590 if isDecimal(ch) && s.Mode&ScanFloats != 0 { 591 tok = Float 592 ch = s.scanMantissa(ch) 593 ch = s.scanExponent(ch) 594 } 595 case '/': 596 ch = s.next() 597 if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 { 598 if s.Mode&SkipComments != 0 { 599 s.tokPos = -1 // don't collect token text 600 ch = s.scanComment(ch) 601 goto redo 602 } 603 ch = s.scanComment(ch) 604 tok = Comment 605 } 606 case '`': 607 if s.Mode&ScanRawStrings != 0 { 608 s.scanRawString() 609 tok = String 610 } 611 ch = s.next() 612 default: 613 ch = s.next() 614 } 615 } 616 617 // end of token text 618 s.tokEnd = s.srcPos - s.lastCharLen 619 620 s.ch = ch 621 return tok 622 } 623 624 // Pos returns the position of the character immediately after 625 // the character or token returned by the last call to Next or Scan. 626 func (s *Scanner) Pos() (pos Position) { 627 pos.Filename = s.Filename 628 pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen 629 switch { 630 case s.column > 0: 631 // common case: last character was not a '\n' 632 pos.Line = s.line 633 pos.Column = s.column 634 case s.lastLineLen > 0: 635 // last character was a '\n' 636 pos.Line = s.line - 1 637 pos.Column = s.lastLineLen 638 default: 639 // at the beginning of the source 640 pos.Line = 1 641 pos.Column = 1 642 } 643 return 644 } 645 646 // TokenText returns the string corresponding to the most recently scanned token. 647 // Valid after calling Scan(). 
648 func (s *Scanner) TokenText() string { 649 if s.tokPos < 0 { 650 // no token text 651 return "" 652 } 653 654 if s.tokEnd < 0 { 655 // if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0) 656 s.tokEnd = s.tokPos 657 } 658 659 if s.tokBuf.Len() == 0 { 660 // common case: the entire token text is still in srcBuf 661 return string(s.srcBuf[s.tokPos:s.tokEnd]) 662 } 663 664 // part of the token text was saved in tokBuf: save the rest in 665 // tokBuf as well and return its content 666 s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd]) 667 s.tokPos = s.tokEnd // ensure idempotency of TokenText() call 668 return s.tokBuf.String() 669 }