Source file src/pkg/go/scanner/scanner.go
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package scanner implements a scanner for Go source text.
6 // It takes a []byte as source which can then be tokenized
7 // through repeated calls to the Scan method.
8 //
9 package scanner
10
11 import (
12 "bytes"
13 "fmt"
14 "go/token"
15 "path/filepath"
16 "strconv"
17 "unicode"
18 "unicode/utf8"
19 )
20
// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position usually points to the
// beginning of the offending token; errors detected at a specific character
// (for instance a NUL byte, an illegal UTF-8 encoding, or an illegal
// character inside an escape sequence) are reported at that character.
//
type ErrorHandler func(pos token.Position, msg string)
27
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
type Scanner struct {
	// immutable state (assigned by Init)
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; < 0 at end of input
	offset     int  // character offset (byte offset of ch in src)
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // current line offset (byte offset where the current line starts)
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
50
51 // Read the next Unicode char into s.ch.
52 // s.ch < 0 means end-of-file.
53 //
54 func (s *Scanner) next() {
55 if s.rdOffset < len(s.src) {
56 s.offset = s.rdOffset
57 if s.ch == '\n' {
58 s.lineOffset = s.offset
59 s.file.AddLine(s.offset)
60 }
61 r, w := rune(s.src[s.rdOffset]), 1
62 switch {
63 case r == 0:
64 s.error(s.offset, "illegal character NUL")
65 case r >= 0x80:
66 // not ASCII
67 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
68 if r == utf8.RuneError && w == 1 {
69 s.error(s.offset, "illegal UTF-8 encoding")
70 }
71 }
72 s.rdOffset += w
73 s.ch = r
74 } else {
75 s.offset = len(s.src)
76 if s.ch == '\n' {
77 s.lineOffset = s.offset
78 s.file.AddLine(s.offset)
79 }
80 s.ch = -1 // eof
81 }
82 }
83
// A Mode value is a set of flags (or 0).
// The flags control scanner behavior.
//
type Mode uint
88
// Mode flags; each constant occupies its own bit so that flags can be
// combined with bitwise OR.
const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)
93
94 // Init prepares the scanner s to tokenize the text src by setting the
95 // scanner at the beginning of src. The scanner uses the file set file
96 // for position information and it adds line information for each line.
97 // It is ok to re-use the same file when re-scanning the same file as
98 // line information which is already present is ignored. Init causes a
99 // panic if the file size does not match the src size.
100 //
101 // Calls to Scan will invoke the error handler err if they encounter a
102 // syntax error and err is not nil. Also, for each error encountered,
103 // the Scanner field ErrorCount is incremented by one. The mode parameter
104 // determines how comments are handled.
105 //
106 // Note that Init may call err if there is an error in the first character
107 // of the file.
108 //
109 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
110 // Explicitly initialize all fields since a scanner may be reused.
111 if file.Size() != len(src) {
112 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
113 }
114 s.file = file
115 s.dir, _ = filepath.Split(file.Name())
116 s.src = src
117 s.err = err
118 s.mode = mode
119
120 s.ch = ' '
121 s.offset = 0
122 s.rdOffset = 0
123 s.lineOffset = 0
124 s.insertSemi = false
125 s.ErrorCount = 0
126
127 s.next()
128 }
129
130 func (s *Scanner) error(offs int, msg string) {
131 if s.err != nil {
132 s.err(s.file.Position(s.file.Pos(offs)), msg)
133 }
134 s.ErrorCount++
135 }
136
// prefix is the leading text that marks a line comment as a
// "//line filename:line" directive; see interpretLineComment.
var prefix = []byte("//line ")
138
139 func (s *Scanner) interpretLineComment(text []byte) {
140 if bytes.HasPrefix(text, prefix) {
141 // get filename and line number, if any
142 if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
143 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
144 // valid //line filename:line comment;
145 filename := filepath.Clean(string(text[len(prefix):i]))
146 if !filepath.IsAbs(filename) {
147 // make filename relative to current directory
148 filename = filepath.Join(s.dir, filename)
149 }
150 // update scanner position
151 s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line
152 }
153 }
154 }
155 }
156
157 func (s *Scanner) scanComment() string {
158 // initial '/' already consumed; s.ch == '/' || s.ch == '*'
159 offs := s.offset - 1 // position of initial '/'
160
161 if s.ch == '/' {
162 //-style comment
163 s.next()
164 for s.ch != '\n' && s.ch >= 0 {
165 s.next()
166 }
167 if offs == s.lineOffset {
168 // comment starts at the beginning of the current line
169 s.interpretLineComment(s.src[offs:s.offset])
170 }
171 goto exit
172 }
173
174 /*-style comment */
175 s.next()
176 for s.ch >= 0 {
177 ch := s.ch
178 s.next()
179 if ch == '*' && s.ch == '/' {
180 s.next()
181 goto exit
182 }
183 }
184
185 s.error(offs, "comment not terminated")
186
187 exit:
188 return string(s.src[offs:s.offset])
189 }
190
// findLineEnd looks ahead, starting at a comment whose initial '/' has
// already been consumed, and reports whether a newline or the end of
// input is reached before the next non-comment token. Scan calls it only
// while s.insertSemi is set, to decide whether a semicolon must be
// inserted in front of the comment.
//
// The character position (s.ch, s.offset, s.rdOffset) is restored on
// return, so the caller sees the scanner exactly where it was.
func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	// The argument s.offset - 1 is evaluated now, i.e. it is the offset of
	// the initial '/' at the time of the call.
	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		s.ch = '/'
		s.offset = offs
		s.rdOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1)

	// read ahead until a newline, EOF, or non-comment token is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			//-style comment always contains a newline
			return true
		}
		/*-style comment: look for newline */
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				// newline inside the comment; it is not consumed
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				// end of this comment
				s.next()
				break
			}
		}
		// With s.insertSemi set, skipWhitespace stops at a newline
		// instead of skipping it.
		s.skipWhitespace() // s.insertSemi is set
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment token
			return false
		}
		s.next() // consume '/'
	}

	// the '/' was not the start of another comment
	return false
}
234
// isLetter reports whether ch may appear in an identifier as a letter:
// an ASCII letter, an underscore, or a non-ASCII Unicode letter.
func isLetter(ch rune) bool {
	switch {
	case ch >= 0x80:
		return unicode.IsLetter(ch)
	case ch == '_':
		return true
	case 'a' <= ch && ch <= 'z':
		return true
	case 'A' <= ch && ch <= 'Z':
		return true
	}
	return false
}
238
// isDigit reports whether ch is an ASCII decimal digit or a non-ASCII
// Unicode decimal digit.
func isDigit(ch rune) bool {
	if ch >= 0x80 {
		return unicode.IsDigit(ch)
	}
	return '0' <= ch && ch <= '9'
}
242
243 func (s *Scanner) scanIdentifier() string {
244 offs := s.offset
245 for isLetter(s.ch) || isDigit(s.ch) {
246 s.next()
247 }
248 return string(s.src[offs:s.offset])
249 }
250
// digitVal returns the numeric value of the hexadecimal digit ch
// (0 through 15, either letter case accepted), or 16 if ch is not a
// hexadecimal digit.
func digitVal(ch rune) int {
	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch-'A') + 10
	}
	return 16 // larger than any legal digit val
}
262
263 func (s *Scanner) scanMantissa(base int) {
264 for digitVal(s.ch) < base {
265 s.next()
266 }
267 }
268
269 func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
270 // digitVal(s.ch) < 10
271 offs := s.offset
272 tok := token.INT
273
274 if seenDecimalPoint {
275 offs--
276 tok = token.FLOAT
277 s.scanMantissa(10)
278 goto exponent
279 }
280
281 if s.ch == '0' {
282 // int or float
283 offs := s.offset
284 s.next()
285 if s.ch == 'x' || s.ch == 'X' {
286 // hexadecimal int
287 s.next()
288 s.scanMantissa(16)
289 if s.offset-offs <= 2 {
290 // only scanned "0x" or "0X"
291 s.error(offs, "illegal hexadecimal number")
292 }
293 } else {
294 // octal int or float
295 seenDecimalDigit := false
296 s.scanMantissa(8)
297 if s.ch == '8' || s.ch == '9' {
298 // illegal octal int or float
299 seenDecimalDigit = true
300 s.scanMantissa(10)
301 }
302 if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
303 goto fraction
304 }
305 // octal int
306 if seenDecimalDigit {
307 s.error(offs, "illegal octal number")
308 }
309 }
310 goto exit
311 }
312
313 // decimal int or float
314 s.scanMantissa(10)
315
316 fraction:
317 if s.ch == '.' {
318 tok = token.FLOAT
319 s.next()
320 s.scanMantissa(10)
321 }
322
323 exponent:
324 if s.ch == 'e' || s.ch == 'E' {
325 tok = token.FLOAT
326 s.next()
327 if s.ch == '-' || s.ch == '+' {
328 s.next()
329 }
330 s.scanMantissa(10)
331 }
332
333 if s.ch == 'i' {
334 tok = token.IMAG
335 s.next()
336 }
337
338 exit:
339 return tok, string(s.src[offs:s.offset])
340 }
341
342 func (s *Scanner) scanEscape(quote rune) {
343 offs := s.offset
344
345 var i, base, max uint32
346 switch s.ch {
347 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
348 s.next()
349 return
350 case '0', '1', '2', '3', '4', '5', '6', '7':
351 i, base, max = 3, 8, 255
352 case 'x':
353 s.next()
354 i, base, max = 2, 16, 255
355 case 'u':
356 s.next()
357 i, base, max = 4, 16, unicode.MaxRune
358 case 'U':
359 s.next()
360 i, base, max = 8, 16, unicode.MaxRune
361 default:
362 s.next() // always make progress
363 s.error(offs, "unknown escape sequence")
364 return
365 }
366
367 var x uint32
368 for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
369 d := uint32(digitVal(s.ch))
370 if d >= base {
371 s.error(s.offset, "illegal character in escape sequence")
372 break
373 }
374 x = x*base + d
375 s.next()
376 }
377 // in case of an error, consume remaining chars
378 for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
379 s.next()
380 }
381 if x > max || 0xd800 <= x && x < 0xe000 {
382 s.error(offs, "escape sequence is invalid Unicode code point")
383 }
384 }
385
386 func (s *Scanner) scanChar() string {
387 // '\'' opening already consumed
388 offs := s.offset - 1
389
390 n := 0
391 for s.ch != '\'' {
392 ch := s.ch
393 n++
394 s.next()
395 if ch == '\n' || ch < 0 {
396 s.error(offs, "character literal not terminated")
397 n = 1
398 break
399 }
400 if ch == '\\' {
401 s.scanEscape('\'')
402 }
403 }
404
405 s.next()
406
407 if n != 1 {
408 s.error(offs, "illegal character literal")
409 }
410
411 return string(s.src[offs:s.offset])
412 }
413
414 func (s *Scanner) scanString() string {
415 // '"' opening already consumed
416 offs := s.offset - 1
417
418 for s.ch != '"' {
419 ch := s.ch
420 s.next()
421 if ch == '\n' || ch < 0 {
422 s.error(offs, "string not terminated")
423 break
424 }
425 if ch == '\\' {
426 s.scanEscape('"')
427 }
428 }
429
430 s.next()
431
432 return string(s.src[offs:s.offset])
433 }
434
// stripCR returns a copy of b with all carriage returns ('\r') removed.
// The argument is not modified.
func stripCR(b []byte) []byte {
	out := make([]byte, 0, len(b))
	for _, c := range b {
		if c == '\r' {
			continue
		}
		out = append(out, c)
	}
	return out
}
446
447 func (s *Scanner) scanRawString() string {
448 // '`' opening already consumed
449 offs := s.offset - 1
450
451 hasCR := false
452 for s.ch != '`' {
453 ch := s.ch
454 s.next()
455 if ch == '\r' {
456 hasCR = true
457 }
458 if ch < 0 {
459 s.error(offs, "string not terminated")
460 break
461 }
462 }
463
464 s.next()
465
466 lit := s.src[offs:s.offset]
467 if hasCR {
468 lit = stripCR(lit)
469 }
470
471 return string(lit)
472 }
473
474 func (s *Scanner) skipWhitespace() {
475 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
476 s.next()
477 }
478 }
479
480 // Helper functions for scanning multi-byte tokens such as >> += >>= .
481 // Different routines recognize different length tok_i based on matches
482 // of ch_i. If a token ends in '=', the result is tok1 or tok3
483 // respectively. Otherwise, the result is tok0 if there was no other
484 // matching character, or tok2 if the matching character was ch2.
485
486 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
487 if s.ch == '=' {
488 s.next()
489 return tok1
490 }
491 return tok0
492 }
493
494 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
495 if s.ch == '=' {
496 s.next()
497 return tok1
498 }
499 if s.ch == ch2 {
500 s.next()
501 return tok2
502 }
503 return tok0
504 }
505
506 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
507 if s.ch == '=' {
508 s.next()
509 return tok1
510 }
511 if s.ch == ch2 {
512 s.next()
513 if s.ch == '=' {
514 s.next()
515 return tok3
516 }
517 return tok2
518 }
519 return tok0
520 }
521
522 // Scan scans the next token and returns the token position, the token,
523 // and its literal string if applicable. The source end is indicated by
524 // token.EOF.
525 //
526 // If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
527 // token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
528 // has the corresponding value.
529 //
530 // If the returned token is token.SEMICOLON, the corresponding
531 // literal string is ";" if the semicolon was present in the source,
532 // and "\n" if the semicolon was inserted because of a newline or
533 // at EOF.
534 //
535 // If the returned token is token.ILLEGAL, the literal string is the
536 // offending character.
537 //
538 // In all other cases, Scan returns an empty literal string.
539 //
540 // For more tolerant parsing, Scan will return a valid token if
541 // possible even if a syntax error was encountered. Thus, even
542 // if the resulting token sequence contains no illegal tokens,
543 // a client may not assume that no error occurred. Instead it
544 // must check the scanner's ErrorCount or the number of calls
545 // of the error handler, if there was one installed.
546 //
547 // Scan adds line information to the file added to the file
548 // set with Init. Token positions are relative to that file
549 // and thus relative to the file set.
550 //
551 func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
552 scanAgain:
553 s.skipWhitespace()
554
555 // current token start
556 pos = s.file.Pos(s.offset)
557
558 // determine token value
559 insertSemi := false
560 switch ch := s.ch; {
561 case isLetter(ch):
562 lit = s.scanIdentifier()
563 tok = token.Lookup(lit)
564 switch tok {
565 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
566 insertSemi = true
567 }
568 case digitVal(ch) < 10:
569 insertSemi = true
570 tok, lit = s.scanNumber(false)
571 default:
572 s.next() // always make progress
573 switch ch {
574 case -1:
575 if s.insertSemi {
576 s.insertSemi = false // EOF consumed
577 return pos, token.SEMICOLON, "\n"
578 }
579 tok = token.EOF
580 case '\n':
581 // we only reach here if s.insertSemi was
582 // set in the first place and exited early
583 // from s.skipWhitespace()
584 s.insertSemi = false // newline consumed
585 return pos, token.SEMICOLON, "\n"
586 case '"':
587 insertSemi = true
588 tok = token.STRING
589 lit = s.scanString()
590 case '\'':
591 insertSemi = true
592 tok = token.CHAR
593 lit = s.scanChar()
594 case '`':
595 insertSemi = true
596 tok = token.STRING
597 lit = s.scanRawString()
598 case ':':
599 tok = s.switch2(token.COLON, token.DEFINE)
600 case '.':
601 if digitVal(s.ch) < 10 {
602 insertSemi = true
603 tok, lit = s.scanNumber(true)
604 } else if s.ch == '.' {
605 s.next()
606 if s.ch == '.' {
607 s.next()
608 tok = token.ELLIPSIS
609 }
610 } else {
611 tok = token.PERIOD
612 }
613 case ',':
614 tok = token.COMMA
615 case ';':
616 tok = token.SEMICOLON
617 lit = ";"
618 case '(':
619 tok = token.LPAREN
620 case ')':
621 insertSemi = true
622 tok = token.RPAREN
623 case '[':
624 tok = token.LBRACK
625 case ']':
626 insertSemi = true
627 tok = token.RBRACK
628 case '{':
629 tok = token.LBRACE
630 case '}':
631 insertSemi = true
632 tok = token.RBRACE
633 case '+':
634 tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
635 if tok == token.INC {
636 insertSemi = true
637 }
638 case '-':
639 tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
640 if tok == token.DEC {
641 insertSemi = true
642 }
643 case '*':
644 tok = s.switch2(token.MUL, token.MUL_ASSIGN)
645 case '/':
646 if s.ch == '/' || s.ch == '*' {
647 // comment
648 if s.insertSemi && s.findLineEnd() {
649 // reset position to the beginning of the comment
650 s.ch = '/'
651 s.offset = s.file.Offset(pos)
652 s.rdOffset = s.offset + 1
653 s.insertSemi = false // newline consumed
654 return pos, token.SEMICOLON, "\n"
655 }
656 lit = s.scanComment()
657 if s.mode&ScanComments == 0 {
658 // skip comment
659 s.insertSemi = false // newline consumed
660 goto scanAgain
661 }
662 tok = token.COMMENT
663 } else {
664 tok = s.switch2(token.QUO, token.QUO_ASSIGN)
665 }
666 case '%':
667 tok = s.switch2(token.REM, token.REM_ASSIGN)
668 case '^':
669 tok = s.switch2(token.XOR, token.XOR_ASSIGN)
670 case '<':
671 if s.ch == '-' {
672 s.next()
673 tok = token.ARROW
674 } else {
675 tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
676 }
677 case '>':
678 tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
679 case '=':
680 tok = s.switch2(token.ASSIGN, token.EQL)
681 case '!':
682 tok = s.switch2(token.NOT, token.NEQ)
683 case '&':
684 if s.ch == '^' {
685 s.next()
686 tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
687 } else {
688 tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
689 }
690 case '|':
691 tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
692 default:
693 s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
694 insertSemi = s.insertSemi // preserve insertSemi info
695 tok = token.ILLEGAL
696 lit = string(ch)
697 }
698 }
699 if s.mode&dontInsertSemis == 0 {
700 s.insertSemi = insertSemi
701 }
702
703 return
704 }