src/pkg/go/scanner/scanner.go - The Go Programming Language

Golang

Source file src/pkg/go/scanner/scanner.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package scanner implements a scanner for Go source text.
     6	// It takes a []byte as source which can then be tokenized
     7	// through repeated calls to the Scan method.
     8	//
     9	package scanner
    10	
    11	import (
    12		"bytes"
    13		"fmt"
    14		"go/token"
    15		"path/filepath"
    16		"strconv"
    17		"unicode"
    18		"unicode/utf8"
    19	)
    20	
// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position usually points to the
// beginning of the offending token; errors concerning a single character
// (an illegal NUL byte, an invalid UTF-8 sequence, or an illegal character
// inside an escape sequence) are reported at that character instead.
//
type ErrorHandler func(pos token.Position, msg string)
    27	
// A Scanner holds the scanner's internal state while processing
// a given text.  It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
type Scanner struct {
	// immutable state (set once by Init)
	file *token.File  // source file handle; receives line information
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; < 0 means end-of-file
	offset     int  // byte offset of ch in src
	rdOffset   int  // reading offset (byte position after current character)
	lineOffset int  // byte offset of the start of the current line
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
    50	
    51	// Read the next Unicode char into s.ch.
    52	// s.ch < 0 means end-of-file.
    53	//
    54	func (s *Scanner) next() {
    55		if s.rdOffset < len(s.src) {
    56			s.offset = s.rdOffset
    57			if s.ch == '\n' {
    58				s.lineOffset = s.offset
    59				s.file.AddLine(s.offset)
    60			}
    61			r, w := rune(s.src[s.rdOffset]), 1
    62			switch {
    63			case r == 0:
    64				s.error(s.offset, "illegal character NUL")
    65			case r >= 0x80:
    66				// not ASCII
    67				r, w = utf8.DecodeRune(s.src[s.rdOffset:])
    68				if r == utf8.RuneError && w == 1 {
    69					s.error(s.offset, "illegal UTF-8 encoding")
    70				}
    71			}
    72			s.rdOffset += w
    73			s.ch = r
    74		} else {
    75			s.offset = len(s.src)
    76			if s.ch == '\n' {
    77				s.lineOffset = s.offset
    78				s.file.AddLine(s.offset)
    79			}
    80			s.ch = -1 // eof
    81		}
    82	}
    83	
// A mode value is a set of flags (or 0).
// They control scanner behavior.
//
type Mode uint

const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)
    93	
    94	// Init prepares the scanner s to tokenize the text src by setting the
    95	// scanner at the beginning of src. The scanner uses the file set file
    96	// for position information and it adds line information for each line.
    97	// It is ok to re-use the same file when re-scanning the same file as
    98	// line information which is already present is ignored. Init causes a
    99	// panic if the file size does not match the src size.
   100	//
   101	// Calls to Scan will invoke the error handler err if they encounter a
   102	// syntax error and err is not nil. Also, for each error encountered,
   103	// the Scanner field ErrorCount is incremented by one. The mode parameter
   104	// determines how comments are handled.
   105	//
   106	// Note that Init may call err if there is an error in the first character
   107	// of the file.
   108	//
   109	func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
   110		// Explicitly initialize all fields since a scanner may be reused.
   111		if file.Size() != len(src) {
   112			panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
   113		}
   114		s.file = file
   115		s.dir, _ = filepath.Split(file.Name())
   116		s.src = src
   117		s.err = err
   118		s.mode = mode
   119	
   120		s.ch = ' '
   121		s.offset = 0
   122		s.rdOffset = 0
   123		s.lineOffset = 0
   124		s.insertSemi = false
   125		s.ErrorCount = 0
   126	
   127		s.next()
   128	}
   129	
   130	func (s *Scanner) error(offs int, msg string) {
   131		if s.err != nil {
   132			s.err(s.file.Position(s.file.Pos(offs)), msg)
   133		}
   134		s.ErrorCount++
   135	}
   136	
   137	var prefix = []byte("//line ")
   138	
   139	func (s *Scanner) interpretLineComment(text []byte) {
   140		if bytes.HasPrefix(text, prefix) {
   141			// get filename and line number, if any
   142			if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
   143				if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
   144					// valid //line filename:line comment;
   145					filename := filepath.Clean(string(text[len(prefix):i]))
   146					if !filepath.IsAbs(filename) {
   147						// make filename relative to current directory
   148						filename = filepath.Join(s.dir, filename)
   149					}
   150					// update scanner position
   151					s.file.AddLineInfo(s.lineOffset+len(text)+1, filename, line) // +len(text)+1 since comment applies to next line
   152				}
   153			}
   154		}
   155	}
   156	
   157	func (s *Scanner) scanComment() string {
   158		// initial '/' already consumed; s.ch == '/' || s.ch == '*'
   159		offs := s.offset - 1 // position of initial '/'
   160	
   161		if s.ch == '/' {
   162			//-style comment
   163			s.next()
   164			for s.ch != '\n' && s.ch >= 0 {
   165				s.next()
   166			}
   167			if offs == s.lineOffset {
   168				// comment starts at the beginning of the current line
   169				s.interpretLineComment(s.src[offs:s.offset])
   170			}
   171			goto exit
   172		}
   173	
   174		/*-style comment */
   175		s.next()
   176		for s.ch >= 0 {
   177			ch := s.ch
   178			s.next()
   179			if ch == '*' && s.ch == '/' {
   180				s.next()
   181				goto exit
   182			}
   183		}
   184	
   185		s.error(offs, "comment not terminated")
   186	
   187	exit:
   188		return string(s.src[offs:s.offset])
   189	}
   190	
// findLineEnd looks ahead from the start of a comment and reports whether a
// newline or EOF is reached before the next non-comment token. Scan uses it,
// when a semicolon is pending, to decide whether that semicolon must be
// emitted before the comment.
//
// On entry the initial '/' of the comment has been consumed. On return the
// deferred function has put s.ch, s.offset and s.rdOffset back to where they
// were on entry; no other scanner field is restored.
func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		// (offs is the position of the initial '/', captured at defer time)
		s.ch = '/'
		s.offset = offs
		s.rdOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1)

	// read ahead until a newline, EOF, or non-comment token is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			//-style comment always contains a newline
			return true
		}
		/*-style comment: look for newline */
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				// end of this /*-style comment
				s.next()
				break
			}
		}
		// Scan only calls findLineEnd when s.insertSemi is set, so
		// skipWhitespace stops at a newline instead of skipping it.
		s.skipWhitespace() // s.insertSemi is set
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment token
			return false
		}
		s.next() // consume '/'
	}

	// a '/' that does not start a comment: non-comment token on this line
	return false
}
   234	
   235	func isLetter(ch rune) bool {
   236		return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
   237	}
   238	
   239	func isDigit(ch rune) bool {
   240		return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
   241	}
   242	
   243	func (s *Scanner) scanIdentifier() string {
   244		offs := s.offset
   245		for isLetter(s.ch) || isDigit(s.ch) {
   246			s.next()
   247		}
   248		return string(s.src[offs:s.offset])
   249	}
   250	
   251	func digitVal(ch rune) int {
   252		switch {
   253		case '0' <= ch && ch <= '9':
   254			return int(ch - '0')
   255		case 'a' <= ch && ch <= 'f':
   256			return int(ch - 'a' + 10)
   257		case 'A' <= ch && ch <= 'F':
   258			return int(ch - 'A' + 10)
   259		}
   260		return 16 // larger than any legal digit val
   261	}
   262	
   263	func (s *Scanner) scanMantissa(base int) {
   264		for digitVal(s.ch) < base {
   265			s.next()
   266		}
   267	}
   268	
   269	func (s *Scanner) scanNumber(seenDecimalPoint bool) (token.Token, string) {
   270		// digitVal(s.ch) < 10
   271		offs := s.offset
   272		tok := token.INT
   273	
   274		if seenDecimalPoint {
   275			offs--
   276			tok = token.FLOAT
   277			s.scanMantissa(10)
   278			goto exponent
   279		}
   280	
   281		if s.ch == '0' {
   282			// int or float
   283			offs := s.offset
   284			s.next()
   285			if s.ch == 'x' || s.ch == 'X' {
   286				// hexadecimal int
   287				s.next()
   288				s.scanMantissa(16)
   289				if s.offset-offs <= 2 {
   290					// only scanned "0x" or "0X"
   291					s.error(offs, "illegal hexadecimal number")
   292				}
   293			} else {
   294				// octal int or float
   295				seenDecimalDigit := false
   296				s.scanMantissa(8)
   297				if s.ch == '8' || s.ch == '9' {
   298					// illegal octal int or float
   299					seenDecimalDigit = true
   300					s.scanMantissa(10)
   301				}
   302				if s.ch == '.' || s.ch == 'e' || s.ch == 'E' || s.ch == 'i' {
   303					goto fraction
   304				}
   305				// octal int
   306				if seenDecimalDigit {
   307					s.error(offs, "illegal octal number")
   308				}
   309			}
   310			goto exit
   311		}
   312	
   313		// decimal int or float
   314		s.scanMantissa(10)
   315	
   316	fraction:
   317		if s.ch == '.' {
   318			tok = token.FLOAT
   319			s.next()
   320			s.scanMantissa(10)
   321		}
   322	
   323	exponent:
   324		if s.ch == 'e' || s.ch == 'E' {
   325			tok = token.FLOAT
   326			s.next()
   327			if s.ch == '-' || s.ch == '+' {
   328				s.next()
   329			}
   330			s.scanMantissa(10)
   331		}
   332	
   333		if s.ch == 'i' {
   334			tok = token.IMAG
   335			s.next()
   336		}
   337	
   338	exit:
   339		return tok, string(s.src[offs:s.offset])
   340	}
   341	
   342	func (s *Scanner) scanEscape(quote rune) {
   343		offs := s.offset
   344	
   345		var i, base, max uint32
   346		switch s.ch {
   347		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   348			s.next()
   349			return
   350		case '0', '1', '2', '3', '4', '5', '6', '7':
   351			i, base, max = 3, 8, 255
   352		case 'x':
   353			s.next()
   354			i, base, max = 2, 16, 255
   355		case 'u':
   356			s.next()
   357			i, base, max = 4, 16, unicode.MaxRune
   358		case 'U':
   359			s.next()
   360			i, base, max = 8, 16, unicode.MaxRune
   361		default:
   362			s.next() // always make progress
   363			s.error(offs, "unknown escape sequence")
   364			return
   365		}
   366	
   367		var x uint32
   368		for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
   369			d := uint32(digitVal(s.ch))
   370			if d >= base {
   371				s.error(s.offset, "illegal character in escape sequence")
   372				break
   373			}
   374			x = x*base + d
   375			s.next()
   376		}
   377		// in case of an error, consume remaining chars
   378		for ; i > 0 && s.ch != quote && s.ch >= 0; i-- {
   379			s.next()
   380		}
   381		if x > max || 0xd800 <= x && x < 0xe000 {
   382			s.error(offs, "escape sequence is invalid Unicode code point")
   383		}
   384	}
   385	
   386	func (s *Scanner) scanChar() string {
   387		// '\'' opening already consumed
   388		offs := s.offset - 1
   389	
   390		n := 0
   391		for s.ch != '\'' {
   392			ch := s.ch
   393			n++
   394			s.next()
   395			if ch == '\n' || ch < 0 {
   396				s.error(offs, "character literal not terminated")
   397				n = 1
   398				break
   399			}
   400			if ch == '\\' {
   401				s.scanEscape('\'')
   402			}
   403		}
   404	
   405		s.next()
   406	
   407		if n != 1 {
   408			s.error(offs, "illegal character literal")
   409		}
   410	
   411		return string(s.src[offs:s.offset])
   412	}
   413	
   414	func (s *Scanner) scanString() string {
   415		// '"' opening already consumed
   416		offs := s.offset - 1
   417	
   418		for s.ch != '"' {
   419			ch := s.ch
   420			s.next()
   421			if ch == '\n' || ch < 0 {
   422				s.error(offs, "string not terminated")
   423				break
   424			}
   425			if ch == '\\' {
   426				s.scanEscape('"')
   427			}
   428		}
   429	
   430		s.next()
   431	
   432		return string(s.src[offs:s.offset])
   433	}
   434	
   435	func stripCR(b []byte) []byte {
   436		c := make([]byte, len(b))
   437		i := 0
   438		for _, ch := range b {
   439			if ch != '\r' {
   440				c[i] = ch
   441				i++
   442			}
   443		}
   444		return c[:i]
   445	}
   446	
   447	func (s *Scanner) scanRawString() string {
   448		// '`' opening already consumed
   449		offs := s.offset - 1
   450	
   451		hasCR := false
   452		for s.ch != '`' {
   453			ch := s.ch
   454			s.next()
   455			if ch == '\r' {
   456				hasCR = true
   457			}
   458			if ch < 0 {
   459				s.error(offs, "string not terminated")
   460				break
   461			}
   462		}
   463	
   464		s.next()
   465	
   466		lit := s.src[offs:s.offset]
   467		if hasCR {
   468			lit = stripCR(lit)
   469		}
   470	
   471		return string(lit)
   472	}
   473	
   474	func (s *Scanner) skipWhitespace() {
   475		for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
   476			s.next()
   477		}
   478	}
   479	
   480	// Helper functions for scanning multi-byte tokens such as >> += >>= .
   481	// Different routines recognize different length tok_i based on matches
   482	// of ch_i. If a token ends in '=', the result is tok1 or tok3
   483	// respectively. Otherwise, the result is tok0 if there was no other
   484	// matching character, or tok2 if the matching character was ch2.
   485	
   486	func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
   487		if s.ch == '=' {
   488			s.next()
   489			return tok1
   490		}
   491		return tok0
   492	}
   493	
   494	func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
   495		if s.ch == '=' {
   496			s.next()
   497			return tok1
   498		}
   499		if s.ch == ch2 {
   500			s.next()
   501			return tok2
   502		}
   503		return tok0
   504	}
   505	
   506	func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
   507		if s.ch == '=' {
   508			s.next()
   509			return tok1
   510		}
   511		if s.ch == ch2 {
   512			s.next()
   513			if s.ch == '=' {
   514				s.next()
   515				return tok3
   516			}
   517			return tok2
   518		}
   519		return tok0
   520	}
   521	
   522	// Scan scans the next token and returns the token position, the token,
   523	// and its literal string if applicable. The source end is indicated by
   524	// token.EOF.
   525	//
   526	// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
   527	// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
   528	// has the corresponding value.
   529	//
   530	// If the returned token is token.SEMICOLON, the corresponding
   531	// literal string is ";" if the semicolon was present in the source,
   532	// and "\n" if the semicolon was inserted because of a newline or
   533	// at EOF.
   534	//
   535	// If the returned token is token.ILLEGAL, the literal string is the
   536	// offending character.
   537	//
   538	// In all other cases, Scan returns an empty literal string.
   539	//
   540	// For more tolerant parsing, Scan will return a valid token if
   541	// possible even if a syntax error was encountered. Thus, even
   542	// if the resulting token sequence contains no illegal tokens,
   543	// a client may not assume that no error occurred. Instead it
   544	// must check the scanner's ErrorCount or the number of calls
   545	// of the error handler, if there was one installed.
   546	//
   547	// Scan adds line information to the file added to the file
   548	// set with Init. Token positions are relative to that file
   549	// and thus relative to the file set.
   550	//
   551	func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
   552	scanAgain:
   553		s.skipWhitespace()
   554	
   555		// current token start
   556		pos = s.file.Pos(s.offset)
   557	
   558		// determine token value
   559		insertSemi := false
   560		switch ch := s.ch; {
   561		case isLetter(ch):
   562			lit = s.scanIdentifier()
   563			tok = token.Lookup(lit)
   564			switch tok {
   565			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
   566				insertSemi = true
   567			}
   568		case digitVal(ch) < 10:
   569			insertSemi = true
   570			tok, lit = s.scanNumber(false)
   571		default:
   572			s.next() // always make progress
   573			switch ch {
   574			case -1:
   575				if s.insertSemi {
   576					s.insertSemi = false // EOF consumed
   577					return pos, token.SEMICOLON, "\n"
   578				}
   579				tok = token.EOF
   580			case '\n':
   581				// we only reach here if s.insertSemi was
   582				// set in the first place and exited early
   583				// from s.skipWhitespace()
   584				s.insertSemi = false // newline consumed
   585				return pos, token.SEMICOLON, "\n"
   586			case '"':
   587				insertSemi = true
   588				tok = token.STRING
   589				lit = s.scanString()
   590			case '\'':
   591				insertSemi = true
   592				tok = token.CHAR
   593				lit = s.scanChar()
   594			case '`':
   595				insertSemi = true
   596				tok = token.STRING
   597				lit = s.scanRawString()
   598			case ':':
   599				tok = s.switch2(token.COLON, token.DEFINE)
   600			case '.':
   601				if digitVal(s.ch) < 10 {
   602					insertSemi = true
   603					tok, lit = s.scanNumber(true)
   604				} else if s.ch == '.' {
   605					s.next()
   606					if s.ch == '.' {
   607						s.next()
   608						tok = token.ELLIPSIS
   609					}
   610				} else {
   611					tok = token.PERIOD
   612				}
   613			case ',':
   614				tok = token.COMMA
   615			case ';':
   616				tok = token.SEMICOLON
   617				lit = ";"
   618			case '(':
   619				tok = token.LPAREN
   620			case ')':
   621				insertSemi = true
   622				tok = token.RPAREN
   623			case '[':
   624				tok = token.LBRACK
   625			case ']':
   626				insertSemi = true
   627				tok = token.RBRACK
   628			case '{':
   629				tok = token.LBRACE
   630			case '}':
   631				insertSemi = true
   632				tok = token.RBRACE
   633			case '+':
   634				tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
   635				if tok == token.INC {
   636					insertSemi = true
   637				}
   638			case '-':
   639				tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
   640				if tok == token.DEC {
   641					insertSemi = true
   642				}
   643			case '*':
   644				tok = s.switch2(token.MUL, token.MUL_ASSIGN)
   645			case '/':
   646				if s.ch == '/' || s.ch == '*' {
   647					// comment
   648					if s.insertSemi && s.findLineEnd() {
   649						// reset position to the beginning of the comment
   650						s.ch = '/'
   651						s.offset = s.file.Offset(pos)
   652						s.rdOffset = s.offset + 1
   653						s.insertSemi = false // newline consumed
   654						return pos, token.SEMICOLON, "\n"
   655					}
   656					lit = s.scanComment()
   657					if s.mode&ScanComments == 0 {
   658						// skip comment
   659						s.insertSemi = false // newline consumed
   660						goto scanAgain
   661					}
   662					tok = token.COMMENT
   663				} else {
   664					tok = s.switch2(token.QUO, token.QUO_ASSIGN)
   665				}
   666			case '%':
   667				tok = s.switch2(token.REM, token.REM_ASSIGN)
   668			case '^':
   669				tok = s.switch2(token.XOR, token.XOR_ASSIGN)
   670			case '<':
   671				if s.ch == '-' {
   672					s.next()
   673					tok = token.ARROW
   674				} else {
   675					tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
   676				}
   677			case '>':
   678				tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
   679			case '=':
   680				tok = s.switch2(token.ASSIGN, token.EQL)
   681			case '!':
   682				tok = s.switch2(token.NOT, token.NEQ)
   683			case '&':
   684				if s.ch == '^' {
   685					s.next()
   686					tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
   687				} else {
   688					tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
   689				}
   690			case '|':
   691				tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
   692			default:
   693				s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
   694				insertSemi = s.insertSemi // preserve insertSemi info
   695				tok = token.ILLEGAL
   696				lit = string(ch)
   697			}
   698		}
   699		if s.mode&dontInsertSemis == 0 {
   700			s.insertSemi = insertSemi
   701		}
   702	
   703		return
   704	}