src/pkg/text/scanner/scanner.go - The Go Programming Language

Golang

Source file src/pkg/text/scanner/scanner.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package scanner provides a scanner and tokenizer for UTF-8-encoded text.
     6	// It takes an io.Reader providing the source, which then can be tokenized
     7	// through repeated calls to the Scan function.  For compatibility with
     8	// existing tools, the NUL character is not allowed.
     9	//
    10	// By default, a Scanner skips white space and Go comments and recognizes all
    11	// literals as defined by the Go language specification.  It may be
    12	// customized to recognize only a subset of those literals and to recognize
    13	// different white space characters.
    14	//
    15	// Basic usage pattern:
    16	//
    17	//	var s scanner.Scanner
    18	//	s.Init(src)
    19	//	tok := s.Scan()
    20	//	for tok != scanner.EOF {
    21	//		// do something with tok
    22	//		tok = s.Scan()
    23	//	}
    24	//
    25	package scanner
    26	
    27	import (
    28		"bytes"
    29		"fmt"
    30		"io"
    31		"os"
    32		"unicode"
    33		"unicode/utf8"
    34	)
    35	
    36	// TODO(gri): Consider changing this to use the new (token) Position package.
    37	
    38	// A source position is represented by a Position value.
    39	// A position is valid if Line > 0.
    40	type Position struct {
    41		Filename string // filename, if any
    42		Offset   int    // byte offset, starting at 0
    43		Line     int    // line number, starting at 1
    44		Column   int    // column number, starting at 1 (character count per line)
    45	}
    46	
    47	// IsValid returns true if the position is valid.
    48	func (pos *Position) IsValid() bool { return pos.Line > 0 }
    49	
    50	func (pos Position) String() string {
    51		s := pos.Filename
    52		if pos.IsValid() {
    53			if s != "" {
    54				s += ":"
    55			}
    56			s += fmt.Sprintf("%d:%d", pos.Line, pos.Column)
    57		}
    58		if s == "" {
    59			s = "???"
    60		}
    61		return s
    62	}
    63	
// Predefined mode bits to control recognition of tokens. For instance,
// to configure a Scanner such that it only recognizes (Go) identifiers,
// integers, and skips comments, set the Scanner's Mode field to:
//
//	ScanIdents | ScanInts | SkipComments
//
// Token constants are negative, so the bit for a token t is 1 << -t.
const (
	ScanIdents     = 1 << -Ident
	ScanInts       = 1 << -Int
	ScanFloats     = 1 << -Float // includes Ints
	ScanChars      = 1 << -Char
	ScanStrings    = 1 << -String
	ScanRawStrings = 1 << -RawString
	ScanComments   = 1 << -Comment
	SkipComments   = 1 << -skipComment // if set with ScanComments, comments become white space
	// GoTokens enables every token kind above; because both ScanComments
	// and SkipComments are set, comments are recognized and then skipped.
	GoTokens = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
)
    81	
// The result of Scan is one of the following tokens or a Unicode character.
// Token values are negative (EOF is -1, Ident is -2, and so on), which
// keeps them distinct from Unicode characters.
const (
	EOF = -(iota + 1)
	Ident
	Int
	Float
	Char
	String
	RawString
	Comment
	skipComment // unexported; only used to derive the SkipComments mode bit
)
    94	
// tokenString maps each exported token constant to its printable name.
// It is consulted by TokenString; skipComment has no entry.
var tokenString = map[rune]string{
	EOF:       "EOF",
	Ident:     "Ident",
	Int:       "Int",
	Float:     "Float",
	Char:      "Char",
	String:    "String",
	RawString: "RawString",
	Comment:   "Comment",
}
   105	
   106	// TokenString returns a printable string for a token or Unicode character.
   107	func TokenString(tok rune) string {
   108		if s, found := tokenString[tok]; found {
   109			return s
   110		}
   111		return fmt.Sprintf("%q", string(tok))
   112	}
   113	
// GoWhitespace is the default value for the Scanner's Whitespace field.
// Its value selects Go's white space characters: the bit at index ch is
// set for tab, newline, carriage return, and space.
const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '

// bufLen is the usable size of the Scanner's source buffer in bytes.
const bufLen = 1024 // at least utf8.UTFMax
   119	
// A Scanner implements reading of Unicode characters and tokens from an io.Reader.
// Call Init before use; it sets up the buffer sentinel and the default
// Mode and Whitespace values.
type Scanner struct {
	// Input
	src io.Reader

	// Source buffer
	srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next()
	srcPos int              // reading position (srcBuf index)
	srcEnd int              // source end (srcBuf index)

	// Source position
	srcBufOffset int // byte offset of srcBuf[0] in source
	line         int // line count
	column       int // character count
	lastLineLen  int // length of last line in characters (for correct column reporting)
	lastCharLen  int // length of last character in bytes

	// Token text buffer
	// Typically, token text is stored completely in srcBuf, but in general
	// the token text's head may be buffered in tokBuf while the token text's
	// tail is stored in srcBuf.
	tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
	tokPos int          // token text tail position (srcBuf index); valid if >= 0
	tokEnd int          // token text tail end (srcBuf index)

	// One character look-ahead
	// (a negative value makes Peek read the next character; Init sets it to -1)
	ch rune // character before current srcPos

	// Error is called for each error encountered. If no Error
	// function is set, the error is reported to os.Stderr.
	Error func(s *Scanner, msg string)

	// ErrorCount is incremented by one for each error encountered.
	ErrorCount int

	// The Mode field controls which tokens are recognized. For instance,
	// to recognize Ints, set the ScanInts bit in Mode. The field may be
	// changed at any time.
	Mode uint

	// The Whitespace field controls which characters are recognized
	// as white space. To recognize a character ch <= ' ' as white space,
	// set the ch'th bit in Whitespace (the Scanner's behavior is undefined
	// for values ch > ' '). The field may be changed at any time.
	Whitespace uint64

	// Start position of most recently scanned token; set by Scan.
	// Calling Init or Next invalidates the position (Line == 0).
	// The Filename field is always left untouched by the Scanner.
	// If an error is reported (via Error) and Position is invalid,
	// the scanner is not inside a token. Call Pos to obtain an error
	// position in that case.
	Position
}
   174	
   175	// Init initializes a Scanner with a new source and returns s.
   176	// Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
   177	// and Whitespace is set to GoWhitespace.
   178	func (s *Scanner) Init(src io.Reader) *Scanner {
   179		s.src = src
   180	
   181		// initialize source buffer
   182		// (the first call to next() will fill it by calling src.Read)
   183		s.srcBuf[0] = utf8.RuneSelf // sentinel
   184		s.srcPos = 0
   185		s.srcEnd = 0
   186	
   187		// initialize source position
   188		s.srcBufOffset = 0
   189		s.line = 1
   190		s.column = 0
   191		s.lastLineLen = 0
   192		s.lastCharLen = 0
   193	
   194		// initialize token text buffer
   195		// (required for first call to next()).
   196		s.tokPos = -1
   197	
   198		// initialize one character look-ahead
   199		s.ch = -1 // no char read yet
   200	
   201		// initialize public fields
   202		s.Error = nil
   203		s.ErrorCount = 0
   204		s.Mode = GoTokens
   205		s.Whitespace = GoWhitespace
   206		s.Line = 0 // invalidate token position
   207	
   208		return s
   209	}
   210	
   211	// TODO(gri): The code for next() and the internal scanner state could benefit
   212	//            from a rethink. While next() is optimized for the common ASCII
   213	//            case, the "corrections" needed for proper position tracking undo
   214	//            some of the attempts for fast-path optimization.
   215	
// next reads and returns the next Unicode character. It is designed such
// that only a minimal amount of work needs to be done in the common ASCII
// case (one test to check for both ASCII and end-of-buffer, and one test
// to check for newlines).
//
// The byte at srcBuf[srcEnd] is always the sentinel utf8.RuneSelf, so the
// single comparison ch >= utf8.RuneSelf catches both a non-ASCII byte and
// the end of the buffered data. next returns EOF once a read fails with no
// bytes left, and utf8.RuneError (after reporting an error) for an invalid
// encoding. A NUL character is reported as an error but still returned.
// Line and column counters are updated for every character consumed.
func (s *Scanner) next() rune {
	// fast path guess: a single-byte character at the read position
	ch, width := rune(s.srcBuf[s.srcPos]), 1

	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		// (refill while fewer than UTFMax bytes remain and they do not
		// already form a complete rune)
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				s.tokPos = 0
				// s.tokEnd is set by Scan()
			}
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			// (an io.Reader must return io.EOF when it reaches
			// the end of what it is reading - simply returning
			// n == 0 will make this loop retry forever; but the
			// error is in the reader implementation in that case)
			i := s.srcEnd - s.srcPos // number of unread bytes kept
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcPos = 0
			s.srcEnd = i + n
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				if s.srcEnd == 0 {
					// nothing buffered and nothing read: end of source
					if s.lastCharLen > 0 {
						// previous character was not EOF
						s.column++
					}
					s.lastCharLen = 0
					return EOF
				}
				if err != io.EOF {
					s.error(err.Error())
				}
				// If err == EOF, we won't be getting more
				// bytes; break to avoid infinite loop. If
				// err is something else, we don't know if
				// we can get more bytes; thus also break.
				break
			}
		}
		// at least one byte
		ch = rune(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			if ch == utf8.RuneError && width == 1 {
				// invalid encoding: consume the single bad byte
				// advance for correct error position
				s.srcPos += width
				s.lastCharLen = width
				s.column++
				s.error("illegal UTF-8 encoding")
				return ch
			}
		}
	}

	// advance
	s.srcPos += width
	s.lastCharLen = width
	s.column++

	// special situations
	switch ch {
	case 0:
		// for compatibility with other tools
		s.error("illegal character NUL")
	case '\n':
		// start a new line, remembering the finished line's length so
		// a position just after the newline can still be reported
		s.line++
		s.lastLineLen = s.column
		s.column = 0
	}

	return ch
}
   299	
   300	// Next reads and returns the next Unicode character.
   301	// It returns EOF at the end of the source. It reports
   302	// a read error by calling s.Error, if not nil; otherwise
   303	// it prints an error message to os.Stderr. Next does not
   304	// update the Scanner's Position field; use Pos() to
   305	// get the current position.
   306	func (s *Scanner) Next() rune {
   307		s.tokPos = -1 // don't collect token text
   308		s.Line = 0    // invalidate token position
   309		ch := s.Peek()
   310		s.ch = s.next()
   311		return ch
   312	}
   313	
   314	// Peek returns the next Unicode character in the source without advancing
   315	// the scanner. It returns EOF if the scanner's position is at the last
   316	// character of the source.
   317	func (s *Scanner) Peek() rune {
   318		if s.ch < 0 {
   319			s.ch = s.next()
   320		}
   321		return s.ch
   322	}
   323	
   324	func (s *Scanner) error(msg string) {
   325		s.ErrorCount++
   326		if s.Error != nil {
   327			s.Error(s, msg)
   328			return
   329		}
   330		pos := s.Position
   331		if !pos.IsValid() {
   332			pos = s.Pos()
   333		}
   334		fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
   335	}
   336	
   337	func (s *Scanner) scanIdentifier() rune {
   338		ch := s.next() // read character after first '_' or letter
   339		for ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) {
   340			ch = s.next()
   341		}
   342		return ch
   343	}
   344	
   345	func digitVal(ch rune) int {
   346		switch {
   347		case '0' <= ch && ch <= '9':
   348			return int(ch - '0')
   349		case 'a' <= ch && ch <= 'f':
   350			return int(ch - 'a' + 10)
   351		case 'A' <= ch && ch <= 'F':
   352			return int(ch - 'A' + 10)
   353		}
   354		return 16 // larger than any legal digit val
   355	}
   356	
   357	func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
   358	
   359	func (s *Scanner) scanMantissa(ch rune) rune {
   360		for isDecimal(ch) {
   361			ch = s.next()
   362		}
   363		return ch
   364	}
   365	
   366	func (s *Scanner) scanFraction(ch rune) rune {
   367		if ch == '.' {
   368			ch = s.scanMantissa(s.next())
   369		}
   370		return ch
   371	}
   372	
   373	func (s *Scanner) scanExponent(ch rune) rune {
   374		if ch == 'e' || ch == 'E' {
   375			ch = s.next()
   376			if ch == '-' || ch == '+' {
   377				ch = s.next()
   378			}
   379			ch = s.scanMantissa(ch)
   380		}
   381		return ch
   382	}
   383	
   384	func (s *Scanner) scanNumber(ch rune) (rune, rune) {
   385		// isDecimal(ch)
   386		if ch == '0' {
   387			// int or float
   388			ch = s.next()
   389			if ch == 'x' || ch == 'X' {
   390				// hexadecimal int
   391				ch = s.next()
   392				for digitVal(ch) < 16 {
   393					ch = s.next()
   394				}
   395			} else {
   396				// octal int or float
   397				seenDecimalDigit := false
   398				for isDecimal(ch) {
   399					if ch > '7' {
   400						seenDecimalDigit = true
   401					}
   402					ch = s.next()
   403				}
   404				if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
   405					// float
   406					ch = s.scanFraction(ch)
   407					ch = s.scanExponent(ch)
   408					return Float, ch
   409				}
   410				// octal int
   411				if seenDecimalDigit {
   412					s.error("illegal octal number")
   413				}
   414			}
   415			return Int, ch
   416		}
   417		// decimal int or float
   418		ch = s.scanMantissa(ch)
   419		if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
   420			// float
   421			ch = s.scanFraction(ch)
   422			ch = s.scanExponent(ch)
   423			return Float, ch
   424		}
   425		return Int, ch
   426	}
   427	
   428	func (s *Scanner) scanDigits(ch rune, base, n int) rune {
   429		for n > 0 && digitVal(ch) < base {
   430			ch = s.next()
   431			n--
   432		}
   433		if n > 0 {
   434			s.error("illegal char escape")
   435		}
   436		return ch
   437	}
   438	
   439	func (s *Scanner) scanEscape(quote rune) rune {
   440		ch := s.next() // read character after '/'
   441		switch ch {
   442		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   443			// nothing to do
   444			ch = s.next()
   445		case '0', '1', '2', '3', '4', '5', '6', '7':
   446			ch = s.scanDigits(ch, 8, 3)
   447		case 'x':
   448			ch = s.scanDigits(s.next(), 16, 2)
   449		case 'u':
   450			ch = s.scanDigits(s.next(), 16, 4)
   451		case 'U':
   452			ch = s.scanDigits(s.next(), 16, 8)
   453		default:
   454			s.error("illegal char escape")
   455		}
   456		return ch
   457	}
   458	
   459	func (s *Scanner) scanString(quote rune) (n int) {
   460		ch := s.next() // read character after quote
   461		for ch != quote {
   462			if ch == '\n' || ch < 0 {
   463				s.error("literal not terminated")
   464				return
   465			}
   466			if ch == '\\' {
   467				ch = s.scanEscape(quote)
   468			} else {
   469				ch = s.next()
   470			}
   471			n++
   472		}
   473		return
   474	}
   475	
   476	func (s *Scanner) scanRawString() {
   477		ch := s.next() // read character after '`'
   478		for ch != '`' {
   479			if ch < 0 {
   480				s.error("literal not terminated")
   481				return
   482			}
   483			ch = s.next()
   484		}
   485	}
   486	
   487	func (s *Scanner) scanChar() {
   488		if s.scanString('\'') != 1 {
   489			s.error("illegal char literal")
   490		}
   491	}
   492	
   493	func (s *Scanner) scanComment(ch rune) rune {
   494		// ch == '/' || ch == '*'
   495		if ch == '/' {
   496			// line comment
   497			ch = s.next() // read character after "//"
   498			for ch != '\n' && ch >= 0 {
   499				ch = s.next()
   500			}
   501			return ch
   502		}
   503	
   504		// general comment
   505		ch = s.next() // read character after "/*"
   506		for {
   507			if ch < 0 {
   508				s.error("comment not terminated")
   509				break
   510			}
   511			ch0 := ch
   512			ch = s.next()
   513			if ch0 == '*' && ch == '/' {
   514				ch = s.next()
   515				break
   516			}
   517		}
   518		return ch
   519	}
   520	
   521	// Scan reads the next token or Unicode character from source and returns it.
   522	// It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
   523	// It returns EOF at the end of the source. It reports scanner errors (read and
   524	// token errors) by calling s.Error, if not nil; otherwise it prints an error
   525	// message to os.Stderr.
   526	func (s *Scanner) Scan() rune {
   527		ch := s.Peek()
   528	
   529		// reset token text position
   530		s.tokPos = -1
   531		s.Line = 0
   532	
   533	redo:
   534		// skip white space
   535		for s.Whitespace&(1<<uint(ch)) != 0 {
   536			ch = s.next()
   537		}
   538	
   539		// start collecting token text
   540		s.tokBuf.Reset()
   541		s.tokPos = s.srcPos - s.lastCharLen
   542	
   543		// set token position
   544		// (this is a slightly optimized version of the code in Pos())
   545		s.Offset = s.srcBufOffset + s.tokPos
   546		if s.column > 0 {
   547			// common case: last character was not a '\n'
   548			s.Line = s.line
   549			s.Column = s.column
   550		} else {
   551			// last character was a '\n'
   552			// (we cannot be at the beginning of the source
   553			// since we have called next() at least once)
   554			s.Line = s.line - 1
   555			s.Column = s.lastLineLen
   556		}
   557	
   558		// determine token value
   559		tok := ch
   560		switch {
   561		case unicode.IsLetter(ch) || ch == '_':
   562			if s.Mode&ScanIdents != 0 {
   563				tok = Ident
   564				ch = s.scanIdentifier()
   565			} else {
   566				ch = s.next()
   567			}
   568		case isDecimal(ch):
   569			if s.Mode&(ScanInts|ScanFloats) != 0 {
   570				tok, ch = s.scanNumber(ch)
   571			} else {
   572				ch = s.next()
   573			}
   574		default:
   575			switch ch {
   576			case '"':
   577				if s.Mode&ScanStrings != 0 {
   578					s.scanString('"')
   579					tok = String
   580				}
   581				ch = s.next()
   582			case '\'':
   583				if s.Mode&ScanChars != 0 {
   584					s.scanChar()
   585					tok = Char
   586				}
   587				ch = s.next()
   588			case '.':
   589				ch = s.next()
   590				if isDecimal(ch) && s.Mode&ScanFloats != 0 {
   591					tok = Float
   592					ch = s.scanMantissa(ch)
   593					ch = s.scanExponent(ch)
   594				}
   595			case '/':
   596				ch = s.next()
   597				if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 {
   598					if s.Mode&SkipComments != 0 {
   599						s.tokPos = -1 // don't collect token text
   600						ch = s.scanComment(ch)
   601						goto redo
   602					}
   603					ch = s.scanComment(ch)
   604					tok = Comment
   605				}
   606			case '`':
   607				if s.Mode&ScanRawStrings != 0 {
   608					s.scanRawString()
   609					tok = String
   610				}
   611				ch = s.next()
   612			default:
   613				ch = s.next()
   614			}
   615		}
   616	
   617		// end of token text
   618		s.tokEnd = s.srcPos - s.lastCharLen
   619	
   620		s.ch = ch
   621		return tok
   622	}
   623	
   624	// Pos returns the position of the character immediately after
   625	// the character or token returned by the last call to Next or Scan.
   626	func (s *Scanner) Pos() (pos Position) {
   627		pos.Filename = s.Filename
   628		pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen
   629		switch {
   630		case s.column > 0:
   631			// common case: last character was not a '\n'
   632			pos.Line = s.line
   633			pos.Column = s.column
   634		case s.lastLineLen > 0:
   635			// last character was a '\n'
   636			pos.Line = s.line - 1
   637			pos.Column = s.lastLineLen
   638		default:
   639			// at the beginning of the source
   640			pos.Line = 1
   641			pos.Column = 1
   642		}
   643		return
   644	}
   645	
   646	// TokenText returns the string corresponding to the most recently scanned token.
   647	// Valid after calling Scan().
   648	func (s *Scanner) TokenText() string {
   649		if s.tokPos < 0 {
   650			// no token text
   651			return ""
   652		}
   653	
   654		if s.tokEnd < 0 {
   655			// if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0)
   656			s.tokEnd = s.tokPos
   657		}
   658	
   659		if s.tokBuf.Len() == 0 {
   660			// common case: the entire token text is still in srcBuf
   661			return string(s.srcBuf[s.tokPos:s.tokEnd])
   662		}
   663	
   664		// part of the token text was saved in tokBuf: save the rest in
   665		// tokBuf as well and return its content
   666		s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd])
   667		s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
   668		return s.tokBuf.String()
   669	}