src/pkg/encoding/json/scanner.go - The Go Programming Language

Golang

previous page next page
Source file src/pkg/encoding/json/scanner.go

     1	// Copyright 2010 The Go Authors.  All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package json
     6	
     7	// JSON value parser state machine.
     8	// Just about at the limit of what is reasonable to write by hand.
     9	// Some parts are a bit tedious, but overall it nicely factors out the
    10	// otherwise common code from the multiple scanning functions
    11	// in this package (Compact, Indent, checkValid, nextValue, etc).
    12	//
    13	// This file starts with two simple examples using the scanner
    14	// before diving into the scanner itself.
    15	
    16	import "strconv"
    17	
    18	// checkValid verifies that data is valid JSON-encoded data.
    19	// scan is passed in for use by checkValid to avoid an allocation.
    20	func checkValid(data []byte, scan *scanner) error {
    21		scan.reset()
    22		for _, c := range data {
    23			scan.bytes++
    24			if scan.step(scan, int(c)) == scanError {
    25				return scan.err
    26			}
    27		}
    28		if scan.eof() == scanError {
    29			return scan.err
    30		}
    31		return nil
    32	}
    33	
    34	// nextValue splits data after the next whole JSON value,
    35	// returning that value and the bytes that follow it as separate slices.
    36	// scan is passed in for use by nextValue to avoid an allocation.
    37	func nextValue(data []byte, scan *scanner) (value, rest []byte, err error) {
    38		scan.reset()
    39		for i, c := range data {
    40			v := scan.step(scan, int(c))
    41			if v >= scanEnd {
    42				switch v {
    43				case scanError:
    44					return nil, nil, scan.err
    45				case scanEnd:
    46					return data[0:i], data[i:], nil
    47				}
    48			}
    49		}
    50		if scan.eof() == scanError {
    51			return nil, nil, scan.err
    52		}
    53		return data, nil, nil
    54	}
    55	
    56	// A SyntaxError is a description of a JSON syntax error.
    57	type SyntaxError struct {
    58		msg    string // description of error
    59		Offset int64  // error occurred after reading Offset bytes
    60	}
    61	
    62	func (e *SyntaxError) Error() string { return e.msg }
    63	
    64	// A scanner is a JSON scanning state machine.
    65	// Callers call scan.reset() and then pass bytes in one at a time
    66	// by calling scan.step(&scan, c) for each byte.
    67	// The return value, referred to as an opcode, tells the
    68	// caller about significant parsing events like beginning
    69	// and ending literals, objects, and arrays, so that the
    70	// caller can follow along if it wishes.
    71	// The return value scanEnd indicates that a single top-level
    72	// JSON value has been completed, *before* the byte that
    73	// just got passed in.  (The indication must be delayed in order
    74	// to recognize the end of numbers: is 123 a whole value or
    75	// the beginning of 12345e+6?).
    76	type scanner struct {
    77		// The step is a func to be called to execute the next transition.
    78		// Also tried using an integer constant and a single func
    79		// with a switch, but using the func directly was 10% faster
    80		// on a 64-bit Mac Mini, and it's nicer to read.
    81		step func(*scanner, int) int
    82	
    83		// Reached end of top-level value.
    84		endTop bool
    85	
    86		// Stack of what we're in the middle of - array values, object keys, object values.
    87		parseState []int
    88	
    89		// Error that happened, if any.
    90		err error
    91	
    92		// 1-byte redo (see undo method)
    93		redo      bool
    94		redoCode  int
    95		redoState func(*scanner, int) int
    96	
    97		// total bytes consumed, updated by decoder.Decode
    98		bytes int64
    99	}
   100	
   101	// These values are returned by the state transition functions
   102	// assigned to scanner.state and the method scanner.eof.
   103	// They give details about the current state of the scan that
   104	// callers might be interested to know about.
   105	// It is okay to ignore the return value of any particular
   106	// call to scanner.state: if one call returns scanError,
   107	// every subsequent call will return scanError too.
   108	const (
   109		// Continue.
   110		scanContinue     = iota // uninteresting byte
   111		scanBeginLiteral        // end implied by next result != scanContinue
   112		scanBeginObject         // begin object
   113		scanObjectKey           // just finished object key (string)
   114		scanObjectValue         // just finished non-last object value
   115		scanEndObject           // end object (implies scanObjectValue if possible)
   116		scanBeginArray          // begin array
   117		scanArrayValue          // just finished array value
   118		scanEndArray            // end array (implies scanArrayValue if possible)
   119		scanSkipSpace           // space byte; can skip; known to be last "continue" result
   120	
   121		// Stop.
   122		scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
   123		scanError // hit an error, scanner.err.
   124	)
   125	
   126	// These values are stored in the parseState stack.
   127	// They give the current state of a composite value
   128	// being scanned.  If the parser is inside a nested value
   129	// the parseState describes the nested state, outermost at entry 0.
   130	const (
   131		parseObjectKey   = iota // parsing object key (before colon)
   132		parseObjectValue        // parsing object value (after colon)
   133		parseArrayValue         // parsing array value
   134	)
   135	
   136	// reset prepares the scanner for use.
   137	// It must be called before calling s.step.
   138	func (s *scanner) reset() {
   139		s.step = stateBeginValue
   140		s.parseState = s.parseState[0:0]
   141		s.err = nil
   142		s.redo = false
   143		s.endTop = false
   144	}
   145	
   146	// eof tells the scanner that the end of input has been reached.
   147	// It returns a scan status just as s.step does.
   148	func (s *scanner) eof() int {
   149		if s.err != nil {
   150			return scanError
   151		}
   152		if s.endTop {
   153			return scanEnd
   154		}
   155		s.step(s, ' ')
   156		if s.endTop {
   157			return scanEnd
   158		}
   159		if s.err == nil {
   160			s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
   161		}
   162		return scanError
   163	}
   164	
   165	// pushParseState pushes a new parse state p onto the parse stack.
   166	func (s *scanner) pushParseState(p int) {
   167		s.parseState = append(s.parseState, p)
   168	}
   169	
   170	// popParseState pops a parse state (already obtained) off the stack
   171	// and updates s.step accordingly.
   172	func (s *scanner) popParseState() {
   173		n := len(s.parseState) - 1
   174		s.parseState = s.parseState[0:n]
   175		s.redo = false
   176		if n == 0 {
   177			s.step = stateEndTop
   178			s.endTop = true
   179		} else {
   180			s.step = stateEndValue
   181		}
   182	}
   183	
   184	func isSpace(c rune) bool {
   185		return c == ' ' || c == '\t' || c == '\r' || c == '\n'
   186	}
   187	
   188	// stateBeginValueOrEmpty is the state after reading `[`.
   189	func stateBeginValueOrEmpty(s *scanner, c int) int {
   190		if c <= ' ' && isSpace(rune(c)) {
   191			return scanSkipSpace
   192		}
   193		if c == ']' {
   194			return stateEndValue(s, c)
   195		}
   196		return stateBeginValue(s, c)
   197	}
   198	
   199	// stateBeginValue is the state at the beginning of the input.
   200	func stateBeginValue(s *scanner, c int) int {
   201		if c <= ' ' && isSpace(rune(c)) {
   202			return scanSkipSpace
   203		}
   204		switch c {
   205		case '{':
   206			s.step = stateBeginStringOrEmpty
   207			s.pushParseState(parseObjectKey)
   208			return scanBeginObject
   209		case '[':
   210			s.step = stateBeginValueOrEmpty
   211			s.pushParseState(parseArrayValue)
   212			return scanBeginArray
   213		case '"':
   214			s.step = stateInString
   215			return scanBeginLiteral
   216		case '-':
   217			s.step = stateNeg
   218			return scanBeginLiteral
   219		case '0': // beginning of 0.123
   220			s.step = state0
   221			return scanBeginLiteral
   222		case 't': // beginning of true
   223			s.step = stateT
   224			return scanBeginLiteral
   225		case 'f': // beginning of false
   226			s.step = stateF
   227			return scanBeginLiteral
   228		case 'n': // beginning of null
   229			s.step = stateN
   230			return scanBeginLiteral
   231		}
   232		if '1' <= c && c <= '9' { // beginning of 1234.5
   233			s.step = state1
   234			return scanBeginLiteral
   235		}
   236		return s.error(c, "looking for beginning of value")
   237	}
   238	
   239	// stateBeginStringOrEmpty is the state after reading `{`.
   240	func stateBeginStringOrEmpty(s *scanner, c int) int {
   241		if c <= ' ' && isSpace(rune(c)) {
   242			return scanSkipSpace
   243		}
   244		if c == '}' {
   245			n := len(s.parseState)
   246			s.parseState[n-1] = parseObjectValue
   247			return stateEndValue(s, c)
   248		}
   249		return stateBeginString(s, c)
   250	}
   251	
   252	// stateBeginString is the state after reading `{"key": value,`.
   253	func stateBeginString(s *scanner, c int) int {
   254		if c <= ' ' && isSpace(rune(c)) {
   255			return scanSkipSpace
   256		}
   257		if c == '"' {
   258			s.step = stateInString
   259			return scanBeginLiteral
   260		}
   261		return s.error(c, "looking for beginning of object key string")
   262	}
   263	
   264	// stateEndValue is the state after completing a value,
   265	// such as after reading `{}` or `true` or `["x"`.
   266	func stateEndValue(s *scanner, c int) int {
   267		n := len(s.parseState)
   268		if n == 0 {
   269			// Completed top-level before the current byte.
   270			s.step = stateEndTop
   271			s.endTop = true
   272			return stateEndTop(s, c)
   273		}
   274		if c <= ' ' && isSpace(rune(c)) {
   275			s.step = stateEndValue
   276			return scanSkipSpace
   277		}
   278		ps := s.parseState[n-1]
   279		switch ps {
   280		case parseObjectKey:
   281			if c == ':' {
   282				s.parseState[n-1] = parseObjectValue
   283				s.step = stateBeginValue
   284				return scanObjectKey
   285			}
   286			return s.error(c, "after object key")
   287		case parseObjectValue:
   288			if c == ',' {
   289				s.parseState[n-1] = parseObjectKey
   290				s.step = stateBeginString
   291				return scanObjectValue
   292			}
   293			if c == '}' {
   294				s.popParseState()
   295				return scanEndObject
   296			}
   297			return s.error(c, "after object key:value pair")
   298		case parseArrayValue:
   299			if c == ',' {
   300				s.step = stateBeginValue
   301				return scanArrayValue
   302			}
   303			if c == ']' {
   304				s.popParseState()
   305				return scanEndArray
   306			}
   307			return s.error(c, "after array element")
   308		}
   309		return s.error(c, "")
   310	}
   311	
   312	// stateEndTop is the state after finishing the top-level value,
   313	// such as after reading `{}` or `[1,2,3]`.
   314	// Only space characters should be seen now.
   315	func stateEndTop(s *scanner, c int) int {
   316		if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
   317			// Complain about non-space byte on next call.
   318			s.error(c, "after top-level value")
   319		}
   320		return scanEnd
   321	}
   322	
   323	// stateInString is the state after reading `"`.
   324	func stateInString(s *scanner, c int) int {
   325		if c == '"' {
   326			s.step = stateEndValue
   327			return scanContinue
   328		}
   329		if c == '\\' {
   330			s.step = stateInStringEsc
   331			return scanContinue
   332		}
   333		if c < 0x20 {
   334			return s.error(c, "in string literal")
   335		}
   336		return scanContinue
   337	}
   338	
   339	// stateInStringEsc is the state after reading `"\` during a quoted string.
   340	func stateInStringEsc(s *scanner, c int) int {
   341		switch c {
   342		case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
   343			s.step = stateInString
   344			return scanContinue
   345		}
   346		if c == 'u' {
   347			s.step = stateInStringEscU
   348			return scanContinue
   349		}
   350		return s.error(c, "in string escape code")
   351	}
   352	
   353	// stateInStringEscU is the state after reading `"\u` during a quoted string.
   354	func stateInStringEscU(s *scanner, c int) int {
   355		if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   356			s.step = stateInStringEscU1
   357			return scanContinue
   358		}
   359		// numbers
   360		return s.error(c, "in \\u hexadecimal character escape")
   361	}
   362	
   363	// stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
   364	func stateInStringEscU1(s *scanner, c int) int {
   365		if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   366			s.step = stateInStringEscU12
   367			return scanContinue
   368		}
   369		// numbers
   370		return s.error(c, "in \\u hexadecimal character escape")
   371	}
   372	
   373	// stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
   374	func stateInStringEscU12(s *scanner, c int) int {
   375		if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   376			s.step = stateInStringEscU123
   377			return scanContinue
   378		}
   379		// numbers
   380		return s.error(c, "in \\u hexadecimal character escape")
   381	}
   382	
   383	// stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
   384	func stateInStringEscU123(s *scanner, c int) int {
   385		if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   386			s.step = stateInString
   387			return scanContinue
   388		}
   389		// numbers
   390		return s.error(c, "in \\u hexadecimal character escape")
   391	}
   392	
   393	// stateInStringEscU123 is the state after reading `-` during a number.
   394	func stateNeg(s *scanner, c int) int {
   395		if c == '0' {
   396			s.step = state0
   397			return scanContinue
   398		}
   399		if '1' <= c && c <= '9' {
   400			s.step = state1
   401			return scanContinue
   402		}
   403		return s.error(c, "in numeric literal")
   404	}
   405	
   406	// state1 is the state after reading a non-zero integer during a number,
   407	// such as after reading `1` or `100` but not `0`.
   408	func state1(s *scanner, c int) int {
   409		if '0' <= c && c <= '9' {
   410			s.step = state1
   411			return scanContinue
   412		}
   413		return state0(s, c)
   414	}
   415	
   416	// state0 is the state after reading `0` during a number.
   417	func state0(s *scanner, c int) int {
   418		if c == '.' {
   419			s.step = stateDot
   420			return scanContinue
   421		}
   422		if c == 'e' || c == 'E' {
   423			s.step = stateE
   424			return scanContinue
   425		}
   426		return stateEndValue(s, c)
   427	}
   428	
   429	// stateDot is the state after reading the integer and decimal point in a number,
   430	// such as after reading `1.`.
   431	func stateDot(s *scanner, c int) int {
   432		if '0' <= c && c <= '9' {
   433			s.step = stateDot0
   434			return scanContinue
   435		}
   436		return s.error(c, "after decimal point in numeric literal")
   437	}
   438	
   439	// stateDot0 is the state after reading the integer, decimal point, and subsequent
   440	// digits of a number, such as after reading `3.14`.
   441	func stateDot0(s *scanner, c int) int {
   442		if '0' <= c && c <= '9' {
   443			s.step = stateDot0
   444			return scanContinue
   445		}
   446		if c == 'e' || c == 'E' {
   447			s.step = stateE
   448			return scanContinue
   449		}
   450		return stateEndValue(s, c)
   451	}
   452	
   453	// stateE is the state after reading the mantissa and e in a number,
   454	// such as after reading `314e` or `0.314e`.
   455	func stateE(s *scanner, c int) int {
   456		if c == '+' {
   457			s.step = stateESign
   458			return scanContinue
   459		}
   460		if c == '-' {
   461			s.step = stateESign
   462			return scanContinue
   463		}
   464		return stateESign(s, c)
   465	}
   466	
   467	// stateESign is the state after reading the mantissa, e, and sign in a number,
   468	// such as after reading `314e-` or `0.314e+`.
   469	func stateESign(s *scanner, c int) int {
   470		if '0' <= c && c <= '9' {
   471			s.step = stateE0
   472			return scanContinue
   473		}
   474		return s.error(c, "in exponent of numeric literal")
   475	}
   476	
   477	// stateE0 is the state after reading the mantissa, e, optional sign,
   478	// and at least one digit of the exponent in a number,
   479	// such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
   480	func stateE0(s *scanner, c int) int {
   481		if '0' <= c && c <= '9' {
   482			s.step = stateE0
   483			return scanContinue
   484		}
   485		return stateEndValue(s, c)
   486	}
   487	
   488	// stateT is the state after reading `t`.
   489	func stateT(s *scanner, c int) int {
   490		if c == 'r' {
   491			s.step = stateTr
   492			return scanContinue
   493		}
   494		return s.error(c, "in literal true (expecting 'r')")
   495	}
   496	
   497	// stateTr is the state after reading `tr`.
   498	func stateTr(s *scanner, c int) int {
   499		if c == 'u' {
   500			s.step = stateTru
   501			return scanContinue
   502		}
   503		return s.error(c, "in literal true (expecting 'u')")
   504	}
   505	
   506	// stateTru is the state after reading `tru`.
   507	func stateTru(s *scanner, c int) int {
   508		if c == 'e' {
   509			s.step = stateEndValue
   510			return scanContinue
   511		}
   512		return s.error(c, "in literal true (expecting 'e')")
   513	}
   514	
   515	// stateF is the state after reading `f`.
   516	func stateF(s *scanner, c int) int {
   517		if c == 'a' {
   518			s.step = stateFa
   519			return scanContinue
   520		}
   521		return s.error(c, "in literal false (expecting 'a')")
   522	}
   523	
   524	// stateFa is the state after reading `fa`.
   525	func stateFa(s *scanner, c int) int {
   526		if c == 'l' {
   527			s.step = stateFal
   528			return scanContinue
   529		}
   530		return s.error(c, "in literal false (expecting 'l')")
   531	}
   532	
   533	// stateFal is the state after reading `fal`.
   534	func stateFal(s *scanner, c int) int {
   535		if c == 's' {
   536			s.step = stateFals
   537			return scanContinue
   538		}
   539		return s.error(c, "in literal false (expecting 's')")
   540	}
   541	
   542	// stateFals is the state after reading `fals`.
   543	func stateFals(s *scanner, c int) int {
   544		if c == 'e' {
   545			s.step = stateEndValue
   546			return scanContinue
   547		}
   548		return s.error(c, "in literal false (expecting 'e')")
   549	}
   550	
   551	// stateN is the state after reading `n`.
   552	func stateN(s *scanner, c int) int {
   553		if c == 'u' {
   554			s.step = stateNu
   555			return scanContinue
   556		}
   557		return s.error(c, "in literal null (expecting 'u')")
   558	}
   559	
   560	// stateNu is the state after reading `nu`.
   561	func stateNu(s *scanner, c int) int {
   562		if c == 'l' {
   563			s.step = stateNul
   564			return scanContinue
   565		}
   566		return s.error(c, "in literal null (expecting 'l')")
   567	}
   568	
   569	// stateNul is the state after reading `nul`.
   570	func stateNul(s *scanner, c int) int {
   571		if c == 'l' {
   572			s.step = stateEndValue
   573			return scanContinue
   574		}
   575		return s.error(c, "in literal null (expecting 'l')")
   576	}
   577	
   578	// stateError is the state after reaching a syntax error,
   579	// such as after reading `[1}` or `5.1.2`.
   580	func stateError(s *scanner, c int) int {
   581		return scanError
   582	}
   583	
   584	// error records an error and switches to the error state.
   585	func (s *scanner) error(c int, context string) int {
   586		s.step = stateError
   587		s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
   588		return scanError
   589	}
   590	
   591	// quoteChar formats c as a quoted character literal
   592	func quoteChar(c int) string {
   593		// special cases - different from quoted strings
   594		if c == '\'' {
   595			return `'\''`
   596		}
   597		if c == '"' {
   598			return `'"'`
   599		}
   600	
   601		// use quoted string with different quotation marks
   602		s := strconv.Quote(string(c))
   603		return "'" + s[1:len(s)-1] + "'"
   604	}
   605	
   606	// undo causes the scanner to return scanCode from the next state transition.
   607	// This gives callers a simple 1-byte undo mechanism.
   608	func (s *scanner) undo(scanCode int) {
   609		if s.redo {
   610			panic("json: invalid use of scanner")
   611		}
   612		s.redoCode = scanCode
   613		s.redoState = s.step
   614		s.step = stateRedo
   615		s.redo = true
   616	}
   617	
   618	// stateRedo helps implement the scanner's 1-byte undo.
   619	func stateRedo(s *scanner, c int) int {
   620		s.redo = false
   621		s.step = s.redoState
   622		return s.redoCode
   623	}
previous page start next page