Source file src/pkg/encoding/json/scanner.go
1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package json
6
7 // JSON value parser state machine.
8 // Just about at the limit of what is reasonable to write by hand.
9 // Some parts are a bit tedious, but overall it nicely factors out the
10 // otherwise common code from the multiple scanning functions
11 // in this package (Compact, Indent, checkValid, nextValue, etc).
12 //
13 // This file starts with two simple examples using the scanner
14 // before diving into the scanner itself.
15
16 import "strconv"
17
18 // checkValid verifies that data is valid JSON-encoded data.
19 // scan is passed in for use by checkValid to avoid an allocation.
20 func checkValid(data []byte, scan *scanner) error {
21 scan.reset()
22 for _, c := range data {
23 scan.bytes++
24 if scan.step(scan, int(c)) == scanError {
25 return scan.err
26 }
27 }
28 if scan.eof() == scanError {
29 return scan.err
30 }
31 return nil
32 }
33
34 // nextValue splits data after the next whole JSON value,
35 // returning that value and the bytes that follow it as separate slices.
36 // scan is passed in for use by nextValue to avoid an allocation.
37 func nextValue(data []byte, scan *scanner) (value, rest []byte, err error) {
38 scan.reset()
39 for i, c := range data {
40 v := scan.step(scan, int(c))
41 if v >= scanEnd {
42 switch v {
43 case scanError:
44 return nil, nil, scan.err
45 case scanEnd:
46 return data[0:i], data[i:], nil
47 }
48 }
49 }
50 if scan.eof() == scanError {
51 return nil, nil, scan.err
52 }
53 return data, nil, nil
54 }
55
56 // A SyntaxError is a description of a JSON syntax error.
57 type SyntaxError struct {
58 msg string // description of error
59 Offset int64 // error occurred after reading Offset bytes
60 }
61
62 func (e *SyntaxError) Error() string { return e.msg }
63
64 // A scanner is a JSON scanning state machine.
65 // Callers call scan.reset() and then pass bytes in one at a time
66 // by calling scan.step(&scan, c) for each byte.
67 // The return value, referred to as an opcode, tells the
68 // caller about significant parsing events like beginning
69 // and ending literals, objects, and arrays, so that the
70 // caller can follow along if it wishes.
71 // The return value scanEnd indicates that a single top-level
72 // JSON value has been completed, *before* the byte that
73 // just got passed in. (The indication must be delayed in order
74 // to recognize the end of numbers: is 123 a whole value or
75 // the beginning of 12345e+6?).
76 type scanner struct {
77 // The step is a func to be called to execute the next transition.
78 // Also tried using an integer constant and a single func
79 // with a switch, but using the func directly was 10% faster
80 // on a 64-bit Mac Mini, and it's nicer to read.
81 step func(*scanner, int) int
82
83 // Reached end of top-level value.
84 endTop bool
85
86 // Stack of what we're in the middle of - array values, object keys, object values.
87 parseState []int
88
89 // Error that happened, if any.
90 err error
91
92 // 1-byte redo (see undo method)
93 redo bool
94 redoCode int
95 redoState func(*scanner, int) int
96
97 // total bytes consumed, updated by decoder.Decode
98 bytes int64
99 }
100
101 // These values are returned by the state transition functions
102 // assigned to scanner.state and the method scanner.eof.
103 // They give details about the current state of the scan that
104 // callers might be interested to know about.
105 // It is okay to ignore the return value of any particular
106 // call to scanner.state: if one call returns scanError,
107 // every subsequent call will return scanError too.
108 const (
109 // Continue.
110 scanContinue = iota // uninteresting byte
111 scanBeginLiteral // end implied by next result != scanContinue
112 scanBeginObject // begin object
113 scanObjectKey // just finished object key (string)
114 scanObjectValue // just finished non-last object value
115 scanEndObject // end object (implies scanObjectValue if possible)
116 scanBeginArray // begin array
117 scanArrayValue // just finished array value
118 scanEndArray // end array (implies scanArrayValue if possible)
119 scanSkipSpace // space byte; can skip; known to be last "continue" result
120
121 // Stop.
122 scanEnd // top-level value ended *before* this byte; known to be first "stop" result
123 scanError // hit an error, scanner.err.
124 )
125
126 // These values are stored in the parseState stack.
127 // They give the current state of a composite value
128 // being scanned. If the parser is inside a nested value
129 // the parseState describes the nested state, outermost at entry 0.
130 const (
131 parseObjectKey = iota // parsing object key (before colon)
132 parseObjectValue // parsing object value (after colon)
133 parseArrayValue // parsing array value
134 )
135
136 // reset prepares the scanner for use.
137 // It must be called before calling s.step.
138 func (s *scanner) reset() {
139 s.step = stateBeginValue
140 s.parseState = s.parseState[0:0]
141 s.err = nil
142 s.redo = false
143 s.endTop = false
144 }
145
146 // eof tells the scanner that the end of input has been reached.
147 // It returns a scan status just as s.step does.
148 func (s *scanner) eof() int {
149 if s.err != nil {
150 return scanError
151 }
152 if s.endTop {
153 return scanEnd
154 }
155 s.step(s, ' ')
156 if s.endTop {
157 return scanEnd
158 }
159 if s.err == nil {
160 s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
161 }
162 return scanError
163 }
164
165 // pushParseState pushes a new parse state p onto the parse stack.
166 func (s *scanner) pushParseState(p int) {
167 s.parseState = append(s.parseState, p)
168 }
169
170 // popParseState pops a parse state (already obtained) off the stack
171 // and updates s.step accordingly.
172 func (s *scanner) popParseState() {
173 n := len(s.parseState) - 1
174 s.parseState = s.parseState[0:n]
175 s.redo = false
176 if n == 0 {
177 s.step = stateEndTop
178 s.endTop = true
179 } else {
180 s.step = stateEndValue
181 }
182 }
183
184 func isSpace(c rune) bool {
185 return c == ' ' || c == '\t' || c == '\r' || c == '\n'
186 }
187
188 // stateBeginValueOrEmpty is the state after reading `[`.
189 func stateBeginValueOrEmpty(s *scanner, c int) int {
190 if c <= ' ' && isSpace(rune(c)) {
191 return scanSkipSpace
192 }
193 if c == ']' {
194 return stateEndValue(s, c)
195 }
196 return stateBeginValue(s, c)
197 }
198
199 // stateBeginValue is the state at the beginning of the input.
200 func stateBeginValue(s *scanner, c int) int {
201 if c <= ' ' && isSpace(rune(c)) {
202 return scanSkipSpace
203 }
204 switch c {
205 case '{':
206 s.step = stateBeginStringOrEmpty
207 s.pushParseState(parseObjectKey)
208 return scanBeginObject
209 case '[':
210 s.step = stateBeginValueOrEmpty
211 s.pushParseState(parseArrayValue)
212 return scanBeginArray
213 case '"':
214 s.step = stateInString
215 return scanBeginLiteral
216 case '-':
217 s.step = stateNeg
218 return scanBeginLiteral
219 case '0': // beginning of 0.123
220 s.step = state0
221 return scanBeginLiteral
222 case 't': // beginning of true
223 s.step = stateT
224 return scanBeginLiteral
225 case 'f': // beginning of false
226 s.step = stateF
227 return scanBeginLiteral
228 case 'n': // beginning of null
229 s.step = stateN
230 return scanBeginLiteral
231 }
232 if '1' <= c && c <= '9' { // beginning of 1234.5
233 s.step = state1
234 return scanBeginLiteral
235 }
236 return s.error(c, "looking for beginning of value")
237 }
238
239 // stateBeginStringOrEmpty is the state after reading `{`.
240 func stateBeginStringOrEmpty(s *scanner, c int) int {
241 if c <= ' ' && isSpace(rune(c)) {
242 return scanSkipSpace
243 }
244 if c == '}' {
245 n := len(s.parseState)
246 s.parseState[n-1] = parseObjectValue
247 return stateEndValue(s, c)
248 }
249 return stateBeginString(s, c)
250 }
251
252 // stateBeginString is the state after reading `{"key": value,`.
253 func stateBeginString(s *scanner, c int) int {
254 if c <= ' ' && isSpace(rune(c)) {
255 return scanSkipSpace
256 }
257 if c == '"' {
258 s.step = stateInString
259 return scanBeginLiteral
260 }
261 return s.error(c, "looking for beginning of object key string")
262 }
263
264 // stateEndValue is the state after completing a value,
265 // such as after reading `{}` or `true` or `["x"`.
266 func stateEndValue(s *scanner, c int) int {
267 n := len(s.parseState)
268 if n == 0 {
269 // Completed top-level before the current byte.
270 s.step = stateEndTop
271 s.endTop = true
272 return stateEndTop(s, c)
273 }
274 if c <= ' ' && isSpace(rune(c)) {
275 s.step = stateEndValue
276 return scanSkipSpace
277 }
278 ps := s.parseState[n-1]
279 switch ps {
280 case parseObjectKey:
281 if c == ':' {
282 s.parseState[n-1] = parseObjectValue
283 s.step = stateBeginValue
284 return scanObjectKey
285 }
286 return s.error(c, "after object key")
287 case parseObjectValue:
288 if c == ',' {
289 s.parseState[n-1] = parseObjectKey
290 s.step = stateBeginString
291 return scanObjectValue
292 }
293 if c == '}' {
294 s.popParseState()
295 return scanEndObject
296 }
297 return s.error(c, "after object key:value pair")
298 case parseArrayValue:
299 if c == ',' {
300 s.step = stateBeginValue
301 return scanArrayValue
302 }
303 if c == ']' {
304 s.popParseState()
305 return scanEndArray
306 }
307 return s.error(c, "after array element")
308 }
309 return s.error(c, "")
310 }
311
312 // stateEndTop is the state after finishing the top-level value,
313 // such as after reading `{}` or `[1,2,3]`.
314 // Only space characters should be seen now.
315 func stateEndTop(s *scanner, c int) int {
316 if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
317 // Complain about non-space byte on next call.
318 s.error(c, "after top-level value")
319 }
320 return scanEnd
321 }
322
323 // stateInString is the state after reading `"`.
324 func stateInString(s *scanner, c int) int {
325 if c == '"' {
326 s.step = stateEndValue
327 return scanContinue
328 }
329 if c == '\\' {
330 s.step = stateInStringEsc
331 return scanContinue
332 }
333 if c < 0x20 {
334 return s.error(c, "in string literal")
335 }
336 return scanContinue
337 }
338
339 // stateInStringEsc is the state after reading `"\` during a quoted string.
340 func stateInStringEsc(s *scanner, c int) int {
341 switch c {
342 case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
343 s.step = stateInString
344 return scanContinue
345 }
346 if c == 'u' {
347 s.step = stateInStringEscU
348 return scanContinue
349 }
350 return s.error(c, "in string escape code")
351 }
352
353 // stateInStringEscU is the state after reading `"\u` during a quoted string.
354 func stateInStringEscU(s *scanner, c int) int {
355 if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
356 s.step = stateInStringEscU1
357 return scanContinue
358 }
359 // numbers
360 return s.error(c, "in \\u hexadecimal character escape")
361 }
362
363 // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
364 func stateInStringEscU1(s *scanner, c int) int {
365 if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
366 s.step = stateInStringEscU12
367 return scanContinue
368 }
369 // numbers
370 return s.error(c, "in \\u hexadecimal character escape")
371 }
372
373 // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
374 func stateInStringEscU12(s *scanner, c int) int {
375 if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
376 s.step = stateInStringEscU123
377 return scanContinue
378 }
379 // numbers
380 return s.error(c, "in \\u hexadecimal character escape")
381 }
382
383 // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
384 func stateInStringEscU123(s *scanner, c int) int {
385 if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
386 s.step = stateInString
387 return scanContinue
388 }
389 // numbers
390 return s.error(c, "in \\u hexadecimal character escape")
391 }
392
393 // stateInStringEscU123 is the state after reading `-` during a number.
394 func stateNeg(s *scanner, c int) int {
395 if c == '0' {
396 s.step = state0
397 return scanContinue
398 }
399 if '1' <= c && c <= '9' {
400 s.step = state1
401 return scanContinue
402 }
403 return s.error(c, "in numeric literal")
404 }
405
406 // state1 is the state after reading a non-zero integer during a number,
407 // such as after reading `1` or `100` but not `0`.
408 func state1(s *scanner, c int) int {
409 if '0' <= c && c <= '9' {
410 s.step = state1
411 return scanContinue
412 }
413 return state0(s, c)
414 }
415
416 // state0 is the state after reading `0` during a number.
417 func state0(s *scanner, c int) int {
418 if c == '.' {
419 s.step = stateDot
420 return scanContinue
421 }
422 if c == 'e' || c == 'E' {
423 s.step = stateE
424 return scanContinue
425 }
426 return stateEndValue(s, c)
427 }
428
429 // stateDot is the state after reading the integer and decimal point in a number,
430 // such as after reading `1.`.
431 func stateDot(s *scanner, c int) int {
432 if '0' <= c && c <= '9' {
433 s.step = stateDot0
434 return scanContinue
435 }
436 return s.error(c, "after decimal point in numeric literal")
437 }
438
439 // stateDot0 is the state after reading the integer, decimal point, and subsequent
440 // digits of a number, such as after reading `3.14`.
441 func stateDot0(s *scanner, c int) int {
442 if '0' <= c && c <= '9' {
443 s.step = stateDot0
444 return scanContinue
445 }
446 if c == 'e' || c == 'E' {
447 s.step = stateE
448 return scanContinue
449 }
450 return stateEndValue(s, c)
451 }
452
453 // stateE is the state after reading the mantissa and e in a number,
454 // such as after reading `314e` or `0.314e`.
455 func stateE(s *scanner, c int) int {
456 if c == '+' {
457 s.step = stateESign
458 return scanContinue
459 }
460 if c == '-' {
461 s.step = stateESign
462 return scanContinue
463 }
464 return stateESign(s, c)
465 }
466
467 // stateESign is the state after reading the mantissa, e, and sign in a number,
468 // such as after reading `314e-` or `0.314e+`.
469 func stateESign(s *scanner, c int) int {
470 if '0' <= c && c <= '9' {
471 s.step = stateE0
472 return scanContinue
473 }
474 return s.error(c, "in exponent of numeric literal")
475 }
476
477 // stateE0 is the state after reading the mantissa, e, optional sign,
478 // and at least one digit of the exponent in a number,
479 // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
480 func stateE0(s *scanner, c int) int {
481 if '0' <= c && c <= '9' {
482 s.step = stateE0
483 return scanContinue
484 }
485 return stateEndValue(s, c)
486 }
487
488 // stateT is the state after reading `t`.
489 func stateT(s *scanner, c int) int {
490 if c == 'r' {
491 s.step = stateTr
492 return scanContinue
493 }
494 return s.error(c, "in literal true (expecting 'r')")
495 }
496
497 // stateTr is the state after reading `tr`.
498 func stateTr(s *scanner, c int) int {
499 if c == 'u' {
500 s.step = stateTru
501 return scanContinue
502 }
503 return s.error(c, "in literal true (expecting 'u')")
504 }
505
506 // stateTru is the state after reading `tru`.
507 func stateTru(s *scanner, c int) int {
508 if c == 'e' {
509 s.step = stateEndValue
510 return scanContinue
511 }
512 return s.error(c, "in literal true (expecting 'e')")
513 }
514
515 // stateF is the state after reading `f`.
516 func stateF(s *scanner, c int) int {
517 if c == 'a' {
518 s.step = stateFa
519 return scanContinue
520 }
521 return s.error(c, "in literal false (expecting 'a')")
522 }
523
524 // stateFa is the state after reading `fa`.
525 func stateFa(s *scanner, c int) int {
526 if c == 'l' {
527 s.step = stateFal
528 return scanContinue
529 }
530 return s.error(c, "in literal false (expecting 'l')")
531 }
532
533 // stateFal is the state after reading `fal`.
534 func stateFal(s *scanner, c int) int {
535 if c == 's' {
536 s.step = stateFals
537 return scanContinue
538 }
539 return s.error(c, "in literal false (expecting 's')")
540 }
541
542 // stateFals is the state after reading `fals`.
543 func stateFals(s *scanner, c int) int {
544 if c == 'e' {
545 s.step = stateEndValue
546 return scanContinue
547 }
548 return s.error(c, "in literal false (expecting 'e')")
549 }
550
551 // stateN is the state after reading `n`.
552 func stateN(s *scanner, c int) int {
553 if c == 'u' {
554 s.step = stateNu
555 return scanContinue
556 }
557 return s.error(c, "in literal null (expecting 'u')")
558 }
559
560 // stateNu is the state after reading `nu`.
561 func stateNu(s *scanner, c int) int {
562 if c == 'l' {
563 s.step = stateNul
564 return scanContinue
565 }
566 return s.error(c, "in literal null (expecting 'l')")
567 }
568
569 // stateNul is the state after reading `nul`.
570 func stateNul(s *scanner, c int) int {
571 if c == 'l' {
572 s.step = stateEndValue
573 return scanContinue
574 }
575 return s.error(c, "in literal null (expecting 'l')")
576 }
577
578 // stateError is the state after reaching a syntax error,
579 // such as after reading `[1}` or `5.1.2`.
580 func stateError(s *scanner, c int) int {
581 return scanError
582 }
583
584 // error records an error and switches to the error state.
585 func (s *scanner) error(c int, context string) int {
586 s.step = stateError
587 s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
588 return scanError
589 }
590
591 // quoteChar formats c as a quoted character literal
592 func quoteChar(c int) string {
593 // special cases - different from quoted strings
594 if c == '\'' {
595 return `'\''`
596 }
597 if c == '"' {
598 return `'"'`
599 }
600
601 // use quoted string with different quotation marks
602 s := strconv.Quote(string(c))
603 return "'" + s[1:len(s)-1] + "'"
604 }
605
606 // undo causes the scanner to return scanCode from the next state transition.
607 // This gives callers a simple 1-byte undo mechanism.
608 func (s *scanner) undo(scanCode int) {
609 if s.redo {
610 panic("json: invalid use of scanner")
611 }
612 s.redoCode = scanCode
613 s.redoState = s.step
614 s.step = stateRedo
615 s.redo = true
616 }
617
618 // stateRedo helps implement the scanner's 1-byte undo.
619 func stateRedo(s *scanner, c int) int {
620 s.redo = false
621 s.step = s.redoState
622 return s.redoCode
623 }