src/pkg/strconv/quote.go - The Go Programming Language

Golang

Source file src/pkg/strconv/quote.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package strconv
     6	
     7	import (
     8		"unicode/utf8"
     9	)
    10	
// lowerhex is indexed by a 4-bit value (0-15) to obtain the corresponding
// lowercase hexadecimal digit.
const lowerhex = "0123456789abcdef"
    12	
    13	func quoteWith(s string, quote byte, ASCIIonly bool) string {
    14		var runeTmp [utf8.UTFMax]byte
    15		buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
    16		buf = append(buf, quote)
    17		for width := 0; len(s) > 0; s = s[width:] {
    18			r := rune(s[0])
    19			width = 1
    20			if r >= utf8.RuneSelf {
    21				r, width = utf8.DecodeRuneInString(s)
    22			}
    23			if width == 1 && r == utf8.RuneError {
    24				buf = append(buf, `\x`...)
    25				buf = append(buf, lowerhex[s[0]>>4])
    26				buf = append(buf, lowerhex[s[0]&0xF])
    27				continue
    28			}
    29			if r == rune(quote) || r == '\\' { // always backslashed
    30				buf = append(buf, '\\')
    31				buf = append(buf, byte(r))
    32				continue
    33			}
    34			if ASCIIonly {
    35				if r < utf8.RuneSelf && IsPrint(r) {
    36					buf = append(buf, byte(r))
    37					continue
    38				}
    39			} else if IsPrint(r) {
    40				n := utf8.EncodeRune(runeTmp[:], r)
    41				buf = append(buf, runeTmp[:n]...)
    42				continue
    43			}
    44			switch r {
    45			case '\a':
    46				buf = append(buf, `\a`...)
    47			case '\b':
    48				buf = append(buf, `\b`...)
    49			case '\f':
    50				buf = append(buf, `\f`...)
    51			case '\n':
    52				buf = append(buf, `\n`...)
    53			case '\r':
    54				buf = append(buf, `\r`...)
    55			case '\t':
    56				buf = append(buf, `\t`...)
    57			case '\v':
    58				buf = append(buf, `\v`...)
    59			default:
    60				switch {
    61				case r < ' ':
    62					buf = append(buf, `\x`...)
    63					buf = append(buf, lowerhex[s[0]>>4])
    64					buf = append(buf, lowerhex[s[0]&0xF])
    65				case r > utf8.MaxRune:
    66					r = 0xFFFD
    67					fallthrough
    68				case r < 0x10000:
    69					buf = append(buf, `\u`...)
    70					for s := 12; s >= 0; s -= 4 {
    71						buf = append(buf, lowerhex[r>>uint(s)&0xF])
    72					}
    73				default:
    74					buf = append(buf, `\U`...)
    75					for s := 28; s >= 0; s -= 4 {
    76						buf = append(buf, lowerhex[r>>uint(s)&0xF])
    77					}
    78				}
    79			}
    80		}
    81		buf = append(buf, quote)
    82		return string(buf)
    83	
    84	}
    85	
    86	// Quote returns a double-quoted Go string literal representing s.  The
    87	// returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
    88	// control characters and non-printable characters as defined by
    89	// IsPrint.
    90	func Quote(s string) string {
    91		return quoteWith(s, '"', false)
    92	}
    93	
    94	// AppendQuote appends a double-quoted Go string literal representing s,
    95	// as generated by Quote, to dst and returns the extended buffer.
    96	func AppendQuote(dst []byte, s string) []byte {
    97		return append(dst, Quote(s)...)
    98	}
    99	
   100	// QuoteToASCII returns a double-quoted Go string literal representing s.
   101	// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   102	// non-ASCII characters and non-printable characters as defined by IsPrint.
   103	func QuoteToASCII(s string) string {
   104		return quoteWith(s, '"', true)
   105	}
   106	
   107	// AppendQuoteToASCII appends a double-quoted Go string literal representing s,
   108	// as generated by QuoteToASCII, to dst and returns the extended buffer.
   109	func AppendQuoteToASCII(dst []byte, s string) []byte {
   110		return append(dst, QuoteToASCII(s)...)
   111	}
   112	
   113	// QuoteRune returns a single-quoted Go character literal representing the
   114	// rune.  The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
   115	// for control characters and non-printable characters as defined by IsPrint.
   116	func QuoteRune(r rune) string {
   117		// TODO: avoid the allocation here.
   118		return quoteWith(string(r), '\'', false)
   119	}
   120	
   121	// AppendQuoteRune appends a single-quoted Go character literal representing the rune,
   122	// as generated by QuoteRune, to dst and returns the extended buffer.
   123	func AppendQuoteRune(dst []byte, r rune) []byte {
   124		return append(dst, QuoteRune(r)...)
   125	}
   126	
   127	// QuoteRuneToASCII returns a single-quoted Go character literal representing
   128	// the rune.  The returned string uses Go escape sequences (\t, \n, \xFF,
   129	// \u0100) for non-ASCII characters and non-printable characters as defined
   130	// by IsPrint.
   131	func QuoteRuneToASCII(r rune) string {
   132		// TODO: avoid the allocation here.
   133		return quoteWith(string(r), '\'', true)
   134	}
   135	
   136	// AppendQuoteRune appends a single-quoted Go character literal representing the rune,
   137	// as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
   138	func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
   139		return append(dst, QuoteRuneToASCII(r)...)
   140	}
   141	
   142	// CanBackquote returns whether the string s would be
   143	// a valid Go string literal if enclosed in backquotes.
   144	func CanBackquote(s string) bool {
   145		for i := 0; i < len(s); i++ {
   146			if (s[i] < ' ' && s[i] != '\t') || s[i] == '`' {
   147				return false
   148			}
   149		}
   150		return true
   151	}
   152	
   153	func unhex(b byte) (v rune, ok bool) {
   154		c := rune(b)
   155		switch {
   156		case '0' <= c && c <= '9':
   157			return c - '0', true
   158		case 'a' <= c && c <= 'f':
   159			return c - 'a' + 10, true
   160		case 'A' <= c && c <= 'F':
   161			return c - 'A' + 10, true
   162		}
   163		return
   164	}
   165	
   166	// UnquoteChar decodes the first character or byte in the escaped string
   167	// or character literal represented by the string s.
   168	// It returns four values:
   169	//
   170	//	1) value, the decoded Unicode code point or byte value;
   171	//	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   172	//	3) tail, the remainder of the string after the character; and
   173	//	4) an error that will be nil if the character is syntactically valid.
   174	//
   175	// The second argument, quote, specifies the type of literal being parsed
   176	// and therefore which escaped quote character is permitted.
   177	// If set to a single quote, it permits the sequence \' and disallows unescaped '.
   178	// If set to a double quote, it permits \" and disallows unescaped ".
   179	// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
   180	func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
   181		// easy cases
   182		switch c := s[0]; {
   183		case c == quote && (quote == '\'' || quote == '"'):
   184			err = ErrSyntax
   185			return
   186		case c >= utf8.RuneSelf:
   187			r, size := utf8.DecodeRuneInString(s)
   188			return r, true, s[size:], nil
   189		case c != '\\':
   190			return rune(s[0]), false, s[1:], nil
   191		}
   192	
   193		// hard case: c is backslash
   194		if len(s) <= 1 {
   195			err = ErrSyntax
   196			return
   197		}
   198		c := s[1]
   199		s = s[2:]
   200	
   201		switch c {
   202		case 'a':
   203			value = '\a'
   204		case 'b':
   205			value = '\b'
   206		case 'f':
   207			value = '\f'
   208		case 'n':
   209			value = '\n'
   210		case 'r':
   211			value = '\r'
   212		case 't':
   213			value = '\t'
   214		case 'v':
   215			value = '\v'
   216		case 'x', 'u', 'U':
   217			n := 0
   218			switch c {
   219			case 'x':
   220				n = 2
   221			case 'u':
   222				n = 4
   223			case 'U':
   224				n = 8
   225			}
   226			var v rune
   227			if len(s) < n {
   228				err = ErrSyntax
   229				return
   230			}
   231			for j := 0; j < n; j++ {
   232				x, ok := unhex(s[j])
   233				if !ok {
   234					err = ErrSyntax
   235					return
   236				}
   237				v = v<<4 | x
   238			}
   239			s = s[n:]
   240			if c == 'x' {
   241				// single-byte string, possibly not UTF-8
   242				value = v
   243				break
   244			}
   245			if v > utf8.MaxRune {
   246				err = ErrSyntax
   247				return
   248			}
   249			value = v
   250			multibyte = true
   251		case '0', '1', '2', '3', '4', '5', '6', '7':
   252			v := rune(c) - '0'
   253			if len(s) < 2 {
   254				err = ErrSyntax
   255				return
   256			}
   257			for j := 0; j < 2; j++ { // one digit already; two more
   258				x := rune(s[j]) - '0'
   259				if x < 0 || x > 7 {
   260					err = ErrSyntax
   261					return
   262				}
   263				v = (v << 3) | x
   264			}
   265			s = s[2:]
   266			if v > 255 {
   267				err = ErrSyntax
   268				return
   269			}
   270			value = v
   271		case '\\':
   272			value = '\\'
   273		case '\'', '"':
   274			if c != quote {
   275				err = ErrSyntax
   276				return
   277			}
   278			value = rune(c)
   279		default:
   280			err = ErrSyntax
   281			return
   282		}
   283		tail = s
   284		return
   285	}
   286	
   287	// Unquote interprets s as a single-quoted, double-quoted,
   288	// or backquoted Go string literal, returning the string value
   289	// that s quotes.  (If s is single-quoted, it would be a Go
   290	// character literal; Unquote returns the corresponding
   291	// one-character string.)
   292	func Unquote(s string) (t string, err error) {
   293		n := len(s)
   294		if n < 2 {
   295			return "", ErrSyntax
   296		}
   297		quote := s[0]
   298		if quote != s[n-1] {
   299			return "", ErrSyntax
   300		}
   301		s = s[1 : n-1]
   302	
   303		if quote == '`' {
   304			if contains(s, '`') {
   305				return "", ErrSyntax
   306			}
   307			return s, nil
   308		}
   309		if quote != '"' && quote != '\'' {
   310			return "", ErrSyntax
   311		}
   312		if contains(s, '\n') {
   313			return "", ErrSyntax
   314		}
   315	
   316		// Is it trivial?  Avoid allocation.
   317		if !contains(s, '\\') && !contains(s, quote) {
   318			switch quote {
   319			case '"':
   320				return s, nil
   321			case '\'':
   322				r, size := utf8.DecodeRuneInString(s)
   323				if size == len(s) && (r != utf8.RuneError || size != 1) {
   324					return s, nil
   325				}
   326			}
   327		}
   328	
   329		var runeTmp [utf8.UTFMax]byte
   330		buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
   331		for len(s) > 0 {
   332			c, multibyte, ss, err := UnquoteChar(s, quote)
   333			if err != nil {
   334				return "", err
   335			}
   336			s = ss
   337			if c < utf8.RuneSelf || !multibyte {
   338				buf = append(buf, byte(c))
   339			} else {
   340				n := utf8.EncodeRune(runeTmp[:], c)
   341				buf = append(buf, runeTmp[:n]...)
   342			}
   343			if quote == '\'' && len(s) != 0 {
   344				// single-quoted must be single character
   345				return "", ErrSyntax
   346			}
   347		}
   348		return string(buf), nil
   349	}
   350	
   351	// contains reports whether the string contains the byte c.
   352	func contains(s string, c byte) bool {
   353		for i := 0; i < len(s); i++ {
   354			if s[i] == c {
   355				return true
   356			}
   357		}
   358		return false
   359	}
   360	
   361	// bsearch16 returns the smallest i such that a[i] >= x.
   362	// If there is no such i, bsearch16 returns len(a).
   363	func bsearch16(a []uint16, x uint16) int {
   364		i, j := 0, len(a)
   365		for i < j {
   366			h := i + (j-i)/2
   367			if a[h] < x {
   368				i = h + 1
   369			} else {
   370				j = h
   371			}
   372		}
   373		return i
   374	}
   375	
   376	// bsearch32 returns the smallest i such that a[i] >= x.
   377	// If there is no such i, bsearch32 returns len(a).
   378	func bsearch32(a []uint32, x uint32) int {
   379		i, j := 0, len(a)
   380		for i < j {
   381			h := i + (j-i)/2
   382			if a[h] < x {
   383				i = h + 1
   384			} else {
   385				j = h
   386			}
   387		}
   388		return i
   389	}
   390	
   391	// TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
   392	// to give the same answer. It allows this package not to depend on unicode,
   393	// and therefore not pull in all the Unicode tables. If the linker were better
   394	// at tossing unused tables, we could get rid of this implementation.
   395	// That would be nice.
   396	
   397	// IsPrint reports whether the rune is defined as printable by Go, with
   398	// the same definition as unicode.IsPrint: letters, numbers, punctuation,
   399	// symbols and ASCII space.
   400	func IsPrint(r rune) bool {
   401		// Fast check for Latin-1
   402		if r <= 0xFF {
   403			if 0x20 <= r && r <= 0x7E {
   404				// All the ASCII is printable from space through DEL-1.
   405				return true
   406			}
   407			if 0xA1 <= r && r <= 0xFF {
   408				// Similarly for ¡ through ÿ...
   409				return r != 0xAD // ...except for the bizarre soft hyphen.
   410			}
   411			return false
   412		}
   413	
   414		// Same algorithm, either on uint16 or uint32 value.
   415		// First, find first i such that isPrint[i] >= x.
   416		// This is the index of either the start or end of a pair that might span x.
   417		// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
   418		// If we find x in a range, make sure x is not in isNotPrint list.
   419	
   420		if 0 <= r && r < 1<<16 {
   421			rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
   422			i := bsearch16(isPrint, rr)
   423			if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   424				return false
   425			}
   426			j := bsearch16(isNotPrint, rr)
   427			return j >= len(isNotPrint) || isNotPrint[j] != rr
   428		}
   429	
   430		rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
   431		i := bsearch32(isPrint, rr)
   432		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   433			return false
   434		}
   435		if r >= 0x20000 {
   436			return true
   437		}
   438		r -= 0x10000
   439		j := bsearch16(isNotPrint, uint16(r))
   440		return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
   441	}