Source file src/pkg/strconv/quote.go
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package strconv
6
7 import (
8 "unicode/utf8"
9 )
10
11 const lowerhex = "0123456789abcdef"
12
13 func quoteWith(s string, quote byte, ASCIIonly bool) string {
14 var runeTmp [utf8.UTFMax]byte
15 buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
16 buf = append(buf, quote)
17 for width := 0; len(s) > 0; s = s[width:] {
18 r := rune(s[0])
19 width = 1
20 if r >= utf8.RuneSelf {
21 r, width = utf8.DecodeRuneInString(s)
22 }
23 if width == 1 && r == utf8.RuneError {
24 buf = append(buf, `\x`...)
25 buf = append(buf, lowerhex[s[0]>>4])
26 buf = append(buf, lowerhex[s[0]&0xF])
27 continue
28 }
29 if r == rune(quote) || r == '\\' { // always backslashed
30 buf = append(buf, '\\')
31 buf = append(buf, byte(r))
32 continue
33 }
34 if ASCIIonly {
35 if r < utf8.RuneSelf && IsPrint(r) {
36 buf = append(buf, byte(r))
37 continue
38 }
39 } else if IsPrint(r) {
40 n := utf8.EncodeRune(runeTmp[:], r)
41 buf = append(buf, runeTmp[:n]...)
42 continue
43 }
44 switch r {
45 case '\a':
46 buf = append(buf, `\a`...)
47 case '\b':
48 buf = append(buf, `\b`...)
49 case '\f':
50 buf = append(buf, `\f`...)
51 case '\n':
52 buf = append(buf, `\n`...)
53 case '\r':
54 buf = append(buf, `\r`...)
55 case '\t':
56 buf = append(buf, `\t`...)
57 case '\v':
58 buf = append(buf, `\v`...)
59 default:
60 switch {
61 case r < ' ':
62 buf = append(buf, `\x`...)
63 buf = append(buf, lowerhex[s[0]>>4])
64 buf = append(buf, lowerhex[s[0]&0xF])
65 case r > utf8.MaxRune:
66 r = 0xFFFD
67 fallthrough
68 case r < 0x10000:
69 buf = append(buf, `\u`...)
70 for s := 12; s >= 0; s -= 4 {
71 buf = append(buf, lowerhex[r>>uint(s)&0xF])
72 }
73 default:
74 buf = append(buf, `\U`...)
75 for s := 28; s >= 0; s -= 4 {
76 buf = append(buf, lowerhex[r>>uint(s)&0xF])
77 }
78 }
79 }
80 }
81 buf = append(buf, quote)
82 return string(buf)
83
84 }
85
86 // Quote returns a double-quoted Go string literal representing s. The
87 // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
88 // control characters and non-printable characters as defined by
89 // IsPrint.
90 func Quote(s string) string {
91 return quoteWith(s, '"', false)
92 }
93
94 // AppendQuote appends a double-quoted Go string literal representing s,
95 // as generated by Quote, to dst and returns the extended buffer.
96 func AppendQuote(dst []byte, s string) []byte {
97 return append(dst, Quote(s)...)
98 }
99
100 // QuoteToASCII returns a double-quoted Go string literal representing s.
101 // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
102 // non-ASCII characters and non-printable characters as defined by IsPrint.
103 func QuoteToASCII(s string) string {
104 return quoteWith(s, '"', true)
105 }
106
107 // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
108 // as generated by QuoteToASCII, to dst and returns the extended buffer.
109 func AppendQuoteToASCII(dst []byte, s string) []byte {
110 return append(dst, QuoteToASCII(s)...)
111 }
112
113 // QuoteRune returns a single-quoted Go character literal representing the
114 // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
115 // for control characters and non-printable characters as defined by IsPrint.
116 func QuoteRune(r rune) string {
117 // TODO: avoid the allocation here.
118 return quoteWith(string(r), '\'', false)
119 }
120
121 // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
122 // as generated by QuoteRune, to dst and returns the extended buffer.
123 func AppendQuoteRune(dst []byte, r rune) []byte {
124 return append(dst, QuoteRune(r)...)
125 }
126
127 // QuoteRuneToASCII returns a single-quoted Go character literal representing
128 // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
129 // \u0100) for non-ASCII characters and non-printable characters as defined
130 // by IsPrint.
131 func QuoteRuneToASCII(r rune) string {
132 // TODO: avoid the allocation here.
133 return quoteWith(string(r), '\'', true)
134 }
135
136 // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
137 // as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
138 func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
139 return append(dst, QuoteRuneToASCII(r)...)
140 }
141
// CanBackquote reports whether the string s can be written unchanged as a
// Go raw string literal, that is, enclosed in backquotes.
//
// It returns false if s contains a backquote, a control character other than
// tab (DEL included), a byte order mark (U+FEFF), or a byte sequence that is
// not valid UTF-8. Go source text must be valid UTF-8 and a byte order mark
// may be disallowed inside it, so none of those can be relied on to survive
// verbatim between backquotes.
func CanBackquote(s string) bool {
	for len(s) > 0 {
		r, size := utf8.DecodeRuneInString(s)
		s = s[size:]
		if size > 1 {
			// A well-formed multibyte rune; only the BOM is rejected.
			if r == '\ufeff' {
				return false
			}
			continue
		}
		// size == 1 with RuneError means the byte was not valid UTF-8.
		if r == utf8.RuneError {
			return false
		}
		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
			return false
		}
	}
	return true
}
152
// unhex converts a single hexadecimal digit, in either letter case, to its
// numeric value. ok is false, and v is zero, when b is not a hex digit.
func unhex(b byte) (v rune, ok bool) {
	if '0' <= b && b <= '9' {
		return rune(b - '0'), true
	}
	// Setting bit 0x20 folds 'A'..'F' onto 'a'..'f' and maps no other byte
	// into that range.
	if lower := b | 0x20; 'a' <= lower && lower <= 'f' {
		return rune(lower-'a') + 10, true
	}
	return 0, false
}
165
166 // UnquoteChar decodes the first character or byte in the escaped string
167 // or character literal represented by the string s.
168 // It returns four values:
169 //
170 // 1) value, the decoded Unicode code point or byte value;
171 // 2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
172 // 3) tail, the remainder of the string after the character; and
173 // 4) an error that will be nil if the character is syntactically valid.
174 //
175 // The second argument, quote, specifies the type of literal being parsed
176 // and therefore which escaped quote character is permitted.
177 // If set to a single quote, it permits the sequence \' and disallows unescaped '.
178 // If set to a double quote, it permits \" and disallows unescaped ".
179 // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
180 func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
181 // easy cases
182 switch c := s[0]; {
183 case c == quote && (quote == '\'' || quote == '"'):
184 err = ErrSyntax
185 return
186 case c >= utf8.RuneSelf:
187 r, size := utf8.DecodeRuneInString(s)
188 return r, true, s[size:], nil
189 case c != '\\':
190 return rune(s[0]), false, s[1:], nil
191 }
192
193 // hard case: c is backslash
194 if len(s) <= 1 {
195 err = ErrSyntax
196 return
197 }
198 c := s[1]
199 s = s[2:]
200
201 switch c {
202 case 'a':
203 value = '\a'
204 case 'b':
205 value = '\b'
206 case 'f':
207 value = '\f'
208 case 'n':
209 value = '\n'
210 case 'r':
211 value = '\r'
212 case 't':
213 value = '\t'
214 case 'v':
215 value = '\v'
216 case 'x', 'u', 'U':
217 n := 0
218 switch c {
219 case 'x':
220 n = 2
221 case 'u':
222 n = 4
223 case 'U':
224 n = 8
225 }
226 var v rune
227 if len(s) < n {
228 err = ErrSyntax
229 return
230 }
231 for j := 0; j < n; j++ {
232 x, ok := unhex(s[j])
233 if !ok {
234 err = ErrSyntax
235 return
236 }
237 v = v<<4 | x
238 }
239 s = s[n:]
240 if c == 'x' {
241 // single-byte string, possibly not UTF-8
242 value = v
243 break
244 }
245 if v > utf8.MaxRune {
246 err = ErrSyntax
247 return
248 }
249 value = v
250 multibyte = true
251 case '0', '1', '2', '3', '4', '5', '6', '7':
252 v := rune(c) - '0'
253 if len(s) < 2 {
254 err = ErrSyntax
255 return
256 }
257 for j := 0; j < 2; j++ { // one digit already; two more
258 x := rune(s[j]) - '0'
259 if x < 0 || x > 7 {
260 err = ErrSyntax
261 return
262 }
263 v = (v << 3) | x
264 }
265 s = s[2:]
266 if v > 255 {
267 err = ErrSyntax
268 return
269 }
270 value = v
271 case '\\':
272 value = '\\'
273 case '\'', '"':
274 if c != quote {
275 err = ErrSyntax
276 return
277 }
278 value = rune(c)
279 default:
280 err = ErrSyntax
281 return
282 }
283 tail = s
284 return
285 }
286
287 // Unquote interprets s as a single-quoted, double-quoted,
288 // or backquoted Go string literal, returning the string value
289 // that s quotes. (If s is single-quoted, it would be a Go
290 // character literal; Unquote returns the corresponding
291 // one-character string.)
292 func Unquote(s string) (t string, err error) {
293 n := len(s)
294 if n < 2 {
295 return "", ErrSyntax
296 }
297 quote := s[0]
298 if quote != s[n-1] {
299 return "", ErrSyntax
300 }
301 s = s[1 : n-1]
302
303 if quote == '`' {
304 if contains(s, '`') {
305 return "", ErrSyntax
306 }
307 return s, nil
308 }
309 if quote != '"' && quote != '\'' {
310 return "", ErrSyntax
311 }
312 if contains(s, '\n') {
313 return "", ErrSyntax
314 }
315
316 // Is it trivial? Avoid allocation.
317 if !contains(s, '\\') && !contains(s, quote) {
318 switch quote {
319 case '"':
320 return s, nil
321 case '\'':
322 r, size := utf8.DecodeRuneInString(s)
323 if size == len(s) && (r != utf8.RuneError || size != 1) {
324 return s, nil
325 }
326 }
327 }
328
329 var runeTmp [utf8.UTFMax]byte
330 buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
331 for len(s) > 0 {
332 c, multibyte, ss, err := UnquoteChar(s, quote)
333 if err != nil {
334 return "", err
335 }
336 s = ss
337 if c < utf8.RuneSelf || !multibyte {
338 buf = append(buf, byte(c))
339 } else {
340 n := utf8.EncodeRune(runeTmp[:], c)
341 buf = append(buf, runeTmp[:n]...)
342 }
343 if quote == '\'' && len(s) != 0 {
344 // single-quoted must be single character
345 return "", ErrSyntax
346 }
347 }
348 return string(buf), nil
349 }
350
// contains reports whether the byte c occurs anywhere in s.
func contains(s string, c byte) bool {
	// Peel bytes off the front until a match is found or s is exhausted.
	for ; len(s) > 0; s = s[1:] {
		if s[0] == c {
			return true
		}
	}
	return false
}
360
// bsearch16 binary-searches a for x and returns the smallest index i with
// a[i] >= x, or len(a) when no element qualifies.
func bsearch16(a []uint16, x uint16) int {
	lo, hi := 0, len(a)
	for lo < hi {
		mid := lo + (hi-lo)/2
		if x <= a[mid] {
			// a[mid] qualifies; the answer is at mid or to its left.
			hi = mid
			continue
		}
		lo = mid + 1
	}
	return lo
}
375
// bsearch32 binary-searches a for x and returns the smallest index i with
// a[i] >= x, or len(a) when no element qualifies.
func bsearch32(a []uint32, x uint32) int {
	lo, hi := 0, len(a)
	for lo < hi {
		mid := lo + (hi-lo)/2
		if x <= a[mid] {
			// a[mid] qualifies; the answer is at mid or to its left.
			hi = mid
			continue
		}
		lo = mid + 1
	}
	return lo
}
390
391 // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
392 // to give the same answer. It allows this package not to depend on unicode,
393 // and therefore not pull in all the Unicode tables. If the linker were better
394 // at tossing unused tables, we could get rid of this implementation.
395 // That would be nice.
396
397 // IsPrint reports whether the rune is defined as printable by Go, with
398 // the same definition as unicode.IsPrint: letters, numbers, punctuation,
399 // symbols and ASCII space.
400 func IsPrint(r rune) bool {
401 // Fast check for Latin-1
402 if r <= 0xFF {
403 if 0x20 <= r && r <= 0x7E {
404 // All the ASCII is printable from space through DEL-1.
405 return true
406 }
407 if 0xA1 <= r && r <= 0xFF {
408 // Similarly for ¡ through ÿ...
409 return r != 0xAD // ...except for the bizarre soft hyphen.
410 }
411 return false
412 }
413
414 // Same algorithm, either on uint16 or uint32 value.
415 // First, find first i such that isPrint[i] >= x.
416 // This is the index of either the start or end of a pair that might span x.
417 // The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
418 // If we find x in a range, make sure x is not in isNotPrint list.
419
420 if 0 <= r && r < 1<<16 {
421 rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
422 i := bsearch16(isPrint, rr)
423 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
424 return false
425 }
426 j := bsearch16(isNotPrint, rr)
427 return j >= len(isNotPrint) || isNotPrint[j] != rr
428 }
429
430 rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
431 i := bsearch32(isPrint, rr)
432 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
433 return false
434 }
435 if r >= 0x20000 {
436 return true
437 }
438 r -= 0x10000
439 j := bsearch16(isNotPrint, uint16(r))
440 return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
441 }