Source file src/pkg/bytes/bytes.go
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package bytes implements functions for the manipulation of byte slices.
6 // It is analogous to the facilities of the strings package.
7 package bytes
8
9 import (
10 "unicode"
11 "unicode/utf8"
12 )
13
14 // Compare returns an integer comparing the two byte arrays lexicographically.
15 // The result will be 0 if a==b, -1 if a < b, and +1 if a > b
16 // A nil argument is equivalent to an empty slice.
17 func Compare(a, b []byte) int {
18 m := len(a)
19 if m > len(b) {
20 m = len(b)
21 }
22 for i, ac := range a[0:m] {
23 bc := b[i]
24 switch {
25 case ac > bc:
26 return 1
27 case ac < bc:
28 return -1
29 }
30 }
31 switch {
32 case len(a) < len(b):
33 return -1
34 case len(a) > len(b):
35 return 1
36 }
37 return 0
38 }
39
40 // Equal returns a boolean reporting whether a == b.
41 // A nil argument is equivalent to an empty slice.
42 func Equal(a, b []byte) bool
43
44 func equalPortable(a, b []byte) bool {
45 if len(a) != len(b) {
46 return false
47 }
48 for i, c := range a {
49 if c != b[i] {
50 return false
51 }
52 }
53 return true
54 }
55
56 // explode splits s into an array of UTF-8 sequences, one per Unicode character (still arrays of bytes),
57 // up to a maximum of n byte arrays. Invalid UTF-8 sequences are chopped into individual bytes.
58 func explode(s []byte, n int) [][]byte {
59 if n <= 0 {
60 n = len(s)
61 }
62 a := make([][]byte, n)
63 var size int
64 na := 0
65 for len(s) > 0 {
66 if na+1 >= n {
67 a[na] = s
68 na++
69 break
70 }
71 _, size = utf8.DecodeRune(s)
72 a[na] = s[0:size]
73 s = s[size:]
74 na++
75 }
76 return a[0:na]
77 }
78
79 // Count counts the number of non-overlapping instances of sep in s.
80 func Count(s, sep []byte) int {
81 n := len(sep)
82 if n == 0 {
83 return utf8.RuneCount(s) + 1
84 }
85 if n > len(s) {
86 return 0
87 }
88 count := 0
89 c := sep[0]
90 i := 0
91 t := s[:len(s)-n+1]
92 for i < len(t) {
93 if t[i] != c {
94 o := IndexByte(t[i:], c)
95 if o < 0 {
96 break
97 }
98 i += o
99 }
100 if n == 1 || Equal(s[i:i+n], sep) {
101 count++
102 i += n
103 continue
104 }
105 i++
106 }
107 return count
108 }
109
110 // Contains returns whether subslice is within b.
111 func Contains(b, subslice []byte) bool {
112 return Index(b, subslice) != -1
113 }
114
115 // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
116 func Index(s, sep []byte) int {
117 n := len(sep)
118 if n == 0 {
119 return 0
120 }
121 if n > len(s) {
122 return -1
123 }
124 c := sep[0]
125 if n == 1 {
126 return IndexByte(s, c)
127 }
128 i := 0
129 t := s[:len(s)-n+1]
130 for i < len(t) {
131 if t[i] != c {
132 o := IndexByte(t[i:], c)
133 if o < 0 {
134 break
135 }
136 i += o
137 }
138 if Equal(s[i:i+n], sep) {
139 return i
140 }
141 i++
142 }
143 return -1
144 }
145
146 func indexBytePortable(s []byte, c byte) int {
147 for i, b := range s {
148 if b == c {
149 return i
150 }
151 }
152 return -1
153 }
154
155 // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
156 func LastIndex(s, sep []byte) int {
157 n := len(sep)
158 if n == 0 {
159 return len(s)
160 }
161 c := sep[0]
162 for i := len(s) - n; i >= 0; i-- {
163 if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) {
164 return i
165 }
166 }
167 return -1
168 }
169
170 // IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points.
171 // It returns the byte index of the first occurrence in s of the given rune.
172 // It returns -1 if rune is not present in s.
173 func IndexRune(s []byte, r rune) int {
174 for i := 0; i < len(s); {
175 r1, size := utf8.DecodeRune(s[i:])
176 if r == r1 {
177 return i
178 }
179 i += size
180 }
181 return -1
182 }
183
184 // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
185 // It returns the byte index of the first occurrence in s of any of the Unicode
186 // code points in chars. It returns -1 if chars is empty or if there is no code
187 // point in common.
188 func IndexAny(s []byte, chars string) int {
189 if len(chars) > 0 {
190 var r rune
191 var width int
192 for i := 0; i < len(s); i += width {
193 r = rune(s[i])
194 if r < utf8.RuneSelf {
195 width = 1
196 } else {
197 r, width = utf8.DecodeRune(s[i:])
198 }
199 for _, ch := range chars {
200 if r == ch {
201 return i
202 }
203 }
204 }
205 }
206 return -1
207 }
208
209 // LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
210 // points. It returns the byte index of the last occurrence in s of any of
211 // the Unicode code points in chars. It returns -1 if chars is empty or if
212 // there is no code point in common.
213 func LastIndexAny(s []byte, chars string) int {
214 if len(chars) > 0 {
215 for i := len(s); i > 0; {
216 r, size := utf8.DecodeLastRune(s[0:i])
217 i -= size
218 for _, ch := range chars {
219 if r == ch {
220 return i
221 }
222 }
223 }
224 }
225 return -1
226 }
227
228 // Generic split: splits after each instance of sep,
229 // including sepSave bytes of sep in the subarrays.
230 func genSplit(s, sep []byte, sepSave, n int) [][]byte {
231 if n == 0 {
232 return nil
233 }
234 if len(sep) == 0 {
235 return explode(s, n)
236 }
237 if n < 0 {
238 n = Count(s, sep) + 1
239 }
240 c := sep[0]
241 start := 0
242 a := make([][]byte, n)
243 na := 0
244 for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ {
245 if s[i] == c && (len(sep) == 1 || Equal(s[i:i+len(sep)], sep)) {
246 a[na] = s[start : i+sepSave]
247 na++
248 start = i + len(sep)
249 i += len(sep) - 1
250 }
251 }
252 a[na] = s[start:]
253 return a[0 : na+1]
254 }
255
256 // SplitN slices s into subslices separated by sep and returns a slice of
257 // the subslices between those separators.
258 // If sep is empty, SplitN splits after each UTF-8 sequence.
259 // The count determines the number of subslices to return:
260 // n > 0: at most n subslices; the last subslice will be the unsplit remainder.
261 // n == 0: the result is nil (zero subslices)
262 // n < 0: all subslices
263 func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
264
265 // SplitAfterN slices s into subslices after each instance of sep and
266 // returns a slice of those subslices.
267 // If sep is empty, SplitAfterN splits after each UTF-8 sequence.
268 // The count determines the number of subslices to return:
269 // n > 0: at most n subslices; the last subslice will be the unsplit remainder.
270 // n == 0: the result is nil (zero subslices)
271 // n < 0: all subslices
272 func SplitAfterN(s, sep []byte, n int) [][]byte {
273 return genSplit(s, sep, len(sep), n)
274 }
275
276 // Split slices s into all subslices separated by sep and returns a slice of
277 // the subslices between those separators.
278 // If sep is empty, Split splits after each UTF-8 sequence.
279 // It is equivalent to SplitN with a count of -1.
280 func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
281
282 // SplitAfter slices s into all subslices after each instance of sep and
283 // returns a slice of those subslices.
284 // If sep is empty, SplitAfter splits after each UTF-8 sequence.
285 // It is equivalent to SplitAfterN with a count of -1.
286 func SplitAfter(s, sep []byte) [][]byte {
287 return genSplit(s, sep, len(sep), -1)
288 }
289
290 // Fields splits the array s around each instance of one or more consecutive white space
291 // characters, returning a slice of subarrays of s or an empty list if s contains only white space.
292 func Fields(s []byte) [][]byte {
293 return FieldsFunc(s, unicode.IsSpace)
294 }
295
296 // FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
297 // It splits the array s at each run of code points c satisfying f(c) and
298 // returns a slice of subarrays of s. If no code points in s satisfy f(c), an
299 // empty slice is returned.
300 func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
301 n := 0
302 inField := false
303 for i := 0; i < len(s); {
304 r, size := utf8.DecodeRune(s[i:])
305 wasInField := inField
306 inField = !f(r)
307 if inField && !wasInField {
308 n++
309 }
310 i += size
311 }
312
313 a := make([][]byte, n)
314 na := 0
315 fieldStart := -1
316 for i := 0; i <= len(s) && na < n; {
317 r, size := utf8.DecodeRune(s[i:])
318 if fieldStart < 0 && size > 0 && !f(r) {
319 fieldStart = i
320 i += size
321 continue
322 }
323 if fieldStart >= 0 && (size == 0 || f(r)) {
324 a[na] = s[fieldStart:i]
325 na++
326 fieldStart = -1
327 }
328 if size == 0 {
329 break
330 }
331 i += size
332 }
333 return a[0:na]
334 }
335
336 // Join concatenates the elements of a to create a single byte array. The separator
337 // sep is placed between elements in the resulting array.
338 func Join(a [][]byte, sep []byte) []byte {
339 if len(a) == 0 {
340 return []byte{}
341 }
342 if len(a) == 1 {
343 return a[0]
344 }
345 n := len(sep) * (len(a) - 1)
346 for i := 0; i < len(a); i++ {
347 n += len(a[i])
348 }
349
350 b := make([]byte, n)
351 bp := copy(b, a[0])
352 for _, s := range a[1:] {
353 bp += copy(b[bp:], sep)
354 bp += copy(b[bp:], s)
355 }
356 return b
357 }
358
359 // HasPrefix tests whether the byte array s begins with prefix.
360 func HasPrefix(s, prefix []byte) bool {
361 return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
362 }
363
364 // HasSuffix tests whether the byte array s ends with suffix.
365 func HasSuffix(s, suffix []byte) bool {
366 return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
367 }
368
369 // Map returns a copy of the byte array s with all its characters modified
370 // according to the mapping function. If mapping returns a negative value, the character is
371 // dropped from the string with no replacement. The characters in s and the
372 // output are interpreted as UTF-8-encoded Unicode code points.
373 func Map(mapping func(r rune) rune, s []byte) []byte {
374 // In the worst case, the array can grow when mapped, making
375 // things unpleasant. But it's so rare we barge in assuming it's
376 // fine. It could also shrink but that falls out naturally.
377 maxbytes := len(s) // length of b
378 nbytes := 0 // number of bytes encoded in b
379 b := make([]byte, maxbytes)
380 for i := 0; i < len(s); {
381 wid := 1
382 r := rune(s[i])
383 if r >= utf8.RuneSelf {
384 r, wid = utf8.DecodeRune(s[i:])
385 }
386 r = mapping(r)
387 if r >= 0 {
388 if nbytes+utf8.RuneLen(r) > maxbytes {
389 // Grow the buffer.
390 maxbytes = maxbytes*2 + utf8.UTFMax
391 nb := make([]byte, maxbytes)
392 copy(nb, b[0:nbytes])
393 b = nb
394 }
395 nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
396 }
397 i += wid
398 }
399 return b[0:nbytes]
400 }
401
402 // Repeat returns a new byte slice consisting of count copies of b.
403 func Repeat(b []byte, count int) []byte {
404 nb := make([]byte, len(b)*count)
405 bp := 0
406 for i := 0; i < count; i++ {
407 for j := 0; j < len(b); j++ {
408 nb[bp] = b[j]
409 bp++
410 }
411 }
412 return nb
413 }
414
415 // ToUpper returns a copy of the byte array s with all Unicode letters mapped to their upper case.
416 func ToUpper(s []byte) []byte { return Map(unicode.ToUpper, s) }
417
418 // ToUpper returns a copy of the byte array s with all Unicode letters mapped to their lower case.
419 func ToLower(s []byte) []byte { return Map(unicode.ToLower, s) }
420
421 // ToTitle returns a copy of the byte array s with all Unicode letters mapped to their title case.
422 func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
423
424 // ToUpperSpecial returns a copy of the byte array s with all Unicode letters mapped to their
425 // upper case, giving priority to the special casing rules.
426 func ToUpperSpecial(_case unicode.SpecialCase, s []byte) []byte {
427 return Map(func(r rune) rune { return _case.ToUpper(r) }, s)
428 }
429
430 // ToLowerSpecial returns a copy of the byte array s with all Unicode letters mapped to their
431 // lower case, giving priority to the special casing rules.
432 func ToLowerSpecial(_case unicode.SpecialCase, s []byte) []byte {
433 return Map(func(r rune) rune { return _case.ToLower(r) }, s)
434 }
435
436 // ToTitleSpecial returns a copy of the byte array s with all Unicode letters mapped to their
437 // title case, giving priority to the special casing rules.
438 func ToTitleSpecial(_case unicode.SpecialCase, s []byte) []byte {
439 return Map(func(r rune) rune { return _case.ToTitle(r) }, s)
440 }
441
442 // isSeparator reports whether the rune could mark a word boundary.
443 // TODO: update when package unicode captures more of the properties.
444 func isSeparator(r rune) bool {
445 // ASCII alphanumerics and underscore are not separators
446 if r <= 0x7F {
447 switch {
448 case '0' <= r && r <= '9':
449 return false
450 case 'a' <= r && r <= 'z':
451 return false
452 case 'A' <= r && r <= 'Z':
453 return false
454 case r == '_':
455 return false
456 }
457 return true
458 }
459 // Letters and digits are not separators
460 if unicode.IsLetter(r) || unicode.IsDigit(r) {
461 return false
462 }
463 // Otherwise, all we can do for now is treat spaces as separators.
464 return unicode.IsSpace(r)
465 }
466
467 // BUG(r): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
468
469 // Title returns a copy of s with all Unicode letters that begin words
470 // mapped to their title case.
471 func Title(s []byte) []byte {
472 // Use a closure here to remember state.
473 // Hackish but effective. Depends on Map scanning in order and calling
474 // the closure once per rune.
475 prev := ' '
476 return Map(
477 func(r rune) rune {
478 if isSeparator(prev) {
479 prev = r
480 return unicode.ToTitle(r)
481 }
482 prev = r
483 return r
484 },
485 s)
486 }
487
488 // TrimLeftFunc returns a subslice of s by slicing off all leading UTF-8-encoded
489 // Unicode code points c that satisfy f(c).
490 func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
491 i := indexFunc(s, f, false)
492 if i == -1 {
493 return nil
494 }
495 return s[i:]
496 }
497
498 // TrimRightFunc returns a subslice of s by slicing off all trailing UTF-8
499 // encoded Unicode code points c that satisfy f(c).
500 func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
501 i := lastIndexFunc(s, f, false)
502 if i >= 0 && s[i] >= utf8.RuneSelf {
503 _, wid := utf8.DecodeRune(s[i:])
504 i += wid
505 } else {
506 i++
507 }
508 return s[0:i]
509 }
510
511 // TrimFunc returns a subslice of s by slicing off all leading and trailing
512 // UTF-8-encoded Unicode code points c that satisfy f(c).
513 func TrimFunc(s []byte, f func(r rune) bool) []byte {
514 return TrimRightFunc(TrimLeftFunc(s, f), f)
515 }
516
517 // IndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
518 // It returns the byte index in s of the first Unicode
519 // code point satisfying f(c), or -1 if none do.
520 func IndexFunc(s []byte, f func(r rune) bool) int {
521 return indexFunc(s, f, true)
522 }
523
524 // LastIndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
525 // It returns the byte index in s of the last Unicode
526 // code point satisfying f(c), or -1 if none do.
527 func LastIndexFunc(s []byte, f func(r rune) bool) int {
528 return lastIndexFunc(s, f, true)
529 }
530
531 // indexFunc is the same as IndexFunc except that if
532 // truth==false, the sense of the predicate function is
533 // inverted.
534 func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
535 start := 0
536 for start < len(s) {
537 wid := 1
538 r := rune(s[start])
539 if r >= utf8.RuneSelf {
540 r, wid = utf8.DecodeRune(s[start:])
541 }
542 if f(r) == truth {
543 return start
544 }
545 start += wid
546 }
547 return -1
548 }
549
550 // lastIndexFunc is the same as LastIndexFunc except that if
551 // truth==false, the sense of the predicate function is
552 // inverted.
553 func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
554 for i := len(s); i > 0; {
555 r, size := utf8.DecodeLastRune(s[0:i])
556 i -= size
557 if f(r) == truth {
558 return i
559 }
560 }
561 return -1
562 }
563
564 func makeCutsetFunc(cutset string) func(r rune) bool {
565 return func(r rune) bool {
566 for _, c := range cutset {
567 if c == r {
568 return true
569 }
570 }
571 return false
572 }
573 }
574
575 // Trim returns a subslice of s by slicing off all leading and
576 // trailing UTF-8-encoded Unicode code points contained in cutset.
577 func Trim(s []byte, cutset string) []byte {
578 return TrimFunc(s, makeCutsetFunc(cutset))
579 }
580
581 // TrimLeft returns a subslice of s by slicing off all leading
582 // UTF-8-encoded Unicode code points contained in cutset.
583 func TrimLeft(s []byte, cutset string) []byte {
584 return TrimLeftFunc(s, makeCutsetFunc(cutset))
585 }
586
587 // TrimRight returns a subslice of s by slicing off all trailing
588 // UTF-8-encoded Unicode code points that are contained in cutset.
589 func TrimRight(s []byte, cutset string) []byte {
590 return TrimRightFunc(s, makeCutsetFunc(cutset))
591 }
592
593 // TrimSpace returns a subslice of s by slicing off all leading and
594 // trailing white space, as defined by Unicode.
595 func TrimSpace(s []byte) []byte {
596 return TrimFunc(s, unicode.IsSpace)
597 }
598
599 // Runes returns a slice of runes (Unicode code points) equivalent to s.
600 func Runes(s []byte) []rune {
601 t := make([]rune, utf8.RuneCount(s))
602 i := 0
603 for len(s) > 0 {
604 r, l := utf8.DecodeRune(s)
605 t[i] = r
606 i++
607 s = s[l:]
608 }
609 return t
610 }
611
612 // Replace returns a copy of the slice s with the first n
613 // non-overlapping instances of old replaced by new.
614 // If n < 0, there is no limit on the number of replacements.
615 func Replace(s, old, new []byte, n int) []byte {
616 m := 0
617 if n != 0 {
618 // Compute number of replacements.
619 m = Count(s, old)
620 }
621 if m == 0 {
622 // Nothing to do. Just copy.
623 t := make([]byte, len(s))
624 copy(t, s)
625 return t
626 }
627 if n < 0 || m < n {
628 n = m
629 }
630
631 // Apply replacements to buffer.
632 t := make([]byte, len(s)+n*(len(new)-len(old)))
633 w := 0
634 start := 0
635 for i := 0; i < n; i++ {
636 j := start
637 if len(old) == 0 {
638 if i > 0 {
639 _, wid := utf8.DecodeRune(s[start:])
640 j += wid
641 }
642 } else {
643 j += Index(s[start:], old)
644 }
645 w += copy(t[w:], s[start:j])
646 w += copy(t[w:], new)
647 start = j + len(old)
648 }
649 w += copy(t[w:], s[start:])
650 return t[0:w]
651 }
652
653 // EqualFold reports whether s and t, interpreted as UTF-8 strings,
654 // are equal under Unicode case-folding.
655 func EqualFold(s, t []byte) bool {
656 for len(s) != 0 && len(t) != 0 {
657 // Extract first rune from each.
658 var sr, tr rune
659 if s[0] < utf8.RuneSelf {
660 sr, s = rune(s[0]), s[1:]
661 } else {
662 r, size := utf8.DecodeRune(s)
663 sr, s = r, s[size:]
664 }
665 if t[0] < utf8.RuneSelf {
666 tr, t = rune(t[0]), t[1:]
667 } else {
668 r, size := utf8.DecodeRune(t)
669 tr, t = r, t[size:]
670 }
671
672 // If they match, keep going; if not, return false.
673
674 // Easy case.
675 if tr == sr {
676 continue
677 }
678
679 // Make sr < tr to simplify what follows.
680 if tr < sr {
681 tr, sr = sr, tr
682 }
683 // Fast check for ASCII.
684 if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
685 // ASCII, and sr is upper case. tr must be lower case.
686 if tr == sr+'a'-'A' {
687 continue
688 }
689 return false
690 }
691
692 // General case. SimpleFold(x) returns the next equivalent rune > x
693 // or wraps around to smaller values.
694 r := unicode.SimpleFold(sr)
695 for r != sr && r < tr {
696 r = unicode.SimpleFold(r)
697 }
698 if r == tr {
699 continue
700 }
701 return false
702 }
703
704 // One string is empty. Are both?
705 return len(s) == len(t)
706 }