Source file src/pkg/bytes/bytes.go
1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package bytes implements functions for the manipulation of byte slices. 6 // It is analogous to the facilities of the strings package. 7 package bytes 8 9 import ( 10 "unicode" 11 "unicode/utf8" 12 ) 13 14 // Compare returns an integer comparing the two byte arrays lexicographically. 15 // The result will be 0 if a==b, -1 if a < b, and +1 if a > b 16 // A nil argument is equivalent to an empty slice. 17 func Compare(a, b []byte) int { 18 m := len(a) 19 if m > len(b) { 20 m = len(b) 21 } 22 for i, ac := range a[0:m] { 23 bc := b[i] 24 switch { 25 case ac > bc: 26 return 1 27 case ac < bc: 28 return -1 29 } 30 } 31 switch { 32 case len(a) < len(b): 33 return -1 34 case len(a) > len(b): 35 return 1 36 } 37 return 0 38 } 39 40 // Equal returns a boolean reporting whether a == b. 41 // A nil argument is equivalent to an empty slice. 42 func Equal(a, b []byte) bool 43 44 func equalPortable(a, b []byte) bool { 45 if len(a) != len(b) { 46 return false 47 } 48 for i, c := range a { 49 if c != b[i] { 50 return false 51 } 52 } 53 return true 54 } 55 56 // explode splits s into an array of UTF-8 sequences, one per Unicode character (still arrays of bytes), 57 // up to a maximum of n byte arrays. Invalid UTF-8 sequences are chopped into individual bytes. 58 func explode(s []byte, n int) [][]byte { 59 if n <= 0 { 60 n = len(s) 61 } 62 a := make([][]byte, n) 63 var size int 64 na := 0 65 for len(s) > 0 { 66 if na+1 >= n { 67 a[na] = s 68 na++ 69 break 70 } 71 _, size = utf8.DecodeRune(s) 72 a[na] = s[0:size] 73 s = s[size:] 74 na++ 75 } 76 return a[0:na] 77 } 78 79 // Count counts the number of non-overlapping instances of sep in s. 80 func Count(s, sep []byte) int { 81 n := len(sep) 82 if n == 0 { 83 return utf8.RuneCount(s) + 1 84 } 85 if n > len(s) { 86 return 0 87 } 88 count := 0 89 c := sep[0] 90 i := 0 91 t := s[:len(s)-n+1] 92 for i < len(t) { 93 if t[i] != c { 94 o := IndexByte(t[i:], c) 95 if o < 0 { 96 break 97 } 98 i += o 99 } 100 if n == 1 || Equal(s[i:i+n], sep) { 101 count++ 102 i += n 103 continue 104 } 105 i++ 106 } 107 return count 108 } 109 110 // Contains returns whether subslice is within b. 111 func Contains(b, subslice []byte) bool { 112 return Index(b, subslice) != -1 113 } 114 115 // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. 116 func Index(s, sep []byte) int { 117 n := len(sep) 118 if n == 0 { 119 return 0 120 } 121 if n > len(s) { 122 return -1 123 } 124 c := sep[0] 125 if n == 1 { 126 return IndexByte(s, c) 127 } 128 i := 0 129 t := s[:len(s)-n+1] 130 for i < len(t) { 131 if t[i] != c { 132 o := IndexByte(t[i:], c) 133 if o < 0 { 134 break 135 } 136 i += o 137 } 138 if Equal(s[i:i+n], sep) { 139 return i 140 } 141 i++ 142 } 143 return -1 144 } 145 146 func indexBytePortable(s []byte, c byte) int { 147 for i, b := range s { 148 if b == c { 149 return i 150 } 151 } 152 return -1 153 } 154 155 // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s. 156 func LastIndex(s, sep []byte) int { 157 n := len(sep) 158 if n == 0 { 159 return len(s) 160 } 161 c := sep[0] 162 for i := len(s) - n; i >= 0; i-- { 163 if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) { 164 return i 165 } 166 } 167 return -1 168 } 169 170 // IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points. 171 // It returns the byte index of the first occurrence in s of the given rune. 172 // It returns -1 if rune is not present in s. 173 func IndexRune(s []byte, r rune) int { 174 for i := 0; i < len(s); { 175 r1, size := utf8.DecodeRune(s[i:]) 176 if r == r1 { 177 return i 178 } 179 i += size 180 } 181 return -1 182 } 183 184 // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points. 185 // It returns the byte index of the first occurrence in s of any of the Unicode 186 // code points in chars. It returns -1 if chars is empty or if there is no code 187 // point in common. 188 func IndexAny(s []byte, chars string) int { 189 if len(chars) > 0 { 190 var r rune 191 var width int 192 for i := 0; i < len(s); i += width { 193 r = rune(s[i]) 194 if r < utf8.RuneSelf { 195 width = 1 196 } else { 197 r, width = utf8.DecodeRune(s[i:]) 198 } 199 for _, ch := range chars { 200 if r == ch { 201 return i 202 } 203 } 204 } 205 } 206 return -1 207 } 208 209 // LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code 210 // points. It returns the byte index of the last occurrence in s of any of 211 // the Unicode code points in chars. It returns -1 if chars is empty or if 212 // there is no code point in common. 213 func LastIndexAny(s []byte, chars string) int { 214 if len(chars) > 0 { 215 for i := len(s); i > 0; { 216 r, size := utf8.DecodeLastRune(s[0:i]) 217 i -= size 218 for _, ch := range chars { 219 if r == ch { 220 return i 221 } 222 } 223 } 224 } 225 return -1 226 } 227 228 // Generic split: splits after each instance of sep, 229 // including sepSave bytes of sep in the subarrays. 230 func genSplit(s, sep []byte, sepSave, n int) [][]byte { 231 if n == 0 { 232 return nil 233 } 234 if len(sep) == 0 { 235 return explode(s, n) 236 } 237 if n < 0 { 238 n = Count(s, sep) + 1 239 } 240 c := sep[0] 241 start := 0 242 a := make([][]byte, n) 243 na := 0 244 for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ { 245 if s[i] == c && (len(sep) == 1 || Equal(s[i:i+len(sep)], sep)) { 246 a[na] = s[start : i+sepSave] 247 na++ 248 start = i + len(sep) 249 i += len(sep) - 1 250 } 251 } 252 a[na] = s[start:] 253 return a[0 : na+1] 254 } 255 256 // SplitN slices s into subslices separated by sep and returns a slice of 257 // the subslices between those separators. 258 // If sep is empty, SplitN splits after each UTF-8 sequence. 259 // The count determines the number of subslices to return: 260 // n > 0: at most n subslices; the last subslice will be the unsplit remainder. 261 // n == 0: the result is nil (zero subslices) 262 // n < 0: all subslices 263 func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) } 264 265 // SplitAfterN slices s into subslices after each instance of sep and 266 // returns a slice of those subslices. 267 // If sep is empty, SplitAfterN splits after each UTF-8 sequence. 268 // The count determines the number of subslices to return: 269 // n > 0: at most n subslices; the last subslice will be the unsplit remainder. 270 // n == 0: the result is nil (zero subslices) 271 // n < 0: all subslices 272 func SplitAfterN(s, sep []byte, n int) [][]byte { 273 return genSplit(s, sep, len(sep), n) 274 } 275 276 // Split slices s into all subslices separated by sep and returns a slice of 277 // the subslices between those separators. 278 // If sep is empty, Split splits after each UTF-8 sequence. 279 // It is equivalent to SplitN with a count of -1. 280 func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) } 281 282 // SplitAfter slices s into all subslices after each instance of sep and 283 // returns a slice of those subslices. 284 // If sep is empty, SplitAfter splits after each UTF-8 sequence. 285 // It is equivalent to SplitAfterN with a count of -1. 286 func SplitAfter(s, sep []byte) [][]byte { 287 return genSplit(s, sep, len(sep), -1) 288 } 289 290 // Fields splits the array s around each instance of one or more consecutive white space 291 // characters, returning a slice of subarrays of s or an empty list if s contains only white space. 292 func Fields(s []byte) [][]byte { 293 return FieldsFunc(s, unicode.IsSpace) 294 } 295 296 // FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points. 297 // It splits the array s at each run of code points c satisfying f(c) and 298 // returns a slice of subarrays of s. If no code points in s satisfy f(c), an 299 // empty slice is returned. 300 func FieldsFunc(s []byte, f func(rune) bool) [][]byte { 301 n := 0 302 inField := false 303 for i := 0; i < len(s); { 304 r, size := utf8.DecodeRune(s[i:]) 305 wasInField := inField 306 inField = !f(r) 307 if inField && !wasInField { 308 n++ 309 } 310 i += size 311 } 312 313 a := make([][]byte, n) 314 na := 0 315 fieldStart := -1 316 for i := 0; i <= len(s) && na < n; { 317 r, size := utf8.DecodeRune(s[i:]) 318 if fieldStart < 0 && size > 0 && !f(r) { 319 fieldStart = i 320 i += size 321 continue 322 } 323 if fieldStart >= 0 && (size == 0 || f(r)) { 324 a[na] = s[fieldStart:i] 325 na++ 326 fieldStart = -1 327 } 328 if size == 0 { 329 break 330 } 331 i += size 332 } 333 return a[0:na] 334 } 335 336 // Join concatenates the elements of a to create a single byte array. The separator 337 // sep is placed between elements in the resulting array. 338 func Join(a [][]byte, sep []byte) []byte { 339 if len(a) == 0 { 340 return []byte{} 341 } 342 if len(a) == 1 { 343 return a[0] 344 } 345 n := len(sep) * (len(a) - 1) 346 for i := 0; i < len(a); i++ { 347 n += len(a[i]) 348 } 349 350 b := make([]byte, n) 351 bp := copy(b, a[0]) 352 for _, s := range a[1:] { 353 bp += copy(b[bp:], sep) 354 bp += copy(b[bp:], s) 355 } 356 return b 357 } 358 359 // HasPrefix tests whether the byte array s begins with prefix. 360 func HasPrefix(s, prefix []byte) bool { 361 return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix) 362 } 363 364 // HasSuffix tests whether the byte array s ends with suffix. 365 func HasSuffix(s, suffix []byte) bool { 366 return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix) 367 } 368 369 // Map returns a copy of the byte array s with all its characters modified 370 // according to the mapping function. If mapping returns a negative value, the character is 371 // dropped from the string with no replacement. The characters in s and the 372 // output are interpreted as UTF-8-encoded Unicode code points. 373 func Map(mapping func(r rune) rune, s []byte) []byte { 374 // In the worst case, the array can grow when mapped, making 375 // things unpleasant. But it's so rare we barge in assuming it's 376 // fine. It could also shrink but that falls out naturally. 377 maxbytes := len(s) // length of b 378 nbytes := 0 // number of bytes encoded in b 379 b := make([]byte, maxbytes) 380 for i := 0; i < len(s); { 381 wid := 1 382 r := rune(s[i]) 383 if r >= utf8.RuneSelf { 384 r, wid = utf8.DecodeRune(s[i:]) 385 } 386 r = mapping(r) 387 if r >= 0 { 388 if nbytes+utf8.RuneLen(r) > maxbytes { 389 // Grow the buffer. 390 maxbytes = maxbytes*2 + utf8.UTFMax 391 nb := make([]byte, maxbytes) 392 copy(nb, b[0:nbytes]) 393 b = nb 394 } 395 nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r) 396 } 397 i += wid 398 } 399 return b[0:nbytes] 400 } 401 402 // Repeat returns a new byte slice consisting of count copies of b. 403 func Repeat(b []byte, count int) []byte { 404 nb := make([]byte, len(b)*count) 405 bp := 0 406 for i := 0; i < count; i++ { 407 for j := 0; j < len(b); j++ { 408 nb[bp] = b[j] 409 bp++ 410 } 411 } 412 return nb 413 } 414 415 // ToUpper returns a copy of the byte array s with all Unicode letters mapped to their upper case. 416 func ToUpper(s []byte) []byte { return Map(unicode.ToUpper, s) } 417 418 // ToUpper returns a copy of the byte array s with all Unicode letters mapped to their lower case. 419 func ToLower(s []byte) []byte { return Map(unicode.ToLower, s) } 420 421 // ToTitle returns a copy of the byte array s with all Unicode letters mapped to their title case. 422 func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) } 423 424 // ToUpperSpecial returns a copy of the byte array s with all Unicode letters mapped to their 425 // upper case, giving priority to the special casing rules. 426 func ToUpperSpecial(_case unicode.SpecialCase, s []byte) []byte { 427 return Map(func(r rune) rune { return _case.ToUpper(r) }, s) 428 } 429 430 // ToLowerSpecial returns a copy of the byte array s with all Unicode letters mapped to their 431 // lower case, giving priority to the special casing rules. 432 func ToLowerSpecial(_case unicode.SpecialCase, s []byte) []byte { 433 return Map(func(r rune) rune { return _case.ToLower(r) }, s) 434 } 435 436 // ToTitleSpecial returns a copy of the byte array s with all Unicode letters mapped to their 437 // title case, giving priority to the special casing rules. 438 func ToTitleSpecial(_case unicode.SpecialCase, s []byte) []byte { 439 return Map(func(r rune) rune { return _case.ToTitle(r) }, s) 440 } 441 442 // isSeparator reports whether the rune could mark a word boundary. 443 // TODO: update when package unicode captures more of the properties. 444 func isSeparator(r rune) bool { 445 // ASCII alphanumerics and underscore are not separators 446 if r <= 0x7F { 447 switch { 448 case '0' <= r && r <= '9': 449 return false 450 case 'a' <= r && r <= 'z': 451 return false 452 case 'A' <= r && r <= 'Z': 453 return false 454 case r == '_': 455 return false 456 } 457 return true 458 } 459 // Letters and digits are not separators 460 if unicode.IsLetter(r) || unicode.IsDigit(r) { 461 return false 462 } 463 // Otherwise, all we can do for now is treat spaces as separators. 464 return unicode.IsSpace(r) 465 } 466 467 // BUG(r): The rule Title uses for word boundaries does not handle Unicode punctuation properly. 468 469 // Title returns a copy of s with all Unicode letters that begin words 470 // mapped to their title case. 471 func Title(s []byte) []byte { 472 // Use a closure here to remember state. 473 // Hackish but effective. Depends on Map scanning in order and calling 474 // the closure once per rune. 475 prev := ' ' 476 return Map( 477 func(r rune) rune { 478 if isSeparator(prev) { 479 prev = r 480 return unicode.ToTitle(r) 481 } 482 prev = r 483 return r 484 }, 485 s) 486 } 487 488 // TrimLeftFunc returns a subslice of s by slicing off all leading UTF-8-encoded 489 // Unicode code points c that satisfy f(c). 490 func TrimLeftFunc(s []byte, f func(r rune) bool) []byte { 491 i := indexFunc(s, f, false) 492 if i == -1 { 493 return nil 494 } 495 return s[i:] 496 } 497 498 // TrimRightFunc returns a subslice of s by slicing off all trailing UTF-8 499 // encoded Unicode code points c that satisfy f(c). 500 func TrimRightFunc(s []byte, f func(r rune) bool) []byte { 501 i := lastIndexFunc(s, f, false) 502 if i >= 0 && s[i] >= utf8.RuneSelf { 503 _, wid := utf8.DecodeRune(s[i:]) 504 i += wid 505 } else { 506 i++ 507 } 508 return s[0:i] 509 } 510 511 // TrimFunc returns a subslice of s by slicing off all leading and trailing 512 // UTF-8-encoded Unicode code points c that satisfy f(c). 513 func TrimFunc(s []byte, f func(r rune) bool) []byte { 514 return TrimRightFunc(TrimLeftFunc(s, f), f) 515 } 516 517 // IndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points. 518 // It returns the byte index in s of the first Unicode 519 // code point satisfying f(c), or -1 if none do. 520 func IndexFunc(s []byte, f func(r rune) bool) int { 521 return indexFunc(s, f, true) 522 } 523 524 // LastIndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points. 525 // It returns the byte index in s of the last Unicode 526 // code point satisfying f(c), or -1 if none do. 527 func LastIndexFunc(s []byte, f func(r rune) bool) int { 528 return lastIndexFunc(s, f, true) 529 } 530 531 // indexFunc is the same as IndexFunc except that if 532 // truth==false, the sense of the predicate function is 533 // inverted. 534 func indexFunc(s []byte, f func(r rune) bool, truth bool) int { 535 start := 0 536 for start < len(s) { 537 wid := 1 538 r := rune(s[start]) 539 if r >= utf8.RuneSelf { 540 r, wid = utf8.DecodeRune(s[start:]) 541 } 542 if f(r) == truth { 543 return start 544 } 545 start += wid 546 } 547 return -1 548 } 549 550 // lastIndexFunc is the same as LastIndexFunc except that if 551 // truth==false, the sense of the predicate function is 552 // inverted. 553 func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int { 554 for i := len(s); i > 0; { 555 r, size := utf8.DecodeLastRune(s[0:i]) 556 i -= size 557 if f(r) == truth { 558 return i 559 } 560 } 561 return -1 562 } 563 564 func makeCutsetFunc(cutset string) func(r rune) bool { 565 return func(r rune) bool { 566 for _, c := range cutset { 567 if c == r { 568 return true 569 } 570 } 571 return false 572 } 573 } 574 575 // Trim returns a subslice of s by slicing off all leading and 576 // trailing UTF-8-encoded Unicode code points contained in cutset. 577 func Trim(s []byte, cutset string) []byte { 578 return TrimFunc(s, makeCutsetFunc(cutset)) 579 } 580 581 // TrimLeft returns a subslice of s by slicing off all leading 582 // UTF-8-encoded Unicode code points contained in cutset. 583 func TrimLeft(s []byte, cutset string) []byte { 584 return TrimLeftFunc(s, makeCutsetFunc(cutset)) 585 } 586 587 // TrimRight returns a subslice of s by slicing off all trailing 588 // UTF-8-encoded Unicode code points that are contained in cutset. 589 func TrimRight(s []byte, cutset string) []byte { 590 return TrimRightFunc(s, makeCutsetFunc(cutset)) 591 } 592 593 // TrimSpace returns a subslice of s by slicing off all leading and 594 // trailing white space, as defined by Unicode. 595 func TrimSpace(s []byte) []byte { 596 return TrimFunc(s, unicode.IsSpace) 597 } 598 599 // Runes returns a slice of runes (Unicode code points) equivalent to s. 600 func Runes(s []byte) []rune { 601 t := make([]rune, utf8.RuneCount(s)) 602 i := 0 603 for len(s) > 0 { 604 r, l := utf8.DecodeRune(s) 605 t[i] = r 606 i++ 607 s = s[l:] 608 } 609 return t 610 } 611 612 // Replace returns a copy of the slice s with the first n 613 // non-overlapping instances of old replaced by new. 614 // If n < 0, there is no limit on the number of replacements. 615 func Replace(s, old, new []byte, n int) []byte { 616 m := 0 617 if n != 0 { 618 // Compute number of replacements. 619 m = Count(s, old) 620 } 621 if m == 0 { 622 // Nothing to do. Just copy. 623 t := make([]byte, len(s)) 624 copy(t, s) 625 return t 626 } 627 if n < 0 || m < n { 628 n = m 629 } 630 631 // Apply replacements to buffer. 632 t := make([]byte, len(s)+n*(len(new)-len(old))) 633 w := 0 634 start := 0 635 for i := 0; i < n; i++ { 636 j := start 637 if len(old) == 0 { 638 if i > 0 { 639 _, wid := utf8.DecodeRune(s[start:]) 640 j += wid 641 } 642 } else { 643 j += Index(s[start:], old) 644 } 645 w += copy(t[w:], s[start:j]) 646 w += copy(t[w:], new) 647 start = j + len(old) 648 } 649 w += copy(t[w:], s[start:]) 650 return t[0:w] 651 } 652 653 // EqualFold reports whether s and t, interpreted as UTF-8 strings, 654 // are equal under Unicode case-folding. 655 func EqualFold(s, t []byte) bool { 656 for len(s) != 0 && len(t) != 0 { 657 // Extract first rune from each. 658 var sr, tr rune 659 if s[0] < utf8.RuneSelf { 660 sr, s = rune(s[0]), s[1:] 661 } else { 662 r, size := utf8.DecodeRune(s) 663 sr, s = r, s[size:] 664 } 665 if t[0] < utf8.RuneSelf { 666 tr, t = rune(t[0]), t[1:] 667 } else { 668 r, size := utf8.DecodeRune(t) 669 tr, t = r, t[size:] 670 } 671 672 // If they match, keep going; if not, return false. 673 674 // Easy case. 675 if tr == sr { 676 continue 677 } 678 679 // Make sr < tr to simplify what follows. 680 if tr < sr { 681 tr, sr = sr, tr 682 } 683 // Fast check for ASCII. 684 if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' { 685 // ASCII, and sr is upper case. tr must be lower case. 686 if tr == sr+'a'-'A' { 687 continue 688 } 689 return false 690 } 691 692 // General case. SimpleFold(x) returns the next equivalent rune > x 693 // or wraps around to smaller values. 694 r := unicode.SimpleFold(sr) 695 for r != sr && r < tr { 696 r = unicode.SimpleFold(r) 697 } 698 if r == tr { 699 continue 700 } 701 return false 702 } 703 704 // One string is empty. Are both? 705 return len(s) == len(t) 706 }