Source file src/pkg/strings/strings.go
1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package strings implements simple functions to manipulate strings. 6 package strings 7 8 import ( 9 "unicode" 10 "unicode/utf8" 11 ) 12 13 // explode splits s into an array of UTF-8 sequences, one per Unicode character (still strings) up to a maximum of n (n < 0 means no limit). 14 // Invalid UTF-8 sequences become correct encodings of U+FFF8. 15 func explode(s string, n int) []string { 16 if n == 0 { 17 return nil 18 } 19 l := utf8.RuneCountInString(s) 20 if n <= 0 || n > l { 21 n = l 22 } 23 a := make([]string, n) 24 var size int 25 var ch rune 26 i, cur := 0, 0 27 for ; i+1 < n; i++ { 28 ch, size = utf8.DecodeRuneInString(s[cur:]) 29 a[i] = string(ch) 30 cur += size 31 } 32 // add the rest, if there is any 33 if cur < len(s) { 34 a[i] = s[cur:] 35 } 36 return a 37 } 38 39 // Count counts the number of non-overlapping instances of sep in s. 40 func Count(s, sep string) int { 41 if sep == "" { 42 return utf8.RuneCountInString(s) + 1 43 } 44 c := sep[0] 45 l := len(sep) 46 n := 0 47 if l == 1 { 48 // special case worth making fast 49 for i := 0; i < len(s); i++ { 50 if s[i] == c { 51 n++ 52 } 53 } 54 return n 55 } 56 for i := 0; i+l <= len(s); i++ { 57 if s[i] == c && s[i:i+l] == sep { 58 n++ 59 i += l - 1 60 } 61 } 62 return n 63 } 64 65 // Contains returns true if substr is within s. 66 func Contains(s, substr string) bool { 67 return Index(s, substr) >= 0 68 } 69 70 // ContainsAny returns true if any Unicode code points in chars are within s. 71 func ContainsAny(s, chars string) bool { 72 return IndexAny(s, chars) >= 0 73 } 74 75 // ContainsRune returns true if the Unicode code point r is within s. 76 func ContainsRune(s string, r rune) bool { 77 return IndexRune(s, r) >= 0 78 } 79 80 // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s. 
func Index(s, sep string) int {
	n := len(sep)
	if n == 0 {
		// The empty string is found at the very start of any string.
		return 0
	}
	c := sep[0]
	if n == 1 {
		// special case worth making fast
		for i := 0; i < len(s); i++ {
			if s[i] == c {
				return i
			}
		}
		return -1
	}
	// n > 1: compare the first byte before paying for a substring comparison.
	for i := 0; i+n <= len(s); i++ {
		if s[i] == c && s[i:i+n] == sep {
			return i
		}
	}
	return -1
}

// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
// An empty sep is reported at len(s).
func LastIndex(s, sep string) int {
	n := len(sep)
	if n == 0 {
		return len(s)
	}
	c := sep[0]
	if n == 1 {
		// special case worth making fast
		for i := len(s) - 1; i >= 0; i-- {
			if s[i] == c {
				return i
			}
		}
		return -1
	}
	// n > 1: walk candidate start positions from the right.
	for i := len(s) - n; i >= 0; i-- {
		if s[i] == c && s[i:i+n] == sep {
			return i
		}
	}
	return -1
}

// IndexRune returns the index of the first instance of the Unicode code point
// r, or -1 if rune is not present in s.
// A negative r never matches and always yields -1.
func IndexRune(s string, r rune) int {
	if 0 <= r && r < 0x80 {
		// ASCII: a byte scan suffices. The lower bound matters: converting a
		// negative rune to byte would truncate it and could match an
		// unrelated byte of s.
		b := byte(r)
		for i := 0; i < len(s); i++ {
			if s[i] == b {
				return i
			}
		}
		return -1
	}
	for i, c := range s {
		if c == r {
			return i
		}
	}
	return -1
}

// IndexAny returns the index of the first instance of any Unicode code point
// from chars in s, or -1 if no Unicode code point from chars is present in s.
func IndexAny(s, chars string) int {
	if len(chars) > 0 {
		for i, c := range s {
			for _, m := range chars {
				if c == m {
					return i
				}
			}
		}
	}
	return -1
}

// LastIndexAny returns the index of the last instance of any Unicode code
// point from chars in s, or -1 if no Unicode code point from chars is
// present in s.
169 func LastIndexAny(s, chars string) int { 170 if len(chars) > 0 { 171 for i := len(s); i > 0; { 172 rune, size := utf8.DecodeLastRuneInString(s[0:i]) 173 i -= size 174 for _, m := range chars { 175 if rune == m { 176 return i 177 } 178 } 179 } 180 } 181 return -1 182 } 183 184 // Generic split: splits after each instance of sep, 185 // including sepSave bytes of sep in the subarrays. 186 func genSplit(s, sep string, sepSave, n int) []string { 187 if n == 0 { 188 return nil 189 } 190 if sep == "" { 191 return explode(s, n) 192 } 193 if n < 0 { 194 n = Count(s, sep) + 1 195 } 196 c := sep[0] 197 start := 0 198 a := make([]string, n) 199 na := 0 200 for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ { 201 if s[i] == c && (len(sep) == 1 || s[i:i+len(sep)] == sep) { 202 a[na] = s[start : i+sepSave] 203 na++ 204 start = i + len(sep) 205 i += len(sep) - 1 206 } 207 } 208 a[na] = s[start:] 209 return a[0 : na+1] 210 } 211 212 // SplitN slices s into substrings separated by sep and returns a slice of 213 // the substrings between those separators. 214 // If sep is empty, SplitN splits after each UTF-8 sequence. 215 // The count determines the number of substrings to return: 216 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 217 // n == 0: the result is nil (zero substrings) 218 // n < 0: all substrings 219 func SplitN(s, sep string, n int) []string { return genSplit(s, sep, 0, n) } 220 221 // SplitAfterN slices s into substrings after each instance of sep and 222 // returns a slice of those substrings. 223 // If sep is empty, SplitAfterN splits after each UTF-8 sequence. 224 // The count determines the number of substrings to return: 225 // n > 0: at most n substrings; the last substring will be the unsplit remainder. 
226 // n == 0: the result is nil (zero substrings) 227 // n < 0: all substrings 228 func SplitAfterN(s, sep string, n int) []string { 229 return genSplit(s, sep, len(sep), n) 230 } 231 232 // Split slices s into all substrings separated by sep and returns a slice of 233 // the substrings between those separators. 234 // If sep is empty, Split splits after each UTF-8 sequence. 235 // It is equivalent to SplitN with a count of -1. 236 func Split(s, sep string) []string { return genSplit(s, sep, 0, -1) } 237 238 // SplitAfter slices s into all substrings after each instance of sep and 239 // returns a slice of those substrings. 240 // If sep is empty, SplitAfter splits after each UTF-8 sequence. 241 // It is equivalent to SplitAfterN with a count of -1. 242 func SplitAfter(s, sep string) []string { 243 return genSplit(s, sep, len(sep), -1) 244 } 245 246 // Fields splits the string s around each instance of one or more consecutive white space 247 // characters, returning an array of substrings of s or an empty list if s contains only white space. 248 func Fields(s string) []string { 249 return FieldsFunc(s, unicode.IsSpace) 250 } 251 252 // FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c) 253 // and returns an array of slices of s. If all code points in s satisfy f(c) or the 254 // string is empty, an empty slice is returned. 255 func FieldsFunc(s string, f func(rune) bool) []string { 256 // First count the fields. 257 n := 0 258 inField := false 259 for _, rune := range s { 260 wasInField := inField 261 inField = !f(rune) 262 if inField && !wasInField { 263 n++ 264 } 265 } 266 267 // Now create them. 268 a := make([]string, n) 269 na := 0 270 fieldStart := -1 // Set to -1 when looking for start of field. 
271 for i, rune := range s { 272 if f(rune) { 273 if fieldStart >= 0 { 274 a[na] = s[fieldStart:i] 275 na++ 276 fieldStart = -1 277 } 278 } else if fieldStart == -1 { 279 fieldStart = i 280 } 281 } 282 if fieldStart >= 0 { // Last field might end at EOF. 283 a[na] = s[fieldStart:] 284 } 285 return a 286 } 287 288 // Join concatenates the elements of a to create a single string. The separator string 289 // sep is placed between elements in the resulting string. 290 func Join(a []string, sep string) string { 291 if len(a) == 0 { 292 return "" 293 } 294 if len(a) == 1 { 295 return a[0] 296 } 297 n := len(sep) * (len(a) - 1) 298 for i := 0; i < len(a); i++ { 299 n += len(a[i]) 300 } 301 302 b := make([]byte, n) 303 bp := copy(b, a[0]) 304 for _, s := range a[1:] { 305 bp += copy(b[bp:], sep) 306 bp += copy(b[bp:], s) 307 } 308 return string(b) 309 } 310 311 // HasPrefix tests whether the string s begins with prefix. 312 func HasPrefix(s, prefix string) bool { 313 return len(s) >= len(prefix) && s[0:len(prefix)] == prefix 314 } 315 316 // HasSuffix tests whether the string s ends with suffix. 317 func HasSuffix(s, suffix string) bool { 318 return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix 319 } 320 321 // Map returns a copy of the string s with all its characters modified 322 // according to the mapping function. If mapping returns a negative value, the character is 323 // dropped from the string with no replacement. 324 func Map(mapping func(rune) rune, s string) string { 325 // In the worst case, the string can grow when mapped, making 326 // things unpleasant. But it's so rare we barge in assuming it's 327 // fine. It could also shrink but that falls out naturally. 328 maxbytes := len(s) // length of b 329 nbytes := 0 // number of bytes encoded in b 330 // The output buffer b is initialized on demand, the first 331 // time a character differs. 
332 var b []byte 333 334 for i, c := range s { 335 r := mapping(c) 336 if b == nil { 337 if r == c { 338 continue 339 } 340 b = make([]byte, maxbytes) 341 nbytes = copy(b, s[:i]) 342 } 343 if r >= 0 { 344 wid := 1 345 if r >= utf8.RuneSelf { 346 wid = utf8.RuneLen(r) 347 } 348 if nbytes+wid > maxbytes { 349 // Grow the buffer. 350 maxbytes = maxbytes*2 + utf8.UTFMax 351 nb := make([]byte, maxbytes) 352 copy(nb, b[0:nbytes]) 353 b = nb 354 } 355 nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r) 356 } 357 } 358 if b == nil { 359 return s 360 } 361 return string(b[0:nbytes]) 362 } 363 364 // Repeat returns a new string consisting of count copies of the string s. 365 func Repeat(s string, count int) string { 366 b := make([]byte, len(s)*count) 367 bp := 0 368 for i := 0; i < count; i++ { 369 for j := 0; j < len(s); j++ { 370 b[bp] = s[j] 371 bp++ 372 } 373 } 374 return string(b) 375 } 376 377 // ToUpper returns a copy of the string s with all Unicode letters mapped to their upper case. 378 func ToUpper(s string) string { return Map(unicode.ToUpper, s) } 379 380 // ToLower returns a copy of the string s with all Unicode letters mapped to their lower case. 381 func ToLower(s string) string { return Map(unicode.ToLower, s) } 382 383 // ToTitle returns a copy of the string s with all Unicode letters mapped to their title case. 384 func ToTitle(s string) string { return Map(unicode.ToTitle, s) } 385 386 // ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their 387 // upper case, giving priority to the special casing rules. 388 func ToUpperSpecial(_case unicode.SpecialCase, s string) string { 389 return Map(func(r rune) rune { return _case.ToUpper(r) }, s) 390 } 391 392 // ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their 393 // lower case, giving priority to the special casing rules. 
394 func ToLowerSpecial(_case unicode.SpecialCase, s string) string { 395 return Map(func(r rune) rune { return _case.ToLower(r) }, s) 396 } 397 398 // ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their 399 // title case, giving priority to the special casing rules. 400 func ToTitleSpecial(_case unicode.SpecialCase, s string) string { 401 return Map(func(r rune) rune { return _case.ToTitle(r) }, s) 402 } 403 404 // isSeparator reports whether the rune could mark a word boundary. 405 // TODO: update when package unicode captures more of the properties. 406 func isSeparator(r rune) bool { 407 // ASCII alphanumerics and underscore are not separators 408 if r <= 0x7F { 409 switch { 410 case '0' <= r && r <= '9': 411 return false 412 case 'a' <= r && r <= 'z': 413 return false 414 case 'A' <= r && r <= 'Z': 415 return false 416 case r == '_': 417 return false 418 } 419 return true 420 } 421 // Letters and digits are not separators 422 if unicode.IsLetter(r) || unicode.IsDigit(r) { 423 return false 424 } 425 // Otherwise, all we can do for now is treat spaces as separators. 426 return unicode.IsSpace(r) 427 } 428 429 // BUG(r): The rule Title uses for word boundaries does not handle Unicode punctuation properly. 430 431 // Title returns a copy of the string s with all Unicode letters that begin words 432 // mapped to their title case. 433 func Title(s string) string { 434 // Use a closure here to remember state. 435 // Hackish but effective. Depends on Map scanning in order and calling 436 // the closure once per rune. 437 prev := ' ' 438 return Map( 439 func(r rune) rune { 440 if isSeparator(prev) { 441 prev = r 442 return unicode.ToTitle(r) 443 } 444 prev = r 445 return r 446 }, 447 s) 448 } 449 450 // TrimLeftFunc returns a slice of the string s with all leading 451 // Unicode code points c satisfying f(c) removed. 
452 func TrimLeftFunc(s string, f func(rune) bool) string { 453 i := indexFunc(s, f, false) 454 if i == -1 { 455 return "" 456 } 457 return s[i:] 458 } 459 460 // TrimRightFunc returns a slice of the string s with all trailing 461 // Unicode code points c satisfying f(c) removed. 462 func TrimRightFunc(s string, f func(rune) bool) string { 463 i := lastIndexFunc(s, f, false) 464 if i >= 0 && s[i] >= utf8.RuneSelf { 465 _, wid := utf8.DecodeRuneInString(s[i:]) 466 i += wid 467 } else { 468 i++ 469 } 470 return s[0:i] 471 } 472 473 // TrimFunc returns a slice of the string s with all leading 474 // and trailing Unicode code points c satisfying f(c) removed. 475 func TrimFunc(s string, f func(rune) bool) string { 476 return TrimRightFunc(TrimLeftFunc(s, f), f) 477 } 478 479 // IndexFunc returns the index into s of the first Unicode 480 // code point satisfying f(c), or -1 if none do. 481 func IndexFunc(s string, f func(rune) bool) int { 482 return indexFunc(s, f, true) 483 } 484 485 // LastIndexFunc returns the index into s of the last 486 // Unicode code point satisfying f(c), or -1 if none do. 487 func LastIndexFunc(s string, f func(rune) bool) int { 488 return lastIndexFunc(s, f, true) 489 } 490 491 // indexFunc is the same as IndexFunc except that if 492 // truth==false, the sense of the predicate function is 493 // inverted. 494 func indexFunc(s string, f func(rune) bool, truth bool) int { 495 start := 0 496 for start < len(s) { 497 wid := 1 498 r := rune(s[start]) 499 if r >= utf8.RuneSelf { 500 r, wid = utf8.DecodeRuneInString(s[start:]) 501 } 502 if f(r) == truth { 503 return start 504 } 505 start += wid 506 } 507 return -1 508 } 509 510 // lastIndexFunc is the same as LastIndexFunc except that if 511 // truth==false, the sense of the predicate function is 512 // inverted. 
513 func lastIndexFunc(s string, f func(rune) bool, truth bool) int { 514 for i := len(s); i > 0; { 515 r, size := utf8.DecodeLastRuneInString(s[0:i]) 516 i -= size 517 if f(r) == truth { 518 return i 519 } 520 } 521 return -1 522 } 523 524 func makeCutsetFunc(cutset string) func(rune) bool { 525 return func(r rune) bool { return IndexRune(cutset, r) >= 0 } 526 } 527 528 // Trim returns a slice of the string s with all leading and 529 // trailing Unicode code points contained in cutset removed. 530 func Trim(s string, cutset string) string { 531 if s == "" || cutset == "" { 532 return s 533 } 534 return TrimFunc(s, makeCutsetFunc(cutset)) 535 } 536 537 // TrimLeft returns a slice of the string s with all leading 538 // Unicode code points contained in cutset removed. 539 func TrimLeft(s string, cutset string) string { 540 if s == "" || cutset == "" { 541 return s 542 } 543 return TrimLeftFunc(s, makeCutsetFunc(cutset)) 544 } 545 546 // TrimRight returns a slice of the string s, with all trailing 547 // Unicode code points contained in cutset removed. 548 func TrimRight(s string, cutset string) string { 549 if s == "" || cutset == "" { 550 return s 551 } 552 return TrimRightFunc(s, makeCutsetFunc(cutset)) 553 } 554 555 // TrimSpace returns a slice of the string s, with all leading 556 // and trailing white space removed, as defined by Unicode. 557 func TrimSpace(s string) string { 558 return TrimFunc(s, unicode.IsSpace) 559 } 560 561 // Replace returns a copy of the string s with the first n 562 // non-overlapping instances of old replaced by new. 563 // If n < 0, there is no limit on the number of replacements. 564 func Replace(s, old, new string, n int) string { 565 if old == new || n == 0 { 566 return s // avoid allocation 567 } 568 569 // Compute number of replacements. 570 if m := Count(s, old); m == 0 { 571 return s // avoid allocation 572 } else if n < 0 || m < n { 573 n = m 574 } 575 576 // Apply replacements to buffer. 
577 t := make([]byte, len(s)+n*(len(new)-len(old))) 578 w := 0 579 start := 0 580 for i := 0; i < n; i++ { 581 j := start 582 if len(old) == 0 { 583 if i > 0 { 584 _, wid := utf8.DecodeRuneInString(s[start:]) 585 j += wid 586 } 587 } else { 588 j += Index(s[start:], old) 589 } 590 w += copy(t[w:], s[start:j]) 591 w += copy(t[w:], new) 592 start = j + len(old) 593 } 594 w += copy(t[w:], s[start:]) 595 return string(t[0:w]) 596 } 597 598 // EqualFold reports whether s and t, interpreted as UTF-8 strings, 599 // are equal under Unicode case-folding. 600 func EqualFold(s, t string) bool { 601 for s != "" && t != "" { 602 // Extract first rune from each string. 603 var sr, tr rune 604 if s[0] < utf8.RuneSelf { 605 sr, s = rune(s[0]), s[1:] 606 } else { 607 r, size := utf8.DecodeRuneInString(s) 608 sr, s = r, s[size:] 609 } 610 if t[0] < utf8.RuneSelf { 611 tr, t = rune(t[0]), t[1:] 612 } else { 613 r, size := utf8.DecodeRuneInString(t) 614 tr, t = r, t[size:] 615 } 616 617 // If they match, keep going; if not, return false. 618 619 // Easy case. 620 if tr == sr { 621 continue 622 } 623 624 // Make sr < tr to simplify what follows. 625 if tr < sr { 626 tr, sr = sr, tr 627 } 628 // Fast check for ASCII. 629 if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' { 630 // ASCII, and sr is upper case. tr must be lower case. 631 if tr == sr+'a'-'A' { 632 continue 633 } 634 return false 635 } 636 637 // General case. SimpleFold(x) returns the next equivalent rune > x 638 // or wraps around to smaller values. 639 r := unicode.SimpleFold(sr) 640 for r != sr && r < tr { 641 r = unicode.SimpleFold(r) 642 } 643 if r == tr { 644 continue 645 } 646 return false 647 } 648 649 // One string is empty. Are both? 650 return s == t 651 }