src/pkg/strings/strings.go - The Go Programming Language

Golang

previous page next page
Source file src/pkg/strings/strings.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package strings implements simple functions to manipulate strings.
     6	package strings
     7	
     8	import (
     9		"unicode"
    10		"unicode/utf8"
    11	)
    12	
    13	// explode splits s into an array of UTF-8 sequences, one per Unicode character (still strings) up to a maximum of n (n < 0 means no limit).
    14	// Invalid UTF-8 sequences become correct encodings of U+FFF8.
    15	func explode(s string, n int) []string {
    16		if n == 0 {
    17			return nil
    18		}
    19		l := utf8.RuneCountInString(s)
    20		if n <= 0 || n > l {
    21			n = l
    22		}
    23		a := make([]string, n)
    24		var size int
    25		var ch rune
    26		i, cur := 0, 0
    27		for ; i+1 < n; i++ {
    28			ch, size = utf8.DecodeRuneInString(s[cur:])
    29			a[i] = string(ch)
    30			cur += size
    31		}
    32		// add the rest, if there is any
    33		if cur < len(s) {
    34			a[i] = s[cur:]
    35		}
    36		return a
    37	}
    38	
    39	// Count counts the number of non-overlapping instances of sep in s.
    40	func Count(s, sep string) int {
    41		if sep == "" {
    42			return utf8.RuneCountInString(s) + 1
    43		}
    44		c := sep[0]
    45		l := len(sep)
    46		n := 0
    47		if l == 1 {
    48			// special case worth making fast
    49			for i := 0; i < len(s); i++ {
    50				if s[i] == c {
    51					n++
    52				}
    53			}
    54			return n
    55		}
    56		for i := 0; i+l <= len(s); i++ {
    57			if s[i] == c && s[i:i+l] == sep {
    58				n++
    59				i += l - 1
    60			}
    61		}
    62		return n
    63	}
    64	
    65	// Contains returns true if substr is within s.
    66	func Contains(s, substr string) bool {
    67		return Index(s, substr) >= 0
    68	}
    69	
    70	// ContainsAny returns true if any Unicode code points in chars are within s.
    71	func ContainsAny(s, chars string) bool {
    72		return IndexAny(s, chars) >= 0
    73	}
    74	
    75	// ContainsRune returns true if the Unicode code point r is within s.
    76	func ContainsRune(s string, r rune) bool {
    77		return IndexRune(s, r) >= 0
    78	}
    79	
    80	// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
    81	func Index(s, sep string) int {
    82		n := len(sep)
    83		if n == 0 {
    84			return 0
    85		}
    86		c := sep[0]
    87		if n == 1 {
    88			// special case worth making fast
    89			for i := 0; i < len(s); i++ {
    90				if s[i] == c {
    91					return i
    92				}
    93			}
    94			return -1
    95		}
    96		// n > 1
    97		for i := 0; i+n <= len(s); i++ {
    98			if s[i] == c && s[i:i+n] == sep {
    99				return i
   100			}
   101		}
   102		return -1
   103	}
   104	
   105	// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
   106	func LastIndex(s, sep string) int {
   107		n := len(sep)
   108		if n == 0 {
   109			return len(s)
   110		}
   111		c := sep[0]
   112		if n == 1 {
   113			// special case worth making fast
   114			for i := len(s) - 1; i >= 0; i-- {
   115				if s[i] == c {
   116					return i
   117				}
   118			}
   119			return -1
   120		}
   121		// n > 1
   122		for i := len(s) - n; i >= 0; i-- {
   123			if s[i] == c && s[i:i+n] == sep {
   124				return i
   125			}
   126		}
   127		return -1
   128	}
   129	
   130	// IndexRune returns the index of the first instance of the Unicode code point
   131	// r, or -1 if rune is not present in s.
   132	func IndexRune(s string, r rune) int {
   133		switch {
   134		case r < 0x80:
   135			b := byte(r)
   136			for i := 0; i < len(s); i++ {
   137				if s[i] == b {
   138					return i
   139				}
   140			}
   141		default:
   142			for i, c := range s {
   143				if c == r {
   144					return i
   145				}
   146			}
   147		}
   148		return -1
   149	}
   150	
   151	// IndexAny returns the index of the first instance of any Unicode code point
   152	// from chars in s, or -1 if no Unicode code point from chars is present in s.
   153	func IndexAny(s, chars string) int {
   154		if len(chars) > 0 {
   155			for i, c := range s {
   156				for _, m := range chars {
   157					if c == m {
   158						return i
   159					}
   160				}
   161			}
   162		}
   163		return -1
   164	}
   165	
   166	// LastIndexAny returns the index of the last instance of any Unicode code
   167	// point from chars in s, or -1 if no Unicode code point from chars is
   168	// present in s.
   169	func LastIndexAny(s, chars string) int {
   170		if len(chars) > 0 {
   171			for i := len(s); i > 0; {
   172				rune, size := utf8.DecodeLastRuneInString(s[0:i])
   173				i -= size
   174				for _, m := range chars {
   175					if rune == m {
   176						return i
   177					}
   178				}
   179			}
   180		}
   181		return -1
   182	}
   183	
   184	// Generic split: splits after each instance of sep,
   185	// including sepSave bytes of sep in the subarrays.
   186	func genSplit(s, sep string, sepSave, n int) []string {
   187		if n == 0 {
   188			return nil
   189		}
   190		if sep == "" {
   191			return explode(s, n)
   192		}
   193		if n < 0 {
   194			n = Count(s, sep) + 1
   195		}
   196		c := sep[0]
   197		start := 0
   198		a := make([]string, n)
   199		na := 0
   200		for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ {
   201			if s[i] == c && (len(sep) == 1 || s[i:i+len(sep)] == sep) {
   202				a[na] = s[start : i+sepSave]
   203				na++
   204				start = i + len(sep)
   205				i += len(sep) - 1
   206			}
   207		}
   208		a[na] = s[start:]
   209		return a[0 : na+1]
   210	}
   211	
   212	// SplitN slices s into substrings separated by sep and returns a slice of
   213	// the substrings between those separators.
   214	// If sep is empty, SplitN splits after each UTF-8 sequence.
   215	// The count determines the number of substrings to return:
   216	//   n > 0: at most n substrings; the last substring will be the unsplit remainder.
   217	//   n == 0: the result is nil (zero substrings)
   218	//   n < 0: all substrings
   219	func SplitN(s, sep string, n int) []string { return genSplit(s, sep, 0, n) }
   220	
   221	// SplitAfterN slices s into substrings after each instance of sep and
   222	// returns a slice of those substrings.
   223	// If sep is empty, SplitAfterN splits after each UTF-8 sequence.
   224	// The count determines the number of substrings to return:
   225	//   n > 0: at most n substrings; the last substring will be the unsplit remainder.
   226	//   n == 0: the result is nil (zero substrings)
   227	//   n < 0: all substrings
   228	func SplitAfterN(s, sep string, n int) []string {
   229		return genSplit(s, sep, len(sep), n)
   230	}
   231	
   232	// Split slices s into all substrings separated by sep and returns a slice of
   233	// the substrings between those separators.
   234	// If sep is empty, Split splits after each UTF-8 sequence.
   235	// It is equivalent to SplitN with a count of -1.
   236	func Split(s, sep string) []string { return genSplit(s, sep, 0, -1) }
   237	
   238	// SplitAfter slices s into all substrings after each instance of sep and
   239	// returns a slice of those substrings.
   240	// If sep is empty, SplitAfter splits after each UTF-8 sequence.
   241	// It is equivalent to SplitAfterN with a count of -1.
   242	func SplitAfter(s, sep string) []string {
   243		return genSplit(s, sep, len(sep), -1)
   244	}
   245	
   246	// Fields splits the string s around each instance of one or more consecutive white space
   247	// characters, returning an array of substrings of s or an empty list if s contains only white space.
   248	func Fields(s string) []string {
   249		return FieldsFunc(s, unicode.IsSpace)
   250	}
   251	
   252	// FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
   253	// and returns an array of slices of s. If all code points in s satisfy f(c) or the
   254	// string is empty, an empty slice is returned.
   255	func FieldsFunc(s string, f func(rune) bool) []string {
   256		// First count the fields.
   257		n := 0
   258		inField := false
   259		for _, rune := range s {
   260			wasInField := inField
   261			inField = !f(rune)
   262			if inField && !wasInField {
   263				n++
   264			}
   265		}
   266	
   267		// Now create them.
   268		a := make([]string, n)
   269		na := 0
   270		fieldStart := -1 // Set to -1 when looking for start of field.
   271		for i, rune := range s {
   272			if f(rune) {
   273				if fieldStart >= 0 {
   274					a[na] = s[fieldStart:i]
   275					na++
   276					fieldStart = -1
   277				}
   278			} else if fieldStart == -1 {
   279				fieldStart = i
   280			}
   281		}
   282		if fieldStart >= 0 { // Last field might end at EOF.
   283			a[na] = s[fieldStart:]
   284		}
   285		return a
   286	}
   287	
   288	// Join concatenates the elements of a to create a single string.   The separator string
   289	// sep is placed between elements in the resulting string.
   290	func Join(a []string, sep string) string {
   291		if len(a) == 0 {
   292			return ""
   293		}
   294		if len(a) == 1 {
   295			return a[0]
   296		}
   297		n := len(sep) * (len(a) - 1)
   298		for i := 0; i < len(a); i++ {
   299			n += len(a[i])
   300		}
   301	
   302		b := make([]byte, n)
   303		bp := copy(b, a[0])
   304		for _, s := range a[1:] {
   305			bp += copy(b[bp:], sep)
   306			bp += copy(b[bp:], s)
   307		}
   308		return string(b)
   309	}
   310	
   311	// HasPrefix tests whether the string s begins with prefix.
   312	func HasPrefix(s, prefix string) bool {
   313		return len(s) >= len(prefix) && s[0:len(prefix)] == prefix
   314	}
   315	
   316	// HasSuffix tests whether the string s ends with suffix.
   317	func HasSuffix(s, suffix string) bool {
   318		return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix
   319	}
   320	
   321	// Map returns a copy of the string s with all its characters modified
   322	// according to the mapping function. If mapping returns a negative value, the character is
   323	// dropped from the string with no replacement.
   324	func Map(mapping func(rune) rune, s string) string {
   325		// In the worst case, the string can grow when mapped, making
   326		// things unpleasant.  But it's so rare we barge in assuming it's
   327		// fine.  It could also shrink but that falls out naturally.
   328		maxbytes := len(s) // length of b
   329		nbytes := 0        // number of bytes encoded in b
   330		// The output buffer b is initialized on demand, the first
   331		// time a character differs.
   332		var b []byte
   333	
   334		for i, c := range s {
   335			r := mapping(c)
   336			if b == nil {
   337				if r == c {
   338					continue
   339				}
   340				b = make([]byte, maxbytes)
   341				nbytes = copy(b, s[:i])
   342			}
   343			if r >= 0 {
   344				wid := 1
   345				if r >= utf8.RuneSelf {
   346					wid = utf8.RuneLen(r)
   347				}
   348				if nbytes+wid > maxbytes {
   349					// Grow the buffer.
   350					maxbytes = maxbytes*2 + utf8.UTFMax
   351					nb := make([]byte, maxbytes)
   352					copy(nb, b[0:nbytes])
   353					b = nb
   354				}
   355				nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
   356			}
   357		}
   358		if b == nil {
   359			return s
   360		}
   361		return string(b[0:nbytes])
   362	}
   363	
   364	// Repeat returns a new string consisting of count copies of the string s.
   365	func Repeat(s string, count int) string {
   366		b := make([]byte, len(s)*count)
   367		bp := 0
   368		for i := 0; i < count; i++ {
   369			for j := 0; j < len(s); j++ {
   370				b[bp] = s[j]
   371				bp++
   372			}
   373		}
   374		return string(b)
   375	}
   376	
   377	// ToUpper returns a copy of the string s with all Unicode letters mapped to their upper case.
   378	func ToUpper(s string) string { return Map(unicode.ToUpper, s) }
   379	
   380	// ToLower returns a copy of the string s with all Unicode letters mapped to their lower case.
   381	func ToLower(s string) string { return Map(unicode.ToLower, s) }
   382	
   383	// ToTitle returns a copy of the string s with all Unicode letters mapped to their title case.
   384	func ToTitle(s string) string { return Map(unicode.ToTitle, s) }
   385	
   386	// ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their
   387	// upper case, giving priority to the special casing rules.
   388	func ToUpperSpecial(_case unicode.SpecialCase, s string) string {
   389		return Map(func(r rune) rune { return _case.ToUpper(r) }, s)
   390	}
   391	
   392	// ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their
   393	// lower case, giving priority to the special casing rules.
   394	func ToLowerSpecial(_case unicode.SpecialCase, s string) string {
   395		return Map(func(r rune) rune { return _case.ToLower(r) }, s)
   396	}
   397	
   398	// ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their
   399	// title case, giving priority to the special casing rules.
   400	func ToTitleSpecial(_case unicode.SpecialCase, s string) string {
   401		return Map(func(r rune) rune { return _case.ToTitle(r) }, s)
   402	}
   403	
   404	// isSeparator reports whether the rune could mark a word boundary.
   405	// TODO: update when package unicode captures more of the properties.
   406	func isSeparator(r rune) bool {
   407		// ASCII alphanumerics and underscore are not separators
   408		if r <= 0x7F {
   409			switch {
   410			case '0' <= r && r <= '9':
   411				return false
   412			case 'a' <= r && r <= 'z':
   413				return false
   414			case 'A' <= r && r <= 'Z':
   415				return false
   416			case r == '_':
   417				return false
   418			}
   419			return true
   420		}
   421		// Letters and digits are not separators
   422		if unicode.IsLetter(r) || unicode.IsDigit(r) {
   423			return false
   424		}
   425		// Otherwise, all we can do for now is treat spaces as separators.
   426		return unicode.IsSpace(r)
   427	}
   428	
   429	// BUG(r): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
   430	
   431	// Title returns a copy of the string s with all Unicode letters that begin words
   432	// mapped to their title case.
   433	func Title(s string) string {
   434		// Use a closure here to remember state.
   435		// Hackish but effective. Depends on Map scanning in order and calling
   436		// the closure once per rune.
   437		prev := ' '
   438		return Map(
   439			func(r rune) rune {
   440				if isSeparator(prev) {
   441					prev = r
   442					return unicode.ToTitle(r)
   443				}
   444				prev = r
   445				return r
   446			},
   447			s)
   448	}
   449	
   450	// TrimLeftFunc returns a slice of the string s with all leading
   451	// Unicode code points c satisfying f(c) removed.
   452	func TrimLeftFunc(s string, f func(rune) bool) string {
   453		i := indexFunc(s, f, false)
   454		if i == -1 {
   455			return ""
   456		}
   457		return s[i:]
   458	}
   459	
   460	// TrimRightFunc returns a slice of the string s with all trailing
   461	// Unicode code points c satisfying f(c) removed.
   462	func TrimRightFunc(s string, f func(rune) bool) string {
   463		i := lastIndexFunc(s, f, false)
   464		if i >= 0 && s[i] >= utf8.RuneSelf {
   465			_, wid := utf8.DecodeRuneInString(s[i:])
   466			i += wid
   467		} else {
   468			i++
   469		}
   470		return s[0:i]
   471	}
   472	
   473	// TrimFunc returns a slice of the string s with all leading
   474	// and trailing Unicode code points c satisfying f(c) removed.
   475	func TrimFunc(s string, f func(rune) bool) string {
   476		return TrimRightFunc(TrimLeftFunc(s, f), f)
   477	}
   478	
   479	// IndexFunc returns the index into s of the first Unicode
   480	// code point satisfying f(c), or -1 if none do.
   481	func IndexFunc(s string, f func(rune) bool) int {
   482		return indexFunc(s, f, true)
   483	}
   484	
   485	// LastIndexFunc returns the index into s of the last
   486	// Unicode code point satisfying f(c), or -1 if none do.
   487	func LastIndexFunc(s string, f func(rune) bool) int {
   488		return lastIndexFunc(s, f, true)
   489	}
   490	
   491	// indexFunc is the same as IndexFunc except that if
   492	// truth==false, the sense of the predicate function is
   493	// inverted.
   494	func indexFunc(s string, f func(rune) bool, truth bool) int {
   495		start := 0
   496		for start < len(s) {
   497			wid := 1
   498			r := rune(s[start])
   499			if r >= utf8.RuneSelf {
   500				r, wid = utf8.DecodeRuneInString(s[start:])
   501			}
   502			if f(r) == truth {
   503				return start
   504			}
   505			start += wid
   506		}
   507		return -1
   508	}
   509	
   510	// lastIndexFunc is the same as LastIndexFunc except that if
   511	// truth==false, the sense of the predicate function is
   512	// inverted.
   513	func lastIndexFunc(s string, f func(rune) bool, truth bool) int {
   514		for i := len(s); i > 0; {
   515			r, size := utf8.DecodeLastRuneInString(s[0:i])
   516			i -= size
   517			if f(r) == truth {
   518				return i
   519			}
   520		}
   521		return -1
   522	}
   523	
   524	func makeCutsetFunc(cutset string) func(rune) bool {
   525		return func(r rune) bool { return IndexRune(cutset, r) >= 0 }
   526	}
   527	
   528	// Trim returns a slice of the string s with all leading and
   529	// trailing Unicode code points contained in cutset removed.
   530	func Trim(s string, cutset string) string {
   531		if s == "" || cutset == "" {
   532			return s
   533		}
   534		return TrimFunc(s, makeCutsetFunc(cutset))
   535	}
   536	
   537	// TrimLeft returns a slice of the string s with all leading
   538	// Unicode code points contained in cutset removed.
   539	func TrimLeft(s string, cutset string) string {
   540		if s == "" || cutset == "" {
   541			return s
   542		}
   543		return TrimLeftFunc(s, makeCutsetFunc(cutset))
   544	}
   545	
   546	// TrimRight returns a slice of the string s, with all trailing
   547	// Unicode code points contained in cutset removed.
   548	func TrimRight(s string, cutset string) string {
   549		if s == "" || cutset == "" {
   550			return s
   551		}
   552		return TrimRightFunc(s, makeCutsetFunc(cutset))
   553	}
   554	
   555	// TrimSpace returns a slice of the string s, with all leading
   556	// and trailing white space removed, as defined by Unicode.
   557	func TrimSpace(s string) string {
   558		return TrimFunc(s, unicode.IsSpace)
   559	}
   560	
   561	// Replace returns a copy of the string s with the first n
   562	// non-overlapping instances of old replaced by new.
   563	// If n < 0, there is no limit on the number of replacements.
   564	func Replace(s, old, new string, n int) string {
   565		if old == new || n == 0 {
   566			return s // avoid allocation
   567		}
   568	
   569		// Compute number of replacements.
   570		if m := Count(s, old); m == 0 {
   571			return s // avoid allocation
   572		} else if n < 0 || m < n {
   573			n = m
   574		}
   575	
   576		// Apply replacements to buffer.
   577		t := make([]byte, len(s)+n*(len(new)-len(old)))
   578		w := 0
   579		start := 0
   580		for i := 0; i < n; i++ {
   581			j := start
   582			if len(old) == 0 {
   583				if i > 0 {
   584					_, wid := utf8.DecodeRuneInString(s[start:])
   585					j += wid
   586				}
   587			} else {
   588				j += Index(s[start:], old)
   589			}
   590			w += copy(t[w:], s[start:j])
   591			w += copy(t[w:], new)
   592			start = j + len(old)
   593		}
   594		w += copy(t[w:], s[start:])
   595		return string(t[0:w])
   596	}
   597	
   598	// EqualFold reports whether s and t, interpreted as UTF-8 strings,
   599	// are equal under Unicode case-folding.
   600	func EqualFold(s, t string) bool {
   601		for s != "" && t != "" {
   602			// Extract first rune from each string.
   603			var sr, tr rune
   604			if s[0] < utf8.RuneSelf {
   605				sr, s = rune(s[0]), s[1:]
   606			} else {
   607				r, size := utf8.DecodeRuneInString(s)
   608				sr, s = r, s[size:]
   609			}
   610			if t[0] < utf8.RuneSelf {
   611				tr, t = rune(t[0]), t[1:]
   612			} else {
   613				r, size := utf8.DecodeRuneInString(t)
   614				tr, t = r, t[size:]
   615			}
   616	
   617			// If they match, keep going; if not, return false.
   618	
   619			// Easy case.
   620			if tr == sr {
   621				continue
   622			}
   623	
   624			// Make sr < tr to simplify what follows.
   625			if tr < sr {
   626				tr, sr = sr, tr
   627			}
   628			// Fast check for ASCII.
   629			if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
   630				// ASCII, and sr is upper case.  tr must be lower case.
   631				if tr == sr+'a'-'A' {
   632					continue
   633				}
   634				return false
   635			}
   636	
   637			// General case.  SimpleFold(x) returns the next equivalent rune > x
   638			// or wraps around to smaller values.
   639			r := unicode.SimpleFold(sr)
   640			for r != sr && r < tr {
   641				r = unicode.SimpleFold(r)
   642			}
   643			if r == tr {
   644				continue
   645			}
   646			return false
   647		}
   648	
   649		// One string is empty.  Are both?
   650		return s == t
   651	}
previous page start next page