src/pkg/bytes/bytes.go - The Go Programming Language

Golang

previous page next page
Source file src/pkg/bytes/bytes.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package bytes implements functions for the manipulation of byte slices.
     6	// It is analogous to the facilities of the strings package.
     7	package bytes
     8	
     9	import (
    10		"unicode"
    11		"unicode/utf8"
    12	)
    13	
    14	// Compare returns an integer comparing the two byte arrays lexicographically.
    15	// The result will be 0 if a==b, -1 if a < b, and +1 if a > b
    16	// A nil argument is equivalent to an empty slice.
    17	func Compare(a, b []byte) int {
    18		m := len(a)
    19		if m > len(b) {
    20			m = len(b)
    21		}
    22		for i, ac := range a[0:m] {
    23			bc := b[i]
    24			switch {
    25			case ac > bc:
    26				return 1
    27			case ac < bc:
    28				return -1
    29			}
    30		}
    31		switch {
    32		case len(a) < len(b):
    33			return -1
    34		case len(a) > len(b):
    35			return 1
    36		}
    37		return 0
    38	}
    39	
    40	// Equal returns a boolean reporting whether a == b.
    41	// A nil argument is equivalent to an empty slice.
    42	func Equal(a, b []byte) bool
    43	
    44	func equalPortable(a, b []byte) bool {
    45		if len(a) != len(b) {
    46			return false
    47		}
    48		for i, c := range a {
    49			if c != b[i] {
    50				return false
    51			}
    52		}
    53		return true
    54	}
    55	
    56	// explode splits s into an array of UTF-8 sequences, one per Unicode character (still arrays of bytes),
    57	// up to a maximum of n byte arrays. Invalid UTF-8 sequences are chopped into individual bytes.
    58	func explode(s []byte, n int) [][]byte {
    59		if n <= 0 {
    60			n = len(s)
    61		}
    62		a := make([][]byte, n)
    63		var size int
    64		na := 0
    65		for len(s) > 0 {
    66			if na+1 >= n {
    67				a[na] = s
    68				na++
    69				break
    70			}
    71			_, size = utf8.DecodeRune(s)
    72			a[na] = s[0:size]
    73			s = s[size:]
    74			na++
    75		}
    76		return a[0:na]
    77	}
    78	
    79	// Count counts the number of non-overlapping instances of sep in s.
    80	func Count(s, sep []byte) int {
    81		n := len(sep)
    82		if n == 0 {
    83			return utf8.RuneCount(s) + 1
    84		}
    85		if n > len(s) {
    86			return 0
    87		}
    88		count := 0
    89		c := sep[0]
    90		i := 0
    91		t := s[:len(s)-n+1]
    92		for i < len(t) {
    93			if t[i] != c {
    94				o := IndexByte(t[i:], c)
    95				if o < 0 {
    96					break
    97				}
    98				i += o
    99			}
   100			if n == 1 || Equal(s[i:i+n], sep) {
   101				count++
   102				i += n
   103				continue
   104			}
   105			i++
   106		}
   107		return count
   108	}
   109	
   110	// Contains returns whether subslice is within b.
   111	func Contains(b, subslice []byte) bool {
   112		return Index(b, subslice) != -1
   113	}
   114	
   115	// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
   116	func Index(s, sep []byte) int {
   117		n := len(sep)
   118		if n == 0 {
   119			return 0
   120		}
   121		if n > len(s) {
   122			return -1
   123		}
   124		c := sep[0]
   125		if n == 1 {
   126			return IndexByte(s, c)
   127		}
   128		i := 0
   129		t := s[:len(s)-n+1]
   130		for i < len(t) {
   131			if t[i] != c {
   132				o := IndexByte(t[i:], c)
   133				if o < 0 {
   134					break
   135				}
   136				i += o
   137			}
   138			if Equal(s[i:i+n], sep) {
   139				return i
   140			}
   141			i++
   142		}
   143		return -1
   144	}
   145	
   146	func indexBytePortable(s []byte, c byte) int {
   147		for i, b := range s {
   148			if b == c {
   149				return i
   150			}
   151		}
   152		return -1
   153	}
   154	
   155	// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
   156	func LastIndex(s, sep []byte) int {
   157		n := len(sep)
   158		if n == 0 {
   159			return len(s)
   160		}
   161		c := sep[0]
   162		for i := len(s) - n; i >= 0; i-- {
   163			if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) {
   164				return i
   165			}
   166		}
   167		return -1
   168	}
   169	
   170	// IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points.
   171	// It returns the byte index of the first occurrence in s of the given rune.
   172	// It returns -1 if rune is not present in s.
   173	func IndexRune(s []byte, r rune) int {
   174		for i := 0; i < len(s); {
   175			r1, size := utf8.DecodeRune(s[i:])
   176			if r == r1 {
   177				return i
   178			}
   179			i += size
   180		}
   181		return -1
   182	}
   183	
   184	// IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
   185	// It returns the byte index of the first occurrence in s of any of the Unicode
   186	// code points in chars.  It returns -1 if chars is empty or if there is no code
   187	// point in common.
   188	func IndexAny(s []byte, chars string) int {
   189		if len(chars) > 0 {
   190			var r rune
   191			var width int
   192			for i := 0; i < len(s); i += width {
   193				r = rune(s[i])
   194				if r < utf8.RuneSelf {
   195					width = 1
   196				} else {
   197					r, width = utf8.DecodeRune(s[i:])
   198				}
   199				for _, ch := range chars {
   200					if r == ch {
   201						return i
   202					}
   203				}
   204			}
   205		}
   206		return -1
   207	}
   208	
   209	// LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
   210	// points.  It returns the byte index of the last occurrence in s of any of
   211	// the Unicode code points in chars.  It returns -1 if chars is empty or if
   212	// there is no code point in common.
   213	func LastIndexAny(s []byte, chars string) int {
   214		if len(chars) > 0 {
   215			for i := len(s); i > 0; {
   216				r, size := utf8.DecodeLastRune(s[0:i])
   217				i -= size
   218				for _, ch := range chars {
   219					if r == ch {
   220						return i
   221					}
   222				}
   223			}
   224		}
   225		return -1
   226	}
   227	
   228	// Generic split: splits after each instance of sep,
   229	// including sepSave bytes of sep in the subarrays.
   230	func genSplit(s, sep []byte, sepSave, n int) [][]byte {
   231		if n == 0 {
   232			return nil
   233		}
   234		if len(sep) == 0 {
   235			return explode(s, n)
   236		}
   237		if n < 0 {
   238			n = Count(s, sep) + 1
   239		}
   240		c := sep[0]
   241		start := 0
   242		a := make([][]byte, n)
   243		na := 0
   244		for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ {
   245			if s[i] == c && (len(sep) == 1 || Equal(s[i:i+len(sep)], sep)) {
   246				a[na] = s[start : i+sepSave]
   247				na++
   248				start = i + len(sep)
   249				i += len(sep) - 1
   250			}
   251		}
   252		a[na] = s[start:]
   253		return a[0 : na+1]
   254	}
   255	
   256	// SplitN slices s into subslices separated by sep and returns a slice of
   257	// the subslices between those separators.
   258	// If sep is empty, SplitN splits after each UTF-8 sequence.
   259	// The count determines the number of subslices to return:
   260	//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
   261	//   n == 0: the result is nil (zero subslices)
   262	//   n < 0: all subslices
   263	func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
   264	
   265	// SplitAfterN slices s into subslices after each instance of sep and
   266	// returns a slice of those subslices.
   267	// If sep is empty, SplitAfterN splits after each UTF-8 sequence.
   268	// The count determines the number of subslices to return:
   269	//   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
   270	//   n == 0: the result is nil (zero subslices)
   271	//   n < 0: all subslices
   272	func SplitAfterN(s, sep []byte, n int) [][]byte {
   273		return genSplit(s, sep, len(sep), n)
   274	}
   275	
   276	// Split slices s into all subslices separated by sep and returns a slice of
   277	// the subslices between those separators.
   278	// If sep is empty, Split splits after each UTF-8 sequence.
   279	// It is equivalent to SplitN with a count of -1.
   280	func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
   281	
   282	// SplitAfter slices s into all subslices after each instance of sep and
   283	// returns a slice of those subslices.
   284	// If sep is empty, SplitAfter splits after each UTF-8 sequence.
   285	// It is equivalent to SplitAfterN with a count of -1.
   286	func SplitAfter(s, sep []byte) [][]byte {
   287		return genSplit(s, sep, len(sep), -1)
   288	}
   289	
   290	// Fields splits the array s around each instance of one or more consecutive white space
   291	// characters, returning a slice of subarrays of s or an empty list if s contains only white space.
   292	func Fields(s []byte) [][]byte {
   293		return FieldsFunc(s, unicode.IsSpace)
   294	}
   295	
   296	// FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
   297	// It splits the array s at each run of code points c satisfying f(c) and
   298	// returns a slice of subarrays of s.  If no code points in s satisfy f(c), an
   299	// empty slice is returned.
   300	func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
   301		n := 0
   302		inField := false
   303		for i := 0; i < len(s); {
   304			r, size := utf8.DecodeRune(s[i:])
   305			wasInField := inField
   306			inField = !f(r)
   307			if inField && !wasInField {
   308				n++
   309			}
   310			i += size
   311		}
   312	
   313		a := make([][]byte, n)
   314		na := 0
   315		fieldStart := -1
   316		for i := 0; i <= len(s) && na < n; {
   317			r, size := utf8.DecodeRune(s[i:])
   318			if fieldStart < 0 && size > 0 && !f(r) {
   319				fieldStart = i
   320				i += size
   321				continue
   322			}
   323			if fieldStart >= 0 && (size == 0 || f(r)) {
   324				a[na] = s[fieldStart:i]
   325				na++
   326				fieldStart = -1
   327			}
   328			if size == 0 {
   329				break
   330			}
   331			i += size
   332		}
   333		return a[0:na]
   334	}
   335	
   336	// Join concatenates the elements of a to create a single byte array.   The separator
   337	// sep is placed between elements in the resulting array.
   338	func Join(a [][]byte, sep []byte) []byte {
   339		if len(a) == 0 {
   340			return []byte{}
   341		}
   342		if len(a) == 1 {
   343			return a[0]
   344		}
   345		n := len(sep) * (len(a) - 1)
   346		for i := 0; i < len(a); i++ {
   347			n += len(a[i])
   348		}
   349	
   350		b := make([]byte, n)
   351		bp := copy(b, a[0])
   352		for _, s := range a[1:] {
   353			bp += copy(b[bp:], sep)
   354			bp += copy(b[bp:], s)
   355		}
   356		return b
   357	}
   358	
   359	// HasPrefix tests whether the byte array s begins with prefix.
   360	func HasPrefix(s, prefix []byte) bool {
   361		return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
   362	}
   363	
   364	// HasSuffix tests whether the byte array s ends with suffix.
   365	func HasSuffix(s, suffix []byte) bool {
   366		return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
   367	}
   368	
   369	// Map returns a copy of the byte array s with all its characters modified
   370	// according to the mapping function. If mapping returns a negative value, the character is
   371	// dropped from the string with no replacement.  The characters in s and the
   372	// output are interpreted as UTF-8-encoded Unicode code points.
   373	func Map(mapping func(r rune) rune, s []byte) []byte {
   374		// In the worst case, the array can grow when mapped, making
   375		// things unpleasant.  But it's so rare we barge in assuming it's
   376		// fine.  It could also shrink but that falls out naturally.
   377		maxbytes := len(s) // length of b
   378		nbytes := 0        // number of bytes encoded in b
   379		b := make([]byte, maxbytes)
   380		for i := 0; i < len(s); {
   381			wid := 1
   382			r := rune(s[i])
   383			if r >= utf8.RuneSelf {
   384				r, wid = utf8.DecodeRune(s[i:])
   385			}
   386			r = mapping(r)
   387			if r >= 0 {
   388				if nbytes+utf8.RuneLen(r) > maxbytes {
   389					// Grow the buffer.
   390					maxbytes = maxbytes*2 + utf8.UTFMax
   391					nb := make([]byte, maxbytes)
   392					copy(nb, b[0:nbytes])
   393					b = nb
   394				}
   395				nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
   396			}
   397			i += wid
   398		}
   399		return b[0:nbytes]
   400	}
   401	
   402	// Repeat returns a new byte slice consisting of count copies of b.
   403	func Repeat(b []byte, count int) []byte {
   404		nb := make([]byte, len(b)*count)
   405		bp := 0
   406		for i := 0; i < count; i++ {
   407			for j := 0; j < len(b); j++ {
   408				nb[bp] = b[j]
   409				bp++
   410			}
   411		}
   412		return nb
   413	}
   414	
   415	// ToUpper returns a copy of the byte array s with all Unicode letters mapped to their upper case.
   416	func ToUpper(s []byte) []byte { return Map(unicode.ToUpper, s) }
   417	
   418	// ToUpper returns a copy of the byte array s with all Unicode letters mapped to their lower case.
   419	func ToLower(s []byte) []byte { return Map(unicode.ToLower, s) }
   420	
   421	// ToTitle returns a copy of the byte array s with all Unicode letters mapped to their title case.
   422	func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
   423	
   424	// ToUpperSpecial returns a copy of the byte array s with all Unicode letters mapped to their
   425	// upper case, giving priority to the special casing rules.
   426	func ToUpperSpecial(_case unicode.SpecialCase, s []byte) []byte {
   427		return Map(func(r rune) rune { return _case.ToUpper(r) }, s)
   428	}
   429	
   430	// ToLowerSpecial returns a copy of the byte array s with all Unicode letters mapped to their
   431	// lower case, giving priority to the special casing rules.
   432	func ToLowerSpecial(_case unicode.SpecialCase, s []byte) []byte {
   433		return Map(func(r rune) rune { return _case.ToLower(r) }, s)
   434	}
   435	
   436	// ToTitleSpecial returns a copy of the byte array s with all Unicode letters mapped to their
   437	// title case, giving priority to the special casing rules.
   438	func ToTitleSpecial(_case unicode.SpecialCase, s []byte) []byte {
   439		return Map(func(r rune) rune { return _case.ToTitle(r) }, s)
   440	}
   441	
   442	// isSeparator reports whether the rune could mark a word boundary.
   443	// TODO: update when package unicode captures more of the properties.
   444	func isSeparator(r rune) bool {
   445		// ASCII alphanumerics and underscore are not separators
   446		if r <= 0x7F {
   447			switch {
   448			case '0' <= r && r <= '9':
   449				return false
   450			case 'a' <= r && r <= 'z':
   451				return false
   452			case 'A' <= r && r <= 'Z':
   453				return false
   454			case r == '_':
   455				return false
   456			}
   457			return true
   458		}
   459		// Letters and digits are not separators
   460		if unicode.IsLetter(r) || unicode.IsDigit(r) {
   461			return false
   462		}
   463		// Otherwise, all we can do for now is treat spaces as separators.
   464		return unicode.IsSpace(r)
   465	}
   466	
   467	// BUG(r): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
   468	
   469	// Title returns a copy of s with all Unicode letters that begin words
   470	// mapped to their title case.
   471	func Title(s []byte) []byte {
   472		// Use a closure here to remember state.
   473		// Hackish but effective. Depends on Map scanning in order and calling
   474		// the closure once per rune.
   475		prev := ' '
   476		return Map(
   477			func(r rune) rune {
   478				if isSeparator(prev) {
   479					prev = r
   480					return unicode.ToTitle(r)
   481				}
   482				prev = r
   483				return r
   484			},
   485			s)
   486	}
   487	
   488	// TrimLeftFunc returns a subslice of s by slicing off all leading UTF-8-encoded
   489	// Unicode code points c that satisfy f(c).
   490	func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
   491		i := indexFunc(s, f, false)
   492		if i == -1 {
   493			return nil
   494		}
   495		return s[i:]
   496	}
   497	
   498	// TrimRightFunc returns a subslice of s by slicing off all trailing UTF-8
   499	// encoded Unicode code points c that satisfy f(c).
   500	func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
   501		i := lastIndexFunc(s, f, false)
   502		if i >= 0 && s[i] >= utf8.RuneSelf {
   503			_, wid := utf8.DecodeRune(s[i:])
   504			i += wid
   505		} else {
   506			i++
   507		}
   508		return s[0:i]
   509	}
   510	
   511	// TrimFunc returns a subslice of s by slicing off all leading and trailing
   512	// UTF-8-encoded Unicode code points c that satisfy f(c).
   513	func TrimFunc(s []byte, f func(r rune) bool) []byte {
   514		return TrimRightFunc(TrimLeftFunc(s, f), f)
   515	}
   516	
   517	// IndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
   518	// It returns the byte index in s of the first Unicode
   519	// code point satisfying f(c), or -1 if none do.
   520	func IndexFunc(s []byte, f func(r rune) bool) int {
   521		return indexFunc(s, f, true)
   522	}
   523	
   524	// LastIndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
   525	// It returns the byte index in s of the last Unicode
   526	// code point satisfying f(c), or -1 if none do.
   527	func LastIndexFunc(s []byte, f func(r rune) bool) int {
   528		return lastIndexFunc(s, f, true)
   529	}
   530	
   531	// indexFunc is the same as IndexFunc except that if
   532	// truth==false, the sense of the predicate function is
   533	// inverted.
   534	func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
   535		start := 0
   536		for start < len(s) {
   537			wid := 1
   538			r := rune(s[start])
   539			if r >= utf8.RuneSelf {
   540				r, wid = utf8.DecodeRune(s[start:])
   541			}
   542			if f(r) == truth {
   543				return start
   544			}
   545			start += wid
   546		}
   547		return -1
   548	}
   549	
   550	// lastIndexFunc is the same as LastIndexFunc except that if
   551	// truth==false, the sense of the predicate function is
   552	// inverted.
   553	func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
   554		for i := len(s); i > 0; {
   555			r, size := utf8.DecodeLastRune(s[0:i])
   556			i -= size
   557			if f(r) == truth {
   558				return i
   559			}
   560		}
   561		return -1
   562	}
   563	
   564	func makeCutsetFunc(cutset string) func(r rune) bool {
   565		return func(r rune) bool {
   566			for _, c := range cutset {
   567				if c == r {
   568					return true
   569				}
   570			}
   571			return false
   572		}
   573	}
   574	
   575	// Trim returns a subslice of s by slicing off all leading and
   576	// trailing UTF-8-encoded Unicode code points contained in cutset.
   577	func Trim(s []byte, cutset string) []byte {
   578		return TrimFunc(s, makeCutsetFunc(cutset))
   579	}
   580	
   581	// TrimLeft returns a subslice of s by slicing off all leading
   582	// UTF-8-encoded Unicode code points contained in cutset.
   583	func TrimLeft(s []byte, cutset string) []byte {
   584		return TrimLeftFunc(s, makeCutsetFunc(cutset))
   585	}
   586	
   587	// TrimRight returns a subslice of s by slicing off all trailing
   588	// UTF-8-encoded Unicode code points that are contained in cutset.
   589	func TrimRight(s []byte, cutset string) []byte {
   590		return TrimRightFunc(s, makeCutsetFunc(cutset))
   591	}
   592	
   593	// TrimSpace returns a subslice of s by slicing off all leading and
   594	// trailing white space, as defined by Unicode.
   595	func TrimSpace(s []byte) []byte {
   596		return TrimFunc(s, unicode.IsSpace)
   597	}
   598	
   599	// Runes returns a slice of runes (Unicode code points) equivalent to s.
   600	func Runes(s []byte) []rune {
   601		t := make([]rune, utf8.RuneCount(s))
   602		i := 0
   603		for len(s) > 0 {
   604			r, l := utf8.DecodeRune(s)
   605			t[i] = r
   606			i++
   607			s = s[l:]
   608		}
   609		return t
   610	}
   611	
   612	// Replace returns a copy of the slice s with the first n
   613	// non-overlapping instances of old replaced by new.
   614	// If n < 0, there is no limit on the number of replacements.
   615	func Replace(s, old, new []byte, n int) []byte {
   616		m := 0
   617		if n != 0 {
   618			// Compute number of replacements.
   619			m = Count(s, old)
   620		}
   621		if m == 0 {
   622			// Nothing to do. Just copy.
   623			t := make([]byte, len(s))
   624			copy(t, s)
   625			return t
   626		}
   627		if n < 0 || m < n {
   628			n = m
   629		}
   630	
   631		// Apply replacements to buffer.
   632		t := make([]byte, len(s)+n*(len(new)-len(old)))
   633		w := 0
   634		start := 0
   635		for i := 0; i < n; i++ {
   636			j := start
   637			if len(old) == 0 {
   638				if i > 0 {
   639					_, wid := utf8.DecodeRune(s[start:])
   640					j += wid
   641				}
   642			} else {
   643				j += Index(s[start:], old)
   644			}
   645			w += copy(t[w:], s[start:j])
   646			w += copy(t[w:], new)
   647			start = j + len(old)
   648		}
   649		w += copy(t[w:], s[start:])
   650		return t[0:w]
   651	}
   652	
   653	// EqualFold reports whether s and t, interpreted as UTF-8 strings,
   654	// are equal under Unicode case-folding.
   655	func EqualFold(s, t []byte) bool {
   656		for len(s) != 0 && len(t) != 0 {
   657			// Extract first rune from each.
   658			var sr, tr rune
   659			if s[0] < utf8.RuneSelf {
   660				sr, s = rune(s[0]), s[1:]
   661			} else {
   662				r, size := utf8.DecodeRune(s)
   663				sr, s = r, s[size:]
   664			}
   665			if t[0] < utf8.RuneSelf {
   666				tr, t = rune(t[0]), t[1:]
   667			} else {
   668				r, size := utf8.DecodeRune(t)
   669				tr, t = r, t[size:]
   670			}
   671	
   672			// If they match, keep going; if not, return false.
   673	
   674			// Easy case.
   675			if tr == sr {
   676				continue
   677			}
   678	
   679			// Make sr < tr to simplify what follows.
   680			if tr < sr {
   681				tr, sr = sr, tr
   682			}
   683			// Fast check for ASCII.
   684			if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
   685				// ASCII, and sr is upper case.  tr must be lower case.
   686				if tr == sr+'a'-'A' {
   687					continue
   688				}
   689				return false
   690			}
   691	
   692			// General case.  SimpleFold(x) returns the next equivalent rune > x
   693			// or wraps around to smaller values.
   694			r := unicode.SimpleFold(sr)
   695			for r != sr && r < tr {
   696				r = unicode.SimpleFold(r)
   697			}
   698			if r == tr {
   699				continue
   700			}
   701			return false
   702		}
   703	
   704		// One string is empty.  Are both?
   705		return len(s) == len(t)
   706	}
previous page start next page