src/pkg/unicode/utf8/utf8.go - The Go Programming Language

Golang

Source file src/pkg/unicode/utf8/utf8.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package utf8 implements functions and constants to support text encoded in
     6	// UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
     7	package utf8
     8	
     9	// The conditions RuneError==unicode.ReplacementChar and
    10	// MaxRune==unicode.MaxRune are verified in the tests.
    11	// Defining them locally avoids this package depending on package unicode.
    12	
    13	// Numbers fundamental to the encoding.
    14	const (
    15		RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
    16		RuneSelf  = 0x80         // characters below Runeself are represented as themselves in a single byte.
    17		MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
    18		UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
    19	)
    20	
    21	const (
    22		t1 = 0x00 // 0000 0000
    23		tx = 0x80 // 1000 0000
    24		t2 = 0xC0 // 1100 0000
    25		t3 = 0xE0 // 1110 0000
    26		t4 = 0xF0 // 1111 0000
    27		t5 = 0xF8 // 1111 1000
    28	
    29		maskx = 0x3F // 0011 1111
    30		mask2 = 0x1F // 0001 1111
    31		mask3 = 0x0F // 0000 1111
    32		mask4 = 0x07 // 0000 0111
    33	
    34		rune1Max = 1<<7 - 1
    35		rune2Max = 1<<11 - 1
    36		rune3Max = 1<<16 - 1
    37		rune4Max = 1<<21 - 1
    38	)
    39	
    40	func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
    41		n := len(p)
    42		if n < 1 {
    43			return RuneError, 0, true
    44		}
    45		c0 := p[0]
    46	
    47		// 1-byte, 7-bit sequence?
    48		if c0 < tx {
    49			return rune(c0), 1, false
    50		}
    51	
    52		// unexpected continuation byte?
    53		if c0 < t2 {
    54			return RuneError, 1, false
    55		}
    56	
    57		// need first continuation byte
    58		if n < 2 {
    59			return RuneError, 1, true
    60		}
    61		c1 := p[1]
    62		if c1 < tx || t2 <= c1 {
    63			return RuneError, 1, false
    64		}
    65	
    66		// 2-byte, 11-bit sequence?
    67		if c0 < t3 {
    68			r = rune(c0&mask2)<<6 | rune(c1&maskx)
    69			if r <= rune1Max {
    70				return RuneError, 1, false
    71			}
    72			return r, 2, false
    73		}
    74	
    75		// need second continuation byte
    76		if n < 3 {
    77			return RuneError, 1, true
    78		}
    79		c2 := p[2]
    80		if c2 < tx || t2 <= c2 {
    81			return RuneError, 1, false
    82		}
    83	
    84		// 3-byte, 16-bit sequence?
    85		if c0 < t4 {
    86			r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
    87			if r <= rune2Max {
    88				return RuneError, 1, false
    89			}
    90			return r, 3, false
    91		}
    92	
    93		// need third continuation byte
    94		if n < 4 {
    95			return RuneError, 1, true
    96		}
    97		c3 := p[3]
    98		if c3 < tx || t2 <= c3 {
    99			return RuneError, 1, false
   100		}
   101	
   102		// 4-byte, 21-bit sequence?
   103		if c0 < t5 {
   104			r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
   105			if r <= rune3Max {
   106				return RuneError, 1, false
   107			}
   108			return r, 4, false
   109		}
   110	
   111		// error
   112		return RuneError, 1, false
   113	}
   114	
   115	func decodeRuneInStringInternal(s string) (r rune, size int, short bool) {
   116		n := len(s)
   117		if n < 1 {
   118			return RuneError, 0, true
   119		}
   120		c0 := s[0]
   121	
   122		// 1-byte, 7-bit sequence?
   123		if c0 < tx {
   124			return rune(c0), 1, false
   125		}
   126	
   127		// unexpected continuation byte?
   128		if c0 < t2 {
   129			return RuneError, 1, false
   130		}
   131	
   132		// need first continuation byte
   133		if n < 2 {
   134			return RuneError, 1, true
   135		}
   136		c1 := s[1]
   137		if c1 < tx || t2 <= c1 {
   138			return RuneError, 1, false
   139		}
   140	
   141		// 2-byte, 11-bit sequence?
   142		if c0 < t3 {
   143			r = rune(c0&mask2)<<6 | rune(c1&maskx)
   144			if r <= rune1Max {
   145				return RuneError, 1, false
   146			}
   147			return r, 2, false
   148		}
   149	
   150		// need second continuation byte
   151		if n < 3 {
   152			return RuneError, 1, true
   153		}
   154		c2 := s[2]
   155		if c2 < tx || t2 <= c2 {
   156			return RuneError, 1, false
   157		}
   158	
   159		// 3-byte, 16-bit sequence?
   160		if c0 < t4 {
   161			r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
   162			if r <= rune2Max {
   163				return RuneError, 1, false
   164			}
   165			return r, 3, false
   166		}
   167	
   168		// need third continuation byte
   169		if n < 4 {
   170			return RuneError, 1, true
   171		}
   172		c3 := s[3]
   173		if c3 < tx || t2 <= c3 {
   174			return RuneError, 1, false
   175		}
   176	
   177		// 4-byte, 21-bit sequence?
   178		if c0 < t5 {
   179			r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
   180			if r <= rune3Max {
   181				return RuneError, 1, false
   182			}
   183			return r, 4, false
   184		}
   185	
   186		// error
   187		return RuneError, 1, false
   188	}
   189	
   190	// FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
   191	// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
   192	func FullRune(p []byte) bool {
   193		_, _, short := decodeRuneInternal(p)
   194		return !short
   195	}
   196	
   197	// FullRuneInString is like FullRune but its input is a string.
   198	func FullRuneInString(s string) bool {
   199		_, _, short := decodeRuneInStringInternal(s)
   200		return !short
   201	}
   202	
   203	// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes.
   204	// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
   205	func DecodeRune(p []byte) (r rune, size int) {
   206		r, size, _ = decodeRuneInternal(p)
   207		return
   208	}
   209	
   210	// DecodeRuneInString is like DecodeRune but its input is a string.
   211	// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
   212	func DecodeRuneInString(s string) (r rune, size int) {
   213		r, size, _ = decodeRuneInStringInternal(s)
   214		return
   215	}
   216	
   217	// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and its width in bytes.
   218	// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
   219	func DecodeLastRune(p []byte) (r rune, size int) {
   220		end := len(p)
   221		if end == 0 {
   222			return RuneError, 0
   223		}
   224		start := end - 1
   225		r = rune(p[start])
   226		if r < RuneSelf {
   227			return r, 1
   228		}
   229		// guard against O(n^2) behavior when traversing
   230		// backwards through strings with long sequences of
   231		// invalid UTF-8.
   232		lim := end - UTFMax
   233		if lim < 0 {
   234			lim = 0
   235		}
   236		for start--; start >= lim; start-- {
   237			if RuneStart(p[start]) {
   238				break
   239			}
   240		}
   241		if start < 0 {
   242			start = 0
   243		}
   244		r, size = DecodeRune(p[start:end])
   245		if start+size != end {
   246			return RuneError, 1
   247		}
   248		return r, size
   249	}
   250	
   251	// DecodeLastRuneInString is like DecodeLastRune but its input is a string.
   252	// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
   253	func DecodeLastRuneInString(s string) (r rune, size int) {
   254		end := len(s)
   255		if end == 0 {
   256			return RuneError, 0
   257		}
   258		start := end - 1
   259		r = rune(s[start])
   260		if r < RuneSelf {
   261			return r, 1
   262		}
   263		// guard against O(n^2) behavior when traversing
   264		// backwards through strings with long sequences of
   265		// invalid UTF-8.
   266		lim := end - UTFMax
   267		if lim < 0 {
   268			lim = 0
   269		}
   270		for start--; start >= lim; start-- {
   271			if RuneStart(s[start]) {
   272				break
   273			}
   274		}
   275		if start < 0 {
   276			start = 0
   277		}
   278		r, size = DecodeRuneInString(s[start:end])
   279		if start+size != end {
   280			return RuneError, 1
   281		}
   282		return r, size
   283	}
   284	
   285	// RuneLen returns the number of bytes required to encode the rune.
   286	func RuneLen(r rune) int {
   287		switch {
   288		case r <= rune1Max:
   289			return 1
   290		case r <= rune2Max:
   291			return 2
   292		case r <= rune3Max:
   293			return 3
   294		case r <= rune4Max:
   295			return 4
   296		}
   297		return -1
   298	}
   299	
   300	// EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune.
   301	// It returns the number of bytes written.
   302	func EncodeRune(p []byte, r rune) int {
   303		// Negative values are erroneous.  Making it unsigned addresses the problem.
   304		if uint32(r) <= rune1Max {
   305			p[0] = byte(r)
   306			return 1
   307		}
   308	
   309		if uint32(r) <= rune2Max {
   310			p[0] = t2 | byte(r>>6)
   311			p[1] = tx | byte(r)&maskx
   312			return 2
   313		}
   314	
   315		if uint32(r) > MaxRune {
   316			r = RuneError
   317		}
   318	
   319		if uint32(r) <= rune3Max {
   320			p[0] = t3 | byte(r>>12)
   321			p[1] = tx | byte(r>>6)&maskx
   322			p[2] = tx | byte(r)&maskx
   323			return 3
   324		}
   325	
   326		p[0] = t4 | byte(r>>18)
   327		p[1] = tx | byte(r>>12)&maskx
   328		p[2] = tx | byte(r>>6)&maskx
   329		p[3] = tx | byte(r)&maskx
   330		return 4
   331	}
   332	
   333	// RuneCount returns the number of runes in p.  Erroneous and short
   334	// encodings are treated as single runes of width 1 byte.
   335	func RuneCount(p []byte) int {
   336		i := 0
   337		var n int
   338		for n = 0; i < len(p); n++ {
   339			if p[i] < RuneSelf {
   340				i++
   341			} else {
   342				_, size := DecodeRune(p[i:])
   343				i += size
   344			}
   345		}
   346		return n
   347	}
   348	
   349	// RuneCountInString is like RuneCount but its input is a string.
   350	func RuneCountInString(s string) (n int) {
   351		for _ = range s {
   352			n++
   353		}
   354		return
   355	}
   356	
   357	// RuneStart reports whether the byte could be the first byte of
   358	// an encoded rune.  Second and subsequent bytes always have the top
   359	// two bits set to 10.
   360	func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
   361	
   362	// Valid reports whether p consists entirely of valid UTF-8-encoded runes.
   363	func Valid(p []byte) bool {
   364		i := 0
   365		for i < len(p) {
   366			if p[i] < RuneSelf {
   367				i++
   368			} else {
   369				_, size := DecodeRune(p[i:])
   370				if size == 1 {
   371					// All valid runes of size of 1 (those
   372					// below RuneSelf) were handled above.
   373					// This must be a RuneError.
   374					return false
   375				}
   376				i += size
   377			}
   378		}
   379		return true
   380	}
   381	
   382	// ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
   383	func ValidString(s string) bool {
   384		for i, r := range s {
   385			if r == RuneError {
   386				// The RuneError value can be an error
   387				// sentinel value (if it's size 1) or the same
   388				// value encoded properly. Decode it to see if
   389				// it's the 1 byte sentinel value.
   390				_, size := DecodeRuneInString(s[i:])
   391				if size == 1 {
   392					return false
   393				}
   394			}
   395		}
   396		return true
   397	}