Source file src/pkg/unicode/utf8/utf8.go
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package utf8 implements functions and constants to support text encoded in
// UTF-8. It includes functions to translate between runes and UTF-8 byte sequences.
package utf8

// The conditions RuneError==unicode.ReplacementChar and
// MaxRune==unicode.MaxRune are verified in the tests.
// Defining them locally avoids this package depending on package unicode.

// Numbers fundamental to the encoding.
const (
	RuneError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
	RuneSelf  = 0x80         // characters below RuneSelf are represented as themselves in a single byte.
	MaxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
	UTFMax    = 4            // maximum number of bytes of a UTF-8 encoded Unicode character.
)

// Code points in the surrogate range are reserved for UTF-16 and must not
// appear in UTF-8 (RFC 3629, section 3).
const (
	surrogateMin = 0xD800
	surrogateMax = 0xDFFF
)

const (
	t1 = 0x00 // 0000 0000
	tx = 0x80 // 1000 0000
	t2 = 0xC0 // 1100 0000
	t3 = 0xE0 // 1110 0000
	t4 = 0xF0 // 1111 0000
	t5 = 0xF8 // 1111 1000

	maskx = 0x3F // 0011 1111
	mask2 = 0x1F // 0001 1111
	mask3 = 0x0F // 0000 1111
	mask4 = 0x07 // 0000 0111

	rune1Max = 1<<7 - 1
	rune2Max = 1<<11 - 1
	rune3Max = 1<<16 - 1
	rune4Max = 1<<21 - 1
)

// decodeRuneInternal decodes the first UTF-8 sequence in p.
//
// It returns the rune, the number of bytes consumed, and short, which is
// true when p ended before the sequence could be examined in full.
// Empty input yields (RuneError, 0, true). Any invalid sequence yields
// RuneError with size 1: a stray continuation byte, a malformed
// continuation, an overlong form, a surrogate half, a value above MaxRune,
// or a lead byte of 0xF8 and above.
func decodeRuneInternal(p []byte) (r rune, size int, short bool) {
	n := len(p)
	if n < 1 {
		return RuneError, 0, true
	}
	c0 := p[0]

	// 1-byte, 7-bit sequence?
	if c0 < tx {
		return rune(c0), 1, false
	}

	// unexpected continuation byte?
	if c0 < t2 {
		return RuneError, 1, false
	}

	// need first continuation byte
	if n < 2 {
		return RuneError, 1, true
	}
	c1 := p[1]
	if c1 < tx || t2 <= c1 {
		return RuneError, 1, false
	}

	// 2-byte, 11-bit sequence?
	if c0 < t3 {
		r = rune(c0&mask2)<<6 | rune(c1&maskx)
		if r <= rune1Max {
			// overlong: the value fits in one byte
			return RuneError, 1, false
		}
		return r, 2, false
	}

	// need second continuation byte
	if n < 3 {
		return RuneError, 1, true
	}
	c2 := p[2]
	if c2 < tx || t2 <= c2 {
		return RuneError, 1, false
	}

	// 3-byte, 16-bit sequence?
	if c0 < t4 {
		r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
		if r <= rune2Max {
			// overlong: the value fits in two bytes
			return RuneError, 1, false
		}
		if surrogateMin <= r && r <= surrogateMax {
			// surrogate halves are not legal in UTF-8
			return RuneError, 1, false
		}
		return r, 3, false
	}

	// need third continuation byte
	if n < 4 {
		return RuneError, 1, true
	}
	c3 := p[3]
	if c3 < tx || t2 <= c3 {
		return RuneError, 1, false
	}

	// 4-byte, 21-bit sequence?
	if c0 < t5 {
		r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
		if r <= rune3Max || MaxRune < r {
			// overlong, or beyond the last Unicode code point
			return RuneError, 1, false
		}
		return r, 4, false
	}

	// error
	return RuneError, 1, false
}

// decodeRuneInStringInternal is like decodeRuneInternal but its input is a
// string. The two functions must be kept in step with each other.
func decodeRuneInStringInternal(s string) (r rune, size int, short bool) {
	n := len(s)
	if n < 1 {
		return RuneError, 0, true
	}
	c0 := s[0]

	// 1-byte, 7-bit sequence?
	if c0 < tx {
		return rune(c0), 1, false
	}

	// unexpected continuation byte?
	if c0 < t2 {
		return RuneError, 1, false
	}

	// need first continuation byte
	if n < 2 {
		return RuneError, 1, true
	}
	c1 := s[1]
	if c1 < tx || t2 <= c1 {
		return RuneError, 1, false
	}

	// 2-byte, 11-bit sequence?
	if c0 < t3 {
		r = rune(c0&mask2)<<6 | rune(c1&maskx)
		if r <= rune1Max {
			// overlong: the value fits in one byte
			return RuneError, 1, false
		}
		return r, 2, false
	}

	// need second continuation byte
	if n < 3 {
		return RuneError, 1, true
	}
	c2 := s[2]
	if c2 < tx || t2 <= c2 {
		return RuneError, 1, false
	}

	// 3-byte, 16-bit sequence?
	if c0 < t4 {
		r = rune(c0&mask3)<<12 | rune(c1&maskx)<<6 | rune(c2&maskx)
		if r <= rune2Max {
			// overlong: the value fits in two bytes
			return RuneError, 1, false
		}
		if surrogateMin <= r && r <= surrogateMax {
			// surrogate halves are not legal in UTF-8
			return RuneError, 1, false
		}
		return r, 3, false
	}

	// need third continuation byte
	if n < 4 {
		return RuneError, 1, true
	}
	c3 := s[3]
	if c3 < tx || t2 <= c3 {
		return RuneError, 1, false
	}

	// 4-byte, 21-bit sequence?
	if c0 < t5 {
		r = rune(c0&mask4)<<18 | rune(c1&maskx)<<12 | rune(c2&maskx)<<6 | rune(c3&maskx)
		if r <= rune3Max || MaxRune < r {
			// overlong, or beyond the last Unicode code point
			return RuneError, 1, false
		}
		return r, 4, false
	}

	// error
	return RuneError, 1, false
}

// FullRune reports whether the bytes in p begin with a full UTF-8 encoding of a rune.
// An invalid encoding is considered a full Rune since it will convert as a width-1 error rune.
func FullRune(p []byte) bool {
	_, _, short := decodeRuneInternal(p)
	return !short
}

// FullRuneInString is like FullRune but its input is a string.
func FullRuneInString(s string) bool {
	_, _, short := decodeRuneInStringInternal(s)
	return !short
}

// DecodeRune unpacks the first UTF-8 encoding in p and returns the rune and its width in bytes.
// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
// An encoding is invalid if it is malformed, overlong, encodes a surrogate
// half, or encodes a value above MaxRune. If p is empty it returns (RuneError, 0).
func DecodeRune(p []byte) (r rune, size int) {
	r, size, _ = decodeRuneInternal(p)
	return
}

// DecodeRuneInString is like DecodeRune but its input is a string.
// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
// If s is empty it returns (RuneError, 0).
func DecodeRuneInString(s string) (r rune, size int) {
	r, size, _ = decodeRuneInStringInternal(s)
	return
}

// DecodeLastRune unpacks the last UTF-8 encoding in p and returns the rune and its width in bytes.
// If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8.
219 func DecodeLastRune(p []byte) (r rune, size int) { 220 end := len(p) 221 if end == 0 { 222 return RuneError, 0 223 } 224 start := end - 1 225 r = rune(p[start]) 226 if r < RuneSelf { 227 return r, 1 228 } 229 // guard against O(n^2) behavior when traversing 230 // backwards through strings with long sequences of 231 // invalid UTF-8. 232 lim := end - UTFMax 233 if lim < 0 { 234 lim = 0 235 } 236 for start--; start >= lim; start-- { 237 if RuneStart(p[start]) { 238 break 239 } 240 } 241 if start < 0 { 242 start = 0 243 } 244 r, size = DecodeRune(p[start:end]) 245 if start+size != end { 246 return RuneError, 1 247 } 248 return r, size 249 } 250 251 // DecodeLastRuneInString is like DecodeLastRune but its input is a string. 252 // If the encoding is invalid, it returns (RuneError, 1), an impossible result for correct UTF-8. 253 func DecodeLastRuneInString(s string) (r rune, size int) { 254 end := len(s) 255 if end == 0 { 256 return RuneError, 0 257 } 258 start := end - 1 259 r = rune(s[start]) 260 if r < RuneSelf { 261 return r, 1 262 } 263 // guard against O(n^2) behavior when traversing 264 // backwards through strings with long sequences of 265 // invalid UTF-8. 266 lim := end - UTFMax 267 if lim < 0 { 268 lim = 0 269 } 270 for start--; start >= lim; start-- { 271 if RuneStart(s[start]) { 272 break 273 } 274 } 275 if start < 0 { 276 start = 0 277 } 278 r, size = DecodeRuneInString(s[start:end]) 279 if start+size != end { 280 return RuneError, 1 281 } 282 return r, size 283 } 284 285 // RuneLen returns the number of bytes required to encode the rune. 286 func RuneLen(r rune) int { 287 switch { 288 case r <= rune1Max: 289 return 1 290 case r <= rune2Max: 291 return 2 292 case r <= rune3Max: 293 return 3 294 case r <= rune4Max: 295 return 4 296 } 297 return -1 298 } 299 300 // EncodeRune writes into p (which must be large enough) the UTF-8 encoding of the rune. 301 // It returns the number of bytes written. 
302 func EncodeRune(p []byte, r rune) int { 303 // Negative values are erroneous. Making it unsigned addresses the problem. 304 if uint32(r) <= rune1Max { 305 p[0] = byte(r) 306 return 1 307 } 308 309 if uint32(r) <= rune2Max { 310 p[0] = t2 | byte(r>>6) 311 p[1] = tx | byte(r)&maskx 312 return 2 313 } 314 315 if uint32(r) > MaxRune { 316 r = RuneError 317 } 318 319 if uint32(r) <= rune3Max { 320 p[0] = t3 | byte(r>>12) 321 p[1] = tx | byte(r>>6)&maskx 322 p[2] = tx | byte(r)&maskx 323 return 3 324 } 325 326 p[0] = t4 | byte(r>>18) 327 p[1] = tx | byte(r>>12)&maskx 328 p[2] = tx | byte(r>>6)&maskx 329 p[3] = tx | byte(r)&maskx 330 return 4 331 } 332 333 // RuneCount returns the number of runes in p. Erroneous and short 334 // encodings are treated as single runes of width 1 byte. 335 func RuneCount(p []byte) int { 336 i := 0 337 var n int 338 for n = 0; i < len(p); n++ { 339 if p[i] < RuneSelf { 340 i++ 341 } else { 342 _, size := DecodeRune(p[i:]) 343 i += size 344 } 345 } 346 return n 347 } 348 349 // RuneCountInString is like RuneCount but its input is a string. 350 func RuneCountInString(s string) (n int) { 351 for _ = range s { 352 n++ 353 } 354 return 355 } 356 357 // RuneStart reports whether the byte could be the first byte of 358 // an encoded rune. Second and subsequent bytes always have the top 359 // two bits set to 10. 360 func RuneStart(b byte) bool { return b&0xC0 != 0x80 } 361 362 // Valid reports whether p consists entirely of valid UTF-8-encoded runes. 363 func Valid(p []byte) bool { 364 i := 0 365 for i < len(p) { 366 if p[i] < RuneSelf { 367 i++ 368 } else { 369 _, size := DecodeRune(p[i:]) 370 if size == 1 { 371 // All valid runes of size of 1 (those 372 // below RuneSelf) were handled above. 373 // This must be a RuneError. 374 return false 375 } 376 i += size 377 } 378 } 379 return true 380 } 381 382 // ValidString reports whether s consists entirely of valid UTF-8-encoded runes. 
383 func ValidString(s string) bool { 384 for i, r := range s { 385 if r == RuneError { 386 // The RuneError value can be an error 387 // sentinel value (if it's size 1) or the same 388 // value encoded properly. Decode it to see if 389 // it's the 1 byte sentinel value. 390 _, size := DecodeRuneInString(s[i:]) 391 if size == 1 { 392 return false 393 } 394 } 395 } 396 return true 397 }