Source file src/pkg/unicode/utf16/utf16.go
1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package utf16 implements encoding and decoding of UTF-16 sequences.
6 package utf16
7
8 // The conditions replacementChar==unicode.ReplacementChar and
9 // maxRune==unicode.MaxRune are verified in the tests.
10 // Defining them locally avoids this package depending on package unicode.
11
const (
	// maxRune is U+10FFFF, the largest valid Unicode code point.
	maxRune = '\U0010FFFF'

	// replacementChar is U+FFFD, the Unicode replacement character that
	// this package substitutes for input it cannot encode or decode.
	replacementChar = '\uFFFD'
)
16
// Boundaries used when splitting a code point into a surrogate pair.
//
// The first unit of a pair lies in [surr1, surr2) and carries the high
// 10 bits; the second unit lies in [surr2, surr3) and carries the low
// 10 bits. The decoded value is those 20 bits plus surrSelf.
const (
	surr1 = 0xd800 // start of the first-unit (high) surrogate range
	surr2 = 0xdc00 // start of the second-unit (low) surrogate range
	surr3 = 0xe000 // one past the end of the surrogate range

	surrSelf = 0x10000 // smallest code point encoded as a surrogate pair
)
27
28 // IsSurrogate returns true if the specified Unicode code point
29 // can appear in a surrogate pair.
30 func IsSurrogate(r rune) bool {
31 return surr1 <= r && r < surr3
32 }
33
34 // DecodeRune returns the UTF-16 decoding of a surrogate pair.
35 // If the pair is not a valid UTF-16 surrogate pair, DecodeRune returns
36 // the Unicode replacement code point U+FFFD.
37 func DecodeRune(r1, r2 rune) rune {
38 if surr1 <= r1 && r1 < surr2 && surr2 <= r2 && r2 < surr3 {
39 return (rune(r1)-surr1)<<10 | (rune(r2) - surr2) + 0x10000
40 }
41 return replacementChar
42 }
43
44 // EncodeRune returns the UTF-16 surrogate pair r1, r2 for the given rune.
45 // If the rune is not a valid Unicode code point or does not need encoding,
46 // EncodeRune returns U+FFFD, U+FFFD.
47 func EncodeRune(r rune) (r1, r2 rune) {
48 if r < surrSelf || r > maxRune || IsSurrogate(r) {
49 return replacementChar, replacementChar
50 }
51 r -= surrSelf
52 return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff
53 }
54
55 // Encode returns the UTF-16 encoding of the Unicode code point sequence s.
56 func Encode(s []rune) []uint16 {
57 n := len(s)
58 for _, v := range s {
59 if v >= surrSelf {
60 n++
61 }
62 }
63
64 a := make([]uint16, n)
65 n = 0
66 for _, v := range s {
67 switch {
68 case v < 0, surr1 <= v && v < surr3, v > maxRune:
69 v = replacementChar
70 fallthrough
71 case v < surrSelf:
72 a[n] = uint16(v)
73 n++
74 default:
75 r1, r2 := EncodeRune(v)
76 a[n] = uint16(r1)
77 a[n+1] = uint16(r2)
78 n += 2
79 }
80 }
81 return a[0:n]
82 }
83
84 // Decode returns the Unicode code point sequence represented
85 // by the UTF-16 encoding s.
86 func Decode(s []uint16) []rune {
87 a := make([]rune, len(s))
88 n := 0
89 for i := 0; i < len(s); i++ {
90 switch r := s[i]; {
91 case surr1 <= r && r < surr2 && i+1 < len(s) &&
92 surr2 <= s[i+1] && s[i+1] < surr3:
93 // valid surrogate sequence
94 a[n] = DecodeRune(rune(r), rune(s[i+1]))
95 i++
96 n++
97 case surr1 <= r && r < surr3:
98 // invalid surrogate sequence
99 a[n] = replacementChar
100 n++
101 default:
102 // normal rune
103 a[n] = rune(r)
104 n++
105 }
106 }
107 return a[0:n]
108 }