src/pkg/unicode/letter.go - The Go Programming Language

Golang

Source file src/pkg/unicode/letter.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package unicode provides data and functions to test some properties of
     6	// Unicode code points.
     7	package unicode
     8	
     9	const (
    10		MaxRune         = '\U0010FFFF' // Maximum valid Unicode code point.
    11		ReplacementChar = '\uFFFD'     // Represents invalid code points.
    12		MaxASCII        = '\u007F'     // maximum ASCII value.
    13		MaxLatin1       = '\u00FF'     // maximum Latin-1 value.
    14	)
    15	
    16	// RangeTable defines a set of Unicode code points by listing the ranges of
    17	// code points within the set. The ranges are listed in two slices
    18	// to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
    19	// The two slices must be in sorted order and non-overlapping.
    20	// Also, R32 should contain only values >= 0x10000 (1<<16).
    21	type RangeTable struct {
    22		R16 []Range16
    23		R32 []Range32
    24	}
    25	
    26	// Range16 represents of a range of 16-bit Unicode code points.  The range runs from Lo to Hi
    27	// inclusive and has the specified stride.
    28	type Range16 struct {
    29		Lo     uint16
    30		Hi     uint16
    31		Stride uint16
    32	}
    33	
    34	// Range32 represents of a range of Unicode code points and is used when one or
    35	// more of the values will not fit in 16 bits.  The range runs from Lo to Hi
    36	// inclusive and has the specified stride. Lo and Hi must always be >= 1<<16.
    37	type Range32 struct {
    38		Lo     uint32
    39		Hi     uint32
    40		Stride uint32
    41	}
    42	
    43	// CaseRange represents a range of Unicode code points for simple (one
    44	// code point to one code point) case conversion.
    45	// The range runs from Lo to Hi inclusive, with a fixed stride of 1.  Deltas
    46	// are the number to add to the code point to reach the code point for a
    47	// different case for that character.  They may be negative.  If zero, it
    48	// means the character is in the corresponding case. There is a special
    49	// case representing sequences of alternating corresponding Upper and Lower
    50	// pairs.  It appears with a fixed Delta of
    51	//	{UpperLower, UpperLower, UpperLower}
    52	// The constant UpperLower has an otherwise impossible delta value.
    53	type CaseRange struct {
    54		Lo    uint32
    55		Hi    uint32
    56		Delta d
    57	}
    58	
    59	// SpecialCase represents language-specific case mappings such as Turkish.
    60	// Methods of SpecialCase customize (by overriding) the standard mappings.
    61	type SpecialCase []CaseRange
    62	
    63	// BUG(r): There is no mechanism for full case folding, that is, for
    64	// characters that involve multiple runes in the input or output.
    65	
    66	// Indices into the Delta arrays inside CaseRanges for case mapping.
    67	const (
    68		UpperCase = iota
    69		LowerCase
    70		TitleCase
    71		MaxCase
    72	)
    73	
    74	type d [MaxCase]rune // to make the CaseRanges text shorter
    75	
    76	// If the Delta field of a CaseRange is UpperLower or LowerUpper, it means
    77	// this CaseRange represents a sequence of the form (say)
    78	// Upper Lower Upper Lower.
    79	const (
    80		UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
    81	)
    82	
    83	// is16 uses binary search to test whether rune is in the specified slice of 16-bit ranges.
    84	func is16(ranges []Range16, r uint16) bool {
    85		// binary search over ranges
    86		lo := 0
    87		hi := len(ranges)
    88		for lo < hi {
    89			m := lo + (hi-lo)/2
    90			range_ := ranges[m]
    91			if range_.Lo <= r && r <= range_.Hi {
    92				return (r-range_.Lo)%range_.Stride == 0
    93			}
    94			if r < range_.Lo {
    95				hi = m
    96			} else {
    97				lo = m + 1
    98			}
    99		}
   100		return false
   101	}
   102	
   103	// is32 uses binary search to test whether rune is in the specified slice of 32-bit ranges.
   104	func is32(ranges []Range32, r uint32) bool {
   105		// binary search over ranges
   106		lo := 0
   107		hi := len(ranges)
   108		for lo < hi {
   109			m := lo + (hi-lo)/2
   110			range_ := ranges[m]
   111			if range_.Lo <= r && r <= range_.Hi {
   112				return (r-range_.Lo)%range_.Stride == 0
   113			}
   114			if r < range_.Lo {
   115				hi = m
   116			} else {
   117				lo = m + 1
   118			}
   119		}
   120		return false
   121	}
   122	
   123	// Is tests whether rune is in the specified table of ranges.
   124	func Is(rangeTab *RangeTable, r rune) bool {
   125		// common case: rune is ASCII or Latin-1.
   126		if uint32(r) <= MaxLatin1 {
   127			// Only need to check R16, since R32 is always >= 1<<16.
   128			r16 := uint16(r)
   129			for _, r := range rangeTab.R16 {
   130				if r16 > r.Hi {
   131					continue
   132				}
   133				if r16 < r.Lo {
   134					return false
   135				}
   136				return (r16-r.Lo)%r.Stride == 0
   137			}
   138			return false
   139		}
   140		r16 := rangeTab.R16
   141		if len(r16) > 0 && r <= rune(r16[len(r16)-1].Hi) {
   142			return is16(r16, uint16(r))
   143		}
   144		r32 := rangeTab.R32
   145		if len(r32) > 0 && r >= rune(r32[0].Lo) {
   146			return is32(r32, uint32(r))
   147		}
   148		return false
   149	}
   150	
   151	// IsUpper reports whether the rune is an upper case letter.
   152	func IsUpper(r rune) bool {
   153		// See comment in IsGraphic.
   154		if uint32(r) <= MaxLatin1 {
   155			return properties[uint8(r)]&pLu != 0
   156		}
   157		return Is(Upper, r)
   158	}
   159	
   160	// IsLower reports whether the rune is a lower case letter.
   161	func IsLower(r rune) bool {
   162		// See comment in IsGraphic.
   163		if uint32(r) <= MaxLatin1 {
   164			return properties[uint8(r)]&pLl != 0
   165		}
   166		return Is(Lower, r)
   167	}
   168	
   169	// IsTitle reports whether the rune is a title case letter.
   170	func IsTitle(r rune) bool {
   171		if r <= MaxLatin1 {
   172			return false
   173		}
   174		return Is(Title, r)
   175	}
   176	
   177	// to maps the rune using the specified case mapping.
   178	func to(_case int, r rune, caseRange []CaseRange) rune {
   179		if _case < 0 || MaxCase <= _case {
   180			return ReplacementChar // as reasonable an error as any
   181		}
   182		// binary search over ranges
   183		lo := 0
   184		hi := len(caseRange)
   185		for lo < hi {
   186			m := lo + (hi-lo)/2
   187			cr := caseRange[m]
   188			if rune(cr.Lo) <= r && r <= rune(cr.Hi) {
   189				delta := rune(cr.Delta[_case])
   190				if delta > MaxRune {
   191					// In an Upper-Lower sequence, which always starts with
   192					// an UpperCase letter, the real deltas always look like:
   193					//	{0, 1, 0}    UpperCase (Lower is next)
   194					//	{-1, 0, -1}  LowerCase (Upper, Title are previous)
   195					// The characters at even offsets from the beginning of the
   196					// sequence are upper case; the ones at odd offsets are lower.
   197					// The correct mapping can be done by clearing or setting the low
   198					// bit in the sequence offset.
   199					// The constants UpperCase and TitleCase are even while LowerCase
   200					// is odd so we take the low bit from _case.
   201					return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1))
   202				}
   203				return r + delta
   204			}
   205			if r < rune(cr.Lo) {
   206				hi = m
   207			} else {
   208				lo = m + 1
   209			}
   210		}
   211		return r
   212	}
   213	
   214	// To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
   215	func To(_case int, r rune) rune {
   216		return to(_case, r, CaseRanges)
   217	}
   218	
   219	// ToUpper maps the rune to upper case.
   220	func ToUpper(r rune) rune {
   221		if r <= MaxASCII {
   222			if 'a' <= r && r <= 'z' {
   223				r -= 'a' - 'A'
   224			}
   225			return r
   226		}
   227		return To(UpperCase, r)
   228	}
   229	
   230	// ToLower maps the rune to lower case.
   231	func ToLower(r rune) rune {
   232		if r <= MaxASCII {
   233			if 'A' <= r && r <= 'Z' {
   234				r += 'a' - 'A'
   235			}
   236			return r
   237		}
   238		return To(LowerCase, r)
   239	}
   240	
   241	// ToTitle maps the rune to title case.
   242	func ToTitle(r rune) rune {
   243		if r <= MaxASCII {
   244			if 'a' <= r && r <= 'z' { // title case is upper case for ASCII
   245				r -= 'a' - 'A'
   246			}
   247			return r
   248		}
   249		return To(TitleCase, r)
   250	}
   251	
   252	// ToUpper maps the rune to upper case giving priority to the special mapping.
   253	func (special SpecialCase) ToUpper(r rune) rune {
   254		r1 := to(UpperCase, r, []CaseRange(special))
   255		if r1 == r {
   256			r1 = ToUpper(r)
   257		}
   258		return r1
   259	}
   260	
   261	// ToTitle maps the rune to title case giving priority to the special mapping.
   262	func (special SpecialCase) ToTitle(r rune) rune {
   263		r1 := to(TitleCase, r, []CaseRange(special))
   264		if r1 == r {
   265			r1 = ToTitle(r)
   266		}
   267		return r1
   268	}
   269	
   270	// ToLower maps the rune to lower case giving priority to the special mapping.
   271	func (special SpecialCase) ToLower(r rune) rune {
   272		r1 := to(LowerCase, r, []CaseRange(special))
   273		if r1 == r {
   274			r1 = ToLower(r)
   275		}
   276		return r1
   277	}
   278	
   279	// caseOrbit is defined in tables.go as []foldPair.  Right now all the
   280	// entries fit in uint16, so use uint16.  If that changes, compilation
   281	// will fail (the constants in the composite literal will not fit in uint16)
   282	// and the types here can change to uint32.
   283	type foldPair struct {
   284		From uint16
   285		To   uint16
   286	}
   287	
   288	// SimpleFold iterates over Unicode code points equivalent under
   289	// the Unicode-defined simple case folding.  Among the code points
   290	// equivalent to rune (including rune itself), SimpleFold returns the
   291	// smallest rune >= r if one exists, or else the smallest rune >= 0. 
   292	//
   293	// For example:
   294	//	SimpleFold('A') = 'a'
   295	//	SimpleFold('a') = 'A'
   296	//
   297	//	SimpleFold('K') = 'k'
   298	//	SimpleFold('k') = '\u212A' (Kelvin symbol, K)
   299	//	SimpleFold('\u212A') = 'K'
   300	//
   301	//	SimpleFold('1') = '1'
   302	//
   303	func SimpleFold(r rune) rune {
   304		// Consult caseOrbit table for special cases.
   305		lo := 0
   306		hi := len(caseOrbit)
   307		for lo < hi {
   308			m := lo + (hi-lo)/2
   309			if rune(caseOrbit[m].From) < r {
   310				lo = m + 1
   311			} else {
   312				hi = m
   313			}
   314		}
   315		if lo < len(caseOrbit) && rune(caseOrbit[lo].From) == r {
   316			return rune(caseOrbit[lo].To)
   317		}
   318	
   319		// No folding specified.  This is a one- or two-element
   320		// equivalence class containing rune and ToLower(rune)
   321		// and ToUpper(rune) if they are different from rune.
   322		if l := ToLower(r); l != r {
   323			return l
   324		}
   325		return ToUpper(r)
   326	}