src/pkg/html/escape.go - The Go Programming Language

Golang

Source file src/pkg/html/escape.go

     1	// Copyright 2010 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package html provides functions for escaping and unescaping HTML text.
     6	package html
     7	
     8	import (
     9		"bytes"
    10		"strings"
    11		"unicode/utf8"
    12	)
    13	
// writer is the minimal output sink that escape needs: any type with a
// WriteString method. EscapeString passes a *bytes.Buffer.
type writer interface {
	WriteString(string) (int, error)
}
    17	
// These replacements permit compatibility with old numeric entities that
// assumed Windows-1252 encoding.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// The table covers the numeric values 0x80 through 0x9F; unescapeEntity
// indexes it with the parsed value minus 0x80.
var replacementTable = [...]rune{
	'\u20AC', // First entry is what 0x80 should be replaced with.
	'\u0081',
	'\u201A',
	'\u0192',
	'\u201E',
	'\u2026',
	'\u2020',
	'\u2021',
	'\u02C6',
	'\u2030',
	'\u0160',
	'\u2039',
	'\u0152',
	'\u008D',
	'\u017D',
	'\u008F',
	'\u0090',
	'\u2018',
	'\u2019',
	'\u201C',
	'\u201D',
	'\u2022',
	'\u2013',
	'\u2014',
	'\u02DC',
	'\u2122',
	'\u0161',
	'\u203A',
	'\u0153',
	'\u009D',
	'\u017E',
	'\u0178', // Last entry is 0x9F.
	// 0x00->'\uFFFD' is handled programmatically (in unescapeEntity).
	// 0x0D->'\u000D' is a no-op.
}
    57	
    58	// unescapeEntity reads an entity like "<" from b[src:] and writes the
    59	// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
    60	// Precondition: b[src] == '&' && dst <= src.
    61	// attribute should be true if parsing an attribute value.
    62	func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
    63		// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
    64	
    65		// i starts at 1 because we already know that s[0] == '&'.
    66		i, s := 1, b[src:]
    67	
    68		if len(s) <= 1 {
    69			b[dst] = b[src]
    70			return dst + 1, src + 1
    71		}
    72	
    73		if s[i] == '#' {
    74			if len(s) <= 3 { // We need to have at least "&#.".
    75				b[dst] = b[src]
    76				return dst + 1, src + 1
    77			}
    78			i++
    79			c := s[i]
    80			hex := false
    81			if c == 'x' || c == 'X' {
    82				hex = true
    83				i++
    84			}
    85	
    86			x := '\x00'
    87			for i < len(s) {
    88				c = s[i]
    89				i++
    90				if hex {
    91					if '0' <= c && c <= '9' {
    92						x = 16*x + rune(c) - '0'
    93						continue
    94					} else if 'a' <= c && c <= 'f' {
    95						x = 16*x + rune(c) - 'a' + 10
    96						continue
    97					} else if 'A' <= c && c <= 'F' {
    98						x = 16*x + rune(c) - 'A' + 10
    99						continue
   100					}
   101				} else if '0' <= c && c <= '9' {
   102					x = 10*x + rune(c) - '0'
   103					continue
   104				}
   105				if c != ';' {
   106					i--
   107				}
   108				break
   109			}
   110	
   111			if i <= 3 { // No characters matched.
   112				b[dst] = b[src]
   113				return dst + 1, src + 1
   114			}
   115	
   116			if 0x80 <= x && x <= 0x9F {
   117				// Replace characters from Windows-1252 with UTF-8 equivalents.
   118				x = replacementTable[x-0x80]
   119			} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
   120				// Replace invalid characters with the replacement character.
   121				x = '\uFFFD'
   122			}
   123	
   124			return dst + utf8.EncodeRune(b[dst:], x), src + i
   125		}
   126	
   127		// Consume the maximum number of characters possible, with the
   128		// consumed characters matching one of the named references.
   129	
   130		for i < len(s) {
   131			c := s[i]
   132			i++
   133			// Lower-cased characters are more common in entities, so we check for them first.
   134			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
   135				continue
   136			}
   137			if c != ';' {
   138				i--
   139			}
   140			break
   141		}
   142	
   143		entityName := string(s[1:i])
   144		if entityName == "" {
   145			// No-op.
   146		} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
   147			// No-op.
   148		} else if x := entity[entityName]; x != 0 {
   149			return dst + utf8.EncodeRune(b[dst:], x), src + i
   150		} else if x := entity2[entityName]; x[0] != 0 {
   151			dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
   152			return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
   153		} else if !attribute {
   154			maxLen := len(entityName) - 1
   155			if maxLen > longestEntityWithoutSemicolon {
   156				maxLen = longestEntityWithoutSemicolon
   157			}
   158			for j := maxLen; j > 1; j-- {
   159				if x := entity[entityName[:j]]; x != 0 {
   160					return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
   161				}
   162			}
   163		}
   164	
   165		dst1, src1 = dst+i, src+i
   166		copy(b[dst:dst1], b[src:src1])
   167		return dst1, src1
   168	}
   169	
   170	// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
   171	func unescape(b []byte) []byte {
   172		for i, c := range b {
   173			if c == '&' {
   174				dst, src := unescapeEntity(b, i, i, false)
   175				for src < len(b) {
   176					c := b[src]
   177					if c == '&' {
   178						dst, src = unescapeEntity(b, dst, src, false)
   179					} else {
   180						b[dst] = c
   181						dst, src = dst+1, src+1
   182					}
   183				}
   184				return b[0:dst]
   185			}
   186		}
   187		return b
   188	}
   189	
   190	// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
   191	func lower(b []byte) []byte {
   192		for i, c := range b {
   193			if 'A' <= c && c <= 'Z' {
   194				b[i] = c + 'a' - 'A'
   195			}
   196		}
   197		return b
   198	}
   199	
// escapedChars is the set of characters that escape replaces. It is passed to
// strings.IndexAny to find the next character that needs replacing.
const escapedChars = `&'<>"`
   201	
   202	func escape(w writer, s string) error {
   203		i := strings.IndexAny(s, escapedChars)
   204		for i != -1 {
   205			if _, err := w.WriteString(s[:i]); err != nil {
   206				return err
   207			}
   208			var esc string
   209			switch s[i] {
   210			case '&':
   211				esc = "&amp;"
   212			case '\'':
   213				// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
   214				esc = "&#39;"
   215			case '<':
   216				esc = "&lt;"
   217			case '>':
   218				esc = "&gt;"
   219			case '"':
   220				// "&#34;" is shorter than "&quot;".
   221				esc = "&#34;"
   222			default:
   223				panic("unrecognized escape character")
   224			}
   225			s = s[i+1:]
   226			if _, err := w.WriteString(esc); err != nil {
   227				return err
   228			}
   229			i = strings.IndexAny(s, escapedChars)
   230		}
   231		_, err := w.WriteString(s)
   232		return err
   233	}
   234	
   235	// EscapeString escapes special characters like "<" to become "&lt;". It
   236	// escapes only five such characters: <, >, &, ' and ".
   237	// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   238	// always true.
   239	func EscapeString(s string) string {
   240		if strings.IndexAny(s, escapedChars) == -1 {
   241			return s
   242		}
   243		var buf bytes.Buffer
   244		escape(&buf, s)
   245		return buf.String()
   246	}
   247	
   248	// UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
   249	// larger range of entities than EscapeString escapes. For example, "&aacute;"
   250	// unescapes to "รก", as does "&#225;" and "&xE1;".
   251	// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   252	// always true.
   253	func UnescapeString(s string) string {
   254		for _, c := range s {
   255			if c == '&' {
   256				return string(unescape([]byte(s)))
   257			}
   258		}
   259		return s
   260	}