src/pkg/encoding/xml/xml.go - The Go Programming Language

Golang

Source file src/pkg/encoding/xml/xml.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package xml implements a simple XML 1.0 parser that
     6	// understands XML name spaces.
     7	package xml
     8	
     9	// References:
    10	//    Annotated XML spec: http://www.xml.com/axml/testaxml.htm
    11	//    XML name spaces: http://www.w3.org/TR/REC-xml-names/
    12	
    13	// TODO(rsc):
    14	//	Test error handling.
    15	
    16	import (
    17		"bufio"
    18		"bytes"
    19		"fmt"
    20		"io"
    21		"strconv"
    22		"strings"
    23		"unicode"
    24		"unicode/utf8"
    25	)
    26	
    27	// A SyntaxError represents a syntax error in the XML input stream.
    28	type SyntaxError struct {
    29		Msg  string
    30		Line int
    31	}
    32	
    33	func (e *SyntaxError) Error() string {
    34		return "XML syntax error on line " + strconv.Itoa(e.Line) + ": " + e.Msg
    35	}
    36	
    37	// A Name represents an XML name (Local) annotated
    38	// with a name space identifier (Space).
    39	// In tokens returned by Decoder.Token, the Space identifier
    40	// is given as a canonical URL, not the short prefix used
    41	// in the document being parsed.
    42	type Name struct {
    43		Space, Local string
    44	}
    45	
    46	// An Attr represents an attribute in an XML element (Name=Value).
    47	type Attr struct {
    48		Name  Name
    49		Value string
    50	}
    51	
    52	// A Token is an interface holding one of the token types:
    53	// StartElement, EndElement, CharData, Comment, ProcInst, or Directive.
    54	type Token interface{}
    55	
    56	// A StartElement represents an XML start element.
    57	type StartElement struct {
    58		Name Name
    59		Attr []Attr
    60	}
    61	
    62	func (e StartElement) Copy() StartElement {
    63		attrs := make([]Attr, len(e.Attr))
    64		copy(attrs, e.Attr)
    65		e.Attr = attrs
    66		return e
    67	}
    68	
    69	// An EndElement represents an XML end element.
    70	type EndElement struct {
    71		Name Name
    72	}
    73	
    74	// A CharData represents XML character data (raw text),
    75	// in which XML escape sequences have been replaced by
    76	// the characters they represent.
    77	type CharData []byte
    78	
    79	func makeCopy(b []byte) []byte {
    80		b1 := make([]byte, len(b))
    81		copy(b1, b)
    82		return b1
    83	}
    84	
    85	func (c CharData) Copy() CharData { return CharData(makeCopy(c)) }
    86	
    87	// A Comment represents an XML comment of the form <!--comment-->.
    88	// The bytes do not include the <!-- and --> comment markers.
    89	type Comment []byte
    90	
    91	func (c Comment) Copy() Comment { return Comment(makeCopy(c)) }
    92	
    93	// A ProcInst represents an XML processing instruction of the form <?target inst?>
    94	type ProcInst struct {
    95		Target string
    96		Inst   []byte
    97	}
    98	
    99	func (p ProcInst) Copy() ProcInst {
   100		p.Inst = makeCopy(p.Inst)
   101		return p
   102	}
   103	
   104	// A Directive represents an XML directive of the form <!text>.
   105	// The bytes do not include the <! and > markers.
   106	type Directive []byte
   107	
   108	func (d Directive) Copy() Directive { return Directive(makeCopy(d)) }
   109	
   110	// CopyToken returns a copy of a Token.
   111	func CopyToken(t Token) Token {
   112		switch v := t.(type) {
   113		case CharData:
   114			return v.Copy()
   115		case Comment:
   116			return v.Copy()
   117		case Directive:
   118			return v.Copy()
   119		case ProcInst:
   120			return v.Copy()
   121		case StartElement:
   122			return v.Copy()
   123		}
   124		return t
   125	}
   126	
   127	// A Decoder represents an XML parser reading a particular input stream.
   128	// The parser assumes that its input is encoded in UTF-8.
   129	type Decoder struct {
   130		// Strict defaults to true, enforcing the requirements
   131		// of the XML specification.
   132		// If set to false, the parser allows input containing common
   133		// mistakes:
   134		//	* If an element is missing an end tag, the parser invents
   135		//	  end tags as necessary to keep the return values from Token
   136		//	  properly balanced.
   137		//	* In attribute values and character data, unknown or malformed
   138		//	  character entities (sequences beginning with &) are left alone.
   139		//
   140		// Setting:
   141		//
   142		//	d.Strict = false;
   143		//	d.AutoClose = HTMLAutoClose;
   144		//	d.Entity = HTMLEntity
   145		//
   146		// creates a parser that can handle typical HTML.
   147		Strict bool
   148	
   149		// When Strict == false, AutoClose indicates a set of elements to
   150		// consider closed immediately after they are opened, regardless
   151		// of whether an end element is present.
   152		AutoClose []string
   153	
   154		// Entity can be used to map non-standard entity names to string replacements.
   155		// The parser behaves as if these standard mappings are present in the map,
   156		// regardless of the actual map content:
   157		//
   158		//	"lt": "<",
   159		//	"gt": ">",
   160		//	"amp": "&",
   161		//	"apos": "'",
   162		//	"quot": `"`,
   163		Entity map[string]string
   164	
   165		// CharsetReader, if non-nil, defines a function to generate
   166		// charset-conversion readers, converting from the provided
   167		// non-UTF-8 charset into UTF-8. If CharsetReader is nil or
   168		// returns an error, parsing stops with an error. One of the
   169		// the CharsetReader's result values must be non-nil.
   170		CharsetReader func(charset string, input io.Reader) (io.Reader, error)
   171	
   172		r         io.ByteReader
   173		buf       bytes.Buffer
   174		saved     *bytes.Buffer
   175		stk       *stack
   176		free      *stack
   177		needClose bool
   178		toClose   Name
   179		nextToken Token
   180		nextByte  int
   181		ns        map[string]string
   182		err       error
   183		line      int
   184		tmp       [32]byte
   185	}
   186	
   187	// NewDecoder creates a new XML parser reading from r.
   188	func NewDecoder(r io.Reader) *Decoder {
   189		d := &Decoder{
   190			ns:       make(map[string]string),
   191			nextByte: -1,
   192			line:     1,
   193			Strict:   true,
   194		}
   195		d.switchToReader(r)
   196		return d
   197	}
   198	
   199	// Token returns the next XML token in the input stream.
   200	// At the end of the input stream, Token returns nil, io.EOF.
   201	//
   202	// Slices of bytes in the returned token data refer to the
   203	// parser's internal buffer and remain valid only until the next
   204	// call to Token.  To acquire a copy of the bytes, call CopyToken
   205	// or the token's Copy method.
   206	//
   207	// Token expands self-closing elements such as <br/>
   208	// into separate start and end elements returned by successive calls.
   209	//
   210	// Token guarantees that the StartElement and EndElement
   211	// tokens it returns are properly nested and matched:
   212	// if Token encounters an unexpected end element,
   213	// it will return an error.
   214	//
   215	// Token implements XML name spaces as described by
   216	// http://www.w3.org/TR/REC-xml-names/.  Each of the
   217	// Name structures contained in the Token has the Space
   218	// set to the URL identifying its name space when known.
   219	// If Token encounters an unrecognized name space prefix,
   220	// it uses the prefix as the Space rather than report an error.
   221	func (d *Decoder) Token() (t Token, err error) {
   222		if d.nextToken != nil {
   223			t = d.nextToken
   224			d.nextToken = nil
   225		} else if t, err = d.RawToken(); err != nil {
   226			return
   227		}
   228	
   229		if !d.Strict {
   230			if t1, ok := d.autoClose(t); ok {
   231				d.nextToken = t
   232				t = t1
   233			}
   234		}
   235		switch t1 := t.(type) {
   236		case StartElement:
   237			// In XML name spaces, the translations listed in the
   238			// attributes apply to the element name and
   239			// to the other attribute names, so process
   240			// the translations first.
   241			for _, a := range t1.Attr {
   242				if a.Name.Space == "xmlns" {
   243					v, ok := d.ns[a.Name.Local]
   244					d.pushNs(a.Name.Local, v, ok)
   245					d.ns[a.Name.Local] = a.Value
   246				}
   247				if a.Name.Space == "" && a.Name.Local == "xmlns" {
   248					// Default space for untagged names
   249					v, ok := d.ns[""]
   250					d.pushNs("", v, ok)
   251					d.ns[""] = a.Value
   252				}
   253			}
   254	
   255			d.translate(&t1.Name, true)
   256			for i := range t1.Attr {
   257				d.translate(&t1.Attr[i].Name, false)
   258			}
   259			d.pushElement(t1.Name)
   260			t = t1
   261	
   262		case EndElement:
   263			d.translate(&t1.Name, true)
   264			if !d.popElement(&t1) {
   265				return nil, d.err
   266			}
   267			t = t1
   268		}
   269		return
   270	}
   271	
   272	// Apply name space translation to name n.
   273	// The default name space (for Space=="")
   274	// applies only to element names, not to attribute names.
   275	func (d *Decoder) translate(n *Name, isElementName bool) {
   276		switch {
   277		case n.Space == "xmlns":
   278			return
   279		case n.Space == "" && !isElementName:
   280			return
   281		case n.Space == "" && n.Local == "xmlns":
   282			return
   283		}
   284		if v, ok := d.ns[n.Space]; ok {
   285			n.Space = v
   286		}
   287	}
   288	
   289	func (d *Decoder) switchToReader(r io.Reader) {
   290		// Get efficient byte at a time reader.
   291		// Assume that if reader has its own
   292		// ReadByte, it's efficient enough.
   293		// Otherwise, use bufio.
   294		if rb, ok := r.(io.ByteReader); ok {
   295			d.r = rb
   296		} else {
   297			d.r = bufio.NewReader(r)
   298		}
   299	}
   300	
   301	// Parsing state - stack holds old name space translations
   302	// and the current set of open elements.  The translations to pop when
   303	// ending a given tag are *below* it on the stack, which is
   304	// more work but forced on us by XML.
   305	type stack struct {
   306		next *stack
   307		kind int
   308		name Name
   309		ok   bool
   310	}
   311	
   312	const (
   313		stkStart = iota
   314		stkNs
   315	)
   316	
   317	func (d *Decoder) push(kind int) *stack {
   318		s := d.free
   319		if s != nil {
   320			d.free = s.next
   321		} else {
   322			s = new(stack)
   323		}
   324		s.next = d.stk
   325		s.kind = kind
   326		d.stk = s
   327		return s
   328	}
   329	
   330	func (d *Decoder) pop() *stack {
   331		s := d.stk
   332		if s != nil {
   333			d.stk = s.next
   334			s.next = d.free
   335			d.free = s
   336		}
   337		return s
   338	}
   339	
   340	// Record that we are starting an element with the given name.
   341	func (d *Decoder) pushElement(name Name) {
   342		s := d.push(stkStart)
   343		s.name = name
   344	}
   345	
   346	// Record that we are changing the value of ns[local].
   347	// The old value is url, ok.
   348	func (d *Decoder) pushNs(local string, url string, ok bool) {
   349		s := d.push(stkNs)
   350		s.name.Local = local
   351		s.name.Space = url
   352		s.ok = ok
   353	}
   354	
   355	// Creates a SyntaxError with the current line number.
   356	func (d *Decoder) syntaxError(msg string) error {
   357		return &SyntaxError{Msg: msg, Line: d.line}
   358	}
   359	
   360	// Record that we are ending an element with the given name.
   361	// The name must match the record at the top of the stack,
   362	// which must be a pushElement record.
   363	// After popping the element, apply any undo records from
   364	// the stack to restore the name translations that existed
   365	// before we saw this element.
   366	func (d *Decoder) popElement(t *EndElement) bool {
   367		s := d.pop()
   368		name := t.Name
   369		switch {
   370		case s == nil || s.kind != stkStart:
   371			d.err = d.syntaxError("unexpected end element </" + name.Local + ">")
   372			return false
   373		case s.name.Local != name.Local:
   374			if !d.Strict {
   375				d.needClose = true
   376				d.toClose = t.Name
   377				t.Name = s.name
   378				return true
   379			}
   380			d.err = d.syntaxError("element <" + s.name.Local + "> closed by </" + name.Local + ">")
   381			return false
   382		case s.name.Space != name.Space:
   383			d.err = d.syntaxError("element <" + s.name.Local + "> in space " + s.name.Space +
   384				"closed by </" + name.Local + "> in space " + name.Space)
   385			return false
   386		}
   387	
   388		// Pop stack until a Start is on the top, undoing the
   389		// translations that were associated with the element we just closed.
   390		for d.stk != nil && d.stk.kind != stkStart {
   391			s := d.pop()
   392			if s.ok {
   393				d.ns[s.name.Local] = s.name.Space
   394			} else {
   395				delete(d.ns, s.name.Local)
   396			}
   397		}
   398	
   399		return true
   400	}
   401	
   402	// If the top element on the stack is autoclosing and
   403	// t is not the end tag, invent the end tag.
   404	func (d *Decoder) autoClose(t Token) (Token, bool) {
   405		if d.stk == nil || d.stk.kind != stkStart {
   406			return nil, false
   407		}
   408		name := strings.ToLower(d.stk.name.Local)
   409		for _, s := range d.AutoClose {
   410			if strings.ToLower(s) == name {
   411				// This one should be auto closed if t doesn't close it.
   412				et, ok := t.(EndElement)
   413				if !ok || et.Name.Local != name {
   414					return EndElement{d.stk.name}, true
   415				}
   416				break
   417			}
   418		}
   419		return nil, false
   420	}
   421	
   422	// RawToken is like Token but does not verify that
   423	// start and end elements match and does not translate
   424	// name space prefixes to their corresponding URLs.
   425	func (d *Decoder) RawToken() (Token, error) {
   426		if d.err != nil {
   427			return nil, d.err
   428		}
   429		if d.needClose {
   430			// The last element we read was self-closing and
   431			// we returned just the StartElement half.
   432			// Return the EndElement half now.
   433			d.needClose = false
   434			return EndElement{d.toClose}, nil
   435		}
   436	
   437		b, ok := d.getc()
   438		if !ok {
   439			return nil, d.err
   440		}
   441	
   442		if b != '<' {
   443			// Text section.
   444			d.ungetc(b)
   445			data := d.text(-1, false)
   446			if data == nil {
   447				return nil, d.err
   448			}
   449			return CharData(data), nil
   450		}
   451	
   452		if b, ok = d.mustgetc(); !ok {
   453			return nil, d.err
   454		}
   455		switch b {
   456		case '/':
   457			// </: End element
   458			var name Name
   459			if name, ok = d.nsname(); !ok {
   460				if d.err == nil {
   461					d.err = d.syntaxError("expected element name after </")
   462				}
   463				return nil, d.err
   464			}
   465			d.space()
   466			if b, ok = d.mustgetc(); !ok {
   467				return nil, d.err
   468			}
   469			if b != '>' {
   470				d.err = d.syntaxError("invalid characters between </" + name.Local + " and >")
   471				return nil, d.err
   472			}
   473			return EndElement{name}, nil
   474	
   475		case '?':
   476			// <?: Processing instruction.
   477			// TODO(rsc): Should parse the <?xml declaration to make sure
   478			// the version is 1.0 and the encoding is UTF-8.
   479			var target string
   480			if target, ok = d.name(); !ok {
   481				if d.err == nil {
   482					d.err = d.syntaxError("expected target name after <?")
   483				}
   484				return nil, d.err
   485			}
   486			d.space()
   487			d.buf.Reset()
   488			var b0 byte
   489			for {
   490				if b, ok = d.mustgetc(); !ok {
   491					return nil, d.err
   492				}
   493				d.buf.WriteByte(b)
   494				if b0 == '?' && b == '>' {
   495					break
   496				}
   497				b0 = b
   498			}
   499			data := d.buf.Bytes()
   500			data = data[0 : len(data)-2] // chop ?>
   501	
   502			if target == "xml" {
   503				enc := procInstEncoding(string(data))
   504				if enc != "" && enc != "utf-8" && enc != "UTF-8" {
   505					if d.CharsetReader == nil {
   506						d.err = fmt.Errorf("xml: encoding %q declared but Decoder.CharsetReader is nil", enc)
   507						return nil, d.err
   508					}
   509					newr, err := d.CharsetReader(enc, d.r.(io.Reader))
   510					if err != nil {
   511						d.err = fmt.Errorf("xml: opening charset %q: %v", enc, err)
   512						return nil, d.err
   513					}
   514					if newr == nil {
   515						panic("CharsetReader returned a nil Reader for charset " + enc)
   516					}
   517					d.switchToReader(newr)
   518				}
   519			}
   520			return ProcInst{target, data}, nil
   521	
   522		case '!':
   523			// <!: Maybe comment, maybe CDATA.
   524			if b, ok = d.mustgetc(); !ok {
   525				return nil, d.err
   526			}
   527			switch b {
   528			case '-': // <!-
   529				// Probably <!-- for a comment.
   530				if b, ok = d.mustgetc(); !ok {
   531					return nil, d.err
   532				}
   533				if b != '-' {
   534					d.err = d.syntaxError("invalid sequence <!- not part of <!--")
   535					return nil, d.err
   536				}
   537				// Look for terminator.
   538				d.buf.Reset()
   539				var b0, b1 byte
   540				for {
   541					if b, ok = d.mustgetc(); !ok {
   542						return nil, d.err
   543					}
   544					d.buf.WriteByte(b)
   545					if b0 == '-' && b1 == '-' && b == '>' {
   546						break
   547					}
   548					b0, b1 = b1, b
   549				}
   550				data := d.buf.Bytes()
   551				data = data[0 : len(data)-3] // chop -->
   552				return Comment(data), nil
   553	
   554			case '[': // <![
   555				// Probably <![CDATA[.
   556				for i := 0; i < 6; i++ {
   557					if b, ok = d.mustgetc(); !ok {
   558						return nil, d.err
   559					}
   560					if b != "CDATA["[i] {
   561						d.err = d.syntaxError("invalid <![ sequence")
   562						return nil, d.err
   563					}
   564				}
   565				// Have <![CDATA[.  Read text until ]]>.
   566				data := d.text(-1, true)
   567				if data == nil {
   568					return nil, d.err
   569				}
   570				return CharData(data), nil
   571			}
   572	
   573			// Probably a directive: <!DOCTYPE ...>, <!ENTITY ...>, etc.
   574			// We don't care, but accumulate for caller. Quoted angle
   575			// brackets do not count for nesting.
   576			d.buf.Reset()
   577			d.buf.WriteByte(b)
   578			inquote := uint8(0)
   579			depth := 0
   580			for {
   581				if b, ok = d.mustgetc(); !ok {
   582					return nil, d.err
   583				}
   584				if inquote == 0 && b == '>' && depth == 0 {
   585					break
   586				}
   587				d.buf.WriteByte(b)
   588				switch {
   589				case b == inquote:
   590					inquote = 0
   591	
   592				case inquote != 0:
   593					// in quotes, no special action
   594	
   595				case b == '\'' || b == '"':
   596					inquote = b
   597	
   598				case b == '>' && inquote == 0:
   599					depth--
   600	
   601				case b == '<' && inquote == 0:
   602					depth++
   603				}
   604			}
   605			return Directive(d.buf.Bytes()), nil
   606		}
   607	
   608		// Must be an open element like <a href="foo">
   609		d.ungetc(b)
   610	
   611		var (
   612			name  Name
   613			empty bool
   614			attr  []Attr
   615		)
   616		if name, ok = d.nsname(); !ok {
   617			if d.err == nil {
   618				d.err = d.syntaxError("expected element name after <")
   619			}
   620			return nil, d.err
   621		}
   622	
   623		attr = make([]Attr, 0, 4)
   624		for {
   625			d.space()
   626			if b, ok = d.mustgetc(); !ok {
   627				return nil, d.err
   628			}
   629			if b == '/' {
   630				empty = true
   631				if b, ok = d.mustgetc(); !ok {
   632					return nil, d.err
   633				}
   634				if b != '>' {
   635					d.err = d.syntaxError("expected /> in element")
   636					return nil, d.err
   637				}
   638				break
   639			}
   640			if b == '>' {
   641				break
   642			}
   643			d.ungetc(b)
   644	
   645			n := len(attr)
   646			if n >= cap(attr) {
   647				nattr := make([]Attr, n, 2*cap(attr))
   648				copy(nattr, attr)
   649				attr = nattr
   650			}
   651			attr = attr[0 : n+1]
   652			a := &attr[n]
   653			if a.Name, ok = d.nsname(); !ok {
   654				if d.err == nil {
   655					d.err = d.syntaxError("expected attribute name in element")
   656				}
   657				return nil, d.err
   658			}
   659			d.space()
   660			if b, ok = d.mustgetc(); !ok {
   661				return nil, d.err
   662			}
   663			if b != '=' {
   664				if d.Strict {
   665					d.err = d.syntaxError("attribute name without = in element")
   666					return nil, d.err
   667				} else {
   668					d.ungetc(b)
   669					a.Value = a.Name.Local
   670				}
   671			} else {
   672				d.space()
   673				data := d.attrval()
   674				if data == nil {
   675					return nil, d.err
   676				}
   677				a.Value = string(data)
   678			}
   679		}
   680		if empty {
   681			d.needClose = true
   682			d.toClose = name
   683		}
   684		return StartElement{name, attr}, nil
   685	}
   686	
   687	func (d *Decoder) attrval() []byte {
   688		b, ok := d.mustgetc()
   689		if !ok {
   690			return nil
   691		}
   692		// Handle quoted attribute values
   693		if b == '"' || b == '\'' {
   694			return d.text(int(b), false)
   695		}
   696		// Handle unquoted attribute values for strict parsers
   697		if d.Strict {
   698			d.err = d.syntaxError("unquoted or missing attribute value in element")
   699			return nil
   700		}
   701		// Handle unquoted attribute values for unstrict parsers
   702		d.ungetc(b)
   703		d.buf.Reset()
   704		for {
   705			b, ok = d.mustgetc()
   706			if !ok {
   707				return nil
   708			}
   709			// http://www.w3.org/TR/REC-html40/intro/sgmltut.html#h-3.2.2
   710			if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' ||
   711				'0' <= b && b <= '9' || b == '_' || b == ':' || b == '-' {
   712				d.buf.WriteByte(b)
   713			} else {
   714				d.ungetc(b)
   715				break
   716			}
   717		}
   718		return d.buf.Bytes()
   719	}
   720	
   721	// Skip spaces if any
   722	func (d *Decoder) space() {
   723		for {
   724			b, ok := d.getc()
   725			if !ok {
   726				return
   727			}
   728			switch b {
   729			case ' ', '\r', '\n', '\t':
   730			default:
   731				d.ungetc(b)
   732				return
   733			}
   734		}
   735	}
   736	
   737	// Read a single byte.
   738	// If there is no byte to read, return ok==false
   739	// and leave the error in d.err.
   740	// Maintain line number.
   741	func (d *Decoder) getc() (b byte, ok bool) {
   742		if d.err != nil {
   743			return 0, false
   744		}
   745		if d.nextByte >= 0 {
   746			b = byte(d.nextByte)
   747			d.nextByte = -1
   748		} else {
   749			b, d.err = d.r.ReadByte()
   750			if d.err != nil {
   751				return 0, false
   752			}
   753			if d.saved != nil {
   754				d.saved.WriteByte(b)
   755			}
   756		}
   757		if b == '\n' {
   758			d.line++
   759		}
   760		return b, true
   761	}
   762	
   763	// Return saved offset.
   764	// If we did ungetc (nextByte >= 0), have to back up one.
   765	func (d *Decoder) savedOffset() int {
   766		n := d.saved.Len()
   767		if d.nextByte >= 0 {
   768			n--
   769		}
   770		return n
   771	}
   772	
   773	// Must read a single byte.
   774	// If there is no byte to read,
   775	// set d.err to SyntaxError("unexpected EOF")
   776	// and return ok==false
   777	func (d *Decoder) mustgetc() (b byte, ok bool) {
   778		if b, ok = d.getc(); !ok {
   779			if d.err == io.EOF {
   780				d.err = d.syntaxError("unexpected EOF")
   781			}
   782		}
   783		return
   784	}
   785	
   786	// Unread a single byte.
   787	func (d *Decoder) ungetc(b byte) {
   788		if b == '\n' {
   789			d.line--
   790		}
   791		d.nextByte = int(b)
   792	}
   793	
   794	var entity = map[string]int{
   795		"lt":   '<',
   796		"gt":   '>',
   797		"amp":  '&',
   798		"apos": '\'',
   799		"quot": '"',
   800	}
   801	
   802	// Read plain text section (XML calls it character data).
   803	// If quote >= 0, we are in a quoted string and need to find the matching quote.
   804	// If cdata == true, we are in a <![CDATA[ section and need to find ]]>.
   805	// On failure return nil and leave the error in d.err.
   806	func (d *Decoder) text(quote int, cdata bool) []byte {
   807		var b0, b1 byte
   808		var trunc int
   809		d.buf.Reset()
   810	Input:
   811		for {
   812			b, ok := d.getc()
   813			if !ok {
   814				if cdata {
   815					if d.err == io.EOF {
   816						d.err = d.syntaxError("unexpected EOF in CDATA section")
   817					}
   818					return nil
   819				}
   820				break Input
   821			}
   822	
   823			// <![CDATA[ section ends with ]]>.
   824			// It is an error for ]]> to appear in ordinary text.
   825			if b0 == ']' && b1 == ']' && b == '>' {
   826				if cdata {
   827					trunc = 2
   828					break Input
   829				}
   830				d.err = d.syntaxError("unescaped ]]> not in CDATA section")
   831				return nil
   832			}
   833	
   834			// Stop reading text if we see a <.
   835			if b == '<' && !cdata {
   836				if quote >= 0 {
   837					d.err = d.syntaxError("unescaped < inside quoted string")
   838					return nil
   839				}
   840				d.ungetc('<')
   841				break Input
   842			}
   843			if quote >= 0 && b == byte(quote) {
   844				break Input
   845			}
   846			if b == '&' && !cdata {
   847				// Read escaped character expression up to semicolon.
   848				// XML in all its glory allows a document to define and use
   849				// its own character names with <!ENTITY ...> directives.
   850				// Parsers are required to recognize lt, gt, amp, apos, and quot
   851				// even if they have not been declared.  That's all we allow.
   852				var i int
   853				for i = 0; i < len(d.tmp); i++ {
   854					var ok bool
   855					d.tmp[i], ok = d.getc()
   856					if !ok {
   857						if d.err == io.EOF {
   858							d.err = d.syntaxError("unexpected EOF")
   859						}
   860						return nil
   861					}
   862					c := d.tmp[i]
   863					if c == ';' {
   864						break
   865					}
   866					if 'a' <= c && c <= 'z' ||
   867						'A' <= c && c <= 'Z' ||
   868						'0' <= c && c <= '9' ||
   869						c == '_' || c == '#' {
   870						continue
   871					}
   872					d.ungetc(c)
   873					break
   874				}
   875				s := string(d.tmp[0:i])
   876				if i >= len(d.tmp) {
   877					if !d.Strict {
   878						b0, b1 = 0, 0
   879						d.buf.WriteByte('&')
   880						d.buf.Write(d.tmp[0:i])
   881						continue Input
   882					}
   883					d.err = d.syntaxError("character entity expression &" + s + "... too long")
   884					return nil
   885				}
   886				var haveText bool
   887				var text string
   888				if i >= 2 && s[0] == '#' {
   889					var n uint64
   890					var err error
   891					if i >= 3 && s[1] == 'x' {
   892						n, err = strconv.ParseUint(s[2:], 16, 64)
   893					} else {
   894						n, err = strconv.ParseUint(s[1:], 10, 64)
   895					}
   896					if err == nil && n <= unicode.MaxRune {
   897						text = string(n)
   898						haveText = true
   899					}
   900				} else {
   901					if r, ok := entity[s]; ok {
   902						text = string(r)
   903						haveText = true
   904					} else if d.Entity != nil {
   905						text, haveText = d.Entity[s]
   906					}
   907				}
   908				if !haveText {
   909					if !d.Strict {
   910						b0, b1 = 0, 0
   911						d.buf.WriteByte('&')
   912						d.buf.Write(d.tmp[0:i])
   913						continue Input
   914					}
   915					d.err = d.syntaxError("invalid character entity &" + s + ";")
   916					return nil
   917				}
   918				d.buf.Write([]byte(text))
   919				b0, b1 = 0, 0
   920				continue Input
   921			}
   922			d.buf.WriteByte(b)
   923			b0, b1 = b1, b
   924		}
   925		data := d.buf.Bytes()
   926		data = data[0 : len(data)-trunc]
   927	
   928		// Inspect each rune for being a disallowed character.
   929		buf := data
   930		for len(buf) > 0 {
   931			r, size := utf8.DecodeRune(buf)
   932			if r == utf8.RuneError && size == 1 {
   933				d.err = d.syntaxError("invalid UTF-8")
   934				return nil
   935			}
   936			buf = buf[size:]
   937			if !isInCharacterRange(r) {
   938				d.err = d.syntaxError(fmt.Sprintf("illegal character code %U", r))
   939				return nil
   940			}
   941		}
   942	
   943		// Must rewrite \r and \r\n into \n.
   944		w := 0
   945		for r := 0; r < len(data); r++ {
   946			b := data[r]
   947			if b == '\r' {
   948				if r+1 < len(data) && data[r+1] == '\n' {
   949					continue
   950				}
   951				b = '\n'
   952			}
   953			data[w] = b
   954			w++
   955		}
   956		return data[0:w]
   957	}
   958	
   959	// Decide whether the given rune is in the XML Character Range, per
   960	// the Char production of http://www.xml.com/axml/testaxml.htm,
   961	// Section 2.2 Characters.
   962	func isInCharacterRange(r rune) (inrange bool) {
   963		return r == 0x09 ||
   964			r == 0x0A ||
   965			r == 0x0D ||
   966			r >= 0x20 && r <= 0xDF77 ||
   967			r >= 0xE000 && r <= 0xFFFD ||
   968			r >= 0x10000 && r <= 0x10FFFF
   969	}
   970	
   971	// Get name space name: name with a : stuck in the middle.
   972	// The part before the : is the name space identifier.
   973	func (d *Decoder) nsname() (name Name, ok bool) {
   974		s, ok := d.name()
   975		if !ok {
   976			return
   977		}
   978		i := strings.Index(s, ":")
   979		if i < 0 {
   980			name.Local = s
   981		} else {
   982			name.Space = s[0:i]
   983			name.Local = s[i+1:]
   984		}
   985		return name, true
   986	}
   987	
   988	// Get name: /first(first|second)*/
   989	// Do not set d.err if the name is missing (unless unexpected EOF is received):
   990	// let the caller provide better context.
   991	func (d *Decoder) name() (s string, ok bool) {
   992		var b byte
   993		if b, ok = d.mustgetc(); !ok {
   994			return
   995		}
   996	
   997		// As a first approximation, we gather the bytes [A-Za-z_:.-\x80-\xFF]*
   998		if b < utf8.RuneSelf && !isNameByte(b) {
   999			d.ungetc(b)
  1000			return "", false
  1001		}
  1002		d.buf.Reset()
  1003		d.buf.WriteByte(b)
  1004		for {
  1005			if b, ok = d.mustgetc(); !ok {
  1006				return
  1007			}
  1008			if b < utf8.RuneSelf && !isNameByte(b) {
  1009				d.ungetc(b)
  1010				break
  1011			}
  1012			d.buf.WriteByte(b)
  1013		}
  1014	
  1015		// Then we check the characters.
  1016		s = d.buf.String()
  1017		for i, c := range s {
  1018			if !unicode.Is(first, c) && (i == 0 || !unicode.Is(second, c)) {
  1019				d.err = d.syntaxError("invalid XML name: " + s)
  1020				return "", false
  1021			}
  1022		}
  1023		return s, true
  1024	}
  1025	
  1026	func isNameByte(c byte) bool {
  1027		return 'A' <= c && c <= 'Z' ||
  1028			'a' <= c && c <= 'z' ||
  1029			'0' <= c && c <= '9' ||
  1030			c == '_' || c == ':' || c == '.' || c == '-'
  1031	}
  1032	
  1033	// These tables were generated by cut and paste from Appendix B of
  1034	// the XML spec at http://www.xml.com/axml/testaxml.htm
  1035	// and then reformatting.  First corresponds to (Letter | '_' | ':')
  1036	// and second corresponds to NameChar.
  1037	
// first is the set of runes accepted as the first character of an
// XML name: (Letter | '_' | ':') from Appendix B of the XML spec.
// Every entry is {Lo, Hi, Stride}. An entry whose stride is not 1 stands
// for isolated code points rather than a contiguous range; for example
// {0x0ABD, 0x0AE0, 0x23} contains only 0x0ABD and 0x0AE0.
var first = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x003A, 0x003A, 1},
		{0x0041, 0x005A, 1},
		{0x005F, 0x005F, 1},
		{0x0061, 0x007A, 1},
		{0x00C0, 0x00D6, 1},
		{0x00D8, 0x00F6, 1},
		{0x00F8, 0x00FF, 1},
		{0x0100, 0x0131, 1},
		{0x0134, 0x013E, 1},
		{0x0141, 0x0148, 1},
		{0x014A, 0x017E, 1},
		{0x0180, 0x01C3, 1},
		{0x01CD, 0x01F0, 1},
		{0x01F4, 0x01F5, 1},
		{0x01FA, 0x0217, 1},
		{0x0250, 0x02A8, 1},
		{0x02BB, 0x02C1, 1},
		{0x0386, 0x0386, 1},
		{0x0388, 0x038A, 1},
		{0x038C, 0x038C, 1},
		{0x038E, 0x03A1, 1},
		{0x03A3, 0x03CE, 1},
		{0x03D0, 0x03D6, 1},
		{0x03DA, 0x03E0, 2},
		{0x03E2, 0x03F3, 1},
		{0x0401, 0x040C, 1},
		{0x040E, 0x044F, 1},
		{0x0451, 0x045C, 1},
		{0x045E, 0x0481, 1},
		{0x0490, 0x04C4, 1},
		{0x04C7, 0x04C8, 1},
		{0x04CB, 0x04CC, 1},
		{0x04D0, 0x04EB, 1},
		{0x04EE, 0x04F5, 1},
		{0x04F8, 0x04F9, 1},
		{0x0531, 0x0556, 1},
		{0x0559, 0x0559, 1},
		{0x0561, 0x0586, 1},
		{0x05D0, 0x05EA, 1},
		{0x05F0, 0x05F2, 1},
		{0x0621, 0x063A, 1},
		{0x0641, 0x064A, 1},
		{0x0671, 0x06B7, 1},
		{0x06BA, 0x06BE, 1},
		{0x06C0, 0x06CE, 1},
		{0x06D0, 0x06D3, 1},
		{0x06D5, 0x06D5, 1},
		{0x06E5, 0x06E6, 1},
		{0x0905, 0x0939, 1},
		{0x093D, 0x093D, 1},
		{0x0958, 0x0961, 1},
		{0x0985, 0x098C, 1},
		{0x098F, 0x0990, 1},
		{0x0993, 0x09A8, 1},
		{0x09AA, 0x09B0, 1},
		{0x09B2, 0x09B2, 1},
		{0x09B6, 0x09B9, 1},
		{0x09DC, 0x09DD, 1},
		{0x09DF, 0x09E1, 1},
		{0x09F0, 0x09F1, 1},
		{0x0A05, 0x0A0A, 1},
		{0x0A0F, 0x0A10, 1},
		{0x0A13, 0x0A28, 1},
		{0x0A2A, 0x0A30, 1},
		{0x0A32, 0x0A33, 1},
		{0x0A35, 0x0A36, 1},
		{0x0A38, 0x0A39, 1},
		{0x0A59, 0x0A5C, 1},
		{0x0A5E, 0x0A5E, 1},
		{0x0A72, 0x0A74, 1},
		{0x0A85, 0x0A8B, 1},
		{0x0A8D, 0x0A8D, 1},
		{0x0A8F, 0x0A91, 1},
		{0x0A93, 0x0AA8, 1},
		{0x0AAA, 0x0AB0, 1},
		{0x0AB2, 0x0AB3, 1},
		{0x0AB5, 0x0AB9, 1},
		{0x0ABD, 0x0AE0, 0x23},
		{0x0B05, 0x0B0C, 1},
		{0x0B0F, 0x0B10, 1},
		{0x0B13, 0x0B28, 1},
		{0x0B2A, 0x0B30, 1},
		{0x0B32, 0x0B33, 1},
		{0x0B36, 0x0B39, 1},
		{0x0B3D, 0x0B3D, 1},
		{0x0B5C, 0x0B5D, 1},
		{0x0B5F, 0x0B61, 1},
		{0x0B85, 0x0B8A, 1},
		{0x0B8E, 0x0B90, 1},
		{0x0B92, 0x0B95, 1},
		{0x0B99, 0x0B9A, 1},
		{0x0B9C, 0x0B9C, 1},
		{0x0B9E, 0x0B9F, 1},
		{0x0BA3, 0x0BA4, 1},
		{0x0BA8, 0x0BAA, 1},
		{0x0BAE, 0x0BB5, 1},
		{0x0BB7, 0x0BB9, 1},
		{0x0C05, 0x0C0C, 1},
		{0x0C0E, 0x0C10, 1},
		{0x0C12, 0x0C28, 1},
		{0x0C2A, 0x0C33, 1},
		{0x0C35, 0x0C39, 1},
		{0x0C60, 0x0C61, 1},
		{0x0C85, 0x0C8C, 1},
		{0x0C8E, 0x0C90, 1},
		{0x0C92, 0x0CA8, 1},
		{0x0CAA, 0x0CB3, 1},
		{0x0CB5, 0x0CB9, 1},
		{0x0CDE, 0x0CDE, 1},
		{0x0CE0, 0x0CE1, 1},
		{0x0D05, 0x0D0C, 1},
		{0x0D0E, 0x0D10, 1},
		{0x0D12, 0x0D28, 1},
		{0x0D2A, 0x0D39, 1},
		{0x0D60, 0x0D61, 1},
		{0x0E01, 0x0E2E, 1},
		{0x0E30, 0x0E30, 1},
		{0x0E32, 0x0E33, 1},
		{0x0E40, 0x0E45, 1},
		{0x0E81, 0x0E82, 1},
		{0x0E84, 0x0E84, 1},
		{0x0E87, 0x0E88, 1},
		{0x0E8A, 0x0E8D, 3},
		{0x0E94, 0x0E97, 1},
		{0x0E99, 0x0E9F, 1},
		{0x0EA1, 0x0EA3, 1},
		{0x0EA5, 0x0EA7, 2},
		{0x0EAA, 0x0EAB, 1},
		{0x0EAD, 0x0EAE, 1},
		{0x0EB0, 0x0EB0, 1},
		{0x0EB2, 0x0EB3, 1},
		{0x0EBD, 0x0EBD, 1},
		{0x0EC0, 0x0EC4, 1},
		{0x0F40, 0x0F47, 1},
		{0x0F49, 0x0F69, 1},
		{0x10A0, 0x10C5, 1},
		{0x10D0, 0x10F6, 1},
		{0x1100, 0x1100, 1},
		{0x1102, 0x1103, 1},
		{0x1105, 0x1107, 1},
		{0x1109, 0x1109, 1},
		{0x110B, 0x110C, 1},
		{0x110E, 0x1112, 1},
		{0x113C, 0x1140, 2},
		{0x114C, 0x1150, 2},
		{0x1154, 0x1155, 1},
		{0x1159, 0x1159, 1},
		{0x115F, 0x1161, 1},
		{0x1163, 0x1169, 2},
		{0x116D, 0x116E, 1},
		{0x1172, 0x1173, 1},
		{0x1175, 0x119E, 0x119E - 0x1175},
		{0x11A8, 0x11AB, 0x11AB - 0x11A8},
		{0x11AE, 0x11AF, 1},
		{0x11B7, 0x11B8, 1},
		{0x11BA, 0x11BA, 1},
		{0x11BC, 0x11C2, 1},
		{0x11EB, 0x11F0, 0x11F0 - 0x11EB},
		{0x11F9, 0x11F9, 1},
		{0x1E00, 0x1E9B, 1},
		{0x1EA0, 0x1EF9, 1},
		{0x1F00, 0x1F15, 1},
		{0x1F18, 0x1F1D, 1},
		{0x1F20, 0x1F45, 1},
		{0x1F48, 0x1F4D, 1},
		{0x1F50, 0x1F57, 1},
		{0x1F59, 0x1F5B, 0x1F5B - 0x1F59},
		{0x1F5D, 0x1F5D, 1},
		{0x1F5F, 0x1F7D, 1},
		{0x1F80, 0x1FB4, 1},
		{0x1FB6, 0x1FBC, 1},
		{0x1FBE, 0x1FBE, 1},
		{0x1FC2, 0x1FC4, 1},
		{0x1FC6, 0x1FCC, 1},
		{0x1FD0, 0x1FD3, 1},
		{0x1FD6, 0x1FDB, 1},
		{0x1FE0, 0x1FEC, 1},
		{0x1FF2, 0x1FF4, 1},
		{0x1FF6, 0x1FFC, 1},
		{0x2126, 0x2126, 1},
		{0x212A, 0x212B, 1},
		{0x212E, 0x212E, 1},
		{0x2180, 0x2182, 1},
		{0x3007, 0x3007, 1},
		{0x3021, 0x3029, 1},
		{0x3041, 0x3094, 1},
		{0x30A1, 0x30FA, 1},
		{0x3105, 0x312C, 1},
		{0x4E00, 0x9FA5, 1},
		{0xAC00, 0xD7A3, 1},
	},
}
  1232	
// second is the set of additional runes accepted in an XML name after
// its first character (for example '-', '.', and the ASCII digits).
// A non-initial character is valid if it is in either first or second.
// Every entry is {Lo, Hi, Stride}. An entry whose stride is not 1 stands
// for isolated code points rather than a contiguous range; for example
// {0x0A02, 0x0A3C, 0x3A} contains only 0x0A02 and 0x0A3C.
var second = &unicode.RangeTable{
	R16: []unicode.Range16{
		{0x002D, 0x002E, 1},
		{0x0030, 0x0039, 1},
		{0x00B7, 0x00B7, 1},
		{0x02D0, 0x02D1, 1},
		{0x0300, 0x0345, 1},
		{0x0360, 0x0361, 1},
		{0x0387, 0x0387, 1},
		{0x0483, 0x0486, 1},
		{0x0591, 0x05A1, 1},
		{0x05A3, 0x05B9, 1},
		{0x05BB, 0x05BD, 1},
		{0x05BF, 0x05BF, 1},
		{0x05C1, 0x05C2, 1},
		{0x05C4, 0x0640, 0x0640 - 0x05C4},
		{0x064B, 0x0652, 1},
		{0x0660, 0x0669, 1},
		{0x0670, 0x0670, 1},
		{0x06D6, 0x06DC, 1},
		{0x06DD, 0x06DF, 1},
		{0x06E0, 0x06E4, 1},
		{0x06E7, 0x06E8, 1},
		{0x06EA, 0x06ED, 1},
		{0x06F0, 0x06F9, 1},
		{0x0901, 0x0903, 1},
		{0x093C, 0x093C, 1},
		{0x093E, 0x094C, 1},
		{0x094D, 0x094D, 1},
		{0x0951, 0x0954, 1},
		{0x0962, 0x0963, 1},
		{0x0966, 0x096F, 1},
		{0x0981, 0x0983, 1},
		{0x09BC, 0x09BC, 1},
		{0x09BE, 0x09BF, 1},
		{0x09C0, 0x09C4, 1},
		{0x09C7, 0x09C8, 1},
		{0x09CB, 0x09CD, 1},
		{0x09D7, 0x09D7, 1},
		{0x09E2, 0x09E3, 1},
		{0x09E6, 0x09EF, 1},
		{0x0A02, 0x0A3C, 0x3A},
		{0x0A3E, 0x0A3F, 1},
		{0x0A40, 0x0A42, 1},
		{0x0A47, 0x0A48, 1},
		{0x0A4B, 0x0A4D, 1},
		{0x0A66, 0x0A6F, 1},
		{0x0A70, 0x0A71, 1},
		{0x0A81, 0x0A83, 1},
		{0x0ABC, 0x0ABC, 1},
		{0x0ABE, 0x0AC5, 1},
		{0x0AC7, 0x0AC9, 1},
		{0x0ACB, 0x0ACD, 1},
		{0x0AE6, 0x0AEF, 1},
		{0x0B01, 0x0B03, 1},
		{0x0B3C, 0x0B3C, 1},
		{0x0B3E, 0x0B43, 1},
		{0x0B47, 0x0B48, 1},
		{0x0B4B, 0x0B4D, 1},
		{0x0B56, 0x0B57, 1},
		{0x0B66, 0x0B6F, 1},
		{0x0B82, 0x0B83, 1},
		{0x0BBE, 0x0BC2, 1},
		{0x0BC6, 0x0BC8, 1},
		{0x0BCA, 0x0BCD, 1},
		{0x0BD7, 0x0BD7, 1},
		{0x0BE7, 0x0BEF, 1},
		{0x0C01, 0x0C03, 1},
		{0x0C3E, 0x0C44, 1},
		{0x0C46, 0x0C48, 1},
		{0x0C4A, 0x0C4D, 1},
		{0x0C55, 0x0C56, 1},
		{0x0C66, 0x0C6F, 1},
		{0x0C82, 0x0C83, 1},
		{0x0CBE, 0x0CC4, 1},
		{0x0CC6, 0x0CC8, 1},
		{0x0CCA, 0x0CCD, 1},
		{0x0CD5, 0x0CD6, 1},
		{0x0CE6, 0x0CEF, 1},
		{0x0D02, 0x0D03, 1},
		{0x0D3E, 0x0D43, 1},
		{0x0D46, 0x0D48, 1},
		{0x0D4A, 0x0D4D, 1},
		{0x0D57, 0x0D57, 1},
		{0x0D66, 0x0D6F, 1},
		{0x0E31, 0x0E31, 1},
		{0x0E34, 0x0E3A, 1},
		{0x0E46, 0x0E46, 1},
		{0x0E47, 0x0E4E, 1},
		{0x0E50, 0x0E59, 1},
		{0x0EB1, 0x0EB1, 1},
		{0x0EB4, 0x0EB9, 1},
		{0x0EBB, 0x0EBC, 1},
		{0x0EC6, 0x0EC6, 1},
		{0x0EC8, 0x0ECD, 1},
		{0x0ED0, 0x0ED9, 1},
		{0x0F18, 0x0F19, 1},
		{0x0F20, 0x0F29, 1},
		{0x0F35, 0x0F39, 2},
		{0x0F3E, 0x0F3F, 1},
		{0x0F71, 0x0F84, 1},
		{0x0F86, 0x0F8B, 1},
		{0x0F90, 0x0F95, 1},
		{0x0F97, 0x0F97, 1},
		{0x0F99, 0x0FAD, 1},
		{0x0FB1, 0x0FB7, 1},
		{0x0FB9, 0x0FB9, 1},
		{0x20D0, 0x20DC, 1},
		{0x20E1, 0x3005, 0x3005 - 0x20E1},
		{0x302A, 0x302F, 1},
		{0x3031, 0x3035, 1},
		{0x3099, 0x309A, 1},
		{0x309D, 0x309E, 1},
		{0x30FC, 0x30FE, 1},
	},
}
  1349	
// HTMLEntity is an entity map containing translations for the
// standard HTML entity characters.
var HTMLEntity = htmlEntity

// htmlEntity maps an HTML entity name, written without the leading '&'
// and the trailing ';', to the text that the entity stands for.
// The comment at the top of the literal records the shell pipeline that
// presumably generated most entries from the HTML 4 entity list.
var htmlEntity = map[string]string{
	/*
		hget http://www.w3.org/TR/html4/sgml/entities.html |
		ssam '
			,y /\&gt;/ x/\&lt;(.|\n)+/ s/\n/ /g
			,x v/^\&lt;!ENTITY/d
			,s/\&lt;!ENTITY ([^ ]+) .*U\+([0-9A-F][0-9A-F][0-9A-F][0-9A-F]) .+/	"\1": "\\u\2",/g
		'
	*/
	"nbsp":     "\u00A0",
	"iexcl":    "\u00A1",
	"cent":     "\u00A2",
	"pound":    "\u00A3",
	"curren":   "\u00A4",
	"yen":      "\u00A5",
	"brvbar":   "\u00A6",
	"sect":     "\u00A7",
	"uml":      "\u00A8",
	"copy":     "\u00A9",
	"ordf":     "\u00AA",
	"laquo":    "\u00AB",
	"not":      "\u00AC",
	"shy":      "\u00AD",
	"reg":      "\u00AE",
	"macr":     "\u00AF",
	"deg":      "\u00B0",
	"plusmn":   "\u00B1",
	"sup2":     "\u00B2",
	"sup3":     "\u00B3",
	"acute":    "\u00B4",
	"micro":    "\u00B5",
	"para":     "\u00B6",
	"middot":   "\u00B7",
	"cedil":    "\u00B8",
	"sup1":     "\u00B9",
	"ordm":     "\u00BA",
	"raquo":    "\u00BB",
	"frac14":   "\u00BC",
	"frac12":   "\u00BD",
	"frac34":   "\u00BE",
	"iquest":   "\u00BF",
	"Agrave":   "\u00C0",
	"Aacute":   "\u00C1",
	"Acirc":    "\u00C2",
	"Atilde":   "\u00C3",
	"Auml":     "\u00C4",
	"Aring":    "\u00C5",
	"AElig":    "\u00C6",
	"Ccedil":   "\u00C7",
	"Egrave":   "\u00C8",
	"Eacute":   "\u00C9",
	"Ecirc":    "\u00CA",
	"Euml":     "\u00CB",
	"Igrave":   "\u00CC",
	"Iacute":   "\u00CD",
	"Icirc":    "\u00CE",
	"Iuml":     "\u00CF",
	"ETH":      "\u00D0",
	"Ntilde":   "\u00D1",
	"Ograve":   "\u00D2",
	"Oacute":   "\u00D3",
	"Ocirc":    "\u00D4",
	"Otilde":   "\u00D5",
	"Ouml":     "\u00D6",
	"times":    "\u00D7",
	"Oslash":   "\u00D8",
	"Ugrave":   "\u00D9",
	"Uacute":   "\u00DA",
	"Ucirc":    "\u00DB",
	"Uuml":     "\u00DC",
	"Yacute":   "\u00DD",
	"THORN":    "\u00DE",
	"szlig":    "\u00DF",
	"agrave":   "\u00E0",
	"aacute":   "\u00E1",
	"acirc":    "\u00E2",
	"atilde":   "\u00E3",
	"auml":     "\u00E4",
	"aring":    "\u00E5",
	"aelig":    "\u00E6",
	"ccedil":   "\u00E7",
	"egrave":   "\u00E8",
	"eacute":   "\u00E9",
	"ecirc":    "\u00EA",
	"euml":     "\u00EB",
	"igrave":   "\u00EC",
	"iacute":   "\u00ED",
	"icirc":    "\u00EE",
	"iuml":     "\u00EF",
	"eth":      "\u00F0",
	"ntilde":   "\u00F1",
	"ograve":   "\u00F2",
	"oacute":   "\u00F3",
	"ocirc":    "\u00F4",
	"otilde":   "\u00F5",
	"ouml":     "\u00F6",
	"divide":   "\u00F7",
	"oslash":   "\u00F8",
	"ugrave":   "\u00F9",
	"uacute":   "\u00FA",
	"ucirc":    "\u00FB",
	"uuml":     "\u00FC",
	"yacute":   "\u00FD",
	"thorn":    "\u00FE",
	"yuml":     "\u00FF",
	"fnof":     "\u0192",
	"Alpha":    "\u0391",
	"Beta":     "\u0392",
	"Gamma":    "\u0393",
	"Delta":    "\u0394",
	"Epsilon":  "\u0395",
	"Zeta":     "\u0396",
	"Eta":      "\u0397",
	"Theta":    "\u0398",
	"Iota":     "\u0399",
	"Kappa":    "\u039A",
	"Lambda":   "\u039B",
	"Mu":       "\u039C",
	"Nu":       "\u039D",
	"Xi":       "\u039E",
	"Omicron":  "\u039F",
	"Pi":       "\u03A0",
	"Rho":      "\u03A1",
	"Sigma":    "\u03A3",
	"Tau":      "\u03A4",
	"Upsilon":  "\u03A5",
	"Phi":      "\u03A6",
	"Chi":      "\u03A7",
	"Psi":      "\u03A8",
	"Omega":    "\u03A9",
	"alpha":    "\u03B1",
	"beta":     "\u03B2",
	"gamma":    "\u03B3",
	"delta":    "\u03B4",
	"epsilon":  "\u03B5",
	"zeta":     "\u03B6",
	"eta":      "\u03B7",
	"theta":    "\u03B8",
	"iota":     "\u03B9",
	"kappa":    "\u03BA",
	"lambda":   "\u03BB",
	"mu":       "\u03BC",
	"nu":       "\u03BD",
	"xi":       "\u03BE",
	"omicron":  "\u03BF",
	"pi":       "\u03C0",
	"rho":      "\u03C1",
	"sigmaf":   "\u03C2",
	"sigma":    "\u03C3",
	"tau":      "\u03C4",
	"upsilon":  "\u03C5",
	"phi":      "\u03C6",
	"chi":      "\u03C7",
	"psi":      "\u03C8",
	"omega":    "\u03C9",
	"thetasym": "\u03D1",
	"upsih":    "\u03D2",
	"piv":      "\u03D6",
	"bull":     "\u2022",
	"hellip":   "\u2026",
	"prime":    "\u2032",
	"Prime":    "\u2033",
	"oline":    "\u203E",
	"frasl":    "\u2044",
	"weierp":   "\u2118",
	"image":    "\u2111",
	"real":     "\u211C",
	"trade":    "\u2122",
	"alefsym":  "\u2135",
	"larr":     "\u2190",
	"uarr":     "\u2191",
	"rarr":     "\u2192",
	"darr":     "\u2193",
	"harr":     "\u2194",
	"crarr":    "\u21B5",
	"lArr":     "\u21D0",
	"uArr":     "\u21D1",
	"rArr":     "\u21D2",
	"dArr":     "\u21D3",
	"hArr":     "\u21D4",
	"forall":   "\u2200",
	"part":     "\u2202",
	"exist":    "\u2203",
	"empty":    "\u2205",
	"nabla":    "\u2207",
	"isin":     "\u2208",
	"notin":    "\u2209",
	"ni":       "\u220B",
	"prod":     "\u220F",
	"sum":      "\u2211",
	"minus":    "\u2212",
	"lowast":   "\u2217",
	"radic":    "\u221A",
	"prop":     "\u221D",
	"infin":    "\u221E",
	"ang":      "\u2220",
	"and":      "\u2227",
	"or":       "\u2228",
	"cap":      "\u2229",
	"cup":      "\u222A",
	"int":      "\u222B",
	"there4":   "\u2234",
	"sim":      "\u223C",
	"cong":     "\u2245",
	"asymp":    "\u2248",
	"ne":       "\u2260",
	"equiv":    "\u2261",
	"le":       "\u2264",
	"ge":       "\u2265",
	"sub":      "\u2282",
	"sup":      "\u2283",
	"nsub":     "\u2284",
	"sube":     "\u2286",
	"supe":     "\u2287",
	"oplus":    "\u2295",
	"otimes":   "\u2297",
	"perp":     "\u22A5",
	"sdot":     "\u22C5",
	"lceil":    "\u2308",
	"rceil":    "\u2309",
	"lfloor":   "\u230A",
	"rfloor":   "\u230B",
	"lang":     "\u2329",
	"rang":     "\u232A",
	"loz":      "\u25CA",
	"spades":   "\u2660",
	"clubs":    "\u2663",
	"hearts":   "\u2665",
	"diams":    "\u2666",
	"quot":     "\u0022",
	"amp":      "\u0026",
	"lt":       "\u003C",
	"gt":       "\u003E",
	"OElig":    "\u0152",
	"oelig":    "\u0153",
	"Scaron":   "\u0160",
	"scaron":   "\u0161",
	"Yuml":     "\u0178",
	"circ":     "\u02C6",
	"tilde":    "\u02DC",
	"ensp":     "\u2002",
	"emsp":     "\u2003",
	"thinsp":   "\u2009",
	"zwnj":     "\u200C",
	"zwj":      "\u200D",
	"lrm":      "\u200E",
	"rlm":      "\u200F",
	"ndash":    "\u2013",
	"mdash":    "\u2014",
	"lsquo":    "\u2018",
	"rsquo":    "\u2019",
	"sbquo":    "\u201A",
	"ldquo":    "\u201C",
	"rdquo":    "\u201D",
	"bdquo":    "\u201E",
	"dagger":   "\u2020",
	"Dagger":   "\u2021",
	"permil":   "\u2030",
	"lsaquo":   "\u2039",
	"rsaquo":   "\u203A",
	"euro":     "\u20AC",
}
  1616	
  1617	// HTMLAutoClose is the set of HTML elements that
  1618	// should be considered to close automatically.
  1619	var HTMLAutoClose = htmlAutoClose
  1620	
  1621	var htmlAutoClose = []string{
  1622		/*
  1623			hget http://www.w3.org/TR/html4/loose.dtd |
  1624			9 sed -n 's/<!ELEMENT (.*) - O EMPTY.+/	"\1",/p' | tr A-Z a-z
  1625		*/
  1626		"basefont",
  1627		"br",
  1628		"area",
  1629		"link",
  1630		"img",
  1631		"param",
  1632		"hr",
  1633		"input",
  1634		"col     ",
  1635		"frame",
  1636		"isindex",
  1637		"base",
  1638		"meta",
  1639	}
  1640	
  1641	var (
  1642		esc_quot = []byte("&#34;") // shorter than "&quot;"
  1643		esc_apos = []byte("&#39;") // shorter than "&apos;"
  1644		esc_amp  = []byte("&amp;")
  1645		esc_lt   = []byte("&lt;")
  1646		esc_gt   = []byte("&gt;")
  1647	)
  1648	
  1649	// Escape writes to w the properly escaped XML equivalent
  1650	// of the plain text data s.
  1651	func Escape(w io.Writer, s []byte) {
  1652		var esc []byte
  1653		last := 0
  1654		for i, c := range s {
  1655			switch c {
  1656			case '"':
  1657				esc = esc_quot
  1658			case '\'':
  1659				esc = esc_apos
  1660			case '&':
  1661				esc = esc_amp
  1662			case '<':
  1663				esc = esc_lt
  1664			case '>':
  1665				esc = esc_gt
  1666			default:
  1667				continue
  1668			}
  1669			w.Write(s[last:i])
  1670			w.Write(esc)
  1671			last = i + 1
  1672		}
  1673		w.Write(s[last:])
  1674	}
  1675	
  1676	// procInstEncoding parses the `encoding="..."` or `encoding='...'`
  1677	// value out of the provided string, returning "" if not found.
  1678	func procInstEncoding(s string) string {
  1679		// TODO: this parsing is somewhat lame and not exact.
  1680		// It works for all actual cases, though.
  1681		idx := strings.Index(s, "encoding=")
  1682		if idx == -1 {
  1683			return ""
  1684		}
  1685		v := s[idx+len("encoding="):]
  1686		if v == "" {
  1687			return ""
  1688		}
  1689		if v[0] != '\'' && v[0] != '"' {
  1690			return ""
  1691		}
  1692		idx = strings.IndexRune(v[1:], rune(v[0]))
  1693		if idx == -1 {
  1694			return ""
  1695		}
  1696		return v[1 : idx+1]
  1697	}