Source file src/pkg/encoding/xml/read.go
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 package xml
6
7 import (
8 "bytes"
9 "errors"
10 "reflect"
11 "strconv"
12 "strings"
13 "time"
14 )
15
16 // BUG(rsc): Mapping between XML elements and data structures is inherently flawed:
17 // an XML element is an order-dependent collection of anonymous
18 // values, while a data structure is an order-independent collection
19 // of named values.
20 // See package json for a textual representation more suitable
21 // to data structures.
22
23 // Unmarshal parses the XML-encoded data and stores the result in
24 // the value pointed to by v, which must be an arbitrary struct,
25 // slice, or string. Well-formed data that does not fit into v is
26 // discarded.
27 //
28 // Because Unmarshal uses the reflect package, it can only assign
29 // to exported (upper case) fields. Unmarshal uses a case-sensitive
30 // comparison to match XML element names to tag values and struct
31 // field names.
32 //
33 // Unmarshal maps an XML element to a struct using the following rules.
34 // In the rules, the tag of a field refers to the value associated with the
35 // key 'xml' in the struct field's tag (see the example above).
36 //
37 // * If the struct has a field of type []byte or string with tag
38 // ",innerxml", Unmarshal accumulates the raw XML nested inside the
39 // element in that field. The rest of the rules still apply.
40 //
41 // * If the struct has a field named XMLName of type xml.Name,
42 // Unmarshal records the element name in that field.
43 //
44 // * If the XMLName field has an associated tag of the form
45 // "name" or "namespace-URL name", the XML element must have
46 // the given name (and, optionally, name space) or else Unmarshal
47 // returns an error.
48 //
49 // * If the XML element has an attribute whose name matches a
50 // struct field name with an associated tag containing ",attr" or
51 // the explicit name in a struct field tag of the form "name,attr",
52 // Unmarshal records the attribute value in that field.
53 //
54 // * If the XML element contains character data, that data is
55 // accumulated in the first struct field that has tag "chardata".
56 // The struct field may have type []byte or string.
57 // If there is no such field, the character data is discarded.
58 //
59 // * If the XML element contains comments, they are accumulated in
60 // the first struct field that has tag ",comments". The struct
61 // field may have type []byte or string. If there is no such
62 // field, the comments are discarded.
63 //
64 // * If the XML element contains a sub-element whose name matches
65 // the prefix of a tag formatted as "a" or "a>b>c", unmarshal
66 // will descend into the XML structure looking for elements with the
67 // given names, and will map the innermost elements to that struct
68 // field. A tag starting with ">" is equivalent to one starting
69 // with the field name followed by ">".
70 //
71 // * If the XML element contains a sub-element whose name matches
72 // a struct field's XMLName tag and the struct field has no
73 // explicit name tag as per the previous rule, unmarshal maps
74 // the sub-element to that struct field.
75 //
76 // * If the XML element contains a sub-element whose name matches a
77 // field without any mode flags (",attr", ",chardata", etc), Unmarshal
78 // maps the sub-element to that struct field.
79 //
80 // * If the XML element contains a sub-element that hasn't matched any
81 // of the above rules and the struct has a field with tag ",any",
82 // unmarshal maps the sub-element to that struct field.
83 //
84 // * A non-pointer anonymous struct field is handled as if the
85 // fields of its value were part of the outer struct.
86 //
87 // * A struct field with tag "-" is never unmarshalled into.
88 //
89 // Unmarshal maps an XML element to a string or []byte by saving the
90 // concatenation of that element's character data in the string or
91 // []byte. The saved []byte is never nil.
92 //
93 // Unmarshal maps an attribute value to a string or []byte by saving
94 // the value in the string or slice.
95 //
96 // Unmarshal maps an XML element to a slice by extending the length of
97 // the slice and mapping the element to the newly created value.
98 //
99 // Unmarshal maps an XML element or attribute value to a bool by
100 // setting it to the boolean value represented by the string.
101 //
102 // Unmarshal maps an XML element or attribute value to an integer or
103 // floating-point field by setting the field to the result of
104 // interpreting the string value in decimal. There is no check for
105 // overflow.
106 //
107 // Unmarshal maps an XML element to an xml.Name by recording the
108 // element name.
109 //
110 // Unmarshal maps an XML element to a pointer by setting the pointer
111 // to a freshly allocated value and then mapping the element to that value.
112 //
113 func Unmarshal(data []byte, v interface{}) error {
114 return NewDecoder(bytes.NewBuffer(data)).Decode(v)
115 }
116
117 // Decode works like xml.Unmarshal, except it reads the decoder
118 // stream to find the start element.
119 func (d *Decoder) Decode(v interface{}) error {
120 return d.DecodeElement(v, nil)
121 }
122
123 // DecodeElement works like xml.Unmarshal except that it takes
124 // a pointer to the start XML element to decode into v.
125 // It is useful when a client reads some raw XML tokens itself
126 // but also wants to defer to Unmarshal for some elements.
127 func (d *Decoder) DecodeElement(v interface{}, start *StartElement) error {
128 val := reflect.ValueOf(v)
129 if val.Kind() != reflect.Ptr {
130 return errors.New("non-pointer passed to Unmarshal")
131 }
132 return d.unmarshal(val.Elem(), start)
133 }
134
// UnmarshalError is the error type used to report a problem found
// while unmarshalling; its string value is the error message.
type UnmarshalError string

// Error implements the error interface by returning the message text.
func (e UnmarshalError) Error() string {
	msg := string(e)
	return msg
}
139
140 // Unmarshal a single XML element into val.
141 func (p *Decoder) unmarshal(val reflect.Value, start *StartElement) error {
142 // Find start element if we need it.
143 if start == nil {
144 for {
145 tok, err := p.Token()
146 if err != nil {
147 return err
148 }
149 if t, ok := tok.(StartElement); ok {
150 start = &t
151 break
152 }
153 }
154 }
155
156 if pv := val; pv.Kind() == reflect.Ptr {
157 if pv.IsNil() {
158 pv.Set(reflect.New(pv.Type().Elem()))
159 }
160 val = pv.Elem()
161 }
162
163 var (
164 data []byte
165 saveData reflect.Value
166 comment []byte
167 saveComment reflect.Value
168 saveXML reflect.Value
169 saveXMLIndex int
170 saveXMLData []byte
171 saveAny reflect.Value
172 sv reflect.Value
173 tinfo *typeInfo
174 err error
175 )
176
177 switch v := val; v.Kind() {
178 default:
179 return errors.New("unknown type " + v.Type().String())
180
181 case reflect.Interface:
182 // TODO: For now, simply ignore the field. In the near
183 // future we may choose to unmarshal the start
184 // element on it, if not nil.
185 return p.Skip()
186
187 case reflect.Slice:
188 typ := v.Type()
189 if typ.Elem().Kind() == reflect.Uint8 {
190 // []byte
191 saveData = v
192 break
193 }
194
195 // Slice of element values.
196 // Grow slice.
197 n := v.Len()
198 if n >= v.Cap() {
199 ncap := 2 * n
200 if ncap < 4 {
201 ncap = 4
202 }
203 new := reflect.MakeSlice(typ, n, ncap)
204 reflect.Copy(new, v)
205 v.Set(new)
206 }
207 v.SetLen(n + 1)
208
209 // Recur to read element into slice.
210 if err := p.unmarshal(v.Index(n), start); err != nil {
211 v.SetLen(n)
212 return err
213 }
214 return nil
215
216 case reflect.Bool, reflect.Float32, reflect.Float64, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.String:
217 saveData = v
218
219 case reflect.Struct:
220 typ := v.Type()
221 if typ == nameType {
222 v.Set(reflect.ValueOf(start.Name))
223 break
224 }
225 if typ == timeType {
226 saveData = v
227 break
228 }
229
230 sv = v
231 tinfo, err = getTypeInfo(typ)
232 if err != nil {
233 return err
234 }
235
236 // Validate and assign element name.
237 if tinfo.xmlname != nil {
238 finfo := tinfo.xmlname
239 if finfo.name != "" && finfo.name != start.Name.Local {
240 return UnmarshalError("expected element type <" + finfo.name + "> but have <" + start.Name.Local + ">")
241 }
242 if finfo.xmlns != "" && finfo.xmlns != start.Name.Space {
243 e := "expected element <" + finfo.name + "> in name space " + finfo.xmlns + " but have "
244 if start.Name.Space == "" {
245 e += "no name space"
246 } else {
247 e += start.Name.Space
248 }
249 return UnmarshalError(e)
250 }
251 fv := sv.FieldByIndex(finfo.idx)
252 if _, ok := fv.Interface().(Name); ok {
253 fv.Set(reflect.ValueOf(start.Name))
254 }
255 }
256
257 // Assign attributes.
258 // Also, determine whether we need to save character data or comments.
259 for i := range tinfo.fields {
260 finfo := &tinfo.fields[i]
261 switch finfo.flags & fMode {
262 case fAttr:
263 strv := sv.FieldByIndex(finfo.idx)
264 // Look for attribute.
265 for _, a := range start.Attr {
266 if a.Name.Local == finfo.name {
267 copyValue(strv, []byte(a.Value))
268 break
269 }
270 }
271
272 case fCharData:
273 if !saveData.IsValid() {
274 saveData = sv.FieldByIndex(finfo.idx)
275 }
276
277 case fComment:
278 if !saveComment.IsValid() {
279 saveComment = sv.FieldByIndex(finfo.idx)
280 }
281
282 case fAny:
283 if !saveAny.IsValid() {
284 saveAny = sv.FieldByIndex(finfo.idx)
285 }
286
287 case fInnerXml:
288 if !saveXML.IsValid() {
289 saveXML = sv.FieldByIndex(finfo.idx)
290 if p.saved == nil {
291 saveXMLIndex = 0
292 p.saved = new(bytes.Buffer)
293 } else {
294 saveXMLIndex = p.savedOffset()
295 }
296 }
297 }
298 }
299 }
300
301 // Find end element.
302 // Process sub-elements along the way.
303 Loop:
304 for {
305 var savedOffset int
306 if saveXML.IsValid() {
307 savedOffset = p.savedOffset()
308 }
309 tok, err := p.Token()
310 if err != nil {
311 return err
312 }
313 switch t := tok.(type) {
314 case StartElement:
315 consumed := false
316 if sv.IsValid() {
317 consumed, err = p.unmarshalPath(tinfo, sv, nil, &t)
318 if err != nil {
319 return err
320 }
321 if !consumed && saveAny.IsValid() {
322 consumed = true
323 if err := p.unmarshal(saveAny, &t); err != nil {
324 return err
325 }
326 }
327 }
328 if !consumed {
329 if err := p.Skip(); err != nil {
330 return err
331 }
332 }
333
334 case EndElement:
335 if saveXML.IsValid() {
336 saveXMLData = p.saved.Bytes()[saveXMLIndex:savedOffset]
337 if saveXMLIndex == 0 {
338 p.saved = nil
339 }
340 }
341 break Loop
342
343 case CharData:
344 if saveData.IsValid() {
345 data = append(data, t...)
346 }
347
348 case Comment:
349 if saveComment.IsValid() {
350 comment = append(comment, t...)
351 }
352 }
353 }
354
355 if err := copyValue(saveData, data); err != nil {
356 return err
357 }
358
359 switch t := saveComment; t.Kind() {
360 case reflect.String:
361 t.SetString(string(comment))
362 case reflect.Slice:
363 t.Set(reflect.ValueOf(comment))
364 }
365
366 switch t := saveXML; t.Kind() {
367 case reflect.String:
368 t.SetString(string(saveXMLData))
369 case reflect.Slice:
370 t.Set(reflect.ValueOf(saveXMLData))
371 }
372
373 return nil
374 }
375
376 func copyValue(dst reflect.Value, src []byte) (err error) {
377 // Helper functions for integer and unsigned integer conversions
378 var itmp int64
379 getInt64 := func() bool {
380 itmp, err = strconv.ParseInt(string(src), 10, 64)
381 // TODO: should check sizes
382 return err == nil
383 }
384 var utmp uint64
385 getUint64 := func() bool {
386 utmp, err = strconv.ParseUint(string(src), 10, 64)
387 // TODO: check for overflow?
388 return err == nil
389 }
390 var ftmp float64
391 getFloat64 := func() bool {
392 ftmp, err = strconv.ParseFloat(string(src), 64)
393 // TODO: check for overflow?
394 return err == nil
395 }
396
397 // Save accumulated data.
398 switch t := dst; t.Kind() {
399 case reflect.Invalid:
400 // Probably a comment.
401 default:
402 return errors.New("cannot happen: unknown type " + t.Type().String())
403 case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
404 if !getInt64() {
405 return err
406 }
407 t.SetInt(itmp)
408 case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
409 if !getUint64() {
410 return err
411 }
412 t.SetUint(utmp)
413 case reflect.Float32, reflect.Float64:
414 if !getFloat64() {
415 return err
416 }
417 t.SetFloat(ftmp)
418 case reflect.Bool:
419 value, err := strconv.ParseBool(strings.TrimSpace(string(src)))
420 if err != nil {
421 return err
422 }
423 t.SetBool(value)
424 case reflect.String:
425 t.SetString(string(src))
426 case reflect.Slice:
427 if len(src) == 0 {
428 // non-nil to flag presence
429 src = []byte{}
430 }
431 t.SetBytes(src)
432 case reflect.Struct:
433 if t.Type() == timeType {
434 tv, err := time.Parse(time.RFC3339, string(src))
435 if err != nil {
436 return err
437 }
438 t.Set(reflect.ValueOf(tv))
439 }
440 }
441 return nil
442 }
443
444 // unmarshalPath walks down an XML structure looking for wanted
445 // paths, and calls unmarshal on them.
446 // The consumed result tells whether XML elements have been consumed
447 // from the Decoder until start's matching end element, or if it's
448 // still untouched because start is uninteresting for sv's fields.
449 func (p *Decoder) unmarshalPath(tinfo *typeInfo, sv reflect.Value, parents []string, start *StartElement) (consumed bool, err error) {
450 recurse := false
451 Loop:
452 for i := range tinfo.fields {
453 finfo := &tinfo.fields[i]
454 if finfo.flags&fElement == 0 || len(finfo.parents) < len(parents) {
455 continue
456 }
457 for j := range parents {
458 if parents[j] != finfo.parents[j] {
459 continue Loop
460 }
461 }
462 if len(finfo.parents) == len(parents) && finfo.name == start.Name.Local {
463 // It's a perfect match, unmarshal the field.
464 return true, p.unmarshal(sv.FieldByIndex(finfo.idx), start)
465 }
466 if len(finfo.parents) > len(parents) && finfo.parents[len(parents)] == start.Name.Local {
467 // It's a prefix for the field. Break and recurse
468 // since it's not ok for one field path to be itself
469 // the prefix for another field path.
470 recurse = true
471
472 // We can reuse the same slice as long as we
473 // don't try to append to it.
474 parents = finfo.parents[:len(parents)+1]
475 break
476 }
477 }
478 if !recurse {
479 // We have no business with this element.
480 return false, nil
481 }
482 // The element is not a perfect match for any field, but one
483 // or more fields have the path to this element as a parent
484 // prefix. Recurse and attempt to match these.
485 for {
486 var tok Token
487 tok, err = p.Token()
488 if err != nil {
489 return true, err
490 }
491 switch t := tok.(type) {
492 case StartElement:
493 consumed2, err := p.unmarshalPath(tinfo, sv, parents, &t)
494 if err != nil {
495 return true, err
496 }
497 if !consumed2 {
498 if err := p.Skip(); err != nil {
499 return true, err
500 }
501 }
502 case EndElement:
503 return true, nil
504 }
505 }
506 panic("unreachable")
507 }
508
509 // Skip reads tokens until it has consumed the end element
510 // matching the most recent start element already consumed.
511 // It recurs if it encounters a start element, so it can be used to
512 // skip nested structures.
513 // It returns nil if it finds an end element matching the start
514 // element; otherwise it returns an error describing the problem.
515 func (d *Decoder) Skip() error {
516 for {
517 tok, err := d.Token()
518 if err != nil {
519 return err
520 }
521 switch tok.(type) {
522 case StartElement:
523 if err := d.Skip(); err != nil {
524 return err
525 }
526 case EndElement:
527 return nil
528 }
529 }
530 panic("unreachable")
531 }