Source file src/pkg/encoding/xml/read.go
1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package xml 6 7 import ( 8 "bytes" 9 "errors" 10 "reflect" 11 "strconv" 12 "strings" 13 "time" 14 ) 15 16 // BUG(rsc): Mapping between XML elements and data structures is inherently flawed: 17 // an XML element is an order-dependent collection of anonymous 18 // values, while a data structure is an order-independent collection 19 // of named values. 20 // See package json for a textual representation more suitable 21 // to data structures. 22 23 // Unmarshal parses the XML-encoded data and stores the result in 24 // the value pointed to by v, which must be an arbitrary struct, 25 // slice, or string. Well-formed data that does not fit into v is 26 // discarded. 27 // 28 // Because Unmarshal uses the reflect package, it can only assign 29 // to exported (upper case) fields. Unmarshal uses a case-sensitive 30 // comparison to match XML element names to tag values and struct 31 // field names. 32 // 33 // Unmarshal maps an XML element to a struct using the following rules. 34 // In the rules, the tag of a field refers to the value associated with the 35 // key 'xml' in the struct field's tag (see the example above). 36 // 37 // * If the struct has a field of type []byte or string with tag 38 // ",innerxml", Unmarshal accumulates the raw XML nested inside the 39 // element in that field. The rest of the rules still apply. 40 // 41 // * If the struct has a field named XMLName of type xml.Name, 42 // Unmarshal records the element name in that field. 43 // 44 // * If the XMLName field has an associated tag of the form 45 // "name" or "namespace-URL name", the XML element must have 46 // the given name (and, optionally, name space) or else Unmarshal 47 // returns an error. 48 // 49 // * If the XML element has an attribute whose name matches a 50 // struct field name with an associated tag containing ",attr" or 51 // the explicit name in a struct field tag of the form "name,attr", 52 // Unmarshal records the attribute value in that field. 53 // 54 // * If the XML element contains character data, that data is 55 // accumulated in the first struct field that has tag "chardata". 56 // The struct field may have type []byte or string. 57 // If there is no such field, the character data is discarded. 58 // 59 // * If the XML element contains comments, they are accumulated in 60 // the first struct field that has tag ",comments". The struct 61 // field may have type []byte or string. If there is no such 62 // field, the comments are discarded. 63 // 64 // * If the XML element contains a sub-element whose name matches 65 // the prefix of a tag formatted as "a" or "a>b>c", unmarshal 66 // will descend into the XML structure looking for elements with the 67 // given names, and will map the innermost elements to that struct 68 // field. A tag starting with ">" is equivalent to one starting 69 // with the field name followed by ">". 70 // 71 // * If the XML element contains a sub-element whose name matches 72 // a struct field's XMLName tag and the struct field has no 73 // explicit name tag as per the previous rule, unmarshal maps 74 // the sub-element to that struct field. 75 // 76 // * If the XML element contains a sub-element whose name matches a 77 // field without any mode flags (",attr", ",chardata", etc), Unmarshal 78 // maps the sub-element to that struct field. 79 // 80 // * If the XML element contains a sub-element that hasn't matched any 81 // of the above rules and the struct has a field with tag ",any", 82 // unmarshal maps the sub-element to that struct field. 83 // 84 // * A non-pointer anonymous struct field is handled as if the 85 // fields of its value were part of the outer struct. 86 // 87 // * A struct field with tag "-" is never unmarshalled into. 88 // 89 // Unmarshal maps an XML element to a string or []byte by saving the 90 // concatenation of that element's character data in the string or 91 // []byte. The saved []byte is never nil. 92 // 93 // Unmarshal maps an attribute value to a string or []byte by saving 94 // the value in the string or slice. 95 // 96 // Unmarshal maps an XML element to a slice by extending the length of 97 // the slice and mapping the element to the newly created value. 98 // 99 // Unmarshal maps an XML element or attribute value to a bool by 100 // setting it to the boolean value represented by the string. 101 // 102 // Unmarshal maps an XML element or attribute value to an integer or 103 // floating-point field by setting the field to the result of 104 // interpreting the string value in decimal. There is no check for 105 // overflow. 106 // 107 // Unmarshal maps an XML element to an xml.Name by recording the 108 // element name. 109 // 110 // Unmarshal maps an XML element to a pointer by setting the pointer 111 // to a freshly allocated value and then mapping the element to that value. 112 // 113 func Unmarshal(data []byte, v interface{}) error { 114 return NewDecoder(bytes.NewBuffer(data)).Decode(v) 115 } 116 117 // Decode works like xml.Unmarshal, except it reads the decoder 118 // stream to find the start element. 119 func (d *Decoder) Decode(v interface{}) error { 120 return d.DecodeElement(v, nil) 121 } 122 123 // DecodeElement works like xml.Unmarshal except that it takes 124 // a pointer to the start XML element to decode into v. 125 // It is useful when a client reads some raw XML tokens itself 126 // but also wants to defer to Unmarshal for some elements. 127 func (d *Decoder) DecodeElement(v interface{}, start *StartElement) error { 128 val := reflect.ValueOf(v) 129 if val.Kind() != reflect.Ptr { 130 return errors.New("non-pointer passed to Unmarshal") 131 } 132 return d.unmarshal(val.Elem(), start) 133 } 134 135 // An UnmarshalError represents an error in the unmarshalling process. 136 type UnmarshalError string 137 138 func (e UnmarshalError) Error() string { return string(e) } 139 140 // Unmarshal a single XML element into val. 141 func (p *Decoder) unmarshal(val reflect.Value, start *StartElement) error { 142 // Find start element if we need it. 143 if start == nil { 144 for { 145 tok, err := p.Token() 146 if err != nil { 147 return err 148 } 149 if t, ok := tok.(StartElement); ok { 150 start = &t 151 break 152 } 153 } 154 } 155 156 if pv := val; pv.Kind() == reflect.Ptr { 157 if pv.IsNil() { 158 pv.Set(reflect.New(pv.Type().Elem())) 159 } 160 val = pv.Elem() 161 } 162 163 var ( 164 data []byte 165 saveData reflect.Value 166 comment []byte 167 saveComment reflect.Value 168 saveXML reflect.Value 169 saveXMLIndex int 170 saveXMLData []byte 171 saveAny reflect.Value 172 sv reflect.Value 173 tinfo *typeInfo 174 err error 175 ) 176 177 switch v := val; v.Kind() { 178 default: 179 return errors.New("unknown type " + v.Type().String()) 180 181 case reflect.Interface: 182 // TODO: For now, simply ignore the field. In the near 183 // future we may choose to unmarshal the start 184 // element on it, if not nil. 185 return p.Skip() 186 187 case reflect.Slice: 188 typ := v.Type() 189 if typ.Elem().Kind() == reflect.Uint8 { 190 // []byte 191 saveData = v 192 break 193 } 194 195 // Slice of element values. 196 // Grow slice. 197 n := v.Len() 198 if n >= v.Cap() { 199 ncap := 2 * n 200 if ncap < 4 { 201 ncap = 4 202 } 203 new := reflect.MakeSlice(typ, n, ncap) 204 reflect.Copy(new, v) 205 v.Set(new) 206 } 207 v.SetLen(n + 1) 208 209 // Recur to read element into slice. 210 if err := p.unmarshal(v.Index(n), start); err != nil { 211 v.SetLen(n) 212 return err 213 } 214 return nil 215 216 case reflect.Bool, reflect.Float32, reflect.Float64, reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr, reflect.String: 217 saveData = v 218 219 case reflect.Struct: 220 typ := v.Type() 221 if typ == nameType { 222 v.Set(reflect.ValueOf(start.Name)) 223 break 224 } 225 if typ == timeType { 226 saveData = v 227 break 228 } 229 230 sv = v 231 tinfo, err = getTypeInfo(typ) 232 if err != nil { 233 return err 234 } 235 236 // Validate and assign element name. 237 if tinfo.xmlname != nil { 238 finfo := tinfo.xmlname 239 if finfo.name != "" && finfo.name != start.Name.Local { 240 return UnmarshalError("expected element type <" + finfo.name + "> but have <" + start.Name.Local + ">") 241 } 242 if finfo.xmlns != "" && finfo.xmlns != start.Name.Space { 243 e := "expected element <" + finfo.name + "> in name space " + finfo.xmlns + " but have " 244 if start.Name.Space == "" { 245 e += "no name space" 246 } else { 247 e += start.Name.Space 248 } 249 return UnmarshalError(e) 250 } 251 fv := sv.FieldByIndex(finfo.idx) 252 if _, ok := fv.Interface().(Name); ok { 253 fv.Set(reflect.ValueOf(start.Name)) 254 } 255 } 256 257 // Assign attributes. 258 // Also, determine whether we need to save character data or comments. 259 for i := range tinfo.fields { 260 finfo := &tinfo.fields[i] 261 switch finfo.flags & fMode { 262 case fAttr: 263 strv := sv.FieldByIndex(finfo.idx) 264 // Look for attribute. 265 for _, a := range start.Attr { 266 if a.Name.Local == finfo.name { 267 copyValue(strv, []byte(a.Value)) 268 break 269 } 270 } 271 272 case fCharData: 273 if !saveData.IsValid() { 274 saveData = sv.FieldByIndex(finfo.idx) 275 } 276 277 case fComment: 278 if !saveComment.IsValid() { 279 saveComment = sv.FieldByIndex(finfo.idx) 280 } 281 282 case fAny: 283 if !saveAny.IsValid() { 284 saveAny = sv.FieldByIndex(finfo.idx) 285 } 286 287 case fInnerXml: 288 if !saveXML.IsValid() { 289 saveXML = sv.FieldByIndex(finfo.idx) 290 if p.saved == nil { 291 saveXMLIndex = 0 292 p.saved = new(bytes.Buffer) 293 } else { 294 saveXMLIndex = p.savedOffset() 295 } 296 } 297 } 298 } 299 } 300 301 // Find end element. 302 // Process sub-elements along the way. 303 Loop: 304 for { 305 var savedOffset int 306 if saveXML.IsValid() { 307 savedOffset = p.savedOffset() 308 } 309 tok, err := p.Token() 310 if err != nil { 311 return err 312 } 313 switch t := tok.(type) { 314 case StartElement: 315 consumed := false 316 if sv.IsValid() { 317 consumed, err = p.unmarshalPath(tinfo, sv, nil, &t) 318 if err != nil { 319 return err 320 } 321 if !consumed && saveAny.IsValid() { 322 consumed = true 323 if err := p.unmarshal(saveAny, &t); err != nil { 324 return err 325 } 326 } 327 } 328 if !consumed { 329 if err := p.Skip(); err != nil { 330 return err 331 } 332 } 333 334 case EndElement: 335 if saveXML.IsValid() { 336 saveXMLData = p.saved.Bytes()[saveXMLIndex:savedOffset] 337 if saveXMLIndex == 0 { 338 p.saved = nil 339 } 340 } 341 break Loop 342 343 case CharData: 344 if saveData.IsValid() { 345 data = append(data, t...) 346 } 347 348 case Comment: 349 if saveComment.IsValid() { 350 comment = append(comment, t...) 351 } 352 } 353 } 354 355 if err := copyValue(saveData, data); err != nil { 356 return err 357 } 358 359 switch t := saveComment; t.Kind() { 360 case reflect.String: 361 t.SetString(string(comment)) 362 case reflect.Slice: 363 t.Set(reflect.ValueOf(comment)) 364 } 365 366 switch t := saveXML; t.Kind() { 367 case reflect.String: 368 t.SetString(string(saveXMLData)) 369 case reflect.Slice: 370 t.Set(reflect.ValueOf(saveXMLData)) 371 } 372 373 return nil 374 } 375 376 func copyValue(dst reflect.Value, src []byte) (err error) { 377 // Helper functions for integer and unsigned integer conversions 378 var itmp int64 379 getInt64 := func() bool { 380 itmp, err = strconv.ParseInt(string(src), 10, 64) 381 // TODO: should check sizes 382 return err == nil 383 } 384 var utmp uint64 385 getUint64 := func() bool { 386 utmp, err = strconv.ParseUint(string(src), 10, 64) 387 // TODO: check for overflow? 388 return err == nil 389 } 390 var ftmp float64 391 getFloat64 := func() bool { 392 ftmp, err = strconv.ParseFloat(string(src), 64) 393 // TODO: check for overflow? 394 return err == nil 395 } 396 397 // Save accumulated data. 398 switch t := dst; t.Kind() { 399 case reflect.Invalid: 400 // Probably a comment. 401 default: 402 return errors.New("cannot happen: unknown type " + t.Type().String()) 403 case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64: 404 if !getInt64() { 405 return err 406 } 407 t.SetInt(itmp) 408 case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr: 409 if !getUint64() { 410 return err 411 } 412 t.SetUint(utmp) 413 case reflect.Float32, reflect.Float64: 414 if !getFloat64() { 415 return err 416 } 417 t.SetFloat(ftmp) 418 case reflect.Bool: 419 value, err := strconv.ParseBool(strings.TrimSpace(string(src))) 420 if err != nil { 421 return err 422 } 423 t.SetBool(value) 424 case reflect.String: 425 t.SetString(string(src)) 426 case reflect.Slice: 427 if len(src) == 0 { 428 // non-nil to flag presence 429 src = []byte{} 430 } 431 t.SetBytes(src) 432 case reflect.Struct: 433 if t.Type() == timeType { 434 tv, err := time.Parse(time.RFC3339, string(src)) 435 if err != nil { 436 return err 437 } 438 t.Set(reflect.ValueOf(tv)) 439 } 440 } 441 return nil 442 } 443 444 // unmarshalPath walks down an XML structure looking for wanted 445 // paths, and calls unmarshal on them. 446 // The consumed result tells whether XML elements have been consumed 447 // from the Decoder until start's matching end element, or if it's 448 // still untouched because start is uninteresting for sv's fields. 449 func (p *Decoder) unmarshalPath(tinfo *typeInfo, sv reflect.Value, parents []string, start *StartElement) (consumed bool, err error) { 450 recurse := false 451 Loop: 452 for i := range tinfo.fields { 453 finfo := &tinfo.fields[i] 454 if finfo.flags&fElement == 0 || len(finfo.parents) < len(parents) { 455 continue 456 } 457 for j := range parents { 458 if parents[j] != finfo.parents[j] { 459 continue Loop 460 } 461 } 462 if len(finfo.parents) == len(parents) && finfo.name == start.Name.Local { 463 // It's a perfect match, unmarshal the field. 464 return true, p.unmarshal(sv.FieldByIndex(finfo.idx), start) 465 } 466 if len(finfo.parents) > len(parents) && finfo.parents[len(parents)] == start.Name.Local { 467 // It's a prefix for the field. Break and recurse 468 // since it's not ok for one field path to be itself 469 // the prefix for another field path. 470 recurse = true 471 472 // We can reuse the same slice as long as we 473 // don't try to append to it. 474 parents = finfo.parents[:len(parents)+1] 475 break 476 } 477 } 478 if !recurse { 479 // We have no business with this element. 480 return false, nil 481 } 482 // The element is not a perfect match for any field, but one 483 // or more fields have the path to this element as a parent 484 // prefix. Recurse and attempt to match these. 485 for { 486 var tok Token 487 tok, err = p.Token() 488 if err != nil { 489 return true, err 490 } 491 switch t := tok.(type) { 492 case StartElement: 493 consumed2, err := p.unmarshalPath(tinfo, sv, parents, &t) 494 if err != nil { 495 return true, err 496 } 497 if !consumed2 { 498 if err := p.Skip(); err != nil { 499 return true, err 500 } 501 } 502 case EndElement: 503 return true, nil 504 } 505 } 506 panic("unreachable") 507 } 508 509 // Skip reads tokens until it has consumed the end element 510 // matching the most recent start element already consumed. 511 // It recurs if it encounters a start element, so it can be used to 512 // skip nested structures. 513 // It returns nil if it finds an end element matching the start 514 // element; otherwise it returns an error describing the problem. 515 func (d *Decoder) Skip() error { 516 for { 517 tok, err := d.Token() 518 if err != nil { 519 return err 520 } 521 switch tok.(type) { 522 case StartElement: 523 if err := d.Skip(); err != nil { 524 return err 525 } 526 case EndElement: 527 return nil 528 } 529 } 530 panic("unreachable") 531 }