// Source file src/pkg/net/url/url.go
1 // Copyright 2009 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package url parses URLs and implements query escaping. 6 // See RFC 3986. 7 package url 8 9 import ( 10 "errors" 11 "strconv" 12 "strings" 13 ) 14 15 // Error reports an error and the operation and URL that caused it. 16 type Error struct { 17 Op string 18 URL string 19 Err error 20 } 21 22 func (e *Error) Error() string { return e.Op + " " + e.URL + ": " + e.Err.Error() } 23 24 func ishex(c byte) bool { 25 switch { 26 case '0' <= c && c <= '9': 27 return true 28 case 'a' <= c && c <= 'f': 29 return true 30 case 'A' <= c && c <= 'F': 31 return true 32 } 33 return false 34 } 35 36 func unhex(c byte) byte { 37 switch { 38 case '0' <= c && c <= '9': 39 return c - '0' 40 case 'a' <= c && c <= 'f': 41 return c - 'a' + 10 42 case 'A' <= c && c <= 'F': 43 return c - 'A' + 10 44 } 45 return 0 46 } 47 48 type encoding int 49 50 const ( 51 encodePath encoding = 1 + iota 52 encodeUserPassword 53 encodeQueryComponent 54 encodeFragment 55 ) 56 57 type EscapeError string 58 59 func (e EscapeError) Error() string { 60 return "invalid URL escape " + strconv.Quote(string(e)) 61 } 62 63 // Return true if the specified character should be escaped when 64 // appearing in a URL string, according to RFC 3986. 65 // When 'all' is true the full range of reserved characters are matched. 66 func shouldEscape(c byte, mode encoding) bool { 67 // §2.3 Unreserved characters (alphanum) 68 if 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' { 69 return false 70 } 71 72 switch c { 73 case '-', '_', '.', '~': // §2.3 Unreserved characters (mark) 74 return false 75 76 case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved) 77 // Different sections of the URL allow a few of 78 // the reserved characters to appear unescaped. 
79 switch mode { 80 case encodePath: // §3.3 81 // The RFC allows : @ & = + $ but saves / ; , for assigning 82 // meaning to individual path segments. This package 83 // only manipulates the path as a whole, so we allow those 84 // last two as well. That leaves only ? to escape. 85 return c == '?' 86 87 case encodeUserPassword: // §3.2.2 88 // The RFC allows ; : & = + $ , in userinfo, so we must escape only @ and /. 89 // The parsing of userinfo treats : as special so we must escape that too. 90 return c == '@' || c == '/' || c == ':' 91 92 case encodeQueryComponent: // §3.4 93 // The RFC reserves (so we must escape) everything. 94 return true 95 96 case encodeFragment: // §4.1 97 // The RFC text is silent but the grammar allows 98 // everything, so escape nothing. 99 return false 100 } 101 } 102 103 // Everything else must be escaped. 104 return true 105 } 106 107 // QueryUnescape does the inverse transformation of QueryEscape, converting 108 // %AB into the byte 0xAB and '+' into ' ' (space). It returns an error if 109 // any % is not followed by two hexadecimal digits. 110 func QueryUnescape(s string) (string, error) { 111 return unescape(s, encodeQueryComponent) 112 } 113 114 // unescape unescapes a string; the mode specifies 115 // which section of the URL string is being unescaped. 116 func unescape(s string, mode encoding) (string, error) { 117 // Count %, check that they're well-formed. 
118 n := 0 119 hasPlus := false 120 for i := 0; i < len(s); { 121 switch s[i] { 122 case '%': 123 n++ 124 if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) { 125 s = s[i:] 126 if len(s) > 3 { 127 s = s[0:3] 128 } 129 return "", EscapeError(s) 130 } 131 i += 3 132 case '+': 133 hasPlus = mode == encodeQueryComponent 134 i++ 135 default: 136 i++ 137 } 138 } 139 140 if n == 0 && !hasPlus { 141 return s, nil 142 } 143 144 t := make([]byte, len(s)-2*n) 145 j := 0 146 for i := 0; i < len(s); { 147 switch s[i] { 148 case '%': 149 t[j] = unhex(s[i+1])<<4 | unhex(s[i+2]) 150 j++ 151 i += 3 152 case '+': 153 if mode == encodeQueryComponent { 154 t[j] = ' ' 155 } else { 156 t[j] = '+' 157 } 158 j++ 159 i++ 160 default: 161 t[j] = s[i] 162 j++ 163 i++ 164 } 165 } 166 return string(t), nil 167 } 168 169 // QueryEscape escapes the string so it can be safely placed 170 // inside a URL query. 171 func QueryEscape(s string) string { 172 return escape(s, encodeQueryComponent) 173 } 174 175 func escape(s string, mode encoding) string { 176 spaceCount, hexCount := 0, 0 177 for i := 0; i < len(s); i++ { 178 c := s[i] 179 if shouldEscape(c, mode) { 180 if c == ' ' && mode == encodeQueryComponent { 181 spaceCount++ 182 } else { 183 hexCount++ 184 } 185 } 186 } 187 188 if spaceCount == 0 && hexCount == 0 { 189 return s 190 } 191 192 t := make([]byte, len(s)+2*hexCount) 193 j := 0 194 for i := 0; i < len(s); i++ { 195 switch c := s[i]; { 196 case c == ' ' && mode == encodeQueryComponent: 197 t[j] = '+' 198 j++ 199 case shouldEscape(c, mode): 200 t[j] = '%' 201 t[j+1] = "0123456789ABCDEF"[c>>4] 202 t[j+2] = "0123456789ABCDEF"[c&15] 203 j += 3 204 default: 205 t[j] = s[i] 206 j++ 207 } 208 } 209 return string(t) 210 } 211 212 // A URL represents a parsed URL (technically, a URI reference). 
// The general form represented is:
//
//	scheme://[userinfo@]host/path[?query][#fragment]
//
// URLs that do not start with a slash after the scheme are interpreted as:
//
//	scheme:opaque[?query][#fragment]
type URL struct {
	Scheme   string
	Opaque   string    // encoded opaque data
	User     *Userinfo // username and password information
	Host     string
	Path     string
	RawQuery string // encoded query values, without '?'
	Fragment string // fragment for references, without '#'
}

// User returns a Userinfo containing the provided username
// and no password set.
func User(username string) *Userinfo {
	return &Userinfo{username: username}
}

// UserPassword returns a Userinfo containing the provided username
// and password.
// This functionality should only be used with legacy web sites.
// RFC 2396 warns that interpreting Userinfo this way
// ``is NOT RECOMMENDED, because the passing of authentication
// information in clear text (such as URI) has proven to be a
// security risk in almost every case where it has been used.''
func UserPassword(username, password string) *Userinfo {
	return &Userinfo{username: username, password: password, passwordSet: true}
}

// The Userinfo type is an immutable encapsulation of username and
// password details for a URL. An existing Userinfo value is guaranteed
// to have a username set (potentially empty, as allowed by RFC 2396),
// and optionally a password.
type Userinfo struct {
	username    string
	password    string
	passwordSet bool // distinguishes "no password" from an empty password
}

// Username returns the username.
func (u *Userinfo) Username() string {
	return u.username
}

// Password returns the password in case it is set, and whether it is set.
264 func (u *Userinfo) Password() (string, bool) { 265 if u.passwordSet { 266 return u.password, true 267 } 268 return "", false 269 } 270 271 // String returns the encoded userinfo information in the standard form 272 // of "username[:password]". 273 func (u *Userinfo) String() string { 274 s := escape(u.username, encodeUserPassword) 275 if u.passwordSet { 276 s += ":" + escape(u.password, encodeUserPassword) 277 } 278 return s 279 } 280 281 // Maybe rawurl is of the form scheme:path. 282 // (Scheme must be [a-zA-Z][a-zA-Z0-9+-.]*) 283 // If so, return scheme, path; else return "", rawurl. 284 func getscheme(rawurl string) (scheme, path string, err error) { 285 for i := 0; i < len(rawurl); i++ { 286 c := rawurl[i] 287 switch { 288 case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z': 289 // do nothing 290 case '0' <= c && c <= '9' || c == '+' || c == '-' || c == '.': 291 if i == 0 { 292 return "", rawurl, nil 293 } 294 case c == ':': 295 if i == 0 { 296 return "", "", errors.New("missing protocol scheme") 297 } 298 return rawurl[0:i], rawurl[i+1:], nil 299 default: 300 // we have encountered an invalid character, 301 // so there is no valid scheme 302 return "", rawurl, nil 303 } 304 } 305 return "", rawurl, nil 306 } 307 308 // Maybe s is of the form t c u. 309 // If so, return t, c u (or t, u if cutc == true). 310 // If not, return s, "". 311 func split(s string, c byte, cutc bool) (string, string) { 312 for i := 0; i < len(s); i++ { 313 if s[i] == c { 314 if cutc { 315 return s[0:i], s[i+1:] 316 } 317 return s[0:i], s[i:] 318 } 319 } 320 return s, "" 321 } 322 323 // Parse parses rawurl into a URL structure. 324 // The rawurl may be relative or absolute. 
325 func Parse(rawurl string) (url *URL, err error) { 326 // Cut off #frag 327 u, frag := split(rawurl, '#', true) 328 if url, err = parse(u, false); err != nil { 329 return nil, err 330 } 331 if frag == "" { 332 return url, nil 333 } 334 if url.Fragment, err = unescape(frag, encodeFragment); err != nil { 335 return nil, &Error{"parse", rawurl, err} 336 } 337 return url, nil 338 } 339 340 // ParseRequestURI parses rawurl into a URL structure. It assumes that 341 // rawurl was received in an HTTP request, so the rawurl is interpreted 342 // only as an absolute URI or an absolute path. 343 // The string rawurl is assumed not to have a #fragment suffix. 344 // (Web browsers strip #fragment before sending the URL to a web server.) 345 func ParseRequestURI(rawurl string) (url *URL, err error) { 346 return parse(rawurl, true) 347 } 348 349 // parse parses a URL from a string in one of two contexts. If 350 // viaRequest is true, the URL is assumed to have arrived via an HTTP request, 351 // in which case only absolute URLs or path-absolute relative URLs are allowed. 352 // If viaRequest is false, all forms of relative URLs are allowed. 353 func parse(rawurl string, viaRequest bool) (url *URL, err error) { 354 var rest string 355 356 if rawurl == "" { 357 err = errors.New("empty url") 358 goto Error 359 } 360 url = new(URL) 361 362 // Split off possible leading "http:", "mailto:", etc. 363 // Cannot contain escaped characters. 364 if url.Scheme, rest, err = getscheme(rawurl); err != nil { 365 goto Error 366 } 367 368 rest, url.RawQuery = split(rest, '?', true) 369 370 if !strings.HasPrefix(rest, "/") { 371 if url.Scheme != "" { 372 // We consider rootless paths per RFC 3986 as opaque. 
373 url.Opaque = rest 374 return url, nil 375 } 376 if viaRequest { 377 err = errors.New("invalid URI for request") 378 goto Error 379 } 380 } 381 382 if (url.Scheme != "" || !viaRequest) && strings.HasPrefix(rest, "//") && !strings.HasPrefix(rest, "///") { 383 var authority string 384 authority, rest = split(rest[2:], '/', false) 385 url.User, url.Host, err = parseAuthority(authority) 386 if err != nil { 387 goto Error 388 } 389 if strings.Contains(url.Host, "%") { 390 err = errors.New("hexadecimal escape in host") 391 goto Error 392 } 393 } 394 if url.Path, err = unescape(rest, encodePath); err != nil { 395 goto Error 396 } 397 return url, nil 398 399 Error: 400 return nil, &Error{"parse", rawurl, err} 401 } 402 403 func parseAuthority(authority string) (user *Userinfo, host string, err error) { 404 if strings.Index(authority, "@") < 0 { 405 host = authority 406 return 407 } 408 userinfo, host := split(authority, '@', true) 409 if strings.Index(userinfo, ":") < 0 { 410 if userinfo, err = unescape(userinfo, encodeUserPassword); err != nil { 411 return 412 } 413 user = User(userinfo) 414 } else { 415 username, password := split(userinfo, ':', true) 416 if username, err = unescape(username, encodeUserPassword); err != nil { 417 return 418 } 419 if password, err = unescape(password, encodeUserPassword); err != nil { 420 return 421 } 422 user = UserPassword(username, password) 423 } 424 return 425 } 426 427 // String reassembles the URL into a valid URL string. 428 func (u *URL) String() string { 429 // TODO: Rewrite to use bytes.Buffer 430 result := "" 431 if u.Scheme != "" { 432 result += u.Scheme + ":" 433 } 434 if u.Opaque != "" { 435 result += u.Opaque 436 } else { 437 if u.Host != "" || u.User != nil { 438 result += "//" 439 if u := u.User; u != nil { 440 result += u.String() + "@" 441 } 442 result += u.Host 443 } 444 result += escape(u.Path, encodePath) 445 } 446 if u.RawQuery != "" { 447 result += "?" 
+ u.RawQuery 448 } 449 if u.Fragment != "" { 450 result += "#" + escape(u.Fragment, encodeFragment) 451 } 452 return result 453 } 454 455 // Values maps a string key to a list of values. 456 // It is typically used for query parameters and form values. 457 // Unlike in the http.Header map, the keys in a Values map 458 // are case-sensitive. 459 type Values map[string][]string 460 461 // Get gets the first value associated with the given key. 462 // If there are no values associated with the key, Get returns 463 // the empty string. To access multiple values, use the map 464 // directly. 465 func (v Values) Get(key string) string { 466 if v == nil { 467 return "" 468 } 469 vs, ok := v[key] 470 if !ok || len(vs) == 0 { 471 return "" 472 } 473 return vs[0] 474 } 475 476 // Set sets the key to value. It replaces any existing 477 // values. 478 func (v Values) Set(key, value string) { 479 v[key] = []string{value} 480 } 481 482 // Add adds the key to value. It appends to any existing 483 // values associated with key. 484 func (v Values) Add(key, value string) { 485 v[key] = append(v[key], value) 486 } 487 488 // Del deletes the values associated with key. 489 func (v Values) Del(key string) { 490 delete(v, key) 491 } 492 493 // ParseQuery parses the URL-encoded query string and returns 494 // a map listing the values specified for each key. 495 // ParseQuery always returns a non-nil map containing all the 496 // valid query parameters found; err describes the first decoding error 497 // encountered, if any. 
498 func ParseQuery(query string) (m Values, err error) { 499 m = make(Values) 500 err = parseQuery(m, query) 501 return 502 } 503 504 func parseQuery(m Values, query string) (err error) { 505 for query != "" { 506 key := query 507 if i := strings.IndexAny(key, "&;"); i >= 0 { 508 key, query = key[:i], key[i+1:] 509 } else { 510 query = "" 511 } 512 if key == "" { 513 continue 514 } 515 value := "" 516 if i := strings.Index(key, "="); i >= 0 { 517 key, value = key[:i], key[i+1:] 518 } 519 key, err1 := QueryUnescape(key) 520 if err1 != nil { 521 err = err1 522 continue 523 } 524 value, err1 = QueryUnescape(value) 525 if err1 != nil { 526 err = err1 527 continue 528 } 529 m[key] = append(m[key], value) 530 } 531 return err 532 } 533 534 // Encode encodes the values into ``URL encoded'' form. 535 // e.g. "foo=bar&bar=baz" 536 func (v Values) Encode() string { 537 if v == nil { 538 return "" 539 } 540 parts := make([]string, 0, len(v)) // will be large enough for most uses 541 for k, vs := range v { 542 prefix := QueryEscape(k) + "=" 543 for _, v := range vs { 544 parts = append(parts, prefix+QueryEscape(v)) 545 } 546 } 547 return strings.Join(parts, "&") 548 } 549 550 // resolvePath applies special path segments from refs and applies 551 // them to base, per RFC 2396. 552 func resolvePath(basepath string, refpath string) string { 553 base := strings.Split(basepath, "/") 554 refs := strings.Split(refpath, "/") 555 if len(base) == 0 { 556 base = []string{""} 557 } 558 for idx, ref := range refs { 559 switch { 560 case ref == ".": 561 base[len(base)-1] = "" 562 case ref == "..": 563 newLen := len(base) - 1 564 if newLen < 1 { 565 newLen = 1 566 } 567 base = base[0:newLen] 568 base[len(base)-1] = "" 569 default: 570 if idx == 0 || base[len(base)-1] == "" { 571 base[len(base)-1] = ref 572 } else { 573 base = append(base, ref) 574 } 575 } 576 } 577 return strings.Join(base, "/") 578 } 579 580 // IsAbs returns true if the URL is absolute. 
581 func (u *URL) IsAbs() bool { 582 return u.Scheme != "" 583 } 584 585 // Parse parses a URL in the context of the receiver. The provided URL 586 // may be relative or absolute. Parse returns nil, err on parse 587 // failure, otherwise its return value is the same as ResolveReference. 588 func (u *URL) Parse(ref string) (*URL, error) { 589 refurl, err := Parse(ref) 590 if err != nil { 591 return nil, err 592 } 593 return u.ResolveReference(refurl), nil 594 } 595 596 // ResolveReference resolves a URI reference to an absolute URI from 597 // an absolute base URI, per RFC 2396 Section 5.2. The URI reference 598 // may be relative or absolute. ResolveReference always returns a new 599 // URL instance, even if the returned URL is identical to either the 600 // base or reference. If ref is an absolute URL, then ResolveReference 601 // ignores base and returns a copy of ref. 602 func (u *URL) ResolveReference(ref *URL) *URL { 603 if ref.IsAbs() { 604 url := *ref 605 return &url 606 } 607 // relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ] 608 url := *u 609 url.RawQuery = ref.RawQuery 610 url.Fragment = ref.Fragment 611 if ref.Opaque != "" { 612 url.Opaque = ref.Opaque 613 url.User = nil 614 url.Host = "" 615 url.Path = "" 616 return &url 617 } 618 if ref.Host != "" || ref.User != nil { 619 // The "net_path" case. 620 url.Host = ref.Host 621 url.User = ref.User 622 } 623 if strings.HasPrefix(ref.Path, "/") { 624 // The "abs_path" case. 625 url.Path = ref.Path 626 } else { 627 // The "rel_path" case. 628 path := resolvePath(u.Path, ref.Path) 629 if !strings.HasPrefix(path, "/") { 630 path = "/" + path 631 } 632 url.Path = path 633 } 634 return &url 635 } 636 637 // Query parses RawQuery and returns the corresponding values. 638 func (u *URL) Query() Values { 639 v, _ := ParseQuery(u.RawQuery) 640 return v 641 } 642 643 // RequestURI returns the encoded path?query or opaque?query 644 // string that would be used in an HTTP request for u. 
645 func (u *URL) RequestURI() string { 646 result := u.Opaque 647 if result == "" { 648 result = escape(u.Path, encodePath) 649 if result == "" { 650 result = "/" 651 } 652 } 653 if u.RawQuery != "" { 654 result += "?" + u.RawQuery 655 } 656 return result 657 }