Source file src/pkg/net/url/url.go
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package url parses URLs and implements query escaping.
6 // See RFC 3986.
7 package url
8
import (
	"errors"
	"sort"
	"strconv"
	"strings"
)
14
// Error describes a failed URL operation: which operation was attempted,
// the URL text it was applied to, and the underlying error.
type Error struct {
	Op  string // operation that failed, e.g. "parse"
	URL string // URL text the operation was applied to
	Err error  // underlying cause; Error calls its Error method
}

// Error implements the error interface, formatting the value as
// "<Op> <URL>: <Err>".
func (e *Error) Error() string {
	where := e.Op + " " + e.URL
	return where + ": " + e.Err.Error()
}
23
// ishex reports whether c is an ASCII hexadecimal digit (0-9, a-f, A-F).
func ishex(c byte) bool {
	if c >= '0' && c <= '9' {
		return true
	}
	// Setting bit 0x20 maps 'A'-'F' onto 'a'-'f' and leaves 'a'-'f' as is;
	// no other byte lands in that range.
	folded := c | 0x20
	return folded >= 'a' && folded <= 'f'
}
35
// unhex returns the numeric value (0-15) of the hexadecimal digit c.
// Any byte that is not a hexadecimal digit yields 0.
func unhex(c byte) byte {
	if '0' <= c && c <= '9' {
		return c - '0'
	}
	if 'a' <= c && c <= 'f' {
		return 10 + (c - 'a')
	}
	if 'A' <= c && c <= 'F' {
		return 10 + (c - 'A')
	}
	return 0
}
47
// encoding identifies the part of a URL a string belongs to; the escaping
// and unescaping routines apply different rules to each part.
type encoding int

// Numbering starts at 1, so the zero value of encoding matches none of
// the modes below.
const (
	encodePath encoding = iota + 1
	encodeUserPassword
	encodeQueryComponent
	encodeFragment
)
56
// EscapeError is the error type for a malformed percent-escape. Its
// string value is the offending text, which Error reports in quoted form.
type EscapeError string

// Error implements the error interface.
func (e EscapeError) Error() string {
	quoted := strconv.Quote(string(e))
	return "invalid URL escape " + quoted
}
62
63 // Return true if the specified character should be escaped when
64 // appearing in a URL string, according to RFC 3986.
65 // When 'all' is true the full range of reserved characters are matched.
66 func shouldEscape(c byte, mode encoding) bool {
67 // §2.3 Unreserved characters (alphanum)
68 if 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' {
69 return false
70 }
71
72 switch c {
73 case '-', '_', '.', '~': // §2.3 Unreserved characters (mark)
74 return false
75
76 case '$', '&', '+', ',', '/', ':', ';', '=', '?', '@': // §2.2 Reserved characters (reserved)
77 // Different sections of the URL allow a few of
78 // the reserved characters to appear unescaped.
79 switch mode {
80 case encodePath: // §3.3
81 // The RFC allows : @ & = + $ but saves / ; , for assigning
82 // meaning to individual path segments. This package
83 // only manipulates the path as a whole, so we allow those
84 // last two as well. That leaves only ? to escape.
85 return c == '?'
86
87 case encodeUserPassword: // §3.2.2
88 // The RFC allows ; : & = + $ , in userinfo, so we must escape only @ and /.
89 // The parsing of userinfo treats : as special so we must escape that too.
90 return c == '@' || c == '/' || c == ':'
91
92 case encodeQueryComponent: // §3.4
93 // The RFC reserves (so we must escape) everything.
94 return true
95
96 case encodeFragment: // §4.1
97 // The RFC text is silent but the grammar allows
98 // everything, so escape nothing.
99 return false
100 }
101 }
102
103 // Everything else must be escaped.
104 return true
105 }
106
107 // QueryUnescape does the inverse transformation of QueryEscape, converting
108 // %AB into the byte 0xAB and '+' into ' ' (space). It returns an error if
109 // any % is not followed by two hexadecimal digits.
110 func QueryUnescape(s string) (string, error) {
111 return unescape(s, encodeQueryComponent)
112 }
113
114 // unescape unescapes a string; the mode specifies
115 // which section of the URL string is being unescaped.
116 func unescape(s string, mode encoding) (string, error) {
117 // Count %, check that they're well-formed.
118 n := 0
119 hasPlus := false
120 for i := 0; i < len(s); {
121 switch s[i] {
122 case '%':
123 n++
124 if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
125 s = s[i:]
126 if len(s) > 3 {
127 s = s[0:3]
128 }
129 return "", EscapeError(s)
130 }
131 i += 3
132 case '+':
133 hasPlus = mode == encodeQueryComponent
134 i++
135 default:
136 i++
137 }
138 }
139
140 if n == 0 && !hasPlus {
141 return s, nil
142 }
143
144 t := make([]byte, len(s)-2*n)
145 j := 0
146 for i := 0; i < len(s); {
147 switch s[i] {
148 case '%':
149 t[j] = unhex(s[i+1])<<4 | unhex(s[i+2])
150 j++
151 i += 3
152 case '+':
153 if mode == encodeQueryComponent {
154 t[j] = ' '
155 } else {
156 t[j] = '+'
157 }
158 j++
159 i++
160 default:
161 t[j] = s[i]
162 j++
163 i++
164 }
165 }
166 return string(t), nil
167 }
168
169 // QueryEscape escapes the string so it can be safely placed
170 // inside a URL query.
171 func QueryEscape(s string) string {
172 return escape(s, encodeQueryComponent)
173 }
174
175 func escape(s string, mode encoding) string {
176 spaceCount, hexCount := 0, 0
177 for i := 0; i < len(s); i++ {
178 c := s[i]
179 if shouldEscape(c, mode) {
180 if c == ' ' && mode == encodeQueryComponent {
181 spaceCount++
182 } else {
183 hexCount++
184 }
185 }
186 }
187
188 if spaceCount == 0 && hexCount == 0 {
189 return s
190 }
191
192 t := make([]byte, len(s)+2*hexCount)
193 j := 0
194 for i := 0; i < len(s); i++ {
195 switch c := s[i]; {
196 case c == ' ' && mode == encodeQueryComponent:
197 t[j] = '+'
198 j++
199 case shouldEscape(c, mode):
200 t[j] = '%'
201 t[j+1] = "0123456789ABCDEF"[c>>4]
202 t[j+2] = "0123456789ABCDEF"[c&15]
203 j += 3
204 default:
205 t[j] = s[i]
206 j++
207 }
208 }
209 return string(t)
210 }
211
212 // A URL represents a parsed URL (technically, a URI reference).
213 // The general form represented is:
214 //
215 // scheme://[userinfo@]host/path[?query][#fragment]
216 //
217 // URLs that do not start with a slash after the scheme are interpreted as:
218 //
219 // scheme:opaque[?query][#fragment]
220 //
221 type URL struct {
222 Scheme string
223 Opaque string // encoded opaque data
224 User *Userinfo // username and password information
225 Host string
226 Path string
227 RawQuery string // encoded query values, without '?'
228 Fragment string // fragment for references, without '#'
229 }
230
231 // User returns a Userinfo containing the provided username
232 // and no password set.
233 func User(username string) *Userinfo {
234 return &Userinfo{username, "", false}
235 }
236
237 // UserPassword returns a Userinfo containing the provided username
238 // and password.
239 // This functionality should only be used with legacy web sites.
240 // RFC 2396 warns that interpreting Userinfo this way
241 // ``is NOT RECOMMENDED, because the passing of authentication
242 // information in clear text (such as URI) has proven to be a
243 // security risk in almost every case where it has been used.''
244 func UserPassword(username, password string) *Userinfo {
245 return &Userinfo{username, password, true}
246 }
247
248 // The Userinfo type is an immutable encapsulation of username and
249 // password details for a URL. An existing Userinfo value is guaranteed
250 // to have a username set (potentially empty, as allowed by RFC 2396),
251 // and optionally a password.
252 type Userinfo struct {
253 username string
254 password string
255 passwordSet bool
256 }
257
258 // Username returns the username.
259 func (u *Userinfo) Username() string {
260 return u.username
261 }
262
263 // Password returns the password in case it is set, and whether it is set.
264 func (u *Userinfo) Password() (string, bool) {
265 if u.passwordSet {
266 return u.password, true
267 }
268 return "", false
269 }
270
271 // String returns the encoded userinfo information in the standard form
272 // of "username[:password]".
273 func (u *Userinfo) String() string {
274 s := escape(u.username, encodeUserPassword)
275 if u.passwordSet {
276 s += ":" + escape(u.password, encodeUserPassword)
277 }
278 return s
279 }
280
// getscheme splits a leading "scheme:" off rawurl. A scheme must match
// [a-zA-Z][a-zA-Z0-9+-.]*. When one is found the result is the scheme and
// the text after the colon; otherwise the result is "" and rawurl
// unchanged. A rawurl that starts with ':' is reported as an error.
func getscheme(rawurl string) (scheme, path string, err error) {
	for i := 0; i < len(rawurl); i++ {
		c := rawurl[i]
		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
			continue
		}
		if c == ':' {
			if i == 0 {
				return "", "", errors.New("missing protocol scheme")
			}
			return rawurl[:i], rawurl[i+1:], nil
		}
		// Digits, '+', '-' and '.' may continue a scheme but not start
		// one; any other byte means there is no valid scheme.
		continues := '0' <= c && c <= '9' || c == '+' || c == '-' || c == '.'
		if !continues || i == 0 {
			return "", rawurl, nil
		}
	}
	return "", rawurl, nil
}
307
// split cuts s at the first occurrence of byte c. If c occurs, it returns
// the text before c and the text from c onward; the second result omits c
// itself when cutc is true. If c does not occur, it returns s and "".
func split(s string, c byte, cutc bool) (string, string) {
	at := 0
	for at < len(s) && s[at] != c {
		at++
	}
	if at == len(s) {
		return s, ""
	}
	tail := s[at:]
	if cutc {
		tail = tail[1:]
	}
	return s[:at], tail
}
322
323 // Parse parses rawurl into a URL structure.
324 // The rawurl may be relative or absolute.
325 func Parse(rawurl string) (url *URL, err error) {
326 // Cut off #frag
327 u, frag := split(rawurl, '#', true)
328 if url, err = parse(u, false); err != nil {
329 return nil, err
330 }
331 if frag == "" {
332 return url, nil
333 }
334 if url.Fragment, err = unescape(frag, encodeFragment); err != nil {
335 return nil, &Error{"parse", rawurl, err}
336 }
337 return url, nil
338 }
339
340 // ParseRequestURI parses rawurl into a URL structure. It assumes that
341 // rawurl was received in an HTTP request, so the rawurl is interpreted
342 // only as an absolute URI or an absolute path.
343 // The string rawurl is assumed not to have a #fragment suffix.
344 // (Web browsers strip #fragment before sending the URL to a web server.)
345 func ParseRequestURI(rawurl string) (url *URL, err error) {
346 return parse(rawurl, true)
347 }
348
349 // parse parses a URL from a string in one of two contexts. If
350 // viaRequest is true, the URL is assumed to have arrived via an HTTP request,
351 // in which case only absolute URLs or path-absolute relative URLs are allowed.
352 // If viaRequest is false, all forms of relative URLs are allowed.
353 func parse(rawurl string, viaRequest bool) (url *URL, err error) {
354 var rest string
355
356 if rawurl == "" {
357 err = errors.New("empty url")
358 goto Error
359 }
360 url = new(URL)
361
362 // Split off possible leading "http:", "mailto:", etc.
363 // Cannot contain escaped characters.
364 if url.Scheme, rest, err = getscheme(rawurl); err != nil {
365 goto Error
366 }
367
368 rest, url.RawQuery = split(rest, '?', true)
369
370 if !strings.HasPrefix(rest, "/") {
371 if url.Scheme != "" {
372 // We consider rootless paths per RFC 3986 as opaque.
373 url.Opaque = rest
374 return url, nil
375 }
376 if viaRequest {
377 err = errors.New("invalid URI for request")
378 goto Error
379 }
380 }
381
382 if (url.Scheme != "" || !viaRequest) && strings.HasPrefix(rest, "//") && !strings.HasPrefix(rest, "///") {
383 var authority string
384 authority, rest = split(rest[2:], '/', false)
385 url.User, url.Host, err = parseAuthority(authority)
386 if err != nil {
387 goto Error
388 }
389 if strings.Contains(url.Host, "%") {
390 err = errors.New("hexadecimal escape in host")
391 goto Error
392 }
393 }
394 if url.Path, err = unescape(rest, encodePath); err != nil {
395 goto Error
396 }
397 return url, nil
398
399 Error:
400 return nil, &Error{"parse", rawurl, err}
401 }
402
403 func parseAuthority(authority string) (user *Userinfo, host string, err error) {
404 if strings.Index(authority, "@") < 0 {
405 host = authority
406 return
407 }
408 userinfo, host := split(authority, '@', true)
409 if strings.Index(userinfo, ":") < 0 {
410 if userinfo, err = unescape(userinfo, encodeUserPassword); err != nil {
411 return
412 }
413 user = User(userinfo)
414 } else {
415 username, password := split(userinfo, ':', true)
416 if username, err = unescape(username, encodeUserPassword); err != nil {
417 return
418 }
419 if password, err = unescape(password, encodeUserPassword); err != nil {
420 return
421 }
422 user = UserPassword(username, password)
423 }
424 return
425 }
426
427 // String reassembles the URL into a valid URL string.
428 func (u *URL) String() string {
429 // TODO: Rewrite to use bytes.Buffer
430 result := ""
431 if u.Scheme != "" {
432 result += u.Scheme + ":"
433 }
434 if u.Opaque != "" {
435 result += u.Opaque
436 } else {
437 if u.Host != "" || u.User != nil {
438 result += "//"
439 if u := u.User; u != nil {
440 result += u.String() + "@"
441 }
442 result += u.Host
443 }
444 result += escape(u.Path, encodePath)
445 }
446 if u.RawQuery != "" {
447 result += "?" + u.RawQuery
448 }
449 if u.Fragment != "" {
450 result += "#" + escape(u.Fragment, encodeFragment)
451 }
452 return result
453 }
454
// Values maps a string key to a list of values.
// It is typically used for query parameters and form values.
// Unlike in the http.Header map, the keys in a Values map
// are case-sensitive.
type Values map[string][]string

// Get returns the first value stored under key, or the empty string when
// the key is absent or has no values. It is safe to call on a nil Values.
// To access multiple values, use the map directly.
func (v Values) Get(key string) string {
	// Indexing a nil map yields the zero value, so no nil check is needed.
	if vs := v[key]; len(vs) > 0 {
		return vs[0]
	}
	return ""
}

// Set stores value as the only value for key, replacing any existing
// values.
func (v Values) Set(key, value string) {
	only := []string{value}
	v[key] = only
}

// Add appends value to the values already stored under key.
func (v Values) Add(key, value string) {
	existing := v[key]
	v[key] = append(existing, value)
}

// Del removes key and all of its values.
func (v Values) Del(key string) {
	delete(v, key)
}
492
493 // ParseQuery parses the URL-encoded query string and returns
494 // a map listing the values specified for each key.
495 // ParseQuery always returns a non-nil map containing all the
496 // valid query parameters found; err describes the first decoding error
497 // encountered, if any.
498 func ParseQuery(query string) (m Values, err error) {
499 m = make(Values)
500 err = parseQuery(m, query)
501 return
502 }
503
504 func parseQuery(m Values, query string) (err error) {
505 for query != "" {
506 key := query
507 if i := strings.IndexAny(key, "&;"); i >= 0 {
508 key, query = key[:i], key[i+1:]
509 } else {
510 query = ""
511 }
512 if key == "" {
513 continue
514 }
515 value := ""
516 if i := strings.Index(key, "="); i >= 0 {
517 key, value = key[:i], key[i+1:]
518 }
519 key, err1 := QueryUnescape(key)
520 if err1 != nil {
521 err = err1
522 continue
523 }
524 value, err1 = QueryUnescape(value)
525 if err1 != nil {
526 err = err1
527 continue
528 }
529 m[key] = append(m[key], value)
530 }
531 return err
532 }
533
534 // Encode encodes the values into ``URL encoded'' form.
535 // e.g. "foo=bar&bar=baz"
536 func (v Values) Encode() string {
537 if v == nil {
538 return ""
539 }
540 parts := make([]string, 0, len(v)) // will be large enough for most uses
541 for k, vs := range v {
542 prefix := QueryEscape(k) + "="
543 for _, v := range vs {
544 parts = append(parts, prefix+QueryEscape(v))
545 }
546 }
547 return strings.Join(parts, "&")
548 }
549
// resolvePath resolves the relative path refpath against basepath and
// returns the merged path, interpreting the special segments "." and ".."
// as described by RFC 2396.
//
// Both paths are split on "/". The final element of base acts as the
// "file" slot: the first reference segment replaces it, and an empty
// final element stands for a trailing slash. ".." never removes the
// leading element, so the result cannot climb above the root of basepath.
func resolvePath(basepath string, refpath string) string {
	// strings.Split always returns at least one element here, so base is
	// never empty and base[len(base)-1] is always valid.
	base := strings.Split(basepath, "/")
	refs := strings.Split(refpath, "/")
	for idx, ref := range refs {
		last := len(base) - 1
		// After the first reference segment, a non-empty final element is
		// a directory the reference itself just stepped into (the "g" in
		// "g/./h"). "." and ".." must be applied relative to it, not to
		// the element in front of it.
		entered := idx > 0 && base[last] != ""
		switch {
		case ref == ".":
			if entered {
				// Stay in the directory; mark it with a trailing slash.
				base = append(base, "")
			} else {
				base[last] = ""
			}
		case ref == "..":
			if entered {
				// Step back out of the directory just entered.
				base[last] = ""
				break
			}
			newLen := len(base) - 1
			if newLen < 1 {
				newLen = 1
			}
			base = base[0:newLen]
			base[len(base)-1] = ""
		default:
			if idx == 0 || base[last] == "" {
				base[last] = ref
			} else {
				base = append(base, ref)
			}
		}
	}
	return strings.Join(base, "/")
}
579
580 // IsAbs returns true if the URL is absolute.
581 func (u *URL) IsAbs() bool {
582 return u.Scheme != ""
583 }
584
585 // Parse parses a URL in the context of the receiver. The provided URL
586 // may be relative or absolute. Parse returns nil, err on parse
587 // failure, otherwise its return value is the same as ResolveReference.
588 func (u *URL) Parse(ref string) (*URL, error) {
589 refurl, err := Parse(ref)
590 if err != nil {
591 return nil, err
592 }
593 return u.ResolveReference(refurl), nil
594 }
595
596 // ResolveReference resolves a URI reference to an absolute URI from
597 // an absolute base URI, per RFC 2396 Section 5.2. The URI reference
598 // may be relative or absolute. ResolveReference always returns a new
599 // URL instance, even if the returned URL is identical to either the
600 // base or reference. If ref is an absolute URL, then ResolveReference
601 // ignores base and returns a copy of ref.
602 func (u *URL) ResolveReference(ref *URL) *URL {
603 if ref.IsAbs() {
604 url := *ref
605 return &url
606 }
607 // relativeURI = ( net_path | abs_path | rel_path ) [ "?" query ]
608 url := *u
609 url.RawQuery = ref.RawQuery
610 url.Fragment = ref.Fragment
611 if ref.Opaque != "" {
612 url.Opaque = ref.Opaque
613 url.User = nil
614 url.Host = ""
615 url.Path = ""
616 return &url
617 }
618 if ref.Host != "" || ref.User != nil {
619 // The "net_path" case.
620 url.Host = ref.Host
621 url.User = ref.User
622 }
623 if strings.HasPrefix(ref.Path, "/") {
624 // The "abs_path" case.
625 url.Path = ref.Path
626 } else {
627 // The "rel_path" case.
628 path := resolvePath(u.Path, ref.Path)
629 if !strings.HasPrefix(path, "/") {
630 path = "/" + path
631 }
632 url.Path = path
633 }
634 return &url
635 }
636
637 // Query parses RawQuery and returns the corresponding values.
638 func (u *URL) Query() Values {
639 v, _ := ParseQuery(u.RawQuery)
640 return v
641 }
642
643 // RequestURI returns the encoded path?query or opaque?query
644 // string that would be used in an HTTP request for u.
645 func (u *URL) RequestURI() string {
646 result := u.Opaque
647 if result == "" {
648 result = escape(u.Path, encodePath)
649 if result == "" {
650 result = "/"
651 }
652 }
653 if u.RawQuery != "" {
654 result += "?" + u.RawQuery
655 }
656 return result
657 }