Source file src/pkg/net/mail/message.go
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 /*
6 Package mail implements parsing of mail messages.
7
8 For the most part, this package follows the syntax as specified by RFC 5322.
9 Notable divergences:
10 * Obsolete address formats are not parsed, including addresses with
11 embedded route information.
12 * Group addresses are not parsed.
13 * The full range of spacing (the CFWS syntax element) is not supported,
14 such as breaking addresses across lines.
15 */
16 package mail
17
18 import (
19 "bufio"
20 "bytes"
21 "encoding/base64"
22 "errors"
23 "fmt"
24 "io"
25 "io/ioutil"
26 "log"
27 "net/textproto"
28 "strconv"
29 "strings"
30 "time"
31 )
32
// debug switches the parser's trace output on or off. It is off by default.
var debug = debugT(false)

// debugT is a boolean flag whose Printf method only logs when the flag is set.
type debugT bool

// Printf passes format and args to log.Printf when d is true and does
// nothing otherwise.
func (d debugT) Printf(format string, args ...interface{}) {
	if !d {
		return
	}
	log.Printf(format, args...)
}
42
43 // A Message represents a parsed mail message.
44 type Message struct {
45 Header Header
46 Body io.Reader
47 }
48
49 // ReadMessage reads a message from r.
50 // The headers are parsed, and the body of the message will be reading from r.
51 func ReadMessage(r io.Reader) (msg *Message, err error) {
52 tp := textproto.NewReader(bufio.NewReader(r))
53
54 hdr, err := tp.ReadMIMEHeader()
55 if err != nil {
56 return nil, err
57 }
58
59 return &Message{
60 Header: Header(hdr),
61 Body: tp.R,
62 }, nil
63 }
64
// dateLayouts holds the layouts handed to time.Parse when parsing a date.
// They are tried in order.
var dateLayouts []string

// init fills dateLayouts with every combination of the optional and
// alternative date-time parts described by RFC 5322, section 3.3.
func init() {
	var (
		weekdays = []string{"", "Mon, "}    // day-of-week is optional
		dayForms = []string{"2", "02"}      // day = 1*2DIGIT
		yearForm = []string{"2006", "06"}   // year = 4*DIGIT / 2*DIGIT
		secForms = []string{":05", ""}      // second is optional
		zoneForm = []string{"-0700", "MST"} // zone = (("+" / "-") 4DIGIT) / "GMT" / ...
	)

	for _, wd := range weekdays {
		for _, d := range dayForms {
			datePart := wd + d + " Jan "
			for _, y := range yearForm {
				upToMinute := datePart + y + " 15:04"
				for _, sec := range secForms {
					for _, z := range zoneForm {
						dateLayouts = append(dateLayouts, upToMinute+sec+" "+z)
					}
				}
			}
		}
	}
}
91
92 func parseDate(date string) (time.Time, error) {
93 for _, layout := range dateLayouts {
94 t, err := time.Parse(layout, date)
95 if err == nil {
96 return t, nil
97 }
98 }
99 return time.Time{}, errors.New("mail: header could not be parsed")
100 }
101
// A Header represents the key-value pairs in a mail message header.
type Header map[string][]string

// Get gets the first value associated with the given key.
// If there are no values associated with the key, Get returns "".
// The lookup is delegated to textproto.MIMEHeader.Get.
func (h Header) Get(key string) string {
	mimeHdr := textproto.MIMEHeader(h)
	return mimeHdr.Get(key)
}
110
111 var ErrHeaderNotPresent = errors.New("mail: header not in message")
112
113 // Date parses the Date header field.
114 func (h Header) Date() (time.Time, error) {
115 hdr := h.Get("Date")
116 if hdr == "" {
117 return time.Time{}, ErrHeaderNotPresent
118 }
119 return parseDate(hdr)
120 }
121
122 // AddressList parses the named header field as a list of addresses.
123 func (h Header) AddressList(key string) ([]*Address, error) {
124 hdr := h.Get(key)
125 if hdr == "" {
126 return nil, ErrHeaderNotPresent
127 }
128 return newAddrParser(hdr).parseAddressList()
129 }
130
131 // Address represents a single mail address.
132 // An address such as "Barry Gibbs <[email protected]>" is represented
133 // as Address{Name: "Barry Gibbs", Address: "[email protected]"}.
134 type Address struct {
135 Name string // Proper name; may be empty.
136 Address string // user@domain
137 }
138
139 // String formats the address as a valid RFC 5322 address.
140 // If the address's name contains non-ASCII characters
141 // the name will be rendered according to RFC 2047.
142 func (a *Address) String() string {
143 s := "<" + a.Address + ">"
144 if a.Name == "" {
145 return s
146 }
147 // If every character is printable ASCII, quoting is simple.
148 allPrintable := true
149 for i := 0; i < len(a.Name); i++ {
150 if !isVchar(a.Name[i]) {
151 allPrintable = false
152 break
153 }
154 }
155 if allPrintable {
156 b := bytes.NewBufferString(`"`)
157 for i := 0; i < len(a.Name); i++ {
158 if !isQtext(a.Name[i]) {
159 b.WriteByte('\\')
160 }
161 b.WriteByte(a.Name[i])
162 }
163 b.WriteString(`" `)
164 b.WriteString(s)
165 return b.String()
166 }
167
168 // UTF-8 "Q" encoding
169 b := bytes.NewBufferString("=?utf-8?q?")
170 for i := 0; i < len(a.Name); i++ {
171 switch c := a.Name[i]; {
172 case c == ' ':
173 b.WriteByte('_')
174 case isVchar(c) && c != '=' && c != '?' && c != '_':
175 b.WriteByte(c)
176 default:
177 fmt.Fprintf(b, "=%02X", c)
178 }
179 }
180 b.WriteString("?= ")
181 b.WriteString(s)
182 return b.String()
183 }
184
// addrParser is the not-yet-consumed remainder of an address header value.
// Its methods shrink it from the front as input is consumed.
type addrParser []byte

// newAddrParser returns a parser positioned at the start of s.
func newAddrParser(s string) *addrParser {
	parser := addrParser([]byte(s))
	return &parser
}
191
192 func (p *addrParser) parseAddressList() ([]*Address, error) {
193 var list []*Address
194 for {
195 p.skipSpace()
196 addr, err := p.parseAddress()
197 if err != nil {
198 return nil, err
199 }
200 list = append(list, addr)
201
202 p.skipSpace()
203 if p.empty() {
204 break
205 }
206 if !p.consume(',') {
207 return nil, errors.New("mail: expected comma")
208 }
209 }
210 return list, nil
211 }
212
213 // parseAddress parses a single RFC 5322 address at the start of p.
214 func (p *addrParser) parseAddress() (addr *Address, err error) {
215 debug.Printf("parseAddress: %q", *p)
216 p.skipSpace()
217 if p.empty() {
218 return nil, errors.New("mail: no address")
219 }
220
221 // address = name-addr / addr-spec
222 // TODO(dsymonds): Support parsing group address.
223
224 // addr-spec has a more restricted grammar than name-addr,
225 // so try parsing it first, and fallback to name-addr.
226 // TODO(dsymonds): Is this really correct?
227 spec, err := p.consumeAddrSpec()
228 if err == nil {
229 return &Address{
230 Address: spec,
231 }, err
232 }
233 debug.Printf("parseAddress: not an addr-spec: %v", err)
234 debug.Printf("parseAddress: state is now %q", *p)
235
236 // display-name
237 var displayName string
238 if p.peek() != '<' {
239 displayName, err = p.consumePhrase()
240 if err != nil {
241 return nil, err
242 }
243 }
244 debug.Printf("parseAddress: displayName=%q", displayName)
245
246 // angle-addr = "<" addr-spec ">"
247 p.skipSpace()
248 if !p.consume('<') {
249 return nil, errors.New("mail: no angle-addr")
250 }
251 spec, err = p.consumeAddrSpec()
252 if err != nil {
253 return nil, err
254 }
255 if !p.consume('>') {
256 return nil, errors.New("mail: unclosed angle-addr")
257 }
258 debug.Printf("parseAddress: spec=%q", spec)
259
260 return &Address{
261 Name: displayName,
262 Address: spec,
263 }, nil
264 }
265
266 // consumeAddrSpec parses a single RFC 5322 addr-spec at the start of p.
267 func (p *addrParser) consumeAddrSpec() (spec string, err error) {
268 debug.Printf("consumeAddrSpec: %q", *p)
269
270 orig := *p
271 defer func() {
272 if err != nil {
273 *p = orig
274 }
275 }()
276
277 // local-part = dot-atom / quoted-string
278 var localPart string
279 p.skipSpace()
280 if p.empty() {
281 return "", errors.New("mail: no addr-spec")
282 }
283 if p.peek() == '"' {
284 // quoted-string
285 debug.Printf("consumeAddrSpec: parsing quoted-string")
286 localPart, err = p.consumeQuotedString()
287 } else {
288 // dot-atom
289 debug.Printf("consumeAddrSpec: parsing dot-atom")
290 localPart, err = p.consumeAtom(true)
291 }
292 if err != nil {
293 debug.Printf("consumeAddrSpec: failed: %v", err)
294 return "", err
295 }
296
297 if !p.consume('@') {
298 return "", errors.New("mail: missing @ in addr-spec")
299 }
300
301 // domain = dot-atom / domain-literal
302 var domain string
303 p.skipSpace()
304 if p.empty() {
305 return "", errors.New("mail: no domain in addr-spec")
306 }
307 // TODO(dsymonds): Handle domain-literal
308 domain, err = p.consumeAtom(true)
309 if err != nil {
310 return "", err
311 }
312
313 return localPart + "@" + domain, nil
314 }
315
316 // consumePhrase parses the RFC 5322 phrase at the start of p.
317 func (p *addrParser) consumePhrase() (phrase string, err error) {
318 debug.Printf("consumePhrase: [%s]", *p)
319 // phrase = 1*word
320 var words []string
321 for {
322 // word = atom / quoted-string
323 var word string
324 p.skipSpace()
325 if p.empty() {
326 return "", errors.New("mail: missing phrase")
327 }
328 if p.peek() == '"' {
329 // quoted-string
330 word, err = p.consumeQuotedString()
331 } else {
332 // atom
333 word, err = p.consumeAtom(false)
334 }
335
336 // RFC 2047 encoded-word starts with =?, ends with ?=, and has two other ?s.
337 if err == nil && strings.HasPrefix(word, "=?") && strings.HasSuffix(word, "?=") && strings.Count(word, "?") == 4 {
338 word, err = decodeRFC2047Word(word)
339 }
340
341 if err != nil {
342 break
343 }
344 debug.Printf("consumePhrase: consumed %q", word)
345 words = append(words, word)
346 }
347 // Ignore any error if we got at least one word.
348 if err != nil && len(words) == 0 {
349 debug.Printf("consumePhrase: hit err: %v", err)
350 return "", errors.New("mail: missing word in phrase")
351 }
352 phrase = strings.Join(words, " ")
353 return phrase, nil
354 }
355
356 // consumeQuotedString parses the quoted string at the start of p.
357 func (p *addrParser) consumeQuotedString() (qs string, err error) {
358 // Assume first byte is '"'.
359 i := 1
360 qsb := make([]byte, 0, 10)
361 Loop:
362 for {
363 if i >= p.len() {
364 return "", errors.New("mail: unclosed quoted-string")
365 }
366 switch c := (*p)[i]; {
367 case c == '"':
368 break Loop
369 case c == '\\':
370 if i+1 == p.len() {
371 return "", errors.New("mail: unclosed quoted-string")
372 }
373 qsb = append(qsb, (*p)[i+1])
374 i += 2
375 case isQtext(c), c == ' ' || c == '\t':
376 // qtext (printable US-ASCII excluding " and \), or
377 // FWS (almost; we're ignoring CRLF)
378 qsb = append(qsb, c)
379 i++
380 default:
381 return "", fmt.Errorf("mail: bad character in quoted-string: %q", c)
382 }
383 }
384 *p = (*p)[i+1:]
385 return string(qsb), nil
386 }
387
388 // consumeAtom parses an RFC 5322 atom at the start of p.
389 // If dot is true, consumeAtom parses an RFC 5322 dot-atom instead.
390 func (p *addrParser) consumeAtom(dot bool) (atom string, err error) {
391 if !isAtext(p.peek(), false) {
392 return "", errors.New("mail: invalid string")
393 }
394 i := 1
395 for ; i < p.len() && isAtext((*p)[i], dot); i++ {
396 }
397 atom, *p = string((*p)[:i]), (*p)[i:]
398 return atom, nil
399 }
400
401 func (p *addrParser) consume(c byte) bool {
402 if p.empty() || p.peek() != c {
403 return false
404 }
405 *p = (*p)[1:]
406 return true
407 }
408
409 // skipSpace skips the leading space and tab characters.
410 func (p *addrParser) skipSpace() {
411 *p = bytes.TrimLeft(*p, " \t")
412 }
413
414 func (p *addrParser) peek() byte {
415 return (*p)[0]
416 }
417
418 func (p *addrParser) empty() bool {
419 return p.len() == 0
420 }
421
422 func (p *addrParser) len() int {
423 return len(*p)
424 }
425
426 func decodeRFC2047Word(s string) (string, error) {
427 fields := strings.Split(s, "?")
428 if len(fields) != 5 || fields[0] != "=" || fields[4] != "=" {
429 return "", errors.New("mail: address not RFC 2047 encoded")
430 }
431 charset, enc := strings.ToLower(fields[1]), strings.ToLower(fields[2])
432 if charset != "iso-8859-1" && charset != "utf-8" {
433 return "", fmt.Errorf("mail: charset not supported: %q", charset)
434 }
435
436 in := bytes.NewBufferString(fields[3])
437 var r io.Reader
438 switch enc {
439 case "b":
440 r = base64.NewDecoder(base64.StdEncoding, in)
441 case "q":
442 r = qDecoder{r: in}
443 default:
444 return "", fmt.Errorf("mail: RFC 2047 encoding not supported: %q", enc)
445 }
446
447 dec, err := ioutil.ReadAll(r)
448 if err != nil {
449 return "", err
450 }
451
452 switch charset {
453 case "iso-8859-1":
454 b := new(bytes.Buffer)
455 for _, c := range dec {
456 b.WriteRune(rune(c))
457 }
458 return b.String(), nil
459 case "utf-8":
460 return string(dec), nil
461 }
462 panic("unreachable")
463 }
464
// qDecoder is an io.Reader that decodes RFC 2047 "Q"-encoded text read from
// r: '_' becomes a space, "=XX" becomes the byte with hexadecimal value XX,
// and every other byte is passed through unchanged.
type qDecoder struct {
	r       io.Reader
	scratch [2]byte
}

// Read decodes a single byte from the underlying reader into p[0].
// It returns (0, nil) when p has no room, and otherwise either (1, nil) or
// (0, err). A "=" escape that is cut short by the end of the input is
// reported as io.ErrUnexpectedEOF, and an escape whose two characters are
// not hexadecimal digits is reported as an invalid-encoding error.
func (qd qDecoder) Read(p []byte) (n int, err error) {
	// This method writes at most one byte into p.
	if len(p) == 0 {
		return 0, nil
	}
	// ReadFull keeps asking until one byte has arrived, so a reader that
	// legitimately returns (0, nil) cannot cause a stale byte to be decoded.
	if _, err := io.ReadFull(qd.r, qd.scratch[:1]); err != nil {
		return 0, err
	}
	switch c := qd.scratch[0]; {
	case c == '=':
		if _, err := io.ReadFull(qd.r, qd.scratch[:2]); err != nil {
			// Running out of input in the middle of an escape is a
			// malformed word, not a clean end of stream. A plain io.EOF
			// would make callers such as ioutil.ReadAll silently drop it.
			if err == io.EOF {
				err = io.ErrUnexpectedEOF
			}
			return 0, err
		}
		// ParseUint rejects sign characters, so only two hexadecimal digits
		// are accepted; bit size 8 matches the single byte being produced.
		x, err := strconv.ParseUint(string(qd.scratch[:2]), 16, 8)
		if err != nil {
			return 0, fmt.Errorf("mail: invalid RFC 2047 encoding: %q", qd.scratch[:2])
		}
		p[0] = byte(x)
	case c == '_':
		p[0] = ' '
	default:
		p[0] = c
	}
	return 1, nil
}
495
// atextChars lists every character that isAtext accepts, apart from the
// optional period.
var atextChars = []byte("ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
	"abcdefghijklmnopqrstuvwxyz" +
	"0123456789" +
	"!#$%&'*+-/=?^_`{|}~")

// isAtext returns true if c is an RFC 5322 atext character.
// If dot is true, period is included.
func isAtext(c byte, dot bool) bool {
	if c == '.' {
		// A period is never in atextChars; it is accepted only on request.
		return dot
	}
	return bytes.IndexByte(atextChars, c) != -1
}
509
// isQtext returns true if c is an RFC 5322 qtext character, that is,
// printable US-ASCII ('!' through '~') other than backslash and double quote.
func isQtext(c byte) bool {
	switch c {
	case '\\', '"':
		return false
	}
	return c >= '!' && c <= '~'
}
518
// isVchar returns true if c is an RFC 5322 VCHAR character, that is, a
// visible (printing) US-ASCII character in the range '!' through '~'.
func isVchar(c byte) bool {
	return c > ' ' && c < 0x7f
}
523 }