Source file src/pkg/encoding/csv/reader.go
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Package csv reads and writes comma-separated values (CSV) files.
6 //
7 // A csv file contains zero or more records of one or more fields per record.
8 // Each record is separated by the newline character. The final record may
9 // optionally be followed by a newline character.
10 //
11 // field1,field2,field3
12 //
13 // White space is considered part of a field.
14 //
15 // Carriage returns before newline characters are silently removed.
16 //
17 // Blank lines are ignored. A line with only whitespace characters (excluding
18 // the ending newline character) is not considered a blank line.
19 //
20 // Fields which start and stop with the quote character " are called
21 // quoted-fields. The beginning and ending quote are not part of the
22 // field.
23 //
24 // The source:
25 //
26 // normal string,"quoted-field"
27 //
28 // results in the fields
29 //
30 // {`normal string`, `quoted-field`}
31 //
32 // Within a quoted-field a quote character followed by a second quote
33 // character is considered a single quote.
34 //
35 // "the ""word"" is true","a ""quoted-field"""
36 //
37 // results in
38 //
39 // {`the "word" is true`, `a "quoted-field"`}
40 //
41 // Newlines and commas may be included in a quoted-field
42 //
43 // "Multi-line
44 // field","comma is ,"
45 //
46 // results in
47 //
48 // {`Multi-line
49 // field`, `comma is ,`}
50 package csv
51
52 import (
53 "bufio"
54 "bytes"
55 "errors"
56 "fmt"
57 "io"
58 "unicode"
59 )
60
// A ParseError is returned for parsing errors.
// Line numbers start at 1; columns are rune indexes and start at 0.
type ParseError struct {
	Line   int   // Line where the error occurred
	Column int   // Column (rune index) where the error occurred
	Err    error // The actual error
}

// Error implements the error interface. The message has the form
// "line L, column C: cause".
func (e *ParseError) Error() string {
	return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err)
}

// Unwrap returns the underlying cause stored in Err, so that callers can
// match it with errors.Is or errors.As instead of type-asserting the
// *ParseError and comparing the Err field by hand.
func (e *ParseError) Unwrap() error {
	return e.Err
}
72
// These are the errors that can be returned in ParseError.Err.
var (
	// ErrTrailingComma reports a delimiter that is followed only by the end
	// of the line (or end of input) while TrailingComma is false.
	ErrTrailingComma = errors.New("extra delimiter at end of line")

	// ErrBareQuote reports a quote character inside an unquoted field while
	// LazyQuotes is false.
	ErrBareQuote = errors.New("bare \" in non-quoted-field")

	// ErrQuote reports a quoted field that is not properly terminated or
	// that contains a non-doubled quote while LazyQuotes is false.
	ErrQuote = errors.New("extraneous \" in field")

	// ErrFieldCount reports a record whose number of fields differs from
	// FieldsPerRecord.
	ErrFieldCount = errors.New("wrong number of fields in line")
)
80
// A Reader reads records from a CSV-encoded file.
//
// As returned by NewReader, a Reader expects input conforming to RFC 4180.
// The exported fields customize parsing; change them only before the first
// call to Read or ReadAll.
type Reader struct {
	// Comma is the field delimiter. NewReader sets it to ','.
	Comma rune

	// Comment, if not 0, is the comment character. Lines beginning with
	// the Comment character are ignored.
	Comment rune

	// FieldsPerRecord is the number of expected fields per record.
	// If positive, Read requires each record to have exactly that many
	// fields. If 0, Read sets it to the number of fields in the first
	// record, so that later records must have the same field count.
	// If negative, no check is made and records may have a variable
	// number of fields.
	FieldsPerRecord int

	// LazyQuotes, if true, allows a quote to appear in an unquoted field
	// and a non-doubled quote to appear in a quoted field.
	LazyQuotes bool

	// TrailingComma, if true, allows the last field of a record to be an
	// unquoted empty field.
	TrailingComma bool

	// TrimLeadingSpace, if true, causes leading white space in a field to
	// be ignored.
	TrimLeadingSpace bool

	line   int           // line number of the record being parsed (first line is 1)
	column int           // rune index of the most recently read rune within the line
	r      *bufio.Reader // buffered source of the CSV data
	field  bytes.Buffer  // text of the field most recently parsed
}
116
117 // NewReader returns a new Reader that reads from r.
118 func NewReader(r io.Reader) *Reader {
119 return &Reader{
120 Comma: ',',
121 r: bufio.NewReader(r),
122 }
123 }
124
125 // error creates a new ParseError based on err.
126 func (r *Reader) error(err error) error {
127 return &ParseError{
128 Line: r.line,
129 Column: r.column,
130 Err: err,
131 }
132 }
133
134 // Read reads one record from r. The record is a slice of strings with each
135 // string representing one field.
136 func (r *Reader) Read() (record []string, err error) {
137 for {
138 record, err = r.parseRecord()
139 if record != nil {
140 break
141 }
142 if err != nil {
143 return nil, err
144 }
145 }
146
147 if r.FieldsPerRecord > 0 {
148 if len(record) != r.FieldsPerRecord {
149 r.column = 0 // report at start of record
150 return record, r.error(ErrFieldCount)
151 }
152 } else if r.FieldsPerRecord == 0 {
153 r.FieldsPerRecord = len(record)
154 }
155 return record, nil
156 }
157
158 // ReadAll reads all the remaining records from r.
159 // Each record is a slice of fields.
160 // A successful call returns err == nil, not err == EOF. Because ReadAll is
161 // defined to read until EOF, it does not treat end of file as an error to be
162 // reported.
163 func (r *Reader) ReadAll() (records [][]string, err error) {
164 for {
165 record, err := r.Read()
166 if err == io.EOF {
167 return records, nil
168 }
169 if err != nil {
170 return nil, err
171 }
172 records = append(records, record)
173 }
174 panic("unreachable")
175 }
176
177 // readRune reads one rune from r, folding \r\n to \n and keeping track
178 // of how far into the line we have read. r.column will point to the start
179 // of this rune, not the end of this rune.
180 func (r *Reader) readRune() (rune, error) {
181 r1, _, err := r.r.ReadRune()
182
183 // Handle \r\n here. We make the simplifying assumption that
184 // anytime \r is followed by \n that it can be folded to \n.
185 // We will not detect files which contain both \r\n and bare \n.
186 if r1 == '\r' {
187 r1, _, err = r.r.ReadRune()
188 if err == nil {
189 if r1 != '\n' {
190 r.r.UnreadRune()
191 r1 = '\r'
192 }
193 }
194 }
195 r.column++
196 return r1, err
197 }
198
199 // unreadRune puts the last rune read from r back.
200 func (r *Reader) unreadRune() {
201 r.r.UnreadRune()
202 r.column--
203 }
204
205 // skip reads runes up to and including the rune delim or until error.
206 func (r *Reader) skip(delim rune) error {
207 for {
208 r1, err := r.readRune()
209 if err != nil {
210 return err
211 }
212 if r1 == delim {
213 return nil
214 }
215 }
216 panic("unreachable")
217 }
218
219 // parseRecord reads and parses a single csv record from r.
220 func (r *Reader) parseRecord() (fields []string, err error) {
221 // Each record starts on a new line. We increment our line
222 // number (lines start at 1, not 0) and set column to -1
223 // so as we increment in readRune it points to the character we read.
224 r.line++
225 r.column = -1
226
227 // Peek at the first rune. If it is an error we are done.
228 // If we are support comments and it is the comment character
229 // then skip to the end of line.
230
231 r1, _, err := r.r.ReadRune()
232 if err != nil {
233 return nil, err
234 }
235
236 if r.Comment != 0 && r1 == r.Comment {
237 return nil, r.skip('\n')
238 }
239 r.r.UnreadRune()
240
241 // At this point we have at least one field.
242 for {
243 haveField, delim, err := r.parseField()
244 if haveField {
245 fields = append(fields, r.field.String())
246 }
247 if delim == '\n' || err == io.EOF {
248 return fields, err
249 } else if err != nil {
250 return nil, err
251 }
252 }
253 panic("unreachable")
254 }
255
256 // parseField parses the next field in the record. The read field is
257 // located in r.field. Delim is the first character not part of the field
258 // (r.Comma or '\n').
259 func (r *Reader) parseField() (haveField bool, delim rune, err error) {
260 r.field.Reset()
261
262 r1, err := r.readRune()
263 if err != nil {
264 // If we have EOF and are not at the start of a line
265 // then we return the empty field. We have already
266 // checked for trailing commas if needed.
267 if err == io.EOF && r.column != 0 {
268 return true, 0, err
269 }
270 return false, 0, err
271 }
272
273 if r.TrimLeadingSpace {
274 for r1 != '\n' && unicode.IsSpace(r1) {
275 r1, err = r.readRune()
276 if err != nil {
277 return false, 0, err
278 }
279 }
280 }
281
282 switch r1 {
283 case r.Comma:
284 // will check below
285
286 case '\n':
287 // We are a trailing empty field or a blank line
288 if r.column == 0 {
289 return false, r1, nil
290 }
291 return true, r1, nil
292
293 case '"':
294 // quoted field
295 Quoted:
296 for {
297 r1, err = r.readRune()
298 if err != nil {
299 if err == io.EOF {
300 if r.LazyQuotes {
301 return true, 0, err
302 }
303 return false, 0, r.error(ErrQuote)
304 }
305 return false, 0, err
306 }
307 switch r1 {
308 case '"':
309 r1, err = r.readRune()
310 if err != nil || r1 == r.Comma {
311 break Quoted
312 }
313 if r1 == '\n' {
314 return true, r1, nil
315 }
316 if r1 != '"' {
317 if !r.LazyQuotes {
318 r.column--
319 return false, 0, r.error(ErrQuote)
320 }
321 // accept the bare quote
322 r.field.WriteRune('"')
323 }
324 case '\n':
325 r.line++
326 r.column = -1
327 }
328 r.field.WriteRune(r1)
329 }
330
331 default:
332 // unquoted field
333 for {
334 r.field.WriteRune(r1)
335 r1, err = r.readRune()
336 if err != nil || r1 == r.Comma {
337 break
338 }
339 if r1 == '\n' {
340 return true, r1, nil
341 }
342 if !r.LazyQuotes && r1 == '"' {
343 return false, 0, r.error(ErrBareQuote)
344 }
345 }
346 }
347
348 if err != nil {
349 if err == io.EOF {
350 return true, 0, err
351 }
352 return false, 0, err
353 }
354
355 if !r.TrailingComma {
356 // We don't allow trailing commas. See if we
357 // are at the end of the line (being mindful
358 // of trimming spaces).
359 c := r.column
360 r1, err = r.readRune()
361 if r.TrimLeadingSpace {
362 for r1 != '\n' && unicode.IsSpace(r1) {
363 r1, err = r.readRune()
364 if err != nil {
365 break
366 }
367 }
368 }
369 if err == io.EOF || r1 == '\n' {
370 r.column = c // report the comma
371 return false, 0, r.error(ErrTrailingComma)
372 }
373 r.unreadRune()
374 }
375 return true, r1, nil
376 }