src/pkg/encoding/csv/reader.go - The Go Programming Language

Golang

Source file src/pkg/encoding/csv/reader.go

     1	// Copyright 2011 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package csv reads and writes comma-separated values (CSV) files.
     6	//
     7	// A csv file contains zero or more records of one or more fields per record.
     8	// Each record is separated by the newline character. The final record may
     9	// optionally be followed by a newline character.
    10	//
    11	//	field1,field2,field3
    12	//
    13	// White space is considered part of a field.
    14	//
    15	// Carriage returns before newline characters are silently removed.
    16	//
    17	// Blank lines are ignored.  A line with only whitespace characters (excluding
    18	// the ending newline character) is not considered a blank line.
    19	//
    20	// Fields which start and stop with the quote character " are called
    21	// quoted-fields.  The beginning and ending quote are not part of the
    22	// field.
    23	//
    24	// The source:
    25	//
    26	//	normal string,"quoted-field"
    27	//
    28	// results in the fields
    29	//
    30	//	{`normal string`, `quoted-field`}
    31	//
    32	// Within a quoted-field a quote character followed by a second quote
    33	// character is considered a single quote.
    34	//
    35	//	"the ""word"" is true","a ""quoted-field"""
    36	//
    37	// results in
    38	//
    39	//	{`the "word" is true`, `a "quoted-field"`}
    40	//
    41	// Newlines and commas may be included in a quoted-field
    42	//
    43	//	"Multi-line
    44	//	field","comma is ,"
    45	//
    46	// results in
    47	//
    48	//	{`Multi-line
    49	//	field`, `comma is ,`}
    50	package csv
    51	
    52	import (
    53		"bufio"
    54		"bytes"
    55		"errors"
    56		"fmt"
    57		"io"
    58		"unicode"
    59	)
    60	
    61	// A ParseError is returned for parsing errors.
    62	// The first line is 1.  The first column is 0.
    63	type ParseError struct {
    64		Line   int   // Line where the error occurred
    65		Column int   // Column (rune index) where the error occurred
    66		Err    error // The actual error
    67	}
    68	
    69	func (e *ParseError) Error() string {
    70		return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err)
    71	}
    72	
    73	// These are the errors that can be returned in ParseError.Error
    74	var (
    75		ErrTrailingComma = errors.New("extra delimiter at end of line")
    76		ErrBareQuote     = errors.New("bare \" in non-quoted-field")
    77		ErrQuote         = errors.New("extraneous \" in field")
    78		ErrFieldCount    = errors.New("wrong number of fields in line")
    79	)
    80	
    81	// A Reader reads records from a CSV-encoded file.
    82	//
    83	// As returned by NewReader, a Reader expects input conforming to RFC 4180.
    84	// The exported fields can be changed to customize the details before the
    85	// first call to Read or ReadAll.
    86	//
    87	// Comma is the field delimiter.  It defaults to ','.
    88	//
    89	// Comment, if not 0, is the comment character. Lines beginning with the
    90	// Comment character are ignored.
    91	//
    92	// If FieldsPerRecord is positive, Read requires each record to
    93	// have the given number of fields.  If FieldsPerRecord is 0, Read sets it to
    94	// the number of fields in the first record, so that future records must
    95	// have the same field count.  If FieldsPerRecord is negative, no check is
    96	// made and records may have a variable number of fields.
    97	//
    98	// If LazyQuotes is true, a quote may appear in an unquoted field and a
    99	// non-doubled quote may appear in a quoted field.
   100	//
   101	// If TrailingComma is true, the last field may be an unquoted empty field.
   102	//
   103	// If TrimLeadingSpace is true, leading white space in a field is ignored.
   104	type Reader struct {
   105		Comma            rune // Field delimiter (set to ',' by NewReader)
   106		Comment          rune // Comment character for start of line
   107		FieldsPerRecord  int  // Number of expected fields per record
   108		LazyQuotes       bool // Allow lazy quotes
   109		TrailingComma    bool // Allow trailing comma
   110		TrimLeadingSpace bool // Trim leading space
   111		line             int
   112		column           int
   113		r                *bufio.Reader
   114		field            bytes.Buffer
   115	}
   116	
   117	// NewReader returns a new Reader that reads from r.
   118	func NewReader(r io.Reader) *Reader {
   119		return &Reader{
   120			Comma: ',',
   121			r:     bufio.NewReader(r),
   122		}
   123	}
   124	
   125	// error creates a new ParseError based on err.
   126	func (r *Reader) error(err error) error {
   127		return &ParseError{
   128			Line:   r.line,
   129			Column: r.column,
   130			Err:    err,
   131		}
   132	}
   133	
   134	// Read reads one record from r.  The record is a slice of strings with each
   135	// string representing one field.
   136	func (r *Reader) Read() (record []string, err error) {
   137		for {
   138			record, err = r.parseRecord()
   139			if record != nil {
   140				break
   141			}
   142			if err != nil {
   143				return nil, err
   144			}
   145		}
   146	
   147		if r.FieldsPerRecord > 0 {
   148			if len(record) != r.FieldsPerRecord {
   149				r.column = 0 // report at start of record
   150				return record, r.error(ErrFieldCount)
   151			}
   152		} else if r.FieldsPerRecord == 0 {
   153			r.FieldsPerRecord = len(record)
   154		}
   155		return record, nil
   156	}
   157	
   158	// ReadAll reads all the remaining records from r.
   159	// Each record is a slice of fields.
   160	// A successful call returns err == nil, not err == EOF. Because ReadAll is
   161	// defined to read until EOF, it does not treat end of file as an error to be
   162	// reported.
   163	func (r *Reader) ReadAll() (records [][]string, err error) {
   164		for {
   165			record, err := r.Read()
   166			if err == io.EOF {
   167				return records, nil
   168			}
   169			if err != nil {
   170				return nil, err
   171			}
   172			records = append(records, record)
   173		}
   174		panic("unreachable")
   175	}
   176	
   177	// readRune reads one rune from r, folding \r\n to \n and keeping track
   178	// of how far into the line we have read.  r.column will point to the start
   179	// of this rune, not the end of this rune.
   180	func (r *Reader) readRune() (rune, error) {
   181		r1, _, err := r.r.ReadRune()
   182	
   183		// Handle \r\n here.  We make the simplifying assumption that
   184		// anytime \r is followed by \n that it can be folded to \n.
   185		// We will not detect files which contain both \r\n and bare \n.
   186		if r1 == '\r' {
   187			r1, _, err = r.r.ReadRune()
   188			if err == nil {
   189				if r1 != '\n' {
   190					r.r.UnreadRune()
   191					r1 = '\r'
   192				}
   193			}
   194		}
   195		r.column++
   196		return r1, err
   197	}
   198	
   199	// unreadRune puts the last rune read from r back.
   200	func (r *Reader) unreadRune() {
   201		r.r.UnreadRune()
   202		r.column--
   203	}
   204	
   205	// skip reads runes up to and including the rune delim or until error.
   206	func (r *Reader) skip(delim rune) error {
   207		for {
   208			r1, err := r.readRune()
   209			if err != nil {
   210				return err
   211			}
   212			if r1 == delim {
   213				return nil
   214			}
   215		}
   216		panic("unreachable")
   217	}
   218	
   219	// parseRecord reads and parses a single csv record from r.
   220	func (r *Reader) parseRecord() (fields []string, err error) {
   221		// Each record starts on a new line.  We increment our line
   222		// number (lines start at 1, not 0) and set column to -1
   223		// so as we increment in readRune it points to the character we read.
   224		r.line++
   225		r.column = -1
   226	
   227		// Peek at the first rune.  If it is an error we are done.
   228		// If we are support comments and it is the comment character
   229		// then skip to the end of line.
   230	
   231		r1, _, err := r.r.ReadRune()
   232		if err != nil {
   233			return nil, err
   234		}
   235	
   236		if r.Comment != 0 && r1 == r.Comment {
   237			return nil, r.skip('\n')
   238		}
   239		r.r.UnreadRune()
   240	
   241		// At this point we have at least one field.
   242		for {
   243			haveField, delim, err := r.parseField()
   244			if haveField {
   245				fields = append(fields, r.field.String())
   246			}
   247			if delim == '\n' || err == io.EOF {
   248				return fields, err
   249			} else if err != nil {
   250				return nil, err
   251			}
   252		}
   253		panic("unreachable")
   254	}
   255	
   256	// parseField parses the next field in the record.  The read field is
   257	// located in r.field.  Delim is the first character not part of the field
   258	// (r.Comma or '\n').
   259	func (r *Reader) parseField() (haveField bool, delim rune, err error) {
   260		r.field.Reset()
   261	
   262		r1, err := r.readRune()
   263		if err != nil {
   264			// If we have EOF and are not at the start of a line
   265			// then we return the empty field.  We have already
   266			// checked for trailing commas if needed.
   267			if err == io.EOF && r.column != 0 {
   268				return true, 0, err
   269			}
   270			return false, 0, err
   271		}
   272	
   273		if r.TrimLeadingSpace {
   274			for r1 != '\n' && unicode.IsSpace(r1) {
   275				r1, err = r.readRune()
   276				if err != nil {
   277					return false, 0, err
   278				}
   279			}
   280		}
   281	
   282		switch r1 {
   283		case r.Comma:
   284			// will check below
   285	
   286		case '\n':
   287			// We are a trailing empty field or a blank line
   288			if r.column == 0 {
   289				return false, r1, nil
   290			}
   291			return true, r1, nil
   292	
   293		case '"':
   294			// quoted field
   295		Quoted:
   296			for {
   297				r1, err = r.readRune()
   298				if err != nil {
   299					if err == io.EOF {
   300						if r.LazyQuotes {
   301							return true, 0, err
   302						}
   303						return false, 0, r.error(ErrQuote)
   304					}
   305					return false, 0, err
   306				}
   307				switch r1 {
   308				case '"':
   309					r1, err = r.readRune()
   310					if err != nil || r1 == r.Comma {
   311						break Quoted
   312					}
   313					if r1 == '\n' {
   314						return true, r1, nil
   315					}
   316					if r1 != '"' {
   317						if !r.LazyQuotes {
   318							r.column--
   319							return false, 0, r.error(ErrQuote)
   320						}
   321						// accept the bare quote
   322						r.field.WriteRune('"')
   323					}
   324				case '\n':
   325					r.line++
   326					r.column = -1
   327				}
   328				r.field.WriteRune(r1)
   329			}
   330	
   331		default:
   332			// unquoted field
   333			for {
   334				r.field.WriteRune(r1)
   335				r1, err = r.readRune()
   336				if err != nil || r1 == r.Comma {
   337					break
   338				}
   339				if r1 == '\n' {
   340					return true, r1, nil
   341				}
   342				if !r.LazyQuotes && r1 == '"' {
   343					return false, 0, r.error(ErrBareQuote)
   344				}
   345			}
   346		}
   347	
   348		if err != nil {
   349			if err == io.EOF {
   350				return true, 0, err
   351			}
   352			return false, 0, err
   353		}
   354	
   355		if !r.TrailingComma {
   356			// We don't allow trailing commas.  See if we
   357			// are at the end of the line (being mindful
   358			// of trimming spaces).
   359			c := r.column
   360			r1, err = r.readRune()
   361			if r.TrimLeadingSpace {
   362				for r1 != '\n' && unicode.IsSpace(r1) {
   363					r1, err = r.readRune()
   364					if err != nil {
   365						break
   366					}
   367				}
   368			}
   369			if err == io.EOF || r1 == '\n' {
   370				r.column = c // report the comma
   371				return false, 0, r.error(ErrTrailingComma)
   372			}
   373			r.unreadRune()
   374		}
   375		return true, r1, nil
   376	}