/lib/csvs.py - Diff - BIEN 3 - NCEAS Projects

« Previous | Next »

Revision 1623

Added by Aaron Marcuse-Kubitza almost 13 years ago

csvs.py: Added TsvReader to support TSV quirks. Added reader_class(). reader_and_header(): Use reader_class() to automatically use TsvReader instead of csv.reader for TSVs. Added is_tsv() and use it where `dialect.delimiter == '\t'` was used.

     import csv
     import StringIO
     import strings
     import util
     # The delimiter that marks a file as a TSV
     tsv_delim = '\t'
     # Candidate field delimiters offered to the csv Sniffer
     delims = ',' + tsv_delim + '`'
     # Same value under the older name; kept so existing references keep working
     delimiters = ',' + tsv_delim + '`'
     # Character that escapes a line ending inside a TSV value
     escape = '\\'
     # Two-character backslash-n sequence that stands in for an escaped line ending
     ending_placeholder = '\\n'
     def is_tsv(dialect):
         '''@param dialect csv dialect (anything with a delimiter attribute)
         @return whether the dialect's delimiter is the TSV delimiter'''
         return dialect.delimiter == tsv_delim
     def sniff(line):
         '''Automatically detects the dialect
         @param line Sample text (e.g. the header line) to detect the dialect from
         @return the dialect found by csv.Sniffer, with quoting adjusted:
             QUOTE_NONE for TSVs, doublequote turned on for everything else'''
         # Sniff once, restricted to the delimiters this module supports
         dialect = csv.Sniffer().sniff(line, delims)
         # TSVs usually don't quote fields (nor doublequote embedded quotes)
         if is_tsv(dialect): dialect.quoting = csv.QUOTE_NONE
         else: dialect.doublequote = True # Sniffer doesn't turn this on by default
         return dialect
-...
         info.dialect = sniff(info.header_line)
         return info
     class TsvReader:
         r'''Unlike csv.reader, for TSVs, interprets \ as escaping a line ending but
         ignores it before everything else (e.g. \N for NULL). Also interprets the
         '\n' escape sequence as a newline.
         
         Iterating yields one list of values per logical record. A line whose
         contents end in the escape character is continued on the following
         line(s). A blank line yields [], as csv.reader does.'''
         def __init__(self, stream, dialect):
             '''@param stream Object to read lines from; only readline() is called
             @param dialect csv dialect; must satisfy is_tsv()'''
             assert is_tsv(dialect)
             self.stream = stream
             self.dialect = dialect
         def __iter__(self): return self
         def next(self):
             '''@return list of the values in the next logical record
             @raise StopIteration when the stream has no more lines'''
             record = ''
             while True:
                 line = self.stream.readline()
                 if line == '': # EOF
                     if record == '': raise StopIteration
                     # The stream ended right after an escaped line ending.
                     # Return what was accumulated instead of dropping it.
                     break
                 # presumably splits off the trailing line ending; only the
                 # contents are used here
                 raw_contents, _ending = strings.extract_line_ending(line)
                 # presumably strips one trailing escape char, if there is one
                 contents = strings.remove_suffix(escape, raw_contents)
                 record += contents
                 if len(contents) == len(raw_contents): break # no line continuation
                 record += ending_placeholder
             # Parse the record as a one-line input. Passing a list rather than
             # a StringIO makes a blank record parse to [] instead of raising
             # StopIteration, which would end iteration at the first blank line.
             row = csv.reader([record], self.dialect).next()
             return [v.replace(ending_placeholder, '\n') for v in row]
     def reader_class(dialect):
         '''Picks the reader type to use for a dialect
         @return TsvReader for TSV dialects, csv.reader for everything else'''
         if not is_tsv(dialect): return csv.reader
         return TsvReader
     def make_reader(stream, dialect):
         '''Creates a reader of the type that reader_class() picks for the dialect
         @return that reader type, called with (stream, dialect)'''
         make = reader_class(dialect)
         return make(stream, dialect)
     def reader_and_header(stream):
         '''Automatically detects the dialect based on the header line
         @param stream Stream to read; passed to stream_info() and then to the reader
         @return tuple (reader, header)'''
         info = stream_info(stream)
         # Use the same reader type for the header and the body, so that TSV
         # headers get the same handling as TSV rows
         reader_class_ = reader_class(info.dialect)
         header = reader_class_(StringIO.StringIO(info.header_line),
             info.dialect).next()
         return (reader_class_(stream, info.dialect), header)
     ##### csv modifications

Also available in: Unified diff

Project

General

Profile

Revision 1623

Added by Aaron Marcuse-Kubitza almost 13 years ago