/lib/csvs.py - Annotate - BIEN 3 - NCEAS Projects

root/lib/csvs.py @ 1834

-            aaronmk
+# CSV I/O
 import csv
 import StringIO
-            aaronmk
+import strings
-            aaronmk
+import util
-            aaronmk
# Candidate field delimiters offered to csv.Sniffer when detecting a dialect
delims = ',\t`'
# Field delimiter that identifies a dialect as TSV (compared in is_tsv())
tsv_delim = '\t'
# Character that TsvReader strips from the end of a line to detect a line
# continuation
escape = '\\'
# Two-character sequence (backslash + n) that TsvReader inserts for an escaped
# line ending and later replaces with a real newline
ending_placeholder = r'\n'
 def is_tsv(dialect):
     '''Tells whether a csv dialect is tab-delimited
     @param dialect Object with a delimiter attribute (e.g. a csv dialect)
     @return bool Whether the dialect's delimiter equals tsv_delim'''
     return dialect.delimiter == tsv_delim
-            aaronmk
def sniff(line):
    '''Automatically detects the dialect of a line of delimited text
    @param line Sample text to inspect (e.g. a header line)
    @return The dialect guessed by csv.Sniffer, restricted to the delimiters
        in delims and adjusted as described below'''
    detected = csv.Sniffer().sniff(line, delims)
    if not is_tsv(detected):
        # Sniffer doesn't turn this on by default
        detected.doublequote = True
    else:
        # TSVs usually don't quote fields (nor doublequote embedded quotes)
        detected.quoting = csv.QUOTE_NONE
    return detected
-            aaronmk
def stream_info(stream):
    '''Reads the header line of a stream and detects the dialect from it
    @param stream File-like object providing readline(); one line is consumed
    @return NamedTuple {header_line, dialect}. dialect is None when the stream
        is empty.'''
    info = util.NamedTuple()
    header_line = stream.readline()
    info.header_line = header_line
    # readline() returning '' indicates EOF = empty stream, so there is nothing
    # to sniff
    info.dialect = sniff(header_line) if header_line != '' else None
    return info
-            aaronmk
class TsvReader:
    r'''Iterator over the records of a TSV stream.

    Unlike csv.reader, for TSVs, interprets \ as escaping a line ending (the
    record then continues on the next line) but ignores it before everything
    else (e.g. \N for NULL). Also interprets the two-character \n escape
    sequence in a field as a newline.

    Each record is returned as a list of field strings.'''
    def __init__(self, stream, dialect):
        '''@param stream File-like object providing readline()
        @param dialect csv dialect; must satisfy is_tsv()'''
        assert is_tsv(dialect)
        self.stream = stream
        self.dialect = dialect

    def __iter__(self): return self

    def next(self):
        '''Reads the next record, joining lines whose ending is escaped
        @return list of the record's field values ([] for a blank line, as
            csv.reader does)
        @raise StopIteration when the stream is exhausted'''
        record = ''
        while True:
            line = self.stream.readline()
            if line == '': # EOF
                if record == '': raise StopIteration
                # EOF right after an escaped line ending: return the partial
                # record instead of silently dropping it
                break
            # presumably splits the line into (text, line ending) -- see
            # strings.extract_line_ending()
            raw_contents, ending = strings.extract_line_ending(line)
            # presumably strips a trailing escape char if there is one; a
            # shorter result means the line ending was escaped
            contents = strings.remove_suffix(escape, raw_contents)
            record += contents
            if len(contents) == len(raw_contents): break # no line continuation
            record += ending_placeholder
        # csv.reader yields no rows for empty input, so calling next() on it
        # would raise StopIteration and end iteration at the first blank line
        if record == '': return []
        row = csv.reader(StringIO.StringIO(record), self.dialect).next()
        return [v.replace(ending_placeholder, '\n') for v in row]
 def reader_class(dialect):
     '''Picks the reader implementation to use for a dialect
     @param dialect csv dialect
     @return TsvReader for tab-delimited dialects, csv.reader otherwise'''
     return TsvReader if is_tsv(dialect) else csv.reader
 def make_reader(stream, dialect):
     '''Creates the reader appropriate for a dialect (see reader_class())
     @param stream File-like object to read records from
     @param dialect csv dialect
     @return Iterator over the stream's records'''
     make = reader_class(dialect)
     return make(stream, dialect)
-            aaronmk
def reader_and_header(stream):
    '''Automatically detects the dialect based on the header line, parses that
    line, and sets up a reader for the rest of the stream
    @param stream File-like object; its header line is consumed
    @return tuple (reader, header)'''
    # NOTE(review): stream_info() sets dialect to None for an empty stream,
    # which is_tsv() (called by reader_class()) cannot handle -- confirm that
    # callers never pass an empty stream
    info = stream_info(stream)
    dialect = info.dialect
    make = reader_class(dialect)
    # The header line was already consumed, so parse it from its own stream
    header_stream = StringIO.StringIO(info.header_line)
    header = make(header_stream, dialect).next()
    return (make(stream, dialect), header)
             aaronmk
 ##### csv modifications
 # Note that these methods only work on *instances* of Dialect classes
 # NOTE(review): only attributes stored in the instance's __dict__ are
 # compared -- confirm that ignoring attributes defined on the dialect class is
 # intended
 # Objects without a __dict__ (e.g. None, which stream_info() uses for an empty
 # stream) compare unequal instead of raising AttributeError
 csv.Dialect.__eq__ = (lambda self, other:
     self.__dict__ == getattr(other, '__dict__', None))
 csv.Dialect.__ne__ = lambda self, other: not (self == other)

Project

General

Profile