Project

General

Profile

1 1388 aaronmk
# CSV I/O
2
3
import csv
4
import StringIO
5
6 1623 aaronmk
import strings
7 1444 aaronmk
import util
8
9 4211 aaronmk
# Candidate field delimiters offered to csv.Sniffer when detecting a dialect
delims = ',;\t`'
# Delimiter that identifies a dialect as tab-separated (see is_tsv())
tsv_delim = '\t'
# Suffix that TsvReader strips from a line to continue the record on the next
escape = '\\'

# Literal two-character text (backslash, n) that TsvReader substitutes for a
# line ending embedded in a record
ending_placeholder = r'\n'
14
15
def is_tsv(dialect):
    '''Returns whether the dialect's delimiter is the TSV delimiter'''
    delimiter = dialect.delimiter
    return delimiter == tsv_delim
16
17 1621 aaronmk
def sniff(line):
    '''Automatically detects the dialect
    @param line Sample text to detect the dialect from
    @return The dialect found by csv.Sniffer (restricted to the delimiters in
        delims), with quoting turned off for TSVs and doublequote turned on for
        everything else
    '''
    dialect = csv.Sniffer().sniff(line, delims)
    if is_tsv(dialect):
        # TSVs usually don't quote fields (nor doublequote embedded quotes)
        dialect.quoting = csv.QUOTE_NONE
    else:
        # Sniffer doesn't turn this on by default
        dialect.doublequote = True
    return dialect
24
25 1923 aaronmk
def stream_info(stream, parse_header=False):
    '''Automatically detects the dialect based on the header line
    @param stream File-like object; one line is consumed from it via readline()
    @param parse_header Whether to also split the header line into fields
    @return NamedTuple {header_line, header, dialect}
        - header is None unless parse_header is set and the stream is non-empty
        - dialect is None when the stream is empty
    '''
    info = util.NamedTuple()
    info.header_line = stream.readline()
    info.header = None
    if info.header_line != '':
        info.dialect = sniff(info.header_line)
        if parse_header:
            # Parse only the header line, using the same reader class that will
            # be used for the rest of the stream
            header_reader = reader_class(info.dialect)(
                StringIO.StringIO(info.header_line), info.dialect)
            info.header = next(header_reader)
    else: info.dialect = None # line of '' indicates EOF = empty stream
    return info
38
39 5170 aaronmk
# TSV escapes: the entries of strings.json_encode_map plus tab -> literal \t.
# The slice makes a copy, so the append does not alter strings.json_encode_map.
tsv_encode_map = strings.json_encode_map[:]
tsv_encode_map.append(('\t', r'\t'))
# Map that TsvReader applies to each field value.
# NOTE(review): presumably the encode pairs reversed -- confirm against
# strings.flip_map()
tsv_decode_map = strings.flip_map(tsv_encode_map)
42 5146 aaronmk
43 1623 aaronmk
class TsvReader:
    r'''Unlike csv.reader, for TSVs, interprets \ as escaping a line ending but
    ignores it before everything else (e.g. \N for NULL).
    Also expands tsv_encode_map escapes.
    '''
    def __init__(self, stream, dialect):
        '''
        @param stream File-like object; records are read from it via readline()
        @param dialect Dialect for which is_tsv() holds
        '''
        assert is_tsv(dialect)
        self.stream = stream
        self.dialect = dialect

    def __iter__(self): return self

    def next(self):
        '''Reads one record, joining lines that end in the escape character
        @return list of field values, with tsv_decode_map applied to each
        @raise StopIteration at the end of the stream
        '''
        record = ''
        while True:
            line = self.stream.readline()
            if line == '': # EOF
                # Only reachable with a non-empty record when the previous line
                # asked for a continuation; return what was read so far rather
                # than silently dropping it
                if record != '': break
                raise StopIteration

            raw_contents, ending = strings.extract_line_ending(line)
            contents = strings.remove_suffix(escape, raw_contents)
            record += contents
            if len(contents) == len(raw_contents): break # no line continuation
            record += ending_placeholder

        # Prevent "new-line character seen in unquoted field" errors
        record = record.replace('\r', ending_placeholder)

        # An empty record gives csv.reader nothing to parse, so it would raise
        # StopIteration here and end the whole iteration at the first blank
        # line. Return an empty row instead, as csv.reader does for blank lines.
        row = next(csv.reader(StringIO.StringIO(record), self.dialect), [])
        return [strings.replace_all(tsv_decode_map, v) for v in row]

    __next__ = next # Python 3 name for the iterator method
73 1623 aaronmk
74
def reader_class(dialect):
    '''Picks the reader implementation for a dialect
    @return TsvReader for TSV dialects, csv.reader for everything else; either
        one is called as reader(stream, dialect)
    '''
    if is_tsv(dialect): return TsvReader
    else: return csv.reader
77
78
def make_reader(stream, dialect):
    '''Creates the reader that matches the dialect, reading from stream'''
    create = reader_class(dialect)
    return create(stream, dialect)
79
80 1388 aaronmk
def reader_and_header(stream):
    '''Automatically detects the dialect based on the header line
    @param stream File-like object; the header line is consumed from it
    @return tuple (reader, header)
        For an empty stream, header is None and the reader yields no rows.
    '''
    info = stream_info(stream, parse_header=True)
    if info.dialect is None:
        # Empty stream: no dialect could be detected, and make_reader() would
        # fail on the None dialect. Use a default-dialect reader, which has
        # nothing left to read.
        return (csv.reader(stream), info.header)
    return (make_reader(stream, info.dialect), info.header)
85 1446 aaronmk
86
##### csv modifications
87
88
# Note that these methods only work on *instances* of Dialect classes
89
def _dialect_eq(self, other):
    '''Dialect instances are equal when their instance attributes are equal.
    An object without a __dict__ (e.g. None) compares unequal instead of
    raising AttributeError.
    '''
    return self.__dict__ == getattr(other, '__dict__', None)

def _dialect_ne(self, other):
    '''Negation of ==, so that != agrees with the patched __eq__'''
    return not (self == other)

csv.Dialect.__eq__ = _dialect_eq
csv.Dialect.__ne__ = _dialect_ne
91 2114 aaronmk
92
##### Row filters
93
94
class Filter:
    '''Wraps a reader, filtering each row'''
    def __init__(self, filter_, reader):
        '''
        @param filter_ Function that takes a row and returns the row to yield
        @param reader Iterator of rows to wrap
        '''
        self.reader = reader
        self.filter = filter_

    def __iter__(self): return self

    def next(self):
        '''Returns the wrapped reader's next row, passed through the filter'''
        return self.filter(next(self.reader))

    __next__ = next # Python 3 name for the iterator method
103
104
# Values that NullFilter translates to None by default
std_nulls = [r'\N']
# Same as std_nulls, but the empty string also counts as a null
empty_nulls = [''] + std_nulls
106
107
class NullFilter(Filter):
    '''Translates special string values to None'''
    def __init__(self, reader, nulls=std_nulls):
        '''
        @param reader Iterator of rows to wrap
        @param nulls Values to replace with None; any other value is kept as is
        '''
        # Lookup table: each null value -> None
        map_ = dict.fromkeys(nulls, None)
        def filter_(row): return [map_.get(v, v) for v in row]
        Filter.__init__(self, filter_, reader)
113
114
class StripFilter(Filter):
    '''Strips whitespace'''
    def __init__(self, reader):
        '''
        @param reader Iterator of rows to wrap; strip() is called on every
            value of every row
        '''
        def filter_(row): return [v.strip() for v in row]
        Filter.__init__(self, filter_, reader)