Project

General

Profile

1 1388 aaronmk
# CSV I/O
2
3
import csv
4 5431 aaronmk
import _csv
5 1388 aaronmk
import StringIO
6
7 5593 aaronmk
import streams
8 1623 aaronmk
import strings
9 1444 aaronmk
import util
10
11 5426 aaronmk
# Candidate single-character delimiters handed to csv.Sniffer by sniff()
delims = ',;\t|`'
# Multi-character delimiters that sniff() looks for once a TSV is detected
tab_padded_delims = ['\t|\t']
tsv_delim = '\t'

# Suffix that TsvReader strips from a line to detect a line continuation
escape = '\\'

# Literal text that TsvReader substitutes for embedded line endings
ending_placeholder = r'\n'
17
18 5437 aaronmk
def is_tsv(dialect):
    '''Tells whether the dialect's delimiter begins with the TSV delimiter
    (so multi-char delimiters that start with a tab also count)'''
    delimiter = dialect.delimiter
    return delimiter.startswith(tsv_delim)
19 1623 aaronmk
20 1621 aaronmk
def sniff(line):
    '''Automatically detects the dialect
    @param line Sample line to sniff (stream_info() passes the header line)
    @return The dialect found by csv.Sniffer, adjusted as follows:
        - TSV: quoting disabled; delimiter widened to a tab-padded
          multi-char delimiter if the line contains one
        - otherwise: doublequote turned on
        - lineterminator set to the ending taken off the line
    '''
    line, ending = strings.extract_line_ending(line)
    dialect = csv.Sniffer().sniff(line, delims)

    if is_tsv(dialect):
        # TSVs usually don't quote fields (nor doublequote embedded quotes)
        dialect.quoting = csv.QUOTE_NONE

        # Check multi-char delims using \t
        delim = strings.find_any(line, tab_padded_delims)
        if delim:
            dialect.delimiter = delim
            # If the line itself ends with the delimiter minus its trailing
            # tabs, fold that remainder into the line terminator
            line_suffix = delim.rstrip('\t')
            if line.endswith(line_suffix): ending = line_suffix+ending
    else: dialect.doublequote = True # Sniffer doesn't turn this on by default

    dialect.lineterminator = ending
    return dialect
39
40 1923 aaronmk
def stream_info(stream, parse_header=False):
    '''Automatically detects the dialect based on the header line
    @param stream Object with readline(); exactly one line is read from it
    @param parse_header Whether to also split the header line into fields
    @return NamedTuple {header_line, header, dialect}
        - dialect is None if the stream is empty
        - header is None unless parse_header is set and the stream is
          non-empty
    '''
    info = util.NamedTuple()
    info.header_line = stream.readline()
    info.header = None
    if info.header_line != '':
        info.dialect = sniff(info.header_line)
        if parse_header:
            # Parse just the header line, using the same reader type that
            # will later be used for the rest of the stream
            info.header = reader_class(info.dialect)(
                StringIO.StringIO(info.header_line), info.dialect).next()
    else: info.dialect = None # line of '' indicates EOF = empty stream
    return info
53
54 5170 aaronmk
# Slice-copy so that appending does not modify strings.json_encode_map itself
tsv_encode_map = strings.json_encode_map[:]
tsv_encode_map.append(('\t', r'\t'))
# Used by TsvReader to expand the escapes back into their original characters
tsv_decode_map = strings.flip_map(tsv_encode_map)
57 5146 aaronmk
58 1623 aaronmk
class TsvReader:
    r'''Unlike csv.reader, for TSVs, interprets \ as escaping a line ending but
    ignores it before everything else (e.g. \N for NULL).
    Also expands tsv_encode_map escapes.
    '''
    def __init__(self, stream, dialect):
        '''
        @param stream Object with readline()
        @param dialect Must satisfy is_tsv(); its delimiter and lineterminator
            are used to split the input
        '''
        assert is_tsv(dialect)
        self.stream = stream
        self.dialect = dialect

    def __iter__(self): return self

    def next(self):
        '''Reads one record, joining lines that end in the escape character
        @return list of field values with tsv_decode_map escapes expanded
        @raise StopIteration at EOF
        '''
        record = ''
        while True:
            line = self.stream.readline()
            # NOTE(review): if EOF is hit right after a continuation line, the
            # partial record accumulated so far is discarded
            if line == '': raise StopIteration

            line = strings.remove_suffix(self.dialect.lineterminator, line)
            contents = strings.remove_suffix(escape, line)
            record += contents
            if len(contents) == len(line): break # no line continuation
            record += ending_placeholder

        # Prevent "new-line character seen in unquoted field" errors
        record = record.replace('\r', ending_placeholder)

        row = record.split(self.dialect.delimiter)
        return [strings.replace_all(tsv_decode_map, v) for v in row]
88 1623 aaronmk
89
def reader_class(dialect):
    '''Picks the reader type for a dialect
    @return TsvReader for TSV dialects, csv.reader for everything else
    '''
    if is_tsv(dialect): return TsvReader
    else: return csv.reader
92
93
def make_reader(stream, dialect):
    '''Creates the reader that matches the dialect, reading from the stream'''
    cls = reader_class(dialect)
    return cls(stream, dialect)
94
95 1388 aaronmk
def reader_and_header(stream):
    '''Automatically detects the dialect based on the header line
    @param stream Object with readline(); the header line is consumed, so the
        returned reader starts at the first data row
    @return tuple (reader, header)
    '''
    info = stream_info(stream, parse_header=True)
    # NOTE(review): for an empty stream info.dialect is None, which
    # make_reader() hands on to is_tsv(); confirm callers never pass one
    return (make_reader(stream, info.dialect), info.header)
100 1446 aaronmk
101
##### csv modifications
102
103
# Note that these methods only work on *instances* of Dialect classes
104
# Equality compares only the attributes stored on the instances themselves
csv.Dialect.__eq__ = lambda self, other: self.__dict__ == other.__dict__
csv.Dialect.__ne__ = lambda self, other: not (self == other)

__Dialect__validate_orig = csv.Dialect._validate
def __Dialect__validate(self):
    '''Same as the stock csv.Dialect._validate(), except that the error about
    the delimiter not being a single character is ignored, so that
    multi-character delimiters can be stored in a dialect.
    @raise _csv.Error for every other validation failure
    '''
    # First wording is the one this file was written against; the second is
    # the corrected grammar seen in Python 3 -- TODO confirm across versions
    multichar_delim_msgs = (
        '"delimiter" must be an 1-character string',
        '"delimiter" must be a 1-character string',
    )
    try: __Dialect__validate_orig(self)
    except _csv.Error as e:
        if str(e) in multichar_delim_msgs: pass # OK
        else: raise
csv.Dialect._validate = __Dialect__validate
114
115 2114 aaronmk
##### Row filters
116
117
class Filter:
118
    '''Wraps a reader, filtering each row'''
119
    def __init__(self, filter_, reader):
120
        self.reader = reader
121
        self.filter = filter_
122
123
    def __iter__(self): return self
124
125
    def next(self): return self.filter(self.reader.next())
126 5574 aaronmk
127
    def close(self): pass # support using as a stream
128 2114 aaronmk
129
# Values that NullFilter translates to None by default
std_nulls = [r'\N']
# Same, plus the empty string
empty_nulls = [''] + std_nulls
131
132
class NullFilter(Filter):
    '''Translates special string values to None'''
    def __init__(self, reader, nulls=std_nulls):
        '''
        @param reader Reader whose rows are filtered
        @param nulls Values to replace with None; all other values pass
            through unchanged
        '''
        # Lookup table: each null value -> None
        map_ = dict.fromkeys(nulls, None)
        def filter_(row): return [map_.get(v, v) for v in row]
        Filter.__init__(self, filter_, reader)
138
139
class StripFilter(Filter):
    '''Strips whitespace from both ends of every value in each row'''
    def __init__(self, reader):
        '''@param reader Reader whose rows are filtered'''
        def filter_(row): return [v.strip() for v in row]
        Filter.__init__(self, filter_, reader)
144 5570 aaronmk
145
class ColCtFilter(Filter):
    '''Gives all rows the same # columns'''
    def __init__(self, reader, cols_ct):
        '''
        @param reader Reader whose rows are filtered
        @param cols_ct Length passed to util.list_as_length() for every row
        '''
        def filter_(row): return util.list_as_length(row, cols_ct)
        Filter.__init__(self, filter_, reader)
150 5571 aaronmk
151
##### Translators
152
153 5586 aaronmk
class StreamFilter(Filter):
    '''Wraps a reader in a way that's usable to a filter stream that does not
    require lines to be strings. Reports EOF as '' instead of StopIteration.'''
    def __init__(self, reader):
        # No row filter is installed: rows are handed out by readline() as-is
        Filter.__init__(self, None, reader)

    def readline(self):
        '''@return The reader's next row, or '' once the reader is exhausted'''
        try: return self.reader.next()
        except StopIteration: return '' # EOF
162
163 5735 aaronmk
class ColInsertFilter(Filter):
    '''Adds a column to each row
    @param mk_value(row, row_num)
    '''
    def __init__(self, reader, mk_value, index=0):
        '''
        @param reader Reader whose rows are filtered
        @param mk_value Function producing the value to insert; receives the
            row and the wrapping stream's line_num
        @param index Position at which the value is inserted
        '''
        def filter_(row):
            row = list(row) # make sure it's mutable; don't modify input!
            row.insert(index, mk_value(row, self.reader.line_num))
            return row
        # self.reader becomes the line-counting wrapper, which is where
        # filter_ gets line_num from
        Filter.__init__(self, filter_,
            streams.LineCountInputStream(StreamFilter(reader)))
174
175 5736 aaronmk
class RowNumFilter(ColInsertFilter):
    '''Adds a row # column at the beginning of each row'''
    def __init__(self, reader):
        '''@param reader Reader whose rows are filtered'''
        # The inserted value is simply the row number that is passed in
        def mk_value(row, row_num): return row_num
        ColInsertFilter.__init__(self, reader, mk_value, 0)
180 5593 aaronmk
181 5587 aaronmk
class InputRewriter(StreamFilter):
    '''Wraps a reader, writing each row back to CSV'''
    def __init__(self, reader, dialect=csv.excel):
        '''
        @param reader Object with readline() returning a row, or '' at EOF
        @param dialect csv dialect used to write the rows back out
        '''
        StreamFilter.__init__(self, reader)

        self.dialect = dialect

    def readline(self):
        '''@return The next row formatted as one CSV line, or '' at EOF'''
        row = self.reader.readline()
        if row == '': return row # EOF

        # Format the single row through csv.writer into an in-memory buffer
        line_stream = StringIO.StringIO()
        csv.writer(line_stream, self.dialect).writerow(row)
        return line_stream.getvalue()

    # n is ignored: every call returns one whole line
    def read(self, n): return self.readline() # forward all reads to readline()