Project

General

Profile

1 1388 aaronmk
# CSV I/O
2
3
import csv
4 5431 aaronmk
import _csv
5 1388 aaronmk
import StringIO
6
7 8069 aaronmk
import exc
8 5593 aaronmk
import streams
9 1623 aaronmk
import strings
10 1444 aaronmk
import util
11
12 5426 aaronmk
# Single-character delimiters that csv.Sniffer is allowed to choose from
delims = ',;\t|`'
# Multi-character, tab-padded delimiters looked for once a TSV is detected
tab_padded_delims = ['\t|\t']
tsv_delim = '\t'
# Trailing character that marks a line ending as escaped (record continues)
escape = '\\'

# Two-character text (backslash, n) stored in place of a line ending that
# occurs inside a record
ending_placeholder = r'\n'
18
19 5437 aaronmk
def is_tsv(dialect):
    '''Tells whether the dialect's delimiter begins with the TSV delimiter
    @param dialect Object with a string `delimiter` attribute
    '''
    delimiter = dialect.delimiter
    return delimiter.startswith(tsv_delim)
20 1623 aaronmk
21 1621 aaronmk
def sniff(line):
    '''Automatically detects the dialect
    @param line Sample line (normally the header), with its line ending if any
    @return The dialect chosen by csv.Sniffer, adjusted as follows: quoting is
        turned off for TSVs, doublequote is turned on for everything else, and
        lineterminator is set to the ending found on the sample line
    '''
    # presumably returns (line without its ending, the ending) -- see strings
    line, ending = strings.extract_line_ending(line)
    dialect = csv.Sniffer().sniff(line, delims)
    
    if is_tsv(dialect):
        dialect.quoting = csv.QUOTE_NONE
        # Check multi-char delims using \t
        delim = strings.find_any(line, tab_padded_delims)
        if delim:
            dialect.delimiter = delim
            # The last column may be followed by the delimiter minus its
            # trailing tab; count that remnant as part of the line terminator
            line_suffix = delim.rstrip('\t')
            if line.endswith(line_suffix): ending = line_suffix+ending
    else: dialect.doublequote = True # Sniffer doesn't turn this on by default
    dialect.lineterminator = ending
    
    return dialect
38
39 8202 aaronmk
def has_unbalanced_quotes(str_):
    '''Tells whether a string contains an odd number of " characters'''
    quote_ct = str_.count('"')
    return quote_ct % 2 != 0
40
41
def has_multiline_column(str_):
    '''Tells whether a line seems to stop inside a multiline column, which is
    judged solely by its quotes being unbalanced'''
    unbalanced = has_unbalanced_quotes(str_)
    return unbalanced
42
43 1923 aaronmk
def stream_info(stream, parse_header=False):
    '''Automatically detects the dialect based on the header line.
    Uses the Excel dialect if the CSV file is empty.
    Consumes the header line from the stream (or the whole stream, when the
    first line has unbalanced quotes).
    @param stream Input stream providing readline() and readlines()
    @param parse_header Whether to also split the header line into columns
    @return NamedTuple {header_line, header, dialect}. header is None unless
        parse_header is set, and [] if the header line yields no row.
    '''
    info = util.NamedTuple()
    info.header_line = stream.readline()
    if has_multiline_column(info.header_line): # 1st line not full header
        # assume it's a header-only csv with multiline columns
        info.header_line += ''.join(stream.readlines()) # use entire file
    info.header = None
    if info.header_line != '':
        info.dialect = sniff(info.header_line)
    else: info.dialect = csv.excel # line of '' indicates EOF = empty stream
    
    if parse_header:
        # Parse the header from its own in-memory stream, using the same kind
        # of reader that will later be used for the data rows
        header_stream = StringIO.StringIO(info.header_line)
        header_reader = reader_class(info.dialect)(header_stream, info.dialect)
        try: info.header = header_reader.next()
        except StopIteration: info.header = []
    
    return info
63
64 5170 aaronmk
# Escapes for TSV fields: a copy of strings.json_encode_map (copied so that the
# original list is not modified) plus an escape for a literal tab
tsv_encode_map = strings.json_encode_map[:]
tsv_encode_map.append(('\t', r'\t'))
# Used by TsvReader to decode each field; presumably the encode map with each
# pair swapped -- see strings.flip_map
tsv_decode_map = strings.flip_map(tsv_encode_map)
67 5146 aaronmk
68 1623 aaronmk
class TsvReader:
    r'''Unlike csv.reader, for TSVs, interprets \ as escaping a line ending but
    ignores it before everything else (e.g. \N for NULL).
    Also expands tsv_encode_map escapes.
    '''
    def __init__(self, stream, dialect):
        '''@param stream Input stream providing readline()
        @param dialect Dialect for which is_tsv() holds; its lineterminator
            and delimiter are used when reading
        '''
        assert is_tsv(dialect)
        self.stream = stream
        self.dialect = dialect
    
    def __iter__(self): return self
    
    def next(self):
        '''Reads one record, joining lines whose line ending is escaped
        @return list of the record's field values, with escapes decoded
        @raise StopIteration at the end of the stream
        '''
        record = ''
        while True:
            line = self.stream.readline()
            # NOTE(review): when EOF directly follows an escaped line ending,
            # the partial record collected so far is dropped -- confirm that
            # this is intended
            if line == '': raise StopIteration
            
            line = strings.remove_suffix(self.dialect.lineterminator, line)
            contents = strings.remove_suffix(escape, line)
            record += contents
            if len(contents) == len(line): break # no line continuation
            record += ending_placeholder
        
        # Prevent "new-line character seen in unquoted field" errors
        record = record.replace('\r', ending_placeholder)
        
        # Split line
        if record == '': row = [] # csv.reader would interpret as EOF
        elif len(self.dialect.delimiter) > 1: # multi-char delims
            row = record.split(self.dialect.delimiter)
        else: row = csv.reader(StringIO.StringIO(record), self.dialect).next()
        
        return [strings.replace_all(tsv_decode_map, v) for v in row]
103 1623 aaronmk
104
def reader_class(dialect):
    '''Picks the reader to use for a dialect
    @return TsvReader for TSV dialects, csv.reader otherwise. Either one is
        called as reader(stream, dialect).
    '''
    if is_tsv(dialect): return TsvReader
    else: return csv.reader
107
108
def make_reader(stream, dialect):
    '''Creates the reader that matches the dialect, reading from the stream'''
    reader_type = reader_class(dialect)
    return reader_type(stream, dialect)
109
110 1388 aaronmk
def reader_and_header(stream):
    '''Automatically detects the dialect based on the header line
    @param stream Input stream, positioned at the header line
    @return tuple (reader, header)'''
    # stream_info() has already read the header line off the stream, so the
    # reader created below continues with whatever follows it
    info = stream_info(stream, parse_header=True)
    return (make_reader(stream, info.dialect), info.header)
115 1446 aaronmk
116
##### csv modifications
117
118
# Note that these methods only work on *instances* of Dialect classes
def _dialect_eq(self, other):
    '''Dialects are equal when their instance attributes are equal'''
    try: other_attrs = other.__dict__
    except AttributeError: return NotImplemented # e.g. None: not a dialect
    return self.__dict__ == other_attrs

def _dialect_ne(self, other):
    '''Exact opposite of =='''
    return not (self == other)

csv.Dialect.__eq__ = _dialect_eq
csv.Dialect.__ne__ = _dialect_ne
121 2114 aaronmk
122 5430 aaronmk
__Dialect__validate_orig = csv.Dialect._validate
def __Dialect__validate(self):
    '''Runs the original validation, but tolerates the error that rejects a
    delimiter which is not exactly one character long. Any other validation
    error is re-raised unchanged.
    '''
    try: __Dialect__validate_orig(self)
    except _csv.Error as e:
        # The error can only be told apart by its exact message
        if str(e) == '"delimiter" must be an 1-character string': pass # OK
        else: raise
csv.Dialect._validate = __Dialect__validate
129
130 2114 aaronmk
##### Row filters
131
132
class Filter:
133
    '''Wraps a reader, filtering each row'''
134
    def __init__(self, filter_, reader):
135
        self.reader = reader
136
        self.filter = filter_
137
138
    def __iter__(self): return self
139
140
    def next(self): return self.filter(self.reader.next())
141 5574 aaronmk
142
    def close(self): pass # support using as a stream
143 2114 aaronmk
144
# Field values that stand for NULL
std_nulls = [r'\N']
# Same, but also treating the empty string as NULL
empty_nulls = [''] + std_nulls
146
147
class NullFilter(Filter):
    '''Translates special string values to None'''
    def __init__(self, reader, nulls=std_nulls):
        '''@param reader Reader whose rows are filtered
        @param nulls Values to replace with None; all other values are
            passed through unchanged
        '''
        # Lookup table null value -> None; values not in it map to themselves
        map_ = dict.fromkeys(nulls, None)
        def filter_(row): return [map_.get(v, v) for v in row]
        Filter.__init__(self, filter_, reader)
153
154
class StripFilter(Filter):
    '''Strips whitespace from both ends of every value in each row'''
    def __init__(self, reader):
        '''@param reader Reader whose rows are filtered. Every value must
            support strip().
        '''
        def filter_(row): return [v.strip() for v in row]
        Filter.__init__(self, filter_, reader)
159 5570 aaronmk
160
class ColCtFilter(Filter):
    '''Gives all rows the same # columns'''
    def __init__(self, reader, cols_ct):
        '''@param reader Reader whose rows are filtered
        @param cols_ct The # columns every row should have
        '''
        # presumably pads or truncates the row -- see util.list_as_length
        def filter_(row): return util.list_as_length(row, cols_ct)
        Filter.__init__(self, filter_, reader)
165 5571 aaronmk
166
##### Translators
167
168 5586 aaronmk
class StreamFilter(Filter):
    '''Wraps a reader in a way that's usable to a filter stream that does not
    require lines to be strings. Reports EOF as '' instead of StopIteration.'''
    def __init__(self, reader):
        # No filter function: rows are handed out unchanged by readline()
        Filter.__init__(self, None, reader)
    
    def readline(self):
        '''@return The reader's next row, or '' at EOF'''
        try: return self.reader.next()
        except StopIteration: return '' # EOF
177
178 5735 aaronmk
class ColInsertFilter(Filter):
    '''Adds column(s) to each row
    @param mk_value(row, row_num)
    '''
    def __init__(self, reader, mk_value, index=0, n=1):
        '''@param reader Reader whose rows are filtered
        @param mk_value Called once per inserted column as
            mk_value(row, row_num)
        @param index Position of the first inserted column
        @param n The # columns to insert
        '''
        def filter_(row):
            row = list(row) # make sure it's mutable; don't modify input!
            for i in range(n):
                # mk_value sees the copy, including columns inserted so far
                row.insert(index+i, mk_value(row, self.reader.line_num))
            return row
        # Wrap the reader so that a line_num is available for mk_value;
        # presumably LineCountInputStream counts the rows read through it
        Filter.__init__(self, filter_,
            streams.LineCountInputStream(StreamFilter(reader)))
190
191 5736 aaronmk
class RowNumFilter(ColInsertFilter):
    '''Adds a row # column at the beginning of each row'''
    def __init__(self, reader):
        '''@param reader Reader whose rows are numbered'''
        def mk_value(row, row_num): return row_num
        ColInsertFilter.__init__(self, reader, mk_value, 0)
196 5593 aaronmk
197 5587 aaronmk
class InputRewriter(StreamFilter):
    '''Wraps a reader, writing each row back to CSV'''
    def __init__(self, reader, dialect=csv.excel):
        '''@param reader Source of rows. It is read with readline(), so it
            must return a row per call and '' at EOF.
        @param dialect Dialect used to write each row
        '''
        StreamFilter.__init__(self, reader)
        
        self.dialect = dialect
    
    def readline(self):
        '''@return The next row formatted as one line of CSV text, or '' at
            EOF
        '''
        try:
            row = self.reader.readline()
            if row == '': return row # EOF
            
            # Format just this row, using a throwaway in-memory stream
            line_stream = StringIO.StringIO()
            csv.writer(line_stream, self.dialect).writerow(row)
            return line_stream.getvalue()
        except Exception as e:
            # Report the error here as well, then let it propagate
            exc.print_ex(e)
            raise
    
    def read(self, n): return self.readline() # forward all reads to readline()