Project

General

Profile

1 1388 aaronmk
# CSV I/O
2
3
import csv
4 5431 aaronmk
import _csv
5 1388 aaronmk
import StringIO
6
7 1623 aaronmk
import strings
8 1444 aaronmk
import util
9
10 5426 aaronmk
# Candidate delimiter characters passed to csv.Sniffer by sniff()
delims = ',;\t|`'
11 1623 aaronmk
# Delimiter that marks a dialect as TSV (compared against in is_tsv())
tsv_delim = '\t'
12
# A single backslash; TsvReader removes it from the end of a line's contents
# and treats its presence there as a line continuation
escape = '\\'
13 1442 aaronmk
14 1623 aaronmk
# Literal backslash + "n" (raw string, not a newline); TsvReader inserts it
# where a record continues onto the next line and in place of '\r'
ending_placeholder = r'\n'
15
16
def is_tsv(dialect):
    '''Checks whether a dialect is tab-separated
    @param dialect Any object with a `delimiter` attribute
    @return bool Whether the dialect's delimiter equals tsv_delim
    '''
    delimiter = dialect.delimiter
    return delimiter == tsv_delim
17
18 1621 aaronmk
def sniff(line):
    '''Guesses the dialect of a line of delimited text
    @param line Sample text (e.g. a header line) handed to csv.Sniffer, which
        only considers the delimiters listed in `delims`
    @return The sniffed dialect, with its quoting settings adjusted
    '''
    detected = csv.Sniffer().sniff(line, delims)
    if is_tsv(detected):
        # TSVs usually don't quote fields (nor doublequote embedded quotes)
        detected.quoting = csv.QUOTE_NONE
    else:
        # Sniffer doesn't turn this on by default
        detected.doublequote = True
    return detected
25
26 1923 aaronmk
def stream_info(stream, parse_header=False):
    '''Reads the header line of a stream and detects the dialect from it
    @param stream File-like object; one line is consumed via readline()
    @param parse_header Whether to also split the header line into fields
    @return NamedTuple {header_line, header, dialect}
        header stays None unless parse_header is set and the stream is
        non-empty. dialect is None for an empty stream.
    '''
    info = util.NamedTuple()
    info.header_line = stream.readline()
    info.header = None
    
    if info.header_line == '': # line of '' indicates EOF = empty stream
        info.dialect = None
        return info
    
    info.dialect = sniff(info.header_line)
    if parse_header:
        # Parse just the header line with the same kind of reader that would
        # be used for the rest of the stream
        make = reader_class(info.dialect)
        header_stream = StringIO.StringIO(info.header_line)
        info.header = make(header_stream, info.dialect).next()
    return info
39
40 5170 aaronmk
# TSV escapes: a separate copy of strings.json_encode_map extended with an
# escape for the tab character
tsv_encode_map = strings.json_encode_map + [('\t', r'\t')]
# The reverse mapping, as produced by strings.flip_map()
tsv_decode_map = strings.flip_map(tsv_encode_map)
43 5146 aaronmk
44 1623 aaronmk
class TsvReader:
    r'''Unlike csv.reader, for TSVs, interprets \ as escaping a line ending but
    ignores it before everything else (e.g. \N for NULL).
    Also expands tsv_encode_map escapes.
    Usable as an iterator under both the Python 2 (next) and Python 3
    (__next__) protocols.
    '''
    def __init__(self, stream, dialect):
        '''
        @param stream File-like object; only its readline() is used
        @param dialect Must satisfy is_tsv(); its delimiter splits each record
            into fields
        '''
        assert is_tsv(dialect)
        self.stream = stream
        self.dialect = dialect
    
    def __iter__(self): return self
    
    def next(self):
        r'''Reads one record, which may span several lines of the stream
        A line whose contents end with the escape character continues on the
        next line; ending_placeholder is inserted at each such join.
        @return list of field values, split on the dialect's delimiter and
            decoded with tsv_decode_map
        @raise StopIteration at the end of the stream. If the stream ends in
            the middle of a continued record, the lines read so far are
            returned as the last record instead of being dropped.
        '''
        parts = []
        while True:
            line = self.stream.readline()
            if line == '': # EOF
                # parts is only non-empty here after a line continuation, so
                # there is an unfinished record that must not be lost
                if parts: break
                raise StopIteration
            
            raw_contents, ending = strings.extract_line_ending(line)
            contents = strings.remove_suffix(escape, raw_contents)
            parts.append(contents)
            if len(contents) == len(raw_contents): break # no line continuation
            parts.append(ending_placeholder)
        record = ''.join(parts)
        
        # Prevent "new-line character seen in unquoted field" errors
        record = record.replace('\r', ending_placeholder)
        
        row = record.split(self.dialect.delimiter)
        return [strings.replace_all(tsv_decode_map, v) for v in row]
    
    __next__ = next # Python 3 name of the iterator method
74 1623 aaronmk
75
def reader_class(dialect):
    '''Selects the reader implementation for a dialect
    @return TsvReader if the dialect is a TSV dialect, otherwise csv.reader
    '''
    return TsvReader if is_tsv(dialect) else csv.reader
78
79
def make_reader(stream, dialect):
    '''Creates the reader chosen by reader_class() for this dialect
    @return A reader over stream, constructed with (stream, dialect)
    '''
    make = reader_class(dialect)
    return make(stream, dialect)
80
81 1388 aaronmk
def reader_and_header(stream):
    '''Consumes the header line of a stream and sets up a reader for the rest
    The dialect is detected automatically from the header line.
    @return tuple (reader, header)
    '''
    info = stream_info(stream, parse_header=True)
    reader = make_reader(stream, info.dialect)
    return (reader, info.header)
86 1446 aaronmk
87
##### csv modifications
88
89
# Note that these methods only work on *instances* of Dialect classes
def __Dialect__eq(self, other):
    '''Compares two dialects by their instance attributes (__dict__)
    @return bool, or NotImplemented if other has no __dict__ (e.g. None), so
        that comparing a dialect to such a value evaluates to False instead
        of raising AttributeError
    '''
    try: other_attrs = other.__dict__
    except AttributeError: return NotImplemented
    return self.__dict__ == other_attrs
csv.Dialect.__eq__ = __Dialect__eq
csv.Dialect.__ne__ = lambda self, other: not (self == other)
92 2114 aaronmk
93 5430 aaronmk
__Dialect__validate_orig = csv.Dialect._validate
def __Dialect__validate(self):
    '''Replacement for csv.Dialect._validate that tolerates delimiters which
    are not exactly one character long
    Runs the original validation and suppresses only the error about the
    delimiter's length; every other validation error is re-raised.
    @raise _csv.Error for any other validation failure
    '''
    # The error can only be recognized by its message, whose wording differs
    # between Python versions
    delim_length_msgs = (
        '"delimiter" must be an 1-character string', # Python 2
        '"delimiter" must be a 1-character string', # Python 3
        # NOTE(review): newer releases may word this differently -- confirm
    )
    try: __Dialect__validate_orig(self)
    except _csv.Error as e:
        if str(e) in delim_length_msgs: pass # OK
        else: raise
csv.Dialect._validate = __Dialect__validate
100
101 2114 aaronmk
##### Row filters
102
103
class Filter:
    '''Wraps a reader, filtering each row
    Works as an iterator under both the Python 2 (next) and Python 3
    (__next__) protocols, and accepts readers implementing either.
    '''
    def __init__(self, filter_, reader):
        '''
        @param filter_ Function called on each row; its result is what the
            filter yields
        @param reader Iterator supplying the rows
        '''
        self.reader = reader
        self.filter = filter_
    
    def __iter__(self): return self
    
    def next(self):
        '''Returns the filtered version of the reader's next row
        @raise StopIteration (from the reader) when there are no more rows
        '''
        # The built-in next() calls reader.next() on Python 2 and
        # reader.__next__() on Python 3
        return self.filter(next(self.reader))
    
    __next__ = next # Python 3 name of the iterator method
112
113
# Values that NullFilter translates to None by default: backslash + "N"
std_nulls = [r'\N']
114
# std_nulls plus the empty string
empty_nulls = [''] + std_nulls
115
116
class NullFilter(Filter):
    '''Row filter that replaces designated string values with None'''
    def __init__(self, reader, nulls=std_nulls):
        '''
        @param reader The reader whose rows are filtered
        @param nulls The values to translate to None
        '''
        to_none = dict((null, None) for null in nulls)
        def translate(row):
            translated = []
            # Values that aren't listed in nulls pass through unchanged
            for value in row: translated.append(to_none.get(value, value))
            return translated
        Filter.__init__(self, translate, reader)
122
123
class StripFilter(Filter):
    '''Strips whitespace
    None values (such as those produced by NullFilter) are passed through
    unchanged, so the two filters can be chained in either order.
    '''
    def __init__(self, reader):
        '''
        @param reader The reader whose rows are filtered
        '''
        def filter_(row):
            # None has no strip(); leave it as is rather than crash
            return [v if v is None else v.strip() for v in row]
        Filter.__init__(self, filter_, reader)