Project

General

Profile

1
# CSV I/O
2

    
3
import csv
4
import _csv
5
import StringIO
6

    
7
import strings
8
import util
9

    
10
# Candidate single-character delimiters handed to csv.Sniffer by sniff()
delims = ',;\t|`'
# Multi-character delimiters consisting of a separator padded with tabs
tab_padded_delims = ['\t|\t']
# A dialect whose delimiter starts with this is treated as TSV (see is_tsv())
tsv_delim = '\t'
# Trailing character that marks a TSV line as continuing onto the next line
escape = '\\'

# Literal backslash + "n" text that stands in for a line ending inside a record
ending_placeholder = r'\n'
16

    
17
def is_tsv(dialect):
    '''Tells whether a dialect counts as TSV
    @param dialect Object whose `delimiter` attribute is a string
    @return Whether the delimiter starts with tsv_delim'''
    delimiter = dialect.delimiter
    return delimiter.startswith(tsv_delim)
18

    
19
def sniff(line):
    '''Guesses the dialect of one line of delimited text
    @param line The line to inspect; its line ending is split off with
        strings.extract_line_ending() before sniffing
    @return The dialect found by csv.Sniffer (restricted to `delims`), with
        these adjustments:
        - TSV dialects get quoting turned off and, when one of
          tab_padded_delims occurs in the line, that multi-char delimiter
        - all other dialects get doublequote turned on
        - lineterminator is set to the line's own ending
    '''
    line, ending = strings.extract_line_ending(line)
    dialect = csv.Sniffer().sniff(line, delims)

    if not is_tsv(dialect):
        dialect.doublequote = True # Sniffer doesn't turn this on by default
    else:
        # TSVs usually don't quote fields (nor doublequote embedded quotes)
        dialect.quoting = csv.QUOTE_NONE

        # Check multi-char delims using \t
        padded_delim = strings.find_any(line, tab_padded_delims)
        if padded_delim:
            dialect.delimiter = padded_delim
            # When the line ends with the delimiter minus its trailing tabs,
            # that remnant is folded into the line terminator
            trailer = padded_delim.rstrip('\t')
            if line.endswith(trailer):
                ending = trailer+ending

    dialect.lineterminator = ending
    return dialect
38

    
39
def stream_info(stream, parse_header=False):
    '''Reads the header line off a stream and detects the dialect from it
    @param stream Object with a readline() method; exactly one line is consumed
    @param parse_header Whether to also split the header line into fields
    @return NamedTuple {header_line, header, dialect}
        - header is None unless parse_header is set and the stream is non-empty
        - dialect is None for an empty stream
    '''
    info = util.NamedTuple()
    info.header_line = stream.readline()
    info.header = None

    if info.header_line == '': # line of '' indicates EOF = empty stream
        info.dialect = None
        return info

    info.dialect = sniff(info.header_line)
    if parse_header:
        # Parse just the header line with the reader matching the dialect
        make = reader_class(info.dialect)
        header_stream = StringIO.StringIO(info.header_line)
        info.header = make(header_stream, info.dialect).next()
    return info
52

    
53
# TSV escapes = a copy of the JSON escapes from the strings module, plus an
# entry mapping a literal tab to the two-character text \t
tsv_encode_map = strings.json_encode_map[:]
tsv_encode_map.append(('\t', r'\t'))
# Reverse mapping, applied to each field by TsvReader
tsv_decode_map = strings.flip_map(tsv_encode_map)
56

    
57
class TsvReader:
    r'''Unlike csv.reader, for TSVs, interprets \ as escaping a line ending but
    ignores it before everything else (e.g. \N for NULL).
    Also expands tsv_encode_map escapes.

    Follows the Python 2 iterator protocol (__iter__()/next()).
    '''
    def __init__(self, stream, dialect):
        '''
        @param stream Object with a readline() method
        @param dialect TSV dialect; its lineterminator and delimiter are used
        '''
        assert is_tsv(dialect)
        self.stream = stream
        self.dialect = dialect

    def __iter__(self): return self

    def next(self):
        '''Reads one record, joining lines that end in the escape character
        @return list of the record's fields, with tsv_decode_map applied
        @raise StopIteration when the stream returns '' (EOF)
        NOTE(review): if EOF is hit while a continuation is still pending, the
        partially read record is discarded -- confirm this is intended
        '''
        parts = [] # pieces of the record, joined once at the end
        while True:
            line = self.stream.readline()
            if line == '': raise StopIteration # EOF

            line = strings.remove_suffix(self.dialect.lineterminator, line)
            contents = strings.remove_suffix(escape, line)
            parts.append(contents)
            if len(contents) == len(line): break # no line continuation
            # The escaped line ending becomes part of the record
            parts.append(ending_placeholder)
        record = ''.join(parts)

        # Prevent "new-line character seen in unquoted field" errors
        record = record.replace('\r', ending_placeholder)

        row = record.split(self.dialect.delimiter)
        return [strings.replace_all(tsv_decode_map, v) for v in row]
87

    
88
def reader_class(dialect):
    '''Picks the reader factory for a dialect
    @return TsvReader for TSV dialects, csv.reader for everything else'''
    if not is_tsv(dialect):
        return csv.reader
    return TsvReader
91

    
92
def make_reader(stream, dialect):
    '''Creates the reader chosen by reader_class() over the stream'''
    make = reader_class(dialect)
    return make(stream, dialect)
93

    
94
def reader_and_header(stream):
    '''Detects the dialect from the header line and opens a reader for the rest
    @param stream Object with a readline() method, positioned at the header
    @return tuple (reader, header), where header is the parsed header row
    NOTE(review): for an empty stream, stream_info() reports a dialect of None,
    which is then handed to make_reader() -- confirm callers never pass one
    '''
    info = stream_info(stream, parse_header=True)
    reader = make_reader(stream, info.dialect)
    return (reader, info.header)
99

    
100
##### csv modifications
101

    
102
# Note that these methods only work on *instances* of Dialect classes.
# Two dialects are equal when the attributes set on the instances themselves
# (their __dict__) are equal. An operand without a __dict__ (e.g. None) simply
# compares unequal instead of raising AttributeError.
csv.Dialect.__eq__ = lambda self, other: (
    self.__dict__ == getattr(other, '__dict__', None))
csv.Dialect.__ne__ = lambda self, other: not (self == other)
105

    
106
__Dialect__validate_orig = csv.Dialect._validate
def __Dialect__validate(self):
    '''Replacement for csv.Dialect._validate() that runs the original check
    but swallows the one error complaining that the delimiter is not a
    1-character string. Every other validation error is re-raised unchanged.
    '''
    try: __Dialect__validate_orig(self)
    except _csv.Error as e:
        # Matched on the exact message text, so this depends on the wording
        # used by the running Python's csv module
        if str(e) == '"delimiter" must be an 1-character string': pass # OK
        else: raise
csv.Dialect._validate = __Dialect__validate
113

    
114
##### Row filters
115

    
116
class Filter:
    '''Iterator that passes every row of a wrapped reader through a function
    (Python 2 iterator protocol: __iter__()/next())'''
    def __init__(self, filter_, reader):
        '''
        @param filter_ Function taking a row and returning the filtered row
        @param reader Object whose next() method returns the next row
        '''
        self.filter = filter_
        self.reader = reader

    def __iter__(self):
        return self

    def next(self):
        '''@return the wrapped reader's next row after filtering; whatever
        the reader raises (e.g. StopIteration) propagates'''
        row = self.reader.next()
        return self.filter(row)

    def close(self):
        # Does nothing; exists so a Filter can be used where a stream is
        # expected
        pass
127

    
128
# Default values that NullFilter translates to None: the literal text \N
std_nulls = [r'\N']
# Same as std_nulls, with the empty string added in front
empty_nulls = [''] + std_nulls
130

    
131
class NullFilter(Filter):
    '''Replaces designated placeholder values in each row with None'''
    def __init__(self, reader, nulls=std_nulls):
        '''
        @param reader The reader to wrap
        @param nulls Values to turn into None; all other values pass through
        '''
        null_lookup = dict.fromkeys(nulls, None)
        def filter_(row):
            # A value found in the lookup maps to None; anything else to itself
            return [null_lookup.get(value, value) for value in row]
        Filter.__init__(self, filter_, reader)
137

    
138
class StripFilter(Filter):
    '''Removes leading and trailing whitespace from every value of each row'''
    def __init__(self, reader):
        '''@param reader The reader to wrap; each value must have strip()'''
        def filter_(row):
            return [value.strip() for value in row]
        Filter.__init__(self, filter_, reader)
143

    
144
class ColCtFilter(Filter):
    '''Gives all rows the same # columns, by passing each row through
    util.list_as_length()'''
    def __init__(self, reader, cols_ct):
        '''
        @param reader The reader to wrap
        @param cols_ct The column count passed to util.list_as_length()
        '''
        def filter_(row):
            return util.list_as_length(row, cols_ct)
        Filter.__init__(self, filter_, reader)
149

    
150
##### Translators
151

    
152
class InputRewriter:
    '''Presents a row reader as a line stream, by writing each row back out
    as CSV'''
    def __init__(self, reader, dialect=csv.excel):
        '''
        @param reader Object whose next() method returns the next row
        @param dialect The CSV dialect to write the rows in
        '''
        self.dialect = dialect
        self.reader = reader

    def readline(self):
        '''@return the next row formatted as a line of CSV, or '' at EOF'''
        try:
            row = self.reader.next()
        except StopIteration:
            return '' # EOF
        # Format the row into a scratch buffer and hand back its text
        buf = StringIO.StringIO()
        writer = csv.writer(buf, self.dialect)
        writer.writerow(row)
        return buf.getvalue()

    def read(self, n):
        # forward all reads to readline(); the requested size is ignored
        return self.readline()
(8-8/42)