Project

General

Profile

1
# CSV I/O
2

    
3
import csv
4
import _csv
5
import StringIO
6

    
7
import strings
8
import util
9

    
10
# Candidate delimiter characters handed to csv.Sniffer by sniff()
delims = ",;\t|`"
# Delimiter prefix that marks a dialect as TSV (see is_tsv())
tsv_delim = "\t"
# Escape character; at the end of a TSV line it continues the record
escape = "\\"

# Two-character text (backslash, n) substituted for embedded line endings
ending_placeholder = r"\n"
15

    
16
def is_tsv(dialect):
    '''Tells whether a dialect is tab-separated
    @param dialect Object with a string `delimiter` attribute
    @return True if the delimiter starts with tsv_delim'''
    return dialect.delimiter.startswith(tsv_delim)
17

    
18
def sniff(line):
    '''Detects the dialect of a single line of delimited text
    @param line The line to inspect; its line ending is split off with
        strings.extract_line_ending() before sniffing
    @return The dialect reported by csv.Sniffer (restricted to the delimiters
        in `delims`), adjusted as described below and with its lineterminator
        set to the ending that was split off'''
    content, line_ending = strings.extract_line_ending(line)
    detected = csv.Sniffer().sniff(content, delims)
    
    if not is_tsv(detected):
        # Sniffer doesn't turn this on by default
        detected.doublequote = True
    else:
        # TSVs usually don't quote fields (nor doublequote embedded quotes)
        detected.quoting = csv.QUOTE_NONE
    detected.lineterminator = line_ending
    
    return detected
29

    
30
def stream_info(stream, parse_header=False):
    '''Reads the header line of a stream and detects the dialect from it
    @param stream Object with a readline() method; exactly one line is consumed
    @param parse_header Whether to also split the header line into fields
    @return NamedTuple {header_line, header, dialect}
        header is None unless parse_header is set and the stream is non-empty;
        dialect is None for an empty stream'''
    info = util.NamedTuple()
    info.header_line = stream.readline()
    info.header = None
    
    # A line of '' indicates EOF = empty stream
    if info.header_line == '':
        info.dialect = None
        return info
    
    info.dialect = sniff(info.header_line)
    if parse_header:
        # Re-read just the header line through the reader matching the dialect
        make_header_reader = reader_class(info.dialect)
        header_stream = StringIO.StringIO(info.header_line)
        info.header = make_header_reader(header_stream, info.dialect).next()
    return info
43

    
44
# Escape pairs for TSV field values: a copy of strings.json_encode_map plus an
# entry pairing a literal tab with the two-character text \t
tsv_encode_map = list(strings.json_encode_map)
tsv_encode_map.append(('\t', r'\t'))
# Result of strings.flip_map() on the above; TsvReader applies it to each value
tsv_decode_map = strings.flip_map(tsv_encode_map)
47

    
48
class TsvReader:
    r'''Unlike csv.reader, for TSVs, interprets \ as escaping a line ending but
    ignores it before everything else (e.g. \N for NULL).
    Also expands tsv_encode_map escapes.
    
    This docstring is a raw string so that its backslashes stay literal.
    '''
    def __init__(self, stream, dialect):
        '''@param stream Object with a readline() method
        @param dialect Dialect for which is_tsv() holds; its delimiter and
            lineterminator are used to split the input'''
        assert is_tsv(dialect)
        self.stream = stream
        self.dialect = dialect
    
    def __iter__(self): return self
    
    def next(self):
        '''Reads one record, which may span several physical lines
        @return list of the record's field values, each passed through
            strings.replace_all() with tsv_decode_map
        @raise StopIteration when readline() returns '' (end of stream)'''
        parts = [] # contents of each physical line making up the record
        while True:
            line = self.stream.readline()
            # NOTE(review): if EOF is hit right after a line that ended in the
            # escape character, the partial record collected so far is
            # discarded -- confirm this is intended
            if line == '': raise StopIteration
            
            line = strings.remove_suffix(self.dialect.lineterminator, line)
            contents = strings.remove_suffix(escape, line)
            parts.append(contents)
            if len(contents) == len(line): break # no line continuation
        
        # Each escaped line ending is represented by the placeholder
        record = ending_placeholder.join(parts)
        
        # Prevent "new-line character seen in unquoted field" errors
        record = record.replace('\r', ending_placeholder)
        
        row = record.split(self.dialect.delimiter)
        return [strings.replace_all(tsv_decode_map, v) for v in row]
78

    
79
def reader_class(dialect):
    '''Picks the reader implementation for a dialect
    @return TsvReader when is_tsv(dialect) holds, otherwise csv.reader'''
    return TsvReader if is_tsv(dialect) else csv.reader
82

    
83
def make_reader(stream, dialect):
    '''Creates the reader selected by reader_class() for the dialect
    @return that reader, constructed with (stream, dialect)'''
    create = reader_class(dialect)
    return create(stream, dialect)
84

    
85
def reader_and_header(stream):
    '''Automatically detects the dialect based on the header line, which is
    consumed from the stream
    @return tuple (reader, header)'''
    info = stream_info(stream, parse_header=True)
    # NOTE(review): for an empty stream info.dialect is None and is passed on
    # to make_reader() unchanged -- confirm callers never pass empty streams
    reader = make_reader(stream, info.dialect)
    return (reader, info.header)
90

    
91
##### csv modifications
92

    
93
# Note that these methods only work on *instances* of Dialect classes
def __Dialect__eq(self, other):
    '''Compares two dialects by their instance attributes
    @return True if both objects have equal __dict__s; False otherwise,
        including when `other` has no __dict__ at all (e.g. None)'''
    # getattr() with a default keeps comparisons against None and other
    # __dict__-less objects from raising AttributeError
    return self.__dict__ == getattr(other, '__dict__', None)

def __Dialect__ne(self, other):
    '''Negation of the == comparison'''
    return not (self == other)

csv.Dialect.__eq__ = __Dialect__eq
csv.Dialect.__ne__ = __Dialect__ne
96

    
97
__Dialect__validate_orig = csv.Dialect._validate
def __Dialect__validate(self):
    '''Replacement for csv.Dialect._validate that permits delimiters longer
    than one character
    
    Runs the original validation and suppresses only the error it raises for
    a multi-character delimiter; every other validation error propagates.
    @raise _csv.Error for any other invalid dialect setting'''
    try: __Dialect__validate_orig(self)
    except _csv.Error as e:
        delimiter = getattr(self, 'delimiter', None)
        # Duck-typed string check so both str and unicode delimiters qualify
        is_multichar = hasattr(delimiter, 'startswith') and len(delimiter) > 1
        # The wording after "must be" differs between Python versions
        # ("an 1-character string", "a 1-character string", ...), so only the
        # stable prefix is matched
        if is_multichar and str(e).startswith('"delimiter" must be'): pass # OK
        else: raise
csv.Dialect._validate = __Dialect__validate
104

    
105
##### Row filters
106

    
107
class Filter:
108
    '''Wraps a reader, filtering each row'''
109
    def __init__(self, filter_, reader):
110
        self.reader = reader
111
        self.filter = filter_
112
    
113
    def __iter__(self): return self
114
    
115
    def next(self): return self.filter(self.reader.next())
116

    
117
# String values treated as NULL by default: the two-character text \N
std_nulls = [r"\N"]
# Same as std_nulls, but with the empty string treated as NULL as well
empty_nulls = [""] + std_nulls
119

    
120
class NullFilter(Filter):
    '''Replaces designated string values with None in every row
    @param reader The reader whose rows are filtered
    @param nulls The values to replace (default: std_nulls)'''
    def __init__(self, reader, nulls=std_nulls):
        # Every null value maps to None; other values fall through unchanged
        null_lookup = dict.fromkeys(nulls, None)
        def nullify(row):
            return [null_lookup.get(value, value) for value in row]
        Filter.__init__(self, nullify, reader)
126

    
127
class StripFilter(Filter):
    '''Calls strip() on every value of every row
    @param reader The reader whose rows are filtered'''
    def __init__(self, reader):
        def strip_row(row):
            return [value.strip() for value in row]
        Filter.__init__(self, strip_row, reader)
(8-8/42)