Project

General

Profile

1
# CSV I/O
2

    
3
import csv
4
import StringIO
5

    
6
import strings
7
import util
8

    
9
# Candidate field delimiters offered to csv.Sniffer during dialect detection
delims = ',;\t`'
# Delimiter whose presence marks a dialect as TSV
tsv_delim = '\t'
# Character that escapes a line ending when it is the last one on a TSV line
escape = '\\'

# Two-character text (backslash, n) that stands in for a line ending inside a
# record until the parsed fields are unescaped
ending_placeholder = r'\n'
14

    
15
def is_tsv(dialect):
    '''Returns whether the dialect's delimiter is the TSV delimiter'''
    uses_tsv_delim = dialect.delimiter == tsv_delim
    return uses_tsv_delim
16

    
17
def sniff(line):
    '''Automatically detects the dialect of a sample line.
    @param line Text to analyze; only the delimiters in delims are considered
    @return The dialect found by csv.Sniffer, with quoting disabled for TSVs
        and doublequote enabled for everything else
    '''
    detected = csv.Sniffer().sniff(line, delims)
    if not is_tsv(detected):
        # Sniffer doesn't turn this on by default
        detected.doublequote = True
    else:
        # TSVs usually don't quote fields (nor doublequote embedded quotes)
        detected.quoting = csv.QUOTE_NONE
    return detected
24

    
25
def stream_info(stream, parse_header=False):
    '''Automatically detects the dialect based on the header line.
    Reads (and therefore consumes) the first line of the stream.
    @param stream Object whose readline() supplies the header line
    @param parse_header Whether to also split the header line into fields
    @return NamedTuple {header_line, header, dialect}. header stays None
        unless parse_header is set and the stream is non-empty; dialect is
        None for an empty stream.
    '''
    info = util.NamedTuple()
    header_line = stream.readline()
    info.header_line = header_line
    info.header = None

    if header_line == '': # line of '' indicates EOF = empty stream
        info.dialect = None
        return info

    info.dialect = sniff(header_line)
    if parse_header:
        # Parse just the header text with the same reader type that the rest
        # of the stream would use
        make = reader_class(info.dialect)
        header_stream = StringIO.StringIO(header_line)
        info.header = make(header_stream, info.dialect).next()
    return info
38

    
39
# (raw, encoded) pairs: a tab is written as the text backslash-t, and a
# newline is written as a backslash followed by the newline itself (an escaped
# line ending). Presumably applied when writing TSV values -- the consumer is
# not in this section.
tsv_encode_map = [('\t', r'\t'), ('\n', '\\\n')]
43

    
44
class TsvReader:
    r'''Reads rows from a TSV stream, one row per next() call.
    Unlike csv.reader, for TSVs, interprets \ as escaping a line ending but
    ignores it before everything else (e.g. \N for NULL).
    Also interprets '\n' as a newline and '\t' as a tab.
    '''
    def __init__(self, stream, dialect):
        '''
        @param stream Object whose readline() supplies the TSV lines
        @param dialect csv dialect to parse with; must satisfy is_tsv()
        '''
        assert is_tsv(dialect)
        self.stream = stream
        self.dialect = dialect
    
    def __iter__(self): return self
    
    def next(self):
        '''Reads the next record, joining physical lines whose line ending is
        escaped, and splits it into fields.
        @return list of field values with \\n and \\t sequences unescaped
        @raise StopIteration at the end of the stream. Note that a record
            cut off by EOF right after an escaped line ending is discarded.
        '''
        record = ''
        while True:
            line = self.stream.readline()
            if line == '': raise StopIteration
            
            # NOTE(review): extract_line_ending() presumably returns
            # (contents, line ending) and remove_suffix() drops a trailing
            # escape char if present -- confirm against the strings module
            raw_contents, ending = strings.extract_line_ending(line)
            contents = strings.remove_suffix(escape, raw_contents)
            record += contents
            if len(contents) == len(raw_contents): break # no line continuation
            record += ending_placeholder
        
        # Prevent "new-line character seen in unquoted field" errors
        record = record.replace('\r', ending_placeholder)
        
        # Hand the record to csv.reader as a one-element list of lines rather
        # than a file-like object: an empty record (blank line) wrapped in a
        # StringIO contains no lines at all, so csv.reader would raise
        # StopIteration and end the iteration early instead of yielding []
        row = csv.reader([record], self.dialect).next()
        return [v.replace(r'\n', '\n').replace(r'\t', '\t') for v in row]
74

    
75
def reader_class(dialect):
    '''Picks the reader implementation for a dialect.
    @return TsvReader for TSV dialects, csv.reader for all others'''
    return TsvReader if is_tsv(dialect) else csv.reader
78

    
79
def make_reader(stream, dialect):
    '''Creates a reader of the type chosen by reader_class() over a stream'''
    reader_factory = reader_class(dialect)
    return reader_factory(stream, dialect)
80

    
81
def reader_and_header(stream):
    '''Automatically detects the dialect based on the header line.
    The header line is consumed from the stream, so the returned reader
    continues with the line after it.
    @param stream Object whose readline() supplies the header line
    @return tuple (reader, header). For an empty stream, the reader is
        already exhausted and the header is None.
    '''
    info = stream_info(stream, parse_header=True)
    if info.dialect is None:
        # Empty stream: stream_info() found no dialect, and make_reader()
        # would fail in is_tsv() when reading None.delimiter
        return (csv.reader([]), info.header)
    return (make_reader(stream, info.dialect), info.header)
86

    
87
##### csv modifications
88

    
89
# Note that these methods only work on *instances* of Dialect classes, and
# compare only the attributes stored on the instance itself (its __dict__)
def _dialect_eq(self, other):
    '''Compares two dialect instances by their instance attributes.
    Returns NotImplemented for objects without a __dict__ (e.g. None), so that
    such comparisons evaluate to "not equal" instead of raising AttributeError.
    '''
    try: other_attrs = other.__dict__
    except AttributeError: return NotImplemented
    return self.__dict__ == other_attrs

csv.Dialect.__eq__ = _dialect_eq
csv.Dialect.__ne__ = lambda self, other: not (self == other)
92

    
93
##### Row filters
94

    
95
class Filter:
96
    '''Wraps a reader, filtering each row'''
97
    def __init__(self, filter_, reader):
98
        self.reader = reader
99
        self.filter = filter_
100
    
101
    def __iter__(self): return self
102
    
103
    def next(self): return self.filter(self.reader.next())
104

    
105
# Text treated as NULL by default: backslash-N
std_nulls = [r'\N']
# Same as std_nulls, but with the empty string also treated as NULL
empty_nulls = [''] + list(std_nulls)
107

    
108
class NullFilter(Filter):
    '''Replaces field values that denote NULL with None'''
    def __init__(self, reader, nulls=std_nulls):
        '''
        @param reader Object whose next() supplies the rows
        @param nulls Values to translate to None; all others pass unchanged
        '''
        to_none = dict((null, None) for null in nulls)
        
        def nullify(row):
            return [to_none.get(value, value) for value in row]
        
        Filter.__init__(self, nullify, reader)
114

    
115
class StripFilter(Filter):
    '''Removes leading and trailing whitespace from every field of each row'''
    def __init__(self, reader):
        '''@param reader Object whose next() supplies the rows'''
        def strip_row(row):
            stripped = []
            for value in row:
                stripped.append(value.strip())
            return stripped
        
        Filter.__init__(self, strip_row, reader)
(8-8/41)