Project

General

Profile

1
# CSV I/O
2

    
3
import csv
4
import StringIO
5

    
6
import strings
7
import util
8

    
9
# Delimiter whose presence marks a dialect as TSV (see is_tsv())
tsv_delim = '\t'
# Candidate field delimiters offered to the sniffer: comma, tab, backtick
delims = ',' + tsv_delim + '`'
# Character that TsvReader strips from the end of a line to continue a record
escape = '\\'

# Two-character text (backslash, n) inserted where a line ending was escaped;
# TsvReader turns it back into a real newline after the fields are split
ending_placeholder = r'\n'
14

    
15
def is_tsv(dialect):
    '''Tells whether a dialect is a TSV dialect
    @param dialect Object with a delimiter attribute
    @return True if the dialect's delimiter equals tsv_delim'''
    return dialect.delimiter == tsv_delim
16

    
17
def sniff(line):
    '''Automatically detects the dialect
    @param line Sample text to sniff, restricted to the delimiters in delims
    @return The sniffed dialect, adjusted as follows: TSV dialects get quoting
        turned off; all other dialects get doublequote turned on'''
    sniffed = csv.Sniffer().sniff(line, delims)
    if not is_tsv(sniffed):
        # Sniffer doesn't turn this on by default
        sniffed.doublequote = True
        return sniffed
    # TSVs usually don't quote fields (nor doublequote embedded quotes)
    sniffed.quoting = csv.QUOTE_NONE
    return sniffed
24

    
25
def stream_info(stream):
    '''Automatically detects the dialect based on the header line.
    Consumes the first line of the stream.
    @param stream Object with a readline() method
    @return NamedTuple {header_line, dialect}'''
    info = util.NamedTuple()
    first_line = stream.readline()
    info.header_line = first_line
    info.dialect = sniff(first_line)
    return info
32

    
33
class TsvReader:
    r'''Unlike csv.reader, for TSVs, interprets \ as escaping a line ending but
    ignores it before everything else (e.g. \N for NULL). Also interprets the
    '\n' escape sequence as a newline.
    
    Iterating yields one list of field values per record. A blank line yields
    an empty list, as csv.reader does.'''
    # NOTE: the docstring above is a raw string so that its backslash
    # sequences are shown literally instead of being interpreted as escapes
    
    def __init__(self, stream, dialect):
        '''@param stream Object with a readline() method that returns '' at EOF
        @param dialect Dialect used to split fields; must satisfy is_tsv()'''
        assert is_tsv(dialect)
        self.stream = stream
        self.dialect = dialect
    
    def __iter__(self): return self
    
    def next(self):
        '''Reads the next record, joining physical lines that end in the
        escape character.
        @return list of field values, with each ending_placeholder occurrence
            replaced by a newline
        @raise StopIteration at the end of the stream'''
        parts = []
        while True:
            line = self.stream.readline()
            if line == '': # EOF
                # EOF right after a continued line: return what was read so far
                # instead of silently dropping the record
                if parts: break
                raise StopIteration
            
            # presumably splits the line into its text and its line ending;
            # the ending itself is not needed here
            raw_contents, _ending = strings.extract_line_ending(line)
            contents = strings.remove_suffix(escape, raw_contents)
            parts.append(contents)
            if len(contents) == len(raw_contents): break # no line continuation
            parts.append(ending_placeholder)
        record = ''.join(parts)
        
        # An empty record would make the one-shot csv.reader below raise
        # StopIteration, which would end the whole iteration prematurely
        if record == '': return []
        
        row = next(csv.reader(StringIO.StringIO(record), self.dialect))
        return [v.replace(ending_placeholder, '\n') for v in row]
    
    # Alias so instances also satisfy the Python 3 iterator protocol
    __next__ = next
58

    
59
def reader_class(dialect):
    '''Picks the reader to use for a dialect
    @return TsvReader for TSV dialects (see is_tsv()), csv.reader otherwise'''
    return TsvReader if is_tsv(dialect) else csv.reader
62

    
63
def make_reader(stream, dialect):
    '''Creates a reader over the stream, using the reader chosen by
    reader_class() for the dialect
    @return The new reader'''
    make = reader_class(dialect)
    return make(stream, dialect)
64

    
65
def reader_and_header(stream):
    '''Automatically detects the dialect based on the header line.
    The header line is consumed from the stream, so the returned reader starts
    at the line after it.
    @return tuple (reader, header)'''
    info = stream_info(stream)
    dialect = info.dialect
    make = reader_class(dialect)
    
    # Parse the already-consumed header line with a throwaway reader
    header_stream = StringIO.StringIO(info.header_line)
    header = make(header_stream, dialect).next()
    
    return (make(stream, dialect), header)
73

    
74
##### csv modifications
75

    
76
# Note that these methods only work on *instances* of Dialect classes

def _dialect_eq(self, other):
    '''Two dialects are equal if their instance attributes (__dict__) match.
    @return NotImplemented if other has no __dict__ (e.g. None), so that the
        comparison evaluates to unequal instead of raising AttributeError'''
    try: other_attrs = other.__dict__
    except AttributeError: return NotImplemented
    return self.__dict__ == other_attrs

def _dialect_ne(self, other):
    '''Negation of ==, so that != always agrees with =='''
    return not (self == other)

csv.Dialect.__eq__ = _dialect_eq
csv.Dialect.__ne__ = _dialect_ne
(4-4/20)