Revision 1623
Added by Aaron Marcuse-Kubitza almost 13 years ago
csvs.py | ||
---|---|---|
3 | 3 |
import csv |
4 | 4 |
import StringIO |
5 | 5 |
|
6 |
import strings |
|
6 | 7 |
import util |
7 | 8 |
|
8 |
delimiters = ',\t`' |
|
9 |
delims = ',\t`' |
|
10 |
tsv_delim = '\t' |
|
11 |
escape = '\\' |
|
9 | 12 |
|
13 |
ending_placeholder = r'\n' |
|
14 |
|
|
15 |
def is_tsv(dialect): return dialect.delimiter == tsv_delim |
|
16 |
|
|
10 | 17 |
def sniff(line): |
11 | 18 |
'''Automatically detects the dialect''' |
12 |
dialect = csv.Sniffer().sniff(line, delimiters)
|
|
19 |
dialect = csv.Sniffer().sniff(line, delims) |
|
13 | 20 |
# TSVs usually don't quote fields (nor doublequote embedded quotes) |
14 |
if dialect.delimiter == '\t': dialect.quoting = csv.QUOTE_NONE
|
|
21 |
if is_tsv(dialect): dialect.quoting = csv.QUOTE_NONE
|
|
15 | 22 |
else: dialect.doublequote = True # Sniffer doesn't turn this on by default |
16 | 23 |
return dialect |
17 | 24 |
|
... | ... | |
23 | 30 |
info.dialect = sniff(info.header_line) |
24 | 31 |
return info |
25 | 32 |
|
33 |
class TsvReader: |
|
34 |
'''Unlike csv.reader, for TSVs, interprets \ as escaping a line ending but |
|
35 |
ignores it before everything else (e.g. \N for NULL). Also interprets the |
|
36 |
'\n' escape sequence as a newline.''' |
|
37 |
def __init__(self, stream, dialect): |
|
38 |
assert is_tsv(dialect) |
|
39 |
self.stream = stream |
|
40 |
self.dialect = dialect |
|
41 |
|
|
42 |
def __iter__(self): return self |
|
43 |
|
|
44 |
def next(self): |
|
45 |
record = '' |
|
46 |
ending = None |
|
47 |
while True: |
|
48 |
line = self.stream.readline() |
|
49 |
if line == '': raise StopIteration |
|
50 |
|
|
51 |
raw_contents, ending = strings.extract_line_ending(line) |
|
52 |
contents = strings.remove_suffix(escape, raw_contents) |
|
53 |
record += contents |
|
54 |
if len(contents) == len(raw_contents): break # no line continuation |
|
55 |
record += ending_placeholder |
|
56 |
row = csv.reader(StringIO.StringIO(record), self.dialect).next() |
|
57 |
return [v.replace(ending_placeholder, '\n') for v in row] |
|
58 |
|
|
59 |
def reader_class(dialect): |
|
60 |
if is_tsv(dialect): return TsvReader |
|
61 |
else: return csv.reader |
|
62 |
|
|
63 |
def make_reader(stream, dialect): return reader_class(dialect)(stream, dialect) |
|
64 |
|
|
26 | 65 |
def reader_and_header(stream): |
27 | 66 |
'''Automatically detects the dialect based on the header line |
28 | 67 |
@return tuple (reader, header)''' |
29 | 68 |
info = stream_info(stream) |
30 |
header = csv.reader(StringIO.StringIO(info.header_line), |
|
69 |
reader_class_ = reader_class(info.dialect) |
|
70 |
header = reader_class_(StringIO.StringIO(info.header_line), |
|
31 | 71 |
info.dialect).next() |
32 |
return (csv.reader(stream, info.dialect), header)
|
|
72 |
return (reader_class_(stream, info.dialect), header)
|
|
33 | 73 |
|
34 | 74 |
##### csv modifications |
35 | 75 |
|
Also available in: Unified diff
csvs.py: Added TsvReader to support TSV quirks. Added reader_class(). reader_and_header(): Use reader_class() to automatically use TsvReader instead of csv.reader for TSVs. Added is_tsv() and used it in place of the inline `dialect.delimiter == '\t'` check.