Project

General

Profile

« Previous | Next » 

Revision 1623

csvs.py: Added TsvReader to support TSV quirks. Added reader_class(). reader_and_header(): Use reader_class() to automatically use TsvReader instead of csv.reader for TSVs. Added is_tsv() and use it where `dialect.delimiter == '\t'` was used.

View differences:

csvs.py
3 3
import csv
4 4
import StringIO
5 5

  
6
import strings
6 7
import util
7 8

  
8
delimiters = ',\t`'
9
delims = ',\t`'
10
tsv_delim = '\t'
11
escape = '\\'
9 12

  
13
ending_placeholder = r'\n'
14

  
15
def is_tsv(dialect):
    '''Tells whether a csv dialect describes a TSV
    @param dialect csv dialect (any object with a `delimiter` attribute)
    @return bool whether the dialect's delimiter equals tsv_delim'''
    delimiter = dialect.delimiter
    return delimiter == tsv_delim
16

  
10 17
def sniff(line):
    '''Automatically detects the dialect of a line of delimited text
    @param line Sample text to analyze (e.g. a header line)
    @return The dialect found by csv.Sniffer, restricted to the delimiters in
        delims and adjusted as described in the comments below'''
    sniffed = csv.Sniffer().sniff(line, delims)
    if not is_tsv(sniffed):
        # Sniffer doesn't turn this on by default
        sniffed.doublequote = True
    else:
        # TSVs usually don't quote fields (nor doublequote embedded quotes)
        sniffed.quoting = csv.QUOTE_NONE
    return sniffed
17 24

  
......
23 30
    info.dialect = sniff(info.header_line)
24 31
    return info
25 32

  
33
class TsvReader:
    r'''Unlike csv.reader, for TSVs, interprets \ as escaping a line ending but
    ignores it before everything else (e.g. \N for NULL). Also interprets the
    '\n' escape sequence as a newline.
    
    Iterating yields one list of field values per logical record. A logical
    record spans several physical lines when those lines end in the escape
    character.'''
    def __init__(self, stream, dialect):
        '''@param stream File-like object providing readline()
        @param dialect csv dialect; must satisfy is_tsv()'''
        assert is_tsv(dialect) # internal invariant: only used for TSVs
        self.stream = stream
        self.dialect = dialect
    
    def __iter__(self): return self
    
    def next(self):
        '''Reads the next logical record from the stream
        @return list of the record's field values ([] for a blank line)
        @raise StopIteration when the stream has no more records'''
        parts = [] # pieces of the record; joined once at the end
        while True:
            line = self.stream.readline()
            if line == '': # EOF
                if not parts: raise StopIteration
                # EOF right after a line continuation: return what was read
                # so far instead of silently dropping it
                break
            
            # extract_line_ending() returns (contents, ending); only the
            # contents are needed here
            raw_contents = strings.extract_line_ending(line)[0]
            contents = strings.remove_suffix(escape, raw_contents)
            parts.append(contents)
            if len(contents) == len(raw_contents): break # no line continuation
            # The escaped line ending is kept as a placeholder so that the
            # record stays on one line for csv.reader
            parts.append(ending_placeholder)
        record = ''.join(parts)
        
        # Parse the record as a single line. The default covers a reader that
        # yields no row, so a blank line never raises StopIteration from here
        # (which would end the whole iteration early).
        row = next(csv.reader([record], self.dialect), [])
        return [v.replace(ending_placeholder, '\n') for v in row]
58

  
59
def reader_class(dialect):
    '''Picks the reader implementation to use for a dialect
    @param dialect csv dialect
    @return TsvReader if is_tsv(dialect), otherwise csv.reader'''
    return TsvReader if is_tsv(dialect) else csv.reader
62

  
63
def make_reader(stream, dialect):
    '''Creates a reader of the class chosen by reader_class()
    @param stream Passed through as the reader's first argument
    @param dialect csv dialect; selects the reader class and is passed to it
    @return The new reader'''
    cls = reader_class(dialect)
    return cls(stream, dialect)
64

  
26 65
def reader_and_header(stream):
    '''Automatically detects the dialect based on the header line
    @param stream The stream to read; passed to stream_info()
    @return tuple (reader, header), where header is the first row parsed from
        the header line and reader is a reader over stream using the detected
        dialect'''
    info = stream_info(stream)
    dialect = info.dialect
    # TsvReader for TSVs, csv.reader otherwise
    reader_class_ = reader_class(dialect)
    header_reader = reader_class_(StringIO.StringIO(info.header_line), dialect)
    header = header_reader.next()
    return (reader_class_(stream, dialect), header)
33 73

  
34 74
##### csv modifications
35 75

  

Also available in: Unified diff