Project

General

Profile

#!/usr/bin/env python
# Canonicalizes a spreadsheet column to a vocabulary.
# The column header is also canonicalized. CSVs without a header are supported.
# Unrecognized names are left untouched, permitting successive runs on different
# vocabularies.
# Case- and punctuation-insensitive.
7

    
8
import csv
9
import re
10
import sys
11

    
12
def simplify(str_):
    """Return a normalized lookup key for `str_`.

    The string is lower-cased, then every run of non-word characters and
    underscores is removed, so that names compare case- and
    punctuation-insensitively.
    """
    lowered = str_.lower()
    return re.sub(r'[\W_]+', r'', lowered)
13

    
14
def _open_csv_for_reading(path):
    """Open `path` in the mode the csv module expects on the running Python."""
    if sys.version_info[0] >= 3:
        # Python 3's csv module needs a text stream with newline translation
        # disabled so that quoted embedded newlines survive.
        return open(path, 'r', newline='')
    # Python 2's csv module needs a binary stream.
    return open(path, 'rb')


def _load_vocab(vocab_path):
    """Map the simplified form of each vocabulary term to its original spelling.

    Only the first column of the vocabulary CSV is used. Blank lines are
    skipped. If two terms simplify to the same key, the later one wins.
    """
    vocab = {}
    # `with` guarantees the file is closed even if parsing raises.
    with _open_csv_for_reading(vocab_path) as stream:
        for row in csv.reader(stream):
            if not row:
                continue  # csv.reader yields [] for a blank line
            vocab[simplify(row[0])] = row[0]
    return vocab


def main():
    """Canonicalize one column of the CSV on stdin and write the result to stdout.

    Command-line arguments (taken from sys.argv):
        col#  -- integer used directly as a Python list index into each row
                 (so 0 is the first column).
        vocab -- path to a CSV file whose first column lists the canonical
                 terms.

    Every input row, including any header row, is treated the same way: if
    the simplified value of the chosen column matches a simplified vocabulary
    term, the cell is replaced by the vocabulary's spelling; otherwise the row
    is written unchanged. Rows too short to contain the chosen column are also
    written unchanged.

    Raises:
        SystemExit: with a usage message if the argument count is wrong.
        ValueError: if col# is not an integer.
    """
    try:
        _prog_name, col_num, vocab_path = sys.argv
    except ValueError:
        raise SystemExit('Usage: '+sys.argv[0]
            +' <in col# vocab [| '+sys.argv[0]+' col# vocab_2]... >out')
    col_num = int(col_num)

    # Get vocab
    vocab = _load_vocab(vocab_path)

    # Canonicalize input
    reader = csv.reader(sys.stdin)
    writer = csv.writer(sys.stdout)
    for row in reader:
        try:
            cell = row[col_num]
        except IndexError:
            # Blank or short row: nothing to canonicalize, pass it through.
            writer.writerow(row)
            continue
        # Unrecognized terms keep their original text.
        row[col_num] = vocab.get(simplify(cell), cell)
        writer.writerow(row)
35

    
36
# Run the command-line entry point only when executed as a script, so that
# importing this module does not start reading stdin or exit with a usage error.
if __name__ == '__main__':
    main()
(2-2/68)