Project

General

Profile

#!/usr/bin/env python
# Canonicalizes a spreadsheet column to a vocabulary.
# The column header is also canonicalized. CSVs without a header are supported.
# Unrecognized names are left untouched, permitting successive runs on different
# vocabularies.
# Raises an error if two vocabulary terms map to the same simplified string.
# Case- and punctuation-insensitive.
8
9
import csv
10
import re
11
import sys
12
13 7449 aaronmk
class OnceOnlyDict(dict):
    '''A dict that only allows each key to be assigned once (no overwriting).

    The restriction applies to item assignment, update() and the constructor.
    Attempting to assign a key that is already present raises KeyError(key).
    '''

    def __init__(self, *args, **kwargs):
        '''Accepts the same arguments as dict(), checking for repeated keys.'''
        dict.__init__(self)
        self.update(*args, **kwargs)

    def __setitem__(self, key, value):
        '''Stores value under key.

        Raises KeyError(key) if key has already been assigned.
        '''
        if key in self: raise KeyError(key)
        dict.__setitem__(self, key, value)

    def update(self, *args, **kwargs):
        '''Like dict.update(), but refuses to overwrite existing keys.

        dict.update() does not call __setitem__, so it is reimplemented here to
        route every assignment through the once-only check. Entries assigned
        before a KeyError is raised remain in the dict.
        '''
        if len(args) > 1:
            raise TypeError('update expected at most 1 arguments, got %d'
                % len(args))
        if args:
            other = args[0]
            if hasattr(other, 'keys'): # mapping
                for key in other.keys(): self[key] = other[key]
            else: # iterable of (key, value) pairs
                for key, value in other: self[key] = value
        for key, value in kwargs.items(): self[key] = value
19
20 10283 aaronmk
def simplify(str_):
    '''Returns str_ lowercased, with every run of characters other than ASCII
    letters and digits removed.

    Two strings that differ only in case or punctuation/whitespace simplify to
    the same value.
    '''
    lowered = str_.lower()
    return re.sub(r'[^a-zA-Z0-9]+', r'', lowered)
21 4586 aaronmk
22
def main():
    '''Canonicalizes one column of the CSV on stdin and writes it to stdout.

    Command line: <prog> col# vocab
    col# is the index (as used for Python list indexing) of the column to
    canonicalize; vocab is the path of a CSV whose first column lists the
    canonical terms.

    Each cell in the chosen column whose simplified form matches a vocabulary
    term's simplified form is replaced by that vocabulary term. All other cells,
    and rows too short to contain the column, are written through unchanged.

    Raises SystemExit with a usage message if the arguments are malformed, and
    KeyError if two vocabulary terms simplify to the same string.
    '''
    try:
        _prog_name, col_num, vocab_path = sys.argv
        col_num = int(col_num)
    except ValueError: raise SystemExit('Usage: '+sys.argv[0]
        +' <in col# vocab [| '+sys.argv[0]+' col# vocab_2]... >out')
    
    # Get vocab
    vocab = OnceOnlyDict()
    # The csv module wants a binary-mode file on Python 2, but a text-mode file
    # opened with newline='' on Python 3
    if sys.version_info[0] >= 3: stream = open(vocab_path, 'r', newline='')
    else: stream = open(vocab_path, 'rb')
    with stream: # closes the file even if a duplicate term raises KeyError
        for row in csv.reader(stream):
            if not row: continue # blank line
            vocab[simplify(row[0])] = row[0]
    
    # Canonicalize input
    writer = csv.writer(sys.stdout)
    for row in csv.reader(sys.stdin):
        try: value = row[col_num]
        except IndexError: pass # row lacks this column (e.g. blank line)
        else: row[col_num] = vocab.get(simplify(value), value)
        writer.writerow(row)
43
44
# Run only when executed as a script, so the module can be imported (e.g. to
# reuse simplify()) without consuming stdin
if __name__ == '__main__': main()