#!/usr/bin/env python
# Canonicalizes a spreadsheet column to a vocabulary.
# The column header is also canonicalized. CSVs without a header are supported.
# Unrecognized names are left untouched, permitting successive runs on different
# vocabularies.
# Raises an error if two vocabulary terms map to the same simplified string.
# Matching is case-insensitive and ignores every character other than ASCII
# letters and digits (punctuation, whitespace, etc.).
8

    
9
import csv
10
import re
11
import sys
12

    
13
class OnceOnlyDict(dict):
    '''A dict that only allows each key to be assigned once (no overwriting).

    Only item assignment (``d[key] = value``) is guarded; the other mutators
    inherited from dict are not overridden here.
    '''

    def __setitem__(self, key, value):
        '''Store value under key.

        Raises KeyError(key) if key is already present, leaving the existing
        value untouched.
        '''
        if key in self: raise KeyError(key)
        dict.__setitem__(self, key, value)
19

    
20
def simplify(str_):
    '''Return str_ lowercased with every run of characters other than ASCII
    letters and digits removed.'''
    lowered = str_.lower()
    return re.sub(r'[^a-zA-Z0-9]+', r'', lowered)
21

    
22
def main():
    '''Canonicalize one column of the CSV on stdin against a vocabulary file.

    Command line: <prog> col# vocab
      col#  -- index into each row (as used by Python list indexing) of the
               column to canonicalize
      vocab -- path to a CSV file whose first column holds the canonical terms

    Every row read from stdin is written to stdout. If the simplified form of
    the selected cell matches the simplified form of a vocabulary term, the
    cell is replaced by that term; otherwise the row is written unchanged.

    Raises SystemExit with a usage message when the argument count is wrong,
    ValueError when col# is not an integer, KeyError when two vocabulary terms
    simplify to the same string, and IndexError when a non-blank input row
    does not have the requested column.
    '''
    try: _prog_name, col_num, vocab_path = sys.argv
    except ValueError: raise SystemExit('Usage: '+sys.argv[0]
        +' <in col# vocab [| '+sys.argv[0]+' col# vocab_2]... >out')
    col_num = int(col_num)

    # Get vocab
    vocab = OnceOnlyDict() # raises KeyError on colliding simplified terms
    # The csv module wants a binary-mode file on Python 2, but a text-mode
    # file opened with newline='' on Python 3.
    if sys.version_info[0] >= 3: stream = open(vocab_path, 'r', newline='')
    else: stream = open(vocab_path, 'rb')
    with stream: # closed even if a duplicate term raises
        for row in csv.reader(stream):
            if not row: continue # blank line: no term to record
            vocab[simplify(row[0])] = row[0]

    # Canonicalize input
    reader = csv.reader(sys.stdin)
    writer = csv.writer(sys.stdout)
    for row in reader:
        if row: # blank lines are passed through as-is
            cell = row[col_num]
            # Unrecognized names are left untouched
            row[col_num] = vocab.get(simplify(cell), cell)
        writer.writerow(row)
43

    
44
# Run only when executed as a script, so that importing this module does not
# consume sys.argv/stdin.
if __name__ == '__main__': main()
(5-5/87)