Project

General

Profile

1
#!/usr/bin/env python
2
# Canonicalizes a spreadsheet column to a vocabulary.
# Unrecognized names are left untouched, permitting successive runs on
# different vocabularies.
# Matching is case- and punctuation-insensitive.
6

    
7
import csv
8
import re
9
import sys
10

    
11
def simplify(str_):
    """Return the matching key for `str_`.

    The text is lowercased and every run of non-word characters and
    underscores is removed, so names that differ only in case or punctuation
    produce the same key.
    """
    lowered = str_.lower()
    return re.sub(r'[\W_]+', r'', lowered)
12

    
13
def _open_csv(path):
    """Open `path` for reading in the mode the csv module expects."""
    if sys.version_info[0] >= 3:
        # Python 3's csv reads text and wants newline translation disabled.
        # The default encoding is kept so the file is decoded like sys.stdin.
        return open(path, newline='')
    return open(path, 'rb') # Python 2's csv reads bytes


def _load_vocab(vocab_path):
    """Map each simplified vocabulary term to its canonical spelling.

    The first row of the file is treated as a header and skipped. Only the
    first column of each remaining row is used; blank rows are ignored. When
    two terms simplify to the same key, the later one wins.
    """
    vocab = {}
    with _open_csv(vocab_path) as stream: # closed even if parsing fails
        reader = csv.reader(stream)
        next(reader, None) # skip header; tolerate an empty file
        for row in reader:
            if row: vocab[simplify(row[0])] = row[0]
    return vocab


def main():
    """Canonicalize one column of the CSV on stdin against a vocabulary.

    Command line: `col# vocab`, where `col#` is the index into each row
    (0-based; negative values count from the end, as with any list index) and
    `vocab` is the path of a CSV file whose first column holds the canonical
    terms below a header row.

    The input header row is passed through unchanged. In every other row the
    chosen cell is replaced by the vocabulary term with the same simplified
    form, if there is one; otherwise the row is written out untouched. Rows
    too short to contain the column are also written out untouched.

    Raises SystemExit with a usage message when the arguments are missing,
    superfluous, or `col#` is not an integer.
    """
    usage = ('Usage: '+sys.argv[0]
        +' <in col# vocab [| '+sys.argv[0]+' col# vocab_2]... >out')
    try:
        _prog_name, col_num, vocab_path = sys.argv
        col_num = int(col_num)
    except ValueError: raise SystemExit(usage) # wrong arg count or bad col#

    # Get vocab
    vocab = _load_vocab(vocab_path)

    # Canonicalize input
    reader = csv.reader(sys.stdin)
    writer = csv.writer(sys.stdout)
    header = next(reader, None)
    if header is None: return # empty input: nothing to write
    writer.writerow(header) # pass through header
    for row in reader:
        try: term = simplify(row[col_num])
        except IndexError: pass # row has no such column; leave it as is
        else: row[col_num] = vocab.get(term, row[col_num])
        writer.writerow(row)
36

    
37
# Run only when executed as a script, so that importing this module does not
# start reading sys.argv and stdin.
if __name__ == '__main__':
    main()
(2-2/61)