Project

General

Profile

« Previous | Next » 

Revision 4586

Added canon

View differences:

bin/canon
1
#!/usr/bin/env python
2
# Canonicalizes a spreadsheet column to a vocabulary.
3
# Unrecognized names are left untouched, permitting successive runs on different
4
# vocabularies.
5
# Case- and punctuation-insensitive.
6

  
7
import csv
8
import re
9
import sys
10

  
11
def simplify(str_):
    """Return str_ lowercased with all non-alphanumeric characters removed.

    Used as the comparison key so that lookups ignore case and punctuation:
    every run of non-word characters and/or underscores is deleted.
    """
    lowered = str_.lower()
    return re.sub(r'[\W_]+', r'', lowered)
12

  
13
def main():
    """Canonicalize one column of a CSV stream against a vocabulary file.

    Command line: ``<prog> col# vocab``.

    * ``col#`` is the integer index into each input row (as used by Python
      list indexing, so 0 is the first column) of the cell to canonicalize.
    * ``vocab`` is the path of a CSV file with a header row followed by one
      term per row.

    CSV is read from stdin and written to stdout. The first input row is
    passed through as the header. For every other row, if the simplified form
    of the selected cell matches the simplified form of a vocabulary term, the
    cell is replaced by that term; otherwise the row is written unchanged.

    Raises:
        SystemExit: with a usage message if the argument count is wrong or
            ``col#`` is not an integer.
    """
    try:
        _prog_name, col_num, vocab_path = sys.argv
        col_num = int(col_num)
    except ValueError:
        # Covers both a wrong number of arguments and a non-numeric col#.
        raise SystemExit('Usage: '+sys.argv[0]
            +' <in col# vocab [| '+sys.argv[0]+' col# vocab_2]... >out')

    # Get vocab
    vocab = {}
    # The csv module wants binary mode on Python 2 but text mode with
    # newline='' on Python 3.
    if sys.version_info[0] >= 3:
        stream = open(vocab_path, newline='')
    else:
        stream = open(vocab_path, 'rb')
    with stream:  # closed even if a row fails to parse
        reader = csv.reader(stream)
        next(reader, None)  # skip header; tolerate an empty vocab file
        for vocab_row in reader:
            if not vocab_row:
                continue  # blank line
            term, = vocab_row  # each vocab row must hold exactly one term
            vocab[simplify(term)] = term

    # Canonicalize input
    reader = csv.reader(sys.stdin)
    writer = csv.writer(sys.stdout)
    header = next(reader, None)
    if header is None:
        return  # empty input: nothing to pass through
    writer.writerow(header)
    for row in reader:
        try:
            cell = row[col_num]
        except IndexError:
            pass  # row has no such column (e.g. blank line): leave untouched
        else:
            # Unrecognized names are left untouched
            row[col_num] = vocab.get(simplify(cell), cell)
        writer.writerow(row)
36

  
37
# Run only when executed as a script, so importing this module has no side
# effects.
if __name__ == '__main__':
    main()
0 38

  

Also available in: Unified diff