Revision 7449
Added by Aaron Marcuse-Kubitza almost 12 years ago
bin/canon | ||
---|---|---|
3 | 3 |
# The column header is also canonicalized. CSVs without a header are supported. |
4 | 4 |
# Unrecognized names are left untouched, permitting successive runs on different |
5 | 5 |
# vocabularies. |
6 |
# Raises an error if two input terms map to the same simplified string. |
|
6 | 7 |
# Case- and punctuation-insensitive. |
7 | 8 |
|
8 | 9 |
import csv |
9 | 10 |
import re |
10 | 11 |
import sys |
11 | 12 |
|
13 |
class OnceOnlyDict(dict):
    '''A dict that only allows each key to be assigned once (no overwriting).

    Assigning to a key that is already present raises KeyError(key) instead of
    replacing the stored value. The same rule applies to update(), which a
    plain dict subclass would otherwise let bypass __setitem__.
    '''

    def __setitem__(self, key, value):
        '''Store value under key; raise KeyError(key) if key is already set.'''
        if key in self:
            raise KeyError(key)
        dict.__setitem__(self, key, value)

    def update(self, *args, **kwargs):
        '''Add several new entries at once; accepts the same arguments as
        dict.update().

        Raises KeyError(key) for the first incoming key that is already
        present. All keys are checked before anything is stored, so a failed
        update leaves the dict unchanged.
        '''
        # Normalize mapping / iterable-of-pairs / keyword forms into one dict.
        incoming = dict(*args, **kwargs)
        for key in incoming:
            if key in self:
                raise KeyError(key)
        dict.update(self, incoming)
|
19 |
|
|
12 | 20 |
def simplify(str_):
    '''Return the comparison key for a term: lowercased, with every run of
    non-word characters and underscores removed.'''
    lowered = str_.lower()
    return re.sub(r'[\W_]+', r'', lowered)
13 | 21 |
|
14 | 22 |
def main(): |
... | ... | |
18 | 26 |
col_num = int(col_num) |
19 | 27 |
|
20 | 28 |
# Get vocab |
21 |
dict_ = {}
|
|
29 |
dict_ = OnceOnlyDict()
|
|
22 | 30 |
stream = open(vocab_path, 'rb') |
23 | 31 |
reader = csv.reader(stream) |
24 | 32 |
for row in reader: dict_[simplify(row[0])] = row[0] |
Also available in: Unified diff
canon: Raise an error if two input terms map to the same simplified string