Project

General

Profile

« Previous | Next » 

Revision 7449

canon: Raise an error if two input terms map to the same simplified string

View differences:

bin/canon
3 3
# The column header is also canonicalized. CSVs without a header are supported.
4 4
# Unrecognized names are left untouched, permitting successive runs on different
5 5
# vocabularies.
6
# Raises an error if two input terms map to the same simplified string.
6 7
# Case- and punctuation-insensitive.
7 8

  
8 9
import csv
9 10
import re
10 11
import sys
11 12

  
13
class OnceOnlyDict(dict):
    '''A dict that only allows each key to be assigned once (no overwriting).

    Assigning to a key that is already present raises KeyError(key) and
    leaves the existing value untouched. The check applies to item
    assignment (d[key] = value), to the constructor, and to update().
    '''

    def __init__(self, *args, **kwargs):
        '''Accepts the same arguments as dict(); duplicate keys raise KeyError.'''
        dict.__init__(self)
        # Populate via update() so initial contents pass the once-only check
        self.update(*args, **kwargs)

    def __setitem__(self, key, value):
        '''Stores value under key. Raises KeyError(key) if key already exists.'''
        if key in self: raise KeyError(key)
        dict.__setitem__(self, key, value)

    def update(self, *args, **kwargs):
        '''Adds entries from a mapping or iterable of pairs, plus any keywords.

        The built-in dict.update() does not call an overridden __setitem__, so
        it would silently overwrite existing keys. Each entry is instead
        assigned individually. Raises KeyError(key) at the first key that is
        already present; entries added before that point are kept.
        '''
        if len(args) > 1:
            raise TypeError('update expected at most 1 arguments, got %d'
                % len(args))
        if args:
            other = args[0]
            if hasattr(other, 'keys'): # mapping, same test dict.update() uses
                for key in other.keys(): self[key] = other[key]
            else: # iterable of (key, value) pairs
                for key, value in other: self[key] = value
        for key, value in kwargs.items(): self[key] = value
19

  
12 20
def simplify(str_):
    '''Returns str_ lowercased, with every run of non-word characters and
    underscores removed. Used to compare terms case- and
    punctuation-insensitively.'''
    lowered = str_.lower()
    return re.sub(r'[\W_]+', r'', lowered)
13 21

  
14 22
def main():
......
18 26
    col_num = int(col_num)
19 27
    
20 28
    # Get vocab
21
    dict_ = {}
29
    dict_ = OnceOnlyDict()
22 30
    stream = open(vocab_path, 'rb')
23 31
    reader = csv.reader(stream)
24 32
    for row in reader: dict_[simplify(row[0])] = row[0]

Also available in: Unified diff