/bin/canon - BIEN 3 - NCEAS Projects

root/bin/canon @ 4637

       #!/usr/bin/env python
       # Canonicalizes a spreadsheet column to a vocabulary.
       # Unrecognized names are left untouched, permitting successive runs on different
       # vocabularies.
       # Case- and punctuation-insensitive.
       import csv
       import re
       import sys
       def simplify(str_):
           """Return the comparison key for a name.

           The name is lowercased and every run of non-word characters and
           underscores is removed, so names differing only in case or punctuation
           produce the same key.
           """
           lowered = str_.lower()
           non_alnum = re.compile(r'[\W_]+')
           return non_alnum.sub(r'', lowered)
       def main():
           """Canonicalize one column of a CSV stream against a vocabulary.

           Command-line arguments (after the program name):
             col#   index into each input row of the column to canonicalize
                    (used directly as a Python list index, so zero-based)
             vocab  path to a CSV file; its first row is skipped as a header and
                    the first field of every following row is a canonical term

           Reads CSV from stdin and writes CSV to stdout. The first input row is
           passed through as the header. In each following row, the target column
           is replaced by the vocabulary term with the same simplify() key, if
           there is one; otherwise the row is written unchanged. Rows that lack
           the target column are also written unchanged.

           Raises SystemExit with a usage message if the argument count is wrong
           or col# is not an integer.
           """
           try:
               _prog_name, col_num, vocab_path = sys.argv
               col_num = int(col_num)
           except ValueError: raise SystemExit('Usage: '+sys.argv[0]
               +' <in col# vocab [| '+sys.argv[0]+' col# vocab_2]... >out')

           # Get vocab
           # The csv module wants a binary-mode file on Python 2 but a text-mode
           # file opened with newline='' on Python 3.
           if sys.version_info[0] >= 3: stream = open(vocab_path, newline='')
           else: stream = open(vocab_path, 'rb')
           dict_ = {}
           with stream: # closes the file even if reading fails
               reader = csv.reader(stream)
               next(reader, None) # skip header (tolerating an empty file)
               for row in reader:
                   if not row: continue # csv yields [] for blank lines
                   dict_[simplify(row[0])] = row[0]

           # Canonicalize input
           reader = csv.reader(sys.stdin)
           writer = csv.writer(sys.stdout)
           header = next(reader, None)
           if header is None: return # empty input: nothing to pass through
           writer.writerow(header) # pass through header
           for row in reader:
               try: term = simplify(row[col_num])
               except IndexError: pass # row lacks the column; leave it untouched
               else: row[col_num] = dict_.get(term, row[col_num])
               writer.writerow(row)
       # Script entry point: main() is invoked unconditionally when this file is
       # executed (there is no __main__ guard, so importing it would also run it).
       main()

(2-2/61)