/ - Diff - BIEN 3 - NCEAS Projects

« Previous | Next »

Revision 4586

Added canon

     #!/usr/bin/env python
     # Canonicalizes a spreadsheet column to a vocabulary.
     # Unrecognized names are left untouched, permitting successive runs on different
     # vocabularies.
     # Case- and punctuation-insensitive.
     import csv
     import re
     import sys
     def simplify(str_):
         '''Reduces a name to the key used for case- and punctuation-insensitive
         matching.
         Lowercases str_, then deletes every run of non-word characters and
         underscores, returning what is left.
         '''
         lowered = str_.lower()
         return re.sub(r'[\W_]+', r'', lowered)
     def main():
         '''Canonicalizes one column of the CSV on stdin to a vocabulary.

         Command line: <prog> col# vocab
         col# is used directly as a Python list index into each row (so it is
         0-based). vocab is the path of a one-column CSV file whose first row is a
         header.

         The header row of stdin is copied to stdout unchanged. In every later row,
         the value in the chosen column is replaced by the vocabulary term that has
         the same simplify() key; values with no match are left untouched.

         Raises SystemExit with a usage message on a wrong argument count, and
         ValueError if col# is not an integer or a non-blank vocab row does not
         have exactly one column.
         '''
         try: _prog_name, col_num, vocab_path = sys.argv
         except ValueError: raise SystemExit('Usage: '+sys.argv[0]
             +' <in col# vocab [| '+sys.argv[0]+' col# vocab_2]... >out')
         col_num = int(col_num)

         # Get vocab
         # The csv module needs a binary-mode file on Python 2, but a text-mode
         # file opened with newline='' on Python 3
         if sys.version_info[0] >= 3: stream = open(vocab_path, newline='')
         else: stream = open(vocab_path, 'rb')
         dict_ = {}
         with stream: # closes the file even if parsing fails
             reader = csv.reader(stream)
             next(reader, None) # skip header (an empty file has none)
             for row in reader:
                 if not row: continue # blank line
                 term, = row # any other row must have exactly one column
                 dict_[simplify(term)] = term

         # Canonicalize input
         reader = csv.reader(sys.stdin)
         writer = csv.writer(sys.stdout)
         try: header = next(reader)
         except StopIteration: return # empty input, so there is nothing to output
         writer.writerow(header) # pass through header
         for row in reader:
             try: value = row[col_num]
             except IndexError: pass # row lacks this column (e.g. blank line)
             else: row[col_num] = dict_.get(simplify(value), value)
             writer.writerow(row)
     # Run only when executed as a script, so that importing this module (e.g. to
     # reuse simplify()) does not read sys.argv or consume stdin
     if __name__ == '__main__': main()

Also available in: Unified diff