Project

General

Profile

1
#!/usr/bin/env python
2
# Finds spreadsheet rows where a column is not in a vocabulary.
3
# The vocabulary should not have a header. CSVs without a header are supported.
4
# Case- and punctuation-insensitive.
5

    
6
import csv
7
import re
8
import sys
9

    
10
def simplify(str_):
    """Return str_ lower-cased with non-word characters and underscores removed.

    Every run of characters matching [\\W_] (punctuation, whitespace,
    underscores, ...) is deleted, so values that differ only in case or
    punctuation simplify to the same string.
    """
    lowered = str_.lower()
    return re.sub(r'[\W_]+', r'', lowered)
11

    
12
def main():
    """Write the stdin CSV rows whose chosen column is not in the vocabulary.

    Command-line arguments (read from sys.argv):
        col#   index into each input row (used as row[col#], so zero-based)
        vocab  path to a CSV file whose first column holds the vocabulary

    Rows are read from stdin as CSV. Each row whose simplify()-ed value in
    column col# is absent from the simplify()-ed vocabulary is written to
    stdout as CSV; all other rows are dropped.

    Raises:
        SystemExit: with a usage message when the number of arguments is
            wrong or col# is not an integer.
        IndexError: when an input row has no column at index col#.
    """
    usage = ('Usage: '+sys.argv[0]
        +' <in col# vocab [| '+sys.argv[0]+' col# vocab_2]... >out')
    try:
        _prog_name, col_num, vocab_path = sys.argv
        # Both the unpacking and int() raise ValueError on bad arguments, so
        # a non-numeric col# also gets the usage message, not a traceback.
        col_num = int(col_num)
    except ValueError: raise SystemExit(usage)

    # Get vocab
    vocab = set()
    # The csv module wants a binary-mode file on Python 2 but a text-mode
    # file opened with newline='' on Python 3.
    if sys.version_info[0] >= 3: stream = open(vocab_path, newline='')
    else: stream = open(vocab_path, 'rb')
    with stream:  # closes the file even if reading the vocabulary fails
        for row in csv.reader(stream):
            # Blank lines parse as an empty row; they carry no term.
            if row: vocab.add(simplify(row[0]))

    # Filter input
    writer = csv.writer(sys.stdout)
    for row in csv.reader(sys.stdin):
        term = simplify(row[col_num])
        if term not in vocab: writer.writerow(row)
31

    
32
# Run the filter. There is no __main__ guard, so this call also executes
# whenever the file is imported as a module.
main()
(25-25/79)