Project

General

Profile

« Previous | Next » 

Revision 4593

Added filter_out_ci

View differences:

bin/filter_out_ci
1
#!/usr/bin/env python
2
# Finds spreadsheet rows where a column is not in a vocabulary.
3
# Case- and punctuation-insensitive.
4

  
5
import csv
6
import re
7
import sys
8

  
9
def simplify(str_):
    """Return a normalized form of *str_* for loose comparison.

    The string is lowercased, then every run of characters that are either
    regex non-word characters or underscores is deleted, so values that
    differ only in case or punctuation/spacing compare equal.
    """
    lowered = str_.lower()
    return re.sub(r'[\W_]+', '', lowered)
10

  
11
def main():
    """Copy CSV rows from stdin to stdout whose term is not in a vocabulary.

    Command line: ``<prog> col# vocab``

    * ``col#`` is converted with ``int()`` and used directly as a list index
      into each input row (so it is 0-based).
    * ``vocab`` is the path of a CSV file with a header row followed by
      exactly one term per row.

    Terms are compared after passing through ``simplify()``. The first input
    row is treated as a header and always passed through unchanged.

    Raises:
        SystemExit: with a usage message if the argument count is wrong.
        ValueError: if ``col#`` is not an integer, or a non-blank vocabulary
            row does not contain exactly one field.
    """
    try:
        _prog_name, col_num, vocab_path = sys.argv
    except ValueError:
        raise SystemExit('Usage: '+sys.argv[0]
            +' <in col# vocab [| '+sys.argv[0]+' col# vocab_2]... >out')
    col_num = int(col_num)

    # Get vocab
    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        stream = open(vocab_path, 'r', newline='')
    else:
        stream = open(vocab_path, 'rb')
    vocab = set()
    with stream:  # closed even if a malformed row raises below
        reader = csv.reader(stream)
        next(reader, None)  # skip header (tolerate a completely empty file)
        for row in reader:
            if not row:
                continue  # blank line in the vocab file
            term, = row  # still require exactly one field per row
            vocab.add(simplify(term))

    # Filter input
    reader = csv.reader(sys.stdin)
    writer = csv.writer(sys.stdout)
    header = next(reader, None)
    if header is None:
        return  # empty input: nothing to pass through
    writer.writerow(header)  # pass through header
    for row in reader:
        # A row too short to contain the column has no term; treat it as an
        # empty term instead of aborting the whole pipeline with IndexError.
        try:
            cell = row[col_num]
        except IndexError:
            cell = ''
        if simplify(cell) not in vocab:
            writer.writerow(row)
32

  
33
# Run the filter. This call is unconditional (there is no __name__ guard).
main()
0 34

  

Also available in: Unified diff