Project

General

Profile

« Previous | Next » 

Revision 7033

Added filter_out_cs

View differences:

bin/filter_out_cs
1
#!/usr/bin/env python
2
# Finds spreadsheet rows where a column is not in a vocabulary.
3
# The vocabulary should not have a header. CSVs without a header are supported.
4
# Case- and punctuation-sensitive.
5

  
6
import csv
7
import re
8
import sys
9

  
10
def main():
    """Copy to stdout the stdin CSV rows whose column is not in a vocabulary.

    Command line: ``<prog> col# vocab``

    * ``col#`` is the zero-based index of the column to check in each input
      row.
    * ``vocab`` is the path of a CSV file without a header; the first column
      of each of its rows is a known term. Blank lines in it are ignored.

    Terms are compared exactly (case- and punctuation-sensitive). Rows whose
    term is missing from the vocabulary are written unchanged, so several
    invocations can be chained in a pipeline, one per vocabulary.

    Raises:
        SystemExit: with a usage message if the argument count is wrong.
        ValueError: if ``col#`` is not an integer.
        IndexError: if an input row has no column ``col#``.
    """
    try: _prog_name, col_num, vocab_path = sys.argv
    except ValueError: raise SystemExit('Usage: '+sys.argv[0]
        +' <in col# vocab [| '+sys.argv[0]+' col# vocab_2]... >out')
    col_num = int(col_num)

    # Get vocab
    # The csv module needs a binary-mode file on Python 2, but a text-mode
    # file opened with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        stream = open(vocab_path, 'r', newline='')
    else:
        stream = open(vocab_path, 'rb')
    # Close the file even if reading it fails part-way through.
    with stream:
        # csv.reader yields [] for a blank line; skip those rather than
        # crash on row[0].
        vocab = set(row[0] for row in csv.reader(stream) if row)

    # Filter input
    reader = csv.reader(sys.stdin)
    writer = csv.writer(sys.stdout)
    for row in reader:
        term = row[col_num]
        if term not in vocab: writer.writerow(row)
29

  
30
# Run the filter only when executed as a script, so that importing this file
# does not start reading stdin.
if __name__ == '__main__':
    main()
0 31

  

Also available in: Unified diff