Project

General

Profile

1 205 aaronmk
#!/usr/bin/env python
2
# Modifies a map spreadsheet A->B or any file using a replacements spreadsheet
3
# A->C or B->C
4
5
import csv
6 9785 aaronmk
import HTMLParser
7 1705 aaronmk
import os.path
8 205 aaronmk
import re
9
import sys
10
11 1705 aaronmk
sys.path.append(os.path.dirname(__file__)+"/../lib")
12
13
import maps
14 4357 aaronmk
import opts
15 9784 aaronmk
import strings
16 1705 aaronmk
17 10348 aaronmk
# Regex fragments used by main() to build the pattern for each replacement.
# All three are written so that they can be concatenated directly into a larger
# regular expression.

# Character class matching one quote character: single, double or backtick
quote_re = '[\'"`]'
# Zero-width lookbehinds placed before a bare (unquoted) word.
# Raw strings are used so that `\*` reaches the regex engine as written instead
# of being an invalid string escape.
excluded_prefix_re = (
     r'(?<! )' # not if it's a word in a sentence
    +r'(?<!\*)' # don't double leading *
    )
# Zero-width lookahead placed after a bare (unquoted) word
excluded_suffix_re = (
     r'(?! )' # not if it's a word in a sentence
    )
25 10348 aaronmk
26 9785 aaronmk
def unescape_html(str_):
    """Return str_ with its HTML character references decoded.

    Delegates to the unescape() method of a freshly created
    HTMLParser.HTMLParser instance.
    """
    parser = HTMLParser.HTMLParser()
    return parser.unescape(str_)
27
28
def repl_unescape_html(match):
    """Replacement callback for re.sub(): HTML-unescape the whole match.

    match is a regex match object; the text of its group 0 is passed through
    unescape_html() and the result is returned as the substitution.
    """
    matched_text = match.group(0)
    return unescape_html(matched_text)
29
30 205 aaronmk
def main():
    """Apply a replacements spreadsheet to stdin and write the result to stdout.

    Command line:
        argv[1]  path of the replacements CSV. Its first row is skipped as a
                 header; in every other row column 0 is the pattern and column 1
                 the replacement. Rows whose pattern is empty are ignored.
        argv[2]  optional 0-based column number. When given, stdin is read as
                 CSV, its header row is copied through and only that column of
                 each data row is rewritten. When omitted, all of stdin is
                 rewritten as a single string.

    Pattern handling:
        * A pattern consisting only of word characters is matched literally,
          either between quote characters or as a bare whole word (subject to
          excluded_prefix_re/excluded_suffix_re).
        * When the `text` flag is set, any other pattern is matched literally
          and only between quote characters.
        * Otherwise the pattern is used as a regular expression as written.
        Every pattern is compiled with the (?m) flag.

    A replacement of exactly 'unescape_html()' means "HTML-unescape the matched
    text" instead of substituting a fixed string.

    Raises SystemExit with a usage message if the replacements path is missing
    or col_num is not an integer.
    """
    env_names = []
    def usage_err():
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
            +sys.argv[0]+' <map repl [col_num] [| '+sys.argv[0]
            +' repl_1 [col_num_1]]... >new_map')
    
    # Read before any usage_err() call so that env_names is filled in first
    text = opts.env_flag('text', False, env_names) # all patterns are plain text
    try: _prog_name, repl_path = sys.argv[:2]
    except ValueError: usage_err()
    col_num = None
    if len(sys.argv) > 2:
        try: col_num = int(sys.argv[2]) # 0-based
        except ValueError: usage_err() # not a number: show usage, no traceback
    
    # Get replacements
    # Each entry is (compiled pattern, replacement string or callback). Patterns
    # are compiled once here rather than on every repl_all() call, so a large
    # replacements table does not depend on re's bounded pattern cache and an
    # invalid pattern is reported before any output is written.
    repls = []
    with open(repl_path, 'rb') as stream: # closed even if a row is malformed
        reader = csv.reader(stream)
        next(reader) # skip header
        for row in reader:
            in_, out = row[:2]
            if in_ == '': continue
            
            is_word = re.match(r'^\w+$', in_)
            if text or is_word: # match as whole-word text (like SQL identifier)
                in_str_re = re.escape(in_)
                q = quote_re
                in_ = '(?<='+q+')'+in_str_re+'(?='+q+')' # require quotes
                if is_word: # also match with quotes optional
                    # don't try to match word w/ suffix, because there are cases
                    # where a mapping adds a suffix which would cause the same
                    # replacement to be performed repeatedly
                    in_word_re = (excluded_prefix_re+r'\b'+in_str_re+r'\b'
                        +excluded_suffix_re)
                    in_ = '(?:'+in_+'|'+in_word_re+')'
            # Special replacement value: unescape the match instead of replacing
            if out == 'unescape_html()': out = repl_unescape_html
            repls.append((re.compile(r'(?m)'+in_), out))
    
    def repl_all(str_):
        '''Applies every replacement to str_, in spreadsheet order'''
        str_ = strings.ustr(str_)
        for pattern, with_ in repls: str_ = pattern.sub(with_, str_)
        return str_
    
    # Modify map or file
    if col_num is not None:
        reader = csv.reader(sys.stdin)
        writer = csv.writer(sys.stdout)
        writer.writerow(next(reader)) # header row is passed through unchanged
        # NOTE(review): the whole-file branch below converts with
        # strings.to_raw_str() before writing but this branch does not; confirm
        # csv.writer copes with whatever repl_all() returns for non-ASCII data
        for row in reader:
            row[col_num] = repl_all(row[col_num])
            writer.writerow(row)
    else: sys.stdout.write(strings.to_raw_str(repl_all(sys.stdin.read())))
84 205 aaronmk
85
# Script entry point. There is no `if __name__ == '__main__'` guard, so main()
# runs whenever this file is executed or imported.
main()