Project

General

Profile

1
#!/usr/bin/env python
2
# Modifies a map spreadsheet A->B or any file using a replacements spreadsheet
3
# A->C or B->C
4

    
5
import csv
6
import HTMLParser
7
import os.path
8
import re
9
import sys
10

    
11
sys.path.append(os.path.dirname(__file__)+"/../lib")
12

    
13
import maps
14
import opts
15
import strings
16

    
17
# Character class matching any one of the recognized quote characters
quote_re = '[\'"`]'

# Lookbehind assertions that reject a word match based on what precedes it
excluded_prefix_re = ''.join((
    # not if it's a word in a sentence (uppercase SQL keywords OK)
    r'(?<![a-z] )',
    # not if it's part of a '-'-separated identifier
    r'(?<!-)',
    # don't double leading *
    r'(?<!\*)',
))

# Lookahead assertions that reject a word match based on what follows it
excluded_suffix_re = ''.join((
    # not if it's a word in a sentence (uppercase SQL keywords OK)
    r'(?! [a-z])',
    # not if it's part of a '-'-separated identifier
    r'(?!-)',
))
27

    
28
def unescape_html(str_):
    '''Returns str_ with its HTML character references (e.g. &amp;, &#65;)
    replaced by the characters they stand for.
    
    Uses html.unescape() where the interpreter provides it, because the
    HTMLParser.unescape() method is undocumented and no longer exists in
    Python 3.9+. Falls back to the HTMLParser method on Python 2.
    '''
    try: from html import unescape # Python 3.4+
    except ImportError: # Python 2
        import HTMLParser
        unescape = HTMLParser.HTMLParser().unescape
    return unescape(str_)
29

    
30
def repl_unescape_html(match):
    '''Replacement callback for re.sub(): returns the entire matched text
    after passing it through unescape_html().
    
    match: the regexp match object supplied by re.sub()
    '''
    matched_text = match.group(0) # group 0 = the whole match
    return unescape_html(matched_text)
31

    
32
def _repl_regexp(in_, text):
    '''Builds the regexp source used to find one replacement's "from" value.
    
    in_: the non-empty "from" value of a replacements row
    text: whether all patterns are plain text instead of regexps
    Returns in_ itself when it should be used as a regexp as-is (not in text
    mode and not a single word); otherwise a regexp that matches it literally.
    '''
    is_word = re.match(r'^\w+$', in_)
    if not (text or is_word): return in_ # use as a regexp, unchanged
    
    # match as whole-word text (like SQL identifier)
    # this can also be done in Postgres with expression substitution
    # (wiki.vegpath.org/Postgres_queries#expression-substitution)
    # this is a generalization of lib/sql_gen.py map_expr() to work
    # on entire source files
    in_str_re = re.escape(in_)
    quoted_re = '(?<='+quote_re+')'+in_str_re+'(?='+quote_re+')' # require quotes
    if not is_word: return quoted_re
    
    # also match with quotes optional
    # don't try to match word w/ suffix, because there are cases where a
    # mapping adds a suffix which would cause the same replacement to be
    # performed repeatedly
    in_word_re = r'\b'+in_str_re+r'\b'
    
    # only use excluded_prefix_re/excluded_suffix_re in text mode (used in
    # renaming columns in SQL scripts), to prevent the special coding for
    # column renames from also affecting regular regexp/word replacements
    if text: in_word_re = excluded_prefix_re+in_word_re+excluded_suffix_re
    
    return '(?:'+quoted_re+'|'+in_word_re+')'

def main():
    '''Applies the replacements spreadsheet named by argv[1] to stdin and
    writes the result to stdout.
    
    With a column number in argv[2] (0-based), stdin is read as a CSV map and
    only that column of each non-header row is modified. Without it, stdin is
    modified as a whole.
    
    Each replacements row supplies a "from" value and a "to" value in its
    first two columns. Rows with an empty "from" value are ignored, and the
    special "to" value 'unescape_html()' HTML-unescapes the matched text.
    
    Raises SystemExit with a usage message if the replacements path is
    missing, and ValueError if the column number is not an integer.
    '''
    env_names = []
    def usage_err():
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
            +sys.argv[0]+' <map repl [col_num] [| '+sys.argv[0]
            +' repl_1 [col_num_1]]... >new_map')
    
    col_num = None
    try: col_num = sys.argv[2]
    except IndexError: pass
    if col_num is not None: col_num = int(col_num) # 0-based
    # whether all patterns are plain text
    # defaults to on if matching entire cells in a spreadsheet (w/ col_num)
    text = opts.env_flag('text', col_num is not None, env_names)
    
    try: _prog_name, repl_path = sys.argv[:2]
    except ValueError: usage_err()
    
    # Get replacements
    # Each entry is (compiled regexp, replacement string or callback). The
    # patterns are compiled once here instead of on every repl_all() call.
    repls = []
    with open(repl_path, 'rb') as stream: # closed even if a row is malformed
        reader = csv.reader(stream)
        next(reader, None) # skip header (tolerating a completely empty file)
        for row in reader:
            if not row: continue # csv.reader yields [] for a blank line
            in_, out = row[:2]
            if in_ == '': continue
            if out == 'unescape_html()': out = repl_unescape_html
            repls.append((re.compile(r'(?m)'+_repl_regexp(in_, text)), out))
    
    def repl_all(str_):
        '''Applies every replacement, in spreadsheet order, to str_'''
        str_ = strings.ustr(str_)
        for regexp, with_ in repls: str_ = regexp.sub(with_, str_)
        return str_
    
    # Modify map or file
    if col_num is not None:
        reader = csv.reader(sys.stdin)
        writer = csv.writer(sys.stdout)
        cols = next(reader, None)
        if cols is None: return # empty input, so there is nothing to modify
        writer.writerow(cols) # the header row is passed through unchanged
        for row in reader:
            # convert back with to_raw_str() before writing, the same way
            # the whole-file branch below does with repl_all()'s result
            row[col_num] = strings.to_raw_str(repl_all(row[col_num]))
            writer.writerow(row)
    else: sys.stdout.write(strings.to_raw_str(repl_all(sys.stdin.read())))
100

    
101
# Script entry point: main() is called unconditionally when this file runs
main()
(66-66/87)