1 |
205
|
aaronmk
|
#!/usr/bin/env python
|
2 |
|
|
# Modifies a map spreadsheet A->B or any file using a replacements spreadsheet
|
3 |
|
|
# A->C or B->C
|
4 |
|
|
|
5 |
|
|
import csv
|
6 |
9785
|
aaronmk
|
import HTMLParser
|
7 |
1705
|
aaronmk
|
import os.path
|
8 |
205
|
aaronmk
|
import re
|
9 |
|
|
import sys
|
10 |
|
|
|
11 |
1705
|
aaronmk
|
sys.path.append(os.path.dirname(__file__)+"/../lib")
|
12 |
|
|
|
13 |
|
|
import maps
|
14 |
4357
|
aaronmk
|
import opts
|
15 |
9784
|
aaronmk
|
import strings
|
16 |
1705
|
aaronmk
|
|
17 |
10348
|
aaronmk
|
# Regexp fragments used when building replacement patterns.
# All fragments are raw strings so that regexp escapes such as \* are never
# interpreted (or warned about) as Python string escapes.

# Character class matching a single quote character: ', " or `
quote_re = r'''['"`]'''

# Lookbehinds that must all hold immediately before a bare-word match
excluded_prefix_re = (
    r'(?<![a-z] )'  # not if it's a word in a sentence (uppercase SQL keywords OK)
    r'(?<!-)'  # not if it's part of a '-'-separated identifier
    r'(?<!\*)'  # don't double leading *
)

# Lookaheads that must all hold immediately after a bare-word match
excluded_suffix_re = (
    r'(?! [a-z])'  # not if it's a word in a sentence (uppercase SQL keywords OK)
    r'(?!-)'  # not if it's part of a '-'-separated identifier
)
|
27 |
10348
|
aaronmk
|
|
28 |
9785
|
aaronmk
|
def unescape_html(str_):
    """Return str_ with HTML character references (e.g. &amp;) decoded.

    Prefers html.unescape() when the interpreter provides it, because
    HTMLParser.unescape() was removed in Python 3.9. On Python 2, where the
    html module does not offer unescape(), this falls back to the HTMLParser
    module imported at the top of this file.
    """
    try:
        from html import unescape
    except ImportError:  # Python 2: no html.unescape()
        return HTMLParser.HTMLParser().unescape(str_)
    return unescape(str_)
|
29 |
|
|
|
30 |
|
|
def repl_unescape_html(match):
    """Replacement callback for re.sub(): HTML-unescape the whole match.

    match is a regexp match object; the text of the entire match (group 0) is
    passed through unescape_html() and the result is returned.
    """
    matched_text = match.group(0)
    return unescape_html(matched_text)
|
31 |
|
|
|
32 |
205
|
aaronmk
|
def _build_pattern(in_, text):
    """Return the regexp source used to find the replacement input in_.

    in_ is the first column of a replacements row. text is true when all
    inputs are to be treated as plain text rather than regexps.

    * If in_ is neither a single word (\\w+) nor in text mode, it is already a
      regexp and is returned unchanged.
    * Otherwise it is escaped and must appear between quote characters.
    * If it is a single word, it may also appear unquoted as a whole word; in
      text mode that bare-word form is additionally guarded by
      excluded_prefix_re/excluded_suffix_re.
    """
    is_word = re.match(r'^\w+$', in_)
    if not (text or is_word):
        return in_  # regular regexp replacement: use as-is

    # match as whole-word text (like SQL identifier)
    in_str_re = re.escape(in_)
    quoted_re = '(?<='+quote_re+')'+in_str_re+'(?='+quote_re+')'  # require quotes
    if not is_word:
        return quoted_re

    # also match with quotes optional
    # don't try to match word w/ suffix, because there are cases where a
    # mapping adds a suffix which would cause the same replacement to be
    # performed repeatedly
    in_word_re = r'\b'+in_str_re+r'\b'

    # only use excluded_prefix_re/excluded_suffix_re in text mode (used in
    # renaming columns in SQL scripts), to prevent the special coding for
    # column renames from also affecting regular regexp/word replacements
    if text:
        in_word_re = excluded_prefix_re+in_word_re+excluded_suffix_re

    return '(?:'+quoted_re+'|'+in_word_re+')'


def _read_repls(repl_path, text):
    """Load the replacements spreadsheet at repl_path.

    The first row is skipped as a header. For every other row the first two
    columns are taken as (input, output); rows with an empty input and blank
    lines are ignored.

    Returns a list of (compiled_pattern, replacement) pairs in file order.
    Each pattern is compiled in multiline mode. replacement is the output
    string, or the repl_unescape_html callback when the output column is the
    literal text 'unescape_html()'.

    Raises ValueError for a non-blank row with fewer than two columns, and
    re.error for an input that is not a valid regexp.
    """
    repls = []
    # `with` guarantees the file is closed even if a row fails to parse
    with open(repl_path, 'rb') as stream:
        reader = csv.reader(stream)
        next(reader, None)  # skip header (tolerates an empty file)
        for row in reader:
            if not row:
                continue  # csv yields [] for a blank line
            in_, out = row[:2]
            if in_ == '':
                continue
            if out == 'unescape_html()':
                out = repl_unescape_html
            # Compile once here instead of handing the pattern string to
            # re.sub() for every row/file that is processed later
            repls.append((re.compile(r'(?m)'+_build_pattern(in_, text)), out))
    return repls


def main():
    """Apply a replacements spreadsheet to stdin and write the result to stdout.

    Command line: <map repl [col_num] >new_map
    * repl: path of the replacements CSV (header row, then input,output rows).
    * col_num: optional 0-based column number. When given, stdin is read as
      CSV, its header row is copied through unchanged, and the replacements
      are applied only to that column of each following row. When omitted,
      stdin is treated as one block of text and the replacements are applied
      to all of it.

    The 'text' environment flag (read via opts.env_flag) makes all patterns
    plain text instead of regexps.

    Raises SystemExit with a usage message when repl is missing or col_num is
    not an integer.
    """
    env_names = []
    def usage_err():
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
            +sys.argv[0]+' <map repl [col_num] [| '+sys.argv[0]
            +' repl_1 [col_num_1]]... >new_map')

    # env_names is handed to opts.env_flag() before usage_err() can run;
    # presumably it records the flag name for the usage message (verify in opts)
    text = opts.env_flag('text', False, env_names)  # all patterns are plain text
    try:
        _prog_name, repl_path = sys.argv[:2]
    except ValueError:
        usage_err()
    col_num = None
    try:
        col_num = int(sys.argv[2])  # 0-based
    except IndexError:
        pass  # no column given: whole-file mode
    except ValueError:
        usage_err()  # col_num was not an integer

    # Get replacements
    repls = _read_repls(repl_path, text)

    def repl_all(str_):
        """Apply every replacement, in spreadsheet order, to str_."""
        str_ = strings.ustr(str_)
        for pattern, with_ in repls:
            str_ = pattern.sub(with_, str_)
        return str_

    # Modify map or file
    if col_num is not None:
        reader = csv.reader(sys.stdin)
        writer = csv.writer(sys.stdout)
        cols = next(reader, None)
        if cols is None:
            return  # empty input: nothing to copy or modify
        writer.writerow(cols)
        for row in reader:
            row[col_num] = repl_all(row[col_num])
            writer.writerow(row)
    else:
        sys.stdout.write(strings.to_raw_str(repl_all(sys.stdin.read())))
|
93 |
205
|
aaronmk
|
|
94 |
|
|
# Script entry point: main() is called unconditionally when this file is run
main()
|