/trunk/bin/repl - Annotate - BIEN 3 - NCEAS Projects

root/trunk/bin/repl @ 12936

-            aaronmk
+#!/usr/bin/env python
 # Modifies a map spreadsheet A->B or any file using a replacements spreadsheet
 # A->C or B->C
 import csv
-            aaronmk
+import HTMLParser
-            aaronmk
+import os.path
-            aaronmk
+import re
 import sys
-            aaronmk
+sys.path.append(os.path.dirname(__file__)+"/../lib")
 import maps
-            aaronmk
+import opts
-            aaronmk
+import strings
             aaronmk
-            aaronmk
# Regexp fragments used to build the whole-word/quoted patterns for each
# replacement

# any single quote character that may delimit an identifier
quote_re = '[\'"`]'

# lookbehinds that must all hold immediately before a bare-word match
excluded_prefix_re = (
     '(?<![a-z] )' #not if it's a word in a sentence (uppercase SQL keywords OK)
    +'(?<!-)' # not if it's part of a '-'-separated identifier
    +r'(?<!\*)' # don't double leading *
    )

# lookaheads that must all hold immediately after a bare-word match
excluded_suffix_re = (
     '(?! [a-z])' # not if it's a word in a sentence (uppercase SQL keywords OK)
    +'(?!-)' # not if it's part of a '-'-separated identifier
    )
-            aaronmk
def unescape_html(str_):
    '''Returns str_ with its HTML character references (e.g. &amp;) replaced
    by the characters they stand for.
    '''
    # Prefer html.unescape() where it exists (Python 3.4+), because
    # HTMLParser.unescape() was removed in Python 3.9. On Python 2 the import
    # fails and the original HTMLParser-based call is used unchanged.
    try: from html import unescape
    except ImportError: return HTMLParser.HTMLParser().unescape(str_)
    return unescape(str_)

def repl_unescape_html(match):
    '''Replacement callback for re.sub(): returns the entire matched text with
    its HTML character references unescaped.
    '''
    return unescape_html(match.group(0))
-            aaronmk
def _repl_regex(in_, text):
    '''Builds the regexp source used to find one replacement input.
    @param in_ The non-empty "from" value of a replacements row
    @param text Whether every input is plain text rather than a regexp
    @return in_ unchanged if it is to be used as a regexp; otherwise a regexp
        matching the escaped input between quotes and, if the input is a
        single word, also unquoted as a whole word
    '''
    is_word = re.match(r'^\w+$', in_)
    if not (text or is_word): return in_ # use the input as a regexp as-is
    
    # match as whole-word text (like SQL identifier)
    in_str_re = re.escape(in_)
    quoted_re = '(?<='+quote_re+')'+in_str_re+'(?='+quote_re+')'#require quotes
    if not is_word: return quoted_re
    
    # also match with quotes optional
    # don't try to match word w/ suffix, because there are cases
    # where a mapping adds a suffix which would cause the same
    # replacement to be performed repeatedly
    in_word_re = r'\b'+in_str_re+r'\b'
    # only use excluded_prefix_re/excluded_suffix_re in text
    # mode (used in renaming columns in SQL scripts), to prevent
    # the special coding for column renames from also affecting
    # regular regexp/word replacements
    if text: in_word_re = excluded_prefix_re+in_word_re+excluded_suffix_re
    return '(?:'+quoted_re+'|'+in_word_re+')'

def main():
    '''Applies the replacements spreadsheet named by argv[1] to stdin and
    writes the result to stdout.
    If argv[2] (a 0-based column number) is given, stdin is read as CSV and
    only that column of each data row is modified; otherwise stdin is treated
    as one string.
    Exits with a usage message if no replacements path is given.
    '''
    # passed to opts.env_flag() and reported by opts.env_usage()
    env_names = []
    def usage_err():
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
            +sys.argv[0]+' <map repl [col_num] [| '+sys.argv[0]
            +' repl_1 [col_num_1]]... >new_map')
    
    col_num = None
    if len(sys.argv) > 2: col_num = int(sys.argv[2]) # 0-based
    
    # whether all patterns are plain text
    # defaults to on if matching entire cells in a spreadsheet (w/ col_num)
    # (read before the argument check so usage_err() sees this env var)
    text = opts.env_flag('text', col_num is not None, env_names)
    
    if len(sys.argv) < 2: usage_err()
    repl_path = sys.argv[1]
    
    # Get replacements
    repls = [] # [(compiled pattern, replacement string or callback), ...]
    with open(repl_path, 'rb') as stream: # closed even if a row is malformed
        reader = csv.reader(stream)
        next(reader, None) # skip header (an empty file has none)
        for row in reader:
            if not row: continue # blank line in the spreadsheet
            in_, out = row[:2] # ValueError if the row has only one column
            if in_ == '': continue
            # resolve the special replacement once instead of on every use
            if out == 'unescape_html()': out = repl_unescape_html
            # compile once, so that large spreadsheets don't overflow re's
            # internal pattern cache and recompile for every cell
            repls.append((re.compile(r'(?m)'+_repl_regex(in_, text)), out))
    
    def repl_all(str_):
        '''Applies every replacement, in spreadsheet order, to str_'''
        str_ = strings.ustr(str_)
        for pattern, with_ in repls: str_ = pattern.sub(with_, str_)
        return str_
    
    # Modify map or file
    if col_num is not None:
        reader = csv.reader(sys.stdin)
        writer = csv.writer(sys.stdout)
        cols = next(reader, None) # header row is passed through unmodified
        if cols is not None: writer.writerow(cols)
        for row in reader:
            # NOTE(review): the whole-file branch below passes the result
            # through strings.to_raw_str() but this branch does not -- confirm
            # csv.writer copes with whatever repl_all() returns
            row[col_num] = repl_all(row[col_num])
            writer.writerow(row)
    else: sys.stdout.write(strings.to_raw_str(repl_all(sys.stdin.read())))
             aaronmk
 # Script entry point: main() is called unconditionally (there is no __name__
 # guard), so it runs whenever this file is executed
 main()

Project

General

Profile