/trunk/bin/repl - BIEN 3 - NCEAS Projects

root/trunk/bin/repl @ 12966

       #!/usr/bin/env python
       # Modifies a map spreadsheet A->B or any file using a replacements spreadsheet
       # A->C or B->C
       import csv
       import HTMLParser
       import os.path
       import re
       import sys
       sys.path.append(os.path.dirname(__file__)+"/../lib")
       import maps
       import opts
       import strings
       # Regexp fragments used by main() to build whole-word patterns.
       # Raw strings are used so that regexp escapes such as \* are not also
       # treated as (invalid) string escapes.

       # any one quote character that may surround a quoted name
       quote_re = '[\'"`]'

       # zero-width lookbehinds placed before a word pattern
       excluded_prefix_re = (
            r'(?<![a-z] )' #not if it's a word in a sentence (uppercase SQL keywords OK)
           +r'(?<!-)' # not if it's part of a '-'-separated identifier
           +r'(?<!\*)' # don't double leading *
           )

       # zero-width lookaheads placed after a word pattern
       excluded_suffix_re = (
            r'(?! [a-z])' # not if it's a word in a sentence (uppercase SQL keywords OK)
           +r'(?!-)' # not if it's part of a '-'-separated identifier
           )
       def unescape_html(str_): return HTMLParser.HTMLParser().unescape(str_)
       def repl_unescape_html(match):
           '''Replacement callback for re.sub(): HTML-unescapes the whole match.
           
           @param match The regexp match object for the text being replaced
           @return The result of unescape_html() on the entire matched text
           '''
           matched_text = match.group(0) # group 0 = everything the pattern matched
           return unescape_html(matched_text)
       def main():
           '''Applies a replacements spreadsheet to the map or file on stdin.
           
           Command line: <map repl [col_num] >new_map
           * repl: path of a CSV file whose first row is a header and whose first
             two columns are (pattern, replacement). Rows with an empty pattern
             are ignored. A replacement of exactly 'unescape_html()' means
             "HTML-unescape whatever the pattern matched".
           * col_num: optional 0-based column number. If given, stdin is read as a
             CSV whose header row is copied through unchanged and whose data rows
             have only that column rewritten. If omitted, all of stdin is
             rewritten as a single string.
           The result is written to stdout.
           
           Raises SystemExit with a usage message if repl is not provided.
           '''
           env_names = [] # passed to opts.env_flag() and shown by usage_err()
           def usage_err():
               raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
                   +sys.argv[0]+' <map repl [col_num] [| '+sys.argv[0]
                   +' repl_1 [col_num_1]]... >new_map')
           
           col_num = None
           try: col_num = sys.argv[2]
           except IndexError: pass
           if col_num is not None: col_num = int(col_num) # 0-based
           
           # whether all patterns are plain text
           # defaults to on if matching entire cells in a spreadsheet (w/ col_num)
           # (must run before usage_err() so that env_names has been passed to it)
           text = opts.env_flag('text', col_num is not None, env_names)
           
           try: _prog_name, repl_path = sys.argv[:2]
           except ValueError: usage_err()
           
           # Get replacements
           repls = [] # list of (compiled pattern, replacement str or callback)
           # `with` closes the file even if a row raises while being parsed
           with open(repl_path, 'rb') as stream:
               reader = csv.reader(stream)
               next(reader) # skip header
               for row in reader:
                   in_, out = row[:2]
                   if in_ == '': continue # nothing to search for
                   
                   is_word = re.match(r'^\w+$', in_)
                   if text or is_word: # match as whole-word text (like SQL identifier)
                       in_str_re = re.escape(in_)
                       q = quote_re
                       in_ = '(?<='+q+')'+in_str_re+'(?='+q+')' # require quotes
                       if is_word: # also match with quotes optional
                           # don't try to match word w/ suffix, because there are cases
                           # where a mapping adds a suffix which would cause the same
                           # replacement to be performed repeatedly
                           in_word_re = r'\b'+in_str_re+r'\b'
                           # only use excluded_prefix_re/excluded_suffix_re in text
                           # mode (used in renaming columns in SQL scripts), to prevent
                           # the special coding for column renames from also affecting
                           # regular regexp/word replacements
                           if text: in_word_re = (excluded_prefix_re+in_word_re
                               +excluded_suffix_re)
                           in_ = '(?:'+in_+'|'+in_word_re+')'
                   
                   # resolve the special replacement value once, here, instead of on
                   # every repl_all() call
                   if out == 'unescape_html()': out = repl_unescape_html
                   # compile once: repl_all() runs for every row in spreadsheet mode
                   repls.append((re.compile(r'(?m)'+in_), out))
           
           def repl_all(str_):
               '''Applies every replacement to str_, in spreadsheet order'''
               str_ = strings.ustr(str_)
               for pattern, with_ in repls: str_ = pattern.sub(with_, str_)
               return str_
           
           # Modify map or file
           if col_num is not None:
               reader = csv.reader(sys.stdin)
               writer = csv.writer(sys.stdout)
               cols = next(reader)
               writer.writerow(cols) # header row is passed through unchanged
               for row in reader:
                   row[col_num] = repl_all(row[col_num])
                   writer.writerow(row)
           else: sys.stdout.write(strings.to_raw_str(repl_all(sys.stdin.read())))
       # Runs unconditionally when this file is loaded (there is no __name__ guard)
       main()

(65-65/86)

Project

General

Profile