Project

General

Profile

1 53 aaronmk
#!/usr/bin/env python
2
# Maps one datasource to another, using a map spreadsheet if needed
3
# For outputting an XML file to a PostgreSQL database, use the general format of
4
# http://vegbank.org/vegdocs/xml/vegbank_example_ver1.0.2.xml
5
6
import os
7
import os.path
8
import sys
9
import xml.dom.minidom
10
11
# Make the bundled lib/ directory importable. The path is made absolute first:
# when the script is started by bare name, dirname(__file__) is '' and plain
# concatenation would produce "/lib" instead of the directory next to the script.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),
    "lib"))
12
13
def env_flag(name):
    '''Returns whether the env var `name` is set to a non-empty string.'''
    # An unset var and an empty var are both treated as "flag off"
    return os.environ.get(name, '') != ''
14
15
def main():
    '''Maps the datasource on stdin to an XML document or a database.

    Configuration comes from the environment:
    - from_host/from_user/from_password/from_database and the matching to_*
      vars form the input/output db configs; a config only counts as present
      when all four of its vars are set. Only the to_* config is used to open
      a connection here.
    - commit: when set to a non-empty value, the database transaction is
      committed; otherwise it is rolled back.

    The single command-line argument is the path of the map spreadsheet (CSV).
    It is required unless no from_* config is present and a to_* config is, in
    which case the XML on stdin is loaded into the database unmapped.

    Raises SystemExit with a usage message when the map path is required but
    the argument count is wrong, and Exception when the map describes an XML
    input (XML-XML mapping is not implemented).
    '''
    # Get db config from env vars
    db_config_names = ['host', 'user', 'password', 'database']
    env_names = [] # every env var name looked up, for the usage message
    def get_db_config(prefix):
        '''Returns the db config for `prefix`, or None if any var is unset.'''
        db_config = {}
        for name in db_config_names:
            env_name = prefix+'_'+name
            env_names.append(env_name)
            if env_name in os.environ: db_config[name] = os.environ[env_name]
        if len(db_config) == len(db_config_names): return db_config
        else: return None
    from_db_config = get_db_config('from')
    to_db_config = get_db_config('to')
    # The map is skipped only for the "XML on stdin straight to database" case
    uses_map = not (from_db_config is None and to_db_config is not None)

    # Parse args
    prog_name = sys.argv[0]
    try: prog_name, map_path = sys.argv
    except ValueError: # wrong number of args; only fatal if a map is needed
        if uses_map:
            raise SystemExit('Usage: env'
                +''.join([' ['+name+'=...]' for name in env_names])
                +' [commit=1] '+prog_name+' [map_path] [<input] [>output]')
    commit = env_flag('commit')

    csv_config = dict(delimiter=',', quotechar='"')

    # Load map
    in_is_xml = True
    if uses_map:
        import copy
        import csv

        import xpath

        # Read the whole map up front so the file is closed even if a later
        # step raises
        map_stream = open(map_path, 'rb') # binary mode, as Python 2 csv expects
        try:
            map_reader = csv.reader(map_stream, **csv_config)
            src, dest = map_reader.next()[:2] # header row: input, output roots
            map_rows = list(map_reader)
        finally: map_stream.close()
        # A '/' in the input header cell marks the input as XML
        src, sep, src_base = src.partition('/')
        in_is_xml = sep != ''

    # Input datasource to XML tree, mapping if needed
    if in_is_xml: doc = xml.dom.minidom.parse(sys.stdin)
    if uses_map:
        import xml_xpath

        map_ = {} # input column name -> parsed output path
        has_types = False # whether outer elements are type containers
        for row in map_rows:
            if len(row) < 2: continue # blank/short row has no output path
            in_, out = row[:2]
            if out != '':
                if out.startswith('/*s/'): has_types = True # *s for type elem
                out = xpath.parse(out)
                if in_is_xml: pass # TODO: process the mapping
                else: map_[in_] = out

        out_doc = xml.dom.minidom.getDOMImplementation().createDocument(None,
            dest, None)
        if in_is_xml: raise Exception('XML-XML mapping not supported yet')
        else: # input is CSV
            reader = csv.reader(sys.stdin, **csv_config)
            fieldnames = reader.next()
            for row_idx, row in enumerate(reader):
                row_id = str(row_idx)
                # zip() stops at the shorter of the two, so a row with fewer
                # cells than the header (or a blank line) is treated as having
                # empty trailing cells instead of raising IndexError
                for name, value in zip(fieldnames, row):
                    if value != '' and name in map_:
                        path = copy.deepcopy(map_[name]) # don't modify value!
                        xpath.set_id(path, row_id, has_types)
                        xpath.set_value(path, value)
                        xml_xpath.get(out_doc, path, True)
        doc = out_doc

    # Output XML tree
    if to_db_config is not None: # output is database
        import psycopg2
        from psycopg2.extensions import ISOLATION_LEVEL_SERIALIZABLE

        import db_xml

        db = psycopg2.connect(**to_db_config)
        try:
            db.set_isolation_level(ISOLATION_LEVEL_SERIALIZABLE)
            row_ct_ref = [0] # presumably updated in place by xml2db
            db_xml.xml2db(db, doc.documentElement, row_ct_ref)
            sys.stdout.write('Inserted '+str(row_ct_ref[0])+' rows\n')
            if commit: db.commit()
        finally:
            # Roll back whatever was not committed, and close the connection
            # even if the rollback itself fails
            try: db.rollback()
            finally: db.close()
    else: doc.writexml(sys.stdout, addindent='    ', newl='\n') # output is XML
111
112
# Run the mapping as soon as this file is loaded (there is no __name__ guard)
main()