Project

General

Profile

1 53 aaronmk
#!/usr/bin/env python
2
# Maps one datasource to another, using a map spreadsheet if needed
3
# For outputting an XML file to a PostgreSQL database, use the general format of
4
# http://vegbank.org/vegdocs/xml/vegbank_example_ver1.0.2.xml
5
6
import os
7
import os.path
8
import sys
9
import xml.dom.minidom
10
11
sys.path.append(os.path.dirname(__file__)+"/lib")
12
13 64 aaronmk
import opts
14 53 aaronmk
15
def main():
    """Map the input datasource to the output datasource.

    Input is read from stdin: XML by default, or CSV when the source column
    name in the map header contains no '/' XPath prefix. Output is an XML tree
    that is either written to stdout or, when 'to' database settings are found
    in the environment, inserted into a database through psycopg2.

    A map spreadsheet (CSV; its path is the sole command-line argument) is
    required when the input is a database or the output is not a database.
    The first two columns of its header row name the source and destination;
    the first two columns of each later row map an input field name to an
    output XPath.

    Database changes are committed only when the 'commit' env flag is set.

    Raises:
        SystemExit: with a usage message when the map path is required but
            the argument count is wrong, or with the parser's message when a
            map row holds an invalid output XPath.
        Exception: when the map declares XML input (XML-XML mapping is not
            supported yet).
    """
    # Get db config from env vars
    db_config_names = ['host', 'user', 'password', 'database']
    # Passed to opts.get_env_vars() and later listed in the usage message
    # (presumably filled in by that helper with the env var names it looked up
    # -- TODO confirm against lib/opts)
    env_names = []
    def get_db_config(prefix):
        return opts.get_env_vars(db_config_names, prefix, env_names)
    from_db_config = get_db_config('from')
    to_db_config = get_db_config('to')
    in_is_db = from_db_config is not None
    out_is_db = to_db_config is not None
    uses_map = in_is_db or not out_is_db

    # Parse args
    prog_name = sys.argv[0]
    try: prog_name, map_path = sys.argv
    except ValueError: # wrong number of args
        # The map path is only mandatory when a map is actually used
        if uses_map:
            env_usage = ''.join(' ['+name+'=...]' for name in env_names)
            raise SystemExit('Usage: env'+env_usage+' [commit=1] '+prog_name
                +' [map_path] [<input] [>output]')
    commit = opts.env_flag('commit')

    in_is_xml = True
    map_stream = None
    try: # ensures the map file is closed even if parsing fails part-way
        # Load map header
        if uses_map:
            import copy
            import csv

            import xpath

            map_stream = open(map_path, 'rb')
            map_reader = csv.reader(map_stream)
            src, dest = next(map_reader)[:2]
            def split_col_name(name):
                # Returns (root name, whether a '/' was present, the '/' and
                # everything after it)
                name, sep, prefix = name.partition('/')
                return name, sep != '', sep+prefix
            src, in_is_xml, src_prefix = split_col_name(src)
            dest, out_is_xml, dest_prefix = split_col_name(dest)
            assert out_is_xml
            has_types = dest_prefix.startswith('/*s/') # outer elements are types

        # Input datasource to XML tree, mapping if needed
        if in_is_xml: doc = xml.dom.minidom.parse(sys.stdin)
        if uses_map:
            from Parser import SyntaxException
            import xml_xpath

            map_ = {}
            for row in map_reader:
                # Blank or one-column rows carry no output mapping; treat them
                # like rows whose output column is empty
                if len(row) < 2: continue
                in_, out = row[:2]
                if out != '':
                    try: out = xpath.parse(dest_prefix+out)
                    except SyntaxException as ex: raise SystemExit(str(ex))
                    if in_is_xml: pass # TODO: process the mapping
                    elif in_is_db: pass # TODO: process the mapping
                    else: map_[in_] = out
    finally:
        if map_stream is not None: map_stream.close()

    if uses_map:
        out_doc = xml.dom.minidom.getDOMImplementation().createDocument(None,
            dest, None)
        if in_is_xml: raise Exception('XML-XML mapping not supported yet')
        else: # input is CSV
            reader = csv.reader(sys.stdin)
            fieldnames = next(reader)
            for row_idx, row in enumerate(reader):
                row_id = str(row_idx)
                # zip() stops at the shorter sequence, so a row with fewer
                # cells than the header just has its missing fields skipped
                for name, value in zip(fieldnames, row):
                    if value != '' and name in map_:
                        path = copy.deepcopy(map_[name]) # don't modify value!
                        xpath.set_id(path, row_id, has_types)
                        xpath.set_value(path, value)
                        xml_xpath.get(out_doc, path, True)
        doc = out_doc

    # Output XML tree
    if out_is_db: # output is database
        import psycopg2
        from psycopg2.extensions import ISOLATION_LEVEL_SERIALIZABLE

        import db_xml

        db = psycopg2.connect(**to_db_config)
        try:
            db.set_isolation_level(ISOLATION_LEVEL_SERIALIZABLE)
            row_ct_ref = [0]
            db_xml.xml2db(db, doc.documentElement, row_ct_ref)
            print('Inserted '+str(row_ct_ref[0])+' rows')
            if commit: db.commit()
        finally:
            # Discards the changes on error or when commit was not requested;
            # expected to have nothing left to undo after a successful commit
            try: db.rollback()
            finally: db.close()
    else: doc.writexml(sys.stdout, addindent='    ', newl='\n') # output is XML
109
110
# Script entry point: the mapping runs as soon as this file is executed
main()