Project

General

Profile

1 53 aaronmk
#!/usr/bin/env python
2
# Maps one datasource to another, using a map spreadsheet if needed
3 986 aaronmk
# Exit status is the # of errors in the import, up to the maximum exit status
4 53 aaronmk
# For outputting an XML file to a PostgreSQL database, use the general format of
5
# http://vegbank.org/vegdocs/xml/vegbank_example_ver1.0.2.xml
6
7 2550 aaronmk
import copy
8 1014 aaronmk
import csv
9 1714 aaronmk
import itertools
10 53 aaronmk
import os.path
11 3577 aaronmk
import warnings
12 53 aaronmk
import sys
13 299 aaronmk
import xml.dom.minidom as minidom
14 53 aaronmk
15 266 aaronmk
sys.path.append(os.path.dirname(__file__)+"/../lib")
16 53 aaronmk
17 1389 aaronmk
import csvs
18 2040 aaronmk
import db_xml
19 344 aaronmk
import exc
20 1714 aaronmk
import iters
21 1705 aaronmk
import maps
22 64 aaronmk
import opts
23 2035 aaronmk
import parallelproc
24 281 aaronmk
import Parser
25 982 aaronmk
import profiling
26 131 aaronmk
import sql
27 2288 aaronmk
import sql_gen
28 3103 aaronmk
import sql_io
29 1758 aaronmk
import streams
30 715 aaronmk
import strings
31 828 aaronmk
import term
32 310 aaronmk
import util
33 1014 aaronmk
import xpath
34 133 aaronmk
import xml_dom
35 86 aaronmk
import xml_func
36 1713 aaronmk
import xml_parse
37 53 aaronmk
38 1404 aaronmk
def get_with_prefix(map_, prefixes, key):
39 2040 aaronmk
    '''Gets all entries for the given key with any of the given prefixes
40
    @return tuple(found_key, found_value)
41
    '''
42 1484 aaronmk
    values = []
43 1681 aaronmk
    for key_ in strings.with_prefixes(['']+prefixes, key): # also with no prefix
44
        try: value = map_[key_]
45 1484 aaronmk
        except KeyError, e: continue # keep going
46 2040 aaronmk
        values.append((key_, value))
47 1484 aaronmk
48
    if values != []: return values
49
    else: raise e # re-raise last KeyError
50 1404 aaronmk
51 1018 aaronmk
def metadata_value(name): return None # this feature has been removed
52 84 aaronmk
53 1360 aaronmk
def cleanup(val):
54
    if val == None: return val
55 1374 aaronmk
    return util.none_if(strings.cleanup(strings.ustr(val)), u'', u'\\N')
56 1360 aaronmk
57 847 aaronmk
def main_():
58 131 aaronmk
    env_names = []
59
    def usage_err():
60 944 aaronmk
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
61 1945 aaronmk
            +sys.argv[0]+' [map_path...] [<input] [>output]\n'
62 1946 aaronmk
            'Note: Row #s start with 1')
63 838 aaronmk
64 1570 aaronmk
    ## Get config from env vars
65 838 aaronmk
66 1570 aaronmk
    # Modes
67 946 aaronmk
    test = opts.env_flag('test', False, env_names)
68 947 aaronmk
    commit = opts.env_flag('commit', False, env_names) and not test
69 944 aaronmk
        # never commit in test mode
70 947 aaronmk
    redo = opts.env_flag('redo', test, env_names) and not commit
71 2977 aaronmk
        # never redo in commit mode (run `make schemas/reinstall` instead)
72 1570 aaronmk
73
    # Ranges
74 1946 aaronmk
    start = util.cast(int, opts.get_env_var('start', 1, env_names)) # 1-based
75
    # Make start interally 0-based.
76
    # It's 1-based to the user to match up with the staging table row #s.
77
    start -= 1
78 1945 aaronmk
    if test: n_default = 1
79
    else: n_default = None
80
    n = util.cast(int, util.none_if(opts.get_env_var('n', n_default, env_names),
81
        u''))
82
    end = n
83 1570 aaronmk
    if end != None: end += start
84
85
    # Debugging
86 3132 aaronmk
    verbosity = util.cast(float, opts.get_env_var('verbosity', None, env_names))
87 944 aaronmk
    opts.get_env_var('profile_to', None, env_names) # add to env_names
88 131 aaronmk
89 1570 aaronmk
    # DB
90
    def get_db_config(prefix):
91 1926 aaronmk
        return opts.get_env_vars(sql.db_config_names, prefix, env_names)
92 1570 aaronmk
    in_db_config = get_db_config('in')
93
    out_db_config = get_db_config('out')
94
    in_is_db = 'engine' in in_db_config
95
    out_is_db = 'engine' in out_db_config
96 1982 aaronmk
    in_schema = opts.get_env_var('in_schema', None, env_names)
97
    in_table = opts.get_env_var('in_table', None, env_names)
98 3339 aaronmk
    if in_schema != None:
99
        for config in [in_db_config, out_db_config]:
100
            config['schemas'] += ','+in_schema
101 1570 aaronmk
102 1989 aaronmk
    # Optimization
103 2050 aaronmk
    cache_sql = opts.env_flag('cache_sql', True, env_names)
104 1989 aaronmk
    by_col = in_db_config == out_db_config and opts.env_flag('by_col', False,
105
        env_names) # by-column optimization only applies if mapping to same DB
106
    if test: cpus_default = 0
107 2044 aaronmk
    else: cpus_default = 0 # or None to use parallel processing by default
108 1989 aaronmk
    cpus = util.cast(int, util.none_if(opts.get_env_var('cpus', cpus_default,
109
        env_names), u''))
110
111 3132 aaronmk
    # Set default verbosity. Must happen after by_col is set.
112
    if verbosity == None:
113
        if test: verbosity = 0.5 # automated tests should not be verbose
114
        elif by_col: verbosity = 3 # show all queries to assist profiling
115
        else: verbosity = 1.1 # just show row progress
116
117 1570 aaronmk
    ##
118
119 838 aaronmk
    # Logging
120 2457 aaronmk
    verbose_errors = test and verbosity > 0
121 3243 aaronmk
    debug = verbosity >= 1.5
122 2439 aaronmk
    def log(msg, level=1):
123
        '''Higher level -> more verbose'''
124 2452 aaronmk
        if level <= verbosity:
125 2490 aaronmk
            if verbosity <= 2:
126
                if level == 1.5: msg = '# '+msg # msg is Redmine list item
127
                elif msg.startswith('DB query:'): # remove extra debug info
128
                    first_line, nl, msg = msg.partition('\n')
129
            elif level > 1: msg = '['+str(level)+'] '+msg # include level in msg
130
131 2883 aaronmk
            sys.stderr.write(strings.to_raw_str(msg.rstrip('\n')+'\n'))
132 2439 aaronmk
    if debug: log_debug = lambda msg, level=2: log(msg, level)
133 1901 aaronmk
    else: log_debug = sql.log_debug_none
134 662 aaronmk
135 53 aaronmk
    # Parse args
136 510 aaronmk
    map_paths = sys.argv[1:]
137 512 aaronmk
    if map_paths == []:
138
        if in_is_db or not out_is_db: usage_err()
139
        else: map_paths = [None]
140 53 aaronmk
141 646 aaronmk
    def connect_db(db_config):
142 1901 aaronmk
        log('Connecting to '+sql.db_config_str(db_config))
143 2920 aaronmk
        return sql.connect(db_config, caching=cache_sql, autocommit=commit,
144 2916 aaronmk
            debug_temp=verbosity > 3 and commit, log_debug=log_debug)
145 646 aaronmk
146 1945 aaronmk
    if end != None: end_str = str(end-1) # end is one past the last #
147 1573 aaronmk
    else: end_str = 'end'
148 1901 aaronmk
    log('Processing input rows '+str(start)+'-'+end_str)
149 1573 aaronmk
150 1014 aaronmk
    ex_tracker = exc.ExPercentTracker(iter_text='row')
151
    profiler = profiling.ItersProfiler(start_now=True, iter_text='row')
152
153 1856 aaronmk
    # Parallel processing
154 2035 aaronmk
    pool = parallelproc.MultiProducerPool(cpus)
155 1901 aaronmk
    log('Using '+str(pool.process_ct)+' parallel CPUs')
156 1846 aaronmk
157 1014 aaronmk
    doc = xml_dom.create_doc()
158
    root = doc.documentElement
159 751 aaronmk
    out_is_xml_ref = [False]
160 1014 aaronmk
    in_label_ref = [None]
161
    def update_in_label():
162
        if in_label_ref[0] != None:
163
            xpath.get(root, '/_ignore/inLabel="'+in_label_ref[0]+'"', True)
164
    def prep_root():
165
        root.clear()
166
        update_in_label()
167
    prep_root()
168 751 aaronmk
169 1991 aaronmk
    # Define before the out_is_db section because it's used by by_col
170
    row_ins_ct_ref = [0]
171
172 2601 aaronmk
    if out_is_db:
173
        out_db = connect_db(out_db_config)
174 2602 aaronmk
        rel_funcs = set(sql.tables(out_db, schema_like='%',
175 2601 aaronmk
            table_like=r'\__%'))
176
177 838 aaronmk
    def process_input(root, row_ready, map_path):
178 512 aaronmk
        '''Inputs datasource to XML tree, mapping if needed'''
179
        # Load map header
180
        in_is_xpaths = True
181 751 aaronmk
        out_is_xpaths = True
182 512 aaronmk
        out_label = None
183
        if map_path != None:
184
            metadata = []
185
            mappings = []
186
            stream = open(map_path, 'rb')
187
            reader = csv.reader(stream)
188
            in_label, out_label = reader.next()[:2]
189 1402 aaronmk
190 512 aaronmk
            def split_col_name(name):
191 1402 aaronmk
                label, sep, root = name.partition(':')
192
                label, sep2, prefixes_str = label.partition('[')
193
                prefixes_str = strings.remove_suffix(']', prefixes_str)
194
                prefixes = strings.split(',', prefixes_str)
195
                return label, sep != '', root, prefixes
196 1198 aaronmk
                    # extract datasrc from "datasrc[data_format]"
197 1402 aaronmk
198 1705 aaronmk
            in_label, in_root, prefixes = maps.col_info(in_label)
199
            in_is_xpaths = in_root != None
200 1014 aaronmk
            in_label_ref[0] = in_label
201
            update_in_label()
202 1705 aaronmk
            out_label, out_root = maps.col_info(out_label)[:2]
203
            out_is_xpaths = out_root != None
204 1841 aaronmk
            if out_is_xpaths: has_types = out_root.find('/*s/') >= 0
205 1705 aaronmk
                # outer elements are types
206 1402 aaronmk
207 512 aaronmk
            for row in reader:
208
                in_, out = row[:2]
209 2029 aaronmk
                if out != '': mappings.append([in_, out_root+out])
210 1402 aaronmk
211 512 aaronmk
            stream.close()
212
213
            root.ownerDocument.documentElement.tagName = out_label
214
        in_is_xml = in_is_xpaths and not in_is_db
215 751 aaronmk
        out_is_xml_ref[0] = out_is_xpaths and not out_is_db
216 56 aaronmk
217 1897 aaronmk
        def process_rows(process_row, rows, rows_start=0):
218
            '''Processes input rows
219 838 aaronmk
            @param process_row(in_row, i)
220 1945 aaronmk
            @rows_start The (0-based) row # of the first row in rows. Set this
221
                only if the pre-start rows have already been skipped.
222 297 aaronmk
            '''
223 1897 aaronmk
            rows = iter(rows)
224
225
            if end != None: row_nums = xrange(rows_start, end)
226
            else: row_nums = itertools.count(rows_start)
227 2038 aaronmk
            i = -1
228 1897 aaronmk
            for i in row_nums:
229 1718 aaronmk
                try: row = rows.next()
230 2038 aaronmk
                except StopIteration:
231
                    i -= 1 # last row # didn't count
232
                    break # no more rows
233 1718 aaronmk
                if i < start: continue # not at start row yet
234
235 2881 aaronmk
                # Row # is interally 0-based, but 1-based to the user
236
                log('Processing input row #'+str(i+1), level=1.1)
237 838 aaronmk
                process_row(row, i)
238
                row_ready(i, row)
239 2033 aaronmk
            row_ct = i-start+1
240 982 aaronmk
            return row_ct
241 838 aaronmk
242 1898 aaronmk
        def map_rows(get_value, rows, **kw_args):
243 838 aaronmk
            '''Maps input rows
244
            @param get_value(in_, row):str
245
            '''
246 2030 aaronmk
            # Prevent collisions if multiple inputs mapping to same output
247
            outputs_idxs = dict()
248
            for i, mapping in enumerate(mappings):
249
                in_, out = mapping
250
                default = util.NamedTuple(count=1, first=i)
251
                idxs = outputs_idxs.setdefault(out, default)
252
                if idxs is not default: # key existed, so there was a collision
253
                    if idxs.count == 1: # first key does not yet have /_alt/#
254
                        mappings[idxs.first][1] += '/_alt/0'
255
                    mappings[i][1] += '/_alt/'+str(idxs.count)
256
                    idxs.count += 1
257
258 2026 aaronmk
            id_node = None
259
            if out_is_db:
260
                for i, mapping in enumerate(mappings):
261
                    in_, out = mapping
262
                    # All put_obj()s should return the same id_node
263
                    nodes, id_node = xpath.put_obj(root, out, '-1', has_types,
264
                        '$'+str(in_)) # value is placeholder that documents name
265 2032 aaronmk
                    mappings[i] = [in_, nodes]
266 3580 aaronmk
                if id_node == None:
267
                    warnings.warn(UserWarning('Map warning: No mappings'))
268 2119 aaronmk
                if debug: # only calc if debug
269 2026 aaronmk
                    log_debug('Put template:\n'+str(root))
270
271 838 aaronmk
            def process_row(row, i):
272 316 aaronmk
                row_id = str(i)
273 2032 aaronmk
                if id_node != None: xml_dom.set_value(id_node, row_id)
274 316 aaronmk
                for in_, out in mappings:
275 2027 aaronmk
                    log_debug('Getting '+str(in_))
276 316 aaronmk
                    value = metadata_value(in_)
277 2027 aaronmk
                    if value == None: value = cleanup(get_value(in_, row))
278
                    log_debug('Putting '+repr(value)+' to '+str(out))
279 2032 aaronmk
                    if out_is_db: # out is list of XML nodes
280
                        for node in out: xml_dom.set_value(node, value)
281
                    elif value != None: # out is XPath
282 1360 aaronmk
                        xpath.put_obj(root, out, row_id, has_types, value)
283 1898 aaronmk
            return process_rows(process_row, rows, **kw_args)
284 297 aaronmk
285 1898 aaronmk
        def map_table(col_names, rows, **kw_args):
286 1416 aaronmk
            col_names_ct = len(col_names)
287 1403 aaronmk
            col_idxs = util.list_flip(col_names)
288
289 2029 aaronmk
            # Resolve prefixes
290 2025 aaronmk
            mappings_orig = mappings[:] # save a copy
291
            mappings[:] = [] # empty existing elements
292
            for in_, out in mappings_orig:
293 1403 aaronmk
                if metadata_value(in_) == None:
294 2040 aaronmk
                    try: cols = get_with_prefix(col_idxs, prefixes, in_)
295 2025 aaronmk
                    except KeyError: pass
296 2040 aaronmk
                    else: mappings[len(mappings):] = [[db_xml.ColRef(*col), out]
297
                        for col in cols] # can't use += because that uses =
298 1403 aaronmk
299 2040 aaronmk
            def get_value(in_, row): return row.list[in_.idx]
300 1416 aaronmk
            def wrap_row(row):
301
                return util.ListDict(util.list_as_length(row, col_names_ct),
302
                    col_names, col_idxs) # handle CSV rows of different lengths
303 1403 aaronmk
304 1898 aaronmk
            return map_rows(get_value, util.WrapIter(wrap_row, rows), **kw_args)
305 1403 aaronmk
306 1700 aaronmk
        if in_is_db:
307 2809 aaronmk
            def on_error(e): ex_tracker.track(e)
308
309 3186 aaronmk
            if by_col: in_db = out_db
310
            else: in_db = connect_db(in_db_config)
311 126 aaronmk
312 1982 aaronmk
            # Get table and schema name
313
            schema = in_schema # modified, so can't have same name as outer var
314
            table = in_table # modified, so can't have same name as outer var
315
            if table == None:
316
                assert in_is_xpaths
317
                schema, sep, table = in_root.partition('.')
318
                if sep == '': # only the table name was specified
319
                    table = schema
320
                    schema = None
321 2313 aaronmk
            table = sql_gen.Table(table, schema)
322 1982 aaronmk
323 1991 aaronmk
            # Fetch rows
324
            if by_col: limit = 0 # only fetch column names
325
            else: limit = n
326 2313 aaronmk
            cur = sql.select(in_db, table, limit=limit, start=start,
327
                cacheable=False)
328 1991 aaronmk
            col_names = list(sql.col_names(cur))
329
330 1989 aaronmk
            if by_col:
331 2042 aaronmk
                map_table(col_names, []) # just create the template
332 2550 aaronmk
333 3617 aaronmk
                if start == 0 and n == None: # doing full re-import
334 2734 aaronmk
                    log('Clearing errors table')
335 3103 aaronmk
                    sql.truncate(in_db, sql_io.errors_table(in_db, table))
336 2734 aaronmk
337 2550 aaronmk
                # Strip XML functions not in the DB
338 3427 aaronmk
                def is_rel_func(name):
339
                    return (name in db_xml.put_table_special_funcs
340
                        or sql.function_exists(in_db, sql_gen.Function(name)))
341
                xml_func.process(root, is_rel_func=is_rel_func)
342 2103 aaronmk
                if debug: log_debug('Putting stripped:\n'+str(root))
343 2119 aaronmk
                    # only calc if debug
344 2550 aaronmk
345
                # Import rows
346 2422 aaronmk
                in_row_ct_ref = [0]
347 2928 aaronmk
                db_xml.put_table(in_db, root.firstChild, table, in_row_ct_ref,
348
                    row_ins_ct_ref, n, start, on_error)
349 2422 aaronmk
                row_ct = in_row_ct_ref[0]
350 1984 aaronmk
            else:
351
                # Use normal by-row method
352 1991 aaronmk
                row_ct = map_table(col_names, sql.rows(cur), rows_start=start)
353
                    # rows_start: pre-start rows have been skipped
354 3186 aaronmk
355
                in_db.db.close()
356 161 aaronmk
        elif in_is_xml:
357 2809 aaronmk
            stdin = streams.LineCountStream(sys.stdin)
358
            def on_error(e):
359
                exc.add_msg(e, term.emph('input line #:')+' '
360
                    +str(stdin.line_num))
361
                ex_tracker.track(e)
362
363 1715 aaronmk
            def get_rows(doc2rows):
364 1758 aaronmk
                return iters.flatten(itertools.imap(doc2rows,
365
                    xml_parse.docs_iter(stdin, on_error)))
366 1715 aaronmk
367 1714 aaronmk
            if map_path == None:
368 1715 aaronmk
                def doc2rows(in_xml_root):
369 1701 aaronmk
                    iter_ = xml_dom.NodeElemIter(in_xml_root)
370
                    util.skip(iter_, xml_dom.is_text) # skip metadata
371 1715 aaronmk
                    return iter_
372
373
                row_ct = process_rows(lambda row, i: root.appendChild(row),
374
                    get_rows(doc2rows))
375 1714 aaronmk
            else:
376 1715 aaronmk
                def doc2rows(in_xml_root):
377 1701 aaronmk
                    rows = xpath.get(in_xml_root, in_root, limit=end)
378 3581 aaronmk
                    if rows == []: warnings.warn(UserWarning('Map warning: '
379
                        'Root "'+in_root+'" not found in input'))
380 1714 aaronmk
                    return rows
381
382
                def get_value(in_, row):
383
                    in_ = './{'+(','.join(strings.with_prefixes(
384
                        ['']+prefixes, in_)))+'}' # also with no prefix
385
                    nodes = xpath.get(row, in_, allow_rooted=False)
386
                    if nodes != []: return xml_dom.value(nodes[0])
387
                    else: return None
388
389 1715 aaronmk
                row_ct = map_rows(get_value, get_rows(doc2rows))
390 56 aaronmk
        else: # input is CSV
391 133 aaronmk
            map_ = dict(mappings)
392 1389 aaronmk
            reader, col_names = csvs.reader_and_header(sys.stdin)
393 1403 aaronmk
            row_ct = map_table(col_names, reader)
394 982 aaronmk
395
        return row_ct
396 53 aaronmk
397 838 aaronmk
    def process_inputs(root, row_ready):
398 982 aaronmk
        row_ct = 0
399
        for map_path in map_paths:
400
            row_ct += process_input(root, row_ready, map_path)
401
        return row_ct
402 512 aaronmk
403 1886 aaronmk
    pool.share_vars(locals())
404 130 aaronmk
    if out_is_db:
405 53 aaronmk
        try:
406 947 aaronmk
            if redo: sql.empty_db(out_db)
407 1886 aaronmk
            pool.share_vars(locals())
408 449 aaronmk
409 838 aaronmk
            def row_ready(row_num, input_row):
410 2119 aaronmk
                row_str_ = [None]
411
                def row_str():
412
                    if row_str_[0] == None:
413
                        # Row # is interally 0-based, but 1-based to the user
414
                        row_str_[0] = (term.emph('row #:')+' '+str(row_num+1)
415
                            +'\n'+term.emph('input row:')+'\n'+str(input_row))
416
                        if verbose_errors: row_str_[0] += ('\n'
417
                            +term.emph('output row:')+'\n'+str(root))
418
                    return row_str_[0]
419
420
                if debug: log_debug(row_str()) # only calc if debug
421
422 452 aaronmk
                def on_error(e):
423 2119 aaronmk
                    exc.add_msg(e, row_str())
424 2046 aaronmk
                    ex_tracker.track(e, row_num, detail=verbose_errors)
425 1886 aaronmk
                pool.share_vars(locals())
426 452 aaronmk
427 2032 aaronmk
                row_root = root.cloneNode(True) # deep copy so don't modify root
428 3424 aaronmk
                xml_func.process(row_root, on_error, lambda f: f in rel_funcs,
429
                    out_db)
430 2032 aaronmk
                if not xml_dom.is_empty(row_root):
431
                    assert xml_dom.has_one_child(row_root)
432 442 aaronmk
                    try:
433 982 aaronmk
                        sql.with_savepoint(out_db,
434 2032 aaronmk
                            lambda: db_xml.put(out_db, row_root.firstChild,
435 1850 aaronmk
                                row_ins_ct_ref, on_error))
436 449 aaronmk
                    except sql.DatabaseErrors, e: on_error(e)
437
438 982 aaronmk
            row_ct = process_inputs(root, row_ready)
439
            sys.stdout.write('Inserted '+str(row_ins_ct_ref[0])+
440 460 aaronmk
                ' new rows into database\n')
441 1868 aaronmk
442
            # Consume asynchronous tasks
443
            pool.main_loop()
444 3117 aaronmk
        finally: out_db.close()
445 751 aaronmk
    else:
446 759 aaronmk
        def on_error(e): ex_tracker.track(e)
447 838 aaronmk
        def row_ready(row_num, input_row): pass
448 982 aaronmk
        row_ct = process_inputs(root, row_ready)
449 759 aaronmk
        xml_func.process(root, on_error)
450 751 aaronmk
        if out_is_xml_ref[0]:
451
            doc.writexml(sys.stdout, **xml_dom.prettyxml_config)
452
        else: # output is CSV
453
            raise NotImplementedError('CSV output not supported yet')
454 985 aaronmk
455 1868 aaronmk
    # Consume any asynchronous tasks not already consumed above
456 1862 aaronmk
    pool.main_loop()
457
458 982 aaronmk
    profiler.stop(row_ct)
459 3324 aaronmk
    if not by_col: ex_tracker.add_iters(row_ct) # only if errors are done by row
460 2439 aaronmk
    log('Processed '+str(row_ct)+' input rows')
461
    log(profiler.msg())
462
    log(ex_tracker.msg())
463 985 aaronmk
    ex_tracker.exit()
464 53 aaronmk
465 847 aaronmk
def main():
466
    try: main_()
467 1719 aaronmk
    except Parser.SyntaxError, e: raise SystemExit(str(e))
468 847 aaronmk
469 846 aaronmk
if __name__ == '__main__':
470 847 aaronmk
    profile_to = opts.get_env_var('profile_to', None)
471
    if profile_to != None:
472
        import cProfile
473 1584 aaronmk
        sys.stderr.write('Profiling to '+profile_to+'\n')
474 847 aaronmk
        cProfile.run(main.func_code, profile_to)
475
    else: main()