# Database import/export

import operator

import exc
import dicts
import sql
import sql_gen
import strings
import util

##### Data cleanup

def cleanup_table(db, table, cols):
    table = sql_gen.as_Table(table)
    cols = map(sql_gen.as_Col, cols)
    
    expr = ('nullif(nullif(trim(both from %s), '+db.esc_value('')+'), '
        +db.esc_value(r'\N')+')')
    changes = [(v, sql_gen.CustomCode(expr % v.to_str(db)))
        for v in cols]
    
    sql.update(db, table, changes, in_place=True)
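
# Usage sketch (illustrative only; the staging table and column names are
# hypothetical, not part of this module):
#     cleanup_table(db, 'plots_staging', ['plotname', 'notes'])
# This updates the listed columns in place, mapping both the empty string and
# the \N placeholder to NULL via the nested nullif(trim(...)) expression above.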

##### Error tracking

def track_data_error(db, errors_table, cols, value, error_code, error):
    '''
    @param errors_table If None, does nothing.
    '''
    if errors_table == None or cols == (): return
    
    for col in cols:
        try:
            sql.insert(db, errors_table, dict(column=col.name, value=value,
                error_code=error_code, error=error), recover=True,
                cacheable=True, log_level=4)
        except sql.DuplicateKeyException: pass
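
# Usage sketch (illustrative; the value and PostgreSQL error shown are made
# up): as called from put_table() below,
#     track_data_error(db, errors_table_, in_col.srcs, '2011-02-30',
#         '22008', 'date/time field value out of range')
# inserts one errors-table row per source column. Duplicate-key errors are
# swallowed, so re-reporting an already-recorded error is a no-op.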

def data_exception_handler(db, srcs=[], errors_table=None):
    '''Handles a data_exception by saving the error or converting it to a
    warning, and returning NULL.
    @param srcs The column names for the errors table
    @param errors_table None|sql_gen.Table
    '''
    save_errors = errors_table != None and srcs
    handler = ''
    if save_errors:
        errors_table_cols = map(sql_gen.Col,
            ['column', 'value', 'error_code', 'error'])
        col_names_query = sql.mk_select(db, sql_gen.NamedValues('c', None,
            [[c.name] for c in srcs]), order_by=None)
        insert_query = sql.mk_insert_select(db, errors_table, errors_table_cols,
            sql_gen.Values(errors_table_cols).to_str(db))+';\n'
        handler += '''\
-- Save error in errors table.
DECLARE
    error_code text := SQLSTATE;
    error text := SQLERRM;
BEGIN
    -- Insert the value and error for *each* source column.
'''+strings.indent(sql_gen.RowExcIgnore('text', col_names_query, insert_query,
    row_var=errors_table_cols[0]).to_str(db))+'''
END;
'''
    else:
        handler += '''\
RAISE WARNING '%', SQLERRM;
'''
    handler += '''\
RETURN NULL;
'''
    return sql_gen.ExcHandler('data_exception', handler)
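
# For reference, when save_errors is set the handler built above renders to
# roughly the following PL/pgSQL (a sketch only; the exact text comes from
# sql_gen.ExcHandler and sql_gen.RowExcIgnore):
#     WHEN data_exception THEN
#         -- Save error in errors table.
#         DECLARE
#             error_code text := SQLSTATE;
#             error text := SQLERRM;
#         BEGIN
#             -- one INSERT into the errors table per source column
#         END;
#         RETURN NULL;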

def cast(db, type_, col, errors_table=None):
    '''Casts an (unrenamed) column or value.
    If errors_table set and col has srcs, saves errors in errors_table (using
    col's srcs attr as source columns). Otherwise, converts errors to warnings.
    @param col str|sql_gen.Col|sql_gen.Literal
    @param errors_table None|sql_gen.Table|str
    '''
    col = sql_gen.as_Col(col)
    
    # Don't convert exceptions to warnings for user-supplied constants
    if isinstance(col, sql_gen.Literal): return sql_gen.Cast(type_, col)
    
    assert not isinstance(col, sql_gen.NamedCol)
    
    function_name = strings.first_word(type_)
    srcs = col.srcs
    save_errors = (errors_table != None and isinstance(col, sql_gen.Col)
        and col.srcs != ())
    if save_errors:
        srcs = map(sql_gen.to_name_only_col, col.srcs)
        function_name = str(sql_gen.FunctionCall(function_name, *srcs))
    function = db.TempFunction(function_name)
    
    # Create function definition
    modifiers = 'STRICT'
    if not save_errors: modifiers = 'IMMUTABLE '+modifiers
    handler = data_exception_handler(db, srcs, errors_table)
    body = sql_gen.CustomCode(handler.to_str(db, '''\
/* The explicit cast to the return type is needed to make the cast happen
inside the try block. (Implicit casts to the return type happen at the end
of the function, outside any block.) */
RETURN value::'''+type_+''';
'''))
    body.lang='plpgsql'
    def_ = sql_gen.FunctionDef(function, type_, body, ['value text'], modifiers)
    
    # Create function
    while True:
        try:
            sql.run_query(db, def_.to_str(db), recover=True, cacheable=True,
                log_ignore_excs=(sql.DuplicateException,))
            break # successful
        except sql.DuplicateException:
            function.name = sql.next_version(function.name)
            # try again with next version of name
    
    return sql_gen.FunctionCall(function, col)
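
# Usage sketch (illustrative; the column name is hypothetical):
#     date_col = cast(db, 'date', sql_gen.Col('datecollected', in_table),
#         errors_table_)
# returns a call to a temporary wrapper function, so rows whose values fail
# the cast produce NULL (and, if possible, an errors-table entry) instead of
# aborting the enclosing query.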

def cast_temp_col(db, type_, col, errors_table=None):
    '''Like cast(), but creates a new column with the cast values if the input
    is a column.
    @return The new column or cast value
    '''
    def cast_(col): return cast(db, type_, col, errors_table)
    
    try: col = sql_gen.underlying_col(col)
    except sql_gen.NoUnderlyingTableException: return sql_gen.wrap(cast_, col)
    
    table = col.table
    new_col = sql_gen.suffixed_col(col, '::'+strings.first_word(type_))
    expr = cast_(col)
    
    # Add column
    new_typed_col = sql_gen.TypedCol(new_col.name, type_)
    sql.add_col(db, table, new_typed_col, comment=repr(col)+'::'+type_)
    new_col.name = new_typed_col.name # propagate any renaming
    
    sql.update(db, table, [(new_col, expr)], in_place=True, recover=True)
    
    return new_col
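
# Note: for a column input, cast_temp_col() materializes the cast values in a
# new column named like '<col>::<type>' on the column's own table (see
# sql_gen.suffixed_col above), so later queries can reuse the cast values
# directly.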

def errors_table(db, table, if_exists=True):
    '''
    @param if_exists If set, returns None if the errors table doesn't exist
    @return None|sql_gen.Table
    '''
    table = sql_gen.as_Table(table)
    if table.srcs != (): table = table.srcs[0]
    
    errors_table = sql_gen.suffixed_table(table, '.errors')
    if if_exists and not sql.table_exists(db, errors_table): return None
    return errors_table
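
# For example (hypothetical table name), errors_table(db, 'plots') looks for a
# table named 'plots.errors' via sql_gen.suffixed_table() and, with
# if_exists=True, returns None if that table has not been created.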

##### Import

def put(db, table, row, pkey_=None, row_ct_ref=None):
    '''Recovers from errors.
    Only works under PostgreSQL (uses INSERT RETURNING).
    '''
    row = sql_gen.ColDict(db, table, row)
    if pkey_ == None: pkey_ = sql.pkey(db, table, recover=True)
    
    try:
        cur = sql.insert(db, table, row, pkey_, recover=True, log_level=3.5)
        if row_ct_ref != None and cur.rowcount >= 0:
            row_ct_ref[0] += cur.rowcount
        return sql.value(cur)
    except sql.DuplicateKeyException, e:
        row = sql_gen.ColDict(db, table,
            util.dict_subset_right_join(row, e.cols))
        return sql.value(sql.select(db, table, [pkey_], row, recover=True,
            log_level=3.5))
    except sql.NullValueException: return None
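
# Usage sketch (illustrative; the table, column, and value are hypothetical):
#     row_ct = [0]
#     party_id = put(db, 'party', {'organizationname': 'NYBG'},
#         row_ct_ref=row_ct)
# On a duplicate key, put() falls back to selecting the existing row's pkey
# using just the columns of the violated unique constraint (e.cols).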

def get(db, table, row, pkey, row_ct_ref=None, create=False):
    '''Recovers from errors'''
    try:
        return sql.value(sql.select(db, table, [pkey], row, limit=1,
            recover=True))
    except StopIteration:
        if not create: raise
        return put(db, table, row, pkey, row_ct_ref) # insert new row

def is_func_result(col):
    return col.table.name.find('(') >= 0 and col.name == 'result'

def into_table_name(out_table, in_tables0, mapping, is_func):
    def in_col_str(in_col):
        in_col = sql_gen.remove_col_rename(in_col)
        if isinstance(in_col, sql_gen.Col):
            table = in_col.table
            if table == in_tables0:
                in_col = sql_gen.to_name_only_col(in_col)
            elif is_func_result(in_col): in_col = table # omit col name
        return str(in_col)
    
    str_ = str(out_table)
    if is_func:
        str_ += '('
        
        try: value_in_col = mapping['value']
        except KeyError:
            str_ += ', '.join((str(k)+'='+in_col_str(v)
                for k, v in mapping.iteritems()))
        else: str_ += in_col_str(value_in_col)
        
        str_ += ')'
    else:
        out_col = 'rank'
        try: in_col = mapping[out_col]
        except KeyError: str_ += '_pkeys'
        else: # has a rank column, so hierarchical
            str_ += '['+str(out_col)+'='+in_col_str(in_col)+']'
    return str_
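
# Examples of generated names (illustrative; the table and column names are
# hypothetical): a function out_table yields something like 'func(inputCol)'
# or 'func(arg=inputCol, ...)', a plain table yields 'table_pkeys', and a
# table with a 'rank' mapping yields 'table[rank=inputCol]'.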

def put_table(db, out_table, in_tables, mapping, row_ct_ref=None, into=None,
    default=None, is_func=False, on_error=exc.raise_):
    '''Recovers from errors.
    Only works under PostgreSQL (uses INSERT RETURNING).
    IMPORTANT: Must be run at the *beginning* of a transaction.
    @param in_tables The main input table to select from, followed by a list of
        tables to join with it using the main input table's pkey
    @param mapping dict(out_table_col=in_table_col, ...)
        * out_table_col: str (*not* sql_gen.Col)
        * in_table_col: sql_gen.Col|literal-value
    @param into The table to contain the output and input pkeys.
        Defaults to `out_table.name+'_pkeys'`.
    @param default The *output* column to use as the pkey for missing rows.
        If this output column does not exist in the mapping, uses None.
    @param is_func Whether out_table is the name of a SQL function, not a table
    @return sql_gen.Col Where the output pkeys are made available
    '''
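    # Example call (illustrative; the output table, columns, and mapping are
    # hypothetical):
    #     pkeys_col = put_table(db, 'plot', [in_table],
    #         {'plotname': sql_gen.Col('plot_name', in_table), 'area_ha': 1.5})
    # The returned sql_gen.Col points into the '<out_table>_pkeys' temp table,
    # which pairs each input pkey with its output pkey.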
    out_table = sql_gen.as_Table(out_table)
    
    def log_debug(msg): db.log_debug(msg, level=1.5)
    def col_ustr(str_):
        return strings.repr_no_u(sql_gen.remove_col_rename(str_))
    
    log_debug('********** New iteration **********')
    log_debug('Inserting these input columns into '+strings.as_tt(
        out_table.to_str(db))+':\n'+strings.as_table(mapping, ustr=col_ustr))
    
    is_function = sql.function_exists(db, out_table)
    
    if is_function: out_pkey = 'result'
    else: out_pkey = sql.pkey(db, out_table, recover=True)
    out_pkey_col = sql_gen.as_Col(out_pkey, out_table)
    
    in_tables_ = in_tables[:] # don't modify input!
    try: in_tables0 = in_tables_.pop(0) # first table is separate
    except IndexError: in_tables0 = None
    else:
        in_pkey = sql.pkey(db, in_tables0, recover=True)
        in_pkey_col = sql_gen.as_Col(in_pkey, in_tables0)
    
    # Determine whether the only-literal-values optimization can be used
    is_literals = not reduce(operator.or_, map(sql_gen.is_table_col,
        mapping.values()), False)
    is_literals_or_function = is_literals or is_function
    
    if in_tables0 == None: errors_table_ = None
    else: errors_table_ = errors_table(db, in_tables0)
    
    # Create input joins from list of input tables
    input_joins = [in_tables0]+[sql_gen.Join(v,
        {in_pkey: sql_gen.join_same_not_null}) for v in in_tables_]
    
    if mapping == {} and not is_function: # need >= one column for INSERT SELECT
        mapping = {out_pkey: None} # ColDict will replace with default value
    
    if not is_literals:
        if into == None:
            into = into_table_name(out_table, in_tables0, mapping, is_func)
        into = sql_gen.as_Table(into)
        
        # Set column sources
        in_cols = filter(sql_gen.is_table_col, mapping.values())
        for col in in_cols:
            if col.table == in_tables0: col.set_srcs(sql_gen.src_self)
        
        log_debug('Joining together input tables into temp table')
        # Place in a new table so we don't modify the input, and for speed
        in_table = sql_gen.Table('in')
        mapping = dicts.join(mapping, sql.flatten(db, in_table, input_joins,
            in_cols, preserve=[in_pkey_col]))
        input_joins = [in_table]
        db.log_debug('Temp table: '+strings.as_tt(in_table.to_str(db)), level=2)
    
    mapping = sql_gen.ColDict(db, out_table, mapping)
        # after applying dicts.join() because that returns a plain dict
    
    # Resolve default value column
    if default != None:
        try: default = mapping[default]
        except KeyError:
            db.log_debug('Default value column '
                +strings.as_tt(strings.repr_no_u(default))
                +' does not exist in mapping, falling back to None', level=2.1)
            default = None
    
    # Save default values for all rows since in_table may have rows deleted
    if is_literals: pass
    elif is_function: full_in_table = in_table
    else:
        full_in_table = sql_gen.suffixed_table(in_table, '_full')
        full_in_table_cols = [in_pkey_col]
        if default != None:
            full_in_table_cols.append(default)
            default = sql_gen.with_table(default, full_in_table)
        sql.run_query_into(db, sql.mk_select(db, in_table, full_in_table_cols,
            order_by=None), into=full_in_table, add_pkey_=True)
    
    if not is_literals:
        pkeys_names = [in_pkey, out_pkey]
        pkeys_cols = [in_pkey_col, out_pkey_col]
    
    pkeys_table_exists_ref = [False]
    def insert_into_pkeys(joins, cols, **kw_args):
        query = sql.mk_select(db, joins, cols, order_by=None)
        if pkeys_table_exists_ref[0]:
            sql.insert_select(db, into, pkeys_names, query, **kw_args)
        else:
            sql.run_query_into(db, query, into=into, add_pkey_=True, **kw_args)
            pkeys_table_exists_ref[0] = True
    
    limit_ref = [None]
    def mk_main_select(joins, cols):
        return sql.mk_select(db, joins, cols, limit=limit_ref[0], order_by=None)
    
    if is_literals: insert_in_table = None
    else:
        insert_in_table = in_table
        insert_in_tables = [insert_in_table]
    join_cols = sql_gen.ColDict(db, out_table)
    
    exc_strs = set()
    def log_exc(e):
        e_str = exc.str_(e, first_line_only=True)
        log_debug('Caught exception: '+e_str)
        assert e_str not in exc_strs # avoid infinite loops
        exc_strs.add(e_str)
    
    def remove_all_rows():
        log_debug('Ignoring all rows')
        limit_ref[0] = 0 # just create an empty pkeys table
    
    def ignore_cond(cond, e):
        out_table_cols = sql_gen.ColDict(db, out_table)
        out_table_cols.update(util.dict_subset_right_join({},
            sql.table_cols(db, out_table)))
        
        in_cols = []
        cond = sql.map_expr(db, cond, mapping, in_cols)
        cond = sql.map_expr(db, cond, out_table_cols)
        
        track_data_error(db, errors_table_, sql_gen.cols_srcs(in_cols), None,
            e.cause.pgcode,
            strings.ensure_newl(e.cause.pgerror)+'condition: '+cond)
        
        not_cond = sql_gen.NotCond(sql_gen.CustomCode(cond))
        log_debug('Ignoring rows where '+strings.as_tt(not_cond.to_str(db)))
        sql.delete(db, insert_in_table, not_cond)
    
    not_null_cols = set()
    def ignore(in_col, value, e):
        in_col = sql_gen.with_table(in_col, insert_in_table)
        
        track_data_error(db, errors_table_, in_col.srcs, value,
            e.cause.pgcode, e.cause.pgerror)
        log_debug('Ignoring rows with '+strings.as_tt(repr(in_col))+' = '
            +strings.as_tt(repr(value)))
        
        sql.add_index(db, in_col, insert_in_table) # enable fast filtering
        if value != None and in_col not in not_null_cols:
            # Try just mapping the value to NULL
            sql.update(db, insert_in_table, [(in_col, None)],
                sql_gen.ColValueCond(in_col, value))
        else:
            sql.delete(db, insert_in_table, sql_gen.ColValueCond(in_col, value))
            if value == None: not_null_cols.add(in_col)
    
    if not is_literals:
        def insert_pkeys_table(which):
            return sql_gen.Table(sql_gen.concat(in_table.name,
                '_insert_'+which+'_pkeys'))
        insert_out_pkeys = insert_pkeys_table('out')
        insert_in_pkeys = insert_pkeys_table('in')
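
    # The loop below retries the INSERT SELECT (or function call), tightening
    # the input after each failure:
    # * MissingCastException: cast the offending input column to the required
    #   type via cast_temp_col(), recording uncastable values as errors
    # * DuplicateKeyException: switch to ignoring existing rows, joining on the
    #   violated unique constraint's columns and uniquifying the input
    # * NullValueException/CheckException/InvalidValueException: filter out the
    #   offending rows, recording them in the errors table
    # * other DatabaseErrors: report via on_error() and ignore all rows
    # log_exc() asserts that the same error is never seen twice, so the loop
    # cannot retry forever.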

    # Do inserts and selects
    while True:
        if limit_ref[0] == 0: # special case
            if is_literals: return None
            log_debug('Creating an empty pkeys table')
            cur = sql.run_query_into(db, sql.mk_select(db, out_table,
                [out_pkey], order_by=None, limit=0), into=insert_out_pkeys)
            break # don't do main case
        
        has_joins = join_cols != {}
        
        log_debug('Trying to insert new rows')
        
        # Prepare to insert new rows
        if is_function:
            log_debug('Calling function on input rows')
            args = dict(((k.name, v) for k, v in mapping.iteritems()))
            func_call = sql_gen.NamedCol(out_pkey,
                sql_gen.FunctionCall(out_table, **args))
        else:
            insert_args = dict(recover=True, cacheable=False)
            if has_joins:
                insert_args.update(dict(ignore=True))
            else:
                insert_args.update(dict(returning=out_pkey))
                if not is_literals:
                    insert_args.update(dict(into=insert_out_pkeys))
            main_select = mk_main_select([insert_in_table], [sql_gen.with_table(
                c, insert_in_table) for c in mapping.values()])
        
        try:
            cur = None
            if is_function:
                if is_literals: cur = sql.select(db, fields=[func_call])
                else:
                    insert_into_pkeys(input_joins, [in_pkey_col, func_call],
                        recover=True)
            else:
                cur = sql.insert_select(db, out_table, mapping.keys(),
                    main_select, **insert_args)
            break # insert successful
        except sql.MissingCastException, e:
            log_exc(e)
            
            out_col = e.col
            type_ = e.type
            
            log_debug('Casting '+strings.as_tt(out_col)+' input to '
                +strings.as_tt(type_))
            in_col = mapping[out_col]
            while True:
                try:
                    mapping[out_col] = cast_temp_col(db, type_, in_col,
                        errors_table_)
                    break # cast successful
                except sql.InvalidValueException, e:
                    log_exc(e)
                    
                    ignore(in_col, e.value, e)
        except sql.DuplicateKeyException, e:
            log_exc(e)
            
            # Different rows violating different unique constraints not
            # supported
            assert not join_cols
            
            join_cols.update(util.dict_subset_right_join(mapping, e.cols))
            log_debug('Ignoring existing rows, comparing on these columns:\n'
                +strings.as_inline_table(join_cols, ustr=col_ustr))
            
            if is_literals:
                return sql.value(sql.select(db, out_table, [out_pkey_col],
                    mapping, order_by=None))
            
            # Uniquify input table to avoid internal duplicate keys
            insert_in_table = sql.distinct_table(db, insert_in_table,
                join_cols.values())
            insert_in_tables.append(insert_in_table)
        except sql.NullValueException, e:
            log_exc(e)
            
            out_col, = e.cols
            try: in_col = mapping[out_col]
            except KeyError:
                msg = 'Missing mapping for NOT NULL column '+out_col
                log_debug(msg)
                if default == None: on_error(SyntaxError(msg)) # required col
                remove_all_rows()
            else: ignore(in_col, None, e)
        except sql.CheckException, e:
            log_exc(e)
            
            ignore_cond(e.cond, e)
        except sql.InvalidValueException, e:
            log_exc(e)
            
            for in_col in mapping.values(): ignore(in_col, e.value, e)
        except sql.DatabaseErrors, e:
            log_exc(e)
            
            log_debug('No handler for exception')
            on_error(e)
            remove_all_rows()
        # after exception handled, rerun loop with additional constraints
    
    if cur != None and row_ct_ref != None and cur.rowcount >= 0:
        row_ct_ref[0] += cur.rowcount
    
    if is_literals_or_function: pass # pkeys table already created
    elif has_joins:
        select_joins = input_joins+[sql_gen.Join(out_table, join_cols)]
        log_debug('Getting output table pkeys of existing/inserted rows')
        insert_into_pkeys(select_joins, pkeys_cols)
    else:
        sql.add_row_num(db, insert_out_pkeys) # for joining with input pkeys
        
        log_debug('Getting input table pkeys of inserted rows')
        # Note that mk_main_select() does not use ORDER BY. Instead, assume that
        # since the SELECT query is identical to the one used in INSERT SELECT,
        # its rows will be retrieved in the same order.
        sql.run_query_into(db, mk_main_select(input_joins, [in_pkey]),
            into=insert_in_pkeys)
        sql.add_row_num(db, insert_in_pkeys) # for joining with output pkeys
        
        assert sql.table_row_count(db, insert_out_pkeys) == sql.table_row_count(
            db, insert_in_pkeys)
        
        log_debug('Combining output and input pkeys in inserted order')
        pkey_joins = [insert_in_pkeys, sql_gen.Join(insert_out_pkeys,
            {sql.row_num_col: sql_gen.join_same_not_null})]
        insert_into_pkeys(pkey_joins, pkeys_names)
        
        sql.empty_temp(db, [insert_out_pkeys, insert_in_pkeys])
    
    if not is_literals_or_function:
        log_debug('Setting pkeys of missing rows to '
            +strings.as_tt(repr(default)))
        missing_rows_joins = [full_in_table, sql_gen.Join(into,
            {in_pkey: sql_gen.join_same_not_null}, sql_gen.filter_out)]
            # must use join_same_not_null or query will take forever
        insert_into_pkeys(missing_rows_joins,
            [sql_gen.Col(in_pkey, full_in_table),
            sql_gen.NamedCol(out_pkey, default)])
    # otherwise, there is already an entry for every row
    
    if is_literals: return sql.value(cur)
    else:
        assert (sql.table_row_count(db, into)
            == sql.table_row_count(db, full_in_table))
        
        sql.empty_temp(db, insert_in_tables+[full_in_table])
        
        srcs = []
        if is_func: srcs = sql_gen.cols_srcs(in_cols)
        return sql_gen.Col(out_pkey, into, srcs)
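
# When the input is not all literals, the pkeys table filled by put_table()
# ends up with one row per row of the full input table: the input pkey paired
# either with the matching/inserted output pkey or with `default` for rows
# that could not be inserted. Callers can join it back to the input table on
# the input pkey.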