Project

General

Profile

1
# Database import/export
2

    
3
import operator
4

    
5
import exc
6
import dicts
7
import sql
8
import sql_gen
9
import strings
10
import util
11

    
12
##### Data cleanup
13

    
14
def cleanup_table(db, table, cols):
    '''Normalizes the given columns of a table in place.
    Each column is replaced by its trimmed value, with the empty string and
    the literal \\N both turned into NULL.
    @param table str|sql_gen.Table (passed through sql_gen.as_Table())
    @param cols Columns to clean up (each passed through sql_gen.as_Col())
    '''
    table = sql_gen.as_Table(table)
    cols = [sql_gen.as_Col(c) for c in cols]
    
    # SQL template; the column expression is substituted for the %s
    empty_str = db.esc_value('')
    null_marker = db.esc_value(r'\N')
    template = ('nullif(nullif(trim(both from %s), '+empty_str+'), '
        +null_marker+')')
    
    changes = []
    for col in cols:
        cleaned = sql_gen.CustomCode(template % col.to_str(db))
        changes.append((col, cleaned))
    
    sql.update(db, table, changes, in_place=True)
24

    
25
##### Error tracking
26

    
27
def track_data_error(db, errors_table, cols, value, error_code, error):
    '''Inserts one row into the errors table for each of the given columns.
    Rows that raise sql.DuplicateKeyException are silently skipped.
    @param errors_table If None, does nothing.
    @param cols The columns whose names are recorded. If (), does nothing.
    '''
    if errors_table == None: return
    if cols == (): return
    
    for col in cols:
        entry = dict(column=col.name, value=value, error_code=error_code,
            error=error)
        try:
            sql.insert(db, errors_table, entry, recover=True, cacheable=True,
                log_level=4)
        except sql.DuplicateKeyException: continue # already recorded
39

    
40
def cast(db, type_, col, errors_table=None):
    '''Casts an (unrenamed) column or value.
    If errors_table set and col has srcs, saves errors in errors_table (using
    col's srcs attr as source columns). Otherwise, converts errors to warnings.
    
    Literals are cast directly with sql_gen.Cast. Anything else is cast by a
    plpgsql temp function created here, which returns NULL when the cast raises
    a data_exception. If creating the function raises sql.DuplicateException,
    the function name is advanced with sql.next_version() and creation is
    retried.
    @param type_ str The SQL type name (inserted verbatim into the query)
    @param col str|sql_gen.Col|sql_gen.Literal
    @param errors_table None|sql_gen.Table|str
    @return sql_gen.Cast for a literal, otherwise a sql_gen.FunctionCall of the
        created function on col
    '''
    col = sql_gen.as_Col(col)
    
    # Don't convert exceptions to warnings for user-supplied constants
    if isinstance(col, sql_gen.Literal): return sql_gen.Cast(type_, col)
    
    assert not isinstance(col, sql_gen.NamedCol)
    
    save_errors = (errors_table is not None and isinstance(col, sql_gen.Col)
        and col.srcs != ())
    function_name = type_
    if save_errors:
        errors_table = sql_gen.as_Table(errors_table)
        
        # A real list, because srcs is consumed twice (here and in the query)
        srcs = [sql_gen.to_name_only_col(c) for c in col.srcs]
        function_name = str(sql_gen.FunctionCall(function_name, *srcs))
    function = db.TempFunction(function_name)
    
    while True:
        # Create function definition
        query = '''\
CREATE FUNCTION '''+function.to_str(db)+'''(value text)
RETURNS '''+type_+'''
LANGUAGE plpgsql
'''
        # The error-saving variant inserts rows, so it is not marked IMMUTABLE
        if not save_errors: query += 'IMMUTABLE '
        query += '''\
STRICT
AS $$
BEGIN
    /* The explicit cast to the return type is needed to make the cast happen
    inside the try block. (Implicit casts to the return type happen at the end
    of the function, outside any block.) */
    RETURN value::'''+type_+''';
EXCEPTION
    WHEN data_exception THEN
'''
        if save_errors:
            # A real list, because it is passed to two separate calls below
            errors_table_cols = [sql_gen.Col(name) for name in
                ['column', 'value', 'error_code', 'error']]
            query += '''\
        -- Save error in errors table.
        DECLARE
            error_code text := SQLSTATE;
            error text := SQLERRM;
            "column" text;
        BEGIN
            -- Insert the value and error for *each* source column.
            FOR "column" IN
'''+sql.mk_select(db, sql_gen.NamedValues('c', None, [[c.name] for c in srcs]),
    order_by=None)+'''
            LOOP
                BEGIN
'''+sql.mk_insert_select(db, errors_table, errors_table_cols,
    sql_gen.Values(errors_table_cols).to_str(db))+''';
                EXCEPTION
                    WHEN unique_violation THEN NULL; -- continue to next row
                END;
            END LOOP;
        END;
'''
        else:
            query += '''\
        RAISE WARNING '%', SQLERRM;
'''
        query += '''\
        RETURN NULL;
END;
$$;
'''
        
        # Create function
        try:
            sql.run_query(db, query, recover=True, cacheable=True,
                log_ignore_excs=(sql.DuplicateException,))
            break # successful
        except sql.DuplicateException:
            function.name = sql.next_version(function.name)
            # try again with next version of name
    
    return sql_gen.FunctionCall(function, col)
127

    
128
def cast_temp_col(db, type_, col, errors_table=None):
    '''Like cast(), but when the input has an underlying table column, stores
    the cast values in a newly added column of that table.
    If sql_gen.underlying_col() raises NoUnderlyingTableException, the input is
    instead cast via sql_gen.wrap().
    @return The new column or cast value
    '''
    def do_cast(value): return cast(db, type_, value, errors_table)
    
    try: src_col = sql_gen.underlying_col(col)
    except sql_gen.NoUnderlyingTableException:
        return sql_gen.wrap(do_cast, col)
    
    src_table = src_col.table
    suffix = '::'+strings.first_word(type_)
    cast_col = sql_gen.suffixed_col(src_col, suffix)
    cast_expr = do_cast(src_col)
    
    # Add the destination column
    typed_col = sql_gen.TypedCol(cast_col.name, type_)
    sql.add_col(db, src_table, typed_col, comment=repr(src_col)+'::'+type_)
    cast_col.name = typed_col.name # propagate any renaming
    
    # Fill it with the cast values
    sql.update(db, src_table, [(cast_col, cast_expr)], in_place=True,
        recover=True)
    
    return cast_col
150

    
151
def errors_table(db, table, if_exists=True):
    '''Gets the errors table for a table, named with a '.errors' suffix.
    If the table has srcs, the first src is used as the base table instead.
    @param if_exists If set, returns None if the errors table doesn't exist
    @return None|sql_gen.Table
    '''
    base_table = sql_gen.as_Table(table)
    if base_table.srcs != (): base_table = base_table.srcs[0]
    
    result = sql_gen.suffixed_table(base_table, '.errors')
    if not if_exists: return result # caller doesn't care whether it exists
    if sql.table_exists(db, result): return result
    return None
162

    
163
##### Import
164

    
165
def put(db, table, row, pkey_=None, row_ct_ref=None):
    '''Inserts a row and returns the value of the resulting cursor.
    Recovers from errors.
    Only works under PostgreSQL (uses INSERT RETURNING).
    @param row The row to insert (wrapped in a sql_gen.ColDict)
    @param pkey_ The pkey to return. If None, looked up using sql.pkey().
    @param row_ct_ref If set, a list whose first element is incremented by the
        insert's rowcount (when that is nonnegative)
    @return The value from the insert. On a duplicate key, the pkey_ value
        selected using only the row's columns in the exception's cols. On a
        NULL value violation, None.
    '''
    row = sql_gen.ColDict(db, table, row)
    if pkey_ is None: pkey_ = sql.pkey(db, table, recover=True)
    
    try:
        cur = sql.insert(db, table, row, pkey_, recover=True, log_level=3.5)
        if row_ct_ref is not None and cur.rowcount >= 0:
            row_ct_ref[0] += cur.rowcount
        return sql.value(cur)
    except sql.DuplicateKeyException as e:
        # Row already exists, so look it up by the violated key's columns
        row = sql_gen.ColDict(db, table,
            util.dict_subset_right_join(row, e.cols))
        return sql.value(sql.select(db, table, [pkey_], row, recover=True,
            log_level=3.5))
    except sql.NullValueException: return None
183

    
184
def get(db, table, row, pkey, row_ct_ref=None, create=False):
    '''Selects the pkey of the first row matching the given row.
    Recovers from errors.
    @param create If set, a StopIteration from the lookup causes the row to be
        inserted with put() instead of being re-raised
    '''
    try:
        cur = sql.select(db, table, [pkey], row, limit=1, recover=True)
        return sql.value(cur)
    except StopIteration:
        if create:
            return put(db, table, row, pkey, row_ct_ref) # insert new row
        raise
192

    
193
def is_func_result(col):
    '''Whether col is named 'result' and its table's name contains a "("'''
    table_name = col.table.name
    return '(' in table_name and col.name == 'result'
195

    
196
def into_table_name(out_table, in_tables0, mapping, is_func):
    '''Builds the name of the table that holds the pkeys for out_table.
    * Function: out_table(value-input) if the mapping has a 'value' entry,
      otherwise out_table(key=input, ...) for every mapping entry
    * Table with a 'rank' mapping entry: out_table[rank=input]
    * Any other table: out_table_pkeys
    @param in_tables0 The main input table; its columns are shown by name only
    @return str
    '''
    def describe(in_col):
        in_col = sql_gen.remove_col_rename(in_col)
        if isinstance(in_col, sql_gen.Col):
            owner = in_col.table
            if owner == in_tables0:
                in_col = sql_gen.to_name_only_col(in_col)
            elif is_func_result(in_col):
                in_col = owner # omit col name
        return str(in_col)
    
    name = str(out_table)
    
    if is_func:
        try: value_in_col = mapping['value']
        except KeyError:
            args = ', '.join((str(k)+'='+describe(v)
                for k, v in mapping.iteritems()))
        else: args = describe(value_in_col)
        return name+'('+args+')'
    
    rank_col = 'rank'
    try: rank_in_col = mapping[rank_col]
    except KeyError: return name+'_pkeys'
    # has a rank column, so hierarchical
    return name+'['+str(rank_col)+'='+describe(rank_in_col)+']'
224

    
225
def put_table(db, out_table, in_tables, mapping, row_ct_ref=None, into=None,
    default=None, is_func=False, on_error=exc.raise_):
    '''Inserts the mapped input columns into out_table (or calls the function
    out_table on them) and makes the resulting pkeys available.
    Recovers from errors.
    Only works under PostgreSQL (uses INSERT RETURNING).
    IMPORTANT: Must be run at the *beginning* of a transaction.
    @param in_tables The main input table to select from, followed by a list of
        tables to join with it using the main input table's pkey
    @param mapping dict(out_table_col=in_table_col, ...)
        * out_table_col: str (*not* sql_gen.Col)
        * in_table_col: sql_gen.Col|literal-value
    @param row_ct_ref If set, a list whose first element is incremented by the
        rowcount of the last cursor (when that is nonnegative)
    @param into The table to contain the output and input pkeys.
        Defaults to `out_table.name+'_pkeys'`.
    @param default The *output* column to use as the pkey for missing rows.
        If this output column does not exist in the mapping, uses None.
    @param is_func Whether out_table is the name of a SQL function, not a table
    @param on_error Called with the exception when a NOT NULL column has no
        mapping and no default, or when a database error has no handler
    @return sql_gen.Col Where the output pkeys are made available.
        If the mapping contains only literal values, instead returns the pkey
        value itself (or None if all rows were ignored).
    '''
    out_table = sql_gen.as_Table(out_table)
    
    def log_debug(msg): db.log_debug(msg, level=1.5)
    def col_ustr(str_):
        return strings.repr_no_u(sql_gen.remove_col_rename(str_))
    
    log_debug('********** New iteration **********')
    log_debug('Inserting these input columns into '+strings.as_tt(
        out_table.to_str(db))+':\n'+strings.as_table(mapping, ustr=col_ustr))
    
    is_function = sql.function_exists(db, out_table)
    
    if is_function: out_pkey = 'result'
    else: out_pkey = sql.pkey(db, out_table, recover=True)
    out_pkey_col = sql_gen.as_Col(out_pkey, out_table)
    
    in_tables_ = in_tables[:] # don't modify input!
    try: in_tables0 = in_tables_.pop(0) # first table is separate
    except IndexError: in_tables0 = None
    else:
        in_pkey = sql.pkey(db, in_tables0, recover=True)
        in_pkey_col = sql_gen.as_Col(in_pkey, in_tables0)
    
    # Determine if can use optimization for only literal values
    is_literals = not any(sql_gen.is_table_col(v) for v in mapping.values())
    is_literals_or_function = is_literals or is_function
    
    if in_tables0 is None: errors_table_ = None
    else: errors_table_ = errors_table(db, in_tables0)
    
    # Create input joins from list of input tables
    input_joins = [in_tables0]+[sql_gen.Join(v,
        {in_pkey: sql_gen.join_same_not_null}) for v in in_tables_]
    
    if mapping == {} and not is_function: # need >= one column for INSERT SELECT
        mapping = {out_pkey: None} # ColDict will replace with default value
    
    if not is_literals:
        if into is None:
            into = into_table_name(out_table, in_tables0, mapping, is_func)
        into = sql_gen.as_Table(into)
        
        # Set column sources
        # (a real list, because in_cols is reused several times below)
        in_cols = [v for v in mapping.values() if sql_gen.is_table_col(v)]
        for col in in_cols:
            if col.table == in_tables0: col.set_srcs(sql_gen.src_self)
        
        log_debug('Joining together input tables into temp table')
        # Place in new table so don't modify input and for speed
        in_table = sql_gen.Table('in')
        mapping = dicts.join(mapping, sql.flatten(db, in_table, input_joins,
            in_cols, preserve=[in_pkey_col]))
        input_joins = [in_table]
        db.log_debug('Temp table: '+strings.as_tt(in_table.to_str(db)), level=2)
    
    mapping = sql_gen.ColDict(db, out_table, mapping)
        # after applying dicts.join() because that returns a plain dict
    
    # Resolve default value column
    if default is not None:
        try: default = mapping[default]
        except KeyError:
            db.log_debug('Default value column '
                +strings.as_tt(strings.repr_no_u(default))
                +' does not exist in mapping, falling back to None', level=2.1)
            default = None
    
    # Save default values for all rows since in_table may have rows deleted
    if is_literals: pass
    elif is_function: full_in_table = in_table
    else:
        full_in_table = sql_gen.suffixed_table(in_table, '_full')
        full_in_table_cols = [in_pkey_col]
        if default is not None:
            full_in_table_cols.append(default)
            default = sql_gen.with_table(default, full_in_table)
        sql.run_query_into(db, sql.mk_select(db, in_table, full_in_table_cols,
            order_by=None), into=full_in_table, add_pkey_=True)
    
    if not is_literals:
        pkeys_names = [in_pkey, out_pkey]
        pkeys_cols = [in_pkey_col, out_pkey_col]
    
    # One-element lists serve as mutable cells that the nested functions update
    pkeys_table_exists_ref = [False]
    def insert_into_pkeys(joins, cols, **kw_args):
        '''Creates the pkeys table on first use, then appends to it'''
        query = sql.mk_select(db, joins, cols, order_by=None)
        if pkeys_table_exists_ref[0]:
            sql.insert_select(db, into, pkeys_names, query, **kw_args)
        else:
            sql.run_query_into(db, query, into=into, add_pkey_=True, **kw_args)
            pkeys_table_exists_ref[0] = True
    
    limit_ref = [None]
    def mk_main_select(joins, cols):
        return sql.mk_select(db, joins, cols, limit=limit_ref[0], order_by=None)
    
    if is_literals: insert_in_table = None
    else:
        insert_in_table = in_table
        insert_in_tables = [insert_in_table]
    join_cols = sql_gen.ColDict(db, out_table)
    
    exc_strs = set()
    def log_exc(e):
        e_str = exc.str_(e, first_line_only=True)
        log_debug('Caught exception: '+e_str)
        assert e_str not in exc_strs # avoid infinite loops
        exc_strs.add(e_str)
    
    def remove_all_rows():
        log_debug('Ignoring all rows')
        limit_ref[0] = 0 # just create an empty pkeys table
    
    def ignore_cond(cond, e):
        '''Records the failed condition and deletes the rows violating it'''
        out_table_cols = sql_gen.ColDict(db, out_table)
        out_table_cols.update(util.dict_subset_right_join({},
            sql.table_cols(db, out_table)))
        
        in_cols = []
        cond = sql.map_expr(db, cond, mapping, in_cols)
        cond = sql.map_expr(db, cond, out_table_cols)
        
        track_data_error(db, errors_table_, sql_gen.cols_srcs(in_cols), None,
            e.cause.pgcode,
            strings.ensure_newl(e.cause.pgerror)+'condition: '+cond)
        
        not_cond = sql_gen.NotCond(sql_gen.CustomCode(cond))
        log_debug('Ignoring rows where '+strings.as_tt(not_cond.to_str(db)))
        sql.delete(db, insert_in_table, not_cond)
    
    not_null_cols = set()
    def ignore(in_col, value, e):
        '''Records the error, then NULLs out or deletes rows with this value'''
        in_col = sql_gen.with_table(in_col, insert_in_table)
        
        track_data_error(db, errors_table_, in_col.srcs, value,
            e.cause.pgcode, e.cause.pgerror)
        log_debug('Ignoring rows with '+strings.as_tt(repr(in_col))+' = '
            +strings.as_tt(repr(value)))
        
        sql.add_index(db, in_col, insert_in_table) # enable fast filtering
        if value is not None and in_col not in not_null_cols:
            # Try just mapping the value to NULL
            sql.update(db, insert_in_table, [(in_col, None)],
                sql_gen.ColValueCond(in_col, value))
        else:
            sql.delete(db, insert_in_table, sql_gen.ColValueCond(in_col, value))
            if value is None: not_null_cols.add(in_col)
    
    if not is_literals:
        def insert_pkeys_table(which):
            return sql_gen.Table(sql_gen.concat(in_table.name,
                '_insert_'+which+'_pkeys'))
        insert_out_pkeys = insert_pkeys_table('out')
        insert_in_pkeys = insert_pkeys_table('in')
    
    # Do inserts and selects
    # Each handled exception changes the input or the join columns, and then the
    # loop retries. log_exc() asserts that no exception message repeats.
    while True:
        if limit_ref[0] == 0: # special case
            if is_literals: return None
            log_debug('Creating an empty pkeys table')
            cur = sql.run_query_into(db, sql.mk_select(db, out_table,
                [out_pkey], order_by=None, limit=0), into=insert_out_pkeys)
            break # don't do main case
        
        has_joins = join_cols != {}
        
        log_debug('Trying to insert new rows')
        
        # Prepare to insert new rows
        if is_function:
            log_debug('Calling function on input rows')
            args = dict(((k.name, v) for k, v in mapping.iteritems()))
            func_call = sql_gen.NamedCol(out_pkey,
                sql_gen.FunctionCall(out_table, **args))
        else:
            insert_args = dict(recover=True, cacheable=False)
            if has_joins:
                insert_args.update(dict(ignore=True))
            else:
                insert_args.update(dict(returning=out_pkey))
                if not is_literals:
                    insert_args.update(dict(into=insert_out_pkeys))
            main_select = mk_main_select([insert_in_table], [sql_gen.with_table(
                c, insert_in_table) for c in mapping.values()])
        
        try:
            cur = None
            if is_function:
                if is_literals: cur = sql.select(db, fields=[func_call])
                else:
                    insert_into_pkeys(input_joins, [in_pkey_col, func_call],
                        recover=True)
            else:
                cur = sql.insert_select(db, out_table, mapping.keys(),
                    main_select, **insert_args)
            break # insert successful
        except sql.MissingCastException as e:
            log_exc(e)
            
            out_col = e.col
            type_ = e.type
            
            log_debug('Casting '+strings.as_tt(out_col)+' input to '
                +strings.as_tt(type_))
            in_col = mapping[out_col]
            while True:
                try:
                    mapping[out_col] = cast_temp_col(db, type_, in_col,
                        errors_table_)
                    break # cast successful
                except sql.InvalidValueException as e:
                    log_exc(e)
                    
                    ignore(in_col, e.value, e)
        except sql.DuplicateKeyException as e:
            log_exc(e)
            
            # Different rows violating different unique constraints not
            # supported
            assert not join_cols
            
            join_cols.update(util.dict_subset_right_join(mapping, e.cols))
            log_debug('Ignoring existing rows, comparing on these columns:\n'
                +strings.as_inline_table(join_cols, ustr=col_ustr))
            
            if is_literals:
                return sql.value(sql.select(db, out_table, [out_pkey_col],
                    mapping, order_by=None))
            
            # Uniquify input table to avoid internal duplicate keys
            insert_in_table = sql.distinct_table(db, insert_in_table,
                join_cols.values())
            insert_in_tables.append(insert_in_table)
        except sql.NullValueException as e:
            log_exc(e)
            
            out_col, = e.cols
            try: in_col = mapping[out_col]
            except KeyError:
                msg = 'Missing mapping for NOT NULL column '+out_col
                log_debug(msg)
                if default is None: on_error(SyntaxError(msg)) # required col
                remove_all_rows()
            else: ignore(in_col, None, e)
        except sql.CheckException as e:
            log_exc(e)
            
            ignore_cond(e.cond, e)
        except sql.InvalidValueException as e:
            log_exc(e)
            
            for in_col in mapping.values(): ignore(in_col, e.value, e)
        except sql.DatabaseErrors as e:
            log_exc(e)
            
            log_debug('No handler for exception')
            on_error(e)
            remove_all_rows()
        # after exception handled, rerun loop with additional constraints
    
    if cur is not None and row_ct_ref is not None and cur.rowcount >= 0:
        row_ct_ref[0] += cur.rowcount
    
    if is_literals_or_function: pass # pkeys table already created
    elif has_joins:
        select_joins = input_joins+[sql_gen.Join(out_table, join_cols)]
        log_debug('Getting output table pkeys of existing/inserted rows')
        insert_into_pkeys(select_joins, pkeys_cols)
    else:
        sql.add_row_num(db, insert_out_pkeys) # for joining with input pkeys
        
        log_debug('Getting input table pkeys of inserted rows')
        # Note that mk_main_select() does not use ORDER BY. Instead, assume that
        # since the SELECT query is identical to the one used in INSERT SELECT,
        # its rows will be retrieved in the same order.
        sql.run_query_into(db, mk_main_select(input_joins, [in_pkey]),
            into=insert_in_pkeys)
        sql.add_row_num(db, insert_in_pkeys) # for joining with output pkeys
        
        assert sql.table_row_count(db, insert_out_pkeys) == sql.table_row_count(
            db, insert_in_pkeys)
        
        log_debug('Combining output and input pkeys in inserted order')
        pkey_joins = [insert_in_pkeys, sql_gen.Join(insert_out_pkeys,
            {sql.row_num_col: sql_gen.join_same_not_null})]
        insert_into_pkeys(pkey_joins, pkeys_names)
        
        sql.empty_temp(db, [insert_out_pkeys, insert_in_pkeys])
    
    if not is_literals_or_function:
        log_debug('Setting pkeys of missing rows to '
            +strings.as_tt(repr(default)))
        missing_rows_joins = [full_in_table, sql_gen.Join(into,
            {in_pkey: sql_gen.join_same_not_null}, sql_gen.filter_out)]
            # must use join_same_not_null or query will take forever
        insert_into_pkeys(missing_rows_joins,
            [sql_gen.Col(in_pkey, full_in_table),
            sql_gen.NamedCol(out_pkey, default)])
    # otherwise, there is already an entry for every row
    
    if is_literals: return sql.value(cur)
    else:
        assert (sql.table_row_count(db, into)
            == sql.table_row_count(db, full_in_table))
        
        sql.empty_temp(db, insert_in_tables+[full_in_table])
        
        srcs = []
        if is_func: srcs = sql_gen.cols_srcs(in_cols)
        return sql_gen.Col(out_pkey, into, srcs)
(26-26/37)