1
|
# Database import/export
|
2
|
|
3
|
import copy
|
4
|
import csv
|
5
|
import operator
|
6
|
import warnings
|
7
|
import sys
|
8
|
|
9
|
import csvs
|
10
|
import exc
|
11
|
import dicts
|
12
|
import sql
|
13
|
import sql_gen
|
14
|
import streams
|
15
|
import strings
|
16
|
import util
|
17
|
|
18
|
##### Exceptions
|
19
|
|
20
|
# Can't use built-in SyntaxError because it stringifies to only the first line
|
21
|
class SyntaxError(Exception):
    '''Module-specific syntax error.
    Deliberately shadows the built-in SyntaxError, which this module avoids
    because of how it stringifies multi-line messages.
    '''
|
22
|
|
23
|
##### Data cleanup
|
24
|
|
25
|
def table_nulls_mapped__set(db, table):
    '''Runs the database function util.table_nulls_mapped__set() on a table.
    @param table sql_gen.Table
    '''
    assert isinstance(table, sql_gen.Table)
    regclass_text = sql_gen.table2regclass_text(db, table)
    query = 'SELECT util.table_nulls_mapped__set(' + regclass_text + ')'
    sql.run_query(db, query)
|
29
|
|
30
|
def table_nulls_mapped__get(db, table):
    '''Runs the database function util.table_nulls_mapped__get() on a table.
    @param table sql_gen.Table
    @return The single value returned by the query
    '''
    assert isinstance(table, sql_gen.Table)
    regclass_text = sql_gen.table2regclass_text(db, table)
    query = 'SELECT util.table_nulls_mapped__get(' + regclass_text + ')'
    cur = sql.run_query(db, query)
    return sql.value(cur)
|
34
|
|
35
|
# Strings that cleanup_table() turns into NULL (compared after trimming)
null_strs = [
    '',
    '-',
    r'\N',
    'NULL',
    'UNKNOWN',
    'nulo',
]
|
36
|
|
37
|
def cleanup_table(db, table):
    '''Trims every text column of the table and maps the null_strs to NULL.
    The pkey column (if any) is left untouched. NOT NULL constraints that get
    in the way are dropped. Afterwards the table is vacuumed and flagged as
    nulls-mapped, which makes this function idempotent.
    @param table str|sql_gen.Table An existing table
    '''
    table = sql_gen.as_Table(table)
    assert sql.table_exists(db, table)

    # A previous run already flagged this table, so there is nothing to do
    if table_nulls_mapped__get(db, table): return

    text_cols = [c for c in sql.table_cols(db, table)
        if sql_gen.is_text_col(db, c)]
    try:
        pkey_col = sql.table_pkey_col(db, table)
    except sql.DoesNotExistException:
        pass # no pkey to exclude
    else:
        if pkey_col in text_cols: text_cols.remove(pkey_col)
    if not text_cols: return

    db.log_debug('Cleaning up table', level=1.5)

    # Build a template which nests one nullif() per null string around trim()
    cleanup_tmpl = 'trim(both from %s)'
    for null_str in null_strs:
        cleanup_tmpl = ('nullif(' + cleanup_tmpl + ', '
            + db.esc_value(null_str) + ')')
    changes = []
    for text_col in text_cols:
        cleaned = sql_gen.CustomCode(cleanup_tmpl % text_col.to_str(db))
        changes.append((text_col, cleaned))

    # Retry the update, dropping one NOT NULL constraint per failed attempt
    updated = False
    while not updated:
        try:
            sql.update(db, table, changes, in_place=True, recover=True)
        except sql.NullValueException as e:
            db.log_debug('Caught exception: ' + exc.str_(e))
            col, = e.cols
            sql.drop_not_null(db, col)
        else:
            updated = True

    db.log_debug('Vacuuming and reanalyzing table', level=1.5)
    sql.vacuum(db, table)

    table_nulls_mapped__set(db, table)
|
72
|
|
73
|
##### Error tracking
|
74
|
|
75
|
def track_data_error(db, errors_table, cols, value, error_code, error):
    '''Inserts one row into the errors table for each source column.
    Rows that already exist (duplicate key) are silently skipped.
    @param errors_table If None, does nothing.
    @param cols Objects whose name attr is stored in the "column" field. If
        empty, a single row with a NULL column name is inserted instead.
    @param value Stored in the "value" field
    @param error_code Stored in the "error_code" field
    @param error Stored in the "error" field
    '''
    if errors_table == None: return

    # need at least one entry, so fall back to a single NULL column name
    col_names = [c.name for c in cols] or [None]
    for col_name in col_names:
        row = dict(column=col_name, value=value, error_code=error_code,
            error=error)
        try:
            sql.insert(db, errors_table, row, recover=True, cacheable=True,
                log_level=4)
        except sql.DuplicateKeyException:
            pass # this error was already recorded
|
89
|
|
90
|
class ExcToErrorsTable(sql_gen.ExcToWarning):
    '''Handles an exception by saving it or converting it to a warning.'''

    def __init__(self, return_, srcs, errors_table, value=None):
        '''
        @param return_ See sql_gen.ExcToWarning
        @param srcs The column names for the errors table
        @param errors_table None|sql_gen.Table
        @param value The value (or an expression for it) that caused the error
        @pre The invalid value must be in a local variable "value" of type text.
        '''
        sql_gen.ExcToWarning.__init__(self, return_)

        self.srcs = srcs
        self.errors_table = errors_table
        self.value = sql_gen.as_Code(value)

    def to_str(self, db):
        '''Generates the handler code.
        Without srcs or an errors table, this is just the ExcToWarning code.
        Otherwise, it is a block that inserts the error once per source column,
        followed by the return_ code.
        '''
        if not self.srcs or self.errors_table == None:
            return sql_gen.ExcToWarning.to_str(self, db)

        errors_table_cols = [sql_gen.Col(name)
            for name in ['column', 'value', 'error_code', 'error']]
        # One row per source column name
        src_name_rows = [[c.name] for c in self.srcs]
        col_names_query = sql.mk_select(db,
            sql_gen.NamedValues('c', None, src_name_rows), order_by=None)
        insert_query = sql.mk_insert_select(db, self.errors_table,
            errors_table_cols,
            sql_gen.Values(errors_table_cols).to_str(db))+';\n'

        # Render the pieces in the order they appear in the generated code
        value_code = self.value.to_str(db)
        insert_loop_code = strings.indent(sql_gen.RowExcIgnore(None,
            col_names_query, insert_query,
            row_var=errors_table_cols[0]).to_str(db))
        return_code = self.return_.to_str(db)
        return ('''\
-- Save error in errors table.
DECLARE
error_code text := SQLSTATE;
error text := SQLERRM;
value text := '''+value_code+''';
"column" text;
BEGIN
-- Insert the value and error for *each* source column.
'''+insert_loop_code+'''
END;

'''+return_code)
|
133
|
|
134
|
def data_exception_handler(*args, **kw_args):
    '''Handles a data_exception by saving it or converting it to a warning.
    All params are passed through to ExcToErrorsTable(), whose result is
    wrapped by sql_gen.data_exception_handler().
    '''
    exc_to_errors_table = ExcToErrorsTable(*args, **kw_args)
    return sql_gen.data_exception_handler(exc_to_errors_table)
|
139
|
|
140
|
def cast(db, type_, col, errors_table=None):
    '''Casts an (unrenamed) column or value using a temp wrapper function.
    If errors_table set and col has srcs, saves errors in errors_table (using
    col's srcs attr as source columns). Otherwise, converts errors to warnings.
    Literals are cast directly, without any error handling.
    @param col str|sql_gen.Col|sql_gen.Literal
    @param errors_table None|sql_gen.Table|str
    @return sql_gen.Cast for literals, otherwise a sql_gen.FunctionCall of the
        wrapper function on col
    '''
    col = sql_gen.as_Col(col)

    # Don't convert exceptions to warnings for user-supplied constants
    if isinstance(col, sql_gen.Literal): return sql_gen.Cast(type_, col)

    assert not isinstance(col, sql_gen.NamedCol)

    srcs = col.srcs
    save_errors = errors_table != None and srcs
    type_name = strings.first_word(type_)
    if save_errors: # function will be unique for the given srcs
        src_cols = map(sql_gen.to_name_only_col, srcs)
        function_name = strings.ustr(sql_gen.FunctionCall(type_name,
            *src_cols))
    else:
        function_name = type_name
    function = db.TempFunction(function_name)

    # Create function definition
    # (the function is only declared IMMUTABLE when it doesn't save errors)
    modifiers = 'STRICT' if save_errors else 'IMMUTABLE STRICT'
    value_param = sql_gen.FunctionParam('value', 'anyelement')
    handler = data_exception_handler('RETURN NULL;\n', srcs, errors_table,
        value_param.name)
    cast_code = sql_gen.Cast(type_, sql_gen.CustomCode('value')).to_str(db)
    try_code = '''\
/* The explicit cast to the return type is needed to make the cast happen
inside the try block. (Implicit casts to the return type happen at the end
of the function, outside any block.) */
RETURN '''+cast_code+''';
'''
    body = sql_gen.CustomCode(handler.to_str(db, try_code))
    body.lang = 'plpgsql'
    function_def = sql_gen.FunctionDef(function, type_, body, [value_param],
        modifiers)
    sql.define_func(db, function_def)

    return sql_gen.FunctionCall(function, col)
|
179
|
|
180
|
def func_wrapper_exception_handler(db, return_, args, errors_table):
    '''Handles a function call's data_exceptions.
    Supports PL/Python functions.
    Only the args that have srcs are used: their srcs become the source
    columns, and their values are merged (comma-separated) into the saved value.
    @param return_ See data_exception_handler()
    @param args [arg...] Function call's args
    @param errors_table See data_exception_handler()
    '''
    args_with_srcs = filter(sql_gen.has_srcs, args)

    srcs = sql_gen.cross_join_srcs(args_with_srcs)
    value = sql_gen.merge_not_null(db, ',', args_with_srcs)
    handler = data_exception_handler(return_, srcs, errors_table, value)
    return sql_gen.NestedExcHandler(handler, sql_gen.plpythonu_error_handler)
|
195
|
|
196
|
def cast_temp_col(db, type_, col, errors_table=None):
    '''Like cast(), but creates a new column with the cast values if the input
    is a column.
    The new column is added to the input column's table and is named after the
    input column, suffixed with "::" and the first word of the type.
    @return The new column or cast value
    '''
    def do_cast(col_): return cast(db, type_, col_, errors_table)

    try:
        col = sql_gen.underlying_col(col)
    except sql_gen.NoUnderlyingTableException:
        # not a table column, so cast the value itself
        return sql_gen.wrap(do_cast, col)

    table = col.table
    type_suffix = '::'+strings.first_word(type_)
    new_col = sql_gen.suffixed_col(col, type_suffix)
    cast_expr = do_cast(col)

    # Add column
    new_typed_col = sql_gen.TypedCol(new_col.name, type_)
    col_comment = strings.urepr(col)+'::'+type_
    sql.add_col(db, table, new_typed_col, comment=col_comment)
    new_col.name = new_typed_col.name # propagate any renaming

    # Fill the new column with the cast values
    sql.update(db, table, [(new_col, cast_expr)], in_place=True, recover=True)

    return new_col
|
218
|
|
219
|
def errors_table(db, table, if_exists=True):
    '''Gets the errors table that belongs to a table.
    This is the table (or, if it has srcs, its first src) with the suffix
    ".errors".
    @param if_exists If set, returns None if the errors table doesn't exist
    @return None|sql_gen.Table
    '''
    table = sql_gen.as_Table(table)
    if table.srcs != (): table = table.srcs[0] # use the source table instead

    result = sql_gen.suffixed_table(table, '.errors')
    if if_exists:
        if not sql.table_exists(db, result): return None
    return result
|
230
|
|
231
|
def mk_errors_table(db, table):
    '''Creates the errors table for a table, unless it already exists.
    The errors table has the text columns column, value, error_code (NOT NULL)
    and error (NOT NULL), plus a unique index over all of them (using md5() of
    value and error).
    '''
    errors_table_ = errors_table(db, table, if_exists=False)
    if sql.table_exists(db, errors_table_, cacheable=False): return

    def text_col(name, **kw_args):
        return sql_gen.TypedCol(name, 'text', **kw_args)
    typed_cols = [
        text_col('column'),
        text_col('value'),
        sql_gen.TypedCol('error_code', 'character varying(5)', nullable=False),
        text_col('error', nullable=False),
    ]
    sql.create_table(db, errors_table_, typed_cols, has_pkey=False)

    # Index on md5() of the free-text fields instead of the fields themselves
    index_cols = [
        'column',
        sql_gen.CustomCode('md5(value)'),
        'error_code',
        sql_gen.CustomCode('md5(error)'),
    ]
    sql.add_index(db, index_cols, errors_table_, unique=True)
|
245
|
|
246
|
##### Import
|
247
|
|
248
|
# Definition of the row number column that import_csv() prepends to imported
# tables: a copy of sql.row_num_col_def with its own name and type
row_num_col_def = copy.copy(sql.row_num_col_def)
row_num_col_def.name, row_num_col_def.type = 'row_num', 'integer'
|
251
|
|
252
|
def append_csv(db, table, reader, header):
    '''Appends the rows of a CSV reader to a table using COPY FROM STDIN.
    @param table An object with a to_str(db) method (e.g. sql_gen.Table)
    @param reader The CSV reader, positioned after the header row
    @param header [col_name...] The column names of the CSV's columns. If
        these differ from the table's column names, they are listed
        explicitly in the COPY statement.
    '''
    # Wrap in standardizing stream
    cols_ct = len(header)
    filtered_reader = csvs.StreamFilter(csvs.ColCtFilter(reader, cols_ct))
    progress_stream = streams.ProgressInputStream(filtered_reader, sys.stderr,
        msg='Read %d row(s)', n=1000)
    stream = csvs.InputRewriter(progress_stream)
    dialect = stream.dialect # use default dialect

    # Create COPY FROM statement
    if header == sql.table_col_names(db, table):
        cols_str = '' # same columns, so no explicit column list needed
    else:
        esc_names = [sql.esc_name(db, name) for name in header]
        cols_str = ' ('+(', '.join(esc_names))+')'
    copy_from = ('COPY '+table.to_str(db)+cols_str+' FROM STDIN DELIMITER '
        +db.esc_value(dialect.delimiter)+' NULL '+db.esc_value(''))
    assert not csvs.is_tsv(dialect)
    copy_from += ' CSV'
    if dialect.quoting != csv.QUOTE_NONE:
        quote_str = db.esc_value(dialect.quotechar)
        copy_from += ' QUOTE '+quote_str
        # doubled quote chars mean that the quote char is also the escape char
        if dialect.doublequote: copy_from += ' ESCAPE '+quote_str
    copy_from += ';\n'

    db.log_debug(copy_from, 2)
    try:
        db.db.cursor().copy_expert(copy_from, stream)
    except Exception as e:
        sql.parse_exception(db, e, recover=True)
|
280
|
|
281
|
def import_csv(db, table, reader, header):
    '''Imports a CSV into a (new or emptied) table of text columns.
    Creates the table with a leading row number column, truncates it, loads
    the rows inside a savepoint, and then runs cleanup_table() on it.
    @param reader The CSV reader, positioned after the header row
    @param header [col_name...] The CSV's column names. Empty names are
        replaced with "column_<index>". This list is *not* modified.
    '''
    def log(msg, level=1): db.log_debug(msg, level)

    # Get format info
    col_names = [strings.to_unicode(v) for v in header]
    for i, col in enumerate(col_names): # replace empty column names
        if col == '': col_names[i] = 'column_'+str(i)

    typed_cols = [row_num_col_def]+[sql_gen.TypedCol(v, 'text')
        for v in col_names]
    # Build the COPY column list from the same sanitized names that the table
    # is created with, so that replaced (empty) names match the table's
    # columns. Use a new list instead of inserting into header, so that the
    # caller's list does not accumulate row_num entries when the import is
    # retried.
    copy_header = [row_num_col_def.name]+col_names
    reader = csvs.RowNumFilter(reader)

    log('Creating table')
    # Note that this is not rolled back if the import fails. Instead, it is
    # cached, and will not be re-run if the import is retried.
    sql.create_table(db, table, typed_cols, has_pkey=False, col_indexes=False)

    # Free memory used by deleted (rolled back) rows from any failed import.
    # This MUST be run so that the rows will be stored in inserted order, and
    # the row_num added after import will match up with the CSV's row order.
    sql.truncate(db, table)

    # Load the data
    def load(): append_csv(db, table, reader, copy_header)
    sql.with_savepoint(db, load)

    cleanup_table(db, table)
|
312
|
|
313
|
def put(db, table, row, pkey_=None, row_ct_ref=None, on_error=exc.reraise):
    '''Inserts a single row of values via put_table(), without input tables.
    Recovers from errors.
    Only works under PostgreSQL (uses INSERT RETURNING).
    @param row The mapping to pass to put_table()
    @param pkey_ Not used
    @return See put_table()
    '''
    no_in_tables = []
    return put_table(db, table, no_in_tables, row, row_ct_ref,
        on_error=on_error)
|
318
|
|
319
|
def get(db, table, row, pkey, row_ct_ref=None, create=False):
    '''Selects the pkey of the (first) row that matches the given values.
    Recovers from errors.
    @param create If set, inserts the row using put() when it doesn't exist.
        Otherwise, the StopIteration for the missing row propagates.
    '''
    try:
        cur = sql.select(db, table, [pkey], row, limit=1, recover=True)
        return sql.value(cur)
    except StopIteration:
        if create:
            return put(db, table, row, pkey, row_ct_ref) # insert new row
        raise
|
327
|
|
328
|
def is_func_result(col):
    '''Whether col is the "result" column of a table whose name contains "("
    (i.e. looks like a function call).
    '''
    table_is_func_call = '(' in col.table.name
    return table_is_func_call and col.name == 'result'
|
330
|
|
331
|
def into_table_name(out_table, in_tables0, mapping, is_func):
    '''Builds the name of the table that put_table() stores pkeys in.
    * functions: out_table(value) if there is a "value" param, otherwise
      out_table(param=col, ...)
    * tables with a "rank" mapping: out_table[rank=col]
    * other tables: out_table_pkeys
    @param in_tables0 The main input table. Its columns are shown without
        their table name.
    '''
    def in_col_str(in_col):
        in_col = sql_gen.remove_col_rename(in_col)
        if isinstance(in_col, sql_gen.Col):
            col_table = in_col.table
            if col_table == in_tables0:
                in_col = sql_gen.to_name_only_col(in_col)
            elif is_func_result(in_col):
                in_col = col_table # omit col name
        return strings.ustr(in_col)

    name = strings.ustr(out_table)
    if not is_func:
        rank_col = 'rank'
        try:
            rank_in_col = mapping[rank_col]
        except KeyError:
            return name+'_pkeys'
        # has a rank column, so hierarchical
        return name+'['+strings.ustr(rank_col)+'='+in_col_str(rank_in_col)+']'

    try:
        value_in_col = mapping['value']
    except KeyError:
        # no "value" param, so list all the params
        args_str = ', '.join((strings.ustr(k)+'='+in_col_str(v)
            for k, v in mapping.iteritems()))
    else:
        args_str = in_col_str(value_in_col)
    return name+'('+args_str+')'
|
359
|
|
360
|
def put_table(db, out_table, in_tables, mapping, row_ct_ref=None, default=None,
    col_defaults={}, on_error=exc.reraise):
    '''Recovers from errors.
    Only works under PostgreSQL (uses INSERT RETURNING).

    Warning: This function's normalizing algorithm does not support database
    triggers that populate fields covered by the unique constraint used to do
    the DISTINCT ON. Such fields must be populated by the mappings instead.
    (Other unique constraints and other non-unique fields are not affected by
    this restriction on triggers. Note that the primary key will normally not be
    the DISTINCT ON constraint, so trigger-populated natural keys are supported
    *unless* the input table contains duplicate rows for some generated keys.)

    Note that much of the complexity of the normalizing algorithm is due to
    PostgreSQL (and other DB systems) not having a native command for
    INSERT ON DUPLICATE SELECT (wiki.vegpath.org/INSERT_ON_DUPLICATE_SELECT).
    For PostgreSQL 9.1+, this can now be emulated using INSTEAD OF triggers.
    For earlier versions, you instead have to use this function.

    @param out_table The table to insert into, or the function to call. The
        special name "_map" passes all params other than "value" as an hstore.
    @param in_tables The main input table to select from, followed by a list of
        tables to join with it using the main input table's pkey
    @param mapping dict(out_table_col=in_table_col, ...)
        * out_table_col: str (*not* sql_gen.Col)
        * in_table_col: sql_gen.Col|literal-value
        This dict is *not* modified.
    @param row_ct_ref None|[int] If set, the inserted row count is added to
        its first element (not tracked when out_table is a function)
    @param default The *output* column to use as the pkey for missing rows.
        If this output column does not exist in the mapping, uses None.
        Note that this will be used for *all* missing rows, regardless of which
        error caused them not to be inserted.
    @param col_defaults Default values for required columns.
    @param on_error Called with the exception for errors that have no handler
        or whose handler did not fix them
    @return sql_gen.Col Where the output pkeys are made available
        (For literal-only mappings, the pkey/function result value itself. If
        no rows could be handled, the resolved default. None if out_table is
        "_map" and the mapping has no "value".)
    '''
    import psycopg2.extensions

    # Special handling for functions with hstore params
    if out_table == '_map':
        import psycopg2.extras
        psycopg2.extras.register_hstore(db.db)

        # Parse args
        mapping = dict(mapping) # don't modify input!
        try: value = mapping.pop('value')
        except KeyError: return None # value required

        mapping = dict([(k, sql_gen.get_value(v))
            for k, v in mapping.iteritems()]) # unwrap literal value
        mapping = dict(map=mapping, value=value) # non-value params -> hstore

    out_table = sql_gen.as_Table(out_table)

    def log_debug(msg): db.log_debug(msg, level=1.5)
    def col_ustr(str_):
        return strings.repr_no_u(sql_gen.remove_col_rename(str_))

    log_debug('********** New iteration **********')
    log_debug('Inserting these input columns into '+strings.as_tt(
        out_table.to_str(db))+':\n'+strings.as_table(mapping, ustr=col_ustr))

    is_function = sql.function_exists(db, out_table)

    if is_function: row_ct_ref = None # only track inserted rows

    # Warn if inserting empty table rows
    if not mapping and not is_function: # functions with no args OK
        warnings.warn(UserWarning('Inserting empty table row(s)'))

    if is_function: out_pkey = 'result'
    else: out_pkey = sql.pkey_name(db, out_table, recover=True)
    out_pkey_col = sql_gen.as_Col(out_pkey, out_table)

    in_tables_ = copy.copy(in_tables) # don't modify input!
    try: in_tables0 = in_tables_.pop(0) # first table is separate
    except IndexError: in_tables0 = None
    else:
        in_pkey = sql.pkey_name(db, in_tables0, recover=True)
        in_pkey_col = sql_gen.as_Col(in_pkey, in_tables0)

    # Determine if can use optimization for only literal values
    is_literals = not reduce(operator.or_, map(sql_gen.is_table_col,
        mapping.values()), False)

    if in_tables0 == None: errors_table_ = None
    else: errors_table_ = errors_table(db, in_tables0)

    # Create input joins from list of input tables
    input_joins = [in_tables0]+[sql_gen.Join(v,
        {in_pkey: sql_gen.join_same_not_null}) for v in in_tables_]

    orig_mapping = mapping.copy()
    if mapping == {} and not is_function: # need >= one column for INSERT SELECT
        mapping = {out_pkey: None} # ColDict will replace with default value

    if not is_literals:
        into = sql_gen.as_Table(into_table_name(out_table, in_tables0, mapping,
            is_function))
        # Ensure into's out_pkey is different from in_pkey by prepending "out."
        if is_function: into_out_pkey = out_pkey
        else: into_out_pkey = 'out.'+out_pkey

        # Set column sources
        in_cols = filter(sql_gen.is_table_col, mapping.values())
        for col in in_cols:
            if col.table == in_tables0: col.set_srcs(sql_gen.src_self)

        log_debug('Joining together input tables into temp table')
        # Place in new table so don't modify input and for speed
        in_table = sql_gen.Table('in')
        mapping = dicts.join(mapping, sql.flatten(db, in_table, input_joins,
            in_cols, preserve=[in_pkey_col]))
        input_joins = [in_table]
        db.log_debug('Temp table: '+strings.as_tt(in_table.to_str(db)), level=2)

    # Wrap mapping in a sql_gen.ColDict.
    # sql_gen.ColDict sanitizes both keys and values passed into it.
    # Do after applying dicts.join() because that returns a plain dict.
    mapping = sql_gen.ColDict(db, out_table, mapping)

    # Save all rows since in_table may have rows deleted
    if is_literals: pass
    elif is_function: full_in_table = in_table
    else:
        full_in_table = sql_gen.suffixed_table(in_table, '_full')
        sql.copy_table(db, in_table, full_in_table)

    # One-element lists are used as mutable refs that the nested functions
    # can assign to
    pkeys_table_exists_ref = [False]
    def insert_into_pkeys(query, **kw_args):
        '''Adds the query's rows to the pkeys table (into), creating the table
        from the query the first time'''
        if pkeys_table_exists_ref[0]:
            sql.insert_select(db, into, [in_pkey, into_out_pkey], query,
                **kw_args)
        else:
            kw_args.setdefault('add_pkey_', True)

            sql.run_query_into(db, query, into=into, **kw_args)
            pkeys_table_exists_ref[0] = True

    def mk_main_select(joins, cols): return sql.mk_select(db, joins, cols)

    if is_literals: insert_in_table = None
    else:
        insert_in_table = in_table
        insert_in_tables = [insert_in_table]
    join_cols = sql_gen.ColDict(db, out_table)
    join_custom_cond = None

    exc_strs = set()
    def log_exc(e):
        '''
        @return Whether the exception is new, i.e. whether its handler should
            run. If not, all rows have already been removed.
        '''
        e_str = exc.str_(e, first_line_only=True)
        log_debug('Caught exception: '+e_str)
        if e_str in exc_strs: # avoid infinite loops
            log_debug('Exception already seen, handler broken')
            on_error(e)
            remove_all_rows()
            return False
        else: exc_strs.add(e_str)
        return True

    ignore_all_ref = [False]
    def remove_all_rows():
        log_debug('Ignoring all rows')
        ignore_all_ref[0] = True # just return the default value column

    def handle_unknown_exc(e):
        log_debug('No handler for exception')
        on_error(e)
        remove_all_rows()

    def ensure_cond(cond, e, passed=False, failed=False):
        '''Removes the input rows that do not satisfy a constraint's condition
        @param e The exception for the constraint violation, which is recorded
            in the errors table if any rows failed the condition
        @param passed at least one row passed the constraint
        @param failed at least one row failed the constraint
        '''
        if is_literals: # we know the constraint was applied exactly once
            if passed: pass
            elif failed: remove_all_rows()
            else: raise NotImplementedError()
        else:
            if not is_function:
                out_table_cols = sql_gen.ColDict(db, out_table)
                out_table_cols.update(util.dict_subset_right_join({},
                    sql.table_col_names(db, out_table)))

            in_cols = []
            cond = strings.ustr(cond)
            orig_cond = cond
            cond = sql_gen.map_expr(db, cond, mapping, in_cols)
            if not is_function:
                cond = sql_gen.map_expr(db, cond, out_table_cols)

            log_debug('Ignoring rows that do not satisfy '+strings.as_tt(cond))
            cur = None
            if cond == sql_gen.false_expr:
                assert failed
                remove_all_rows()
            elif cond == sql_gen.true_expr: assert passed
            else:
                while True:
                    not_cond = sql_gen.NotCond(sql_gen.CustomCode(cond))
                    try:
                        cur = sql.delete(db, insert_in_table, not_cond)
                        break
                    # Use a separate name for this exception, so that e still
                    # refers to the constraint violation when the error is
                    # recorded below
                    except sql.DoesNotExistException as col_e:
                        if col_e.type != 'column': raise

                        # Replace the missing column with NULL and retry
                        last_cond = cond
                        cond = sql_gen.map_expr(db, cond, {col_e.name: None})
                        if cond == last_cond: raise # not fixable

            # If any rows failed cond
            if failed or cur != None and cur.rowcount > 0:
                track_data_error(db, errors_table_,
                    sql_gen.cross_join_srcs(in_cols), None, e.cause.pgcode,
                    strings.ensure_newl(strings.ustr(e.cause.pgerror))
                    +'condition: '+orig_cond+'\ntranslated condition: '+cond)

    not_null_cols = set()
    def ignore(in_col, value, e):
        '''Records the error and then neutralizes the invalid value: replaces
        it with NULL, or removes the rows that have it if the column may not
        be NULL (or the value already is NULL)'''
        if sql_gen.is_table_col(in_col):
            in_col = sql_gen.with_table(in_col, insert_in_table)

            track_data_error(db, errors_table_, in_col.srcs, value,
                e.cause.pgcode, e.cause.pgerror)

            sql.add_index(db, in_col, insert_in_table) # enable fast filtering
            if value != None and in_col not in not_null_cols:
                log_debug('Replacing invalid value '
                    +strings.as_tt(strings.urepr(value))+' with NULL in column '
                    +strings.as_tt(in_col.to_str(db)))
                sql.update(db, insert_in_table, [(in_col, None)],
                    sql_gen.ColValueCond(in_col, value))
            else:
                log_debug('Ignoring rows with '+strings.as_tt(in_col.to_str(db))
                    +' = '+strings.as_tt(strings.urepr(value)))
                sql.delete(db, insert_in_table,
                    sql_gen.ColValueCond(in_col, value))
                if value == None: not_null_cols.add(in_col)
        else:
            assert isinstance(in_col, sql_gen.NamedCol)
            in_value = sql_gen.remove_col_rename(in_col)
            assert sql_gen.is_literal(in_value)
            if value == in_value.value:
                if value != None:
                    log_debug('Replacing invalid literal '
                        +strings.as_tt(in_col.to_str(db))+' with NULL')
                    mapping[in_col.name] = None
                else:
                    remove_all_rows()
            # otherwise, all columns were being ignore()d because the specific
            # column couldn't be identified, and this was not the invalid column

    if not is_literals:
        def insert_pkeys_table(which):
            return sql_gen.Table(sql_gen.concat(in_table.name,
                '_insert_'+which+'_pkeys'))
        insert_out_pkeys = insert_pkeys_table('out')
        insert_in_pkeys = insert_pkeys_table('in')

    def mk_func_call():
        args = dict(((k.name, v) for k, v in mapping.iteritems()))
        return sql_gen.FunctionCall(out_table, **args), args

    missing_msg = None

    # Do inserts and selects
    while True:
        has_joins = join_cols != {}

        if ignore_all_ref[0]: break # unrecoverable error, so don't do main case

        # Prepare to insert new rows
        if is_function:
            if is_literals:
                log_debug('Calling function')
                func_call, args = mk_func_call()
        else:
            log_debug('Trying to insert new rows')
            insert_args = dict(recover=True, cacheable=False)
            if has_joins:
                insert_args.update(dict(ignore=True))
            else:
                insert_args.update(dict(returning=out_pkey))
                if not is_literals:
                    insert_args.update(dict(into=insert_out_pkeys))
            main_select = mk_main_select([insert_in_table], [sql_gen.with_table(
                c, insert_in_table) for c in mapping.values()])

        try:
            cur = None
            if is_function:
                if is_literals:
                    cur = sql.select(db, fields=[func_call], recover=True,
                        cacheable=True)
                else:
                    log_debug('Defining wrapper function')

                    func_call, args = mk_func_call()
                    func_call = sql_gen.NamedCol(into_out_pkey, func_call)

                    # Create empty pkeys table so its row type can be used
                    insert_into_pkeys(sql.mk_select(db, input_joins,
                        [in_pkey_col, func_call], limit=0), add_pkey_=False,
                        recover=True)
                    result_type = db.col_info(sql_gen.Col(into_out_pkey,
                        into)).type

                    ## Create error handling wrapper function

                    wrapper = db.TempFunction(sql_gen.concat(into.name,
                        '_wrap'))

                    row_var = copy.copy(sql_gen.row_var)
                    row_var.set_srcs([in_table])
                    in_pkey_var = sql_gen.Col(in_pkey, row_var)

                    # Have the args refer to the wrapper's row variable
                    args = dict(((k, sql_gen.with_table(v, row_var))
                        for k, v in args.iteritems()))
                    func_call = sql_gen.FunctionCall(out_table, **args)

                    def mk_return(result):
                        return sql_gen.ReturnQuery(sql.mk_select(db,
                            fields=[in_pkey_var, result], explain=False))
                    exc_handler = func_wrapper_exception_handler(db,
                        mk_return(sql_gen.Cast(result_type, None)),
                        args.values(), errors_table_)

                    sql.define_func(db, sql_gen.FunctionDef(wrapper,
                        sql_gen.SetOf(into),
                        sql_gen.RowExcIgnore(sql_gen.RowType(in_table),
                            sql.mk_select(db, input_joins),
                            mk_return(func_call), exc_handler=exc_handler)
                        ))
                    wrapper_table = sql_gen.FunctionCall(wrapper)

                    log_debug('Calling function')
                    insert_into_pkeys(sql.mk_select(db, wrapper_table,
                        order_by=None), recover=True, cacheable=False)
                    sql.add_pkey_or_index(db, into)
            else:
                cur = sql.insert_select(db, out_table, mapping.keys(),
                    main_select, **insert_args)
            break # insert successful
        except sql.MissingCastException as e:
            if not log_exc(e): break

            type_ = e.type
            if e.col == None: out_cols = mapping.keys()
            else: out_cols = [e.col]

            for out_col in out_cols:
                log_debug('Casting '+strings.as_tt(strings.repr_no_u(out_col))
                    +' input to '+strings.as_tt(type_))
                in_col = mapping[out_col]
                while True:
                    try:
                        mapping[out_col] = cast_temp_col(db, type_, in_col,
                            errors_table_)
                        break # cast successful
                    except sql.InvalidValueException as cast_e:
                        if not log_exc(cast_e): break

                        ignore(in_col, cast_e.value, cast_e)
        except sql.DuplicateKeyException as e:
            if not log_exc(e): break

            # Different rows violating different unique constraints not
            # supported
            assert not join_cols

            join_custom_cond = e.cond
            if e.cond != None: ensure_cond(e.cond, e, passed=True)

            join_cols.update(util.dict_subset_right_join(mapping, e.cols))
            log_debug('Ignoring existing rows, comparing on these columns:\n'
                +strings.as_inline_table(join_cols, ustr=col_ustr))

            if is_literals:
                return sql.value(sql.select(db, out_table, [out_pkey_col],
                    join_cols, order_by=None))

            # Uniquify and filter input table to avoid (most) duplicate keys
            # (Additional duplicates may be added concurrently and will be
            # filtered out separately upon insert.)
            insert_in_table = sql.distinct_table(db, insert_in_table,
                join_cols.values(), [insert_in_table,
                sql_gen.Join(out_table, join_cols, sql_gen.filter_out, e.cond)])
            insert_in_tables.append(insert_in_table)
        except sql.NullValueException as e:
            if not log_exc(e): break

            out_col, = e.cols
            try: in_col = mapping[out_col]
            except KeyError:
                # No mapping for the column, so try its default instead
                try: in_col = mapping[out_col] = col_defaults[out_col]
                except KeyError:
                    missing_msg = 'Missing mapping for NOT NULL column '+out_col
                    log_debug(missing_msg)
                    remove_all_rows()
            else: ignore(in_col, None, e)
        except sql.CheckException as e:
            if not log_exc(e): break

            ensure_cond(e.cond, e, failed=True)
        except sql.InvalidValueException as e:
            if not log_exc(e): break

            for in_col in mapping.values(): ignore(in_col, e.value, e)
        except psycopg2.extensions.TransactionRollbackError as e:
            if not log_exc(e): break
            # retry
        except sql.DatabaseErrors as e:
            if not log_exc(e): break

            handle_unknown_exc(e)
        # after exception handled, rerun loop with additional constraints

    # Resolve default value column
    if default != None:
        if ignore_all_ref[0]: mapping.update(orig_mapping) # use input cols
        try: default = mapping[default]
        except KeyError:
            db.log_debug('Default value column '
                +strings.as_tt(strings.repr_no_u(default))
                +' does not exist in mapping, falling back to None', level=2.1)
            default = None
        else: default = sql_gen.remove_col_rename(default)

    if missing_msg != None and default == None:
        warnings.warn(UserWarning(missing_msg))
        # not an error because sometimes the mappings include
        # extra tables which aren't used by the dataset

    # Handle unrecoverable errors
    if ignore_all_ref[0]:
        log_debug('Returning default: '+strings.as_tt(strings.urepr(default)))
        return default

    if cur != None and row_ct_ref != None and cur.rowcount >= 0:
        row_ct_ref[0] += cur.rowcount

    if is_literals: return sql.value(cur)

    if is_function: pass # pkeys table already created
    elif has_joins:
        select_joins = input_joins+[sql_gen.Join(out_table, join_cols,
            custom_cond=join_custom_cond)]
        log_debug('Getting output table pkeys of existing/inserted rows')
        insert_into_pkeys(sql.mk_select(db, select_joins, [in_pkey_col,
            sql_gen.NamedCol(into_out_pkey, out_pkey_col)], order_by=None))
    else:
        sql.add_row_num(db, insert_out_pkeys) # for joining with input pkeys

        log_debug('Getting input table pkeys of inserted rows')
        # Note that mk_main_select() does not use ORDER BY. Instead, assume that
        # since the SELECT query is identical to the one used in INSERT SELECT,
        # its rows will be retrieved in the same order.
        sql.run_query_into(db, mk_main_select(input_joins, [in_pkey]),
            into=insert_in_pkeys)
        sql.add_row_num(db, insert_in_pkeys) # for joining with output pkeys

        assert sql.table_row_count(db, insert_out_pkeys) == sql.table_row_count(
            db, insert_in_pkeys)

        log_debug('Combining output and input pkeys in inserted order')
        pkey_joins = [insert_in_pkeys, sql_gen.Join(insert_out_pkeys,
            {sql.row_num_col: sql_gen.join_same_not_null})]
        in_col = sql_gen.Col(in_pkey, insert_in_pkeys)
        out_col = sql_gen.NamedCol(into_out_pkey,
            sql_gen.Col(out_pkey, insert_out_pkeys))
        insert_into_pkeys(sql.mk_select(db, pkey_joins, [in_col, out_col],
            order_by=None))

        sql.empty_temp(db, [insert_out_pkeys, insert_in_pkeys])

    if not is_function: # is_function doesn't leave holes
        log_debug('Setting pkeys of missing rows to '
            +strings.as_tt(strings.urepr(default)))

        full_in_pkey_col = sql_gen.Col(in_pkey, full_in_table)
        if sql_gen.is_table_col(default):
            default = sql_gen.with_table(default, full_in_table)
        missing_rows_joins = [full_in_table, sql_gen.Join(into,
            {in_pkey: sql_gen.join_same_not_null}, sql_gen.filter_out)]
            # must use join_same_not_null or query will take forever

        insert_args = dict(order_by=None)
        if not sql.table_has_pkey(db, full_in_table): # in_table has duplicates
            insert_args.update(dict(distinct_on=[full_in_pkey_col]))

        insert_into_pkeys(sql.mk_select(db, missing_rows_joins,
            [full_in_pkey_col, sql_gen.NamedCol(into_out_pkey, default)],
            **insert_args))
    # otherwise, there is already an entry for every row

    sql.empty_temp(db, insert_in_tables+[full_in_table])

    srcs = []
    if is_function: srcs = sql_gen.cols_srcs(in_cols)
    return sql_gen.Col(into_out_pkey, into, srcs)
|