1 |
13
|
aaronmk
|
# XML-database conversion
|
2 |
|
|
|
3 |
2418
|
aaronmk
|
import copy
|
4 |
16
|
aaronmk
|
import re
|
5 |
13
|
aaronmk
|
from xml.dom import Node
|
6 |
|
|
|
7 |
3718
|
aaronmk
|
import dicts
|
8 |
446
|
aaronmk
|
import exc
|
9 |
2436
|
aaronmk
|
import Parser
|
10 |
46
|
aaronmk
|
import sql
|
11 |
3077
|
aaronmk
|
import sql_io
|
12 |
2268
|
aaronmk
|
import sql_gen
|
13 |
84
|
aaronmk
|
import strings
|
14 |
133
|
aaronmk
|
import util
|
15 |
46
|
aaronmk
|
import xml_dom
|
16 |
2113
|
aaronmk
|
import xml_func
|
17 |
2436
|
aaronmk
|
import xpath
|
18 |
13
|
aaronmk
|
|
19 |
139
|
aaronmk
|
def name_of(node):
    '''Returns the node's tagName with any leading "."-separated qualifier
    stripped, i.e. only the part after the last "." (the whole tagName if it
    contains no ".")'''
    tag = node.tagName
    return re.sub(r'^.*\.', r'', tag)
|
20 |
16
|
aaronmk
|
|
21 |
13
|
aaronmk
|
ptr_suffix = '_id'

def is_ptr(node_name):
    '''Returns whether node_name ends in ptr_suffix, ignoring the case of
    node_name'''
    lowered = node_name.lower()
    return lowered.endswith(ptr_suffix)
|
24 |
13
|
aaronmk
|
|
25 |
172
|
aaronmk
|
def ptr_type_guess(node_name):
    '''Returns node_name with its trailing ptr_suffix removed.
    node_name must satisfy is_ptr() (checked with an assert).'''
    assert is_ptr(node_name)
    suffix_len = len(ptr_suffix)
    return node_name[:-suffix_len]
|
28 |
|
|
|
29 |
|
|
def ptr_target(node):
    '''Returns xml_dom.value_node(node) for a pointer node.
    The node's name must satisfy is_ptr() (checked with an assert).'''
    node_name = name_of(node)
    assert is_ptr(node_name)
    return xml_dom.value_node(node)
|
32 |
13
|
aaronmk
|
|
33 |
|
|
def find_by_name(node, name):
    '''
    Searches the nodes yielded by xml_dom.NodeParentIter(node), in order, for
    an element called `name`.
    For each yielded node, the node itself is checked first, then each of its
    child elements: a pointer child (see is_ptr()) matches if its target's
    tagName equals `name`; any other child matches if its own name does.
    @return The first matching node, or None if nothing matched
    '''
    for ancestor in xml_dom.NodeParentIter(node):
        if name_of(ancestor) == name: return ancestor

        for elem in xml_dom.NodeElemIter(ancestor):
            elem_name = name_of(elem)
            if not is_ptr(elem_name):
                if elem_name == name: return elem
                continue

            # Pointers are matched by what they point to. Note that the raw
            # tagName is compared here, not name_of().
            pointee = ptr_target(elem)
            if pointee.tagName == name: return pointee
    return None
|
44 |
|
|
|
45 |
2039
|
aaronmk
|
class ColRef:
    '''A reference to a table column'''

    def __init__(self, name, idx):
        '''
        @param name Stored as self.name; also what str() of this object returns
        @param idx Stored as-is as self.idx
        '''
        self.name, self.idx = name, idx

    def __str__(self):
        return self.name
|
52 |
|
|
|
53 |
4242
|
aaronmk
|
# Prefix that put() looks for on string values to recognize them as input
# column names rather than literals (alias of xml_func.var_name_prefix)
input_col_prefix = xml_func.var_name_prefix

# Names of the structural XML functions that put() handles itself, before any
# row is inserted
put_special_funcs = set(['_setDefault', '_simplifyPath'])

# Sentinel default for put()'s parent_ids_loc param. Compared by identity.
no_parent_ids_loc = object() # tells put() there is no parent_ids_loc
|
58 |
|
|
|
59 |
6410
|
aaronmk
|
def put(db, node, row_ins_ct_ref=None, on_error=exc.reraise, col_defaults=None,
    in_table=None, parent_ids_loc=no_parent_ids_loc, next=None):
    '''
    Puts an XML node into the database.
    The node's name is used as the output table and its field children as the
    row passed to sql_io.put_table(). Children that are not fields are then put
    recursively, with this node's pkeys as their parent_ids_loc.
    The structural functions _setDefault and _simplifyPath are handled here
    without inserting a row of their own; _alt may return its first arg early.

    @param db Passed to the sql, sql_gen and sql_io calls
    @param node To use an entire XML document, pass root.firstChild.
    @param row_ins_ct_ref Passed through to sql_io.put_table()
    @param on_error Called with each error reported by sql_io.put_table(),
        after the offending node has been added to the error's message
    @param col_defaults Passed through to sql_io.put_table(). _setDefault
        layers its args on top of these for the subtree below it.
    @param in_table The input table that input_col_prefix-prefixed values
        refer to, or None if all values are literals
    @param parent_ids_loc If given, stored in the row under the node's "fkey"
        attribute (or, if that is empty, the parent table's pkey name)
    @param next Passed through to sql_io.put_table(). _simplifyPath replaces
        it for the subtree below it.
    @return None for a None node; the node's value for a text node; otherwise
        what sql_io.put_table() returned for this node (or, for the structural
        functions, what the recursive put of their arg returned)
    @raise xml_func.SyntaxError If a field occurs twice or a structural
        function's args are missing or malformed
    '''
    if node is None: return None # when no rows, root.firstChild == None
    elif xml_dom.is_text_node(node): return xml_dom.value(node)

    if col_defaults is None: col_defaults = {}

    # NOTE: put_() reads col_defaults, parent_ids_loc and next from this
    # function's scope at *call* time, so the reassignments of those vars below
    # deliberately change what later put_() calls pass on
    def put_(node):
        if util.is_str(node): return node
        return put(db, node, row_ins_ct_ref, on_error, col_defaults, in_table,
            parent_ids_loc, next)

    def augment_error(e): exc.add_msg(e, 'node:\n'+strings.ustr(node))
    def on_error_(e):
        augment_error(e)
        on_error(e)

    def wrap_e(e):
        # Always raises, so code after a wrap_e() call in an except clause is
        # only reached when the try body succeeded
        augment_error(e)
        raise xml_func.SyntaxError(e)

    is_func = xml_func.is_func(node)
    out_table = name_of(node)

    # Divide children into fields and children with fkeys to parent
    row = dicts.OnceOnlyDict()
    children = []
    try:
        for child in xml_dom.NodeElemIter(node):
            child_name = name_of(child)
            if xml_dom.is_empty(child): row[child_name] = None
            elif xml_dom.is_text(child):
                row[child_name] = strings.to_unicode(xml_dom.value(child))
            else:
                child_value = xml_dom.value_node(child)
                if ((is_func or is_ptr(child_name)
                    or xml_func.is_func(child_value))
                    and not xml_func.is_func(child)):
                    row[child_name] = child_value
                else: children.append(child)
    except dicts.KeyExistsError as e: wrap_e(e)

    # Special handling for structural XML functions
    if out_table == '_setDefault':
        # Parse args
        try: path = row.pop('path')
        except KeyError as e: wrap_e(e)

        # Must be reassigned before put_() is called so that put_() sees it
        col_defaults = dicts.MergeDict(dicts.WrapDict(put_, row), col_defaults)
        return put_(path)
    elif out_table == '_simplifyPath':
        # Parse args
        try:
            next = row['next'] # modifies outer next var used by put_()
            path = row['path']
        except KeyError as e: wrap_e(e)
        try: next = xpath.parse(next)
        except Parser.SyntaxError as e: wrap_e(e)
        try: next = next[0].name
        except IndexError as e: wrap_e(e)

        return put_(path)

    is_literals = in_table is None
    in_tables = []
    no_empty = set() # tables that must not be emptied at the end
    if not is_literals:
        in_tables.append(in_table)
        no_empty.add(in_table)

    def pkey_name(table): return sql.pkey_name(db, table, True)

    # Add fkey to parent
    if parent_ids_loc is not no_parent_ids_loc:
        if sql_gen.is_table_col(parent_ids_loc):
            no_empty.add(parent_ids_loc.table)
        parent_ptr = node.getAttribute('fkey')
        if parent_ptr == '': parent_ptr = pkey_name(name_of(node.parentNode))
        row[parent_ptr] = parent_ids_loc

    # Parse input columns
    row = row.inner # now allow keys to be overwritten
    # Iterate over a snapshot because the loop assigns into row
    for out_col, value in list(row.items()):
        if (not is_literals and util.is_str(value)
            and value.startswith(input_col_prefix)): # value is input column
            row[out_col] = sql_gen.Col(strings.remove_prefix(input_col_prefix,
                value), in_table)

    # Optimizations for structural XML functions
    if out_table == '_alt': # return first arg if non-NULL
        # Keys are unique, so the (key, value) pairs are ordered by key alone
        out_col, value = min(row.items()) # first arg
        if xml_dom.is_node(value): row[out_col] = value = put_(value)
        if not sql_gen.is_nullable(db, value): return value

    # Process values
    parent_ids_loc = no_parent_ids_loc # applies to this section
    # Iterate over a snapshot because the loop assigns into row
    for out_col, value in list(row.items()):
        # Handle forward pointers
        if xml_dom.is_node(value): row[out_col] = value = put_(value)

        # Translate values
        if isinstance(value, sql_gen.Col): # value is table column
            assert sql_gen.is_table_col(value)
            if value.table is not in_table: in_tables.append(value.table)
        else: # value is literal value
            row[out_col] = sql_gen.NamedCol(out_col, value)

    # Insert node
    try:
        pkeys_loc = sql_io.put_table(db, out_table, in_tables, row,
            row_ins_ct_ref, next, col_defaults, on_error_)
    except Exception as e:
        augment_error(e)
        raise
    if sql_gen.is_table_col(pkeys_loc): no_empty.add(pkeys_loc.table)

    sql.empty_temp(db, set(in_tables) - no_empty)

    # Insert children with fkeys to parent
    parent_ids_loc = pkeys_loc # applies to this section
    for child in children: put_(child)

    return pkeys_loc
|
186 |
3651
|
aaronmk
|
|
187 |
|
|
def get(db, node, limit=None, start=None):
    '''
    Selects rows from the table named by node's first child.
    Among that child's elements, empty ones name the columns to fetch and
    text ones are conditions (column name -> required value). If the child
    has an ID (see xml_dom.get_id()), it is added as a condition on the
    table's pkey. If no columns were named, only the pkey is fetched.
    @param limit, start Passed through to sql.select()
    @return The result of sql.select()
    @raise Exception If an element is neither empty nor text (would be a join)
    '''
    root = node.firstChild
    table = name_of(root)
    default_field = sql.pkey_name(db, table)

    fields = []
    conds = {}
    for elem in xml_dom.NodeElemIter(root):
        col = name_of(elem)
        if xml_dom.is_empty(elem): fields.append(col)
        elif xml_dom.is_text(elem): conds[col] = xml_dom.value(elem)
        else: raise Exception('Joins not supported yet')

    row_id = xml_dom.get_id(root)
    if row_id != None: # replace any existing value
        conds[sql.pkey_name(db, table)] = row_id
    if not fields: fields.append(default_field)

    return sql.select(db, table, fields, conds, limit, start)
|
206 |
|
|
|
207 |
|
|
# Controls when and how put_table() will partition the input table
partition_size = 1000000 # rows; must be >= NCBI.nodes size

def put_table(db, node, in_table, in_row_ct_ref=None, row_ins_ct_ref=None,
    limit=None, start=0, on_error=exc.reraise, col_defaults=None,
    partition_size=partition_size):
    '''
    Imports in_table through the XML tree `node`, one partition of at most
    partition_size rows at a time. Each partition is copied into a temp table,
    put() with that temp table as its in_table, and then emptied.

    @param node The XML tree that transforms the input to the output. Similar to
        put()'s node param, but with the input column name prefixed by
        input_col_prefix in place of the column value.
    @param in_table The input table. If None, node is put() directly without
        any partitioning.
    @param in_row_ct_ref If not None, its element [0] is incremented by the
        number of input rows in each imported partition
    @param row_ins_ct_ref Passed through to put()
    @param limit Max total # of input rows to import, or None for no limit
    @param start 0-based index of the first input row to import
    @param on_error Passed through to put()
    @param col_defaults Passed through to put(). Defaults to a new empty dict,
        which is shared by all partitions of this call.
    @param partition_size Max # of rows per partition
    @return sql_gen.Col Where the pkeys (from INSERT RETURNING) are made
        available. This is the value for the last imported partition, or None
        if no partition contained any rows.
    '''
    # Create the default per call rather than in the signature, so that calls
    # never share one dict
    if col_defaults is None: col_defaults = {}

    if in_table is None:
        return put(db, node, row_ins_ct_ref, on_error, col_defaults)

    in_table = sql_gen.as_Table(in_table)
    sql_io.mk_errors_table(db, in_table)
    in_table.set_srcs([in_table], overwrite=False)
    db.src = strings.ustr(in_table)

    db.autoexplain = True # but don't do this in row-based import

    # Subset and partition in_table
    # OK to do even if table already the right size because it takes <1 sec.
    full_in_table = in_table
    pkeys_loc = None # used if loop is never executed
    total = 0
    while limit is None or total < limit:
        # Adjust partition size if last partition
        this_limit = partition_size
        if limit is not None: this_limit = min(this_limit, limit - total)

        # Row # is internally 0-based, but 1-based to the user
        db.log_debug('********** Partition: rows '+str(start+1)+'-'
            +str(start+this_limit)+' **********', level=1.2)

        # Subset in_table
        in_table = sql_gen.Table(strings.ustr(full_in_table),
            srcs=full_in_table.srcs, is_temp=True) # prepend schema to name
        sql.copy_table_struct(db, full_in_table, in_table)
        try: sql.add_row_num(db, in_table, 'row_num')
        except sql.DatabaseErrors: pass # already has pkey
        cur = sql.insert_select(db, in_table, None, sql.mk_select(db,
            full_in_table, limit=this_limit, start=start))

        this_ct = cur.rowcount
        total += this_ct
        start += this_ct # advance start to fetch next set
        if this_ct == 0: break # in_table size is multiple of partition_size

        # Import data
        pkeys_loc = put(db, node, row_ins_ct_ref, on_error, col_defaults,
            in_table)
        if in_row_ct_ref is not None: in_row_ct_ref[0] += this_ct

        sql.empty_temp(db, in_table)

        if this_ct < partition_size: break # partial partition = last

        # Work around PostgreSQL's temp table disk space leak
        db.reconnect()

    return pkeys_loc
|