1 |
13
|
aaronmk
|
# XML-database conversion
|
2 |
|
|
|
3 |
2418
|
aaronmk
|
import copy
|
4 |
16
|
aaronmk
|
import re
|
5 |
13
|
aaronmk
|
from xml.dom import Node
|
6 |
|
|
|
7 |
446
|
aaronmk
|
import exc
|
8 |
2436
|
aaronmk
|
import Parser
|
9 |
46
|
aaronmk
|
import sql
|
10 |
3077
|
aaronmk
|
import sql_io
|
11 |
2268
|
aaronmk
|
import sql_gen
|
12 |
84
|
aaronmk
|
import strings
|
13 |
133
|
aaronmk
|
import util
|
14 |
46
|
aaronmk
|
import xml_dom
|
15 |
2113
|
aaronmk
|
import xml_func
|
16 |
2436
|
aaronmk
|
import xpath
|
17 |
13
|
aaronmk
|
|
18 |
139
|
aaronmk
|
def name_of(node):
    '''Returns node's tag name with any dotted prefix removed'''
    tag = node.tagName
    # The match is greedy, so everything through the *last* "." is dropped
    return re.sub(r'^.*\.', r'', tag)
|
19 |
16
|
aaronmk
|
|
20 |
13
|
aaronmk
|
ptr_suffix = '_id' # node names ending in this (case-insensitively) are pointers
|
21 |
|
|
|
22 |
138
|
aaronmk
|
def is_ptr(node_name):
    '''Whether node_name ends in ptr_suffix, ignoring the name's case'''
    lowered = node_name.lower()
    return lowered.endswith(ptr_suffix)
|
23 |
13
|
aaronmk
|
|
24 |
172
|
aaronmk
|
def ptr_type_guess(node_name):
    '''Guesses the name of a pointer's target by stripping ptr_suffix
    @param node_name A name for which is_ptr() is true (asserted)
    '''
    assert is_ptr(node_name)
    suffix_len = len(ptr_suffix)
    return node_name[:-suffix_len]
|
27 |
|
|
|
28 |
|
|
def ptr_target(node):
    '''Returns the value node of a pointer node
    @param node A node whose name_of() is a pointer name (asserted)
    '''
    node_name = name_of(node)
    assert is_ptr(node_name)
    return xml_dom.value_node(node)
|
31 |
13
|
aaronmk
|
|
32 |
|
|
def find_by_name(node, name):
    '''Finds a node called name among node's parents and their children
    Walks the nodes yielded by xml_dom.NodeParentIter(node). For each one,
    checks the node itself, then each of its child elements; a pointer child
    is checked via its target rather than its own name.
    @return The first matching node, or None if there is none
    '''
    for ancestor in xml_dom.NodeParentIter(node):
        if name_of(ancestor) == name:
            return ancestor
        for elem in xml_dom.NodeElemIter(ancestor):
            elem_name = name_of(elem)
            if not is_ptr(elem_name):
                if elem_name == name:
                    return elem
                continue
            pointee = ptr_target(elem)
            # NOTE(review): compares the raw tagName, while every other check
            # here goes through name_of() -- confirm whether that is intended
            if pointee.tagName == name:
                return pointee
    return None
|
43 |
|
|
|
44 |
1850
|
aaronmk
|
def get(db, node, limit=None, start=None):
    '''Runs a sql.select() described by an XML query tree
    @param db Passed through to sql.pkey() and sql.select()
    @param node Wrapper node whose first child names the table to select from.
        Each child element of that table node is either empty (its name is
        added to the selected fields) or text-only (its name and value are
        added to the conditions). Anything else raises Exception.
    @param limit Passed through to sql.select()
    @param start Passed through to sql.select()
    @return Whatever sql.select() returns
    '''
    node = node.firstChild
    table = name_of(node)
    pkey_ = sql.pkey(db, table) # looked up once and reused below
    
    fields = []
    conds = {}
    for child in xml_dom.NodeElemIter(node):
        child_name = name_of(child)
        if xml_dom.is_empty(child): fields.append(child_name)
        elif xml_dom.is_text(child): conds[child_name] = xml_dom.value(child)
        else: raise Exception('Joins not supported yet')
    
    id_ = xml_dom.get_id(node)
    if id_ is not None: conds[pkey_] = id_ # replace any existing pkey value
    if not fields: fields.append(pkey_) # nothing requested: select the pkey
    
    return sql.select(db, table, fields, conds, limit, start)
|
63 |
126
|
aaronmk
|
|
64 |
1864
|
aaronmk
|
def put(db, node, row_ct_ref=None, on_error=exc.raise_, pool=None,
    store_ids=False, parent_id=None):
    '''Inserts the row described by an XML node, then the rows of its children
    
    The node's name is the table. Each child element is handled as follows:
    * empty: the column is set to None
    * text-only: the column is set to the text, converted to unicode
    * pointer (name ends in ptr_suffix): the pointer's target is put() first
      and the value it returns is stored in the column
    * anything else: treated as a child row, which is put() after this row
      with this row's id as its parent_id
    
    @param row_ct_ref Passed through to sql_io.put()
    @param on_error Called with any sql.DatabaseErrors exception, after the
        node has been added to its message. If it returns, put() returns None.
    @param pool If set, child rows (calls that have a parent_id) are submitted
        with pool.apply_async() instead of being run inline
    @param store_ids Enables searching the tree for missing fields. Also
        causes each inserted row's id to be stored on its node.
    @param parent_id If set, stored in the column named by the node's 'fkey'
        attribute, or by the parent node's table's pkey if that is empty
    @return The value returned by sql_io.put(), or None after a handled error
    '''
    def pkey(table): return sql.pkey(db, table, True)
    
    def put_(node, parent_id=None):
        args = (db, node, row_ct_ref, on_error, pool, store_ids, parent_id)
        # Only rows with a parent are run in the pool; their result is not used
        if parent_id is not None and pool is not None:
            pool.apply_async(put, args)
        else: return put(*args)
    
    def on_error_(e):
        exc.add_msg(e, 'node:\n'+str(node))
        on_error(e)
    
    table = name_of(node)
    try: pkey_ = pkey(table)
    except sql.DatabaseErrors as e:
        on_error_(e)
        return None
    row = {}
    children = []
    
    # Divide children into fields and children with fkeys to parent
    for child in xml_dom.NodeElemIter(node):
        child_name = name_of(child)
        if xml_dom.is_empty(child): row[child_name] = None
        elif xml_dom.is_text(child):
            row[child_name] = strings.to_unicode(xml_dom.value(child))
        elif is_ptr(child_name): row[child_name] = put_(ptr_target(child))
        else: children.append(child)
    row.pop(pkey_, None) # remove any explicit pkey
    
    # Add fkey to parent
    if parent_id is not None:
        parent_ptr = node.getAttribute('fkey')
        if parent_ptr == '': parent_ptr = pkey(name_of(node.parentNode))
        row[parent_ptr] = parent_id
    
    # Insert node
    try:
        for try_num in range(2): # the initial attempt plus one retry
            try:
                id_ = sql_io.put(db, table, row, pkey_, row_ct_ref)
                if store_ids: xml_dom.set_id(node, id_)
                break
            except sql.NullValueException as e:
                col = e.cols[0]
                if try_num > 0: raise # exception still raised after retry
                if store_ids and is_ptr(col):
                    # Search for required column in ancestors and their children
                    target = find_by_name(node, ptr_type_guess(col))
                    if target is None: raise
                    row[col] = xml_dom.get_id(target)
                else: raise
    except sql.DatabaseErrors as e:
        on_error_(e)
        return None
    
    # Insert children with fkeys to parent
    for child in children: put_(child, id_)
    
    return id_
|
123 |
1996
|
aaronmk
|
|
124 |
2039
|
aaronmk
|
class ColRef:
    '''A reference to a table column'''
    def __init__(self, name, idx):
        '''
        @param name The column's name; this is what str() returns
        @param idx Stored as-is in self.idx
        '''
        self.name = name
        self.idx = idx
    
    def __str__(self): return self.name
    
    def __repr__(self):
        # Show both fields, so that debug output is unambiguous
        return 'ColRef(%r, %r)' % (self.name, self.idx)
|
131 |
|
|
|
132 |
3114
|
aaronmk
|
# Controls when and how put_table() will partition the input table
|
133 |
3145
|
aaronmk
|
partition_size = 500000 # rows; the most that put_table() selects per partition
|
134 |
3114
|
aaronmk
|
|
135 |
2177
|
aaronmk
|
input_col_prefix = '$' # marks a put_table() field value as an input column name
|
136 |
|
|
|
137 |
2549
|
aaronmk
|
# Names of XML functions with special structural handling; put_table()
# special-cases '_simplifyPath'
put_table_special_funcs = set(['_simplifyPath'])
|
138 |
|
|
|
139 |
2928
|
aaronmk
|
def put_table(db, node, in_table, in_row_ct_ref=None,
    row_ins_ct_ref=None, limit=None, start=0, on_error=exc.raise_,
    parent_ids_loc=None, next=None, top_call=True):
    '''
    @param node The XML tree that transforms the input to the output. Similar to
        put()'s node param, but with the input column name prefixed by
        input_col_prefix in place of the column value.
    @param in_table The input table; converted with sql_gen.as_Table()
    @param in_row_ct_ref If set, in_row_ct_ref[0] is incremented by the number
        of rows in each partition processed by the top call
    @param row_ins_ct_ref Passed through to sql_io.put_table()
    @param limit The most input rows to process, or None for no limit. Only
        used by the top call.
    @param start The 0-based input row to start at. Only used by the top call.
    @param on_error Passed through to sql_io.put_table()
    @param parent_ids_loc If set, stored in the column named by the node's
        'fkey' attribute, or by the parent node's table's pkey if that is empty
    @param next Passed through to sql_io.put_table(). Replaced by the 'next'
        arg of a _simplifyPath function for the calls made beneath it.
    @param top_call Whether this is the outermost call, which partitions
        in_table and then recurses once per partition with top_call=False
    @return sql_gen.Col Where the pkeys (from INSERT RETURNING) are made
        available. For the top call, this is the value for the last partition
        processed, or None if no partition was processed.
    '''
    in_table = sql_gen.as_Table(in_table)
    in_table.set_srcs([in_table], overwrite=False)
    db.src = str(in_table)
    
    # This closure reads in_table, parent_ids_loc, and next when it is *called*,
    # so rebinding those names below changes what later recursive calls receive
    def put_table_(node, in_row_ct_ref=None):
        return put_table(db, node, in_table, in_row_ct_ref, row_ins_ct_ref,
            None, 0, on_error, parent_ids_loc, next, False)
    
    # Subset and/or partition in_table if needed
    if top_call:
        db.autoanalyze = True # but don't do this in row-based import
        db.autoexplain = True # but don't do this in row-based import
        
        full_in_table = in_table
        
        # Stays None if the loop exits before any partition is processed
        # (limit of 0, or no input rows), instead of being left unbound
        pkeys_loc = None
        
        total = 0
        while limit is None or total < limit:
            # Adjust partition size if last partition
            this_limit = partition_size
            if limit is not None: this_limit = min(this_limit, limit - total)
            
            # Row # is internally 0-based, but 1-based to the user
            db.log_debug('********** Partition: rows '+str(start+1)+'-'
                +str(start+this_limit)+' **********', level=1.2)
            
            # Subset in_table
            in_table = copy.copy(full_in_table) # don't modify input!
            in_table.name = str(in_table) # prepend schema
            cur = sql.run_query_into(db, sql.mk_select(db, full_in_table,
                limit=this_limit, start=start), into=in_table, add_pkey_=True)
                # full_in_table will be shadowed (hidden) by created temp table
            
            this_ct = cur.rowcount
            total += this_ct
            start += this_ct # advance start to fetch next set
            if this_ct == 0: break # in_table size is multiple of partition_size
            
            # Recurse
            pkeys_loc = put_table_(node, in_row_ct_ref)
            if in_row_ct_ref is not None: in_row_ct_ref[0] += this_ct
            
            sql.empty_temp(db, in_table)
            
            if this_ct < partition_size: break # partial partition = last
            
            # Work around PostgreSQL's temp table disk space leak
            db.reconnect()
        
        return pkeys_loc
    
    is_func = xml_func.is_func(node)
    out_table = name_of(node)
    
    # Divide children into fields and children with fkeys to parent
    row = {}
    children = []
    for child in xml_dom.NodeElemIter(node):
        child_name = name_of(child)
        if xml_dom.is_empty(child): row[child_name] = None
        elif xml_dom.is_text(child):
            row[child_name] = strings.to_unicode(xml_dom.value(child))
        else:
            child_value = xml_dom.value_node(child)
            if is_func or is_ptr(child_name) or xml_func.is_func(child_value):
                row[child_name] = child_value
            else: children.append(child)
    
    # Special handling for structural XML functions
    if out_table == '_simplifyPath':
        # Parse args
        def wrap_e(e): raise xml_func.SyntaxError(e)
        try:
            next = row['next'] # modifies outer next var used by put_table_()
            row['require'] # value is unused, but the arg must be present
            path = row['path']
        except KeyError as e: wrap_e(e)
        try: next = xpath.parse(next)
        except Parser.SyntaxError as e: wrap_e(e)
        try: next = next[0].name
        except IndexError as e: wrap_e(e)
        
        return put_table_(path)
    
    no_empty = set([in_table]) # tables that must not be emptied below
    
    def pkey(table): return sql.pkey(db, table, True)
    
    # Remove any explicit pkey
    row.pop(pkey(out_table), None)
    
    # Add fkey to parent
    if parent_ids_loc is not None:
        no_empty.add(parent_ids_loc.table)
        parent_ptr = node.getAttribute('fkey')
        if parent_ptr == '': parent_ptr = pkey(name_of(node.parentNode))
        row[parent_ptr] = parent_ids_loc
    
    # Divide fields into input columns and literal values
    parent_ids_loc = None # applies to this section
    in_tables = [in_table]
    # Only values of existing keys are replaced inside the loop, so the dict's
    # size does not change while it is being iterated
    for out_col, value in row.items():
        # Handle forward pointers
        if xml_dom.is_node(value): row[out_col] = value = put_table_(value)
        
        # Translate values
        if isinstance(value, sql_gen.Col): # value is temp table column
            in_tables.append(value.table)
        elif util.is_str(value) and value.startswith(input_col_prefix):
            # value is input column
            row[out_col] = sql_gen.Col(strings.remove_prefix(input_col_prefix,
                value), in_table)
        else: # value is literal value; should only be string or None
            assert util.is_str(value) or value is None
            row[out_col] = sql_gen.NamedCol(out_col, value)
    
    # Insert node
    pkeys_loc = sql_io.put_table(db, out_table, in_tables, row, row_ins_ct_ref,
        None, next, is_func, on_error)
    
    sql.empty_temp(db, set(in_tables) - no_empty)
    
    # Insert children with fkeys to parent
    parent_ids_loc = pkeys_loc # applies to this section
    for child in children: put_table_(child)
    
    return pkeys_loc
|