Revision 1714
Added by Aaron Marcuse-Kubitza over 12 years ago
bin/map | ||
---|---|---|
5 | 5 |
# http://vegbank.org/vegdocs/xml/vegbank_example_ver1.0.2.xml |
6 | 6 |
|
7 | 7 |
import csv |
8 |
import itertools |
|
8 | 9 |
import os.path |
9 | 10 |
import sys |
10 | 11 |
import xml.dom.minidom as minidom |
... | ... | |
13 | 14 |
|
14 | 15 |
import csvs |
15 | 16 |
import exc |
17 |
import iters |
|
16 | 18 |
import maps |
17 | 19 |
import opts |
18 | 20 |
import Parser |
... | ... | |
222 | 224 |
|
223 | 225 |
in_db.close() |
224 | 226 |
elif in_is_xml: |
225 |
for in_xml_root in xml_parse.docs_iter(sys.stdin):
|
|
226 |
if map_path == None:
|
|
227 |
if map_path == None:
|
|
228 |
for in_xml_root in xml_parse.docs_iter(sys.stdin):
|
|
227 | 229 |
iter_ = xml_dom.NodeElemIter(in_xml_root) |
228 | 230 |
util.skip(iter_, xml_dom.is_text) # skip metadata |
229 | 231 |
row_ct = process_rows(lambda row, i: root.appendChild(row), |
230 | 232 |
iter_) |
231 |
else: |
|
233 |
else: |
|
234 |
def doc_rows(in_xml_root): |
|
232 | 235 |
rows = xpath.get(in_xml_root, in_root, limit=end) |
233 |
if rows == []: raise SystemExit('Map error: Root "'+in_root |
|
234 |
+'" not found in input') |
|
235 |
|
|
236 |
def get_value(in_, row): |
|
237 |
in_ = './{'+(','.join(strings.with_prefixes( |
|
238 |
['']+prefixes, in_)))+'}' # also with no prefix |
|
239 |
nodes = xpath.get(row, in_, allow_rooted=False) |
|
240 |
if nodes != []: return xml_dom.value(nodes[0]) |
|
241 |
else: return None |
|
242 |
|
|
243 |
row_ct = map_rows(get_value, rows) |
|
236 |
if rows == []: raise SystemExit('Map error: Root "' |
|
237 |
+in_root+'" not found in input') |
|
238 |
return rows |
|
239 |
|
|
240 |
def get_value(in_, row): |
|
241 |
in_ = './{'+(','.join(strings.with_prefixes( |
|
242 |
['']+prefixes, in_)))+'}' # also with no prefix |
|
243 |
nodes = xpath.get(row, in_, allow_rooted=False) |
|
244 |
if nodes != []: return xml_dom.value(nodes[0]) |
|
245 |
else: return None |
|
246 |
|
|
247 |
row_ct = map_rows(get_value, iters.flatten(itertools.imap( |
|
248 |
doc_rows, xml_parse.docs_iter(sys.stdin)))) |
|
244 | 249 |
else: # input is CSV |
245 | 250 |
map_ = dict(mappings) |
246 | 251 |
reader, col_names = csvs.reader_and_header(sys.stdin) |
Also available in: Unified diff
bin/map: Fixed a bug in iteration over consecutive XML documents where only the first element of the first document was processed. Using iters.flatten() and itertools.imap() fixes this by treating the consecutive XML documents as one continuous stream of rows.