Revision 21
Added by Aaron Marcuse-Kubitza about 13 years ago
scripts/data2xml/xml_dom.py | ||
---|---|---|
1 |
# XML DOM tree manipulation |
|
2 |
|
|
3 |
from xml.dom import Node |
|
4 |
|
|
5 |
class MappingException(Exception):
    """Exception subclass that adds no behavior of its own."""
|
6 |
|
|
7 |
def by_tag_name(parent, name):
    """Return the last element child of `parent` tagged `name`, or None.

    Children are scanned from the last one backwards; nodes that are not
    elements are skipped.
    """
    candidate = parent.lastChild
    while candidate is not None:
        if (candidate.nodeType == Node.ELEMENT_NODE
                and candidate.tagName == name):
            return candidate
        candidate = candidate.previousSibling
    return None
|
13 |
|
|
14 |
def value(node):
    """Return the nodeValue of `node`'s first child, or of `node` itself.

    The node's own nodeValue is used only when it has no children.
    """
    first = node.firstChild
    if first is None:
        return node.nodeValue
    return first.nodeValue
|
17 |
|
|
18 |
def by_path(doc, path, create=False, parent=None):
    """Follow `path` through the DOM and return the node it ends at.

    doc: document providing documentElement, createElement and createTextNode
    path: sequence of steps, each with name, value, attrs and is_attr
    create: when true, build any step that has no match (with its value and
        attribute conditions) instead of giving up
    parent: node to start from; defaults to the document's root element

    Returns the node reached by the last step, or None when some step has no
    match and `create` is false.
    """
    if parent is None:
        parent = doc.documentElement
    for elem in path:
        # Find the candidate node for this step
        if elem.is_attr:
            node = parent.getAttributeNode(elem.name)
        elif elem.name == '.':
            node = parent
        else:
            node = by_tag_name(parent, elem.name)

        # Reject a candidate whose value does not match the required one
        if (node is not None and elem.value is not None
                and value(node) != elem.value):
            node = None
        if node is not None:
            # Only test conditions against a real candidate: passing None as
            # parent would make the recursive call search from the root
            for attr in elem.attrs:
                if by_path(doc, attr, parent=node) is None:
                    node = None
                    break

        if node is None:
            if not create:
                return None
            # Build the missing step under the current parent
            if elem.is_attr:
                parent.setAttribute(elem.name, '')
                node = parent.getAttributeNode(elem.name)
            else:
                node = parent.appendChild(doc.createElement(elem.name))
            if elem.value is not None:
                if node.nodeType == Node.ELEMENT_NODE:
                    node.appendChild(doc.createTextNode(elem.value))
                else:
                    node.nodeValue = elem.value
            # Create whatever the conditions require on the new node
            for attr in elem.attrs:
                by_path(doc, attr, create, node)
        parent = node
    return parent
scripts/data2xml/XpathParser.py | ||
---|---|---|
1 |
# XPath parser built on the general recursive descent Parser |
|
2 |
|
|
3 |
from Parser import Parser |
|
4 |
|
|
5 |
class XpathElem:
    """One step of a parsed path.

    name: name of the step
    value: required/assigned value, or None when the step carries none
    is_attr: whether the step names an attribute (written with a leading '@')
    attrs: list of condition paths attached to the step; a fresh list is
        created per instance when none is given
    """

    def __init__(self, name, value=None, is_attr=False, attrs=None):
        if attrs is None:
            attrs = []
        self.name = name
        self.value = value
        self.is_attr = is_attr
        self.attrs = attrs

    def __repr__(self):
        if self.is_attr:
            prefix = '@'
        else:
            prefix = ''
        return prefix+self.name+repr(self.attrs)+'='+repr(self.value)

    def __eq__(self, other):
        # Comparing against unrelated objects (e.g. None) must not raise
        if not isinstance(other, XpathElem):
            return NotImplemented
        return self.__dict__ == other.__dict__

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
|
19 |
|
|
20 |
class XpathParser(Parser):
    """Parser subclass turning a path string into a list of XpathElem steps.

    Grammar, as implemented by the methods below:
        main   := '/' path ('->' '/' path)*
        path   := step ('/' step)*
        step   := '@'? fields ('[' attrs ']')?
        fields := '{' field (',' field)* '}' | field
        attrs  := path '=' value (',' path '=' value)*
    The matching primitives (_match_str, _match_re) come from Parser.
    """

    def _main(self):
        """Parse all '->'-separated paths and return the last one."""
        self._match_str('/', required=True)
        result = self._path()
        while self._match_str('->'):
            self._match_str('/', required=True)
            result = self._path()  # just use last path for now
        return result

    def _path(self):
        """Parse '/'-separated steps into a list of XpathElem."""
        steps = []
        more = True
        while more:
            # '@' must be tried before the name is read
            is_attr = self._match_str('@')
            name = self._fields()
            step = XpathElem(name, is_attr=is_attr)
            if self._match_str('['):
                step.attrs = self._attrs()
                self._match_str(']', required=True)
            steps.append(step)
            more = self._match_str('/')
        return steps

    def _fields(self):
        """Parse a single field or a '{a,b,...}' group of fields."""
        if not self._match_str('{'):
            return self._field()
        names = [self._field()]
        while self._match_str(','):
            names.append(self._field())
        self._match_str('}', required=True)
        return names[0]  # just use first field for now

    def _attrs(self):
        """Parse comma-separated 'path=value' conditions into a list of paths.

        The value is stored on the last step of each condition path.
        """
        conditions = []
        keep_going = True
        while keep_going:
            cond = self._path()
            self._match_str('=', required=True)
            cond[-1].value = self._value()
            conditions.append(cond)
            keep_going = self._match_str(',')
        return conditions

    def _field(self):
        """Parse one field, which is currently just a name."""
        return self._name()

    def _name(self):
        """Match a name made of word characters and dots."""
        return self._match_re(r'[\w.]+', required=True)

    def _value(self):
        """Match a value made of word characters, dots and '|'."""
        return self._match_re(r'[\w.|]+', required=True)
|
67 | 0 |
scripts/xml2db/xml_util.py | ||
---|---|---|
26 | 26 |
|
27 | 27 |
def first_elem(node):
    """Return the first item produced by NodeElemIter(node).

    StopIteration from the iterator propagates when it yields nothing.
    """
    children = NodeElemIter(node)
    return children.next()
28 | 28 |
|
29 |
class NodeElemReverseIter:
    """Iterate over the element children of a node, last to first.

    Children that are not element nodes are skipped.
    """

    def __init__(self, node):
        # Cursor into the child list; may rest on a non-element until curr()
        self.child = node.lastChild

    def __iter__(self):
        return self

    def curr(self):
        """Return the element at or before the cursor without consuming it.

        Raises StopIteration when no element child remains.
        """
        while self.child is not None:
            if self.child.nodeType == Node.ELEMENT_NODE:
                return self.child
            self.child = self.child.previousSibling
        raise StopIteration

    def next(self):
        """Return the current element and move the cursor before it."""
        child = self.curr()
        self.child = self.child.previousSibling
        return child

    # Python 3 looks up __next__; next() stays for existing explicit callers
    __next__ = next
|
44 |
|
|
45 |
def last_elem(node):
    """Return the last element child of `node`.

    StopIteration from the iterator propagates when there is none.
    """
    children = NodeElemReverseIter(node)
    return children.next()
|
46 |
|
|
29 | 47 |
class NodeParentIter: |
30 | 48 |
def __init__(self, node): self.node = node |
31 | 49 |
|
... | ... | |
47 | 65 |
|
48 | 66 |
def value(node): |
49 | 67 |
if node.firstChild != None: return node.firstChild.nodeValue.strip() |
50 |
else: return None |
|
68 |
else: return node.nodeValue |
|
69 |
|
|
70 |
def by_tag_name(node, name):
    """Return the last element child of `node` tagged `name`, or None."""
    for elem in NodeElemReverseIter(node):
        if elem.tagName != name:
            continue
        return elem
    return None
scripts/data2xml/test | ||
---|---|---|
3 | 3 |
# Usage: [env n=<num-rows>] ./test |
4 | 4 |
|
5 | 5 |
selfDir="$(dirname -- "$0")" |
6 |
cd "$selfDir" |
|
6 | 7 |
|
7 | 8 |
test -n "$n" || n=2 |
8 | 9 |
let n++ |
9 | 10 |
|
10 |
cd "$selfDir" |
|
11 | 11 |
head -$n NYSpecimenDataAmericas.csv|./data2xml VegX NYBG-VegBank-VegX_mapping.csv |
scripts/data2xml/xml_util.py | ||
---|---|---|
1 |
# XML DOM tree manipulation |
|
2 |
|
|
3 |
from xml.dom import Node |
|
4 |
|
|
5 |
def name_of(node):
    """Return the tag name of `node` in lower case."""
    tag = node.tagName
    return tag.lower()
|
6 |
|
|
7 |
def get_id(node):
    """Return the value of the `id` attribute of `node`."""
    id_ = node.getAttribute('id')
    return id_
|
8 |
|
|
9 |
def set_id(node, id_):
    """Set the `id` attribute of `node` to `id_`."""
    node.setAttribute('id', id_)
|
10 |
|
|
11 |
class NodeElemIter:
    """Iterate over the element children of a node, first to last.

    Children that are not element nodes are skipped.
    """

    def __init__(self, node):
        # Cursor into the child list; may rest on a non-element until curr()
        self.child = node.firstChild

    def __iter__(self):
        return self

    def curr(self):
        """Return the element at or after the cursor without consuming it.

        Raises StopIteration when no element child remains.
        """
        while self.child is not None:
            if self.child.nodeType == Node.ELEMENT_NODE:
                return self.child
            self.child = self.child.nextSibling
        raise StopIteration

    def next(self):
        """Return the current element and move the cursor past it."""
        child = self.curr()
        self.child = self.child.nextSibling
        return child

    # Python 3 looks up __next__; next() stays for existing explicit callers
    __next__ = next
|
26 |
|
|
27 |
def first_elem(node):
    """Return the first element child of `node`.

    StopIteration from the iterator propagates when there is none.
    """
    children = NodeElemIter(node)
    return children.next()
|
28 |
|
|
29 |
class NodeElemReverseIter:
    """Iterate over the element children of a node, last to first.

    Children that are not element nodes are skipped.
    """

    def __init__(self, node):
        # Cursor into the child list; may rest on a non-element until curr()
        self.child = node.lastChild

    def __iter__(self):
        return self

    def curr(self):
        """Return the element at or before the cursor without consuming it.

        Raises StopIteration when no element child remains.
        """
        while self.child is not None:
            if self.child.nodeType == Node.ELEMENT_NODE:
                return self.child
            self.child = self.child.previousSibling
        raise StopIteration

    def next(self):
        """Return the current element and move the cursor before it."""
        child = self.curr()
        self.child = self.child.previousSibling
        return child

    # Python 3 looks up __next__; next() stays for existing explicit callers
    __next__ = next
|
44 |
|
|
45 |
def last_elem(node):
    """Return the last element child of `node`.

    StopIteration from the iterator propagates when there is none.
    """
    children = NodeElemReverseIter(node)
    return children.next()
|
46 |
|
|
47 |
class NodeParentIter:
    """Iterate from a node up through its ancestors.

    Iteration starts with the given node itself and stops at the first node
    that is missing or is not an element.
    """

    def __init__(self, node):
        self.node = node

    def __iter__(self):
        return self

    def curr(self):
        """Return the node under the cursor without consuming it.

        Raises StopIteration when the cursor is None or not on an element.
        """
        if self.node is not None and self.node.nodeType == Node.ELEMENT_NODE:
            return self.node
        raise StopIteration

    def next(self):
        """Return the current node and move the cursor to its parent."""
        node = self.curr()
        self.node = self.node.parentNode
        return node

    # Python 3 looks up __next__; next() stays for existing explicit callers
    __next__ = next
|
61 |
|
|
62 |
def is_text(node):
    """Return True when `node` has no element children."""
    has_element = any(True for _ in NodeElemIter(node))
    return not has_element
|
65 |
|
|
66 |
def value(node):
    """Return the stripped nodeValue of `node`'s first child.

    When `node` has no children its own nodeValue is returned unstripped.
    When the first child has no nodeValue (e.g. it is an element), None is
    returned instead of failing on `.strip()`.
    """
    first = node.firstChild
    if first is None:
        return node.nodeValue
    text = first.nodeValue
    if text is None:
        return None
    return text.strip()
|
69 |
|
|
70 |
def by_tag_name(node, name):
    """Return the last element child of `node` tagged `name`, or None."""
    for elem in NodeElemReverseIter(node):
        if elem.tagName != name:
            continue
        return elem
    return None
scripts/data2xml/NYBG-VegBank-VegX_mapping.csv | ||
---|---|---|
1 | 1 |
"NYBG","VegBank","VegX" |
2 | 2 |
"key",, |
3 | 3 |
"DateLastModified",, |
4 |
"InstitutionCode","/taxonInterpretation/museum_ID->party/OrganizationName/_name/firstName","/*s/taxonNameUsageConcept/partyWithRole/partyID->/parties/party/organizationName/_name/firstName" |
|
5 |
"CollectionCode","/taxonInterpretation/museum_ID->party/OrganizationName/_name/lastName","/*s/taxonNameUsageConcept/partyWithRole/partyID->/parties/party/organizationName/_name/lastName" |
|
4 |
"InstitutionCode","/taxonInterpretation/museum_ID->/party/OrganizationName/_name/firstName","/*s/taxonNameUsageConcept/partyWithRole/partyID->/parties/party/organizationName/_name/firstName"
|
|
5 |
"CollectionCode","/taxonInterpretation/museum_ID->/party/OrganizationName/_name/lastName","/*s/taxonNameUsageConcept/partyWithRole/partyID->/parties/party/organizationName/_name/lastName"
|
|
6 | 6 |
"CatalogNumber","/taxonInterpretation/museumAccessionNumber","/*s/taxonNameUsageConcept/voucher" |
7 | 7 |
"ScientificName","/plantName/plantName","/*s/taxonName/Simple" |
8 | 8 |
"BasisOfRecord",, |
... | ... | |
14 | 14 |
"Genus","/plantName[plantStatus/plantLevel=Genus]/plantName","/*s/taxonConcept[Rank/@code=gen]/Name" |
15 | 15 |
"Species","/plantName[plantStatus/plantLevel=Species]/plantName","/*s/taxonConcept[Rank/@code=sp]/Name" |
16 | 16 |
"Subspecies","/plantName[plantStatus/plantLevel=Subspecies]/plantName","/*s/taxonConcept[Rank/@code=ssp]/Name" |
17 |
"ScientificNameAuthor","/plantConcept/reference_ID->referenceParty/{givenName,surname}","/*s/taxonConcept/AccordingTo/Simple" |
|
18 |
"IdentifiedBy","/taxonInterpretation/PARTY_ID->party/{givenName,middleName,surName}","/*s/taxonDetermination/partyWithRole/partyID->/parties/party/individualName/{givenName,surName}"
|
|
17 |
"ScientificNameAuthor","/plantConcept/reference_ID->/referenceParty/{givenName,surname}","/*s/taxonConcept/AccordingTo/Simple"
|
|
18 |
"IdentifiedBy","/taxonInterpretation/PARTY_ID->/party/{givenName,middleName,surName}","/*s/taxonDetermination/partyWithRole[role=identifier]/partyID->/parties/party/individualName/{givenName,surName}"
|
|
19 | 19 |
"YearIdentified","/taxonInterpretation/interpretationDate/_date/year","/*s/taxonDetermination/date/_date/year" |
20 | 20 |
"MonthIdentified","/taxonInterpretation/interpretationDate/_date/month","/*s/taxonDetermination/date/_date/month" |
21 | 21 |
"DayIdentified","/taxonInterpretation/interpretationDate/_date/day","/*s/taxonDetermination/date/_date/day" |
22 | 22 |
"TypeStatus",, |
23 | 23 |
"CollectorNumber",, |
24 | 24 |
"FieldNumber","/taxonInterpretation/collectionNumber","/*s/taxonNameUsageConcept/authorCode" |
25 |
"Collector","/taxonInterpretation/collector_ID->party/{givenName,middleName,surName}","/*s/taxonDetermination/partyWithRole[role=collector]/partyID->/parties/party/individualName/{givenName,surName}" |
|
25 |
"Collector","/taxonInterpretation/collector_ID->/party/{givenName,middleName,surName}","/*s/taxonDetermination/partyWithRole[role=collector]/partyID->/parties/party/individualName/{givenName,surName}"
|
|
26 | 26 |
"YearCollected","/taxonInterpretation/collectionDate/_date/year","/*s/plotObservation/obsStartDate/_date/year" |
27 | 27 |
"MonthCollected","/taxonInterpretation/collectionDate/_date/month","/*s/plotObservation/obsStartDate/_date/month" |
28 | 28 |
"DayCollected","/taxonInterpretation/collectionDate/_date/day","/*s/plotObservation/obsStartDate/_date/day" |
scripts/data2xml/xpath.py | ||
---|---|---|
1 |
# XPath-based XML tree manipulation |
|
2 |
|
|
3 |
from xml.dom import Node |
|
4 |
|
|
5 |
from Parser import Parser |
|
6 |
import xml_util |
|
7 |
|
|
8 |
class XpathElem:
    """One step of a parsed path.

    name: name of the step
    value: required/assigned value, or None when the step carries none
    attrs: list of condition paths attached to the step; a fresh list is
        created per instance when none is given
    is_attr: whether the step names an attribute (written with a leading '@')
    is_ptr: whether the step was followed by '->'
    """

    def __init__(self, name, value=None, attrs=None, is_attr=False,
                 is_ptr=False):
        if attrs is None:
            attrs = []
        self.name = name
        self.value = value
        self.attrs = attrs
        self.is_attr = is_attr
        self.is_ptr = is_ptr

    def __repr__(self):
        str_ = ''
        if self.is_attr:
            str_ += '@'
        str_ += self.name+repr(self.attrs)+'='+repr(self.value)
        if self.is_ptr:
            str_ += '->'
        return str_

    def __eq__(self, other):
        # Comparing against unrelated objects (e.g. None) must not raise
        if not isinstance(other, XpathElem):
            return NotImplemented
        return self.__dict__ == other.__dict__

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so define it explicitly
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
|
26 |
|
|
27 |
class XpathParser(Parser):
    """Parser subclass turning a path string into a list of XpathElem steps.

    Grammar, as implemented by the methods below:
        main   := '/' path
        path   := step ('/' step)*
        step   := '@'? fields ('[' attrs ']')? '->'?
        fields := '{' field (',' field)* '}' | field
        attrs  := path '=' value (',' path '=' value)*
    The matching primitives (_match_str, _match_re) come from Parser.
    """

    def _main(self):
        """Parse the mandatory leading '/' followed by a path."""
        self._match_str('/', required=True)
        steps = self._path()
        return steps

    def _path(self):
        """Parse '/'-separated steps into a list of XpathElem."""
        steps = []
        more = True
        while more:
            # '@' must be tried before the name is read
            is_attr = self._match_str('@')
            name = self._fields()
            step = XpathElem(name, is_attr=is_attr)
            if self._match_str('['):
                step.attrs = self._attrs()
                self._match_str(']', required=True)
            step.is_ptr = self._match_str('->')
            steps.append(step)
            more = self._match_str('/')
        return steps

    def _fields(self):
        """Parse a single field or a '{a,b,...}' group of fields."""
        if not self._match_str('{'):
            return self._field()
        names = [self._field()]
        while self._match_str(','):
            names.append(self._field())
        self._match_str('}', required=True)
        return names[0]  # just use first field for now

    def _attrs(self):
        """Parse comma-separated 'path=value' conditions into a list of paths.

        The value is stored on the last step of each condition path.
        """
        conditions = []
        keep_going = True
        while keep_going:
            cond = self._path()
            self._match_str('=', required=True)
            cond[-1].value = self._value()
            conditions.append(cond)
            keep_going = self._match_str(',')
        return conditions

    def _field(self):
        """Parse one field, which is currently just a name."""
        return self._name()

    def _name(self):
        """Match a name made of word characters and dots."""
        return self._match_re(r'[\w.]+', required=True)

    def _value(self):
        """Match a value made of word characters, dots and '|'."""
        return self._match_re(r'[\w.|]+', required=True)
|
72 |
|
|
73 |
def set_id(path, id_, has_type_containers=True):
    """Append an `@id` condition with value `id_` to one step of `path`.

    The step is path[1] when `has_type_containers` is true and path[0]
    otherwise. `path` is modified in place.
    """
    id_level = 1 if has_type_containers else 0
    id_cond = [XpathElem('id', id_, is_attr=True)]
    path[id_level].attrs.append(id_cond)
|
77 |
|
|
78 |
def get(doc, path, create=False, parent=None):
    """Follow `path` through the DOM and return the node it ends at.

    doc: document providing documentElement, createElement and createTextNode
    path: sequence of XpathElem steps
    create: when true, build any step that has no match (with its value and
        attribute conditions) instead of giving up
    parent: node to start from; defaults to the document's root element

    Returns the node reached by the last step, or None when some step has no
    match and `create` is false.
    """
    if parent is None:
        parent = doc.documentElement
    for elem in path:
        # Find the candidate node for this step
        if elem.is_attr:
            node = parent.getAttributeNode(elem.name)
        elif elem.name == '.':
            node = parent
        else:
            node = xml_util.by_tag_name(parent, elem.name)

        # Reject a candidate whose value does not match the required one
        if (node is not None and elem.value is not None
                and xml_util.value(node) != elem.value):
            node = None
        if node is not None:
            # Only test conditions against a real candidate: passing None as
            # parent would make the recursive call search from the root
            for attr in elem.attrs:
                if get(doc, attr, parent=node) is None:
                    node = None
                    break

        if node is None:
            if not create:
                return None
            # Build the missing step under the current parent
            if elem.is_attr:
                parent.setAttribute(elem.name, '')
                node = parent.getAttributeNode(elem.name)
            else:
                node = parent.appendChild(doc.createElement(elem.name))
            if elem.value is not None:
                if node.nodeType == Node.ELEMENT_NODE:
                    node.appendChild(doc.createTextNode(elem.value))
                else:
                    node.nodeValue = elem.value
            # Create whatever the conditions require on the new node
            for attr in elem.attrs:
                get(doc, attr, create, node)

        if elem.is_ptr:
            pass  # pointer steps are recognized but not acted on here
        parent = node
    return parent
scripts/data2xml/data2xml | ||
---|---|---|
7 | 7 |
from copy import deepcopy |
8 | 8 |
from xml.dom.minidom import getDOMImplementation |
9 | 9 |
|
10 |
from XpathParser import XpathParser, XpathElem |
|
11 |
from xml_dom import by_path |
|
10 |
import xpath |
|
12 | 11 |
|
13 | 12 |
def main(): |
14 | 13 |
prog_name = sys.argv.pop(0) |
... | ... | |
34 | 33 |
path = path.replace('<name>', name) |
35 | 34 |
path, repl_ct = re.subn(r'(?<=/)\*(?=s/(\w+))', r'\1', path) |
36 | 35 |
if repl_ct > 0: has_type_containers = True # *s used for type elem |
37 |
mappings[name] = XpathParser(path).parse() |
|
36 |
mappings[name] = xpath.XpathParser(path).parse()
|
|
38 | 37 |
stream.close() |
39 |
if has_type_containers: id_level = 1 |
|
40 |
else: id_level = 0 |
|
41 | 38 |
|
42 | 39 |
# Process dataset |
43 | 40 |
doc = getDOMImplementation().createDocument(None, dest, None) |
... | ... | |
51 | 48 |
value = row[idx] |
52 | 49 |
if value != '' and name in mappings: |
53 | 50 |
path = deepcopy(mappings[name]) # don't modify main value! |
54 |
path[id_level].attrs.append([XpathElem('id', row_id, |
|
55 |
is_attr=True)]) |
|
51 |
xpath.set_id(path, row_id, has_type_containers) |
|
56 | 52 |
path[-1].value = value |
57 |
by_path(doc, path, True)
|
|
53 |
xpath.get(doc, path, True)
|
|
58 | 54 |
row_idx += 1 |
59 | 55 |
doc.writexml(sys.stdout, addindent=' ', newl='\n') |
60 | 56 |
|
Also available in: Unified diff
Merged data2xml XPath functionality into xpath.py. Merged data2xml xml_dom.py and xml2db xml_util.py into identical xml_util.py for each script.