Project

General

Profile

« Previous | Next » 

Revision 21

Merged data2xml XPath functionality into xpath.py. Merged data2xml xml_dom.py and xml2db xml_util.py into identical xml_util.py for each script.

View differences:

scripts/data2xml/xml_dom.py
1
# XML DOM tree manipulation
2

  
3
from xml.dom import Node
4

  
5
class MappingException(Exception):
    """Exception type declared by the DOM helpers; adds nothing to Exception."""
6

  
7
def by_tag_name(parent, name):
    """Return the last child element of `parent` whose tag is `name`.

    Children are scanned from `parent.lastChild` backwards, so when several
    children share the tag, the last one in document order is returned.
    Non-element children (text, comments, ...) are skipped.

    Returns None when no child element has that tag.
    """
    node = parent.lastChild
    while node is not None:
        if node.nodeType == Node.ELEMENT_NODE and node.tagName == name:
            return node
        node = node.previousSibling
    return None
13

  
14
def value(node):
    """Return the value of `node`.

    When the node has children, the nodeValue of its first child is returned
    (for an element whose first child is a text node, that text). Otherwise
    the node's own nodeValue is returned.
    """
    first = node.firstChild
    if first is not None:
        return first.nodeValue
    return node.nodeValue
17

  
18
def by_path(doc, path, create=False, parent=None):
    """Walk `path` through the DOM, optionally creating whatever is missing.

    doc    -- DOM document; used as the node factory and, when `parent` is
              not given, as the source of the starting element
    path   -- sequence of path steps; each step is read for `name`, `value`,
              `is_attr` and `attrs` (a list of sub-paths that must also
              resolve beneath the matched node)
    create -- when true, a step that does not match is created (together
              with its value and its `attrs` sub-paths) instead of aborting
    parent -- node to start from; defaults to doc.documentElement

    Returns the node reached by the last step, or None when a step did not
    match and `create` is false.
    """
    if parent is None:
        parent = doc.documentElement
    for elem in path:
        # Locate the candidate node for this step
        if elem.is_attr:
            node = parent.getAttributeNode(elem.name)
        elif elem.name == '.':
            node = parent
        else:
            node = by_tag_name(parent, elem.name)

        # A step carrying a value only matches a node with that exact value
        if (node is not None and elem.value is not None
                and value(node) != elem.value):
            node = None

        # Sub-paths are only checked against a real candidate. Evaluating
        # them with no candidate would silently search from the document
        # root and could never turn the miss into a match.
        if node is not None:
            for attr in elem.attrs:
                if by_path(doc, attr, parent=node) is None:
                    node = None
                    break

        if node is None:
            if not create:
                return None
            # Build the missing step: the node itself, then its value, then
            # everything its sub-paths require.
            if elem.is_attr:
                parent.setAttribute(elem.name, '')
                node = parent.getAttributeNode(elem.name)
            else:
                node = parent.appendChild(doc.createElement(elem.name))
            if elem.value is not None:
                if node.nodeType == Node.ELEMENT_NODE:
                    node.appendChild(doc.createTextNode(elem.value))
                else:
                    node.nodeValue = elem.value
            for attr in elem.attrs:
                by_path(doc, attr, create, node)
        parent = node
    return parent
scripts/data2xml/XpathParser.py
1
# XPath parser built on the general recursive descent Parser
2

  
3
from Parser import Parser
4

  
5
class XpathElem:
    """One step of a parsed XPath-like mapping path.

    Attributes:
    name    -- element or attribute name of the step
    value   -- value associated with the step, or None
    is_attr -- whether the step names an attribute (repr prefixes it with '@')
    attrs   -- list of sub-paths (each a list of XpathElem) attached to the
               step
    """
    def __init__(self, name, value=None, is_attr=False, attrs=None):
        if attrs is None:
            attrs = []  # fresh list per instance, never a shared default
        self.name = name
        self.value = value
        self.is_attr = is_attr
        self.attrs = attrs

    def __repr__(self):
        prefix = ''
        if self.is_attr:
            prefix = '@'
        return prefix+self.name+repr(self.attrs)+'='+repr(self.value)

    def __eq__(self, other):
        # Only another XpathElem can be equal. Anything else (None, str, ...)
        # is handed back to Python's default comparison instead of raising
        # AttributeError on a missing __dict__.
        if not isinstance(other, XpathElem):
            return NotImplemented
        return self.__dict__ == other.__dict__

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
19

  
20
class XpathParser(Parser):
    """Parser for the XPath-like mapping paths.

    All input is consumed through _match_str() and _match_re(), which come
    from the Parser base class (not visible here; presumably they return a
    false value when nothing matches and `required` is not set -- confirm
    against Parser).
    """

    def _main(self):
        # One or more '/'-rooted paths chained with '->'; only the last path
        # parsed is returned for now.
        last_path = None
        more = True
        while more:
            self._match_str('/', required=True)
            last_path = self._path()
            more = self._match_str('->')
        return last_path

    def _path(self):
        # '/'-separated steps, each optionally prefixed with '@' and
        # optionally followed by a '[...]' group
        steps = []
        while True:
            is_attr = self._match_str('@')
            step = XpathElem(name=self._fields(), is_attr=is_attr)
            if self._match_str('['):
                step.attrs = self._attrs()
                self._match_str(']', required=True)
            steps.append(step)
            if not self._match_str('/'):
                return steps

    def _fields(self):
        # Either a single field or a '{a,b,...}' group; for a group only the
        # first field is used for now.
        if not self._match_str('{'):
            return self._field()
        names = [self._field()]
        while self._match_str(','):
            names.append(self._field())
        self._match_str('}', required=True)
        return names[0]

    def _attrs(self):
        # Comma-separated 'path=value' entries; the value is stored on the
        # last step of each entry's path.
        entries = []
        more = True
        while more:
            sub_path = self._path()
            self._match_str('=', required=True)
            sub_path[-1].value = self._value()
            entries.append(sub_path)
            more = self._match_str(',')
        return entries

    def _field(self):
        return self._name()

    def _name(self):
        return self._match_re(r'[\w.]+', required=True)

    def _value(self):
        return self._match_re(r'[\w.|]+', required=True)
67 0

  
scripts/xml2db/xml_util.py
26 26

  
27 27
def first_elem(node):
    """Return the first element child of `node`.

    The StopIteration raised by NodeElemIter when `node` has no element
    children is propagated to the caller.
    """
    elems = NodeElemIter(node)
    return elems.next()
28 28

  
29
class NodeElemReverseIter:
    """Iterator over the element children of a DOM node, last to first.

    Non-element children (text, comments, ...) are skipped.
    """
    def __init__(self, node):
        self.child = node.lastChild

    def __iter__(self):
        return self

    def curr(self):
        """Return the element at the current position without consuming it.

        Moves back past any non-element siblings first. Raises StopIteration
        when no element is left.
        """
        while self.child is not None:
            if self.child.nodeType == Node.ELEMENT_NODE:
                return self.child
            self.child = self.child.previousSibling
        raise StopIteration

    def next(self):
        """Return the current element and step to the sibling before it."""
        child = self.curr()
        self.child = self.child.previousSibling
        return child

    # Python 3 looks up __next__ rather than next; without this alias the
    # class cannot drive a for loop there.
    __next__ = next
44

  
45
def last_elem(node):
    """Return the last element child of `node`.

    The StopIteration raised by NodeElemReverseIter when `node` has no
    element children is propagated to the caller.
    """
    elems = NodeElemReverseIter(node)
    return elems.next()
46

  
29 47
class NodeParentIter:
30 48
    def __init__(self, node): self.node = node
31 49
    
......
47 65

  
48 66
def value(node):
49 67
    if node.firstChild != None: return node.firstChild.nodeValue.strip()
50
    else: return None
68
    else: return node.nodeValue
69

  
70
def by_tag_name(node, name):
    """Return the last element child of `node` tagged `name`, or None.

    Children are visited through NodeElemReverseIter, i.e. from the last
    element child backwards, and the first hit is returned.
    """
    matches = (child for child in NodeElemReverseIter(node)
               if child.tagName == name)
    return next(matches, None)
scripts/data2xml/test
3 3
# Usage: [env n=<num-rows>] ./test 
4 4

  
5 5
selfDir="$(dirname -- "$0")"
6
cd "$selfDir"
6 7

  
7 8
test -n "$n" || n=2
8 9
let n++
9 10

  
10
cd "$selfDir"
11 11
head -$n NYSpecimenDataAmericas.csv|./data2xml VegX NYBG-VegBank-VegX_mapping.csv
scripts/data2xml/xml_util.py
1
# XML DOM tree manipulation
2

  
3
from xml.dom import Node
4

  
5
def name_of(node):
    """Return the node's tag name converted to lower case."""
    tag = node.tagName
    return tag.lower()
6

  
7
def get_id(node):
    """Return the value of the node's 'id' attribute."""
    id_ = node.getAttribute('id')
    return id_
8

  
9
def set_id(node, id_):
    """Set the node's 'id' attribute to `id_`."""
    node.setAttribute('id', id_)
10

  
11
class NodeElemIter:
    """Iterator over the element children of a DOM node, first to last.

    Non-element children (text, comments, ...) are skipped.
    """
    def __init__(self, node):
        self.child = node.firstChild

    def __iter__(self):
        return self

    def curr(self):
        """Return the element at the current position without consuming it.

        Advances past any non-element siblings first. Raises StopIteration
        when no element is left.
        """
        while self.child is not None:
            if self.child.nodeType == Node.ELEMENT_NODE:
                return self.child
            self.child = self.child.nextSibling
        raise StopIteration

    def next(self):
        """Return the current element and step to the sibling after it."""
        child = self.curr()
        self.child = self.child.nextSibling
        return child

    # Python 3 looks up __next__ rather than next; without this alias the
    # class cannot drive a for loop there.
    __next__ = next
26

  
27
def first_elem(node):
    """Return the first element child of `node`.

    The StopIteration raised by NodeElemIter when `node` has no element
    children is propagated to the caller.
    """
    elems = NodeElemIter(node)
    return elems.next()
28

  
29
class NodeElemReverseIter:
    """Iterator over the element children of a DOM node, last to first.

    Non-element children (text, comments, ...) are skipped.
    """
    def __init__(self, node):
        self.child = node.lastChild

    def __iter__(self):
        return self

    def curr(self):
        """Return the element at the current position without consuming it.

        Moves back past any non-element siblings first. Raises StopIteration
        when no element is left.
        """
        while self.child is not None:
            if self.child.nodeType == Node.ELEMENT_NODE:
                return self.child
            self.child = self.child.previousSibling
        raise StopIteration

    def next(self):
        """Return the current element and step to the sibling before it."""
        child = self.curr()
        self.child = self.child.previousSibling
        return child

    # Python 3 looks up __next__ rather than next; without this alias the
    # class cannot drive a for loop there.
    __next__ = next
44

  
45
def last_elem(node):
    """Return the last element child of `node`.

    The StopIteration raised by NodeElemReverseIter when `node` has no
    element children is propagated to the caller.
    """
    elems = NodeElemReverseIter(node)
    return elems.next()
46

  
47
class NodeParentIter:
    """Iterator over a node and its ancestors, innermost first.

    Iteration covers element nodes only: it ends as soon as the current node
    is None or is not an element.
    """
    def __init__(self, node):
        self.node = node

    def __iter__(self):
        return self

    def curr(self):
        """Return the current node without consuming it.

        Raises StopIteration when the current node is None or not an element.
        """
        if self.node is not None and self.node.nodeType == Node.ELEMENT_NODE:
            return self.node
        raise StopIteration

    def next(self):
        """Return the current node and step up to its parent."""
        node = self.curr()
        self.node = self.node.parentNode
        return node

    # Python 3 looks up __next__ rather than next; without this alias the
    # class cannot drive a for loop there.
    __next__ = next
61

  
62
def is_text(node):
    """Return True when `node` has no element children, False otherwise."""
    has_element = any(True for _child in NodeElemIter(node))
    return not has_element
65

  
66
def value(node):
    """Return the value of `node`.

    When the node has children, the nodeValue of its first child is returned
    with surrounding whitespace stripped. If that first child has no value of
    its own (its nodeValue is None, as for an element), None is returned
    rather than failing on None.strip().

    When the node has no children, its own nodeValue is returned unchanged.
    """
    first = node.firstChild
    if first is None:
        return node.nodeValue
    text = first.nodeValue
    if text is None:
        return None
    return text.strip()
69

  
70
def by_tag_name(node, name):
    """Return the last element child of `node` tagged `name`, or None.

    Children are visited through NodeElemReverseIter, i.e. from the last
    element child backwards, and the first hit is returned.
    """
    matches = (child for child in NodeElemReverseIter(node)
               if child.tagName == name)
    return next(matches, None)
scripts/data2xml/NYBG-VegBank-VegX_mapping.csv
1 1
"NYBG","VegBank","VegX"
2 2
"key",,
3 3
"DateLastModified",,
4
"InstitutionCode","/taxonInterpretation/museum_ID->party/OrganizationName/_name/firstName","/*s/taxonNameUsageConcept/partyWithRole/partyID->/parties/party/organizationName/_name/firstName"
5
"CollectionCode","/taxonInterpretation/museum_ID->party/OrganizationName/_name/lastName","/*s/taxonNameUsageConcept/partyWithRole/partyID->/parties/party/organizationName/_name/lastName"
4
"InstitutionCode","/taxonInterpretation/museum_ID->/party/OrganizationName/_name/firstName","/*s/taxonNameUsageConcept/partyWithRole/partyID->/parties/party/organizationName/_name/firstName"
5
"CollectionCode","/taxonInterpretation/museum_ID->/party/OrganizationName/_name/lastName","/*s/taxonNameUsageConcept/partyWithRole/partyID->/parties/party/organizationName/_name/lastName"
6 6
"CatalogNumber","/taxonInterpretation/museumAccessionNumber","/*s/taxonNameUsageConcept/voucher"
7 7
"ScientificName","/plantName/plantName","/*s/taxonName/Simple"
8 8
"BasisOfRecord",,
......
14 14
"Genus","/plantName[plantStatus/plantLevel=Genus]/plantName","/*s/taxonConcept[Rank/@code=gen]/Name"
15 15
"Species","/plantName[plantStatus/plantLevel=Species]/plantName","/*s/taxonConcept[Rank/@code=sp]/Name"
16 16
"Subspecies","/plantName[plantStatus/plantLevel=Subspecies]/plantName","/*s/taxonConcept[Rank/@code=ssp]/Name"
17
"ScientificNameAuthor","/plantConcept/reference_ID->referenceParty/{givenName,surname}","/*s/taxonConcept/AccordingTo/Simple"
18
"IdentifiedBy","/taxonInterpretation/PARTY_ID->party/{givenName,middleName,surName}","/*s/taxonDetermination/partyWithRole/partyID->/parties/party/individualName/{givenName,surName}"
17
"ScientificNameAuthor","/plantConcept/reference_ID->/referenceParty/{givenName,surname}","/*s/taxonConcept/AccordingTo/Simple"
18
"IdentifiedBy","/taxonInterpretation/PARTY_ID->/party/{givenName,middleName,surName}","/*s/taxonDetermination/partyWithRole[role=identifier]/partyID->/parties/party/individualName/{givenName,surName}"
19 19
"YearIdentified","/taxonInterpretation/interpretationDate/_date/year","/*s/taxonDetermination/date/_date/year"
20 20
"MonthIdentified","/taxonInterpretation/interpretationDate/_date/month","/*s/taxonDetermination/date/_date/month"
21 21
"DayIdentified","/taxonInterpretation/interpretationDate/_date/day","/*s/taxonDetermination/date/_date/day"
22 22
"TypeStatus",,
23 23
"CollectorNumber",,
24 24
"FieldNumber","/taxonInterpretation/collectionNumber","/*s/taxonNameUsageConcept/authorCode"
25
"Collector","/taxonInterpretation/collector_ID->party/{givenName,middleName,surName}","/*s/taxonDetermination/partyWithRole[role=collector]/partyID->/parties/party/individualName/{givenName,surName}"
25
"Collector","/taxonInterpretation/collector_ID->/party/{givenName,middleName,surName}","/*s/taxonDetermination/partyWithRole[role=collector]/partyID->/parties/party/individualName/{givenName,surName}"
26 26
"YearCollected","/taxonInterpretation/collectionDate/_date/year","/*s/plotObservation/obsStartDate/_date/year"
27 27
"MonthCollected","/taxonInterpretation/collectionDate/_date/month","/*s/plotObservation/obsStartDate/_date/month"
28 28
"DayCollected","/taxonInterpretation/collectionDate/_date/day","/*s/plotObservation/obsStartDate/_date/day"
scripts/data2xml/xpath.py
1
# XPath-based XML tree manipulation
2

  
3
from xml.dom import Node
4

  
5
from Parser import Parser
6
import xml_util
7

  
8
class XpathElem:
    """One step of a parsed XPath-like mapping path.

    Attributes:
    name    -- element or attribute name of the step
    value   -- value associated with the step, or None
    attrs   -- list of sub-paths (each a list of XpathElem) attached to the
               step
    is_attr -- whether the step names an attribute (repr prefixes it with '@')
    is_ptr  -- whether the step was followed by '->' (repr appends '->')
    """
    def __init__(self, name, value=None, attrs=None, is_attr=False,
        is_ptr=False):
        if attrs is None:
            attrs = []  # fresh list per instance, never a shared default
        self.name = name
        self.value = value
        self.attrs = attrs
        self.is_attr = is_attr
        self.is_ptr = is_ptr

    def __repr__(self):
        str_ = ''
        if self.is_attr:
            str_ += '@'
        str_ += self.name+repr(self.attrs)+'='+repr(self.value)
        if self.is_ptr:
            str_ += '->'
        return str_

    def __eq__(self, other):
        # Only another XpathElem can be equal. Anything else (None, str, ...)
        # is handed back to Python's default comparison instead of raising
        # AttributeError on a missing __dict__.
        if not isinstance(other, XpathElem):
            return NotImplemented
        return self.__dict__ == other.__dict__

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
26

  
27
class XpathParser(Parser):
    """Parser for the XPath-like mapping paths.

    All input is consumed through _match_str() and _match_re(), which come
    from the Parser base class (not visible here; presumably they return a
    false value when nothing matches and `required` is not set -- confirm
    against Parser).
    """

    def _main(self):
        # A path must start with '/'
        self._match_str('/', required=True)
        return self._path()

    def _path(self):
        # '/'-separated steps, each optionally prefixed with '@', optionally
        # followed by a '[...]' group and optionally by '->'
        steps = []
        while True:
            is_attr = self._match_str('@')
            step = XpathElem(name=self._fields(), is_attr=is_attr)
            if self._match_str('['):
                step.attrs = self._attrs()
                self._match_str(']', required=True)
            step.is_ptr = self._match_str('->')
            steps.append(step)
            if not self._match_str('/'):
                return steps

    def _fields(self):
        # Either a single field or a '{a,b,...}' group; for a group only the
        # first field is used for now.
        if not self._match_str('{'):
            return self._field()
        names = [self._field()]
        while self._match_str(','):
            names.append(self._field())
        self._match_str('}', required=True)
        return names[0]

    def _attrs(self):
        # Comma-separated 'path=value' entries; the value is stored on the
        # last step of each entry's path.
        entries = []
        more = True
        while more:
            sub_path = self._path()
            self._match_str('=', required=True)
            sub_path[-1].value = self._value()
            entries.append(sub_path)
            more = self._match_str(',')
        return entries

    def _field(self):
        return self._name()

    def _name(self):
        return self._match_re(r'[\w.]+', required=True)

    def _value(self):
        return self._match_re(r'[\w.|]+', required=True)
72

  
73
def set_id(path, id_, has_type_containers=True):
    """Attach an 'id' attribute step with value `id_` to one step of `path`.

    The step at index 1 receives it when `has_type_containers` is true,
    otherwise the step at index 0. `path` is modified in place.
    """
    id_level = 1 if has_type_containers else 0
    id_attr = XpathElem('id', id_, is_attr=True)
    path[id_level].attrs.append([id_attr])
77

  
78
def get(doc, path, create=False, parent=None):
    """Walk `path` through the DOM, optionally creating whatever is missing.

    doc    -- DOM document; used as the node factory and, when `parent` is
              not given, as the source of the starting element
    path   -- sequence of path steps; each step is read for `name`, `value`,
              `is_attr`, `is_ptr` and `attrs` (a list of sub-paths that must
              also resolve beneath the matched node)
    create -- when true, a step that does not match is created (together
              with its value and its `attrs` sub-paths) instead of aborting
    parent -- node to start from; defaults to doc.documentElement

    Returns the node reached by the last step, or None when a step did not
    match and `create` is false.
    """
    if parent is None:
        parent = doc.documentElement
    for elem in path:
        # Locate the candidate node for this step
        if elem.is_attr:
            node = parent.getAttributeNode(elem.name)
        elif elem.name == '.':
            node = parent
        else:
            node = xml_util.by_tag_name(parent, elem.name)

        # A step carrying a value only matches a node with that exact value
        if (node is not None and elem.value is not None
                and xml_util.value(node) != elem.value):
            node = None

        # Sub-paths are only checked against a real candidate. Evaluating
        # them with no candidate would silently search from the document
        # root and could never turn the miss into a match.
        if node is not None:
            for attr in elem.attrs:
                if get(doc, attr, parent=node) is None:
                    node = None
                    break

        if node is None:
            if not create:
                return None
            # Build the missing step: the node itself, then its value, then
            # everything its sub-paths require.
            if elem.is_attr:
                parent.setAttribute(elem.name, '')
                node = parent.getAttributeNode(elem.name)
            else:
                node = parent.appendChild(doc.createElement(elem.name))
            if elem.value is not None:
                if node.nodeType == Node.ELEMENT_NODE:
                    node.appendChild(doc.createTextNode(elem.value))
                else:
                    node.nodeValue = elem.value
            for attr in elem.attrs:
                get(doc, attr, create, node)
        if elem.is_ptr:
            pass  # placeholder: '->' steps are recognized but not acted on
        parent = node
    return parent
scripts/data2xml/data2xml
7 7
from copy import deepcopy
8 8
from xml.dom.minidom import getDOMImplementation
9 9

  
10
from XpathParser import XpathParser, XpathElem
11
from xml_dom import by_path
10
import xpath
12 11

  
13 12
def main():
14 13
    prog_name = sys.argv.pop(0)
......
34 33
            path = path.replace('<name>', name)
35 34
            path, repl_ct = re.subn(r'(?<=/)\*(?=s/(\w+))', r'\1', path)
36 35
            if repl_ct > 0: has_type_containers = True # *s used for type elem
37
            mappings[name] = XpathParser(path).parse()
36
            mappings[name] = xpath.XpathParser(path).parse()
38 37
    stream.close()
39
    if has_type_containers: id_level = 1
40
    else: id_level = 0  
41 38
    
42 39
    # Process dataset
43 40
    doc = getDOMImplementation().createDocument(None, dest, None)
......
51 48
            value = row[idx]
52 49
            if value != '' and name in mappings:
53 50
                path = deepcopy(mappings[name]) # don't modify main value!
54
                path[id_level].attrs.append([XpathElem('id', row_id,
55
                    is_attr=True)])
51
                xpath.set_id(path, row_id, has_type_containers)
56 52
                path[-1].value = value
57
                by_path(doc, path, True)
53
                xpath.get(doc, path, True)
58 54
        row_idx += 1
59 55
    doc.writexml(sys.stdout, addindent='    ', newl='\n')
60 56

  

Also available in: Unified diff