/ - Diff - BIEN 3 - NCEAS Projects

     # XML DOM tree manipulation
     from xml.dom import Node
     class MappingException(Exception):
         """Exception subclass that adds no behavior of its own."""
     def by_tag_name(parent, name):
         """Return the last element child of `parent` whose tagName equals `name`.

         Children are scanned from lastChild backwards via previousSibling;
         non-element nodes are skipped. Returns None if nothing matches.
         """
         candidate = parent.lastChild
         while candidate is not None:
             is_elem = candidate.nodeType == Node.ELEMENT_NODE
             if is_elem and candidate.tagName == name: return candidate
             candidate = candidate.previousSibling
         return None
     def value(node):
         """Return the nodeValue of `node`'s first child, or of `node` itself
         when it has no children."""
         first = node.firstChild
         if first is None: return node.nodeValue
         return first.nodeValue
     def by_path(doc, path, create=False, parent=None):
         """Follow `path` step by step from `parent` and return the node reached.

         doc: document used for the default start node (doc.documentElement)
             and for creating new elements and text nodes.
         path: sequence of elems, each read for .name, .value, .is_attr and
             .attrs (a list of sub-paths that must also be found under the step).
         create: if true, a step that is not found is created under the current
             parent; otherwise None is returned at the first missing step.
         parent: node to start from; any false value means doc.documentElement.

         Returns the node for the last step (the start node for an empty
         path), or None when a step is missing and create is false.
         """
         if not parent: parent = doc.documentElement
         for elem in path:
             # Look up this step under the current parent
             node = None
             if elem.is_attr: node = parent.getAttributeNode(elem.name)
             elif elem.name == '.': node = parent
             else: node = by_tag_name(parent, elem.name)
             # A found node whose value differs from a required value is a miss
             if node != None and elem.value != None and value(node) != elem.value:
                 node = None
             # Every attribute sub-path must also be found under the node
             # NOTE(review): if node is already None here, the recursive call
             # gets parent=None and so searches from doc.documentElement
             for attr in elem.attrs:
                 if by_path(doc, attr, parent=node) == None: node = None; break
             if node == None:
                 if not create: return None
                 # Create the missing attribute or element under parent
                 if elem.is_attr:
                     parent.setAttribute(elem.name, '')
                     node = parent.getAttributeNode(elem.name)
                 else: node = parent.appendChild(doc.createElement(elem.name))
                 # Store the required value: text child for elements, nodeValue
                 # for anything else
                 if elem.value != None:
                     if node.nodeType == Node.ELEMENT_NODE:
                         node.appendChild(doc.createTextNode(elem.value))
                     else: node.nodeValue = elem.value
                 # Apply the attribute sub-paths to the new node as well
                 for attr in elem.attrs: by_path(doc, attr, create, node)
             parent = node # descend for the next step
         return parent

     # A general recursive descent parser
     from Parser import Parser
     class XpathElem:
         """One step of a parsed path.

         name: step name
         value: value attached to the step, or None
         is_attr: whether the step was written with a leading '@'
         attrs: list of sub-paths attached to the step (new list if omitted)
         """
         def __init__(self, name, value=None, is_attr=False, attrs=None):
             if attrs is None: attrs = [] # fresh list per instance
             self.name = name
             self.value = value
             self.is_attr = is_attr
             self.attrs = attrs
         def __repr__(self):
             if self.is_attr: prefix = '@'
             else: prefix = ''
             return prefix+self.name+repr(self.attrs)+'='+repr(self.value)
         def __eq__(self, other):
             # Comparing with None, a str, etc. must not raise on .__dict__
             if not isinstance(other, XpathElem): return NotImplemented
             return self.__dict__ == other.__dict__
         def __ne__(self, other):
             # Python 2 does not derive != from ==, so define it explicitly
             result = self.__eq__(other)
             if result is NotImplemented: return result
             return not result
     class XpathParser(Parser):
         """Parses path strings into lists of XpathElem.

         Relies on _match_str/_match_re inherited from Parser (not visible
         here); their results are only tested for truth or passed through.
         """
         def _main(self):
             """Parse one or more '/'-prefixed paths separated by '->'."""
             last_path = None
             more = True
             while more:
                 self._match_str('/', required=True)
                 last_path = self._path() # just use last path for now
                 more = self._match_str('->')
             return last_path
         def _path(self):
             """Parse '/'-separated steps, each optionally with [attrs]."""
             steps = []
             more = True
             while more:
                 # '@' must be consumed before the name is read
                 is_attr = self._match_str('@')
                 name = self._fields()
                 step = XpathElem(name, is_attr=is_attr)
                 if self._match_str('['):
                     step.attrs = self._attrs()
                     self._match_str(']', required=True)
                 steps.append(step)
                 more = self._match_str('/')
             return steps
         def _fields(self):
             """Parse a single field or a '{a,b,...}' group of fields."""
             if not self._match_str('{'): return self._field()
             fields = [self._field()]
             while self._match_str(','): fields.append(self._field())
             self._match_str('}', required=True)
             return fields[0] # just use first field for now
         def _attrs(self):
             """Parse comma-separated 'path=value' entries; the value is
             stored on the last step of each path."""
             attr_paths = []
             more = True
             while more:
                 attr_path = self._path()
                 self._match_str('=', required=True)
                 attr_path[-1].value = self._value()
                 attr_paths.append(attr_path)
                 more = self._match_str(',')
             return attr_paths
         def _field(self):
             name = self._name()
             return name
         def _name(self):
             return self._match_re(r'[\w.]+', required=True)
         def _value(self):
             return self._match_re(r'[\w.|]+', required=True)

     def first_elem(node):
         """Return the first item produced by NodeElemIter(node); StopIteration
         from the iterator propagates to the caller."""
         elem_iter = NodeElemIter(node)
         return elem_iter.next()
     class NodeElemReverseIter:
         """Iterates over the element children of a node, last to first."""
         def __init__(self, node): self.child = node.lastChild
         def __iter__(self): return self
         def curr(self):
             """Return the current element child, skipping non-element nodes.
             Raises StopIteration once the children are exhausted."""
             while self.child is not None:
                 if self.child.nodeType == Node.ELEMENT_NODE: return self.child
                 self.child = self.child.previousSibling
             raise StopIteration
         def next(self):
             """Return the current element child and step to its predecessor."""
             child = self.curr()
             self.child = self.child.previousSibling
             return child
         __next__ = next # Python 3 spelling, so for-loops work there too
     def last_elem(node):
         """Return the first item produced by NodeElemReverseIter(node);
         StopIteration from the iterator propagates to the caller."""
         elem_iter = NodeElemReverseIter(node)
         return elem_iter.next()
     class NodeParentIter:
         def __init__(self, node): self.node = node
-...
     def value(node):
         """Return the stripped nodeValue of `node`'s first child.

         If `node` has no children its own nodeValue is returned unchanged.
         If the first child has no nodeValue, None is returned instead of
         failing on .strip().
         """
         first = node.firstChild
         if first is None: return node.nodeValue
         text = first.nodeValue
         if text is None: return None # nothing to strip
         return text.strip()
     def by_tag_name(node, name):
         """Return the first item from NodeElemReverseIter(node) whose tagName
         equals `name`, or None if there is none."""
         found = None
         for elem in NodeElemReverseIter(node):
             if elem.tagName == name:
                 found = elem
                 break
         return found

     # Usage: [env n=<num-rows>] ./test
     # Pipes the first n+1 lines of NYSpecimenDataAmericas.csv (n defaults to 2)
     # into ./data2xml with the VegX mapping. The extra line is presumably the
     # CSV header row -- TODO confirm.
     selfDir="$(dirname -- "$0")"
     # cd exactly once: a second cd into a relative $selfDir would fail
     cd "$selfDir" || exit
     test -n "$n" || n=2
     n=$((n+1))
     head -n "$n" NYSpecimenDataAmericas.csv|./data2xml VegX NYBG-VegBank-VegX_mapping.csv

     # XML DOM tree manipulation
     from xml.dom import Node
     def name_of(node):
         """Return `node`'s tagName converted to lower case."""
         tag = node.tagName
         return tag.lower()
     def get_id(node):
         """Return the result of node.getAttribute('id')."""
         node_id = node.getAttribute('id')
         return node_id
     def set_id(node, id_):
         """Set `node`'s 'id' attribute to `id_`; returns nothing."""
         node.setAttribute('id', id_)
     class NodeElemIter:
         """Iterates over the element children of a node, first to last."""
         def __init__(self, node): self.child = node.firstChild
         def __iter__(self): return self
         def curr(self):
             """Return the current element child, skipping non-element nodes.
             Raises StopIteration once the children are exhausted."""
             while self.child is not None:
                 if self.child.nodeType == Node.ELEMENT_NODE: return self.child
                 self.child = self.child.nextSibling
             raise StopIteration
         def next(self):
             """Return the current element child and step to its successor."""
             child = self.curr()
             self.child = self.child.nextSibling
             return child
         __next__ = next # Python 3 spelling, so for-loops work there too
     def first_elem(node):
         """Return the first element child of `node` as produced by
         NodeElemIter; StopIteration propagates if there is none."""
         elem_iter = NodeElemIter(node)
         return elem_iter.next()
     class NodeElemReverseIter:
         """Iterates over the element children of a node, last to first."""
         def __init__(self, node): self.child = node.lastChild
         def __iter__(self): return self
         def curr(self):
             """Return the current element child, skipping non-element nodes.
             Raises StopIteration once the children are exhausted."""
             while self.child is not None:
                 if self.child.nodeType == Node.ELEMENT_NODE: return self.child
                 self.child = self.child.previousSibling
             raise StopIteration
         def next(self):
             """Return the current element child and step to its predecessor."""
             child = self.curr()
             self.child = self.child.previousSibling
             return child
         __next__ = next # Python 3 spelling, so for-loops work there too
     def last_elem(node):
         """Return the last element child of `node` as produced by
         NodeElemReverseIter; StopIteration propagates if there is none."""
         elem_iter = NodeElemReverseIter(node)
         return elem_iter.next()
     class NodeParentIter:
         """Iterates from a node up through its parentNode chain for as long as
         the current node is an element."""
         def __init__(self, node): self.node = node
         def __iter__(self): return self
         def curr(self):
             """Return the current node if it is an element.
             Raises StopIteration for None or any non-element node."""
             if self.node is not None and self.node.nodeType == Node.ELEMENT_NODE:
                 return self.node
             raise StopIteration
         def next(self):
             """Return the current element and step to its parentNode."""
             node = self.curr()
             self.node = self.node.parentNode
             return node
         __next__ = next # Python 3 spelling, so for-loops work there too
     def is_text(node):
         """Return True when NodeElemIter(node) yields nothing, i.e. `node` has
         no element children; False otherwise."""
         has_elem = False
         for _ in NodeElemIter(node):
             has_elem = True # has an element node
             break
         return not has_elem
     def value(node):
         """Return the stripped nodeValue of `node`'s first child.

         If `node` has no children its own nodeValue is returned unchanged.
         If the first child has no nodeValue, None is returned instead of
         failing on .strip().
         """
         first = node.firstChild
         if first is None: return node.nodeValue
         text = first.nodeValue
         if text is None: return None # nothing to strip
         return text.strip()
     def by_tag_name(node, name):
         """Return the first item from NodeElemReverseIter(node) whose tagName
         equals `name` (the last matching element child), or None."""
         found = None
         for elem in NodeElemReverseIter(node):
             if elem.tagName == name:
                 found = elem
                 break
         return found

     "NYBG","VegBank","VegX"
     "key",,
     "DateLastModified",,
     "InstitutionCode","/taxonInterpretation/museum_ID->party/OrganizationName/_name/firstName","/*s/taxonNameUsageConcept/partyWithRole/partyID->/parties/party/organizationName/_name/firstName"
     "CollectionCode","/taxonInterpretation/museum_ID->party/OrganizationName/_name/lastName","/*s/taxonNameUsageConcept/partyWithRole/partyID->/parties/party/organizationName/_name/lastName"
     "InstitutionCode","/taxonInterpretation/museum_ID->/party/OrganizationName/_name/firstName","/*s/taxonNameUsageConcept/partyWithRole/partyID->/parties/party/organizationName/_name/firstName"
     "CollectionCode","/taxonInterpretation/museum_ID->/party/OrganizationName/_name/lastName","/*s/taxonNameUsageConcept/partyWithRole/partyID->/parties/party/organizationName/_name/lastName"
     "CatalogNumber","/taxonInterpretation/museumAccessionNumber","/*s/taxonNameUsageConcept/voucher"
     "ScientificName","/plantName/plantName","/*s/taxonName/Simple"
     "BasisOfRecord",,
-...
     "Genus","/plantName[plantStatus/plantLevel=Genus]/plantName","/*s/taxonConcept[Rank/@code=gen]/Name"
     "Species","/plantName[plantStatus/plantLevel=Species]/plantName","/*s/taxonConcept[Rank/@code=sp]/Name"
     "Subspecies","/plantName[plantStatus/plantLevel=Subspecies]/plantName","/*s/taxonConcept[Rank/@code=ssp]/Name"
     "ScientificNameAuthor","/plantConcept/reference_ID->referenceParty/{givenName,surname}","/*s/taxonConcept/AccordingTo/Simple"
     "IdentifiedBy","/taxonInterpretation/PARTY_ID->party/{givenName,middleName,surName}","/*s/taxonDetermination/partyWithRole/partyID->/parties/party/individualName/{givenName,surName}"
     "ScientificNameAuthor","/plantConcept/reference_ID->/referenceParty/{givenName,surname}","/*s/taxonConcept/AccordingTo/Simple"
     "IdentifiedBy","/taxonInterpretation/PARTY_ID->/party/{givenName,middleName,surName}","/*s/taxonDetermination/partyWithRole[role=identifier]/partyID->/parties/party/individualName/{givenName,surName}"
     "YearIdentified","/taxonInterpretation/interpretationDate/_date/year","/*s/taxonDetermination/date/_date/year"
     "MonthIdentified","/taxonInterpretation/interpretationDate/_date/month","/*s/taxonDetermination/date/_date/month"
     "DayIdentified","/taxonInterpretation/interpretationDate/_date/day","/*s/taxonDetermination/date/_date/day"
     "TypeStatus",,
     "CollectorNumber",,
     "FieldNumber","/taxonInterpretation/collectionNumber","/*s/taxonNameUsageConcept/authorCode"
     "Collector","/taxonInterpretation/collector_ID->party/{givenName,middleName,surName}","/*s/taxonDetermination/partyWithRole[role=collector]/partyID->/parties/party/individualName/{givenName,surName}"
     "Collector","/taxonInterpretation/collector_ID->/party/{givenName,middleName,surName}","/*s/taxonDetermination/partyWithRole[role=collector]/partyID->/parties/party/individualName/{givenName,surName}"
     "YearCollected","/taxonInterpretation/collectionDate/_date/year","/*s/plotObservation/obsStartDate/_date/year"
     "MonthCollected","/taxonInterpretation/collectionDate/_date/month","/*s/plotObservation/obsStartDate/_date/month"
     "DayCollected","/taxonInterpretation/collectionDate/_date/day","/*s/plotObservation/obsStartDate/_date/day"

     # XPath-based XML tree manipulation
     from xml.dom import Node
     from Parser import Parser
     import xml_util
     class XpathElem:
         """One step of a parsed path.

         name: step name
         value: value attached to the step, or None
         attrs: list of sub-paths attached to the step (new list if omitted)
         is_attr: whether the step was written with a leading '@'
         is_ptr: whether the step was followed by '->'
         """
         def __init__(self, name, value=None, attrs=None, is_attr=False,
             is_ptr=False):
             if attrs is None: attrs = [] # fresh list per instance
             self.name = name
             self.value = value
             self.attrs = attrs
             self.is_attr = is_attr
             self.is_ptr = is_ptr
         def __repr__(self):
             str_ = ''
             if self.is_attr: str_ += '@'
             str_ += self.name+repr(self.attrs)+'='+repr(self.value)
             if self.is_ptr: str_ += '->'
             return str_
         def __eq__(self, other):
             # Comparing with None, a str, etc. must not raise on .__dict__
             if not isinstance(other, XpathElem): return NotImplemented
             return self.__dict__ == other.__dict__
         def __ne__(self, other):
             # Python 2 does not derive != from ==, so define it explicitly
             result = self.__eq__(other)
             if result is NotImplemented: return result
             return not result
     class XpathParser(Parser):
         """Parses a path string into a list of XpathElem.

         Relies on _match_str/_match_re inherited from Parser (not visible
         here); their results are only tested for truth or stored as-is.
         """
         def _main(self):
             """Parse a single path that must start with '/'."""
             self._match_str('/', required=True)
             path = self._path()
             return path
         def _path(self):
             """Parse '/'-separated steps, each optionally with [attrs] and a
             trailing '->' marker."""
             steps = []
             more = True
             while more:
                 # '@' must be consumed before the name is read
                 is_attr = self._match_str('@')
                 name = self._fields()
                 step = XpathElem(name, is_attr=is_attr)
                 if self._match_str('['):
                     step.attrs = self._attrs()
                     self._match_str(']', required=True)
                 step.is_ptr = self._match_str('->')
                 steps.append(step)
                 more = self._match_str('/')
             return steps
         def _fields(self):
             """Parse a single field or a '{a,b,...}' group of fields."""
             if not self._match_str('{'): return self._field()
             fields = [self._field()]
             while self._match_str(','): fields.append(self._field())
             self._match_str('}', required=True)
             return fields[0] # just use first field for now
         def _attrs(self):
             """Parse comma-separated 'path=value' entries; the value is
             stored on the last step of each path."""
             attr_paths = []
             more = True
             while more:
                 attr_path = self._path()
                 self._match_str('=', required=True)
                 attr_path[-1].value = self._value()
                 attr_paths.append(attr_path)
                 more = self._match_str(',')
             return attr_paths
         def _field(self):
             name = self._name()
             return name
         def _name(self):
             return self._match_re(r'[\w.]+', required=True)
         def _value(self):
             return self._match_re(r'[\w.|]+', required=True)
     def set_id(path, id_, has_type_containers=True):
         """Append an 'id' attribute sub-path with value `id_` to one step of
         `path`: step 1 when has_type_containers is true, otherwise step 0.
         `path` is modified in place."""
         id_level = 1 if has_type_containers else 0
         id_path = [XpathElem('id', id_, is_attr=True)]
         path[id_level].attrs.append(id_path)
     def get(doc, path, create=False, parent=None):
         """Follow `path` step by step from `parent` and return the node reached.

         doc: document used for the default start node (doc.documentElement)
             and for creating new elements and text nodes.
         path: sequence of elems, each read for .name, .value, .is_attr,
             .is_ptr and .attrs (a list of sub-paths that must also be found
             under the step).
         create: if true, a step that is not found is created under the current
             parent; otherwise None is returned at the first missing step.
         parent: node to start from; None means doc.documentElement.

         Returns the node for the last step (the start node for an empty
         path), or None when a step is missing and create is false.
         """
         if parent == None: parent = doc.documentElement
         for elem in path:
             # Look up this step under the current parent
             node = None
             if elem.is_attr: node = parent.getAttributeNode(elem.name)
             elif elem.name == '.': node = parent
             else: node = xml_util.by_tag_name(parent, elem.name)
             # A found node whose value differs from a required value is a miss
             if node != None and elem.value != None\
             and xml_util.value(node) != elem.value: node = None
             # Every attribute sub-path must also be found under the node
             # NOTE(review): if node is already None here, the recursive call
             # gets parent=None and so searches from doc.documentElement
             for attr in elem.attrs:
                 if get(doc, attr, parent=node) == None: node = None; break
             if node == None:
                 if not create: return None
                 # Create the missing attribute or element under parent
                 if elem.is_attr:
                     parent.setAttribute(elem.name, '')
                     node = parent.getAttributeNode(elem.name)
                 else: node = parent.appendChild(doc.createElement(elem.name))
                 # Store the required value: text child for elements, nodeValue
                 # for anything else
                 if elem.value != None:
                     if node.nodeType == Node.ELEMENT_NODE:
                         node.appendChild(doc.createTextNode(elem.value))
                     else: node.nodeValue = elem.value
                 # Apply the attribute sub-paths to the new node as well
                 for attr in elem.attrs: get(doc, attr, create, node)
             if elem.is_ptr:
                 # Placeholder: '->' steps currently get no special handling
                 pass
             parent = node # descend for the next step
         return parent

     from copy import deepcopy
     from xml.dom.minidom import getDOMImplementation
     from XpathParser import XpathParser, XpathElem
     from xml_dom import by_path
     import xpath
     def main():
         prog_name = sys.argv.pop(0)
-...
                 path = path.replace('<name>', name)
                 path, repl_ct = re.subn(r'(?<=/)\*(?=s/(\w+))', r'\1', path)
                 if repl_ct > 0: has_type_containers = True # *s used for type elem
                 mappings[name] = XpathParser(path).parse()
                 mappings[name] = xpath.XpathParser(path).parse()
         stream.close()
         if has_type_containers: id_level = 1
         else: id_level = 0
         # Process dataset
         doc = getDOMImplementation().createDocument(None, dest, None)
-...
                 value = row[idx]
                 if value != '' and name in mappings:
                     path = deepcopy(mappings[name]) # don't modify main value!
                     path[id_level].attrs.append([XpathElem('id', row_id,
                         is_attr=True)])
                     xpath.set_id(path, row_id, has_type_containers)
                     path[-1].value = value
                     by_path(doc, path, True)
                     xpath.get(doc, path, True)
             row_idx += 1
         doc.writexml(sys.stdout, addindent='    ', newl='\n')

Project

General

Profile

Revision 21

Added by Aaron Marcuse-Kubitza over 13 years ago