/ - Diff - BIEN 3 - NCEAS Projects

     # XML-database conversion
     import re
     from xml.dom import Node
     import db_util
     import xml_util
     def name_of(node):
         # The node's xml_util name with everything up to and including the last
         # "." removed
         full_name = xml_util.name_of(node)
         return re.sub(r'^.*\.', r'', full_name)
     ptr_suffix = '_id'
     def is_ptr(node_name):
         # A name denotes a pointer iff it ends with ptr_suffix
         has_suffix = node_name.endswith(ptr_suffix)
         return has_suffix
     def ptr_type(node_name):
         # The pointer name with ptr_suffix stripped off
         assert is_ptr(node_name)
         suffix_len = len(ptr_suffix)
         return node_name[:-suffix_len]
     def ptr_target(node):
         # The target of a pointer node is its first child element
         node_name = name_of(node)
         assert is_ptr(node_name)
         return xml_util.first_elem(node)
     def find_by_name(node, name):
         # Walks from node up through its ancestors. At each level, returns the
         # ancestor itself if it is called name, otherwise the first of its child
         # elements that is called name or is a pointer of type name (in which
         # case the pointer's target is returned). Returns None if nothing matches.
         for ancestor in xml_util.NodeParentIter(node):
             if name_of(ancestor) == name: return ancestor
             for elem in xml_util.NodeElemIter(ancestor):
                 elem_name = name_of(elem)
                 if not is_ptr(elem_name):
                     if elem_name == name: return elem
                 elif ptr_type(elem_name) == name: return ptr_target(elem)
         return None
     def get(db, node, create=False, store_ids=False, row_ct_ref=None, pkeys=None):
         # Returns the ID of the DB row corresponding to the XML element node.
         # The element's name (see name_of()) is used as the table name, its text
         # children as fields, its pointer children as references to other rows
         # (fetched recursively first), and its remaining children as rows that
         # reference this one (fetched recursively afterwards).
         # create and row_ct_ref are passed through to db_util.get()
         # store_ids enables searching the tree for missing fields
         # pkeys optionally supplies the dict used to cache pkey column names
         if pkeys == None: pkeys = {}
         def pkey(table):
             # Asks the DB for each table's pkey column only once
             if table not in pkeys: pkeys[table] = db_util.pkey(db, table)
             return pkeys[table]
         def obj(node, parent_id=None):
             # Gets the row ID for one element. parent_id, if given, is the row ID
             # obtained for node's parent element.
             table = name_of(node)
             pkey_ = pkey(table)
             row = {}
             children = []
             # Divide children into fields and children with fkeys to parent
             for child in xml_util.NodeElemIter(node):
                 child_name = name_of(child)
                 if xml_util.is_text(child): row[child_name] = xml_util.value(child)
                 elif is_ptr(child_name): row[child_name] = obj(ptr_target(child))
                 else: children.append(child)
             # Any pkey value given in the XML is not used as a field
             try: del row[pkey_]
             except KeyError: pass
             # Add fkey to parent (stored under the parent table's pkey column name)
             if parent_id != None: row[pkey(name_of(node.parentNode))] = parent_id
             # Insert node
             for try_num in range(2):
                 try:
                     id_ = db_util.get(db, table, row, pkey_, create, row_ct_ref)
                     # Remember the ID on the node so later lookups can find it
                     if store_ids: xml_util.set_id(node, id_)
                     break
                 except db_util.NullValueException, ex:
                     if try_num > 0: raise # exception still raised after retry
                     # Search for required column in ancestors and their children
                     target = find_by_name(node, ptr_type(ex.col))
                     if target == None: raise
                     # Fill in the missing column with the ID stored on the node
                     # that was found, then retry once
                     row[ex.col] = xml_util.get_id(target)
             # Insert children with fkeys to parent
             for child in children: obj(child, id_)
             return id_
         return obj(node)
     def xml2db(db, node, row_ct_ref=None):
         # Puts every non-text child element of node into the DB via get(), with
         # row creation and ID storing turned on
         for elem in xml_util.NodeElemIter(node):
             if xml_util.is_text(elem): continue # XML metadata
             get(db, elem, create=True, store_ids=True, row_ct_ref=row_ct_ref)

     # Database access
     import random
     import re
     import sys
     import ex_util
     def _add_cursor_info(ex, cur):
         # Appends the cursor's last query to the exception's message.
         # str() keeps this from raising (and thereby masking ex) when cur.query
         # is not a str, e.g. None because no query was sent before the failure.
         ex_util.add_msg(ex, 'query: '+str(cur.query))
     class NameException(Exception):
         # Raised by check_name() for a name containing disallowed characters
         pass
     class DbException(ex_util.ExceptionWithCause):
         # An exception with an optional cause whose message also includes the
         # query of cur, when a cursor is given
         def __init__(self, msg, cause=None, cur=None):
             ex_util.ExceptionWithCause.__init__(self, msg, cause)
             if cur != None:
                 _add_cursor_info(self, cur)
     class ExceptionWithColumn(DbException):
         # A DB error about one column, whose name is kept in self.col
         def __init__(self, col, cause=None):
             msg = 'column: '+col
             DbException.__init__(self, msg, cause)
             self.col = col
     class DuplicateKeyException(ExceptionWithColumn):
         # Raised by try_insert() for a unique constraint violation
         pass
     class NullValueException(ExceptionWithColumn):
         # Raised by try_insert() for a not-null constraint violation
         pass
     def check_name(name):
         # Accepts only names made of word characters (\w); called on names
         # before they are concatenated into query strings
         if re.search(r'\W', name) == None: return
         raise NameException('Name "'+name
             +'" may contain only alphanumeric characters and _')
     def run_query(db, query, params=None):
         cur = db.cursor()
         try: cur.execute(query, params)
         except Exception, ex:
             _add_cursor_info(ex, cur)
             raise
         return cur
     def col(cur, idx):
         # First item of the idx-th entry of cur.description (under the DB-API,
         # that item is the column's name)
         col_desc = cur.description[idx]
         return col_desc[0]
     def row(cur):
         # Next row of cur; raises StopIteration when fetchone() returns None
         rows = iter(cur.fetchone, None)
         return rows.next()
     def value(cur):
         # First item of the next row of cur
         next_row = row(cur)
         return next_row[0]
     def with_savepoint(db, func):
         # Runs func() inside a savepoint and returns its result. If func() raises,
         # the savepoint is rolled back and the exception re-raised; otherwise the
         # savepoint is released.
         sp_num = random.randint(0, sys.maxint)
         sp_name = 'savepoint_'+str(sp_num) # must be unique
         run_query(db, 'SAVEPOINT '+sp_name)
         try: result = func()
         except:
             # Undo whatever func() did, then let the caller see the error
             run_query(db, 'ROLLBACK TO SAVEPOINT '+sp_name)
             raise
         run_query(db, 'RELEASE SAVEPOINT '+sp_name)
         return result
     def select(db, table, fields, conds):
         # Runs SELECT <fields> FROM <table> [WHERE <conds>] and returns the cursor.
         # conds maps column name -> required value; the conditions are ANDed, and
         # a None value is compared with IS instead of =.
         # Raises NameException (via check_name()) for an invalid identifier.
         check_name(table)
         # Explicit loops, since map() called only for its side effects does
         # nothing where map() is lazy
         for field in fields: check_name(field)
         for cond_col in conds: check_name(cond_col)
         def cond(entry):
             col_name, val = entry
             if val == None: op = 'IS'
             else: op = '='
             return col_name+' '+op+' %s'
         query = 'SELECT '+', '.join(fields)+' FROM '+table
         # With no conds, omit WHERE entirely: a dangling WHERE is a syntax error
         if conds: query += ' WHERE '+' AND '.join(map(cond, conds.items()))
         return run_query(db, query, list(conds.values()))
     def insert(db, table, row):
         # Inserts row (a dict of column name -> value) into table and returns the
         # cursor. Raises NameException (via check_name()) for an invalid
         # identifier.
         check_name(table)
         cols = list(row.keys())
         # Explicit loop, since map() called only for its side effects does
         # nothing where map() is lazy
         for col_name in cols: check_name(col_name)
         if not cols:
             # "() VALUES ()" is not valid SQL; DEFAULT VALUES is the standard way
             # to insert a row without giving any columns (PostgreSQL accepts it)
             return run_query(db, 'INSERT INTO '+table+' DEFAULT VALUES')
         return run_query(db, 'INSERT INTO '+table+' ('+', '.join(cols)
             +') VALUES ('+', '.join(['%s']*len(cols))+')', list(row.values()))
     def last_insert_id(db):
         # The single value returned by SELECT lastval()
         cur = run_query(db, 'SELECT lastval()')
         return value(cur)
     def try_insert(db, table, row):
         try: return with_savepoint(db, lambda: insert(db, table, row))
         except Exception, ex:
             msg = str(ex)
             match = re.search(r'duplicate key value violates unique constraint "'
                 +table+'_(\w+)_index"', msg)
             if match: raise DuplicateKeyException(match.group(1), ex)
             match = re.search(r'null value in column "(\w+)" violates not-null '
                 'constraint', msg)
             if match: raise NullValueException(match.group(1), ex)
             raise # no specific exception raised
     def pkey(db, table): # Assumed to be first column in table
         check_name(table)
         # LIMIT 0: only the cursor's column description is used
         cur = run_query(db, 'SELECT * FROM '+table+' LIMIT 0')
         return col(cur, 0)
     def get(db, table, row, pkey, create=False, row_ct_ref=None):
         try: return value(select(db, table, [pkey], row))
         except StopIteration:
             if not create: raise
             # Insert new row
             try:
                 row_ct = try_insert(db, table, row).rowcount
                 if row_ct_ref != None and row_ct >= 0: row_ct_ref[0] += row_ct
                 return last_insert_id(db)
             except DuplicateKeyException, ex:
                 return value(select(db, table, [pkey], {ex.col: row[ex.col]}))

     # Exception handling
     def add_msg(ex, msg):
         # Replaces the exception's args with a single message: the old message
         # (trailing whitespace removed) followed by msg on a new line
         old_msg = str(ex).rstrip()
         new_msg = old_msg+'\n'+msg
         ex.args = (new_msg,)
     class ExceptionWithCause(Exception):
         # An exception whose message also names the exception that caused it,
         # when one is given
         def __init__(self, msg, cause=None):
             Exception.__init__(self, msg)
             if cause != None:
                 cause_msg = 'cause: '+str(cause)
                 add_msg(self, cause_msg)

     # XML DOM tree manipulation
     from xml.dom import Node
     import xml.dom.minidom
     def name_of(node):
         # The element's tag name in lowercase
         tag = node.tagName
         return tag.lower()
     def get_id(node):
         # Value of the element's "id" attribute
         return node.getAttribute('id')
     def set_id(node, id_):
         # Sets the element's "id" attribute
         node.setAttribute('id', id_)
     class NodeElemIter:
         # Iterates over the child elements of a node in document order, skipping
         # children of any other node type
         def __init__(self, node): self.child = node.firstChild
         def __iter__(self): return self
         def curr(self):
             # Moves to the next element at or after the current position without
             # stepping past it; raises StopIteration when there is none
             candidate = self.child
             while candidate != None and candidate.nodeType != Node.ELEMENT_NODE:
                 candidate = candidate.nextSibling
             self.child = candidate
             if candidate == None: raise StopIteration
             return candidate
         def next(self):
             elem = self.curr()
             self.child = elem.nextSibling
             return elem
     def first_elem(node):
         # First child element of node; raises StopIteration if it has none
         elems = NodeElemIter(node)
         return elems.next()
     class NodeElemReverseIter:
         # Iterates over the child elements of a node from last to first, skipping
         # children of any other node type
         def __init__(self, node): self.child = node.lastChild
         def __iter__(self): return self
         def curr(self):
             # Moves to the nearest element at or before the current position
             # without stepping past it; raises StopIteration when there is none
             candidate = self.child
             while candidate != None and candidate.nodeType != Node.ELEMENT_NODE:
                 candidate = candidate.previousSibling
             self.child = candidate
             if candidate == None: raise StopIteration
             return candidate
         def next(self):
             elem = self.curr()
             self.child = elem.previousSibling
             return elem
     def last_elem(node):
         # Last child element of node; raises StopIteration if it has none
         elems = NodeElemReverseIter(node)
         return elems.next()
     class NodeParentIter:
         # Iterates over a node and then its ancestors, stopping at the first one
         # that is not an element (or at None)
         def __init__(self, node): self.node = node
         def __iter__(self): return self
         def curr(self):
             node = self.node
             if node == None or node.nodeType != Node.ELEMENT_NODE:
                 raise StopIteration
             return node
         def next(self):
             node = self.curr()
             self.node = node.parentNode
             return node
     def is_text(node):
         # A node counts as text when it has no child elements
         try: NodeElemIter(node).curr()
         except StopIteration: return True
         return False # has an element node
     def value(node):
         # The nodeValue of the node's first child, or of the node itself when it
         # has no children
         first = node.firstChild
         if first == None: return node.nodeValue
         return first.nodeValue
     def set_value(doc, node, value):
         # For an element, appends value as a new text node; for any other node
         # type, assigns value to its nodeValue
         if node.nodeType != Node.ELEMENT_NODE:
             node.nodeValue = value
             return
         text = doc.createTextNode(value)
         node.appendChild(text)
     def by_tag_name(node, name, last_only=False):
         # Returns the child elements of node whose tagName is name, last child
         # first
         # last_only optimization returns last (most recently inserted) matching node
         matches = []
         for elem in NodeElemReverseIter(node):
             if elem.tagName != name: continue
             matches.append(elem)
             if last_only: break
         return matches
     _writexml_orig = xml.dom.minidom.Element.writexml
     def _writexml(self, writer, indent="", addindent="", newl=""):
         # Replacement for Element.writexml() that writes an element whose only
         # child is a text node on a single line. All other elements are handed
         # to the original implementation.
         from xml.sax.saxutils import escape, quoteattr
         if self.firstChild != None and self.firstChild.nextSibling == None\
         and self.firstChild.nodeType == Node.TEXT_NODE: # a single text node
             writer.write(indent+'<'+self.tagName)
             for attr_idx in range(self.attributes.length):
                 attr = self.attributes.item(attr_idx)
                 # Attribute values must be quoted and escaped to be valid XML
                 writer.write(' '+attr.name+'='+quoteattr(attr.value))
             # Escape &, < and > so the text cannot be read back as markup
             text = escape(self.firstChild.nodeValue)
             writer.write('>'+text+'</'+self.tagName+'>'+newl)
         else: _writexml_orig(self, writer, indent, addindent, newl)
     xml.dom.minidom.Element.writexml = _writexml

scripts/data2xml/util.py
1		# Useful functions and classes
2
3		class Obj:
4		def __init__(self, **attrs): self.__dict__ = attrs
5
6		def __repr__(self): return repr(self.__dict__)
7	0

     # XPath-based XML tree manipulation
     from copy import deepcopy
     from xml.dom import Node
     from Parser import Parser
     import xml_util
     class XpathElem:
         # One element of a parsed XPath.
         # name: element or attribute name
         # value: required/assigned value, or None
         # attrs: list of paths (lists of XpathElem) that must hold below this elem
         # is_attr: whether this elem names an attribute (written @name)
         # is_ptr: whether this elem is followed by -> (a pointer)
         def __init__(self, name, value=None, attrs=None, is_attr=False,
             is_ptr=False):
             if attrs == None: attrs = [] # a fresh list for every instance
             self.name = name
             self.value = value
             self.attrs = attrs
             self.is_attr = is_attr
             self.is_ptr = is_ptr
         def __repr__(self):
             str_ = ''
             if self.is_attr: str_ += '@'
             str_ += self.name
             if self.attrs != []: str_ += repr(self.attrs)
             if self.value != None: str_ += '='+repr(self.value)
             if self.is_ptr: str_ += '->'
             return str_
         def __eq__(self, other):
             # Objects without a __dict__ (e.g. None) are simply unequal instead of
             # causing an AttributeError
             try: other_attrs = other.__dict__
             except AttributeError: return NotImplemented
             return self.__dict__ == other_attrs
         def __ne__(self, other):
             # Python 2 does not derive != from ==, so keep the two consistent
             result = self.__eq__(other)
             if result is NotImplemented: return result
             return not result
     def value(path):
         # A path's value is the value of its last element
         last_elem = path[-1]
         return last_elem.value
     def set_value(path, value):
         # Stores value on the path's last element
         last_elem = path[-1]
         last_elem.value = value
     def backward_id(elem):
         # Returns elem's first attr path if that path has no value, else None
         if len(elem.attrs) < 1: return None
         first_attr = elem.attrs[0]
         if value(first_attr) == None: return first_attr
         return None
     class XpathParser(Parser):
         # Parses an XPath-like string into a list of XpathElem
         def _main(self):
             self._match_str('/') # optional leading /
             return self._path()
         def _path(self):
             # Parses /-separated elements. Each element is [@]name, optionally
             # followed by [attr path[=value],...], by -> (pointer) and by a
             # (/lookahead path). A {path,...} split ends the path.
             tree = []
             while True:
                 # Split path
                 if self._match_str('{'):
                     paths = []
                     while True:
                         paths.append(tree + self._path())
                         if not self._match_str(','): break
                     self._match_str('}', required=True)
                     tree = paths[0] # just use first subpath for now
                     break # nothing allowed after split path
                 elem = XpathElem(is_attr=self._match_str('@'),
                     name=self._match_re(r'[\w.*]+', required=True))
                 # Attrs
                 if self._match_str('['):
                     elem.attrs = []
                     while True:
                         path = self._path()
                         if self._match_str('='):
                             set_value(path, self._match_re(r'[\w.|]*'))
                         elem.attrs.append(path)
                         if not self._match_str(','): break
                     self._match_str(']', required=True)
                 elem.is_ptr = self._match_str('->')
                 tree.append(elem)
                 # Lookahead assertion
                 if self._match_str('('):
                     self._match_str('/', required=True) # next / is inside ()
                     path = self._path()
                     self._match_str(')', required=True)
                     # The lookahead is both a condition on elem and part of the path
                     elem.attrs.append(path)
                     tree += path
                 if not self._match_str('/'): break
             # Expand * abbrs: a * in a name stands for the name of another elem
             for elem_idx, elem in enumerate(tree):
                 id_ = backward_id(elem)
                 if id_ != None: elem = id_[0]; offset = -2
                 elif elem.is_ptr: offset = 2
                 else: offset = 1
                 before, abbr, after = elem.name.partition('*')
                 if abbr != '':
                     src_idx = elem_idx+offset
                     # Bounds are checked explicitly because a negative index would
                     # silently wrap around to the end of tree rather than mean
                     # "no replacement elem"
                     if 0 <= src_idx < len(tree):
                         elem.name = before+tree[src_idx].name+after
             return tree
     instance_level = 1
     def obj(path):
         # Deep copy of the path up to and including the instance_level element,
         # with that last element's pointer flag cleared
         prefix = path[:instance_level+1]
         obj_path = deepcopy(prefix)
         obj_path[-1].is_ptr = False # prevent pointer w/o target
         return obj_path
     def set_id(path, id_, has_types=True):
         # Appends an @id attr path with value id_ to one element of path: the
         # one at instance_level if has_types is set, otherwise the first
         id_level = 0
         if has_types: id_level = instance_level
         id_attr = [XpathElem('id', id_, is_attr=True)]
         path[id_level].attrs.append(id_attr)
     def get(doc, path, create=False, last_only=None, parent=None):
         # Walks path (a list of XpathElem) downward from parent and returns the
         # node reached, or None if some element has no match and create is off.
         # With create on, missing nodes are added to doc along the way.
         # parent defaults to the document element; last_only defaults to create
         # and is passed to xml_util.by_tag_name().
         # Warning: The last_only optimization may put data that should be together
         # into separate nodes
         if parent == None: parent = doc.documentElement
         if last_only == None: last_only = create
         elem_idx = 0
         for elem in path:
             # Find possible matches
             children = []
             if elem.is_attr:
                 child = parent.getAttributeNode(elem.name)
                 if child != None: children = [child]
             elif elem.name == '.': children = [parent] # . is the current node
             else: children = xml_util.by_tag_name(parent, elem.name, last_only)
             # Check each match
             node = None
             for child in children:
                 # The value (if any) must be equal and every attr path must be
                 # found below the candidate (attr paths are never created here)
                 is_match = elem.value == None or xml_util.value(child) == elem.value
                 for attr in elem.attrs:
                     if not is_match: break
                     is_match = get(doc, attr, False, last_only, child) != None
                 if is_match: node = child; break
             # Create node
             if node == None:
                 if not create: return None
                 if elem.is_attr:
                     # Create the attribute empty, then fetch its node
                     parent.setAttribute(elem.name, '')
                     node = parent.getAttributeNode(elem.name)
                 else: node = parent.appendChild(doc.createElement(elem.name))
                 if elem.value != None: xml_util.set_value(doc, node, elem.value)
                 # Give the new node everything its attr paths require
                 for attr in elem.attrs: get(doc, attr, create, last_only, node)
             # Follow pointer
             if elem.is_ptr:
                 # From here on, path is a private copy that may be modified
                 path = deepcopy(path[elem_idx+1:]) # rest of path
                 id_elem = backward_id(path[instance_level])
                 if id_elem != None:
                     # backward (child-to-parent) pointer with target ID attr
                     set_value(id_elem, xml_util.get_id(node))
                 else: # forward (parent-to-child) pointer
                     id_ = xml_util.value(node)
                     obj_path = obj(path) # target object
                     if id_ == None or get(doc, obj_path, False, True) == None:
                         # no target or target attrs don't match
                         if not create: return None
                         # Use last target object's ID + 1
                         obj_path[-1].attrs = [] # just get by tag name
                         last = get(doc, obj_path, False, True)
                         if last != None: id_ = str(int(xml_util.get_id(last)) + 1)
                         else: id_ = '0'
                         # Will append if target attrs didn't match. Place ! in XPath
                         # after element to fork at to avoid this.
                         xml_util.set_value(doc, node, id_)
                     else: last_only = False
                     # Require the target to have the ID that the pointer holds
                     set_id(path, id_)
                 # The rest of the path is resolved starting from the document
                 # element again (no parent is passed)
                 return get(doc, path, create, last_only)
             parent = node
             elem_idx += 1
         return parent

     # A general recursive descent parser
     import re
     class SyntaxException(Exception):
         # Raised when the input is not what the parser expected
         pass
     class Parser:
         # Base class for recursive descent parsers. parse() calls _main(), which
         # is not defined here; the _match_*() helpers consume input at self._pos.
         def __init__(self, string):
             self._pos = 0
             self._str = string
         def parse(self):
             # Runs _main() and requires it to have consumed the whole string
             tree = self._main()
             if self._pos != len(self._str): self._syntax_err('End of string')
             return tree
         def _match_re(self, pattern, required=False):
             # Consumes and returns the text matching pattern at the current
             # position. Without a match: syntax error if required, else None.
             found = re.compile(pattern).match(self._str, self._pos)
             if found:
                 self._pos = found.end(0)
                 return found.group(0)
             if required: self._syntax_err(pattern)
             return None
         def _match_str(self, string, required=False):
             # Consumes string if it occurs at the current position and returns
             # True. Otherwise: syntax error if required, else False.
             end_pos = self._pos + len(string)
             matched = self._str[self._pos:end_pos] == string
             if matched:
                 self._pos = end_pos
                 return True
             if not required: return False
             self._syntax_err(string)
         def _syntax_err(self, token):
             remaining = self._str[self._pos:]
             raise SyntaxException(token+' expected in '+remaining)

     # XML DOM tree manipulation
     from xml.dom import Node
     import xml.dom.minidom
     def name_of(node): return node.tagName.lower()
     def get_id(node): return node.getAttribute('id')
     def set_id(node, id_): node.setAttribute('id', id_)
     class NodeElemIter:
         def __init__(self, node): self.child = node.firstChild
         def __iter__(self): return self
         def curr(self):
             while self.child != None:
                 if self.child.nodeType == Node.ELEMENT_NODE: return self.child
                 self.child = self.child.nextSibling
             raise StopIteration
         def next(self):
             child = self.curr()
             self.child = self.child.nextSibling
             return child
     def first_elem(node): return NodeElemIter(node).next()
     class NodeElemReverseIter:
         def __init__(self, node): self.child = node.lastChild
         def __iter__(self): return self
         def curr(self):
             while self.child != None:
                 if self.child.nodeType == Node.ELEMENT_NODE: return self.child
                 self.child = self.child.previousSibling
             raise StopIteration
         def next(self):
             child = self.curr()
             self.child = self.child.previousSibling
             return child
     def last_elem(node): return NodeElemReverseIter(node).next()
     class NodeParentIter:
         def __init__(self, node): self.node = node
         def __iter__(self): return self
         def curr(self):
             if self.node != None and self.node.nodeType == Node.ELEMENT_NODE:
                 return self.node
             raise StopIteration
         def next(self):
             node = self.curr()
             self.node = self.node.parentNode
             return node
     def is_text(node):
         for child in NodeElemIter(node): return False # has an element node
         return True
     def value(node):
         if node.firstChild != None: return node.firstChild.nodeValue
         else: return node.nodeValue
     def set_value(doc, node, value):
         if node.nodeType == Node.ELEMENT_NODE:
             node.appendChild(doc.createTextNode(value))
         else: node.nodeValue = value
     def by_tag_name(node, name, last_only=False):
         # last_only optimization returns last (most recently inserted) matching node
         children = []
         for child in NodeElemReverseIter(node):
             if child.tagName == name:
                 children.append(child)
                 if last_only: break
         return children
     _writexml_orig = xml.dom.minidom.Element.writexml
     def _writexml(self, writer, indent="", addindent="", newl=""):
         # Replacement for Element.writexml() that writes an element whose only
         # child is a text node on a single line. All other elements are handed
         # to the original implementation.
         from xml.sax.saxutils import escape, quoteattr
         if self.firstChild != None and self.firstChild.nextSibling == None\
         and self.firstChild.nodeType == Node.TEXT_NODE: # a single text node
             writer.write(indent+'<'+self.tagName)
             for attr_idx in range(self.attributes.length):
                 attr = self.attributes.item(attr_idx)
                 # Attribute values must be quoted and escaped to be valid XML
                 writer.write(' '+attr.name+'='+quoteattr(attr.value))
             # Escape &, < and > so the text cannot be read back as markup
             text = escape(self.firstChild.nodeValue)
             writer.write('>'+text+'</'+self.tagName+'>'+newl)
         else: _writexml_orig(self, writer, indent, addindent, newl)
     xml.dom.minidom.Element.writexml = _writexml

     # Exception handling
     def add_msg(ex, msg): ex.args = (str(ex).rstrip()+'\n'+msg,)
     class ExceptionWithCause(Exception):
         def __init__(self, msg, cause=None):
             Exception.__init__(self, msg)
             if cause != None: add_msg(self, 'cause: '+str(cause))

     # A general recursive descent parser
     import re
     class SyntaxException(Exception): pass
     class Parser:
         def __init__(self, string):
             self._str = string
             self._pos = 0
         def parse(self):
             tree = self._main()
             if not self._pos == len(self._str): self._syntax_err('End of string')
             return tree
         def _match_re(self, pattern, required=False):
             matcher = re.compile(pattern).match(self._str, self._pos)
             if matcher:
                 self._pos = matcher.end(0)
                 return matcher.group(0)
             elif required: self._syntax_err(pattern)
             else: return None
         def _match_str(self, string, required=False):
             end_pos = self._pos + len(string)
             if self._str[self._pos:end_pos] == string:
                 self._pos = end_pos
                 return True
             elif required: self._syntax_err(string)
             else: return False
         def _syntax_err(self, token):
             raise SyntaxException(token+' expected in '+self._str[self._pos:])

scripts/lib/util.py
	1	# Useful functions and classes
	2
	3	class Obj:
	4	def __init__(self, **attrs): self.__dict__ = attrs
	5
	6	def __repr__(self): return repr(self.__dict__)
0	7

     # XML DOM tree manipulation
     from xml.dom import Node
     import xml.dom.minidom
     def name_of(node): return node.tagName.lower()
     def get_id(node): return node.getAttribute('id')
     def set_id(node, id_): node.setAttribute('id', id_)
     class NodeElemIter:
         def __init__(self, node): self.child = node.firstChild
         def __iter__(self): return self
         def curr(self):
             while self.child != None:
                 if self.child.nodeType == Node.ELEMENT_NODE: return self.child
                 self.child = self.child.nextSibling
             raise StopIteration
         def next(self):
             child = self.curr()
             self.child = self.child.nextSibling
             return child
     def first_elem(node): return NodeElemIter(node).next()
     class NodeElemReverseIter:
         def __init__(self, node): self.child = node.lastChild
         def __iter__(self): return self
         def curr(self):
             while self.child != None:
                 if self.child.nodeType == Node.ELEMENT_NODE: return self.child
                 self.child = self.child.previousSibling
             raise StopIteration
         def next(self):
             child = self.curr()
             self.child = self.child.previousSibling
             return child
     def last_elem(node): return NodeElemReverseIter(node).next()
     class NodeParentIter:
         def __init__(self, node): self.node = node
         def __iter__(self): return self
         def curr(self):
             if self.node != None and self.node.nodeType == Node.ELEMENT_NODE:
                 return self.node
             raise StopIteration
         def next(self):
             node = self.curr()
             self.node = self.node.parentNode
             return node
     def is_text(node):
         for child in NodeElemIter(node): return False # has an element node
         return True
     def value(node):
         if node.firstChild != None: return node.firstChild.nodeValue
         else: return node.nodeValue
     def set_value(doc, node, value):
         if node.nodeType == Node.ELEMENT_NODE:
             node.appendChild(doc.createTextNode(value))
         else: node.nodeValue = value
     def by_tag_name(node, name, last_only=False):
         # last_only optimization returns last (most recently inserted) matching node
         children = []
         for child in NodeElemReverseIter(node):
             if child.tagName == name:
                 children.append(child)
                 if last_only: break
         return children
     _writexml_orig = xml.dom.minidom.Element.writexml
     def _writexml(self, writer, indent="", addindent="", newl=""):
         # Replacement for Element.writexml() that writes an element whose only
         # child is a text node on a single line. All other elements are handed
         # to the original implementation.
         from xml.sax.saxutils import escape, quoteattr
         if self.firstChild != None and self.firstChild.nextSibling == None\
         and self.firstChild.nodeType == Node.TEXT_NODE: # a single text node
             writer.write(indent+'<'+self.tagName)
             for attr_idx in range(self.attributes.length):
                 attr = self.attributes.item(attr_idx)
                 # Attribute values must be quoted and escaped to be valid XML
                 writer.write(' '+attr.name+'='+quoteattr(attr.value))
             # Escape &, < and > so the text cannot be read back as markup
             text = escape(self.firstChild.nodeValue)
             writer.write('>'+text+'</'+self.tagName+'>'+newl)
         else: _writexml_orig(self, writer, indent, addindent, newl)
     xml.dom.minidom.Element.writexml = _writexml

     # Database access
     import random
     import re
     import sys
     import ex_util
     def _add_cursor_info(ex, cur):
         # Appends the cursor's last query to the exception's message.
         # str() keeps this from raising (and thereby masking ex) when cur.query
         # is not a str, e.g. None because no query was sent before the failure.
         ex_util.add_msg(ex, 'query: '+str(cur.query))
     class NameException(Exception): pass
     class DbException(ex_util.ExceptionWithCause):
         def __init__(self, msg, cause=None, cur=None):
             ex_util.ExceptionWithCause.__init__(self, msg, cause)
             if cur != None: _add_cursor_info(self, cur)
     class ExceptionWithColumn(DbException):
         def __init__(self, col, cause=None):
             DbException.__init__(self, 'column: '+col, cause)
             self.col = col
     class DuplicateKeyException(ExceptionWithColumn): pass
     class NullValueException(ExceptionWithColumn): pass
     def check_name(name):
         if re.search(r'\W', name) != None: raise NameException('Name "'+name
             +'" may contain only alphanumeric characters and _')
     def run_query(db, query, params=None):
         cur = db.cursor()
         try: cur.execute(query, params)
         except Exception, ex:
             _add_cursor_info(ex, cur)
             raise
         return cur
     def col(cur, idx): return cur.description[idx][0]
     def row(cur): return iter(lambda: cur.fetchone(), None).next()
     def value(cur): return row(cur)[0]
     def with_savepoint(db, func):
         savepoint = 'savepoint_'+str(random.randint(0, sys.maxint)) # must be unique
         run_query(db, 'SAVEPOINT '+savepoint)
         try: return_val = func()
         except:
             run_query(db, 'ROLLBACK TO SAVEPOINT '+savepoint)
             raise
         else:
             run_query(db, 'RELEASE SAVEPOINT '+savepoint)
             return return_val
     def select(db, table, fields, conds):
         # Runs SELECT <fields> FROM <table> [WHERE <conds>] and returns the cursor.
         # conds maps column name -> required value; the conditions are ANDed, and
         # a None value is compared with IS instead of =.
         # Raises NameException (via check_name()) for an invalid identifier.
         check_name(table)
         # Explicit loops, since map() called only for its side effects does
         # nothing where map() is lazy
         for field in fields: check_name(field)
         for cond_col in conds: check_name(cond_col)
         def cond(entry):
             col_name, val = entry
             if val == None: op = 'IS'
             else: op = '='
             return col_name+' '+op+' %s'
         query = 'SELECT '+', '.join(fields)+' FROM '+table
         # With no conds, omit WHERE entirely: a dangling WHERE is a syntax error
         if conds: query += ' WHERE '+' AND '.join(map(cond, conds.items()))
         return run_query(db, query, list(conds.values()))
     def insert(db, table, row):
         # Inserts row (a dict of column name -> value) into table and returns the
         # cursor. Raises NameException (via check_name()) for an invalid
         # identifier.
         check_name(table)
         cols = list(row.keys())
         # Explicit loop, since map() called only for its side effects does
         # nothing where map() is lazy
         for col_name in cols: check_name(col_name)
         if not cols:
             # "() VALUES ()" is not valid SQL; DEFAULT VALUES is the standard way
             # to insert a row without giving any columns (PostgreSQL accepts it)
             return run_query(db, 'INSERT INTO '+table+' DEFAULT VALUES')
         return run_query(db, 'INSERT INTO '+table+' ('+', '.join(cols)
             +') VALUES ('+', '.join(['%s']*len(cols))+')', list(row.values()))
     def last_insert_id(db): return value(run_query(db, 'SELECT lastval()'))
     def try_insert(db, table, row):
         try: return with_savepoint(db, lambda: insert(db, table, row))
         except Exception, ex:
             msg = str(ex)
             match = re.search(r'duplicate key value violates unique constraint "'
                 +table+'_(\w+)_index"', msg)
             if match: raise DuplicateKeyException(match.group(1), ex)
             match = re.search(r'null value in column "(\w+)" violates not-null '
                 'constraint', msg)
             if match: raise NullValueException(match.group(1), ex)
             raise # no specific exception raised
     def pkey(db, table): # Assumed to be first column in table
         check_name(table)
         return col(run_query(db, 'SELECT * FROM '+table+' LIMIT 0'), 0)
     def get(db, table, row, pkey, create=False, row_ct_ref=None):
         try: return value(select(db, table, [pkey], row))
         except StopIteration:
             if not create: raise
             # Insert new row
             try:
                 row_ct = try_insert(db, table, row).rowcount
                 if row_ct_ref != None and row_ct >= 0: row_ct_ref[0] += row_ct
                 return last_insert_id(db)
             except DuplicateKeyException, ex:
                 return value(select(db, table, [pkey], {ex.col: row[ex.col]}))

     # XPath-based XML tree manipulation
     from copy import deepcopy
     from xml.dom import Node
     from Parser import Parser
     import xml_util
     class XpathElem:
         # One element of a parsed XPath.
         # name: element or attribute name
         # value: required/assigned value, or None
         # attrs: list of paths (lists of XpathElem) that must hold below this elem
         # is_attr: whether this elem names an attribute (written @name)
         # is_ptr: whether this elem is followed by -> (a pointer)
         def __init__(self, name, value=None, attrs=None, is_attr=False,
             is_ptr=False):
             if attrs == None: attrs = [] # a fresh list for every instance
             self.name = name
             self.value = value
             self.attrs = attrs
             self.is_attr = is_attr
             self.is_ptr = is_ptr
         def __repr__(self):
             str_ = ''
             if self.is_attr: str_ += '@'
             str_ += self.name
             if self.attrs != []: str_ += repr(self.attrs)
             if self.value != None: str_ += '='+repr(self.value)
             if self.is_ptr: str_ += '->'
             return str_
         def __eq__(self, other):
             # Objects without a __dict__ (e.g. None) are simply unequal instead of
             # causing an AttributeError
             try: other_attrs = other.__dict__
             except AttributeError: return NotImplemented
             return self.__dict__ == other_attrs
         def __ne__(self, other):
             # Python 2 does not derive != from ==, so keep the two consistent
             result = self.__eq__(other)
             if result is NotImplemented: return result
             return not result
     def value(path): return path[-1].value
     def set_value(path, value): path[-1].value = value
     def backward_id(elem):
         if len(elem.attrs) >= 1 and value(elem.attrs[0]) == None:
             return elem.attrs[0]
         else: return None
     class XpathParser(Parser):
         # Parses an XPath-like string into a list of XpathElems.
         # Syntax accepted by _path(), in the order it is matched for each step:
         #     {path,path,...}  split path (only the first subpath is used); ends the path
         #     @                marks the step as an attribute
         #     name             one or more of [\w.*]
         #     [path=value,...] attrs: comma-separated sub-paths, each with optional value
         #     ->               marks the step as a pointer
         #     (/path)          lookahead: added as an attr and also appended to the path
         #     /                separates steps
         def _main(self):
             self._match_str('/') # optional leading /
             return self._path()
         def _path(self):
             tree = []
             while True:
                 # Split path
                 if self._match_str('{'):
                     paths = []
                     while True:
                         paths.append(tree + self._path())
                         if not self._match_str(','): break
                     self._match_str('}', required=True)
                     tree = paths[0] # just use first subpath for now
                     break # nothing allowed after split path
                 # Keyword args are evaluated left to right, so @ is matched before
                 # the name
                 elem = XpathElem(is_attr=self._match_str('@'),
                     name=self._match_re(r'[\w.*]+', required=True))
                 # Attrs
                 if self._match_str('['):
                     elem.attrs = []
                     while True:
                         path = self._path()
                         if self._match_str('='):
                             set_value(path, self._match_re(r'[\w.|]*'))
                         elem.attrs.append(path)
                         if not self._match_str(','): break
                     self._match_str(']', required=True)
                 elem.is_ptr = self._match_str('->')
                 tree.append(elem)
                 # Lookahead assertion
                 if self._match_str('('):
                     self._match_str('/', required=True) # next / is inside ()
                     path = self._path()
                     self._match_str(')', required=True)
                     elem.attrs.append(path)
                     tree += path
                 if not self._match_str('/'): break
             # Expand * abbrs: a * in a name is replaced by the name of another elem
             # of this path, found at a fixed offset from the current position
             for elem_idx, elem in enumerate(tree):
                 id_ = backward_id(elem)
                 if id_ is not None: elem = id_[0]; offset = -2
                 elif elem.is_ptr: offset = 2
                 else: offset = 1
                 before, abbr, after = elem.name.partition('*')
                 if abbr == '': continue # no * in name
                 src_idx = elem_idx+offset
                 # Explicit bounds check: a negative index must mean "no replacement
                 # elem" rather than silently wrapping around to the end of the list
                 if 0 <= src_idx < len(tree):
                     elem.name = before+tree[src_idx].name+after
             return tree
     # Index, within a path, of the elem that IDs are attached to when the path
     # has types (see set_id)
     instance_level = 1
     def obj(path):
         # Returns an independent copy of the leading elems of path, up to and
         # including the one at instance_level. path itself is not modified.
         leading_elems = path[:instance_level+1]
         obj_path = deepcopy(leading_elems)
         last_elem = obj_path[-1]
         last_elem.is_ptr = False # prevent pointer w/o target
         return obj_path
     def set_id(path, id_, has_types=True):
         # Appends an @id=id_ attr to the elem of path that carries IDs: the one at
         # instance_level if has_types, otherwise the first one
         id_level = instance_level if has_types else 0
         id_attr = [XpathElem('id', id_, is_attr=True)] # attrs are themselves paths
         path[id_level].attrs.append(id_attr)
     def get(doc, path, create=False, last_only=None, parent=None):
         # Walks path (a list of XpathElems) downward from parent and returns the
         # node matched by its last elem. Returns None if some elem has no match and
         # create is not set; with create set, missing nodes are created instead.
         # An empty path returns parent itself.
         # parent: starting node; defaults to doc.documentElement
         # last_only: defaults to create; passed through to xml_util.by_tag_name()
         #     (presumably limits the search to the last matching child -- confirm
         #     in xml_util)
         # Warning: The last_only optimization may put data that should be together
         # into separate nodes
         if parent == None: parent = doc.documentElement
         if last_only == None: last_only = create
         elem_idx = 0
         for elem in path:
             # Find possible matches
             children = []
             if elem.is_attr:
                 child = parent.getAttributeNode(elem.name)
                 if child != None: children = [child]
             elif elem.name == '.': children = [parent]
             else: children = xml_util.by_tag_name(parent, elem.name, last_only)
             # Check each match
             node = None
             for child in children:
                 # A candidate matches if its value equals elem.value (when one is
                 # given) and every attr sub-path can be found beneath it
                 is_match = elem.value == None or xml_util.value(child) == elem.value
                 for attr in elem.attrs:
                     if not is_match: break
                     # attrs are only looked up here, never created (create=False)
                     is_match = get(doc, attr, False, last_only, child) != None
                 if is_match: node = child; break
             # Create node
             if node == None:
                 if not create: return None
                 if elem.is_attr:
                     parent.setAttribute(elem.name, '')
                     node = parent.getAttributeNode(elem.name)
                 else: node = parent.appendChild(doc.createElement(elem.name))
                 if elem.value != None: xml_util.set_value(doc, node, elem.value)
                 # Give the new node its attrs too
                 for attr in elem.attrs: get(doc, attr, create, last_only, node)
             # Follow pointer
             if elem.is_ptr:
                 # From here on, path is a private copy of the remaining elems, so the
                 # changes made to it below do not affect the caller's path
                 path = deepcopy(path[elem_idx+1:]) # rest of path
                 id_elem = backward_id(path[instance_level])
                 if id_elem != None:
                     # backward (child-to-parent) pointer with target ID attr
                     set_value(id_elem, xml_util.get_id(node))
                 else: # forward (parent-to-child) pointer
                     id_ = xml_util.value(node)
                     obj_path = obj(path) # target object
                     if id_ == None or get(doc, obj_path, False, True) == None:
                         # no target or target attrs don't match
                         if not create: return None
                         # Use last target object's ID + 1
                         obj_path[-1].attrs = [] # just get by tag name
                         last = get(doc, obj_path, False, True)
                         if last != None: id_ = str(int(xml_util.get_id(last)) + 1)
                         else: id_ = '0'
                         # Will append if target attrs didn't match. Place ! in XPath
                         # after element to fork at to avoid this.
                         xml_util.set_value(doc, node, id_)
                     else: last_only = False
                     set_id(path, id_)
                 # The rest of the path is resolved from the document root (no parent
                 # is passed), and that result is the result of the whole call
                 return get(doc, path, create, last_only)
             parent = node
             elem_idx += 1
         return parent

     # XML-database conversion
     import re
     from xml.dom import Node
     import db_util
     import xml_util
     def name_of(node):
         # Returns xml_util.name_of(node) with everything up to and including the
         # last '.' removed
         full_name = xml_util.name_of(node)
         return re.sub(r'^.*\.', r'', full_name)
     # Node names ending in this suffix are pointers to another node
     ptr_suffix = '_id'
     def is_ptr(node_name):
         # Returns whether node_name ends with ptr_suffix
         return node_name.endswith(ptr_suffix)
     def ptr_type(node_name):
         # Returns the type a pointer refers to: node_name without the trailing
         # ptr_suffix. node_name must be a pointer name (asserted).
         assert is_ptr(node_name)
         suffix_len = len(ptr_suffix)
         return node_name[:-suffix_len]
     def ptr_target(node):
         # Returns the node a pointer node refers to, i.e. xml_util.first_elem(node).
         # node must have a pointer name (asserted).
         node_name = name_of(node)
         assert is_ptr(node_name)
         return xml_util.first_elem(node)
     def find_by_name(node, name):
         # Searches the nodes yielded by xml_util.NodeParentIter(node), in order, for
         # a node called name. For each such node the node itself is checked first,
         # then its child elements: a child matches either by its own name or, if it
         # is a pointer of type name, through its target.
         # Returns the first node found, or None.
         for ancestor in xml_util.NodeParentIter(node):
             if name_of(ancestor) == name: return ancestor
             for sibling in xml_util.NodeElemIter(ancestor):
                 sibling_name = name_of(sibling)
                 if not is_ptr(sibling_name):
                     if sibling_name == name: return sibling
                 elif ptr_type(sibling_name) == name: return ptr_target(sibling)
         return None
     def get(db, node, create=False, store_ids=False, row_ct_ref=None, pkeys=None):
         # Maps the XML subtree at node onto DB rows via db_util.get() and returns
         # the ID db_util.get() gives for node's own row. The node's name is used as
         # the table name.
         # create, row_ct_ref: passed through to db_util.get()
         # store_ids: whether to record each row's ID on its XML node
         # store_ids enables searching the tree for missing fields
         # pkeys: optional dict caching each table's pkey column name; it is filled
         #     in as tables are encountered
         if pkeys is None: pkeys = {}
         def pkey(table):
             # Cached db_util.pkey() lookup
             if table not in pkeys: pkeys[table] = db_util.pkey(db, table)
             return pkeys[table]
         def obj(node, parent_id=None):
             # Processes one node (and, recursively, its pointer targets and
             # children) and returns its row's ID
             table = name_of(node)
             pkey_ = pkey(table)
             row = {}
             children = []
             # Divide children into fields and children with fkeys to parent
             for child in xml_util.NodeElemIter(node):
                 child_name = name_of(child)
                 if xml_util.is_text(child): row[child_name] = xml_util.value(child)
                 # Pointer targets are processed first so their ID can be stored in
                 # this row
                 elif is_ptr(child_name): row[child_name] = obj(ptr_target(child))
                 else: children.append(child)
             # Never select or insert by the node's own pkey value
             row.pop(pkey_, None)
             # Add fkey to parent
             # (the fkey column is taken to have the same name as the parent table's
             # pkey column)
             if parent_id is not None: row[pkey(name_of(node.parentNode))] = parent_id
             # Insert node
             for try_num in range(2):
                 try:
                     id_ = db_util.get(db, table, row, pkey_, create, row_ct_ref)
                     if store_ids: xml_util.set_id(node, id_)
                     break
                 except db_util.NullValueException as ex:
                     if try_num > 0: raise # exception still raised after retry
                     # Search for required column in ancestors and their children
                     target = find_by_name(node, ptr_type(ex.col))
                     if target is None: raise
                     row[ex.col] = xml_util.get_id(target)
             # Insert children with fkeys to parent
             for child in children: obj(child, id_)
             return id_
         return obj(node)
     def xml2db(db, node, row_ct_ref=None):
         # Loads each non-text child element of node into the DB with get(), creating
         # rows as needed and storing their IDs on the XML nodes.
         # row_ct_ref: passed through to get()
         for child in xml_util.NodeElemIter(node):
             if xml_util.is_text(child): continue # XML metadata
             get(db, child, True, True, row_ct_ref)

     # Format: see http://vegbank.org/vegdocs/xml/vegbank_example_ver1.0.2.xml
     import os
     import os.path
     import psycopg2
     from psycopg2.extensions import ISOLATION_LEVEL_SERIALIZABLE
     import sys
     import xml.dom.minidom
     sys.path.append(os.path.dirname(__file__)+"/../lib")
     import xml_db
     def env_flag(name):
         # Returns whether environment variable name is set to a non-empty string
         return os.environ.get(name, '') != ''

     # Converts a CSV dataset to XML using a mappings spreadsheet
     import csv
     import os.path
     import re
     import sys
     from copy import deepcopy
     from xml.dom.minidom import getDOMImplementation
     sys.path.append(os.path.dirname(__file__)+"/../lib")
     import xpath
     def main():

Project

General

Profile

Revision 42

Added by Aaron Marcuse-Kubitza over 13 years ago