Project

General

Profile

« Previous | Next » 

Revision 42

Moved Python modules to shared lib folder

View differences:

scripts/xml2db/xml_db.py
1
# XML-database conversion
2

  
3
import re
4
from xml.dom import Node
5

  
6
import db_util
7
import xml_util
8

  
9
def name_of(node):
    """Return the node's name with any dotted prefix removed.

    Takes the name from xml_util.name_of() and strips everything up to and
    including the last '.'.
    """
    full_name = xml_util.name_of(node)
    return re.sub(r'^.*\.', r'', full_name)
10

  
11
# Name suffix that marks a pointer element: is_ptr() tests for it and
# ptr_type() strips it
ptr_suffix = '_id'
12

  
13
def is_ptr(node_name):
    """Tell whether node_name ends with the pointer suffix (ptr_suffix)."""
    has_suffix = node_name.endswith(ptr_suffix)
    return has_suffix
14

  
15
def ptr_type(node_name):
    """Return node_name with the trailing pointer suffix removed.

    node_name must be a pointer name (see is_ptr()); this is asserted.
    """
    assert is_ptr(node_name)
    suffix_len = len(ptr_suffix)
    return node_name[:-suffix_len]
18

  
19
def ptr_target(node):
    """Return the first child element of a pointer node.

    The node's name must be a pointer name (see is_ptr()); this is asserted.
    """
    node_name = name_of(node)
    assert is_ptr(node_name)
    return xml_util.first_elem(node)
22

  
23
def find_by_name(node, name):
    """Search node, its ancestors and their child elements for name.

    For node and then each ancestor in turn: the ancestor itself is returned
    if its name matches; otherwise its child elements are checked in order. A
    pointer child whose type (name minus ptr_suffix) equals name yields its
    target; a non-pointer child called name is returned directly.
    Returns None if nothing matches.
    """
    for ancestor in xml_util.NodeParentIter(node):
        if name_of(ancestor) == name:
            return ancestor
        for elem in xml_util.NodeElemIter(ancestor):
            elem_name = name_of(elem)
            if is_ptr(elem_name):
                if ptr_type(elem_name) == name:
                    return ptr_target(elem)
            elif elem_name == name:
                return elem
    return None
33

  
34
def get(db, node, create=False, store_ids=False, row_ct_ref=None, pkeys=None):
    """Look up (and optionally insert) the DB rows for an XML subtree.

    Each element is treated as a row of the table named after it: children
    without child elements become fields, pointer children (see is_ptr()) are
    resolved recursively to the ID of their target, and the remaining
    children are stored afterwards with a foreign key to this row.

    db: database connection, passed through to db_util
    node: root element of the subtree
    create: passed to db_util.get(); whether missing rows are inserted
    store_ids: store each row's ID on its element, which enables searching
        the tree for missing fields
    row_ct_ref: optional one-element list passed to db_util.get(), which adds
        the inserted row count to its first item
    pkeys: optional dict caching table name -> primary key column
    Returns the ID of the row for node.
    """
    if pkeys is None: pkeys = {}

    def pkey(table):
        # Cached so the DB is queried once per table rather than per node
        if table not in pkeys: pkeys[table] = db_util.pkey(db, table)
        return pkeys[table]

    def obj(node, parent_id=None):
        table = name_of(node)
        pkey_ = pkey(table)
        row = {}
        children = []

        # Divide children into fields and children with fkeys to parent
        for child in xml_util.NodeElemIter(node):
            child_name = name_of(child)
            if xml_util.is_text(child): row[child_name] = xml_util.value(child)
            elif is_ptr(child_name): row[child_name] = obj(ptr_target(child))
            else: children.append(child)
        row.pop(pkey_, None) # the pkey is looked up, never supplied

        # Add fkey to parent
        if parent_id is not None:
            row[pkey(name_of(node.parentNode))] = parent_id

        # Insert node
        for try_num in range(2):
            try:
                id_ = db_util.get(db, table, row, pkey_, create, row_ct_ref)
                if store_ids: xml_util.set_id(node, id_)
                break
            except db_util.NullValueException as ex:
                if try_num > 0: raise # exception still raised after retry
                # Only pointer columns can be filled in from the tree; for
                # any other column keep the DB error instead of tripping the
                # assertion in ptr_type()
                if not is_ptr(ex.col): raise
                # Search for required column in ancestors and their children
                target = find_by_name(node, ptr_type(ex.col))
                if target is None: raise
                row[ex.col] = xml_util.get_id(target)

        # Insert children with fkeys to parent
        for child in children: obj(child, id_)

        return id_

    return obj(node)
78

  
79
def xml2db(db, node, row_ct_ref=None):
    """Store every child element of node that has child elements itself.

    Each such child is passed to get() with creation and ID storage enabled;
    row_ct_ref is forwarded unchanged.
    """
    for elem in xml_util.NodeElemIter(node):
        if xml_util.is_text(elem): continue # XML metadata
        get(db, elem, create=True, store_ids=True, row_ct_ref=row_ct_ref)
scripts/xml2db/db_util.py
1
# Database access
2

  
3
import random
4
import re
5
import sys
6

  
7
import ex_util
8

  
9
def _add_cursor_info(ex, cur):
    """Append the cursor's query text to the message of exception ex."""
    query_note = 'query: '+cur.query
    ex_util.add_msg(ex, query_note)
10

  
11
class NameException(Exception):
    """Raised by check_name() for a name containing disallowed characters."""
    pass
12

  
13
class DbException(ex_util.ExceptionWithCause):
    """Database error that can carry a cause and the query of a cursor."""

    def __init__(self, msg, cause=None, cur=None):
        """
        msg: error message
        cause: optional underlying exception, passed to ExceptionWithCause
        cur: optional cursor whose query is appended to the message
        """
        ex_util.ExceptionWithCause.__init__(self, msg, cause)
        if cur is not None: _add_cursor_info(self, cur)
17

  
18
class ExceptionWithColumn(DbException):
    """Database error tied to a column, whose name is kept in self.col."""

    def __init__(self, col, cause=None):
        message = 'column: '+col
        DbException.__init__(self, message, cause)
        self.col = col
22

  
23
class DuplicateKeyException(ExceptionWithColumn):
    """Raised by try_insert() when a unique constraint is violated."""
    pass
24

  
25
class NullValueException(ExceptionWithColumn):
    """Raised by try_insert() when a not-null constraint is violated."""
    pass
26

  
27
def check_name(name):
    """Raise NameException unless name consists only of word characters.

    Used to validate identifiers before they are concatenated into SQL.
    """
    if re.search(r'\W', name) is not None:
        raise NameException('Name "'+name
            +'" may contain only alphanumeric characters and _')
30

  
31
def run_query(db, query, params=None):
    """Execute query on a new cursor of db and return that cursor.

    params is passed to cursor.execute() as given. If execution fails, the
    query text is appended to the exception's message before it is re-raised.
    """
    cur = db.cursor()
    try: cur.execute(query, params)
    except Exception as ex: # "as" is valid on Python 2.6+ and 3
        _add_cursor_info(ex, cur)
        raise
    return cur
38

  
39
def col(cur, idx):
    """Return the first item of the cursor's description entry at idx."""
    column_desc = cur.description[idx]
    return column_desc[0]
40

  
41
def row(cur):
    """Return the next row of the cursor.

    Raises StopIteration when fetchone() returns None.
    """
    # next() builtin instead of the Python 2-only .next() method
    return next(iter(cur.fetchone, None))
42

  
43
def value(cur):
    """Return the first column of the cursor's next row."""
    next_row = row(cur)
    return next_row[0]
44

  
45
def with_savepoint(db, func):
    """Call func() inside a savepoint and return its result.

    The savepoint is released when func() succeeds. If func() raises, the
    transaction is rolled back to the savepoint and the exception propagates.
    """
    # sys.maxsize exists on Python 2.6+ and 3, unlike sys.maxint
    savepoint = 'savepoint_'+str(random.randint(0, sys.maxsize)) # must be unique
    run_query(db, 'SAVEPOINT '+savepoint)
    try: return_val = func()
    except: # deliberately bare: roll back on any error, then re-raise
        run_query(db, 'ROLLBACK TO SAVEPOINT '+savepoint)
        raise
    else:
        run_query(db, 'RELEASE SAVEPOINT '+savepoint)
        return return_val
55

  
56
def select(db, table, fields, conds):
    """Select fields from the rows of table that satisfy every condition.

    table: table name
    fields: list of column names to select
    conds: dict of column name -> required value; None is compared with IS
    Returns the cursor of the executed query.
    Raises NameException (via check_name()) for an invalid identifier.
    """
    check_name(table)
    # Explicit loops: map() is lazy on Python 3 and would skip the validation
    for field in fields: check_name(field)
    for cond_col in conds: check_name(cond_col)

    cols = list(conds) # one fixed order for both the SQL and the params
    def cond(col):
        if conds[col] is None: op = 'IS'
        else: op = '='
        return col+' '+op+' %s'
    return run_query(db, 'SELECT '+', '.join(fields)+' FROM '+table+' WHERE '
        +' AND '.join([cond(col) for col in cols]),
        [conds[col] for col in cols])
69

  
70
def insert(db, table, row):
    """Insert row (a dict of column name -> value) into table.

    Returns the cursor of the executed INSERT.
    Raises NameException (via check_name()) for an invalid identifier.
    """
    check_name(table)
    cols = list(row) # one fixed order for both the column list and values
    # Explicit loop: map() is lazy on Python 3 and would skip the validation
    for col_name in cols: check_name(col_name)
    return run_query(db, 'INSERT INTO '+table+' ('+', '.join(cols)
        +') VALUES ('+', '.join(['%s']*len(cols))+')',
        [row[col_name] for col_name in cols])
76

  
77
def last_insert_id(db):
    """Return the result of running SELECT lastval() on db."""
    cur = run_query(db, 'SELECT lastval()')
    return value(cur)
78

  
79
def try_insert(db, table, row):
    """Insert row into table within a savepoint, translating known errors.

    Returns the cursor of the INSERT on success.
    Raises DuplicateKeyException or NullValueException (carrying the column
    name taken from the error message) when the message matches the
    corresponding pattern; any other exception is re-raised unchanged.
    """
    try: return with_savepoint(db, lambda: insert(db, table, row))
    except Exception as ex: # "as" is valid on Python 2.6+ and 3
        msg = str(ex)
        # Raw strings throughout so \w is never read as a string escape, and
        # the table name is escaped so it can't alter the pattern
        match = re.search(r'duplicate key value violates unique constraint "'
            +re.escape(table)+r'_(\w+)_index"', msg)
        if match: raise DuplicateKeyException(match.group(1), ex)
        match = re.search(r'null value in column "(\w+)" violates not-null '
            r'constraint', msg)
        if match: raise NullValueException(match.group(1), ex)
        raise # no specific exception raised
90

  
91
def pkey(db, table): # Assumed to be first column in table
    """Return the name of table's first column, taken to be its pkey."""
    check_name(table)
    cur = run_query(db, 'SELECT * FROM '+table+' LIMIT 0')
    return col(cur, 0)
94

  
95
def get(db, table, row, pkey, create=False, row_ct_ref=None):
    """Return the pkey of the row in table matching row, optionally creating it.

    row: dict of column name -> value
    pkey: name of the primary key column to return
    create: insert the row if it's missing, instead of raising
    row_ct_ref: optional one-element list; the inserted row count is added
        to its first item
    Raises StopIteration if no row matches and create is false.
    """
    try: return value(select(db, table, [pkey], row))
    except StopIteration:
        if not create: raise
        # Insert new row
        try:
            row_ct = try_insert(db, table, row).rowcount
            if row_ct_ref is not None and row_ct >= 0:
                row_ct_ref[0] += row_ct
            return last_insert_id(db)
        except DuplicateKeyException as ex: # valid on Python 2.6+ and 3
            # Look the row up by the value of the column that collided
            return value(select(db, table, [pkey], {ex.col: row[ex.col]}))
scripts/xml2db/ex_util.py
1
# Exception handling
2

  
3
def add_msg(ex, msg):
    """Append msg on a new line to the message of exception ex.

    Replaces ex.args with a single string: the current str(ex) without
    trailing whitespace, a newline, then msg.
    """
    existing = str(ex).rstrip()
    ex.args = (existing+'\n'+msg,)
4

  
5
class ExceptionWithCause(Exception):
    """Exception whose message can include the text of a causing exception."""

    def __init__(self, msg, cause=None):
        """
        msg: error message
        cause: optional exception; 'cause: '+str(cause) is appended to msg
        """
        Exception.__init__(self, msg)
        if cause is not None: add_msg(self, 'cause: '+str(cause))
scripts/xml2db/xml_util.py
1
# XML DOM tree manipulation
2

  
3
from xml.dom import Node
4
import xml.dom.minidom
5

  
6
def name_of(node):
    """Return the element's tag name in lowercase."""
    tag = node.tagName
    return tag.lower()
7

  
8
def get_id(node):
    """Return the value of the node's 'id' attribute."""
    id_value = node.getAttribute('id')
    return id_value
9

  
10
def set_id(node, id_):
    """Set the node's 'id' attribute to id_."""
    attr_name = 'id'
    node.setAttribute(attr_name, id_)
11

  
12
class NodeElemIter:
    """Iterator over the child elements of a node, first child first.

    Children that are not element nodes are skipped.
    """

    def __init__(self, node): self.child = node.firstChild

    def __iter__(self): return self

    def curr(self):
        """Return the current element without moving past it.

        Raises StopIteration when no elements are left.
        """
        while self.child is not None:
            if self.child.nodeType == Node.ELEMENT_NODE: return self.child
            self.child = self.child.nextSibling
        raise StopIteration

    def next(self):
        """Return the current element and advance to its next sibling."""
        child = self.curr()
        self.child = self.child.nextSibling
        return child

    __next__ = next # Python 3 iterator protocol
27

  
28
def first_elem(node):
    """Return the first child element of node.

    Raises StopIteration if node has no child elements.
    """
    elems = NodeElemIter(node)
    return elems.next()
29

  
30
class NodeElemReverseIter:
    """Iterator over the child elements of a node, last child first.

    Children that are not element nodes are skipped.
    """

    def __init__(self, node): self.child = node.lastChild

    def __iter__(self): return self

    def curr(self):
        """Return the current element without moving past it.

        Raises StopIteration when no elements are left.
        """
        while self.child is not None:
            if self.child.nodeType == Node.ELEMENT_NODE: return self.child
            self.child = self.child.previousSibling
        raise StopIteration

    def next(self):
        """Return the current element and step back to its previous sibling."""
        child = self.curr()
        self.child = self.child.previousSibling
        return child

    __next__ = next # Python 3 iterator protocol
45

  
46
def last_elem(node):
    """Return the last child element of node.

    Raises StopIteration if node has no child elements.
    """
    elems = NodeElemReverseIter(node)
    return elems.next()
47

  
48
class NodeParentIter:
    """Iterator over a node and its ancestors, starting with the node itself.

    Stops at the first node that is missing or is not an element.
    """

    def __init__(self, node): self.node = node

    def __iter__(self): return self

    def curr(self):
        """Return the current node, or raise StopIteration at the end."""
        if self.node is not None and self.node.nodeType == Node.ELEMENT_NODE:
            return self.node
        raise StopIteration

    def next(self):
        """Return the current node and move on to its parent."""
        node = self.curr()
        self.node = self.node.parentNode
        return node

    __next__ = next # Python 3 iterator protocol
62

  
63
def is_text(node):
    """Tell whether node has no child elements."""
    for _elem in NodeElemIter(node):
        return False # found an element child
    return True
66

  
67
def value(node):
    """Return the node's value.

    If node has children, this is the nodeValue of its first child; otherwise
    it is the node's own nodeValue.
    """
    first = node.firstChild
    if first is not None: return first.nodeValue
    return node.nodeValue
70

  
71
def set_value(doc, node, value):
    """Set the value of node.

    For an element, a new text node holding value is appended to it; for any
    other node, value is assigned to its nodeValue.
    """
    if node.nodeType != Node.ELEMENT_NODE:
        node.nodeValue = value
        return
    node.appendChild(doc.createTextNode(value))
75

  
76
def by_tag_name(node, name, last_only=False):
    """Return the child elements of node whose tagName equals name.

    Matches are listed last child first.
    last_only: stop at the first match found, i.e. return at most the last
        (most recently inserted) matching child
    """
    matches = []
    for elem in NodeElemReverseIter(node):
        if elem.tagName != name: continue
        matches.append(elem)
        if last_only: break
    return matches
84

  
85
# Keep the stock implementation for elements that aren't a single text node
_writexml_orig = xml.dom.minidom.Element.writexml

def _writexml(self, writer, indent="", addindent="", newl=""):
    """Replacement for Element.writexml() that keeps text-only elements on
    one line.

    An element whose only child is a text node is written as
    <tag attrs>text</tag> followed by newl; any other element is delegated to
    the original minidom implementation.
    """
    # Imported here so the module's import block is left untouched
    from xml.sax.saxutils import escape, quoteattr

    if self.firstChild is not None and self.firstChild.nextSibling is None\
    and self.firstChild.nodeType == Node.TEXT_NODE: # a single text node
        writer.write(indent+'<'+self.tagName)
        for attr_idx in range(self.attributes.length):
            attr = self.attributes.item(attr_idx)
            # Quote and escape the value so the output is well-formed XML
            writer.write(' '+attr.name+'='+quoteattr(attr.value))
        # Markup characters in the text must be escaped as well
        writer.write('>'+escape(self.firstChild.nodeValue)+'</'+self.tagName
            +'>'+newl)
    else: _writexml_orig(self, writer, indent, addindent, newl)

# Install the replacement for every minidom Element
xml.dom.minidom.Element.writexml = _writexml
scripts/data2xml/util.py
1
# Useful functions and classes
2

  
3
class Obj:
    """Simple attribute container built from keyword arguments."""

    def __init__(self, **attrs):
        # The keyword dict itself becomes the instance's attribute dict
        self.__dict__ = attrs

    def __repr__(self):
        return repr(vars(self))
7 0

  
scripts/data2xml/xpath.py
1
# XPath-based XML tree manipulation
2

  
3
from copy import deepcopy
4
from xml.dom import Node
5

  
6
from Parser import Parser
7
import xml_util
8

  
9
class XpathElem:
    """One step of a parsed XPath: a name plus an optional value, attribute
    sub-paths, and attribute/pointer flags.
    """

    def __init__(self, name, value=None, attrs=None, is_attr=False,
        is_ptr=False):
        """
        name: element or attribute name
        value: optional value; None means no value
        attrs: optional list of sub-paths; defaults to a new empty list
        is_attr: whether this step is an attribute (shown as @name)
        is_ptr: whether this step is a pointer (shown as name->)
        """
        if attrs is None: attrs = []
        self.name = name
        self.value = value
        self.attrs = attrs
        self.is_attr = is_attr
        self.is_ptr = is_ptr

    def __repr__(self):
        str_ = ''
        if self.is_attr: str_ += '@'
        str_ += self.name
        if self.attrs != []: str_ += repr(self.attrs)
        if self.value is not None: str_ += '='+repr(self.value)
        if self.is_ptr: str_ += '->'
        return str_

    def __eq__(self, other):
        """Compare all fields; objects without a __dict__ never compare equal."""
        try: other_fields = other.__dict__
        except AttributeError: return NotImplemented
        return self.__dict__ == other_fields

    def __ne__(self, other):
        # Python 2 does not derive != from ==
        result = self.__eq__(other)
        if result is NotImplemented: return result
        return not result
29

  
30
def value(path):
    """Return the value of the last element of path."""
    last_elem = path[-1]
    return last_elem.value
31

  
32
def set_value(path, value):
    """Set the value of the last element of path."""
    last_elem = path[-1]
    last_elem.value = value
33

  
34
def backward_id(elem):
    """Return elem's first attribute path if that path has no value.

    Returns None when elem has no attributes or the first one has a value.
    """
    if elem.attrs:
        first_attr = elem.attrs[0]
        if value(first_attr) is None: return first_attr
    return None
38

  
39
class XpathParser(Parser):
    """Parser turning an XPath-like string into a list of XpathElem."""

    def _main(self):
        self._match_str('/') # optional leading /
        return self._path()

    def _path(self):
        """Parse one /-separated path and return it as a list of XpathElem.

        Each step is an optional @ (attribute), a name, optional [attr,...]
        sub-paths each with an optional =value, an optional -> (pointer) and
        an optional (/lookahead) whose path is added to the step's attrs and
        also appended to the result. A {a,b} split ends the path; only its
        first alternative is used. Finally, * in names is replaced with the
        name of a neighboring element.
        """
        tree = []
        while True:
            # Split path
            if self._match_str('{'):
                paths = []
                while True:
                    paths.append(tree + self._path())
                    if not self._match_str(','): break
                self._match_str('}', required=True)
                tree = paths[0] # just use first subpath for now
                break # nothing allowed after split path

            # The @ has to be consumed before the name is matched
            is_attr = self._match_str('@')
            name = self._match_re(r'[\w.*]+', required=True)
            elem = XpathElem(name, is_attr=is_attr)

            # Attrs
            if self._match_str('['):
                elem.attrs = []
                while True:
                    path = self._path()
                    if self._match_str('='):
                        set_value(path, self._match_re(r'[\w.|]*'))
                    elem.attrs.append(path)
                    if not self._match_str(','): break
                self._match_str(']', required=True)

            elem.is_ptr = self._match_str('->')
            tree.append(elem)

            # Lookahead assertion
            if self._match_str('('):
                self._match_str('/', required=True) # next / is inside ()
                path = self._path()
                self._match_str(')', required=True)
                elem.attrs.append(path)
                tree += path

            if not self._match_str('/'): break

        # Expand * abbrs
        for elem_idx, elem in enumerate(tree):
            id_ = backward_id(elem)
            if id_ is not None: elem = id_[0]; offset = -2
            elif elem.is_ptr: offset = 2
            else: offset = 1
            before, abbr, after = elem.name.partition('*')
            if abbr != '':
                # NOTE(review): a negative index wraps around to the end of
                # tree instead of raising IndexError -- confirm this is wanted
                try: elem.name = before+tree[elem_idx+offset].name+after
                except IndexError: pass # no replacement elem

        return tree
99

  
100
# Index into a path that obj(), set_id() and get() use to locate the object
# element -- NOTE(review): meaning inferred from those uses, confirm
instance_level = 1
101

  
102
def obj(path):
    """Return a deep copy of the start of path, up to index instance_level.

    The last element of the copy has is_ptr cleared.
    """
    prefix = path[:instance_level+1]
    obj_path = deepcopy(prefix)
    obj_path[-1].is_ptr = False # prevent pointer w/o target
    return obj_path
106

  
107
def set_id(path, id_, has_types=True):
    """Append an @id attribute path with value id_ to one element of path.

    The element at index instance_level is used when has_types is true,
    otherwise the first element.
    """
    id_level = instance_level if has_types else 0
    id_attr = XpathElem('id', id_, is_attr=True)
    path[id_level].attrs.append([id_attr])
111

  
112
def get(doc, path, create=False, last_only=None, parent=None):
    """Find the node that path leads to, optionally creating missing nodes.

    doc: DOM document
    path: list of XpathElem steps, followed starting at parent
    create: create nodes that don't exist instead of returning None
    last_only: only consider the last matching child at each step; defaults
        to the value of create
    parent: node to start from; defaults to the document element
    Returns the node reached, or None if it doesn't exist and create is false.
    """
    # Warning: The last_only optimization may put data that should be together
    # into separate nodes
    if parent is None: parent = doc.documentElement
    if last_only is None: last_only = create
    for elem_idx, elem in enumerate(path):
        # Find possible matches
        children = []
        if elem.is_attr:
            child = parent.getAttributeNode(elem.name)
            if child is not None: children = [child]
        elif elem.name == '.': children = [parent]
        else: children = xml_util.by_tag_name(parent, elem.name, last_only)

        # Check each match
        node = None
        for child in children:
            is_match = (elem.value is None
                or xml_util.value(child) == elem.value)
            for attr in elem.attrs:
                if not is_match: break
                is_match = get(doc, attr, False, last_only, child) is not None
            if is_match: node = child; break

        # Create node
        if node is None:
            if not create: return None
            if elem.is_attr:
                parent.setAttribute(elem.name, '')
                node = parent.getAttributeNode(elem.name)
            else: node = parent.appendChild(doc.createElement(elem.name))
            if elem.value is not None:
                xml_util.set_value(doc, node, elem.value)
            for attr in elem.attrs: get(doc, attr, create, last_only, node)

        # Follow pointer
        if elem.is_ptr:
            # The rest of the path is resolved from the document root by the
            # recursive call below, so this function returns from here
            path = deepcopy(path[elem_idx+1:]) # rest of path
            id_elem = backward_id(path[instance_level])
            if id_elem is not None:
                # backward (child-to-parent) pointer with target ID attr
                set_value(id_elem, xml_util.get_id(node))
            else: # forward (parent-to-child) pointer
                id_ = xml_util.value(node)
                obj_path = obj(path) # target object
                if id_ is None or get(doc, obj_path, False, True) is None:
                    # no target or target attrs don't match
                    if not create: return None

                    # Use last target object's ID + 1
                    obj_path[-1].attrs = [] # just get by tag name
                    last = get(doc, obj_path, False, True)
                    if last is not None:
                        id_ = str(int(xml_util.get_id(last)) + 1)
                    else: id_ = '0'

                    # Will append if target attrs didn't match. Place ! in XPath
                    # after element to fork at to avoid this.
                    xml_util.set_value(doc, node, id_)
                else: last_only = False
                set_id(path, id_)
            return get(doc, path, create, last_only)

        parent = node
    return parent
scripts/data2xml/Parser.py
1
# A general recursive descent parser
2

  
3
import re
4

  
5
class SyntaxException(Exception):
    """Raised by Parser when the input doesn't match what is expected."""
    pass
6

  
7
class Parser:
    """Base class for recursive descent parsers over a string.

    Subclasses implement _main() using the _match_*() helpers, which consume
    input starting at the current position.
    """

    def __init__(self, string):
        self._str = string
        self._pos = 0

    def parse(self):
        """Parse the whole string and return the tree built by _main().

        Raises SyntaxException if _main() leaves input unconsumed.
        """
        tree = self._main()
        if self._pos != len(self._str): self._syntax_err('End of string')
        return tree

    def _main(self):
        """Parse from the current position and return the resulting tree."""
        raise NotImplementedError('Parser subclasses must implement _main()')

    def _match_re(self, pattern, required=False):
        """Consume and return the text matching regex pattern at the current
        position, or return None if there is no match.

        Raises SyntaxException instead of returning None if required is true.
        """
        matcher = re.compile(pattern).match(self._str, self._pos)
        if matcher:
            self._pos = matcher.end(0)
            return matcher.group(0)
        elif required: self._syntax_err(pattern)
        else: return None

    def _match_str(self, string, required=False):
        """Consume string if the input continues with it and return whether
        it did.

        Raises SyntaxException instead of returning False if required is true.
        """
        end_pos = self._pos + len(string)
        if self._str[self._pos:end_pos] == string:
            self._pos = end_pos
            return True
        elif required: self._syntax_err(string)
        else: return False

    def _syntax_err(self, token):
        """Raise SyntaxException naming token and the remaining input."""
        raise SyntaxException(token+' expected in '+self._str[self._pos:])
35 0

  
scripts/data2xml/xml_util.py
1
# XML DOM tree manipulation
2

  
3
from xml.dom import Node
4
import xml.dom.minidom
5

  
6
def name_of(node):
    """Return the element's tag name in lowercase."""
    tag = node.tagName
    return tag.lower()
7

  
8
def get_id(node):
    """Return the value of the node's 'id' attribute."""
    id_value = node.getAttribute('id')
    return id_value
9

  
10
def set_id(node, id_):
    """Set the node's 'id' attribute to id_."""
    attr_name = 'id'
    node.setAttribute(attr_name, id_)
11

  
12
class NodeElemIter:
    """Iterator over the child elements of a node, first child first.

    Children that are not element nodes are skipped.
    """

    def __init__(self, node): self.child = node.firstChild

    def __iter__(self): return self

    def curr(self):
        """Return the current element without moving past it.

        Raises StopIteration when no elements are left.
        """
        while self.child is not None:
            if self.child.nodeType == Node.ELEMENT_NODE: return self.child
            self.child = self.child.nextSibling
        raise StopIteration

    def next(self):
        """Return the current element and advance to its next sibling."""
        child = self.curr()
        self.child = self.child.nextSibling
        return child

    __next__ = next # Python 3 iterator protocol
27

  
28
def first_elem(node):
    """Return the first child element of node.

    Raises StopIteration if node has no child elements.
    """
    elems = NodeElemIter(node)
    return elems.next()
29

  
30
class NodeElemReverseIter:
    """Iterator over the child elements of a node, last child first.

    Children that are not element nodes are skipped.
    """

    def __init__(self, node): self.child = node.lastChild

    def __iter__(self): return self

    def curr(self):
        """Return the current element without moving past it.

        Raises StopIteration when no elements are left.
        """
        while self.child is not None:
            if self.child.nodeType == Node.ELEMENT_NODE: return self.child
            self.child = self.child.previousSibling
        raise StopIteration

    def next(self):
        """Return the current element and step back to its previous sibling."""
        child = self.curr()
        self.child = self.child.previousSibling
        return child

    __next__ = next # Python 3 iterator protocol
45

  
46
def last_elem(node):
    """Return the last child element of node.

    Raises StopIteration if node has no child elements.
    """
    elems = NodeElemReverseIter(node)
    return elems.next()
47

  
48
class NodeParentIter:
    """Iterator over a node and its ancestors, starting with the node itself.

    Stops at the first node that is missing or is not an element.
    """

    def __init__(self, node): self.node = node

    def __iter__(self): return self

    def curr(self):
        """Return the current node, or raise StopIteration at the end."""
        if self.node is not None and self.node.nodeType == Node.ELEMENT_NODE:
            return self.node
        raise StopIteration

    def next(self):
        """Return the current node and move on to its parent."""
        node = self.curr()
        self.node = self.node.parentNode
        return node

    __next__ = next # Python 3 iterator protocol
62

  
63
def is_text(node):
    """Tell whether node has no child elements."""
    for _elem in NodeElemIter(node):
        return False # found an element child
    return True
66

  
67
def value(node):
    """Return the node's value.

    If node has children, this is the nodeValue of its first child; otherwise
    it is the node's own nodeValue.
    """
    first = node.firstChild
    if first is not None: return first.nodeValue
    return node.nodeValue
70

  
71
def set_value(doc, node, value):
    """Set the value of node.

    For an element, a new text node holding value is appended to it; for any
    other node, value is assigned to its nodeValue.
    """
    if node.nodeType != Node.ELEMENT_NODE:
        node.nodeValue = value
        return
    node.appendChild(doc.createTextNode(value))
75

  
76
def by_tag_name(node, name, last_only=False):
    """Return the child elements of node whose tagName equals name.

    Matches are listed last child first.
    last_only: stop at the first match found, i.e. return at most the last
        (most recently inserted) matching child
    """
    matches = []
    for elem in NodeElemReverseIter(node):
        if elem.tagName != name: continue
        matches.append(elem)
        if last_only: break
    return matches
84

  
85
# Keep the stock implementation for elements that aren't a single text node
_writexml_orig = xml.dom.minidom.Element.writexml

def _writexml(self, writer, indent="", addindent="", newl=""):
    """Replacement for Element.writexml() that keeps text-only elements on
    one line.

    An element whose only child is a text node is written as
    <tag attrs>text</tag> followed by newl; any other element is delegated to
    the original minidom implementation.
    """
    # Imported here so the module's import block is left untouched
    from xml.sax.saxutils import escape, quoteattr

    if self.firstChild is not None and self.firstChild.nextSibling is None\
    and self.firstChild.nodeType == Node.TEXT_NODE: # a single text node
        writer.write(indent+'<'+self.tagName)
        for attr_idx in range(self.attributes.length):
            attr = self.attributes.item(attr_idx)
            # Quote and escape the value so the output is well-formed XML
            writer.write(' '+attr.name+'='+quoteattr(attr.value))
        # Markup characters in the text must be escaped as well
        writer.write('>'+escape(self.firstChild.nodeValue)+'</'+self.tagName
            +'>'+newl)
    else: _writexml_orig(self, writer, indent, addindent, newl)

# Install the replacement for every minidom Element
xml.dom.minidom.Element.writexml = _writexml
scripts/lib/ex_util.py
1
# Exception handling
2

  
3
def add_msg(ex, msg):
    """Append msg on a new line to the message of exception ex.

    Replaces ex.args with a single string: the current str(ex) without
    trailing whitespace, a newline, then msg.
    """
    existing = str(ex).rstrip()
    ex.args = (existing+'\n'+msg,)
4

  
5
class ExceptionWithCause(Exception):
    """Exception whose message can include the text of a causing exception."""

    def __init__(self, msg, cause=None):
        """
        msg: error message
        cause: optional exception; 'cause: '+str(cause) is appended to msg
        """
        Exception.__init__(self, msg)
        if cause is not None: add_msg(self, 'cause: '+str(cause))
scripts/lib/Parser.py
1
# A general recursive descent parser
2

  
3
import re
4

  
5
class SyntaxException(Exception):
    """Raised by Parser when the input doesn't match what is expected."""
    pass
6

  
7
class Parser:
    """Base class for recursive descent parsers over a string.

    Subclasses implement _main() using the _match_*() helpers, which consume
    input starting at the current position.
    """

    def __init__(self, string):
        self._str = string
        self._pos = 0

    def parse(self):
        """Parse the whole string and return the tree built by _main().

        Raises SyntaxException if _main() leaves input unconsumed.
        """
        tree = self._main()
        if self._pos != len(self._str): self._syntax_err('End of string')
        return tree

    def _main(self):
        """Parse from the current position and return the resulting tree."""
        raise NotImplementedError('Parser subclasses must implement _main()')

    def _match_re(self, pattern, required=False):
        """Consume and return the text matching regex pattern at the current
        position, or return None if there is no match.

        Raises SyntaxException instead of returning None if required is true.
        """
        matcher = re.compile(pattern).match(self._str, self._pos)
        if matcher:
            self._pos = matcher.end(0)
            return matcher.group(0)
        elif required: self._syntax_err(pattern)
        else: return None

    def _match_str(self, string, required=False):
        """Consume string if the input continues with it and return whether
        it did.

        Raises SyntaxException instead of returning False if required is true.
        """
        end_pos = self._pos + len(string)
        if self._str[self._pos:end_pos] == string:
            self._pos = end_pos
            return True
        elif required: self._syntax_err(string)
        else: return False

    def _syntax_err(self, token):
        """Raise SyntaxException naming token and the remaining input."""
        raise SyntaxException(token+' expected in '+self._str[self._pos:])
0 35

  
scripts/lib/util.py
1
# Useful functions and classes
2

  
3
class Obj:
    """Simple attribute container built from keyword arguments."""

    def __init__(self, **attrs):
        # The keyword dict itself becomes the instance's attribute dict
        self.__dict__ = attrs

    def __repr__(self):
        return repr(vars(self))
0 7

  
scripts/lib/xml_util.py
1
# XML DOM tree manipulation
2

  
3
from xml.dom import Node
4
import xml.dom.minidom
5

  
6
def name_of(node):
    """Return the element's tag name in lowercase."""
    tag = node.tagName
    return tag.lower()
7

  
8
def get_id(node):
    """Return the value of the node's 'id' attribute."""
    id_value = node.getAttribute('id')
    return id_value
9

  
10
def set_id(node, id_):
    """Set the node's 'id' attribute to id_."""
    attr_name = 'id'
    node.setAttribute(attr_name, id_)
11

  
12
class NodeElemIter:
    """Iterator over the child elements of a node, first child first.

    Children that are not element nodes are skipped.
    """

    def __init__(self, node): self.child = node.firstChild

    def __iter__(self): return self

    def curr(self):
        """Return the current element without moving past it.

        Raises StopIteration when no elements are left.
        """
        while self.child is not None:
            if self.child.nodeType == Node.ELEMENT_NODE: return self.child
            self.child = self.child.nextSibling
        raise StopIteration

    def next(self):
        """Return the current element and advance to its next sibling."""
        child = self.curr()
        self.child = self.child.nextSibling
        return child

    __next__ = next # Python 3 iterator protocol
27

  
28
def first_elem(node):
    """Return the first child element of node.

    Raises StopIteration if node has no child elements.
    """
    elems = NodeElemIter(node)
    return elems.next()
29

  
30
class NodeElemReverseIter:
    """Iterator over the child elements of a node, last child first.

    Children that are not element nodes are skipped.
    """

    def __init__(self, node): self.child = node.lastChild

    def __iter__(self): return self

    def curr(self):
        """Return the current element without moving past it.

        Raises StopIteration when no elements are left.
        """
        while self.child is not None:
            if self.child.nodeType == Node.ELEMENT_NODE: return self.child
            self.child = self.child.previousSibling
        raise StopIteration

    def next(self):
        """Return the current element and step back to its previous sibling."""
        child = self.curr()
        self.child = self.child.previousSibling
        return child

    __next__ = next # Python 3 iterator protocol
45

  
46
def last_elem(node):
    """Return the last child element of node.

    Raises StopIteration if node has no child elements.
    """
    elems = NodeElemReverseIter(node)
    return elems.next()
47

  
48
class NodeParentIter:
    """Iterator over a node and its ancestors, starting with the node itself.

    Stops at the first node that is missing or is not an element.
    """

    def __init__(self, node): self.node = node

    def __iter__(self): return self

    def curr(self):
        """Return the current node, or raise StopIteration at the end."""
        if self.node is not None and self.node.nodeType == Node.ELEMENT_NODE:
            return self.node
        raise StopIteration

    def next(self):
        """Return the current node and move on to its parent."""
        node = self.curr()
        self.node = self.node.parentNode
        return node

    __next__ = next # Python 3 iterator protocol
62

  
63
def is_text(node):
    """Tell whether node has no child elements."""
    for _elem in NodeElemIter(node):
        return False # found an element child
    return True
66

  
67
def value(node):
    """Return the node's value.

    If node has children, this is the nodeValue of its first child; otherwise
    it is the node's own nodeValue.
    """
    first = node.firstChild
    if first is not None: return first.nodeValue
    return node.nodeValue
70

  
71
def set_value(doc, node, value):
    """Set the value of node.

    For an element, a new text node holding value is appended to it; for any
    other node, value is assigned to its nodeValue.
    """
    if node.nodeType != Node.ELEMENT_NODE:
        node.nodeValue = value
        return
    node.appendChild(doc.createTextNode(value))
75

  
76
def by_tag_name(node, name, last_only=False):
    """Return the child elements of node whose tagName equals name.

    Matches are listed last child first.
    last_only: stop at the first match found, i.e. return at most the last
        (most recently inserted) matching child
    """
    matches = []
    for elem in NodeElemReverseIter(node):
        if elem.tagName != name: continue
        matches.append(elem)
        if last_only: break
    return matches
84

  
85
# Keep the stock implementation for elements that aren't a single text node
_writexml_orig = xml.dom.minidom.Element.writexml

def _writexml(self, writer, indent="", addindent="", newl=""):
    """Replacement for Element.writexml() that keeps text-only elements on
    one line.

    An element whose only child is a text node is written as
    <tag attrs>text</tag> followed by newl; any other element is delegated to
    the original minidom implementation.
    """
    # Imported here so the module's import block is left untouched
    from xml.sax.saxutils import escape, quoteattr

    if self.firstChild is not None and self.firstChild.nextSibling is None\
    and self.firstChild.nodeType == Node.TEXT_NODE: # a single text node
        writer.write(indent+'<'+self.tagName)
        for attr_idx in range(self.attributes.length):
            attr = self.attributes.item(attr_idx)
            # Quote and escape the value so the output is well-formed XML
            writer.write(' '+attr.name+'='+quoteattr(attr.value))
        # Markup characters in the text must be escaped as well
        writer.write('>'+escape(self.firstChild.nodeValue)+'</'+self.tagName
            +'>'+newl)
    else: _writexml_orig(self, writer, indent, addindent, newl)

# Install the replacement for every minidom Element
xml.dom.minidom.Element.writexml = _writexml
scripts/lib/db_util.py
1
# Database access
2

  
3
import random
4
import re
5
import sys
6

  
7
import ex_util
8

  
9
def _add_cursor_info(ex, cur):
    """Append the cursor's query text to the message of exception ex."""
    query_note = 'query: '+cur.query
    ex_util.add_msg(ex, query_note)
10

  
11
class NameException(Exception):
    """Raised by check_name() for a name containing disallowed characters."""
    pass
12

  
13
class DbException(ex_util.ExceptionWithCause):
    """Database error that can carry a cause and the query of a cursor."""

    def __init__(self, msg, cause=None, cur=None):
        """
        msg: error message
        cause: optional underlying exception, passed to ExceptionWithCause
        cur: optional cursor whose query is appended to the message
        """
        ex_util.ExceptionWithCause.__init__(self, msg, cause)
        if cur is not None: _add_cursor_info(self, cur)
17

  
18
class ExceptionWithColumn(DbException):
    """Database error tied to a column, whose name is kept in self.col."""

    def __init__(self, col, cause=None):
        message = 'column: '+col
        DbException.__init__(self, message, cause)
        self.col = col
22

  
23
class DuplicateKeyException(ExceptionWithColumn):
    """Raised by try_insert() when a unique constraint is violated."""
    pass
24

  
25
class NullValueException(ExceptionWithColumn):
    """Raised by try_insert() when a not-null constraint is violated."""
    pass
26

  
27
def check_name(name):
    """Raise NameException unless name consists only of word characters.

    Used to validate identifiers before they are concatenated into SQL.
    """
    if re.search(r'\W', name) is not None:
        raise NameException('Name "'+name
            +'" may contain only alphanumeric characters and _')
30

  
31
def run_query(db, query, params=None):
    """Execute query on a new cursor of db and return that cursor.

    params is passed to cursor.execute() as given. If execution fails, the
    query text is appended to the exception's message before it is re-raised.
    """
    cur = db.cursor()
    try: cur.execute(query, params)
    except Exception as ex: # "as" is valid on Python 2.6+ and 3
        _add_cursor_info(ex, cur)
        raise
    return cur
38

  
39
def col(cur, idx):
    """Return the first item of the cursor's description entry at idx."""
    column_desc = cur.description[idx]
    return column_desc[0]
40

  
41
def row(cur):
    """Return the next row of the cursor.

    Raises StopIteration when fetchone() returns None.
    """
    # next() builtin instead of the Python 2-only .next() method
    return next(iter(cur.fetchone, None))
42

  
43
def value(cur):
    """Return the first column of the cursor's next row."""
    next_row = row(cur)
    return next_row[0]
44

  
45
def with_savepoint(db, func):
    """Call func() inside a savepoint and return its result.

    The savepoint is released when func() succeeds. If func() raises, the
    transaction is rolled back to the savepoint and the exception propagates.
    """
    # sys.maxsize exists on Python 2.6+ and 3, unlike sys.maxint
    savepoint = 'savepoint_'+str(random.randint(0, sys.maxsize)) # must be unique
    run_query(db, 'SAVEPOINT '+savepoint)
    try: return_val = func()
    except: # deliberately bare: roll back on any error, then re-raise
        run_query(db, 'ROLLBACK TO SAVEPOINT '+savepoint)
        raise
    else:
        run_query(db, 'RELEASE SAVEPOINT '+savepoint)
        return return_val
55

  
56
def select(db, table, fields, conds):
    """Select fields from the rows of table that satisfy every condition.

    table: table name
    fields: list of column names to select
    conds: dict of column name -> required value; None is compared with IS
    Returns the cursor of the executed query.
    Raises NameException (via check_name()) for an invalid identifier.
    """
    check_name(table)
    # Explicit loops: map() is lazy on Python 3 and would skip the validation
    for field in fields: check_name(field)
    for cond_col in conds: check_name(cond_col)

    cols = list(conds) # one fixed order for both the SQL and the params
    def cond(col):
        if conds[col] is None: op = 'IS'
        else: op = '='
        return col+' '+op+' %s'
    return run_query(db, 'SELECT '+', '.join(fields)+' FROM '+table+' WHERE '
        +' AND '.join([cond(col) for col in cols]),
        [conds[col] for col in cols])
69

  
70
def insert(db, table, row):
    """Insert row (a dict of column name -> value) into table.

    Returns the cursor of the executed INSERT.
    Raises NameException (via check_name()) for an invalid identifier.
    """
    check_name(table)
    cols = list(row) # one fixed order for both the column list and values
    # Explicit loop: map() is lazy on Python 3 and would skip the validation
    for col_name in cols: check_name(col_name)
    return run_query(db, 'INSERT INTO '+table+' ('+', '.join(cols)
        +') VALUES ('+', '.join(['%s']*len(cols))+')',
        [row[col_name] for col_name in cols])
76

  
77
def last_insert_id(db):
    """Return the result of running SELECT lastval() on db."""
    cur = run_query(db, 'SELECT lastval()')
    return value(cur)
78

  
79
def try_insert(db, table, row):
    """Insert row into table within a savepoint, translating known errors.

    Returns the cursor of the INSERT on success.
    Raises DuplicateKeyException or NullValueException (carrying the column
    name taken from the error message) when the message matches the
    corresponding pattern; any other exception is re-raised unchanged.
    """
    try: return with_savepoint(db, lambda: insert(db, table, row))
    except Exception as ex: # "as" is valid on Python 2.6+ and 3
        msg = str(ex)
        # Raw strings throughout so \w is never read as a string escape, and
        # the table name is escaped so it can't alter the pattern
        match = re.search(r'duplicate key value violates unique constraint "'
            +re.escape(table)+r'_(\w+)_index"', msg)
        if match: raise DuplicateKeyException(match.group(1), ex)
        match = re.search(r'null value in column "(\w+)" violates not-null '
            r'constraint', msg)
        if match: raise NullValueException(match.group(1), ex)
        raise # no specific exception raised
90

  
91
def pkey(db, table): # Assumed to be first column in table
    """Return the name of table's first column, taken to be its pkey."""
    check_name(table)
    cur = run_query(db, 'SELECT * FROM '+table+' LIMIT 0')
    return col(cur, 0)
94

  
95
def get(db, table, row, pkey, create=False, row_ct_ref=None):
    """Return the pkey of the row in table matching row, optionally creating it.

    row: dict of column name -> value
    pkey: name of the primary key column to return
    create: insert the row if it's missing, instead of raising
    row_ct_ref: optional one-element list; the inserted row count is added
        to its first item
    Raises StopIteration if no row matches and create is false.
    """
    try: return value(select(db, table, [pkey], row))
    except StopIteration:
        if not create: raise
        # Insert new row
        try:
            row_ct = try_insert(db, table, row).rowcount
            if row_ct_ref is not None and row_ct >= 0:
                row_ct_ref[0] += row_ct
            return last_insert_id(db)
        except DuplicateKeyException as ex: # valid on Python 2.6+ and 3
            # Look the row up by the value of the column that collided
            return value(select(db, table, [pkey], {ex.col: row[ex.col]}))
scripts/lib/xpath.py
1
# XPath-based XML tree manipulation
2

  
3
from copy import deepcopy
4
from xml.dom import Node
5

  
6
from Parser import Parser
7
import xml_util
8

  
9
class XpathElem:
    """One step of a parsed XPath: a name plus an optional value, attribute
    sub-paths, and attribute/pointer flags.
    """

    def __init__(self, name, value=None, attrs=None, is_attr=False,
        is_ptr=False):
        """
        name: element or attribute name
        value: optional value; None means no value
        attrs: optional list of sub-paths; defaults to a new empty list
        is_attr: whether this step is an attribute (shown as @name)
        is_ptr: whether this step is a pointer (shown as name->)
        """
        if attrs is None: attrs = []
        self.name = name
        self.value = value
        self.attrs = attrs
        self.is_attr = is_attr
        self.is_ptr = is_ptr

    def __repr__(self):
        str_ = ''
        if self.is_attr: str_ += '@'
        str_ += self.name
        if self.attrs != []: str_ += repr(self.attrs)
        if self.value is not None: str_ += '='+repr(self.value)
        if self.is_ptr: str_ += '->'
        return str_

    def __eq__(self, other):
        """Compare all fields; objects without a __dict__ never compare equal."""
        try: other_fields = other.__dict__
        except AttributeError: return NotImplemented
        return self.__dict__ == other_fields

    def __ne__(self, other):
        # Python 2 does not derive != from ==
        result = self.__eq__(other)
        if result is NotImplemented: return result
        return not result
29

  
30
def value(path):
    # The value of a path is the value of its last element
    last_elem = path[-1]
    return last_elem.value
31

  
32
def set_value(path, value):
    # Setting a path's value sets the value of its last element
    last_elem = path[-1]
    last_elem.value = value
33

  
34
def backward_id(elem):
    # Returns elem's first attr (a path) if that attr has no value, else None.
    # get() treats such an attr as the target ID attr of a backward
    # (child-to-parent) pointer.
    if len(elem.attrs) >= 1 and value(elem.attrs[0]) is None:
        return elem.attrs[0]
    return None
38

  
39
class XpathParser(Parser):
    # Parses an XPath string into a path, i.e. a list of XpathElems
    def _main(self):
        self._match_str('/') # optional leading /
        return self._path()
    
    def _path(self):
        # Parses /-separated elements into a list. Each element may have
        # [attrs], a -> pointer marker and a (/lookahead) sub-path. A {a,b}
        # split path ends the path.
        tree = []
        while True:
            # Split path
            if self._match_str('{'):
                paths = []
                while True:
                    paths.append(tree + self._path())
                    if not self._match_str(','): break
                self._match_str('}', required=True)
                tree = paths[0] # just use first subpath for now
                break # nothing allowed after split path
            
            # The @ must be matched before the name, so keep this argument
            # order
            elem = XpathElem(is_attr=self._match_str('@'),
                name=self._match_re(r'[\w.*]+', required=True))
            
            # Attrs
            if self._match_str('['):
                elem.attrs = []
                while True:
                    path = self._path()
                    if self._match_str('='):
                        set_value(path, self._match_re(r'[\w.|]*'))
                    elem.attrs.append(path)
                    if not self._match_str(','): break
                self._match_str(']', required=True)
            
            elem.is_ptr = self._match_str('->')
            tree.append(elem)
            
            # Lookahead assertion: the sub-path becomes both an attr of the
            # current element and the continuation of the path
            if self._match_str('('):
                self._match_str('/', required=True) # next / is inside ()
                path = self._path()
                self._match_str(')', required=True)
                elem.attrs.append(path)
                tree += path
            
            if not self._match_str('/'): break
        
        # Expand * abbrs: a * in a name is replaced by the name of a nearby
        # element -- 2 back for a backward ID attr, 2 ahead for a pointer,
        # otherwise the next element
        for elem_idx, elem in enumerate(tree):
            id_ = backward_id(elem)
            if id_ is not None: elem = id_[0]; offset = -2
            elif elem.is_ptr: offset = 2
            else: offset = 1
            before, abbr, after = elem.name.partition('*')
            if abbr != '':
                target_idx = elem_idx + offset
                # Out of range in either direction means no replacement elem.
                # A negative index must not wrap around to the end of the
                # list.
                if 0 <= target_idx < len(tree):
                    elem.name = before+tree[target_idx].name+after
        
        return tree
99

  
100
# Index within a path of the element that is an object instance: obj() keeps
# the path up to and including this index, and set_id() adds the ID attr at
# this index (presumably index 0 is a container/type element -- TODO confirm)
instance_level = 1
101

  
102
def obj(path):
    # Returns an independent copy of the leading part of `path`, up to and
    # including the element at instance_level
    head = path[:instance_level+1]
    copied = deepcopy(head)
    copied[-1].is_ptr = False # prevent pointer w/o target
    return copied
106

  
107
def set_id(path, id_, has_types=True):
    # Adds an @id=id_ attr to the element of `path` at instance_level, or to
    # the first element if has_types is false
    id_level = instance_level if has_types else 0
    id_attr = [XpathElem('id', id_, is_attr=True)] # an attr is a path
    path[id_level].attrs.append(id_attr)
111

  
112
def get(doc, path, create=False, last_only=None, parent=None):
    # Returns the node that `path` leads to, starting at `parent` (default:
    # the document element), or None if a step has no match and `create` is
    # false. With `create`, missing nodes are created along the way.
    # last_only defaults to `create` and is passed to xml_util.by_tag_name().
    # Warning: The last_only optimization may put data that should be together
    # into separate nodes
    if parent is None: parent = doc.documentElement
    if last_only is None: last_only = create
    for elem_idx, elem in enumerate(path):
        # Find possible matches
        children = []
        if elem.is_attr:
            child = parent.getAttributeNode(elem.name)
            if child is not None: children = [child]
        elif elem.name == '.': children = [parent]
        else: children = xml_util.by_tag_name(parent, elem.name, last_only)
        
        # Check each match: the value (if any) and every attr path must match
        node = None
        for child in children:
            is_match = (elem.value is None
                or xml_util.value(child) == elem.value)
            for attr in elem.attrs:
                if not is_match: break
                is_match = get(doc, attr, False, last_only, child) is not None
            if is_match: node = child; break
        
        # Create node
        if node is None:
            if not create: return None
            if elem.is_attr:
                parent.setAttribute(elem.name, '')
                node = parent.getAttributeNode(elem.name)
            else: node = parent.appendChild(doc.createElement(elem.name))
            if elem.value is not None:
                xml_util.set_value(doc, node, elem.value)
            for attr in elem.attrs: get(doc, attr, create, last_only, node)
        
        # Follow pointer
        if elem.is_ptr:
            path = deepcopy(path[elem_idx+1:]) # rest of path
            id_elem = backward_id(path[instance_level])
            if id_elem is not None:
                # backward (child-to-parent) pointer with target ID attr
                set_value(id_elem, xml_util.get_id(node))
            else: # forward (parent-to-child) pointer
                id_ = xml_util.value(node)
                obj_path = obj(path) # target object
                if id_ is None or get(doc, obj_path, False, True) is None:
                    # no target or target attrs don't match
                    if not create: return None
                    
                    # Use last target object's ID + 1
                    obj_path[-1].attrs = [] # just get by tag name
                    last = get(doc, obj_path, False, True)
                    if last is not None:
                        id_ = str(int(xml_util.get_id(last)) + 1)
                    else: id_ = '0'
                    
                    # Will append if target attrs didn't match. Place ! in XPath
                    # after element to fork at to avoid this.
                    xml_util.set_value(doc, node, id_)
                else: last_only = False
                set_id(path, id_)
            # The rest of the path is resolved from the document element
            return get(doc, path, create, last_only)
        
        parent = node
    return parent
scripts/lib/xml_db.py
1
# XML-database conversion
2

  
3
import re
4
from xml.dom import Node
5

  
6
import db_util
7
import xml_util
8

  
9
def name_of(node):
    # The node's name with everything up to and including the last "."
    # removed
    full_name = xml_util.name_of(node)
    return re.sub(r'^.*\.', r'', full_name)
10

  
11
# Suffix that marks a node name as a pointer (see is_ptr()); the part of the
# name before it is the pointer's type (see ptr_type())
ptr_suffix = '_id'
12

  
13
def is_ptr(node_name):
    # A node name denotes a pointer if it ends in ptr_suffix
    has_ptr_suffix = node_name.endswith(ptr_suffix)
    return has_ptr_suffix
14

  
15
def ptr_type(node_name):
    # The type a pointer refers to: the pointer's name minus ptr_suffix
    assert is_ptr(node_name)
    suffix_len = len(ptr_suffix)
    return node_name[:-suffix_len]
18

  
19
def ptr_target(node):
    # The target of a pointer node is its first element
    node_name = name_of(node)
    assert is_ptr(node_name)
    return xml_util.first_elem(node)
22

  
23
def find_by_name(node, name):
    # Walks the nodes yielded by xml_util.NodeParentIter(node) and returns the
    # first one named `name`, or else the first of its child elements that is
    # named `name` or is a pointer of type `name` (in which case the pointer's
    # target is returned). Returns None if nothing is found.
    for ancestor in xml_util.NodeParentIter(node):
        if name_of(ancestor) == name: return ancestor
        for candidate in xml_util.NodeElemIter(ancestor):
            candidate_name = name_of(candidate)
            if not is_ptr(candidate_name):
                if candidate_name == name: return candidate
                continue
            if ptr_type(candidate_name) == name: return ptr_target(candidate)
    return None
33

  
34
def get(db, node, create=False, store_ids=False, row_ct_ref=None, pkeys=None):
    # Looks up (or, with `create`, inserts) the database row for `node` and,
    # recursively, for its pointer targets and child objects. Returns the ID
    # of node's row.
    # store_ids enables searching the tree for missing fields
    # row_ct_ref is passed through to db_util.get()
    # pkeys: optional dict caching each table's primary key column
    if pkeys is None: pkeys = {}
    def pkey(table):
        if table not in pkeys: pkeys[table] = db_util.pkey(db, table)
        return pkeys[table]
    
    def obj(node, parent_id=None):
        table = name_of(node)
        pkey_ = pkey(table)
        row = {}
        children = []
        
        # Divide children into fields and children with fkeys to parent
        for child in xml_util.NodeElemIter(node):
            child_name = name_of(child)
            if xml_util.is_text(child): row[child_name] = xml_util.value(child)
            elif is_ptr(child_name): row[child_name] = obj(ptr_target(child))
            else: children.append(child)
        row.pop(pkey_, None) # the pkey itself is never part of the lookup
        
        # Add fkey to parent
        if parent_id is not None:
            row[pkey(name_of(node.parentNode))] = parent_id
        
        # Insert node
        for try_num in range(2):
            try:
                id_ = db_util.get(db, table, row, pkey_, create, row_ct_ref)
                if store_ids: xml_util.set_id(node, id_)
                break
            except db_util.NullValueException as ex:
                if try_num > 0: raise # exception still raised after retry
                # Only pointer columns can be filled in from the tree.
                # Re-raise the original error for any other column instead of
                # tripping the assert in ptr_type().
                if not is_ptr(ex.col): raise
                # Search for required column in ancestors and their children
                target = find_by_name(node, ptr_type(ex.col))
                if target is None: raise
                row[ex.col] = xml_util.get_id(target)
        
        # Insert children with fkeys to parent
        for child in children: obj(child, id_)
        
        return id_
    
    return obj(node)
78

  
79
def xml2db(db, node, row_ct_ref=None):
    # Stores each non-text child element of `node` in the database, creating
    # rows as needed and storing their IDs on the nodes
    for child in xml_util.NodeElemIter(node):
        if xml_util.is_text(child): continue # XML metadata
        get(db, child, True, True, row_ct_ref)
scripts/xml2db/xml2db
3 3
# Format: see http://vegbank.org/vegdocs/xml/vegbank_example_ver1.0.2.xml
4 4

  
5 5
import os
6
import os.path
6 7
import psycopg2
7 8
from psycopg2.extensions import ISOLATION_LEVEL_SERIALIZABLE
8 9
import sys
9 10
import xml.dom.minidom
10 11

  
12
# Make the shared modules in ../lib importable. abspath() is needed because
# dirname(__file__) is '' when the script is run by bare file name, which
# would otherwise yield the path "/../lib".
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..",
    "lib"))
11 13
import xml_db
12 14

  
13 15
def env_flag(name):
    # A flag is on when the environment variable is set to a non-empty string
    setting = os.environ.get(name, '')
    return setting != ''
scripts/data2xml/data2xml
2 2
# Converts a CSV dataset to XML using a mappings spreadsheet
3 3

  
4 4
import csv
5
import os.path
5 6
import re
6 7
import sys
7 8
from copy import deepcopy
8 9
from xml.dom.minidom import getDOMImplementation
9 10

  
11
# Make the shared modules in ../lib importable. abspath() is needed because
# dirname(__file__) is '' when the script is run by bare file name, which
# would otherwise yield the path "/../lib".
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..",
    "lib"))
10 12
import xpath
11 13

  
12 14
def main():

Also available in: Unified diff