Project

General

Profile

« Previous | Next » 

Revision 13

Initial version of xml2db. Doesn't yet handle all duplicate rows correctly.

View differences:

scripts/xml2db/test
6 6
export host= user=postgres database=vegbank
7 7
if test -z "${password+_}"
8 8
then
9
    echo "Password can also be specified with: export password=..."
9 10
    read -s -p "Password for $user@$host: "; echo
10 11
    export password="$REPLY"
11 12
fi
scripts/xml2db/ex_util.py
1 1
# Exception handling
2 2

  
3 3
def add_msg(ex, msg): ex.args = (str(ex).rstrip()+'\n'+msg,)
4

  
5
class ExceptionWithCause(Exception):
6
    def __init__(self, msg, cause=None):
7
        Exception.__init__(self, msg)
8
        if cause != None: add_msg(self, 'cause: '+str(cause))
scripts/xml2db/xml_util.py
2 2

  
3 3
from xml.dom import Node
4 4

  
5
def name_of(node): return node.tagName.lower()
6

  
7
def get_id(node): return node.getAttribute('id')
8

  
9
def set_id(node, id_): node.setAttribute('id', id_)
10

  
5 11
class NodeElemIter:
6 12
    def __init__(self, node): self.child = node.firstChild
7 13
    
......
20 26

  
21 27
def first_elem(node): return NodeElemIter(node).next()
22 28

  
29
class NodeParentIter:
30
    def __init__(self, node): self.node = node
31
    
32
    def __iter__(self): return self
33
    
34
    def curr(self):
35
        if self.node != None and self.node.nodeType == Node.ELEMENT_NODE:
36
            return self.node
37
        raise StopIteration
38
    
39
    def next(self):
40
        node = self.curr()
41
        self.node = self.node.parentNode
42
        return node
43

  
23 44
def is_text(node):
24 45
    for child in NodeElemIter(node): return False # has an element node
25 46
    return True
......
27 48
def value(node):
28 49
    if node.firstChild != None: return node.firstChild.nodeValue
29 50
    else: return None
30

  
31
def get_id(node): return node.getAttribute('id')
32

  
33
def set_id(node, get_id): node.setAttribute('id', get_id)
scripts/xml2db/xml2db
8 8
import sys
9 9
import xml.dom.minidom
10 10

  
11
import db_util
12
import xml_util
11
import xml_db
13 12

  
14
def table_name(node): return node.tagName.lower()
15

  
16
def is_pointer(node): return node.tagName.lower().endswith('_id')
17

  
18
def pkey_name(node): return table_name(node)+'_id'
19

  
20
def is_field(node): return node.tagName.find('.') >= 0
21

  
22
def field_name(node): return node.tagName.partition('.')[2].lower()
23

  
24
def is_db_export(node): return node.tagName.find('.') >= 1 # has prefix
25

  
26
class Importer:
27
    def __init__(self, db):
28
        self.db = db
29
    
30
    def process(self, node):
31
        self._main(node)
32
    
33
    def _main(self, node):
34
        for child in xml_util.NodeElemIter(node):
35
            if not xml_util.is_text(child): self._obj(child) # not XML metadata
36
    
37
    def _obj(self, node):
38
        table = table_name(node)
39
        pkey = pkey_name(node)
40
        db_util.check_name(table)
41
        row = {}
42
        children = []
43
        iter_ = xml_util.NodeElemIter(node)
44
        
45
        # Skip any pkey
46
        child = iter_.curr() # first child
47
        if is_db_export(child) and is_pointer(child)\
48
        or field_name(child) == pkey: iter_.next()
49
        
50
        for child in iter_:
51
            if is_field(child): row.update([self._field(child)])
52
            else: children.append(child)
53
        
54
        # Add fkey to parent
55
        parent_id = xml_util.get_id(node.parentNode)
56
        if parent_id != '': row[pkey_name(node.parentNode)] = parent_id
57
        
58
        xml_util.set_id(node, db_util.insert_or_get(self.db, table, row, pkey))
59
        
60
        # Add children with fkeys to parent
61
        for child in children: self._obj(child)
62
    
63
    def _field(self, node):
64
        if xml_util.is_text(node): value = xml_util.value(node)
65
        else:
66
            child = xml_util.first_elem(node)
67
            self._obj(child)
68
            value = xml_util.get_id(child)
69
        return (field_name(node), value)
70
    
71
    def _pointer(self, node): return None
72

  
73 13
def env_flag(name): return name in os.environ and os.environ[name] != ''
74 14

  
75 15
def main():
......
87 27
    db.set_isolation_level(ISOLATION_LEVEL_SERIALIZABLE)
88 28
    try:
89 29
        doc = xml.dom.minidom.parse(sys.stdin)
90
        Importer(db).process(doc.documentElement)
30
        print 'Inserted '+str(xml_db.xml2db(doc.documentElement, db))+' rows'
91 31
        if commit: db.commit()
92 32
    finally:
93 33
        db.rollback()
scripts/xml2db/db_util.py
8 8

  
9 9
class NameException(Exception): pass
10 10

  
11
class DuplicateKeyException(Exception): pass
11
class ExceptionWithColumn(ex_util.ExceptionWithCause):
12
    def __init__(self, col, cause=None):
13
        ex_util.ExceptionWithCause.__init__(self, 'column: '+col, cause)
14
        self.col = col
12 15

  
16
class DuplicateKeyException(ExceptionWithColumn): pass
17

  
18
class NullValueException(ExceptionWithColumn): pass
19

  
13 20
def check_name(name):
14 21
    if re.search(r'\W', name) != None: raise NameException('Name "'+name
15 22
        +'" may contain only alphanumeric characters and _')
......
39 46

  
40 47
def select(db, table, fields, conds):
41 48
    for field in fields: check_name(field)
42
    for key in conds.keys(): check_name(key)
49
    for col in conds.keys(): check_name(col)
43 50
    def cond(entry):
44
        key, value = entry
45
        cond_ = key+' '
51
        col, value = entry
52
        cond_ = col+' '
46 53
        if value == None: cond_ += 'IS'
47 54
        else: cond_ += '='
48 55
        cond_ += ' %s'
......
50 57
    return run_query(db, 'SELECT '+', '.join(fields)+' FROM '+table+' WHERE '
51 58
        +' AND '.join(map(cond, conds.iteritems())), conds.values())
52 59

  
53
def insert(db, table, row, get_id=False):
60
def insert(db, table, row):
54 61
    check_name(table)
55
    keys = row.keys()
56
    for key in keys: check_name(key)
57
    query = 'INSERT INTO '+table+' ('+', '.join(keys)+') VALUES ('\
58
        +', '.join(['%s']*len(keys))+')'
59
    if get_id: query += ' RETURNING lastval()'
60
    cur = run_query(db, query, row.values())
61
    if get_id: return value(cur)
62
    cols = row.keys()
63
    for col in cols: check_name(col)
64
    return run_query(db, 'INSERT INTO '+table+' ('+', '.join(cols)
65
        +') VALUES ('+', '.join(['%s']*len(cols))+')', row.values())
62 66

  
63
def insert_ignore(db, table, row, get_id=False):
64
    try: return with_savepoint(db, lambda: insert(db, table, row, get_id))
67
def last_insert_id(db): return value(run_query(db, 'SELECT lastval()'))
68

  
69
def insert_ignore(db, table, row):
70
    try: return with_savepoint(db, lambda: insert(db, table, row))
65 71
    except Exception, ex:
72
        msg = str(ex)
66 73
        match = re.search(r'duplicate key value violates unique constraint "'
67
            +table+'_(\w+)_index"', str(ex))
68
        if match: raise DuplicateKeyException(match.group(1))
69
        else: raise
74
            +table+'_(\w+)_index"', msg)
75
        if match: raise DuplicateKeyException(match.group(1), ex)
76
        match = re.search(r'null value in column "(\w+)" violates not-null '
77
            'constraint', msg)
78
        if match: raise NullValueException(match.group(1), ex)
79
        raise # no specific exception raised
70 80

  
71
def insert_or_get(db, table, row, pkey):
72
    try: return insert_ignore(db, table, row, True)
81
def insert_or_get(db, table, row, pkey, row_ct_ref=None):
82
    try:
83
        row_ct = insert_ignore(db, table, row).rowcount
84
        if row_ct_ref != None and row_ct >= 0: row_ct_ref[0] += row_ct
85
        return last_insert_id(db)
73 86
    except DuplicateKeyException, ex:
74
        dup_key = str(ex)
75
        return value(select(db, table, [pkey], {dup_key: row[dup_key]}))
87
        return value(select(db, table, [pkey], {ex.col: row[ex.col]}))
scripts/xml2db/vegbank.sql
6777 6777
-- PostgreSQL database dump complete
6778 6778
--
6779 6779

  
6780
-- Modifications
6781

  
6782
ALTER TABLE aux_role RENAME role_id  TO aux_role_id;
6783
ALTER TABLE place RENAME plotplace_id  TO place_id;
scripts/xml2db/xml_db.py
1
# XML-database conversion
2

  
3
from xml.dom import Node
4

  
5
import db_util
6
import xml_util
7
from xml_util import name_of
8

  
9
ptr_suffix = '_id'
10

  
11
def is_ptr(node_name): return node_name.endswith(ptr_suffix)
12

  
13
def ptr_type(node_name):
14
    assert is_ptr(node_name)
15
    return node_name[:-len(ptr_suffix)]
16

  
17
def ptr_target(node):
18
    assert is_ptr(name_of(node))
19
    return xml_util.first_elem(node)
20

  
21
def pkey_name(node_name): return node_name+ptr_suffix
22

  
23
def is_field(node_name): return node_name.find('.') >= 0
24

  
25
def field_name(node_name):
26
    if is_field(node_name): return node_name.partition('.')[2]
27
    else: return node_name
28

  
29
def is_db_export(node_name): return node_name.find('.') >= 1 # has prefix
30

  
31
def find_by_name(node, name):
32
    for parent in xml_util.NodeParentIter(node):
33
        if name_of(parent) == name: return parent
34
        else:
35
            for child in xml_util.NodeElemIter(parent):
36
                child_name = field_name(name_of(child))
37
                if is_ptr(child_name):
38
                    if ptr_type(child_name) == name: return ptr_target(child)
39
                elif child_name == name: return child
40
    return None
41

  
42
def xml2db(node, db):
43
    def _main(node):
44
        for child in xml_util.NodeElemIter(node):
45
            if not xml_util.is_text(child): _obj(child) # not XML metadata
46
    
47
    def _obj(node):
48
        table = name_of(node)
49
        db_util.check_name(table)
50
        pkey = pkey_name(table)
51
        row = {}
52
        children = []
53
        iter_ = xml_util.NodeElemIter(node)
54
        
55
        # Skip any pkey
56
        child_name = name_of(iter_.curr()) # first child
57
        if is_db_export(child_name) and is_ptr(child_name)\
58
        or field_name(child_name) == pkey: iter_.next()
59
        
60
        # Divide children into fields and children with fkeys to parent
61
        for child in iter_:
62
            if is_field(name_of(child)): row.update([_field(child)])
63
            else: children.append(child)
64
        
65
        # Add fkey to parent
66
        parent_id = xml_util.get_id(node.parentNode)
67
        if parent_id != '': row[pkey_name(name_of(node.parentNode))] = parent_id
68
        
69
        # Insert node
70
        for try_num in range(2):
71
            try:
72
                xml_util.set_id(node, db_util.insert_or_get(db, table, row,
73
                    pkey, row_ct_ref))
74
                break
75
            except db_util.NullValueException, ex:
76
                if try_num > 0: raise # exception still raised after retry
77
                target = find_by_name(node, ptr_type(ex.col))
78
                if target == None: raise
79
                row[ex.col] = xml_util.get_id(target)
80
        
81
        # Insert children with fkeys to parent
82
        for child in children: _obj(child)
83
    
84
    def _field(node):
85
        if xml_util.is_text(node): value = xml_util.value(node)
86
        else:
87
            child = xml_util.first_elem(node)
88
            _obj(child)
89
            value = xml_util.get_id(child)
90
        return (field_name(name_of(node)), value)
91
    
92
    row_ct_ref = [0]
93
    _main(node)
94
    return row_ct_ref[0]

Also available in: Unified diff