Project

General

Profile

« Previous | Next » 

Revision 73

map: Fixed bugs to enable mapping straight from CSV to a database. Still need a way to set plot.authorPlotCode for specimen data.

View differences:

scripts/lib/strings.py
1
# String manipulation
2

  
3
def to_unicode(str_):
4
    if isinstance(str_, unicode): return str_
5
    encodings = ['utf_8', 'latin_1']
6
    for encoding in encodings:
7
        try: return unicode(str_, encoding)
8
        except UnicodeDecodeError, e: pass
9
    raise AssertionError(encoding+' is not a catch-all encoding')
scripts/lib/xml_dom.py
1 1
# XML DOM tree manipulation
2 2

  
3
import cgi
4
from HTMLParser import HTMLParser
3 5
from xml.dom import Node
4 6
import xml.dom.minidom
5 7

  
8
import strings
9

  
10
def escape(str_):
11
    return strings.to_unicode(cgi.escape(str_, True)).encode('ascii',
12
        'xmlcharrefreplace')
13

  
14
def unescape(str_): return HTMLParser().unescape(str_)
15

  
6 16
def name_of(node): return node.tagName.lower()
7 17

  
8 18
def get_id(node): return node.getAttribute('id')
......
82 92
            if last_only: break
83 93
    return children
84 94

  
95
# xml.dom.minidom modifications
96

  
97
def _write_data(writer, data): writer.write(escape(data))
98

  
99
xml.dom.minidom._write_data = _write_data
100

  
85 101
_writexml_orig = xml.dom.minidom.Element.writexml
86 102

  
87 103
def _writexml(self, writer, indent="", addindent="", newl=""):
......
90 106
        writer.write(indent+'<'+self.tagName)
91 107
        for attr_idx in xrange(self.attributes.length):
92 108
            attr = self.attributes.item(attr_idx)
93
            writer.write(' '+attr.name+'='+attr.value)
94
        writer.write('>'+value(self)+'</'+self.tagName+'>'+newl)
109
            writer.write(' '+attr.name+'='+escape(attr.value))
110
        writer.write('>'+escape(value(self))+'</'+self.tagName+'>'+newl)
95 111
    else: _writexml_orig(self, writer, indent, addindent, newl)
96 112

  
97 113
xml.dom.minidom.Element.writexml = _writexml
scripts/lib/db_xml.py
67 67
            break
68 68
        except sql.NullValueException, ex:
69 69
            if try_num > 0: raise # exception still raised after retry
70
            # Search for required column in ancestors and their children
71
            target = find_by_name(node, ptr_type(ex.col))
72
            if target == None: raise
73
            row[ex.col] = xml_dom.get_id(target)
70
            if is_ptr(ex.col):
71
                # Search for required column in ancestors and their children
72
                target = find_by_name(node, ptr_type(ex.col))
73
                if target == None: raise
74
                row[ex.col] = xml_dom.get_id(target)
75
            else: raise
74 76
    
75 77
    # Insert children with fkeys to parent
76 78
    for child in children: put(db, child, store_ids, row_ct_ref, pkeys, id_)
scripts/map
22 22
    out_db_config = get_db_config('out')
23 23
    in_is_db = in_db_config != None
24 24
    out_is_db = out_db_config != None
25
    uses_map = in_is_db or not out_is_db
26 25
    
27 26
    # Parse args
27
    map_path = None
28 28
    try: _prog_name, map_path = sys.argv
29 29
    except ValueError:
30
        if uses_map: raise SystemExit('Usage: '+opts.env_usage(env_names, True)
31
            +' [commit=1] '+sys.argv[0]+' [map_path] [<input] [>output]')
30
        if in_is_db or not out_is_db: raise SystemExit('Usage: '
31
            +opts.env_usage(env_names, True)+' [commit=1] '+sys.argv[0]
32
            +' [map_path] [<input] [>output]')
32 33
    commit = opts.env_flag('commit')
33 34
    
34 35
    # Load map header
35 36
    in_is_xml = True
36
    if uses_map:
37
    if map_path != None:
37 38
        import copy
38 39
        import csv
39 40
        
40 41
        import xpath
41 42
        
42
        map_stream = open(map_path, 'rb')
43
        map_reader = csv.reader(map_stream)
44
        src, dest = map_reader.next()[:2]
43
        mappings = []
44
        stream = open(map_path, 'rb')
45
        reader = csv.reader(stream)
46
        src, dest = reader.next()[:2]
45 47
        def split_col_name(name):
46 48
            name, sep, root = name.partition(':')
47 49
            return name, sep != '', root
......
49 51
        dest, out_is_xml, dest_root = split_col_name(dest)
50 52
        assert out_is_xml
51 53
        has_types = dest_root.startswith('/*s/') # outer elements are types
54
        for row in reader:
55
            in_, out = row[:2]
56
            if out != '':
57
                try: out = xpath.parse(dest_root+out)
58
                except SyntaxException, ex: raise SystemExit(str(ex))
59
                mappings.append((in_, out))
60
        stream.close()
52 61
    
53 62
    # Input datasource to XML tree, mapping if needed
54 63
    if in_is_xml: doc = xml.dom.minidom.parse(sys.stdin)
55
    if uses_map:
64
    if map_path != None:
56 65
        from Parser import SyntaxException
57 66
        import xml_xpath
58 67
        
59
        map_ = {}
60
        for row in map_reader:
61
            in_, out = row[:2]
62
            if out != '':
63
                try: out = xpath.parse(dest_root+out)
64
                except SyntaxException, ex: raise SystemExit(str(ex))
65
                if in_is_xml: pass # TODO: process the mapping
66
                elif in_is_db: pass # TODO: process the mapping
67
                else: map_[in_] = out
68
        map_stream.close()
69
        
70 68
        out_doc = xml.dom.minidom.getDOMImplementation().createDocument(None,
71 69
            dest, None)
72 70
        if in_is_xml: raise Exception('XML-XML mapping not supported yet')
71
        elif in_is_db: raise Exception('DB-XML mapping not supported yet')
73 72
        else: # input is CSV
73
            map_ = dict(mappings)
74 74
            reader = csv.reader(sys.stdin)
75 75
            fieldnames = reader.next()
76 76
            row_idx = 0
scripts/map2vegbank
6 6
export out_host=localhost out_user=vegbank out_password=vegbank \
7 7
out_database=vegbank
8 8

  
9
"$selfDir/map"
9
"$selfDir/map" "$@"
scripts/util/NYSpecimenDataAmericas.xml
211 211
        <plot id="0">
212 212
            <country>Peru</country>
213 213
            <state>Madre de Dios</state>
214
            <county>Man?</county>
214
            <county>Man&#250;</county>
215 215
            <simpleUserdefined>
216 216
                <name>locality</name>
217
                <value>Parque Nacional del Manu. R?o Manu: Cocha Casha Station</value>
217
                <value>Parque Nacional del Manu. R&#237;o Manu: Cocha Casha Station</value>
218 218
            </simpleUserdefined>
219 219
            <geospatial>
220 220
                <DecimalLongitude>-71.40</DecimalLongitude>
scripts/util/test_map
8 8
test -n "$n" || n=2
9 9
let n++
10 10

  
11
head -$n NYSpecimenDataAmericas.csv|../map \
12
../../mappings/NYBG-VegX.organisms.csv >NYSpecimenDataAmericas.test.xml
13
diff NYSpecimenDataAmericas.xml NYSpecimenDataAmericas.test.xml
11
head -"$n" NYSpecimenDataAmericas.csv|../map \
12
../../mappings/NYBG-VegX.organisms.csv >NYSpecimenDataAmericas.test.xml \
13
&& diff NYSpecimenDataAmericas.xml NYSpecimenDataAmericas.test.xml
14 14

  
15
../map2vegbank <vb_plot_observation.xml
15
head -"$n" NYSpecimenDataAmericas.csv|../map2vegbank \
16
../../mappings/joins/NYBG-VegBank.organisms.csv
mappings/VegX-VegBank.organisms.csv
28 28
"/*ID->/*s/individualOrganism/*sID->/*s/taxonNameUsageConcept/note/text","/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/notes"
29 29
"/*ID->/*s/individualOrganism/*sID->/*s/taxonNameUsageConcept/partyWithRole/*ID->/parties/party/organizationName/_name/firstName","/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museum_ID/party/OrganizationName/_name/firstName"
30 30
"/*ID->/*s/individualOrganism/*sID->/*s/taxonNameUsageConcept/partyWithRole/*ID->/parties/party/organizationName/_name/lastName","/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museum_ID/party/OrganizationName/_name/lastName"
31
"/*ID->/*s/individualOrganism/*sID->/*s/taxonNameUsageConcept/voucher","/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museumAccessionNumber"
31
"/*ID->/*s/individualOrganism/*sID->/*s/taxonNameUsageConcept/voucher","/{taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museumAccessionNumber,*_ID/observation/*_ID/plot/authorPlotCode}"
32 32
"/*ID->/*s/individualOrganism/identificationLabel","/taxonImportance/stemCount/stemLocation/stemCode"
33 33
"/*ID->/*s/individualOrganism/simpleUserdefined[name=sex]/value","/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/definedValue[@fkey=tableRecord_ID,userDefined[tableName=taxonInterpretation,userDefinedName=sex]]/definedValue"
34 34
"/*ID->/*s/plotObservation->/*s/abioticObservation[*ID]/magnesium","/*_ID/observation/soilObs/definedValue[@fkey=tableRecord_ID,userDefined[tableName=soilObs,userDefinedName=soilMagnesium]]/definedValue"
mappings/joins/NYBG-VegBank.organisms.csv
1 1
NYBG,VegBank:/taxonObservation
2 2
InstitutionCode,/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museum_ID/party/OrganizationName/_name/firstName
3 3
CollectionCode,/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museum_ID/party/OrganizationName/_name/lastName
4
CatalogNumber,/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museumAccessionNumber
4
CatalogNumber,"/{taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museumAccessionNumber,*_ID/observation/*_ID/plot/authorPlotCode}"
5 5
ScientificName,/taxonImportance/stemCount/stemLocation/taxonInterpretation(/*_ID/plantConcept[plantStatus/plantLevel=Species])/*_ID/*/plantName
6 6
Kingdom,/taxonImportance/stemCount/stemLocation/taxonInterpretation(/*_ID/plantConcept[plantStatus/plantLevel=Kingdom])/*_ID/*/plantName
7 7
Phylum,/taxonImportance/stemCount/stemLocation/taxonInterpretation(/*_ID/plantConcept[plantStatus/plantLevel=Subkingdom])/*_ID/*/plantName
mappings/joins/SALVIAS-VegBank.organisms.csv
1 1
SALVIAS,VegBank:/taxonObservation
2
OBSERVATION_ID,/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museumAccessionNumber
2
OBSERVATION_ID,"/{taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museumAccessionNumber,*_ID/observation/*_ID/plot/authorPlotCode}"
3 3
plot_code,/*_ID/observation/*_ID/plot/PARENT_ID->/*_ID/observation/*_ID/plot/authorPlotCode
4 4
census_no,"/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/definedValue[@fkey=tableRecord_ID,userDefined[tableName=taxonInterpretation,userDefinedName=censusNo]]/definedValue"
5 5
census_date,/*_ID/observation/obsStartDate

Also available in: Unified diff