Revision 73
Added by Aaron Marcuse-Kubitza almost 13 years ago
scripts/lib/strings.py | ||
---|---|---|
1 |
# String manipulation |
|
2 |
|
|
3 |
def to_unicode(str_): |
|
4 |
if isinstance(str_, unicode): return str_ |
|
5 |
encodings = ['utf_8', 'latin_1'] |
|
6 |
for encoding in encodings: |
|
7 |
try: return unicode(str_, encoding) |
|
8 |
except UnicodeDecodeError, e: pass |
|
9 |
raise AssertionError(encoding+' is not a catch-all encoding') |
scripts/lib/xml_dom.py | ||
---|---|---|
1 | 1 |
# XML DOM tree manipulation |
2 | 2 |
|
3 |
import cgi |
|
4 |
from HTMLParser import HTMLParser |
|
3 | 5 |
from xml.dom import Node |
4 | 6 |
import xml.dom.minidom |
5 | 7 |
|
8 |
import strings |
|
9 |
|
|
10 |
def escape(str_): |
|
11 |
return strings.to_unicode(cgi.escape(str_, True)).encode('ascii', |
|
12 |
'xmlcharrefreplace') |
|
13 |
|
|
14 |
def unescape(str_): return HTMLParser().unescape(str_) |
|
15 |
|
|
6 | 16 |
def name_of(node): return node.tagName.lower() |
7 | 17 |
|
8 | 18 |
def get_id(node): return node.getAttribute('id') |
... | ... | |
82 | 92 |
if last_only: break |
83 | 93 |
return children |
84 | 94 |
|
95 |
# xml.dom.minidom modifications |
|
96 |
|
|
97 |
def _write_data(writer, data): writer.write(escape(data)) |
|
98 |
|
|
99 |
xml.dom.minidom._write_data = _write_data |
|
100 |
|
|
85 | 101 |
_writexml_orig = xml.dom.minidom.Element.writexml |
86 | 102 |
|
87 | 103 |
def _writexml(self, writer, indent="", addindent="", newl=""): |
... | ... | |
90 | 106 |
writer.write(indent+'<'+self.tagName) |
91 | 107 |
for attr_idx in xrange(self.attributes.length): |
92 | 108 |
attr = self.attributes.item(attr_idx) |
93 |
writer.write(' '+attr.name+'='+attr.value)
|
|
94 |
writer.write('>'+value(self)+'</'+self.tagName+'>'+newl)
|
|
109 |
writer.write(' '+attr.name+'='+escape(attr.value))
|
|
110 |
writer.write('>'+escape(value(self))+'</'+self.tagName+'>'+newl)
|
|
95 | 111 |
else: _writexml_orig(self, writer, indent, addindent, newl) |
96 | 112 |
|
97 | 113 |
xml.dom.minidom.Element.writexml = _writexml |
scripts/lib/db_xml.py | ||
---|---|---|
67 | 67 |
break |
68 | 68 |
except sql.NullValueException, ex: |
69 | 69 |
if try_num > 0: raise # exception still raised after retry |
70 |
# Search for required column in ancestors and their children |
|
71 |
target = find_by_name(node, ptr_type(ex.col)) |
|
72 |
if target == None: raise |
|
73 |
row[ex.col] = xml_dom.get_id(target) |
|
70 |
if is_ptr(ex.col): |
|
71 |
# Search for required column in ancestors and their children |
|
72 |
target = find_by_name(node, ptr_type(ex.col)) |
|
73 |
if target == None: raise |
|
74 |
row[ex.col] = xml_dom.get_id(target) |
|
75 |
else: raise |
|
74 | 76 |
|
75 | 77 |
# Insert children with fkeys to parent |
76 | 78 |
for child in children: put(db, child, store_ids, row_ct_ref, pkeys, id_) |
scripts/map | ||
---|---|---|
22 | 22 |
out_db_config = get_db_config('out') |
23 | 23 |
in_is_db = in_db_config != None |
24 | 24 |
out_is_db = out_db_config != None |
25 |
uses_map = in_is_db or not out_is_db |
|
26 | 25 |
|
27 | 26 |
# Parse args |
27 |
map_path = None |
|
28 | 28 |
try: _prog_name, map_path = sys.argv |
29 | 29 |
except ValueError: |
30 |
if uses_map: raise SystemExit('Usage: '+opts.env_usage(env_names, True) |
|
31 |
+' [commit=1] '+sys.argv[0]+' [map_path] [<input] [>output]') |
|
30 |
if in_is_db or not out_is_db: raise SystemExit('Usage: ' |
|
31 |
+opts.env_usage(env_names, True)+' [commit=1] '+sys.argv[0] |
|
32 |
+' [map_path] [<input] [>output]') |
|
32 | 33 |
commit = opts.env_flag('commit') |
33 | 34 |
|
34 | 35 |
# Load map header |
35 | 36 |
in_is_xml = True |
36 |
if uses_map:
|
|
37 |
if map_path != None:
|
|
37 | 38 |
import copy |
38 | 39 |
import csv |
39 | 40 |
|
40 | 41 |
import xpath |
41 | 42 |
|
42 |
map_stream = open(map_path, 'rb') |
|
43 |
map_reader = csv.reader(map_stream) |
|
44 |
src, dest = map_reader.next()[:2] |
|
43 |
mappings = [] |
|
44 |
stream = open(map_path, 'rb') |
|
45 |
reader = csv.reader(stream) |
|
46 |
src, dest = reader.next()[:2] |
|
45 | 47 |
def split_col_name(name): |
46 | 48 |
name, sep, root = name.partition(':') |
47 | 49 |
return name, sep != '', root |
... | ... | |
49 | 51 |
dest, out_is_xml, dest_root = split_col_name(dest) |
50 | 52 |
assert out_is_xml |
51 | 53 |
has_types = dest_root.startswith('/*s/') # outer elements are types |
54 |
for row in reader: |
|
55 |
in_, out = row[:2] |
|
56 |
if out != '': |
|
57 |
try: out = xpath.parse(dest_root+out) |
|
58 |
except SyntaxException, ex: raise SystemExit(str(ex)) |
|
59 |
mappings.append((in_, out)) |
|
60 |
stream.close() |
|
52 | 61 |
|
53 | 62 |
# Input datasource to XML tree, mapping if needed |
54 | 63 |
if in_is_xml: doc = xml.dom.minidom.parse(sys.stdin) |
55 |
if uses_map:
|
|
64 |
if map_path != None:
|
|
56 | 65 |
from Parser import SyntaxException |
57 | 66 |
import xml_xpath |
58 | 67 |
|
59 |
map_ = {} |
|
60 |
for row in map_reader: |
|
61 |
in_, out = row[:2] |
|
62 |
if out != '': |
|
63 |
try: out = xpath.parse(dest_root+out) |
|
64 |
except SyntaxException, ex: raise SystemExit(str(ex)) |
|
65 |
if in_is_xml: pass # TODO: process the mapping |
|
66 |
elif in_is_db: pass # TODO: process the mapping |
|
67 |
else: map_[in_] = out |
|
68 |
map_stream.close() |
|
69 |
|
|
70 | 68 |
out_doc = xml.dom.minidom.getDOMImplementation().createDocument(None, |
71 | 69 |
dest, None) |
72 | 70 |
if in_is_xml: raise Exception('XML-XML mapping not supported yet') |
71 |
elif in_is_db: raise Exception('DB-XML mapping not supported yet') |
|
73 | 72 |
else: # input is CSV |
73 |
map_ = dict(mappings) |
|
74 | 74 |
reader = csv.reader(sys.stdin) |
75 | 75 |
fieldnames = reader.next() |
76 | 76 |
row_idx = 0 |
scripts/map2vegbank | ||
---|---|---|
6 | 6 |
export out_host=localhost out_user=vegbank out_password=vegbank \ |
7 | 7 |
out_database=vegbank |
8 | 8 |
|
9 |
"$selfDir/map" |
|
9 |
"$selfDir/map" "$@" |
scripts/util/NYSpecimenDataAmericas.xml | ||
---|---|---|
211 | 211 |
<plot id="0"> |
212 | 212 |
<country>Peru</country> |
213 | 213 |
<state>Madre de Dios</state> |
214 |
<county>Man?</county>
|
|
214 |
<county>Manú</county>
|
|
215 | 215 |
<simpleUserdefined> |
216 | 216 |
<name>locality</name> |
217 |
<value>Parque Nacional del Manu. R?o Manu: Cocha Casha Station</value>
|
|
217 |
<value>Parque Nacional del Manu. Río Manu: Cocha Casha Station</value>
|
|
218 | 218 |
</simpleUserdefined> |
219 | 219 |
<geospatial> |
220 | 220 |
<DecimalLongitude>-71.40</DecimalLongitude> |
scripts/util/test_map | ||
---|---|---|
8 | 8 |
test -n "$n" || n=2 |
9 | 9 |
let n++ |
10 | 10 |
|
11 |
head -$n NYSpecimenDataAmericas.csv|../map \
|
|
12 |
../../mappings/NYBG-VegX.organisms.csv >NYSpecimenDataAmericas.test.xml |
|
13 |
diff NYSpecimenDataAmericas.xml NYSpecimenDataAmericas.test.xml |
|
11 |
head -"$n" NYSpecimenDataAmericas.csv|../map \
|
|
12 |
../../mappings/NYBG-VegX.organisms.csv >NYSpecimenDataAmericas.test.xml \
|
|
13 |
&& diff NYSpecimenDataAmericas.xml NYSpecimenDataAmericas.test.xml
|
|
14 | 14 |
|
15 |
../map2vegbank <vb_plot_observation.xml |
|
15 |
head -"$n" NYSpecimenDataAmericas.csv|../map2vegbank \ |
|
16 |
../../mappings/joins/NYBG-VegBank.organisms.csv |
mappings/VegX-VegBank.organisms.csv | ||
---|---|---|
28 | 28 |
"/*ID->/*s/individualOrganism/*sID->/*s/taxonNameUsageConcept/note/text","/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/notes" |
29 | 29 |
"/*ID->/*s/individualOrganism/*sID->/*s/taxonNameUsageConcept/partyWithRole/*ID->/parties/party/organizationName/_name/firstName","/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museum_ID/party/OrganizationName/_name/firstName" |
30 | 30 |
"/*ID->/*s/individualOrganism/*sID->/*s/taxonNameUsageConcept/partyWithRole/*ID->/parties/party/organizationName/_name/lastName","/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museum_ID/party/OrganizationName/_name/lastName" |
31 |
"/*ID->/*s/individualOrganism/*sID->/*s/taxonNameUsageConcept/voucher","/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museumAccessionNumber"
|
|
31 |
"/*ID->/*s/individualOrganism/*sID->/*s/taxonNameUsageConcept/voucher","/{taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museumAccessionNumber,*_ID/observation/*_ID/plot/authorPlotCode}"
|
|
32 | 32 |
"/*ID->/*s/individualOrganism/identificationLabel","/taxonImportance/stemCount/stemLocation/stemCode" |
33 | 33 |
"/*ID->/*s/individualOrganism/simpleUserdefined[name=sex]/value","/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/definedValue[@fkey=tableRecord_ID,userDefined[tableName=taxonInterpretation,userDefinedName=sex]]/definedValue" |
34 | 34 |
"/*ID->/*s/plotObservation->/*s/abioticObservation[*ID]/magnesium","/*_ID/observation/soilObs/definedValue[@fkey=tableRecord_ID,userDefined[tableName=soilObs,userDefinedName=soilMagnesium]]/definedValue" |
mappings/joins/NYBG-VegBank.organisms.csv | ||
---|---|---|
1 | 1 |
NYBG,VegBank:/taxonObservation |
2 | 2 |
InstitutionCode,/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museum_ID/party/OrganizationName/_name/firstName |
3 | 3 |
CollectionCode,/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museum_ID/party/OrganizationName/_name/lastName |
4 |
CatalogNumber,/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museumAccessionNumber
|
|
4 |
CatalogNumber,"/{taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museumAccessionNumber,*_ID/observation/*_ID/plot/authorPlotCode}"
|
|
5 | 5 |
ScientificName,/taxonImportance/stemCount/stemLocation/taxonInterpretation(/*_ID/plantConcept[plantStatus/plantLevel=Species])/*_ID/*/plantName |
6 | 6 |
Kingdom,/taxonImportance/stemCount/stemLocation/taxonInterpretation(/*_ID/plantConcept[plantStatus/plantLevel=Kingdom])/*_ID/*/plantName |
7 | 7 |
Phylum,/taxonImportance/stemCount/stemLocation/taxonInterpretation(/*_ID/plantConcept[plantStatus/plantLevel=Subkingdom])/*_ID/*/plantName |
mappings/joins/SALVIAS-VegBank.organisms.csv | ||
---|---|---|
1 | 1 |
SALVIAS,VegBank:/taxonObservation |
2 |
OBSERVATION_ID,/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museumAccessionNumber
|
|
2 |
OBSERVATION_ID,"/{taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/museumAccessionNumber,*_ID/observation/*_ID/plot/authorPlotCode}"
|
|
3 | 3 |
plot_code,/*_ID/observation/*_ID/plot/PARENT_ID->/*_ID/observation/*_ID/plot/authorPlotCode |
4 | 4 |
census_no,"/taxonImportance/stemCount/stemLocation/taxonInterpretation[PLANTCONCEPT_ID=]/definedValue[@fkey=tableRecord_ID,userDefined[tableName=taxonInterpretation,userDefinedName=censusNo]]/definedValue" |
5 | 5 |
census_date,/*_ID/observation/obsStartDate |
Also available in: Unified diff
map: Fixed bugs to enable mapping straight from CSV to a database. A way to set plot.authorPlotCode for specimen data is still needed.