Project

General

Profile

« Previous | Next » 

Revision 10349

inputs/REMIB/: switched to new-style import, using the steps at wiki.vegpath.org/Switching_to_new-style_import#stage-I-source-specific > "run the following for each datasource"

View differences:

inputs/REMIB/Specimen/postprocess.sql
1 1
SELECT util.search_path_append('util');
2 2

  
3
SELECT util.rename_cols(:table_str, $$
4
country  => country_verbatim,
5
state    => stateProvince_verbatim,
6
county   => county_verbatim,
7
locality => locality_verbatim,
8
long_deg => longitude_deg_verbatim,
9
long_min => longitude_min_verbatim,
10
long_sec => longitude_sec_verbatim,
11
lat_deg  => latitude_deg_verbatim,
12
lat_min  => latitude_min_verbatim,
13
lat_sec  => latitude_sec_verbatim,
14
habitat  => habitat_verbatim,
15
$$::hstore);
3
SELECT create_if_not_exists($$CREATE INDEX "Specimen.specimenHolderInstitutions" ON $$||:table_str||$$ ("specimenHolderInstitutions")$$);
16 4

  
17
SELECT create_if_not_exists($$CREATE INDEX "Specimen.acronym" ON $$||:table_str||$$ (acronym)$$);
18

  
19 5
-- remove frameshifted rows
20
DELETE FROM :table WHERE acronym IS NULL;
21
DELETE FROM :table WHERE coll_year !~ '^(?:1[7-9]|20)\d{2}$';
6
DELETE FROM :table WHERE "specimenHolderInstitutions" IS NULL;
7
DELETE FROM :table WHERE "yearCollected" !~ '^(?:1[7-9]|20)\d{2}$';
22 8
DELETE FROM :table WHERE country_verbatim ~ '\d';
23 9
DELETE FROM :table WHERE longitude_deg_verbatim ~ '[[:alpha:]]' AND longitude_deg_verbatim NOT IN ('RESTRINGIDO');
24 10
DELETE FROM :table WHERE longitude_min_verbatim !~ '^\d*$';
......
28 14

  
29 15
-- Remove institutions that we have direct data for
30 16
DELETE FROM :table
31
WHERE acronym IN (
17
WHERE "specimenHolderInstitutions" IN (
32 18
    -- Comments are from e-mail from Brad Boyle on 2013-1-16
33 19
    'MO' -- "all MO records in REMIB are also available from MO's own website"
34 20
    --, 'ARIZ' -- Some REMIB ARIZ specimens not yet in ARIZ itself
inputs/REMIB/Specimen/run
1
#!/bin/bash -e
2
. "$(dirname "${BASH_SOURCE[0]}")"/../table.run
0 3

  
inputs/REMIB/Specimen/map.csv
29 29
collector,recordedBy,,
30 30
habitat,habitat_verbatim,,
31 31
preparation,preparations,,
32
country,country,,
33
stateProvince,stateProvince,,
34
county,county,,
35
locality,locality,,
36
habitat,habitat,,
37
longitude_deg,longitude_deg,,
38
longitude_min,longitude_min,,
39
longitude_sec,longitude_sec,,
40
latitude_deg,latitude_deg,,
41
latitude_min,latitude_min,,
42
latitude_sec,latitude_sec,,
inputs/REMIB/Specimen/test.xml.ref
9 9
                    <location>
10 10
                        <authorlocationcode>
11 11
                            <_join>
12
                                <1>$acronym</1>
13
                                <3>$accession_number</3>
12
                                <1>$specimenHolderInstitutions</1>
13
                                <3>$accessionNumber</3>
14 14
                            </_join>
15 15
                        </authorlocationcode>
16 16
                        <iscultivated>
......
32 32
                                                <_nullIf>
33 33
                                                    <null>0</null>
34 34
                                                    <type>float</type>
35
                                                    <value>$coll_day</value>
35
                                                    <value>$dayCollected</value>
36 36
                                                </_nullIf>
37 37
                                            </day>
38 38
                                            <month>
39 39
                                                <_nullIf>
40 40
                                                    <null>0</null>
41 41
                                                    <type>float</type>
42
                                                    <value>$coll_month</value>
42
                                                    <value>$monthCollected</value>
43 43
                                                </_nullIf>
44 44
                                            </month>
45 45
                                            <year>
46 46
                                                <_nullIf>
47 47
                                                    <null>0</null>
48 48
                                                    <type>float</type>
49
                                                    <value>$coll_year</value>
49
                                                    <value>$yearCollected</value>
50 50
                                                </_nullIf>
51 51
                                            </year>
52 52
                                        </_date>
53 53
                                    </collectiondate>
54 54
                                    <plantobservation>
55 55
                                        <specimenreplicate>
56
                                            <catalognumber_dwc>$accession_number</catalognumber_dwc>
56
                                            <catalognumber_dwc>$accessionNumber</catalognumber_dwc>
57 57
                                            <institution_id>
58 58
                                                <sourcelist>
59
                                                    <name>$acronym</name>
59
                                                    <name>$specimenHolderInstitutions</name>
60 60
                                                    <sourcename>
61 61
                                                        <name>
62 62
                                                            <_split>
63 63
                                                                <separator>[,;] *</separator>
64
                                                                <value>$acronym</value>
64
                                                                <value>$specimenHolderInstitutions</value>
65 65
                                                            </_split>
66 66
                                                        </name>
67 67
                                                    </sourcename>
......
70 70
                                        </specimenreplicate>
71 71
                                    </plantobservation>
72 72
                                </aggregateoccurrence>
73
                                <collector_id><party><fullname>$collector</fullname></party></collector_id>
73
                                <collector_id><party><fullname>$recordedBy</fullname></party></collector_id>
74 74
                                <taxondetermination>
75 75
                                    <taxonverbatim_id>
76 76
                                        <taxonverbatim>
inputs/REMIB/Specimen/VegBIEN.csv
1
REMIB,VegBIEN:/_setDefault:[source_id/source/shortname/_env:[name=source]]/path/_simplifyPath:[next=parent_id]/path,Comments
2
accession_number,"/_if[@name=""if specimen""]/cond/_exists","Not globally unique, only (usually) unique within the institution providing the data (""acronym""). Note that there are nevertheless 11,869 rows where an accession_number appears multiple times within the same institution. [1]
3

  
4
[1] Using the following query:
5
-----
6
SELECT acronym, accession_number, count(*)
7
FROM ""REMIB"".""Specimen""
8
GROUP BY acronym, accession_number
9
HAVING count(*) > 1
10
-----"
11
acronym,"/_if[@name=""if specimen""]/else/source/shortname/_first/1",
12
accession_number,"/location/_if[@name=""if subplot""]/else/authorlocationcode/_first/3/_if[@name=""if plot""]/else/_alt/2/_if[@name=""if catalogNumber""]/cond/_exists","Not globally unique, only (usually) unique within the institution providing the data (""acronym""). Note that there are nevertheless 11,869 rows where an accession_number appears multiple times within the same institution. [1]
13

  
14
[1] Using the following query:
15
-----
16
SELECT acronym, accession_number, count(*)
17
FROM ""REMIB"".""Specimen""
18
GROUP BY acronym, accession_number
19
HAVING count(*) > 1
20
-----"
21
accession_number,"/location/_if[@name=""if subplot""]/else/authorlocationcode/_first/3/_if[@name=""if plot""]/else/_alt/2/_if[@name=""if catalogNumber""]/then/_join/1/_if[@name=""if specimen""]/cond/_exists","Not globally unique, only (usually) unique within the institution providing the data (""acronym""). Note that there are nevertheless 11,869 rows where an accession_number appears multiple times within the same institution. [1]
22

  
23
[1] Using the following query:
24
-----
25
SELECT acronym, accession_number, count(*)
26
FROM ""REMIB"".""Specimen""
27
GROUP BY acronym, accession_number
28
HAVING count(*) > 1
29
-----"
30
acronym,"/location/_if[@name=""if subplot""]/else/authorlocationcode/_first/3/_if[@name=""if plot""]/else/_alt/2/_if[@name=""if catalogNumber""]/then/_join/1/_if[@name=""if specimen""]/then/_first/2",
31
accession_number,"/location/_if[@name=""if subplot""]/else/authorlocationcode/_first/3/_if[@name=""if plot""]/else/_alt/2/_if[@name=""if catalogNumber""]/then/_join/3/_if[@name=""if indirect voucher""]/else","Not globally unique, only (usually) unique within the institution providing the data (""acronym""). Note that there are nevertheless 11,869 rows where an accession_number appears multiple times within the same institution. [1]
32

  
33
[1] Using the following query:
34
-----
35
SELECT acronym, accession_number, count(*)
36
FROM ""REMIB"".""Specimen""
37
GROUP BY acronym, accession_number
38
HAVING count(*) > 1
39
-----"
40
locality,/location/iscultivated/_or/2/_locationnarrative_is_cultivated/locationnarrative/_join/1,
41
habitat,/location/iscultivated/_or/2/_locationnarrative_is_cultivated/locationnarrative/_join/3,
42
coll_day,"/location/locationevent/taxonoccurrence/aggregateoccurrence/collectiondate/_alt/2/_date/day/_nullIf:[null=0,type=float]/value",
43
coll_month,"/location/locationevent/taxonoccurrence/aggregateoccurrence/collectiondate/_alt/2/_date/month/_nullIf:[null=0,type=float]/value",
44
coll_year,"/location/locationevent/taxonoccurrence/aggregateoccurrence/collectiondate/_alt/2/_date/year/_nullIf:[null=0,type=float]/value",
45
accession_number,"/location/locationevent/taxonoccurrence/aggregateoccurrence/plantobservation/specimenreplicate/catalognumber_dwc/_if[@name=""if indirect voucher""]/else","Not globally unique, only (usually) unique within the institution providing the data (""acronym""). Note that there are nevertheless 11,869 rows where an accession_number appears multiple times within the same institution. [1]
46

  
47
[1] Using the following query:
48
-----
49
SELECT acronym, accession_number, count(*)
50
FROM ""REMIB"".""Specimen""
51
GROUP BY acronym, accession_number
52
HAVING count(*) > 1
53
-----"
54
accession_number,"/location/locationevent/taxonoccurrence/aggregateoccurrence/plantobservation/specimenreplicate/institution_id/sourcelist/_if[@name=""if specimen""]/cond/_exists","Not globally unique, only (usually) unique within the institution providing the data (""acronym""). Note that there are nevertheless 11,869 rows where an accession_number appears multiple times within the same institution. [1]
55

  
56
[1] Using the following query:
57
-----
58
SELECT acronym, accession_number, count(*)
59
FROM ""REMIB"".""Specimen""
60
GROUP BY acronym, accession_number
61
HAVING count(*) > 1
62
-----"
63
acronym,"/location/locationevent/taxonoccurrence/aggregateoccurrence/plantobservation/specimenreplicate/institution_id/sourcelist/_if[@name=""if specimen""]/then/name/_first/2",
64
acronym,"/location/locationevent/taxonoccurrence/aggregateoccurrence/plantobservation/specimenreplicate/institution_id/sourcelist/_if[@name=""if specimen""]/then/sourcename/name/_first/2/_split:[separator=""[,;] *""]/value",
65
collector,/location/locationevent/taxonoccurrence/collector_id/party/fullname,
66
specificEpithet,"/location/locationevent/taxonoccurrence/taxondetermination[!isoriginal]/*_id/taxonverbatim/*_id/taxonlabel/parent_id/taxonlabel/parent_id/_forEach:[in:[cultivar,forma,variety,subspecies,],do=""taxonlabel[rank=_val]/parent_id""]/taxonlabel[rank=species]/taxonepithet",
67
genus,"/location/locationevent/taxonoccurrence/taxondetermination[!isoriginal]/*_id/taxonverbatim/*_id/taxonlabel/parent_id/taxonlabel/parent_id/_forEach:[in:[cultivar,forma,variety,subspecies,species,],do=""taxonlabel[rank=_val]/parent_id""]/taxonlabel[rank=genus]/taxonepithet",
68
family,"/location/locationevent/taxonoccurrence/taxondetermination[!isoriginal]/*_id/taxonverbatim/*_id/taxonlabel/parent_id/taxonlabel/parent_id/_forEach:[in:[cultivar,forma,variety,subspecies,species,genus,],do=""taxonlabel[rank=_val]/parent_id""]/taxonlabel[rank=family]/taxonepithet",
69
family,/location/locationevent/taxonoccurrence/taxondetermination[!isoriginal]/*_id/taxonverbatim/*_id/taxonlabel/taxonomicname/_merge_prefix/prefix/_taxon_family_require_std/family,
70
genus,/location/locationevent/taxonoccurrence/taxondetermination[!isoriginal]/*_id/taxonverbatim/*_id/taxonlabel/taxonomicname/_merge_prefix/value/_alt/2/_join_words/1/_alt/2/_join_words/2/_filter_genus/value,
71
specificEpithet,/location/locationevent/taxonoccurrence/taxondetermination[!isoriginal]/*_id/taxonverbatim/*_id/taxonlabel/taxonomicname/_merge_prefix/value/_alt/2/_join_words/1/_alt/2/_join_words/3/_join_words/1,
72
family,/location/locationevent/taxonoccurrence/taxondetermination[!isoriginal]/*_id/taxonverbatim/family,
73
genus,/location/locationevent/taxonoccurrence/taxondetermination[!isoriginal]/*_id/taxonverbatim/genus,
74
specificEpithet,/location/locationevent/taxonoccurrence/taxondetermination[!isoriginal]/*_id/taxonverbatim/specific_epithet,
75
accession_number,"/location/locationevent/taxonoccurrence/voucher/*_id/specimenreplicate/catalognumber_dwc/_if[@name=""if indirect voucher""]/then","Not globally unique, only (usually) unique within the institution providing the data (""acronym""). Note that there are nevertheless 11,869 rows where an accession_number appears multiple times within the same institution. [1]
76

  
77
[1] Using the following query:
78
-----
79
SELECT acronym, accession_number, count(*)
80
FROM ""REMIB"".""Specimen""
81
GROUP BY acronym, accession_number
82
HAVING count(*) > 1
83
-----"
84
locality,/location/locationnarrative/_merge/1,
85
habitat,"/location/locationnarrative/_merge/3/_label[label=""habitat""]/value","Brad: Free-text description of vegetation community where collected, frequently redundane wrt 'Vegetation'. Bob, Nick: keep as user defined or create special element?"
86
latitude_deg,/location/locationplace/*_id/place/*_id/coordinates/latitude_deg/_dms_to_dd/deg,
87
latitude_min,/location/locationplace/*_id/place/*_id/coordinates/latitude_deg/_dms_to_dd/min,
88
latitude_sec,/location/locationplace/*_id/place/*_id/coordinates/latitude_deg/_dms_to_dd/sec,
89
longitude_deg,/location/locationplace/*_id/place/*_id/coordinates/longitude_deg/_dms_to_dd/deg,
90
longitude_min,/location/locationplace/*_id/place/*_id/coordinates/longitude_deg/_dms_to_dd/min,
91
longitude_sec,/location/locationplace/*_id/place/*_id/coordinates/longitude_deg/_dms_to_dd/sec,
92
country,/location/locationplace/*_id/place/country,
93
county,/location/locationplace/*_id/place/county,
94
latitude_deg,"/location/locationplace/*_id/place/matched_place_id/place:[.,source_id/source/shortname=geoscrub]/*_id/coordinates:[source_id/source/shortname=geoscrub]/_first/2/latitude_deg/_dms_to_dd/deg",
95
latitude_min,"/location/locationplace/*_id/place/matched_place_id/place:[.,source_id/source/shortname=geoscrub]/*_id/coordinates:[source_id/source/shortname=geoscrub]/_first/2/latitude_deg/_dms_to_dd/min",
96
latitude_sec,"/location/locationplace/*_id/place/matched_place_id/place:[.,source_id/source/shortname=geoscrub]/*_id/coordinates:[source_id/source/shortname=geoscrub]/_first/2/latitude_deg/_dms_to_dd/sec",
97
longitude_deg,"/location/locationplace/*_id/place/matched_place_id/place:[.,source_id/source/shortname=geoscrub]/*_id/coordinates:[source_id/source/shortname=geoscrub]/_first/2/longitude_deg/_dms_to_dd/deg",
98
longitude_min,"/location/locationplace/*_id/place/matched_place_id/place:[.,source_id/source/shortname=geoscrub]/*_id/coordinates:[source_id/source/shortname=geoscrub]/_first/2/longitude_deg/_dms_to_dd/min",
99
longitude_sec,"/location/locationplace/*_id/place/matched_place_id/place:[.,source_id/source/shortname=geoscrub]/*_id/coordinates:[source_id/source/shortname=geoscrub]/_first/2/longitude_deg/_dms_to_dd/sec",
100
country,/location/locationplace/*_id/place/matched_place_id/place:[source_id/source/shortname=geoscrub]/country/_first/2,
101
county,/location/locationplace/*_id/place/matched_place_id/place:[source_id/source/shortname=geoscrub]/county/_first/2,
102
stateProvince,/location/locationplace/*_id/place/matched_place_id/place:[source_id/source/shortname=geoscrub]/stateprovince/_first/2,
103
stateProvince,/location/locationplace/*_id/place/stateprovince,
104
country,,** No join mapping for country_verbatim ** 
105
county,,** No join mapping for county_verbatim ** 
106
habitat,,** No join mapping for habitat_verbatim ** 
107
lat_deg,,** No join mapping for latitude_deg_verbatim ** 
108
lat_min,,** No join mapping for latitude_min_verbatim ** 
109
lat_sec,,** No join mapping for latitude_sec_verbatim ** 
110
locality,,** No join mapping for locality_verbatim ** 
111
long_deg,,** No join mapping for longitude_deg_verbatim ** 
112
long_min,,** No join mapping for longitude_min_verbatim ** 
113
long_sec,,** No join mapping for longitude_sec_verbatim ** 
114
preparation,,** No join mapping for preparations ** 
115
row_num,,** No join mapping for *row_num ** 
116
state,,** No join mapping for stateProvince_verbatim ** 
1
link ../../../mappings/VegCore-VegBIEN.csv
117 2

  
inputs/REMIB/run
1
#!/bin/bash -e
2
. "$(dirname "${BASH_SOURCE[0]}")"/../../lib/runscripts/datasrc_dir.run
0 3

  

Also available in: Unified diff