1 |
10245
|
aaronmk
|
-- Calls util.search_path_append with the schema name util.
-- Presumably this adds the util schema to the session search_path so that the
-- unqualified helper calls later in this script (create_if_not_exists,
-- mk_derived_col) resolve -- TODO confirm against the util schema definition.
SELECT util.search_path_append('util');
|
2 |
7249
|
aaronmk
|
|
3 |
12879
|
aaronmk
|
-- Builds a CREATE INDEX statement for the specimen_duplicate_institutions column
-- of the target table (table name spliced in from the psql variable table_str)
-- and hands the statement text to create_if_not_exists.
-- Presumably the helper executes it only when the index is not already present
-- -- TODO confirm. The indexed column is the one filtered by the DELETE
-- statements that follow in this script.
SELECT create_if_not_exists($$CREATE INDEX "Specimen.specimen_duplicate_institutions" ON $$||:table_str||$$ ("specimen_duplicate_institutions")$$);
|
4 |
10339
|
aaronmk
|
|
5 |
10330
|
aaronmk
|
-- remove frameshifted rows
-- Each DELETE below drops rows in which one column holds a value that does not
-- fit that column (presumably because fields were shifted into the wrong
-- columns during extraction -- TODO confirm). The statements are independent
-- filters applied one after another to the same target table.
-- Note on NULLs: the regex operators yield NULL for a NULL input, so rows whose
-- tested column is NULL are never deleted by the regex-based statements.
|
6 |
12879
|
aaronmk
|
-- Drop rows with no value at all in specimen_duplicate_institutions.
DELETE FROM :table WHERE "specimen_duplicate_institutions" IS NULL;
|
7 |
10349
|
aaronmk
|
-- Drop rows whose yearCollected is not exactly four digits in the range
-- 1700-2099 (17xx, 18xx, 19xx or 20xx).
DELETE FROM :table WHERE "yearCollected" !~ '^(?:1[7-9]|20)\d{2}$';
|
8 |
10339
|
aaronmk
|
-- Drop rows whose country_verbatim contains any digit.
DELETE FROM :table WHERE country_verbatim ~ '\d';
|
9 |
|
|
-- Drop rows whose longitude_deg_verbatim contains a letter, except for the
-- literal marker RESTRINGIDO, which is kept here and handled later by
-- map_nulls__coord__longitude.
-- NOTE(review): latitude_deg_verbatim has no equivalent check in this
-- sequence -- confirm that is intentional.
DELETE FROM :table WHERE longitude_deg_verbatim ~ '[[:alpha:]]' AND longitude_deg_verbatim NOT IN ('RESTRINGIDO');
|
10 |
|
|
-- The four statements below drop rows whose minutes/seconds fields contain
-- anything other than digits (so signs, decimal points and letters all cause
-- deletion).
-- NOTE(review): the pattern also accepts the empty string; if empty strings
-- can reach the later double precision cast they would raise an error --
-- confirm they are loaded as NULL.
DELETE FROM :table WHERE longitude_min_verbatim !~ '^\d*$';
|
11 |
|
|
DELETE FROM :table WHERE longitude_sec_verbatim !~ '^\d*$';
|
12 |
|
|
DELETE FROM :table WHERE latitude_min_verbatim !~ '^\d*$';
|
13 |
|
|
DELETE FROM :table WHERE latitude_sec_verbatim !~ '^\d*$';
|
14 |
10330
|
aaronmk
|
|
15 |
7249
|
aaronmk
|
-- Remove institutions that we have direct data for
-- Deletes every row whose specimen_duplicate_institutions value is in the
-- literal list below. As written the active entries are MO and TEX; ARIZ and
-- NY are commented out of the list, so rows for those institutions are kept.
-- The block comment after the closing parenthesis records the query that was
-- used to obtain the candidate provider list; it is not executed. The
-- statement terminator sits on its own line after that block comment.
|
16 |
|
|
DELETE FROM :table
|
17 |
12879
|
aaronmk
|
WHERE "specimen_duplicate_institutions" IN (
|
18 |
7250
|
aaronmk
|
-- Comments are from e-mail from Brad Boyle on 2013-1-16
|
19 |
|
|
'MO' -- "all MO records in REMIB are also available from MO's own website"
|
20 |
|
|
--, 'ARIZ' -- Some REMIB ARIZ specimens not yet in ARIZ itself
|
21 |
|
|
--, 'NY' -- Some REMIB NY specimens not yet in NY itself
|
22 |
9502
|
aaronmk
|
, 'TEX'
|
23 |
7249
|
aaronmk
|
)
|
24 |
9502
|
aaronmk
|
/* list obtained using the following on r9459:
|
25 |
9501
|
aaronmk
|
SELECT DISTINCT dataprovider
|
26 |
7249
|
aaronmk
|
FROM sourcelist
|
27 |
9501
|
aaronmk
|
JOIN provider_count ON provider_count.dataprovider = sourcelist.name
|
28 |
12516
|
aaronmk
|
WHERE source_id = (SELECT source_by_shortname('REMIB'))
|
29 |
9501
|
aaronmk
|
ORDER BY dataprovider
|
30 |
7249
|
aaronmk
|
*/
|
31 |
|
|
;
|
32 |
10339
|
aaronmk
|
|
33 |
|
|
|
34 |
10377
|
aaronmk
|
-- map_nulls() derived cols
|
35 |
|
|
-- runtime: 7.5 min ("real 7m27.083s") @vegbiendev
|
36 |
|
|
|
37 |
10339
|
aaronmk
|
-- map_nulls__text(value text) -> text
-- Thin wrapper that passes its argument to util.map_nulls together with the
-- marker list {ND,NA}. Presumably util.map_nulls returns NULL when the value
-- equals one of the listed markers and returns the value unchanged otherwise
-- -- TODO confirm against the util schema.
-- Parameters:
--   value  verbatim text from the source column (referenced as the first
--          positional argument inside the body).
-- Returns: text.
-- Declared as an IMMUTABLE sql-language function with COST 100; it is used as
-- the expression for the text derived columns created further down.
CREATE OR REPLACE FUNCTION map_nulls__text(value text)
|
38 |
|
|
RETURNS text AS
|
39 |
|
|
$BODY$
|
40 |
|
|
-- "ND = no disponible = not available" (https://projects.nceas.ucsb.edu/nceas/projects/bien/wiki/Spot-checking#REMIB)
|
41 |
|
|
SELECT util.map_nulls('{ND,NA}', $1)
|
42 |
|
|
$BODY$
|
43 |
10361
|
aaronmk
|
LANGUAGE sql IMMUTABLE
|
44 |
10339
|
aaronmk
|
COST 100;
|
45 |
|
|
|
46 |
10376
|
aaronmk
|
-- One mk_derived_col call per text column. Each call passes a
-- (table, new column name) pair built from the psql variable table_str and an
-- expression that applies map_nulls__text to the matching *_verbatim column.
-- Presumably mk_derived_col adds the named column to the table and fills it
-- from the expression -- TODO confirm against the helper definition.
-- The trailing comments record the measured runtime of each call.
SELECT mk_derived_col((:table_str, 'country' ), $$map_nulls__text("country_verbatim" )$$); -- runtime: 35 s ("Time: 35690.797 ms") @vegbiendev
|
47 |
|
|
SELECT mk_derived_col((:table_str, 'stateProvince'), $$map_nulls__text("stateProvince_verbatim")$$); -- runtime: 35 s ("Time: 36074.430 ms") @vegbiendev
|
48 |
|
|
SELECT mk_derived_col((:table_str, 'county' ), $$map_nulls__text("county_verbatim" )$$); -- runtime: 35 s ("Time: 36096.911 ms") @vegbiendev
|
49 |
|
|
SELECT mk_derived_col((:table_str, 'locality' ), $$map_nulls__text("locality_verbatim" )$$); -- runtime: 35 s ("Time: 36076.364 ms") @vegbiendev
|
50 |
|
|
SELECT mk_derived_col((:table_str, 'habitat' ), $$map_nulls__text("habitat_verbatim" )$$); -- runtime: 35 s ("Time: 35481.828 ms") @vegbiendev
|
51 |
10339
|
aaronmk
|
|
52 |
|
|
|
53 |
|
|
-- map_nulls__coord__longitude(value text) -> double precision
-- Converts a verbatim coordinate string to double precision in two steps:
--   1. the text is passed to util.map_nulls with the marker list {RESTRINGIDO}
--      and the result is cast to double precision;
--   2. that number is passed to util.map_nulls again with the marker list
--      {-999,999,1000}.
-- Presumably util.map_nulls yields NULL for a value found in its marker list
-- -- TODO confirm. The text marker has to be handled before the cast because
-- RESTRINGIDO is not a valid numeric literal.
-- Parameters:
--   value  verbatim coordinate text (referenced as the first positional
--          argument inside the body). Any other non-numeric text would make
--          the cast fail.
-- Returns: double precision.
-- Declared as an IMMUTABLE sql-language function with COST 100.
CREATE OR REPLACE FUNCTION map_nulls__coord__longitude(value text)
|
54 |
|
|
RETURNS double precision AS
|
55 |
|
|
$BODY$
|
56 |
|
|
-- TODO: sometimes also +-99, but not when min/sec are valid
|
57 |
|
|
SELECT util.map_nulls('{-999,999,1000}',
|
58 |
|
|
util.map_nulls('{RESTRINGIDO}', $1)::double precision)
|
59 |
|
|
$BODY$
|
60 |
10361
|
aaronmk
|
LANGUAGE sql IMMUTABLE
|
61 |
10339
|
aaronmk
|
COST 100;
|
62 |
|
|
|
63 |
|
|
-- map_nulls__coord__other(value text) -> double precision
-- Variant for every coordinate field other than longitude degrees. It first
-- applies map_nulls__coord__longitude to the argument and then passes the
-- result to util.map_nulls with the extra marker list {-99,99}. Those two
-- numbers are legitimate longitude degrees, which is why they are handled only
-- in this variant and not in the longitude one.
-- Parameters:
--   value  verbatim coordinate text (referenced as the first positional
--          argument inside the body).
-- Returns: double precision.
-- Declared as an IMMUTABLE sql-language function with COST 100. Depends on
-- map_nulls__coord__longitude being defined first.
CREATE OR REPLACE FUNCTION map_nulls__coord__other(value text)
|
64 |
|
|
RETURNS double precision AS
|
65 |
|
|
$BODY$
|
66 |
|
|
-- also map nulls that are valid longitudes
|
67 |
|
|
SELECT util.map_nulls('{-99,99}', map_nulls__coord__longitude($1))
|
68 |
|
|
$BODY$
|
69 |
10361
|
aaronmk
|
LANGUAGE sql IMMUTABLE
|
70 |
10339
|
aaronmk
|
COST 100;
|
71 |
|
|
|
72 |
10376
|
aaronmk
|
-- One mk_derived_col call per coordinate component. Each call passes a
-- (table, new column name) pair built from the psql variable table_str and an
-- expression over the matching *_verbatim column.
-- Only longitude_deg uses map_nulls__coord__longitude; the remaining five
-- components (longitude minutes/seconds and all latitude parts) use
-- map_nulls__coord__other, which additionally treats -99 and 99 as markers.
-- Presumably mk_derived_col adds the named column and fills it from the
-- expression -- TODO confirm against the helper definition.
-- The trailing comments record the measured runtime of each call.
SELECT mk_derived_col((:table_str, 'longitude_deg'), $$map_nulls__coord__longitude("longitude_deg_verbatim")$$); -- runtime: 40 s ("Time: 39417.099 ms") @vegbiendev
|
73 |
|
|
SELECT mk_derived_col((:table_str, 'longitude_min'), $$map_nulls__coord__other ("longitude_min_verbatim")$$); -- runtime: 40 s ("Time: 41929.772 ms") @vegbiendev
|
74 |
|
|
SELECT mk_derived_col((:table_str, 'longitude_sec'), $$map_nulls__coord__other ("longitude_sec_verbatim")$$); -- runtime: 40 s ("Time: 42106.205 ms") @vegbiendev
|
75 |
|
|
SELECT mk_derived_col((:table_str, 'latitude_deg' ), $$map_nulls__coord__other ("latitude_deg_verbatim" )$$); -- runtime: 40 s ("Time: 42187.294 ms") @vegbiendev
|
76 |
|
|
SELECT mk_derived_col((:table_str, 'latitude_min' ), $$map_nulls__coord__other ("latitude_min_verbatim" )$$); -- runtime: 40 s ("Time: 43220.851 ms") @vegbiendev
|
77 |
|
|
SELECT mk_derived_col((:table_str, 'latitude_sec' ), $$map_nulls__coord__other ("latitude_sec_verbatim" )$$); -- runtime: 40 s ("Time: 42267.566 ms") @vegbiendev
|