Project

General

Profile

1
SELECT util.search_path_append('util');
2

    
3
SELECT create_if_not_exists($$CREATE INDEX "Specimen.specimenHolderInstitutions" ON $$||:table_str||$$ ("specimenHolderInstitutions")$$);
4

    
5
-- remove frameshifted rows
6
DELETE FROM :table WHERE "specimenHolderInstitutions" IS NULL;
7
DELETE FROM :table WHERE "yearCollected" !~ '^(?:1[7-9]|20)\d{2}$';
8
DELETE FROM :table WHERE country_verbatim ~ '\d';
9
DELETE FROM :table WHERE longitude_deg_verbatim ~ '[[:alpha:]]' AND longitude_deg_verbatim NOT IN ('RESTRINGIDO');
10
DELETE FROM :table WHERE longitude_min_verbatim !~ '^\d*$';
11
DELETE FROM :table WHERE longitude_sec_verbatim !~ '^\d*$';
12
DELETE FROM :table WHERE latitude_min_verbatim  !~ '^\d*$';
13
DELETE FROM :table WHERE latitude_sec_verbatim  !~ '^\d*$';
14

    
15
-- Remove institutions that we have direct data for
16
DELETE FROM :table
17
WHERE "specimenHolderInstitutions" IN (
18
    -- Comments are from e-mail from Brad Boyle on 2013-1-16
19
    'MO' -- "all MO records in REMIB are also available from MO's own website"
20
    --, 'ARIZ' -- Some REMIB ARIZ specimens not yet in ARIZ itself
21
    --, 'NY' -- Some REMIB NY specimens not yet in NY itself
22
    , 'TEX'
23
)
24
/* list obtained using the following on r9459:
25
SELECT DISTINCT dataprovider
26
FROM sourcelist
27
JOIN provider_count ON provider_count.dataprovider = sourcelist.name
28
WHERE source_id = source_by_shortname('REMIB')
29
ORDER BY dataprovider
30
*/
31
;
32

    
33

    
34
CREATE OR REPLACE FUNCTION map_nulls__text(value text)
35
  RETURNS text AS
36
$BODY$
37
-- "ND = no disponible = not available" (https://projects.nceas.ucsb.edu/nceas/projects/bien/wiki/Spot-checking#REMIB)
38
SELECT util.map_nulls('{ND,NA}', $1)
39
$BODY$
40
  LANGUAGE sql IMMUTABLE STRICT
41
  COST 100;
42

    
43
SELECT mk_derived_col((:table_str, 'country'      ), $$map_nulls__text("country_verbatim"      )$$); -- runtime: 3   min ("Time: 183171.010 ms")
44
SELECT mk_derived_col((:table_str, 'stateProvince'), $$map_nulls__text("stateProvince_verbatim")$$); -- runtime: 2.5 min ("Time: 159935.791 ms")
45
SELECT mk_derived_col((:table_str, 'county'       ), $$map_nulls__text("county_verbatim"       )$$); -- runtime: 2.5 min ("Time: 151076.487 ms")
46
SELECT mk_derived_col((:table_str, 'locality'     ), $$map_nulls__text("locality_verbatim"     )$$); -- runtime: 2.5 min ("Time: 155188.486 ms")
47
SELECT mk_derived_col((:table_str, 'habitat'      ), $$map_nulls__text("habitat_verbatim"      )$$); -- runtime: 2   min ("Time: 131904.068 ms")
48

    
49

    
50
CREATE OR REPLACE FUNCTION map_nulls__coord__longitude(value text)
51
  RETURNS double precision AS
52
$BODY$
53
-- TODO: sometimes also +-99, but not when min/sec are valid
54
SELECT util.map_nulls('{-999,999,1000}',
55
       util.map_nulls('{RESTRINGIDO}', $1)::double precision)
56
$BODY$
57
  LANGUAGE sql IMMUTABLE STRICT
58
  COST 100;
59

    
60
CREATE OR REPLACE FUNCTION map_nulls__coord__other(value text)
61
  RETURNS double precision AS
62
$BODY$
63
-- also map nulls that are valid longitudes
64
SELECT util.map_nulls('{-99,99}', map_nulls__coord__longitude($1))
65
$BODY$
66
  LANGUAGE sql IMMUTABLE STRICT
67
  COST 100;
68

    
69
SELECT mk_derived_col((:table_str, 'longitude_deg'), $$map_nulls__coord__longitude("longitude_deg_verbatim")$$); -- runtime: 6   min ("Time: 367070.877 ms")
70
SELECT mk_derived_col((:table_str, 'longitude_min'), $$map_nulls__coord__other    ("longitude_min_verbatim")$$);
71
SELECT mk_derived_col((:table_str, 'longitude_sec'), $$map_nulls__coord__other    ("longitude_sec_verbatim")$$); -- runtime: 8   min ("Time: 473198.465 ms")
72
SELECT mk_derived_col((:table_str, 'latitude_deg' ), $$map_nulls__coord__other    ("latitude_deg_verbatim" )$$); -- runtime: 8.5 min ("Time: 508835.119 ms")
73
SELECT mk_derived_col((:table_str, 'latitude_min' ), $$map_nulls__coord__other    ("latitude_min_verbatim" )$$); -- runtime: 8   min ("Time: 484806.499 ms")
74
SELECT mk_derived_col((:table_str, 'latitude_sec' ), $$map_nulls__coord__other    ("latitude_sec_verbatim" )$$); -- runtime: 8.5 min ("Time: 505171.027 ms")
(7-7/10)