Revision 10339
Added by Aaron Marcuse-Kubitza about 11 years ago
postprocess.sql | ||
---|---|---|
1 | 1 |
SELECT util.search_path_append('util'); |
2 | 2 |
|
3 |
SELECT util.rename_cols(:table_str, $$ |
|
4 |
country => country_verbatim, |
|
5 |
state => stateProvince_verbatim, |
|
6 |
county => county_verbatim, |
|
7 |
locality => locality_verbatim, |
|
8 |
long_deg => longitude_deg_verbatim, |
|
9 |
long_min => longitude_min_verbatim, |
|
10 |
long_sec => longitude_sec_verbatim, |
|
11 |
lat_deg => latitude_deg_verbatim, |
|
12 |
lat_min => latitude_min_verbatim, |
|
13 |
lat_sec => latitude_sec_verbatim, |
|
14 |
habitat => habitat_verbatim, |
|
15 |
$$::hstore); |
|
16 |
|
|
3 | 17 |
SELECT create_if_not_exists($$CREATE INDEX "Specimen.acronym" ON $$||:table_str||$$ (acronym)$$); |
4 | 18 |
|
5 | 19 |
-- remove frameshifted rows |
6 | 20 |
-- rerun time: 6 s ("Time: 5537.211 ms") thanks to index |
7 | 21 |
DELETE FROM :table WHERE acronym IS NULL; |
8 | 22 |
DELETE FROM :table WHERE coll_year !~ '^(?:1[7-9]|20)\d{2}$'; |
9 |
DELETE FROM :table WHERE country ~ '\d'; |
|
10 |
DELETE FROM :table WHERE long_deg ~ '[[:alpha:]]' AND long_deg NOT IN ('RESTRINGIDO');
|
|
11 |
DELETE FROM :table WHERE long_min !~ '^\d*$';
|
|
12 |
DELETE FROM :table WHERE long_sec !~ '^\d*$';
|
|
13 |
DELETE FROM :table WHERE lat_min !~ '^\d*$';
|
|
14 |
DELETE FROM :table WHERE lat_sec !~ '^\d*$';
|
|
23 |
DELETE FROM :table WHERE country_verbatim ~ '\d';
|
|
24 |
DELETE FROM :table WHERE longitude_deg_verbatim ~ '[[:alpha:]]' AND longitude_deg_verbatim NOT IN ('RESTRINGIDO');
|
|
25 |
DELETE FROM :table WHERE longitude_min_verbatim !~ '^\d*$';
|
|
26 |
DELETE FROM :table WHERE longitude_sec_verbatim !~ '^\d*$';
|
|
27 |
DELETE FROM :table WHERE latitude_min_verbatim !~ '^\d*$';
|
|
28 |
DELETE FROM :table WHERE latitude_sec_verbatim !~ '^\d*$';
|
|
15 | 29 |
|
16 | 30 |
-- Remove institutions that we have direct data for |
17 | 31 |
DELETE FROM :table |
... | ... | |
30 | 44 |
ORDER BY dataprovider |
31 | 45 |
*/ |
32 | 46 |
; |
47 |
|
|
48 |
|
|
49 |
CREATE OR REPLACE FUNCTION map_nulls__text(value text) |
|
50 |
RETURNS text AS |
|
51 |
$BODY$ |
|
52 |
-- "ND = no disponible = not available" (https://projects.nceas.ucsb.edu/nceas/projects/bien/wiki/Spot-checking#REMIB) |
|
53 |
SELECT util.map_nulls('{ND,NA}', $1) |
|
54 |
$BODY$ |
|
55 |
LANGUAGE sql IMMUTABLE STRICT |
|
56 |
COST 100; |
|
57 |
|
|
58 |
SELECT mk_derived_col((:table_str, 'country' ), $$map_nulls__text("country_verbatim" )$$); -- runtime: 3 min ("Time: 183171.010 ms") |
|
59 |
SELECT mk_derived_col((:table_str, 'stateProvince'), $$map_nulls__text("stateProvince_verbatim")$$); -- runtime: 2.5 min ("Time: 159935.791 ms") |
|
60 |
SELECT mk_derived_col((:table_str, 'county' ), $$map_nulls__text("county_verbatim" )$$); -- runtime: 2.5 min ("Time: 151076.487 ms") |
|
61 |
SELECT mk_derived_col((:table_str, 'locality' ), $$map_nulls__text("locality_verbatim" )$$); -- runtime: 2.5 min ("Time: 155188.486 ms") |
|
62 |
SELECT mk_derived_col((:table_str, 'habitat' ), $$map_nulls__text("habitat_verbatim" )$$); -- runtime: 2 min ("Time: 131904.068 ms") |
|
63 |
|
|
64 |
|
|
65 |
CREATE OR REPLACE FUNCTION map_nulls__coord__longitude(value text) |
|
66 |
RETURNS double precision AS |
|
67 |
$BODY$ |
|
68 |
-- TODO: sometimes also +-99, but not when min/sec are valid |
|
69 |
SELECT util.map_nulls('{-999,999,1000}', |
|
70 |
util.map_nulls('{RESTRINGIDO}', $1)::double precision) |
|
71 |
$BODY$ |
|
72 |
LANGUAGE sql IMMUTABLE STRICT |
|
73 |
COST 100; |
|
74 |
|
|
75 |
CREATE OR REPLACE FUNCTION map_nulls__coord__other(value text) |
|
76 |
RETURNS double precision AS |
|
77 |
$BODY$ |
|
78 |
-- also map nulls that are valid longitudes |
|
79 |
SELECT util.map_nulls('{-99,99}', map_nulls__coord__longitude($1)) |
|
80 |
$BODY$ |
|
81 |
LANGUAGE sql IMMUTABLE STRICT |
|
82 |
COST 100; |
|
83 |
|
|
84 |
SELECT mk_derived_col((:table_str, 'longitude_deg'), $$map_nulls__coord__longitude("longitude_deg_verbatim")$$); -- runtime: 6 min ("Time: 367070.877 ms") |
|
85 |
SELECT mk_derived_col((:table_str, 'longitude_min'), $$map_nulls__coord__other ("longitude_min_verbatim")$$); |
|
86 |
SELECT mk_derived_col((:table_str, 'longitude_sec'), $$map_nulls__coord__other ("longitude_sec_verbatim")$$); -- runtime: 8 min ("Time: 473198.465 ms") |
|
87 |
SELECT mk_derived_col((:table_str, 'latitude_deg' ), $$map_nulls__coord__other ("latitude_deg_verbatim" )$$); -- runtime: 8.5 min ("Time: 508835.119 ms") |
|
88 |
SELECT mk_derived_col((:table_str, 'latitude_min' ), $$map_nulls__coord__other ("latitude_min_verbatim" )$$); -- runtime: 8 min ("Time: 484806.499 ms") |
|
89 |
SELECT mk_derived_col((:table_str, 'latitude_sec' ), $$map_nulls__coord__other ("latitude_sec_verbatim" )$$); -- runtime: 8.5 min ("Time: 505171.027 ms") |
Also available in: Unified diff
inputs/REMIB/Specimen/: translated single-column filters to postprocessing derived columns, using the steps at wiki.vegpath.org/Switching_to_new-style_import#stage-I-source-specific > "translate single-column filters to postprocessing derived columns". null-mapping filters now use wrappers around new util.map_nulls(). note that the verbatim columns input to the filters need to be renamed to avoid name collisions with their filtered columns, which must be VegCore terms for new-style import.