-- Post-import cleanup for the table identified by the psql variables
-- :table (identifier form) and :table_str (string form).
--
-- Steps, in order:
--   1. index "specimenHolderInstitutions"
--   2. delete rows whose columns look shifted out of place ("frameshifted")
--   3. delete rows from institutions that are loaded directly elsewhere
--   4. add derived columns in which placeholder values are passed through
--      util.map_nulls()
--
-- NOTE(review): util.search_path_append(), create_if_not_exists(),
-- mk_derived_col() and util.map_nulls() are defined outside this file; the
-- descriptions of them below are inferred from their names and call sites and
-- should be confirmed against their definitions.
--
-- IMPORTANT: keep every `--` comment on its own line or at the END of a line.
-- A `--` comment runs to the end of the physical line, so any statement that
-- follows one on the same line is silently skipped.

SELECT util.search_path_append('util');

SELECT create_if_not_exists(
	$$CREATE INDEX "Specimen.specimenHolderInstitutions" ON $$||:table_str||$$ ("specimenHolderInstitutions")$$
);


-- remove frameshifted rows

DELETE FROM :table
WHERE "specimenHolderInstitutions" IS NULL;

-- keep only 4-digit years in the range 1700-2099
DELETE FROM :table
WHERE "yearCollected" !~ '^(?:1[7-9]|20)\d{2}$';

-- a digit in the country name indicates a shifted column
DELETE FROM :table
WHERE country_verbatim ~ '\d';

-- letters in the longitude degrees are only allowed for the RESTRINGIDO marker
DELETE FROM :table
WHERE longitude_deg_verbatim ~ '[[:alpha:]]'
	AND longitude_deg_verbatim NOT IN ('RESTRINGIDO');

-- minutes/seconds must consist solely of digits (or be empty)
DELETE FROM :table
WHERE longitude_min_verbatim !~ '^\d*$';

DELETE FROM :table
WHERE longitude_sec_verbatim !~ '^\d*$';

DELETE FROM :table
WHERE latitude_min_verbatim !~ '^\d*$';

DELETE FROM :table
WHERE latitude_sec_verbatim !~ '^\d*$';


-- Remove institutions that we have direct data for

DELETE FROM :table
WHERE "specimenHolderInstitutions" IN (
	-- Comments are from e-mail from Brad Boyle on 2013-1-16
	  'MO' -- "all MO records in REMIB are also available from MO's own website"
	--, 'ARIZ' -- Some REMIB ARIZ specimens not yet in ARIZ itself
	--, 'NY' -- Some REMIB NY specimens not yet in NY itself
	, 'TEX'
)
/* list obtained using the following on r9459:
SELECT DISTINCT dataprovider
FROM sourcelist
JOIN provider_count ON provider_count.dataprovider = sourcelist.name
WHERE source_id = (SELECT source_by_shortname('REMIB'))
ORDER BY dataprovider
*/
;


-- map_nulls() derived cols

-- runtime: 7.5 min ("real 7m27.083s") @vegbiendev

-- Text columns: passes the value through util.map_nulls() with the
-- placeholder list {ND,NA}.
CREATE OR REPLACE FUNCTION map_nulls__text(value text)
  RETURNS text AS
$BODY$
-- "ND = no disponible = not available" (https://projects.nceas.ucsb.edu/nceas/projects/bien/wiki/Spot-checking#REMIB)
SELECT util.map_nulls('{ND,NA}', $1)
$BODY$
  LANGUAGE sql IMMUTABLE
  COST 100;

SELECT mk_derived_col((:table_str, 'country'      ), $$map_nulls__text("country_verbatim" )$$);
	-- runtime: 35 s ("Time: 35690.797 ms") @vegbiendev
SELECT mk_derived_col((:table_str, 'stateProvince'), $$map_nulls__text("stateProvince_verbatim")$$);
	-- runtime: 35 s ("Time: 36074.430 ms") @vegbiendev
SELECT mk_derived_col((:table_str, 'county'       ), $$map_nulls__text("county_verbatim" )$$);
	-- runtime: 35 s ("Time: 36096.911 ms") @vegbiendev
SELECT mk_derived_col((:table_str, 'locality'     ), $$map_nulls__text("locality_verbatim" )$$);
	-- runtime: 35 s ("Time: 36076.364 ms") @vegbiendev
SELECT mk_derived_col((:table_str, 'habitat'      ), $$map_nulls__text("habitat_verbatim" )$$);
	-- runtime: 35 s ("Time: 35481.828 ms") @vegbiendev

-- Longitude degrees: the RESTRINGIDO marker is mapped on the text value
-- first, the result is cast to double precision, and then the numeric
-- placeholders -999, 999 and 1000 are mapped.
CREATE OR REPLACE FUNCTION map_nulls__coord__longitude(value text)
  RETURNS double precision AS
$BODY$
-- TODO: sometimes also +-99, but not when min/sec are valid
SELECT util.map_nulls('{-999,999,1000}',
	util.map_nulls('{RESTRINGIDO}', $1)::double precision)
$BODY$
  LANGUAGE sql IMMUTABLE
  COST 100;

-- All other coordinate parts: same as map_nulls__coord__longitude(), plus the
-- placeholders -99 and 99, which cannot be mapped for longitude degrees
-- because they are valid longitudes.
CREATE OR REPLACE FUNCTION map_nulls__coord__other(value text)
  RETURNS double precision AS
$BODY$
-- also map nulls that are valid longitudes
SELECT util.map_nulls('{-99,99}', map_nulls__coord__longitude($1))
$BODY$
  LANGUAGE sql IMMUTABLE
  COST 100;

SELECT mk_derived_col((:table_str, 'longitude_deg'), $$map_nulls__coord__longitude("longitude_deg_verbatim")$$);
	-- runtime: 40 s ("Time: 39417.099 ms") @vegbiendev
SELECT mk_derived_col((:table_str, 'longitude_min'), $$map_nulls__coord__other ("longitude_min_verbatim")$$);
	-- runtime: 40 s ("Time: 41929.772 ms") @vegbiendev
SELECT mk_derived_col((:table_str, 'longitude_sec'), $$map_nulls__coord__other ("longitude_sec_verbatim")$$);
	-- runtime: 40 s ("Time: 42106.205 ms") @vegbiendev
SELECT mk_derived_col((:table_str, 'latitude_deg' ), $$map_nulls__coord__other ("latitude_deg_verbatim" )$$);
	-- runtime: 40 s ("Time: 42187.294 ms") @vegbiendev
SELECT mk_derived_col((:table_str, 'latitude_min' ), $$map_nulls__coord__other ("latitude_min_verbatim" )$$);
	-- runtime: 40 s ("Time: 43220.851 ms") @vegbiendev
SELECT mk_derived_col((:table_str, 'latitude_sec' ), $$map_nulls__coord__other ("latitude_sec_verbatim" )$$);
	-- runtime: 40 s ("Time: 42267.566 ms") @vegbiendev