/* SQL statements carrying out geovalidation on vegbien location data. Currently hard-coded to operate on the 'geoscrub' table that is assumed to have been generated by the geonames scrubbing process (see geonames.sql). This table should minimally contain the following columns: decimallatitude (latitude, WGS84 decimal degrees) decimallongitude (longitude, WGS84 decimal degrees) country (original asserted country name) stateprovince (original asserted stateprovince name) county (original asserted county name) countrystd (GADM2-mapped country name) stateprovincestd (GADM2-mapped state/province name) countystd (GADM2-mapped county name) (Note that the geonames.sql script also currently creates columns with geonames.org IDs for country, stateprovince, and county.) After execution of the statements below, the table will contain the following new columns (along with geom and geog columns used by postgis): latlonvalidity (validity score for lat/lon coordinate) countryvalidity (validity score for country) stateprovincevalidity (validity score for stateprovince) countyvalidity (validity score for county) pointcountry (GADM2 country containing the point) pointstateprovince (GADM2 stateprovince containing the point) pointcounty (GADM2 county containing the point) Workflow with actual times (as executed 15-Nov-2012) 12s create point geoms 16s create point geogs 23s index geoms 26s index geogs 37s index countrystd 25s index stateprovincestd 5s vacuum analyze 1s mark missing coords (UPDATE 0) 133s mark valid coords (UPDATE 1702989) 2s mark invalid coords (UPDATE 4981) 17s vacuum analyze 27s mark missing countries (UPDATE 222085) 36s mark non-gadm countries (UPDATE 6547) 1277s mark point in country (UPDATE 1400627) 1491s mark point near country (UPDATE 24626) 573s mark point near country accurate (UPDATE 1182) 0s mark point near Antarctica (UPDATE 0) 2s mark point not in country (UPDATE 54085) 8s vacuum analyze 15s mark missing stateprovinces (UPDATE 299015) 12s mark non-gadm stateprovinces (UPDATE 19716) 920s mark point in stateprovince (UPDATE 1241068) 864s mark point near stateprovince (UPDATE 35685) 424s mark point near stateprovince accurate (UPDATE 1051) 7s mark point not in stateprovince (UPDATE 111282) s vacuum analyze 128s mark missing counties (UPDATE 1431161) 29s mark non-gadm counties (UPDATE 63014) 86s mark point in county (UPDATE 174469) 65s mark point near county (UPDATE 12477) ?s mark point near county accurate (UPDATE ?) 3s mark point not in county (UPDATE 26849) ?s vacuum analyze 890s look up GADM location using given lat/lon (UPDATE 1656709) todo: * make firm decision about whether we should assume here that *std columns contain valid GADM names (or are NULL), or whether we should check that here Jim Regetz NCEAS Created Nov 2012 */ BEGIN; TRUNCATE geoscrub; DROP INDEX IF EXISTS "geoscrub_geom_gist"; DROP INDEX IF EXISTS "geoscrub_geog_gist"; DROP INDEX IF EXISTS "geoscrub_countrystd_idx"; DROP INDEX IF EXISTS "geoscrub_stateprovincestd_idx"; DROP INDEX IF EXISTS "geoscrub_countystd_idx"; ----------------------------- -- put everything together -- ----------------------------- -- reconstitute the whole dang thang -- populate geoscrub from vegbien_geoscrub data INSERT INTO geoscrub SELECT decimallatitude, decimallongitude, country, stateprovince, county, countryid, stateprovinceid, countyid, countrystd, stateprovincestd, countystd FROM vegbien_geoscrub v LEFT JOIN vcountry USING (country) LEFT JOIN vstate USING (countryid, stateprovince) LEFT JOIN vcounty USING (countryid, stateprovinceid, county); -- SELECT 1707970 -- Time: 26172.154 ms -- try to squeeze out a few more direct stateprovince mappings to gadm2 UPDATE geoscrub SET stateprovincestd = name_1 FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm WHERE country IS NOT NULL AND stateprovince IS NOT NULL AND countrystd = name_0 AND stateprovincestd IS NULL AND stateprovince = name_1; -- UPDATE 0 -- Time: 3630.973 ms -- try to squeeze out a few more direct county mappings to gadm2 UPDATE geoscrub SET countystd = name_2 FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm WHERE country IS NOT NULL AND stateprovince IS NOT NULL AND county IS NOT NULL AND countrystd = name_0 AND stateprovincestd = name_1 AND countystd IS NULL AND county = name_2; -- UPDATE 69 -- Time: 3982.715 ms COMMIT; -- generate point geometries from coordinates UPDATE geoscrub SET geom = ST_SetSRID(ST_Point(decimallongitude, decimallatitude), 4326); -- ... create point geographies too ... -- note that this excludes (leaves null) any occurrences with coordinates -- that are not valid decimal degrees UPDATE geoscrub SET geog = geom::geography WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90)); -- create indexes CREATE INDEX "geoscrub_geom_gist" ON geoscrub USING GIST (geom); CREATE INDEX "geoscrub_geog_gist" ON geoscrub USING GIST (geog); CREATE INDEX "geoscrub_countrystd_idx" ON geoscrub (countrystd); CREATE INDEX "geoscrub_stateprovincestd_idx" ON geoscrub (stateprovincestd); CREATE INDEX "geoscrub_countystd_idx" ON geoscrub (countystd); -- may want to do this first, otherwise the query planner wants to do a -- hash join instead of nested loop join, and at least when I use a limit -- statement to time queries on either side of the threshold where it -- switches, the hash join does waaaaay worse SET enable_hashjoin = false; -- -- validate lat/lon coordinates -- VACUUM ANALYZE geoscrub; -- -1 if missing one or both coordinates UPDATE geoscrub o SET latlonvalidity = -1 WHERE ( decimallatitude IS NULL OR decimallongitude IS NULL ); -- 1 if the coordinates falls within the global bounding box UPDATE geoscrub o SET latlonvalidity = 1 WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90)); -- 0 otherwise (have coordinates, but not valid) UPDATE geoscrub o SET latlonvalidity = 0 WHERE o.latlonvalidity IS NULL; -- -- validate country -- VACUUM ANALYZE geoscrub; -- -1 if missing country name UPDATE geoscrub o SET countryvalidity = -1 WHERE o.country IS NULL; -- 0 if a country name is asserted, but not recognized among GADM -- country names UPDATE geoscrub o SET countryvalidity = 0 WHERE (o.countrystd IS NULL OR NOT EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 ) ) AND o.countryvalidity IS NULL; -- 3 if the point is in (or on the border of) the asserted country UPDATE geoscrub AS o SET countryvalidity = 3 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND ST_Intersects(o.geom, c.simple_geom) ) AND o.countryvalidity IS NULL; -- 2 for those that aren't 3's, but the point is in within 5km of -- the asserted country UPDATE geoscrub o SET countryvalidity = 2 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND ST_DWithin(o.geog, c.simple_geog, 5000) ) AND o.countryvalidity IS NULL; -- now recheck the 2's to see if any are within the country based on the -- original (unsimplified) geometry, and if so, upgrade to 3 UPDATE geoscrub AS o SET countryvalidity = 3 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND ST_Intersects(o.geom, c.geom) ) AND o.countryvalidity = 2; -- also handle special case Antartica, which has invalid geography in -- gadm2 and thus wasn't included in the within-5km check UPDATE geoscrub AS o SET countryvalidity = 3 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND ST_Intersects(o.geom, c.geom) ) AND o.countryvalidity < 3 AND o.countrystd = 'Antarctica'; -- 1 otherwise (have recognized country name, but point is not within -- 5km of the asserted country) UPDATE geoscrub o SET countryvalidity = 1 WHERE o.countryvalidity IS NULL; ----------------------------- -- validate state/province -- ----------------------------- VACUUM ANALYZE geoscrub; -- -1 if missing stateprovince name UPDATE geoscrub o SET stateprovincevalidity = -1 WHERE o.country IS NULL OR o.stateprovince IS NULL; -- 0 if a stateprovince name is asserted, but not recognized among GADM -- stateprovince names UPDATE geoscrub o SET stateprovincevalidity = 0 WHERE (o.countrystd IS NULL OR o.stateprovincestd IS NULL OR NOT EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 ) ) AND o.stateprovincevalidity IS NULL; -- 3 if the point is in (or on the border of) the asserted stateprovince UPDATE geoscrub AS o SET stateprovincevalidity = 3 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 AND ST_Intersects(o.geom, c.simple_geom) ) AND o.countryvalidity = 3 AND o.stateprovincevalidity IS NULL; -- for those that aren't 3's, set to 2 if the point is in within 5km of -- the asserted stateprovince UPDATE geoscrub o SET stateprovincevalidity = 2 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 AND ST_DWithin(o.geog, c.simple_geog, 5000) ) AND (o.countryvalidity = 2 OR o.countryvalidity = 3) AND o.stateprovincevalidity IS NULL; -- now recheck the 2's to see if any are within the stateprovince based -- on the original (unsimplified) geometry, and if so, upgrade to 3 UPDATE geoscrub AS o SET stateprovincevalidity = 3 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 AND ST_Intersects(o.geom, c.geom) ) AND o.stateprovincevalidity = 2; -- 1 otherwise (have recognized stateprovince name, but point is not within -- 5km of the asserted stateprovince) UPDATE geoscrub o SET stateprovincevalidity = 1 WHERE o.stateprovincevalidity IS NULL; ----------------------------- -- validate county/parish -- ----------------------------- -- -1 if missing county name UPDATE geoscrub o SET countyvalidity = -1 WHERE o.country IS NULL OR o.stateprovince IS NULL OR o.county IS NULL; -- 0 if a county name is asserted, but not recognized among GADM -- county names UPDATE geoscrub o SET countyvalidity = 0 WHERE (o.countrystd IS NULL OR o.stateprovincestd IS NULL OR o.countystd IS NULL OR NOT EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 AND o.countystd=c.name_2 ) ) AND o.countyvalidity IS NULL; -- 3 if the point is in (or on the border of) the asserted county UPDATE geoscrub AS o SET countyvalidity = 3 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 AND o.countystd=c.name_2 AND ST_Intersects(o.geom, c.simple_geom) ) AND o.stateprovincevalidity = 3 AND o.countyvalidity IS NULL; -- for those that aren't 3's, set to 2 if the point is in within 5km of -- the asserted county UPDATE geoscrub o SET countyvalidity = 2 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 AND o.countystd=c.name_2 AND ST_DWithin(o.geog, c.simple_geog, 5000) ) AND (o.stateprovincevalidity = 2 OR o.stateprovincevalidity = 3) AND o.countyvalidity IS NULL; -- 1 otherwise (have recognized county name, but point is not within -- 5km of the asserted county) UPDATE geoscrub o SET countyvalidity = 1 WHERE o.countyvalidity IS NULL; -- -- Look up GADM place names based purely on coordinates for all points -- -- note: only using simplified geometry now, for speed reasons UPDATE geoscrub o SET pointcountry = c.name_0, pointstateprovince = c.name_1, pointcounty = c.name_2 FROM gadm2 c WHERE ST_Intersects(o.geom, c.simple_geom) AND o.latlonvalidity = 1; SET enable_hashjoin = true;