/* SQL statements carrying out geovalidation on vegbien location data. Currently hard-coded to operate on the 'geoscrub' table that is assumed to have been generated by the geonames scrubbing process (see geonames.sql). This table should minimally contain the following columns: decimallatitude (latitude, WGS84 decimal degrees) decimallongitude (longitude, WGS84 decimal degrees) country (original asserted country name) stateprovince (original asserted stateprovince name) county (original asserted county name) countrystd (GADM2-mapped country name) stateprovincestd (GADM2-mapped state/province name) countystd (GADM2-mapped county name) (Note that the geonames.sql script also currently creates columns with geonames.org IDs for country, stateprovince, and county.) After execution of the statements below, the table will contain the following new columns (along with geom and geog columns used by postgis): latlonvalidity (validity score for lat/lon coordinate) countryvalidity (validity score for country) stateprovincevalidity (validity score for stateprovince) countyvalidity (validity score for county) pointcountry (GADM2 country containing the point) pointstateprovince (GADM2 stateprovince containing the point) pointcounty (GADM2 county containing the point) Workflow with actual times (as executed 15-Nov-2012) 12s create point geoms 16s create point geogs 23s index geoms 26s index geogs 37s index countrystd 25s index stateprovincestd 5s vacuum analyze 1s mark missing coords (UPDATE 0) 133s mark valid coords (UPDATE 1702989) 2s mark invalid coords (UPDATE 4981) 17s vacuum analyze 27s mark missing countries (UPDATE 222085) 36s mark non-gadm countries (UPDATE 6547) 1277s mark point in country (UPDATE 1400627) 1491s mark point near country (UPDATE 24626) 573s mark point near country accurate (UPDATE 1182) 0s mark point near Antarctica (UPDATE 0) 2s mark point not in country (UPDATE 54085) 8s vacuum analyze 15s mark missing stateprovinces (UPDATE 299015) 12s mark non-gadm stateprovinces (UPDATE 19716) 920s mark point in stateprovince (UPDATE 1241068) 864s mark point near stateprovince (UPDATE 35685) 424s mark point near stateprovince accurate (UPDATE 1051) 7s mark point not in stateprovince (UPDATE 111282) s vacuum analyze 128s mark missing counties (UPDATE 1431161) 29s mark non-gadm counties (UPDATE 63014) 86s mark point in county (UPDATE 174469) 65s mark point near county (UPDATE 12477) ?s mark point near county accurate (UPDATE ?) 3s mark point not in county (UPDATE 26849) ?s vacuum analyze 890s look up GADM location using given lat/lon (UPDATE 1656709) todo: * make firm decision about whether we should assume here that *std columns contain valid GADM names (or are NULL), or whether we should check that here Jim Regetz NCEAS Created Nov 2012 */ -- generate point geometries from coordinates SELECT AddGeometryColumn('public', 'geoscrub', 'geom', 4326, 'POINT', 2 ); UPDATE geoscrub SET geom = ST_SetSRID(ST_Point(decimallongitude, decimallatitude), 4326); -- ... create point geographies too ... -- note that this excludes (leaves null) any occurrences with coordinates -- that are not valid decimal degrees ALTER TABLE geoscrub ADD COLUMN geog geography(POINT, 4326); UPDATE geoscrub SET geog = geom::geography WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90)); -- create indexes CREATE INDEX "geoscrub_geom_gist" ON geoscrub USING GIST (geom); CREATE INDEX "geoscrub_geog_gist" ON geoscrub USING GIST (geog); CREATE INDEX "geoscrub_countrystd_idx" ON geoscrub (countrystd); CREATE INDEX "geoscrub_stateprovincestd_idx" ON geoscrub (stateprovincestd); CREATE INDEX "geoscrub_countystd_idx" ON geoscrub (countystd); -- may want to do this first, otherwise the query planner wants to do a -- hash join instead of nested loop join, and at least when I use a limit -- statement to time queries on either side of the threshold where it -- switches, the hash join does waaaaay worse SET enable_hashjoin = false; -- -- validate lat/lon coordinates -- VACUUM ANALYZE geoscrub; ALTER TABLE geoscrub ADD COLUMN latlonvalidity int; -- -1 if missing one or both coordinates UPDATE geoscrub o SET latlonvalidity = -1 WHERE ( decimallatitude IS NULL OR decimallongitude IS NULL ); -- 1 if the coordinates falls within the global bounding box UPDATE geoscrub o SET latlonvalidity = 1 WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90)); -- 0 otherwise (have coordinates, but not valid) UPDATE geoscrub o SET latlonvalidity = 0 WHERE o.latlonvalidity IS NULL; -- -- validate country -- VACUUM ANALYZE geoscrub; ALTER TABLE geoscrub ADD COLUMN countryvalidity int; -- -1 if missing country name UPDATE geoscrub o SET countryvalidity = -1 WHERE o.country IS NULL; -- 0 if a country name is asserted, but not recognized among GADM -- country names UPDATE geoscrub o SET countryvalidity = 0 WHERE (o.countrystd IS NULL OR NOT EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 ) ) AND o.countryvalidity IS NULL; -- 3 if the point is in (or on the border of) the asserted country UPDATE geoscrub AS o SET countryvalidity = 3 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND ST_Intersects(o.geom, c.simple_geom) ) AND o.countryvalidity IS NULL; -- 2 for those that aren't 3's, but the point is in within 5km of -- the asserted country UPDATE geoscrub o SET countryvalidity = 2 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND ST_DWithin(o.geog, c.simple_geog, 5000) ) AND o.countryvalidity IS NULL; -- now recheck the 2's to see if any are within the country based on the -- original (unsimplified) geometry, and if so, upgrade to 3 UPDATE geoscrub AS o SET countryvalidity = 3 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND ST_Intersects(o.geom, c.geom) ) AND o.countryvalidity = 2; -- also handle special case Antartica, which has invalid geography in -- gadm2 and thus wasn't included in the within-5km check UPDATE geoscrub AS o SET countryvalidity = 3 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND ST_Intersects(o.geom, c.geom) ) AND o.countryvalidity < 3 AND o.countrystd = 'Antarctica'; -- 1 otherwise (have recognized country name, but point is not within -- 5km of the asserted country) UPDATE geoscrub o SET countryvalidity = 1 WHERE o.countryvalidity IS NULL; ----------------------------- -- validate state/province -- ----------------------------- VACUUM ANALYZE geoscrub; ALTER TABLE geoscrub ADD COLUMN stateprovincevalidity int; -- -1 if missing stateprovince name UPDATE geoscrub o SET stateprovincevalidity = -1 WHERE o.country IS NULL OR o.stateprovince IS NULL; -- 0 if a stateprovince name is asserted, but not recognized among GADM -- stateprovince names UPDATE geoscrub o SET stateprovincevalidity = 0 WHERE (o.countrystd IS NULL OR o.stateprovincestd IS NULL OR NOT EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 ) ) AND o.stateprovincevalidity IS NULL; -- 3 if the point is in (or on the border of) the asserted stateprovince UPDATE geoscrub AS o SET stateprovincevalidity = 3 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 AND ST_Intersects(o.geom, c.simple_geom) ) AND o.countryvalidity = 3 AND o.stateprovincevalidity IS NULL; -- for those that aren't 3's, set to 2 if the point is in within 5km of -- the asserted stateprovince UPDATE geoscrub o SET stateprovincevalidity = 2 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 AND ST_DWithin(o.geog, c.simple_geog, 5000) ) AND (o.countryvalidity = 2 OR o.countryvalidity = 3) AND o.stateprovincevalidity IS NULL; -- now recheck the 2's to see if any are within the stateprovince based -- on the original (unsimplified) geometry, and if so, upgrade to 3 UPDATE geoscrub AS o SET stateprovincevalidity = 3 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 AND ST_Intersects(o.geom, c.geom) ) AND o.stateprovincevalidity = 2; -- 1 otherwise (have recognized stateprovince name, but point is not within -- 5km of the asserted stateprovince) UPDATE geoscrub o SET stateprovincevalidity = 1 WHERE o.stateprovincevalidity IS NULL; ----------------------------- -- validate county/parish -- ----------------------------- ALTER TABLE geoscrub ADD COLUMN countyvalidity int; -- -1 if missing county name UPDATE geoscrub o SET countyvalidity = -1 WHERE o.country IS NULL OR o.stateprovince IS NULL OR o.county IS NULL; -- 0 if a county name is asserted, but not recognized among GADM -- county names UPDATE geoscrub o SET countyvalidity = 0 WHERE (o.countrystd IS NULL OR o.stateprovincestd IS NULL OR o.countystd IS NULL OR NOT EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 AND o.countystd=c.name_2 ) ) AND o.countyvalidity IS NULL; -- 3 if the point is in (or on the border of) the asserted county UPDATE geoscrub AS o SET countyvalidity = 3 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 AND o.countystd=c.name_2 AND ST_Intersects(o.geom, c.simple_geom) ) AND o.stateprovincevalidity = 3 AND o.countyvalidity IS NULL; -- for those that aren't 3's, set to 2 if the point is in within 5km of -- the asserted county UPDATE geoscrub o SET countyvalidity = 2 WHERE EXISTS ( SELECT 1 FROM gadm2 c WHERE o.countrystd=c.name_0 AND o.stateprovincestd=c.name_1 AND o.countystd=c.name_2 AND ST_DWithin(o.geog, c.simple_geog, 5000) ) AND (o.stateprovincevalidity = 2 OR o.stateprovincevalidity = 3) AND o.countyvalidity IS NULL; -- 1 otherwise (have recognized county name, but point is not within -- 5km of the asserted county) UPDATE geoscrub o SET countyvalidity = 1 WHERE o.countyvalidity IS NULL; -- -- Look up GADM place names based purely on coordinates for all points -- ALTER TABLE geoscrub ADD COLUMN pointcountry text; ALTER TABLE geoscrub ADD COLUMN pointstateprovince text; ALTER TABLE geoscrub ADD COLUMN pointcounty text; -- note: only using simplified geometry now, for speed reasons UPDATE geoscrub o SET pointcountry = c.name_0, pointstateprovince = c.name_1, pointcounty = c.name_2 FROM gadm2 c WHERE ST_Intersects(o.geom, c.simple_geom) AND o.latlonvalidity = 1; SET enable_hashjoin = true;