/* SQL statements carrying out country name, stateprovince name, and county name scrubbing on vegbien location data. Currently hard-coded to operate on a 'vegbien_geoscrub' table that contains the input for geoscrubbing, with the following columns: decimallatitude (latitude, WGS84 decimal degrees) decimallongitude (longitude, WGS84 decimal degrees) country (original asserted country name) stateprovince (original asserted stateprovince name) county (original asserted county name) (Note that the lat/lon columns are not needed for scrubbing, but are used later for geovalidation.) Misc issues to contemplate and possibly address: * Congo can be either "Democratic Republic of the Congo" (COD) or "Republic of the Congo" (COG), eh? * Vietnam: gadm2 seems to treat provinces as level2 units * Yugoslavia: no longer exists, not in geonames nor gadm2 * Azerbaijan: Yevlakh City and Yevlakh Rayon both ADM1 in geonames, but just Yevlax in gadm2 * England is not a geoname country, only the UK is * the general problem of historical names in cases where countries have either been split, combined, removed (?), or otherwise redefined * Cases with multiple alternativename matches??? Notes: - the vegbien_geoscrub input used during development and testing this scrubbing procedure came from a select distinct on (lat, lon, country, state, province) from analytical_aggregate on vegbiendev Jim Regetz NCEAS Created Nov 2012 */ -- define custom function to pass characters back through latin-1 -- encoding to revert errors; in principle this could introduce new -- errors, but it seems to do more good than harm CREATE OR REPLACE FUNCTION u2L2u(text) RETURNS text AS $$ DECLARE string text; BEGIN string := convert_from(convert($1::bytea, 'UTF8', 'LATIN1'), 'UTF8'); RETURN string; EXCEPTION WHEN data_exception THEN RETURN $1; END; $$ LANGUAGE plpgsql; ------------------- -- scrub country -- ------------------- CREATE TABLE vcountry AS SELECT DISTINCT country FROM vegbien_geoscrub WHERE country IS NOT NULL; -- SELECT 439 -- Time: 862.413 ms ALTER TABLE vcountry ADD COLUMN countryutf8 text; ALTER TABLE vcountry ADD COLUMN countryid integer; -- try to clean up character representations UPDATE vcountry SET countryutf8 = u2L2u(country); -- UPDATE 439 -- Time: 24.706 ms -- try matching against direct country names UPDATE vcountry vc SET countryid = gc.geonameid FROM countries gc WHERE lower(vc.countryutf8) = lower(gc.country); -- UPDATE 219 -- Time: 12.384 ms -- try matching against ISO 3166-1 alpha-3 UPDATE vcountry vc SET countryid = gc.geonameid FROM countries gc WHERE vc.countryutf8=gc.iso3 AND vc.countryid IS NULL; -- UPDATE 1 -- Time: 20.146 ms -- try matching against ISO 3166-1 alpha-2 UPDATE vcountry vc SET countryid = gc.geonameid FROM countries gc WHERE vc.countryutf8 = iso AND vc.countryid IS NULL; -- UPDATE 64 -- Time: 3.378 ms /* -- try matching against FIPS, but not if already matched (i.e., -- the 2-digit ISO match takes preference) UPDATE vcountry vc SET countryid = gc.geonameid FROM countries gc WHERE vc.countryutf8 = fips AND vc.countryid IS NULL; -- UPDATE 1 -- Time: 2.514 ms */ -- try matching against BIEN alt country names UPDATE vcountry vc SET countryid = c.geonameid FROM alt_country a, countries c WHERE vc.countryutf8 = a.alternatename AND a.country = c.country AND vc.countryid IS NULL; -- UPDATE 21 -- Time: 4.769 ms -- try case-insensitive matching on geonames.org 'alternatenames' -- note: this is a little sloppy because the set condition will return -- more than one row if there are more than one matched alternate -- country names, and which one gets used is non-deterministic; however, -- we'll go back and remove those in the next update (specifically for -- the cases where there are more than one *unique* matched countries) UPDATE vcountry vc SET countryid = gc.geonameid FROM alternatenames a, countries gc WHERE lower(vc.countryutf8) = lower(a.alternatename) AND a.geonameid = gc.geonameid AND vc.countryid IS NULL; -- UPDATE 76 -- Time: 22019.592 ms -- go back and remove if altname lookup was ambiguous UPDATE vcountry vc SET countryid = NULL WHERE countryutf8 IN ( SELECT countryutf8 FROM ( SELECT DISTINCT vc.countryutf8, gc.iso3 FROM vcountry vc, alternatenames a, countries gc WHERE lower(vc.countryutf8) = lower(a.alternatename) AND a.geonameid = gc.geonameid) lookup GROUP BY countryutf8 HAVING COUNT(*) > 1); -- UPDATE 2 -- Time: 2296.257 ms -- Removes: Congo, CONGO -- merge in GADM country names (our standard names) ALTER TABLE vcountry ADD COLUMN countrystd text; UPDATE vcountry vc SET countrystd = name_0 FROM gadm_country_lookup gadm0 WHERE vc.countryid=gadm0.countryid; -- UPDATE 377 -- Time: 6.254 ms ------------------------- -- scrub stateprovince -- ------------------------- CREATE TABLE vstate AS SELECT DISTINCT countryid, stateprovince FROM vegbien_geoscrub join vcountry using (country) WHERE countryid IS NOT NULL AND stateprovince IS NOT NULL; -- SELECT 3312 -- Time: 4164.948 ms ALTER TABLE vstate ADD COLUMN stateprovinceutf8 text; ALTER TABLE vstate ADD COLUMN stateprovinceid integer; -- try to clean up character representations UPDATE vstate SET stateprovinceutf8 = u2L2u(stateprovince); -- UPDATE 3312 -- Time: 92.702 ms -- replace common mojibake with valid utf8 replacement character -- note: second pass of u2L2u would do the same thing, but this is a -- little more explicit about the intended effect UPDATE vstate SET stateprovinceutf8 = replace(stateprovinceutf8, '�', chr(65533)); -- manually replace html character codes found in the names -- todo: use some library to do this exhaustively rather than using -- manually specified updates UPDATE vstate SET stateprovinceutf8 = replace(stateprovinceutf8, ''', chr(39)); UPDATE vstate SET stateprovinceutf8 = replace(stateprovinceutf8, 'á', chr(225)); UPDATE vstate SET stateprovinceutf8 = replace(stateprovinceutf8, 'ã', chr(227)); UPDATE vstate SET stateprovinceutf8 = replace(stateprovinceutf8, 'é', chr(233)); UPDATE vstate SET stateprovinceutf8 = replace(stateprovinceutf8, 'é', chr(233)); UPDATE vstate SET stateprovinceutf8 = replace(stateprovinceutf8, 'í', chr(237)); UPDATE vstate SET stateprovinceutf8 = replace(stateprovinceutf8, 'ñ', chr(241)); UPDATE vstate SET stateprovinceutf8 = replace(stateprovinceutf8, 'ó', chr(243)); UPDATE vstate SET stateprovinceutf8 = replace(stateprovinceutf8, 'ô', chr(244)); UPDATE vstate SET stateprovinceutf8 = replace(stateprovinceutf8, 'ü', chr(252)); -- try matching against direct stateprovince names UPDATE vstate vs SET stateprovinceid = gn.geonameid FROM geonames gn, countries c WHERE lower(vs.stateprovinceutf8) = lower(gn.name) AND gn.countrycode=c.iso AND c.geonameid=vs.countryid AND gn.featurecode='ADM1'; -- UPDATE 238 -- Time: 2919.512 ms -- try case-insensitive matching on geonames.org 'alternatenames' -- note: this is a little sloppy because the set condition will return -- more than one row if there are more than one matched alternate -- stateprovince names, and which one gets used is non-deterministic UPDATE vstate vs SET stateprovinceid = gn.geonameid FROM alternatenames a, geonames gn, countries c WHERE lower(vs.stateprovinceutf8) = lower(a.alternatename) AND a.geonameid = gn.geonameid AND gn.countrycode=c.iso AND c.geonameid=vs.countryid AND gn.featurecode='ADM1' AND vs.stateprovinceid IS NULL; -- UPDATE 1281 -- Time: 5229.274 ms -- try doing a 'like' query that ignores replacement characters (�) UPDATE vstate vs SET stateprovinceid = gn.geonameid FROM alternatenames a, geonames gn, countries c WHERE a.alternatename ILIKE regexp_replace(stateprovinceutf8, chr(65533), '_') AND stateprovinceutf8 LIKE '%'||chr(65533)||'%' AND a.geonameid = gn.geonameid AND gn.countrycode=c.iso AND c.geonameid=vs.countryid AND gn.featurecode='ADM1' AND vs.stateprovinceid IS NULL; -- UPDATE 112 -- Time: 6232.106 ms -- try matching against alternatenames column in geonames table UPDATE vstate vs SET stateprovinceid = gn.geonameid FROM geonames gn, countries c WHERE lower(vs.stateprovinceutf8) = ANY (string_to_array(lower(gn.alternatenames), ',')) AND gn.countrycode=c.iso AND c.geonameid=vs.countryid AND gn.featurecode='ADM1' AND vs.stateprovinceid IS NULL; -- UPDATE 220 -- Time: 5053.520 ms -- try matching against BIEN alt stateprovince names UPDATE vstate vs SET stateprovinceid = g.geonameid FROM alt_stateprovince asp, countries c, geonames g WHERE lower(vs.stateprovinceutf8) = lower(asp.alternatename) AND asp.country = c.country AND c.iso = g.countrycode AND asp.stateprovince = g.name AND g.featurecode='ADM1' AND vs.stateprovinceid IS NULL; -- UPDATE 45 -- Time: 3452.274 ms -- todo: the same stuff done for country to deal with Congo-type case -- merge in GADM stateprovince names (our standard names) ALTER TABLE vstate ADD COLUMN stateprovincestd text; UPDATE vstate vs SET stateprovincestd = name_1 FROM gadm_stateprovince_lookup gadm1 WHERE vs.stateprovinceid=gadm1.stateprovinceid; -- UPDATE 1896 -- Time: 23.946 ms -- match any remaining unknowns directly to gadm UPDATE vstate vs SET stateprovincestd = name_1 FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm, vcountry vc WHERE vc.countryid = vs.countryid AND countrystd = name_0 AND lower(stateprovinceutf8) = lower(name_1) AND stateprovincestd IS NULL; -- UPDATE 55 -- Time: 4799.837 ms -- try to recover geonameids for any of these so we can drill down to -- the next level UPDATE vstate vs SET stateprovinceid = gadm1.stateprovinceid FROM gadm_country_lookup gadm0, gadm_stateprovince_lookup gadm1 WHERE vs.countryid = gadm0.countryid AND gadm0.name_0 = gadm1.name_0 AND vs.stateprovincestd = gadm1.name_1 AND vs.stateprovinceid IS NULL; -- UPDATE 0 -- Time: 13.882 ms /* TODO: is this still needed??? -- now reverse map INSERT INTO gadm_stateprovince_lookup select name_0, alternatename as name_1, g.geonameid from vstate vs, alt_stateprovince asp, geonames g, countries c, gadm_country_lookup gc where vs.stateprovinceid=g.geonameid AND asp.stateprovince=g.name AND vs.countryid=c.geonameid AND vs.countryid=gc.countryid AND asp.country=c.country AND vs.stateprovinceid not in (select stateprovinceid from gadm_stateprovince_lookup); -- INSERT 0 4 -- Time: 20.228 ms */ ------------------ -- scrub county -- ------------------ CREATE TABLE vcounty AS SELECT DISTINCT countryid, stateprovinceid, county FROM vegbien_geoscrub JOIN vcountry USING (country) JOIN vstate USING (countryid, stateprovince) WHERE countryid IS NOT NULL AND stateprovinceid IS NOT NULL AND county IS NOT NULL; -- SELECT 18715 -- Time: 3590.725 ms ALTER TABLE vcounty ADD COLUMN countyutf8 text; ALTER TABLE vcounty ADD COLUMN countyid integer; -- try to clean up character representations UPDATE vcounty SET countyutf8 = u2L2u(county); -- UPDATE 18715 -- Time: 683.781 ms -- replace common mojibake with valid utf8 replacement character -- note: second pass of u2L2u would do the same thing, but this is a -- little more explicit about the intended effect UPDATE vcounty SET countyutf8 = replace(countyutf8, '�', chr(65533)); -- UPDATE 18715 -- Time: 153.395 ms -- clean up some common municipality label abbreviations -- todo: other fixes like this; check what brad did for bien2 on this front? UPDATE vcounty SET countyutf8 = regexp_replace(countyutf8, ' Co[.]?$', ' County', 'i'); -- try matching against geonames (names and alternatenames) UPDATE vcounty vc SET countyid = gn.geonameid FROM geonames gn, hierarchy h WHERE vc.stateprovinceid = h.parentid AND h.childid=gn.geonameid AND (lower(vc.countyutf8) = lower(gn.name) OR lower(vc.countyutf8) = ANY (string_to_array(lower(gn.alternatenames), ','))) AND gn.featurecode='ADM2'; -- UPDATE 6673 -- Time: 8369.737 ms -- use alternatenames UPDATE vcounty vc SET countyid = gn.geonameid FROM geonames gn, alternatenames a, hierarchy h WHERE vc.stateprovinceid = h.parentid AND h.childid = gn.geonameid AND a.geonameid = gn.geonameid AND lower(vc.countyutf8) = lower(a.alternatename) AND gn.featurecode='ADM2' AND vc.countyid IS NULL; -- UPDATE 0 -- Time: 11255.926 ms ALTER TABLE vcounty ADD COLUMN countystd text; -- merge in GADM county names (our standard names) -- for cases where we have geonameid UPDATE vcounty vs SET countystd = name_2 FROM gadm_county_lookup gadm1 WHERE vs.countyid=gadm1.countyid AND countystd IS NULL; -- UPDATE 6673 -- Time: 118.378 ms -- match directly to gadm UPDATE vcounty vco SET countystd = name_2 FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm, vcountry vc, vstate vs WHERE vco.countryid = vc.countryid AND vco.stateprovinceid = vs.stateprovinceid AND countrystd = name_0 AND stateprovincestd = name_1 AND lower(countyutf8) = lower(name_2) AND countystd IS NULL; -- UPDATE 5027 -- Time: 1410.377 ms ----------------------------- -- put everything together -- ----------------------------- -- reconstitute the whole dang thang -- new approach CREATE TABLE geoscrub AS SELECT decimallatitude, decimallongitude, country, stateprovince, county, countryid, stateprovinceid, countyid, countrystd, stateprovincestd, countystd FROM vegbien_geoscrub v LEFT JOIN vcountry USING (country) LEFT JOIN vstate USING (countryid, stateprovince) LEFT JOIN vcounty USING (countryid, stateprovinceid, county); -- SELECT 1707970 -- Time: 26172.154 ms -- try to squeeze out a few more direct stateprovince mappings to gadm2 UPDATE geoscrub SET stateprovincestd = name_1 FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm WHERE country IS NOT NULL AND stateprovince IS NOT NULL AND countrystd = name_0 AND stateprovincestd IS NULL AND stateprovince = name_1; -- UPDATE 0 -- Time: 3630.973 ms -- try to squeeze out a few more direct county mappings to gadm2 UPDATE geoscrub SET countystd = name_2 FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm WHERE country IS NOT NULL AND stateprovince IS NOT NULL AND county IS NOT NULL AND countrystd = name_0 AND stateprovincestd = name_1 AND countystd IS NULL AND county = name_2; -- UPDATE 69 -- Time: 3982.715 ms