/* Misc notes on how well we did scrubbing names. And lots of misc SQL statements for examining various scrubbing outcomes, some of which may not be so relevant anymore as a consequences of other changes. Provisional (and not necessarily fully updated) summary of geoname scrubbing performance on vegbien_geoscrub (= geoscrub_input from vegbien schema public.2012-11-04-07-34-10.r5984) ======================================================================== 1707970 total unique location determinations 32541 distinct on country, stateprovince, county (including all nulls) ======================================================================== * Country * 1485885 assertions 1479338 mapped to geonames (6547 not mapped to geonames) 1479338 mapped to gadm2 (6547 not mapped to gadm2) (0 cases of failed mapping to gadm2 name_0) 1479338 mapped both to geonames and to gadm2 ======================================================================== * Country with State/Province * 1408955 assertions 1388525 mapped to geonames (20430 not mapped to geonames) (5576 of these had unmapped country) 1389239 mapped to gadm2 (19716 not mapped to gadm2) (5576 of these had unmapped country) 1387915 mapped both to geonames and to gadm2 ======================================================================== * Country with State/Province with County/Parish* 276809 assertions 145767 mapped to geonames 213795 mapped to gadm2 140983 mapped both to geonames and to gadm2 ======================================================================== ======================================================================== Distinct country names: 439 total unique country names (non-null) 377 recognized unique country names (non-null) 62 unrecognized unique country names (non-null) ======================================================================== Distinct country+stateprovince names: 4764 total unique country and stateprovince (stateprovince non-null) 4704 total unique country and stateprovince (both non-null) 2944 mapped to geonames (1760 not mapped to geonames) 2950 mapped to gadm2 (1754 not mapped to gadm2) 2885 mapped both to geonames and to gadm2 (1819 not mapped to both) ------------------------------------------------------------------------ * Distinct mappedcountry+stateprovince names * 3312 total unique 1895 mapped to geonames (1417 not mapped to geonames) 1901 mapped to gadm2 (1411 not mapped to gadm2) ======================================================================== Distinct country+stateprovince+county names: 27440 total unique country and stateprovince and county (county non-null) 25764 total unique country and stateprovince and county (all non-null) 8507 mapped to geonames (17257 not mapped to geonames) 12610 mapped to gadm2 (13154 not mapped to gadm2) 4917 mapped both to geonames and to gadm2 (20847 not mapped to both) ------------------------------------------------------------------------ * Distinct mappedcountry+mappedstateprovince+county names * 18715 total unique 6673 mapped to geonames (12042 not mapped to geonames) 11310 mapped to gadm2 (7405 not mapped to gadm2) ======================================================================== for comparison, here are bien2 stats based on boyle/donoghue scrubbing: 753302 total unique location determinations have asserted country 745321 have scrubbed country have asserted country and stateprovince have scrubbed country and stateprovince have asserted country and stateprovince and county have scrubbed country and stateprovince and county Jim Regetz NCEAS Created Nov 2012 */ -- Generate human-readable view of all distinct cases where we haven't -- obtained a match for the asserted country CREATE TEMP VIEW unmatched_country AS SELECT count(*), country FROM (SELECT CASE WHEN length(countryutf8)>4 THEN initcap(countryutf8) ELSE countryutf8 END AS country FROM scrubbed JOIN vcountry USING (country) WHERE scrubbed.country IS NOT NULL AND scrubbed.countryid IS NULL) summary GROUP BY country ORDER BY count DESC; -- Generate human-readable view of all distinct cases where we have -- obtained match for the asserted country, but not stateprovince CREATE TEMP VIEW unmatched_stateprovince AS SELECT count(*), country, stateprovince FROM (SELECT countries.country AS country, CASE WHEN length(stateprovinceutf8)>3 THEN initcap(stateprovinceutf8) ELSE stateprovinceutf8 END AS stateprovince FROM scrubbed LEFT JOIN vstate USING (countryid, stateprovince) LEFT JOIN countries ON vstate.countryid=geonameid WHERE scrubbed.stateprovince IS NOT NULL AND scrubbed.stateprovinceid IS NULL AND scrubbed.country IS NOT NULL) summary GROUP BY country, stateprovince ORDER BY count DESC; /* SELECT DISTINCT countries.country, CASE WHEN length(stateprovinceutf8)>3 THEN initcap(stateprovinceutf8) ELSE stateprovinceutf8 END AS stateprovince FROM scrubbed JOIN vstate USING (countryid, stateprovince) JOIN countries ON vstate.countryid=geonameid WHERE scrubbed.stateprovince IS NOT NULL AND scrubbed.stateprovinceid IS NULL ORDER BY country, stateprovince; */ -- Generate human-readable view of all distinct cases where we have -- obtained match for the asserted country and stateprovince, but not -- county /* CREATE TEMP VIEW unmatched_county AS SELECT DISTINCT countries.country, CASE WHEN length(stateprovinceutf8)>3 THEN initcap(stateprovinceutf8) ELSE stateprovinceutf8 END AS stateprovince FROM scrubbed JOIN vstate USING (countryid, stateprovince) JOIN countries ON vstate.countryid=geonameid WHERE scrubbed.stateprovince IS NOT NULL AND scrubbed.stateprovinceid IS NULL ORDER BY country, stateprovince; */ /* -- which states aren't in gadm2? select distinct iso3, s.name state from scrubbed join countries c on countryid=c.geonameid join geonames s on stateprovinceid=s.geonameid except select iso, name_1 state from gadm2; select country, count(*) from scrubbed s left join gadm_stateprovince_lookup using (stateprovinceid) where name_0 is null and stateprovinceid is not null group by country order by count desc; select name from gadm2 join alternatenames on name_1=alternatename join geonames using (geonameid) join countries on countrycode=countries.iso where featurecode='ADM1' and name_1='North Solomons'; */ /* -- report distinct geonames-scrubbed country+stateprovince pairs that -- we didn't match to any gadm polygons SELECT DISTINCT c.country, g.name FROM countries c JOIN vstate v ON geonameid=countryid JOIN geonames g ON g.geonameid=v.stateprovinceid WHERE stateprovinceid NOT IN ( SELECT stateprovinceid FROM gadm_stateprovince_lookup ) ORDER BY c.country, g.name; -- report failed stateprovince mappings from gadm2 to geonames SELECT countries.country AS countryname, name AS stateprovincename, count(*) FROM scrubbed LEFT JOIN gadm_stateprovince_lookup USING (stateprovinceid) LEFT JOIN countries ON countryid=countries.geonameid LEFT JOIN geonames ON stateprovinceid=geonames.geonameid WHERE stateprovinceid IS NOT NULL AND name_1 IS NULL GROUP BY countryname, stateprovincename ORDER BY countryname, stateprovincename; */ /* select iso, name_1, geonameid as stateid from (select distinct iso, name_1 from gadm2) gadm2 left join ( select distinct gadm2.iso, name_1, alternatenames.geonameid, geonames.name from (select distinct iso, name_1 from gadm2) gadm2 join alternatenames on name_1=alternatename join geonames using (geonameid) join countries on countrycode=countries.iso where featurecode='ADM1' and gadm2.iso=countries.iso3) foo using (iso, name_1); */ -- how did we do? -- country -- percent of unique locations with asserted country names SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct FROM scrubbed WHERE country IS NOT NULL; -- 87.0% -- percent of unique locations with geoname-scrubbed country names SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct FROM scrubbed WHERE countryid IS NOT NULL; -- 86.6% -- percent of unique locations with gadm-matchedl country names SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct FROM scrubbed WHERE countryid IN (SELECT countryid FROM gadm_country_lookup); -- 86.6% -- stateprovince -- note: 212221 records have a stateprovince but no country! those are -- considered missing below -- percent of unique locations with asserted stateprovince names SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct FROM scrubbed WHERE country IS NOT NULL AND stateprovince IS NOT NULL; -- 82.5% -- percent of unique locations with geoname-scrubbed stateprovince names SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct FROM scrubbed WHERE stateprovinceid IS NOT NULL; -- 80.2% -- percent of unique locations with gadm-matchedl stateprovince names SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct FROM scrubbed WHERE stateprovinceid IN (SELECT stateprovinceid FROM gadm_stateprovince_lookup); -- 79.5% -- stateprovince -- percent of unique locations with asserted county names SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct FROM scrubbed WHERE country IS NOT NULL AND stateprovince IS NOT NULL AND county IS NOT NULL; -- 16.2% -- percent of unique locations with geoname-scrubbed county names SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct FROM scrubbed WHERE countyid IS NOT NULL; -- 6.3% -- percent of unique locations with gadm-matchedl county names SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct FROM scrubbed WHERE countyid IN (SELECT countyid FROM gadm_county_lookup); -- tbd SELECT CASE WHEN countryid IS NOT NULL THEN 'matched' ELSE 'unmatched' END AS matches, COUNT(*) FROM vcountry GROUP BY matches; SELECT CASE WHEN stateprovinceid IS NOT NULL THEN 'matched' ELSE 'unmatched' END AS matches, COUNT(*) FROM vstate GROUP BY matches; -- matched | 1770 -- unmatched | 1541 SELECT CASE WHEN countyid IS NOT NULL THEN 'matched' ELSE 'unmatched' END AS matches, COUNT(*) FROM vcounty GROUP BY matches; -- matched | 4295 -- unmatched | 14354 SELECT CASE WHEN country IS NULL THEN 'no country provided' WHEN country IS NOT NULL AND countryid IS NULL THEN 'unrecognized country' ELSE 'scrubbed country' END results, COUNT(*) FROM scrubbed GROUP BY results; SELECT CASE WHEN country IS NULL THEN 'no fully qualified stateprovince provided' WHEN country IS NOT NULL AND stateprovince IS NOT NULL AND stateprovinceid IS NULL THEN 'unrecognized stateprovince' ELSE 'scrubbed stateprovince' END results, COUNT(*) FROM scrubbed GROUP BY results; -- file output SELECT DISTINCT countryid, country, stateprovinceid, stateprovince, countyid, county FROM scrubbed ORDER BY country, stateprovince, county; /* -- look at the whole iso2 vs fips thing SELECT DISTINCT vc.country, i.country iso, f.country fips, stateprovince FROM vcountry vc LEFT JOIN countries i ON vc.country=i.iso LEFT JOIN countries f ON vc.country=f.fips LEFT JOIN scrubbed s ON s.country=vc.country WHERE vc.countryid IS NULL AND i.country IS NOT NULL AND f.country IS NOT NULL ORDER BY vc.country; */