/* SQL statements to generate tables mapping geonames features at each relevant administative level (country, state/province, county/parish) to GADM2 features. Todo: * Be tolerant to presence/absence of administrative qualifiers in names? (Already doing this mildly for County/Co/Co.) - Municipality - Barrio - District * Get rid of rudundancy * Try matching against GADM2 varname_* columns? Jim Regetz NCEAS Created Nov 2012 */ ------------------------------ -- Level 1 (State/Province) -- ------------------------------ BEGIN; TRUNCATE gadm_stateprovince_lookup; -- map gadm2 level1 to geonames ADM1 INSERT INTO gadm_stateprovince_lookup (stateprovinceid) SELECT DISTINCT geonameid FROM geonames WHERE featurecode='ADM1'; -- INSERT 0 3841 -- Time: 7085.635 ms -- try matching against alternatenames table UPDATE gadm_stateprovince_lookup gs SET name_0 = gadm.name_0, name_1 = gadm.name_1 FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm, alternatenames a, geonames g, hierarchy h, gadm_country_lookup gc WHERE lower(gadm.name_1)=lower(alternatename) AND gs.stateprovinceid = a.geonameid AND a.geonameid = g.geonameid AND gs.stateprovinceid = h.childid AND h.parentid = gc.countryid AND gc.name_0 = gadm.name_0 AND g.featurecode='ADM1' AND gs.name_1 IS NULL; -- UPDATE 2145 -- Time: 3453.568 ms -- try matching against geonames (names and alternatenames) UPDATE gadm_stateprovince_lookup gs SET name_0 = gadm.name_0, name_1 = gadm.name_1 FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm, geonames g, hierarchy h, gadm_country_lookup gc WHERE (lower(gadm.name_1)=lower(g.name) OR lower(gadm.name_1) = ANY (string_to_array(lower(g.alternatenames), ','))) AND gs.stateprovinceid = h.childid AND h.parentid = gc.countryid AND gc.name_0 = gadm.name_0 AND gs.stateprovinceid = g.geonameid AND g.featurecode='ADM1' AND gs.name_1 IS NULL; -- UPDATE 319 -- Time: 857.885 ms -- now again but against our manual mapping UPDATE gadm_stateprovince_lookup gs SET name_0 = gadm.name_0, name_1 = gadm.name_1 FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm, alt_stateprovince asp, geonames g, countries c WHERE gadm.name_1=asp.alternatename AND gadm.iso=c.iso3 AND asp.country=c.country AND gs.stateprovinceid = g.geonameid AND g.countrycode=c.iso AND g.name=asp.stateprovince AND gs.name_0 IS NULL; -- UPDATE 29 -- Time: 777.376 ms COMMIT; /* -- these geonameids match multiple gadm2 state/provinces, and that's bad -- because we don't know which one to use for geovalidation select stateprovinceid, name_0, array_agg(name_1) as name_1 from gadm_stateprovince_lookup group by stateprovinceid, name_0 having count(*)>1; stateprovinceid | name_0 | name_1 -----------------+----------+-------------------------------------- 3653890 | Ecuador | {Orellana,Orellana} 453751 | Bulgaria | {Razgrad,Ruse} 1831095 | Cambodia | {"Phnom Penh",Kândal} 1506272 | Russia | {Altay,Gorno-Altay} 128222 | Iran | {Kermanshah,Kordestan} 3457415 | Brazil | {"Mato Grosso do Sul","Mato Grosso"} 170652 | Syria | {Damascus,"Rif Dimashq"} (7 rows) -- for now, manually clean up after these ambiguous cases -- ... this countryid should only apply to Mato Grosso do Sul DELETE FROM gadm_stateprovince_lookup WHERE name_0 = 'Brazil' AND name_1 = 'Mato Grosso' AND stateprovinceid = '3457415'; */ /* -- these gadm2 state/provinces match multiple geonameids, but i'm pretty sure -- we don't really care in this direction select iso, name_1, count(*) from (select distinct iso, name_1 from gadm2) gadm2 left join ( select distinct gadm2.iso, name_1, alternatenames.geonameid, geonames.name from (select distinct iso, name_1 from gadm2) gadm2 join alternatenames on name_1=alternatename join geonames using (geonameid) join countries on countrycode=countries.iso where featurecode='ADM1' and gadm2.iso=countries.iso3) foo using (iso, name_1) group by iso, name_1 having count(*)>1; iso | name_1 | count -----+-------------+------- AZE | Yevlax | 2 BGR | Ruse | 2 BMU | Hamilton | 2 BRA | Mato Grosso | 2 KAZ | Almaty | 2 KHM | Phnom Penh | 2 RUS | Altay | 2 RUS | Moskva | 2 (8 rows) */