/* SQL statements to generate tables mapping geonames features at each relevant administative level (country, state/province, county/parish) to GADM2 features. Todo: * Be tolerant to presence/absence of administrative qualifiers in names? (Already doing this mildly for County/Co/Co.) - Municipality - Barrio - District * Get rid of rudundancy * Try matching against GADM2 varname_* columns? Jim Regetz NCEAS Created Nov 2012 */ ----------------------- -- Level 0 (Country) -- ----------------------- -- map gadm2 level0 to geonames countries CREATE TABLE gadm_country_lookup ( countryid integer primary key references geonames (geonameid), name_0 text ); INSERT INTO gadm_country_lookup (countryid) SELECT DISTINCT geonameid FROM countries WHERE geonameid IS NOT NULL; -- INSERT 0 250 -- Time: 20.770 ms UPDATE gadm_country_lookup lu SET name_0 = gadm.name_0 FROM (SELECT DISTINCT iso, name_0 FROM gadm2) gadm, countries c WHERE lu.countryid = c.geonameid AND c.iso3 = gadm.iso; -- UPDATE 248 -- Time: 3562.601 ms ------------------------------ -- Level 1 (State/Province) -- ------------------------------ -- map gadm2 level1 to geonames ADM1 CREATE TABLE gadm_stateprovince_lookup ( stateprovinceid integer primary key references geonames (geonameid), name_0 text, name_1 text ); INSERT INTO gadm_stateprovince_lookup (stateprovinceid) SELECT DISTINCT geonameid FROM geonames WHERE featurecode='ADM1'; -- INSERT 0 3841 -- Time: 7085.635 ms -- try matching against alternatenames table UPDATE gadm_stateprovince_lookup gs SET name_0 = gadm.name_0, name_1 = gadm.name_1 FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm, alternatenames a, geonames g, hierarchy h, gadm_country_lookup gc WHERE lower(gadm.name_1)=lower(alternatename) AND gs.stateprovinceid = a.geonameid AND a.geonameid = g.geonameid AND gs.stateprovinceid = h.childid AND h.parentid = gc.countryid AND gc.name_0 = gadm.name_0 AND g.featurecode='ADM1' AND gs.name_1 IS NULL; -- UPDATE 2145 -- Time: 3453.568 ms -- try matching against geonames (names and alternatenames) UPDATE gadm_stateprovince_lookup gs SET name_0 = gadm.name_0, name_1 = gadm.name_1 FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm, geonames g, hierarchy h, gadm_country_lookup gc WHERE (lower(gadm.name_1)=lower(g.name) OR lower(gadm.name_1) = ANY (string_to_array(lower(g.alternatenames), ','))) AND gs.stateprovinceid = h.childid AND h.parentid = gc.countryid AND gc.name_0 = gadm.name_0 AND gs.stateprovinceid = g.geonameid AND g.featurecode='ADM1' AND gs.name_1 IS NULL; -- UPDATE 319 -- Time: 857.885 ms -- now again but against our manual mapping UPDATE gadm_stateprovince_lookup gs SET name_0 = gadm.name_0, name_1 = gadm.name_1 FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm, alt_stateprovince asp, geonames g, countries c WHERE gadm.name_1=asp.alternatename AND gadm.iso=c.iso3 AND asp.country=c.country AND gs.stateprovinceid = g.geonameid AND g.countrycode=c.iso AND g.name=asp.stateprovince AND gs.name_0 IS NULL; -- UPDATE 29 -- Time: 777.376 ms ---------------------- -- Level 2 (County) -- ---------------------- -- map gadm2 level2 to geonames ADM2 CREATE TABLE gadm_county_lookup ( countyid integer primary key references geonames (geonameid), name_0 text, name_1 text, name_2 text ); INSERT INTO gadm_county_lookup (countyid) SELECT DISTINCT geonameid FROM geonames WHERE featurecode='ADM2'; -- INSERT 0 32374 -- Time: 13177.539 ms -- try matching against geonames (names and alternatenames) UPDATE gadm_county_lookup gcl SET name_0 = gadm.name_0, name_1 = gadm.name_1, name_2 = gadm.name_2 FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm, geonames g, hierarchy h, gadm_stateprovince_lookup gsl WHERE (lower(gadm.name_2)=lower(g.name) OR lower(gadm.name_2) = ANY (string_to_array(lower(g.alternatenames), ','))) AND gcl.countyid = g.geonameid AND gcl.countyid = h.childid AND h.parentid = gsl.stateprovinceid AND gsl.name_0 = gadm.name_0 AND gsl.name_1 = gadm.name_1 AND g.featurecode='ADM2'; -- UPDATE 12352 -- Time: 27390.357 ms -- try matching against alternatenames table UPDATE gadm_county_lookup gcl SET name_0 = gadm.name_0, name_1 = gadm.name_1, name_2 = gadm.name_2 FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm, alternatenames a, hierarchy h, geonames g, gadm_stateprovince_lookup gsl WHERE lower(gadm.name_2)=lower(a.alternatename) AND gcl.countyid = a.geonameid AND a.geonameid = g.geonameid AND gcl.countyid = h.childid AND h.parentid = gsl.stateprovinceid AND gsl.name_0 = gadm.name_0 AND gsl.name_1 = gadm.name_1 AND g.featurecode='ADM2' AND gcl.name_2 IS NULL; -- UPDATE 0 -- Time: 6340.441 ms -- map geonames '/Foo/ County' to gadm2 '/Foo/' -- todo: other mappings like this??? UPDATE gadm_county_lookup gcl SET name_0 = gadm.name_0, name_1 = gadm.name_1, name_2 = gadm.name_2 FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm, geonames g, hierarchy h, gadm_stateprovince_lookup gsl WHERE lower(gadm.name_2||' County')=lower(g.name) AND gcl.countyid = g.geonameid AND gcl.countyid = h.childid AND h.parentid = gsl.stateprovinceid AND gsl.name_0 = gadm.name_0 AND gsl.name_1 = gadm.name_1 AND g.featurecode='ADM2' AND gcl.name_2 IS NULL; -- UPDATE 3000 -- Time: 22248.393 ms /* -- another way to create gadm_stateprovince_lookup, though this won't -- pick up multiple matches from geonames (good or bad??) CREATE TABLE gadm_stateprovince_lookup AS SELECT DISTINCT name_0, name_1 FROM gadm2; ALTER TABLE gadm_stateprovince_lookup ADD COLUMN stateprovinceid integer; UPDATE gadm_stateprovince_lookup gs SET stateprovinceid = g.geonameid FROM gadm_country_lookup gc, alternatenames a, geonames g, countries c WHERE gs.name_0=gc.name_0 AND gs.name_1=a.alternatename AND a.geonameid=g.geonameid AND g.countrycode=c.iso AND gc.countryid=c.geonameid AND g.featurecode='ADM1'; -- UPDATE 2137 -- Time: 600.345 ms */ /* -- these geonameids match multiple gadm2 state/provinces, and that's bad -- because we don't know which one to use for geovalidation select stateprovinceid, name_0, array_agg(name_1) as name_1 from gadm_stateprovince_lookup group by stateprovinceid, name_0 having count(*)>1; stateprovinceid | name_0 | name_1 -----------------+----------+-------------------------------------- 3653890 | Ecuador | {Orellana,Orellana} 453751 | Bulgaria | {Razgrad,Ruse} 1831095 | Cambodia | {"Phnom Penh",Kândal} 1506272 | Russia | {Altay,Gorno-Altay} 128222 | Iran | {Kermanshah,Kordestan} 3457415 | Brazil | {"Mato Grosso do Sul","Mato Grosso"} 170652 | Syria | {Damascus,"Rif Dimashq"} (7 rows) -- for now, manually clean up after these ambiguous cases -- ... this countryid should only apply to Mato Grosso do Sul DELETE FROM gadm_stateprovince_lookup WHERE name_0 = 'Brazil' AND name_1 = 'Mato Grosso' AND stateprovinceid = '3457415'; */ /* -- these gadm2 state/provinces match multiple geonameids, but i'm pretty sure -- we don't really care in this direction select iso, name_1, count(*) from (select distinct iso, name_1 from gadm2) gadm2 left join ( select distinct gadm2.iso, name_1, alternatenames.geonameid, geonames.name from (select distinct iso, name_1 from gadm2) gadm2 join alternatenames on name_1=alternatename join geonames using (geonameid) join countries on countrycode=countries.iso where featurecode='ADM1' and gadm2.iso=countries.iso3) foo using (iso, name_1) group by iso, name_1 having count(*)>1; iso | name_1 | count -----+-------------+------- AZE | Yevlax | 2 BGR | Ruse | 2 BMU | Hamilton | 2 BRA | Mato Grosso | 2 KAZ | Almaty | 2 KHM | Phnom Penh | 2 RUS | Altay | 2 RUS | Moskva | 2 (8 rows) */