1 |
10707
|
aaronmk
|
/*
|
2 |
|
|
SQL statements to generate tables mapping geonames features at each
|
3 |
|
|
relevant administative level (country, state/province, county/parish)
|
4 |
|
|
to GADM2 features.
|
5 |
|
|
|
6 |
|
|
Todo:
|
7 |
|
|
* Be tolerant to presence/absence of administrative qualifiers in
|
8 |
|
|
names? (Already doing this mildly for County/Co/Co.)
|
9 |
|
|
- Municipality
|
10 |
|
|
- Barrio
|
11 |
|
|
- District
|
12 |
|
|
|
13 |
|
|
* Get rid of rudundancy
|
14 |
|
|
* Try matching against GADM2 varname_* columns?
|
15 |
|
|
|
16 |
|
|
Jim Regetz
|
17 |
|
|
NCEAS
|
18 |
|
|
Created Nov 2012
|
19 |
|
|
*/
|
20 |
|
|
|
21 |
|
|
-----------------------
|
22 |
|
|
-- Level 0 (Country) --
|
23 |
|
|
-----------------------
|
24 |
|
|
|
25 |
|
|
-- map gadm2 level0 to geonames countries
|
26 |
|
|
CREATE TABLE gadm_country_lookup (
|
27 |
|
|
countryid integer primary key references geonames (geonameid),
|
28 |
|
|
name_0 text
|
29 |
|
|
);
|
30 |
|
|
INSERT INTO gadm_country_lookup (countryid)
|
31 |
|
|
SELECT DISTINCT geonameid
|
32 |
|
|
FROM countries
|
33 |
|
|
WHERE geonameid IS NOT NULL;
|
34 |
|
|
-- INSERT 0 250
|
35 |
|
|
-- Time: 20.770 ms
|
36 |
|
|
UPDATE gadm_country_lookup lu
|
37 |
|
|
SET name_0 = gadm.name_0
|
38 |
|
|
FROM (SELECT DISTINCT iso, name_0 FROM gadm2) gadm,
|
39 |
|
|
countries c
|
40 |
|
|
WHERE lu.countryid = c.geonameid
|
41 |
|
|
AND c.iso3 = gadm.iso;
|
42 |
|
|
-- UPDATE 248
|
43 |
|
|
-- Time: 3562.601 ms
|
44 |
|
|
|
45 |
11346
|
psarando
|
CREATE INDEX "gadm_country_lookup_name_0_idx" ON gadm_country_lookup (name_0);
|
46 |
|
|
|
47 |
10707
|
aaronmk
|
------------------------------
|
48 |
|
|
-- Level 1 (State/Province) --
|
49 |
|
|
------------------------------
|
50 |
|
|
|
51 |
|
|
-- map gadm2 level1 to geonames ADM1
|
52 |
|
|
CREATE TABLE gadm_stateprovince_lookup (
|
53 |
|
|
stateprovinceid integer primary key references geonames (geonameid),
|
54 |
|
|
name_0 text,
|
55 |
|
|
name_1 text
|
56 |
|
|
);
|
57 |
|
|
INSERT INTO gadm_stateprovince_lookup (stateprovinceid)
|
58 |
|
|
SELECT DISTINCT geonameid
|
59 |
|
|
FROM geonames
|
60 |
|
|
WHERE featurecode='ADM1';
|
61 |
|
|
-- INSERT 0 3841
|
62 |
|
|
-- Time: 7085.635 ms
|
63 |
|
|
|
64 |
|
|
|
65 |
|
|
-- try matching against alternatenames table
|
66 |
|
|
UPDATE gadm_stateprovince_lookup gs
|
67 |
|
|
SET name_0 = gadm.name_0,
|
68 |
|
|
name_1 = gadm.name_1
|
69 |
|
|
FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm,
|
70 |
|
|
alternatenames a,
|
71 |
|
|
geonames g,
|
72 |
|
|
hierarchy h,
|
73 |
|
|
gadm_country_lookup gc
|
74 |
|
|
WHERE lower(gadm.name_1)=lower(alternatename)
|
75 |
|
|
AND gs.stateprovinceid = a.geonameid
|
76 |
|
|
AND a.geonameid = g.geonameid
|
77 |
|
|
AND gs.stateprovinceid = h.childid
|
78 |
|
|
AND h.parentid = gc.countryid
|
79 |
|
|
AND gc.name_0 = gadm.name_0
|
80 |
|
|
AND g.featurecode='ADM1'
|
81 |
|
|
AND gs.name_1 IS NULL;
|
82 |
|
|
-- UPDATE 2145
|
83 |
|
|
-- Time: 3453.568 ms
|
84 |
|
|
|
85 |
|
|
-- try matching against geonames (names and alternatenames)
|
86 |
|
|
UPDATE gadm_stateprovince_lookup gs
|
87 |
|
|
SET name_0 = gadm.name_0,
|
88 |
|
|
name_1 = gadm.name_1
|
89 |
|
|
FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm,
|
90 |
|
|
geonames g,
|
91 |
|
|
hierarchy h,
|
92 |
|
|
gadm_country_lookup gc
|
93 |
|
|
WHERE (lower(gadm.name_1)=lower(g.name)
|
94 |
|
|
OR lower(gadm.name_1) =
|
95 |
|
|
ANY (string_to_array(lower(g.alternatenames), ',')))
|
96 |
|
|
AND gs.stateprovinceid = h.childid
|
97 |
|
|
AND h.parentid = gc.countryid
|
98 |
|
|
AND gc.name_0 = gadm.name_0
|
99 |
|
|
AND gs.stateprovinceid = g.geonameid
|
100 |
|
|
AND g.featurecode='ADM1'
|
101 |
|
|
AND gs.name_1 IS NULL;
|
102 |
|
|
-- UPDATE 319
|
103 |
|
|
-- Time: 857.885 ms
|
104 |
|
|
|
105 |
|
|
|
106 |
|
|
-- now again but against our manual mapping
|
107 |
|
|
UPDATE gadm_stateprovince_lookup gs
|
108 |
|
|
SET name_0 = gadm.name_0,
|
109 |
|
|
name_1 = gadm.name_1
|
110 |
|
|
FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm,
|
111 |
|
|
alt_stateprovince asp,
|
112 |
|
|
geonames g,
|
113 |
|
|
countries c
|
114 |
|
|
WHERE gadm.name_1=asp.alternatename
|
115 |
|
|
AND gadm.iso=c.iso3
|
116 |
|
|
AND asp.country=c.country
|
117 |
|
|
AND gs.stateprovinceid = g.geonameid
|
118 |
|
|
AND g.countrycode=c.iso
|
119 |
|
|
AND g.name=asp.stateprovince
|
120 |
|
|
AND gs.name_0 IS NULL;
|
121 |
|
|
-- UPDATE 29
|
122 |
|
|
-- Time: 777.376 ms
|
123 |
|
|
|
124 |
|
|
----------------------
|
125 |
|
|
-- Level 2 (County) --
|
126 |
|
|
----------------------
|
127 |
|
|
|
128 |
|
|
-- map gadm2 level2 to geonames ADM2
|
129 |
|
|
CREATE TABLE gadm_county_lookup (
|
130 |
|
|
countyid integer primary key references geonames (geonameid),
|
131 |
|
|
name_0 text,
|
132 |
|
|
name_1 text,
|
133 |
|
|
name_2 text
|
134 |
|
|
);
|
135 |
|
|
INSERT INTO gadm_county_lookup (countyid)
|
136 |
|
|
SELECT DISTINCT geonameid
|
137 |
|
|
FROM geonames
|
138 |
|
|
WHERE featurecode='ADM2';
|
139 |
|
|
-- INSERT 0 32374
|
140 |
|
|
-- Time: 13177.539 ms
|
141 |
|
|
|
142 |
|
|
-- try matching against geonames (names and alternatenames)
|
143 |
|
|
UPDATE gadm_county_lookup gcl
|
144 |
|
|
SET name_0 = gadm.name_0,
|
145 |
|
|
name_1 = gadm.name_1,
|
146 |
|
|
name_2 = gadm.name_2
|
147 |
|
|
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
|
148 |
|
|
geonames g,
|
149 |
|
|
hierarchy h,
|
150 |
|
|
gadm_stateprovince_lookup gsl
|
151 |
|
|
WHERE (lower(gadm.name_2)=lower(g.name)
|
152 |
|
|
OR lower(gadm.name_2) =
|
153 |
|
|
ANY (string_to_array(lower(g.alternatenames), ',')))
|
154 |
|
|
AND gcl.countyid = g.geonameid
|
155 |
|
|
AND gcl.countyid = h.childid
|
156 |
|
|
AND h.parentid = gsl.stateprovinceid
|
157 |
|
|
AND gsl.name_0 = gadm.name_0
|
158 |
|
|
AND gsl.name_1 = gadm.name_1
|
159 |
|
|
AND g.featurecode='ADM2';
|
160 |
|
|
-- UPDATE 12352
|
161 |
|
|
-- Time: 27390.357 ms
|
162 |
|
|
|
163 |
|
|
-- try matching against alternatenames table
|
164 |
|
|
UPDATE gadm_county_lookup gcl
|
165 |
|
|
SET name_0 = gadm.name_0,
|
166 |
|
|
name_1 = gadm.name_1,
|
167 |
|
|
name_2 = gadm.name_2
|
168 |
|
|
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
|
169 |
|
|
alternatenames a,
|
170 |
|
|
hierarchy h,
|
171 |
|
|
geonames g,
|
172 |
|
|
gadm_stateprovince_lookup gsl
|
173 |
|
|
WHERE lower(gadm.name_2)=lower(a.alternatename)
|
174 |
|
|
AND gcl.countyid = a.geonameid
|
175 |
|
|
AND a.geonameid = g.geonameid
|
176 |
|
|
AND gcl.countyid = h.childid
|
177 |
|
|
AND h.parentid = gsl.stateprovinceid
|
178 |
|
|
AND gsl.name_0 = gadm.name_0
|
179 |
|
|
AND gsl.name_1 = gadm.name_1
|
180 |
|
|
AND g.featurecode='ADM2'
|
181 |
|
|
AND gcl.name_2 IS NULL;
|
182 |
|
|
-- UPDATE 0
|
183 |
|
|
-- Time: 6340.441 ms
|
184 |
|
|
|
185 |
|
|
-- map geonames '/Foo/ County' to gadm2 '/Foo/'
|
186 |
|
|
-- todo: other mappings like this???
|
187 |
|
|
UPDATE gadm_county_lookup gcl
|
188 |
|
|
SET name_0 = gadm.name_0,
|
189 |
|
|
name_1 = gadm.name_1,
|
190 |
|
|
name_2 = gadm.name_2
|
191 |
|
|
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
|
192 |
|
|
geonames g,
|
193 |
|
|
hierarchy h,
|
194 |
|
|
gadm_stateprovince_lookup gsl
|
195 |
|
|
WHERE lower(gadm.name_2||' County')=lower(g.name)
|
196 |
|
|
AND gcl.countyid = g.geonameid
|
197 |
|
|
AND gcl.countyid = h.childid
|
198 |
|
|
AND h.parentid = gsl.stateprovinceid
|
199 |
|
|
AND gsl.name_0 = gadm.name_0
|
200 |
|
|
AND gsl.name_1 = gadm.name_1
|
201 |
|
|
AND g.featurecode='ADM2'
|
202 |
|
|
AND gcl.name_2 IS NULL;
|
203 |
|
|
-- UPDATE 3000
|
204 |
|
|
-- Time: 22248.393 ms
|
205 |
|
|
|
206 |
|
|
|
207 |
|
|
/*
|
208 |
|
|
-- another way to create gadm_stateprovince_lookup, though this won't
|
209 |
|
|
-- pick up multiple matches from geonames (good or bad??)
|
210 |
|
|
CREATE TABLE gadm_stateprovince_lookup AS
|
211 |
|
|
SELECT DISTINCT name_0, name_1
|
212 |
|
|
FROM gadm2;
|
213 |
|
|
ALTER TABLE gadm_stateprovince_lookup ADD COLUMN stateprovinceid integer;
|
214 |
|
|
UPDATE gadm_stateprovince_lookup gs
|
215 |
|
|
SET stateprovinceid = g.geonameid
|
216 |
|
|
FROM gadm_country_lookup gc,
|
217 |
|
|
alternatenames a,
|
218 |
|
|
geonames g,
|
219 |
|
|
countries c
|
220 |
|
|
WHERE gs.name_0=gc.name_0
|
221 |
|
|
AND gs.name_1=a.alternatename
|
222 |
|
|
AND a.geonameid=g.geonameid
|
223 |
|
|
AND g.countrycode=c.iso
|
224 |
|
|
AND gc.countryid=c.geonameid
|
225 |
|
|
AND g.featurecode='ADM1';
|
226 |
|
|
-- UPDATE 2137
|
227 |
|
|
-- Time: 600.345 ms
|
228 |
|
|
*/
|
229 |
|
|
|
230 |
|
|
|
231 |
|
|
|
232 |
|
|
/*
|
233 |
|
|
-- these geonameids match multiple gadm2 state/provinces, and that's bad
|
234 |
|
|
-- because we don't know which one to use for geovalidation
|
235 |
|
|
select stateprovinceid, name_0, array_agg(name_1) as name_1
|
236 |
|
|
from gadm_stateprovince_lookup
|
237 |
|
|
group by stateprovinceid, name_0
|
238 |
|
|
having count(*)>1;
|
239 |
|
|
|
240 |
|
|
stateprovinceid | name_0 | name_1
|
241 |
|
|
-----------------+----------+--------------------------------------
|
242 |
|
|
3653890 | Ecuador | {Orellana,Orellana}
|
243 |
|
|
453751 | Bulgaria | {Razgrad,Ruse}
|
244 |
|
|
1831095 | Cambodia | {"Phnom Penh",Kândal}
|
245 |
|
|
1506272 | Russia | {Altay,Gorno-Altay}
|
246 |
|
|
128222 | Iran | {Kermanshah,Kordestan}
|
247 |
|
|
3457415 | Brazil | {"Mato Grosso do Sul","Mato Grosso"}
|
248 |
|
|
170652 | Syria | {Damascus,"Rif Dimashq"}
|
249 |
|
|
(7 rows)
|
250 |
|
|
|
251 |
|
|
-- for now, manually clean up after these ambiguous cases
|
252 |
|
|
-- ... this countryid should only apply to Mato Grosso do Sul
|
253 |
|
|
DELETE FROM gadm_stateprovince_lookup
|
254 |
|
|
WHERE name_0 = 'Brazil'
|
255 |
|
|
AND name_1 = 'Mato Grosso'
|
256 |
|
|
AND stateprovinceid = '3457415';
|
257 |
|
|
*/
|
258 |
|
|
|
259 |
|
|
/*
|
260 |
|
|
-- these gadm2 state/provinces match multiple geonameids, but i'm pretty sure
|
261 |
|
|
-- we don't really care in this direction
|
262 |
|
|
select iso, name_1, count(*)
|
263 |
|
|
from (select distinct iso, name_1 from gadm2) gadm2
|
264 |
|
|
left join (
|
265 |
|
|
select distinct gadm2.iso,
|
266 |
|
|
name_1,
|
267 |
|
|
alternatenames.geonameid,
|
268 |
|
|
geonames.name
|
269 |
|
|
from (select distinct iso, name_1 from gadm2) gadm2
|
270 |
|
|
join alternatenames on name_1=alternatename
|
271 |
|
|
join geonames using (geonameid)
|
272 |
|
|
join countries on countrycode=countries.iso
|
273 |
|
|
where featurecode='ADM1'
|
274 |
|
|
and gadm2.iso=countries.iso3) foo
|
275 |
|
|
using (iso, name_1) group by iso, name_1 having count(*)>1;
|
276 |
|
|
|
277 |
|
|
iso | name_1 | count
|
278 |
|
|
-----+-------------+-------
|
279 |
|
|
AZE | Yevlax | 2
|
280 |
|
|
BGR | Ruse | 2
|
281 |
|
|
BMU | Hamilton | 2
|
282 |
|
|
BRA | Mato Grosso | 2
|
283 |
|
|
KAZ | Almaty | 2
|
284 |
|
|
KHM | Phnom Penh | 2
|
285 |
|
|
RUS | Altay | 2
|
286 |
|
|
RUS | Moskva | 2
|
287 |
|
|
(8 rows)
|
288 |
|
|
*/
|