1
|
/*
|
2
|
SQL statements to generate tables mapping geonames features at each
|
3
|
relevant administative level (country, state/province, county/parish)
|
4
|
to GADM2 features.
|
5
|
|
6
|
Todo:
|
7
|
* Be tolerant to presence/absence of administrative qualifiers in
|
8
|
names? (Already doing this mildly for County/Co/Co.)
|
9
|
- Municipality
|
10
|
- Barrio
|
11
|
- District
|
12
|
|
13
|
* Get rid of rudundancy
|
14
|
* Try matching against GADM2 varname_* columns?
|
15
|
|
16
|
Jim Regetz
|
17
|
NCEAS
|
18
|
Created Nov 2012
|
19
|
*/
|
20
|
|
21
|
-----------------------
|
22
|
-- Level 0 (Country) --
|
23
|
-----------------------
|
24
|
|
25
|
-- map gadm2 level0 to geonames countries
|
26
|
CREATE TABLE gadm_country_lookup (
|
27
|
countryid integer primary key references geonames (geonameid),
|
28
|
name_0 text
|
29
|
);
|
30
|
INSERT INTO gadm_country_lookup (countryid)
|
31
|
SELECT DISTINCT geonameid
|
32
|
FROM countries
|
33
|
WHERE geonameid IS NOT NULL;
|
34
|
-- INSERT 0 250
|
35
|
-- Time: 20.770 ms
|
36
|
UPDATE gadm_country_lookup lu
|
37
|
SET name_0 = gadm.name_0
|
38
|
FROM (SELECT DISTINCT iso, name_0 FROM gadm2) gadm,
|
39
|
countries c
|
40
|
WHERE lu.countryid = c.geonameid
|
41
|
AND c.iso3 = gadm.iso;
|
42
|
-- UPDATE 248
|
43
|
-- Time: 3562.601 ms
|
44
|
|
45
|
------------------------------
|
46
|
-- Level 1 (State/Province) --
|
47
|
------------------------------
|
48
|
|
49
|
-- map gadm2 level1 to geonames ADM1
|
50
|
CREATE TABLE gadm_stateprovince_lookup (
|
51
|
stateprovinceid integer primary key references geonames (geonameid),
|
52
|
name_0 text,
|
53
|
name_1 text
|
54
|
);
|
55
|
INSERT INTO gadm_stateprovince_lookup (stateprovinceid)
|
56
|
SELECT DISTINCT geonameid
|
57
|
FROM geonames
|
58
|
WHERE featurecode='ADM1';
|
59
|
-- INSERT 0 3841
|
60
|
-- Time: 7085.635 ms
|
61
|
|
62
|
|
63
|
-- try matching against alternatenames table
|
64
|
UPDATE gadm_stateprovince_lookup gs
|
65
|
SET name_0 = gadm.name_0,
|
66
|
name_1 = gadm.name_1
|
67
|
FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm,
|
68
|
alternatenames a,
|
69
|
geonames g,
|
70
|
hierarchy h,
|
71
|
gadm_country_lookup gc
|
72
|
WHERE lower(gadm.name_1)=lower(alternatename)
|
73
|
AND gs.stateprovinceid = a.geonameid
|
74
|
AND a.geonameid = g.geonameid
|
75
|
AND gs.stateprovinceid = h.childid
|
76
|
AND h.parentid = gc.countryid
|
77
|
AND gc.name_0 = gadm.name_0
|
78
|
AND g.featurecode='ADM1'
|
79
|
AND gs.name_1 IS NULL;
|
80
|
-- UPDATE 2145
|
81
|
-- Time: 3453.568 ms
|
82
|
|
83
|
-- try matching against geonames (names and alternatenames)
|
84
|
UPDATE gadm_stateprovince_lookup gs
|
85
|
SET name_0 = gadm.name_0,
|
86
|
name_1 = gadm.name_1
|
87
|
FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm,
|
88
|
geonames g,
|
89
|
hierarchy h,
|
90
|
gadm_country_lookup gc
|
91
|
WHERE (lower(gadm.name_1)=lower(g.name)
|
92
|
OR lower(gadm.name_1) =
|
93
|
ANY (string_to_array(lower(g.alternatenames), ',')))
|
94
|
AND gs.stateprovinceid = h.childid
|
95
|
AND h.parentid = gc.countryid
|
96
|
AND gc.name_0 = gadm.name_0
|
97
|
AND gs.stateprovinceid = g.geonameid
|
98
|
AND g.featurecode='ADM1'
|
99
|
AND gs.name_1 IS NULL;
|
100
|
-- UPDATE 319
|
101
|
-- Time: 857.885 ms
|
102
|
|
103
|
|
104
|
-- now again but against our manual mapping
|
105
|
UPDATE gadm_stateprovince_lookup gs
|
106
|
SET name_0 = gadm.name_0,
|
107
|
name_1 = gadm.name_1
|
108
|
FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm,
|
109
|
alt_stateprovince asp,
|
110
|
geonames g,
|
111
|
countries c
|
112
|
WHERE gadm.name_1=asp.alternatename
|
113
|
AND gadm.iso=c.iso3
|
114
|
AND asp.country=c.country
|
115
|
AND gs.stateprovinceid = g.geonameid
|
116
|
AND g.countrycode=c.iso
|
117
|
AND g.name=asp.stateprovince
|
118
|
AND gs.name_0 IS NULL;
|
119
|
-- UPDATE 29
|
120
|
-- Time: 777.376 ms
|
121
|
|
122
|
----------------------
|
123
|
-- Level 2 (County) --
|
124
|
----------------------
|
125
|
|
126
|
-- map gadm2 level2 to geonames ADM2
|
127
|
CREATE TABLE gadm_county_lookup (
|
128
|
countyid integer primary key references geonames (geonameid),
|
129
|
name_0 text,
|
130
|
name_1 text,
|
131
|
name_2 text
|
132
|
);
|
133
|
INSERT INTO gadm_county_lookup (countyid)
|
134
|
SELECT DISTINCT geonameid
|
135
|
FROM geonames
|
136
|
WHERE featurecode='ADM2';
|
137
|
-- INSERT 0 32374
|
138
|
-- Time: 13177.539 ms
|
139
|
|
140
|
-- try matching against geonames (names and alternatenames)
|
141
|
UPDATE gadm_county_lookup gcl
|
142
|
SET name_0 = gadm.name_0,
|
143
|
name_1 = gadm.name_1,
|
144
|
name_2 = gadm.name_2
|
145
|
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
|
146
|
geonames g,
|
147
|
hierarchy h,
|
148
|
gadm_stateprovince_lookup gsl
|
149
|
WHERE (lower(gadm.name_2)=lower(g.name)
|
150
|
OR lower(gadm.name_2) =
|
151
|
ANY (string_to_array(lower(g.alternatenames), ',')))
|
152
|
AND gcl.countyid = g.geonameid
|
153
|
AND gcl.countyid = h.childid
|
154
|
AND h.parentid = gsl.stateprovinceid
|
155
|
AND gsl.name_0 = gadm.name_0
|
156
|
AND gsl.name_1 = gadm.name_1
|
157
|
AND g.featurecode='ADM2';
|
158
|
-- UPDATE 12352
|
159
|
-- Time: 27390.357 ms
|
160
|
|
161
|
-- try matching against alternatenames table
|
162
|
UPDATE gadm_county_lookup gcl
|
163
|
SET name_0 = gadm.name_0,
|
164
|
name_1 = gadm.name_1,
|
165
|
name_2 = gadm.name_2
|
166
|
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
|
167
|
alternatenames a,
|
168
|
hierarchy h,
|
169
|
geonames g,
|
170
|
gadm_stateprovince_lookup gsl
|
171
|
WHERE lower(gadm.name_2)=lower(a.alternatename)
|
172
|
AND gcl.countyid = a.geonameid
|
173
|
AND a.geonameid = g.geonameid
|
174
|
AND gcl.countyid = h.childid
|
175
|
AND h.parentid = gsl.stateprovinceid
|
176
|
AND gsl.name_0 = gadm.name_0
|
177
|
AND gsl.name_1 = gadm.name_1
|
178
|
AND g.featurecode='ADM2'
|
179
|
AND gcl.name_2 IS NULL;
|
180
|
-- UPDATE 0
|
181
|
-- Time: 6340.441 ms
|
182
|
|
183
|
-- map geonames '/Foo/ County' to gadm2 '/Foo/'
|
184
|
-- todo: other mappings like this???
|
185
|
UPDATE gadm_county_lookup gcl
|
186
|
SET name_0 = gadm.name_0,
|
187
|
name_1 = gadm.name_1,
|
188
|
name_2 = gadm.name_2
|
189
|
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
|
190
|
geonames g,
|
191
|
hierarchy h,
|
192
|
gadm_stateprovince_lookup gsl
|
193
|
WHERE lower(gadm.name_2||' County')=lower(g.name)
|
194
|
AND gcl.countyid = g.geonameid
|
195
|
AND gcl.countyid = h.childid
|
196
|
AND h.parentid = gsl.stateprovinceid
|
197
|
AND gsl.name_0 = gadm.name_0
|
198
|
AND gsl.name_1 = gadm.name_1
|
199
|
AND g.featurecode='ADM2'
|
200
|
AND gcl.name_2 IS NULL;
|
201
|
-- UPDATE 3000
|
202
|
-- Time: 22248.393 ms
|
203
|
|
204
|
|
205
|
/*
|
206
|
-- another way to create gadm_stateprovince_lookup, though this won't
|
207
|
-- pick up multiple matches from geonames (good or bad??)
|
208
|
CREATE TABLE gadm_stateprovince_lookup AS
|
209
|
SELECT DISTINCT name_0, name_1
|
210
|
FROM gadm2;
|
211
|
ALTER TABLE gadm_stateprovince_lookup ADD COLUMN stateprovinceid integer;
|
212
|
UPDATE gadm_stateprovince_lookup gs
|
213
|
SET stateprovinceid = g.geonameid
|
214
|
FROM gadm_country_lookup gc,
|
215
|
alternatenames a,
|
216
|
geonames g,
|
217
|
countries c
|
218
|
WHERE gs.name_0=gc.name_0
|
219
|
AND gs.name_1=a.alternatename
|
220
|
AND a.geonameid=g.geonameid
|
221
|
AND g.countrycode=c.iso
|
222
|
AND gc.countryid=c.geonameid
|
223
|
AND g.featurecode='ADM1';
|
224
|
-- UPDATE 2137
|
225
|
-- Time: 600.345 ms
|
226
|
*/
|
227
|
|
228
|
|
229
|
|
230
|
/*
|
231
|
-- these geonameids match multiple gadm2 state/provinces, and that's bad
|
232
|
-- because we don't know which one to use for geovalidation
|
233
|
select stateprovinceid, name_0, array_agg(name_1) as name_1
|
234
|
from gadm_stateprovince_lookup
|
235
|
group by stateprovinceid, name_0
|
236
|
having count(*)>1;
|
237
|
|
238
|
stateprovinceid | name_0 | name_1
|
239
|
-----------------+----------+--------------------------------------
|
240
|
3653890 | Ecuador | {Orellana,Orellana}
|
241
|
453751 | Bulgaria | {Razgrad,Ruse}
|
242
|
1831095 | Cambodia | {"Phnom Penh",Kândal}
|
243
|
1506272 | Russia | {Altay,Gorno-Altay}
|
244
|
128222 | Iran | {Kermanshah,Kordestan}
|
245
|
3457415 | Brazil | {"Mato Grosso do Sul","Mato Grosso"}
|
246
|
170652 | Syria | {Damascus,"Rif Dimashq"}
|
247
|
(7 rows)
|
248
|
|
249
|
-- for now, manually clean up after these ambiguous cases
|
250
|
-- ... this countryid should only apply to Mato Grosso do Sul
|
251
|
DELETE FROM gadm_stateprovince_lookup
|
252
|
WHERE name_0 = 'Brazil'
|
253
|
AND name_1 = 'Mato Grosso'
|
254
|
AND stateprovinceid = '3457415';
|
255
|
*/
|
256
|
|
257
|
/*
|
258
|
-- these gadm2 state/provinces match multiple geonameids, but i'm pretty sure
|
259
|
-- we don't really care in this direction
|
260
|
select iso, name_1, count(*)
|
261
|
from (select distinct iso, name_1 from gadm2) gadm2
|
262
|
left join (
|
263
|
select distinct gadm2.iso,
|
264
|
name_1,
|
265
|
alternatenames.geonameid,
|
266
|
geonames.name
|
267
|
from (select distinct iso, name_1 from gadm2) gadm2
|
268
|
join alternatenames on name_1=alternatename
|
269
|
join geonames using (geonameid)
|
270
|
join countries on countrycode=countries.iso
|
271
|
where featurecode='ADM1'
|
272
|
and gadm2.iso=countries.iso3) foo
|
273
|
using (iso, name_1) group by iso, name_1 having count(*)>1;
|
274
|
|
275
|
iso | name_1 | count
|
276
|
-----+-------------+-------
|
277
|
AZE | Yevlax | 2
|
278
|
BGR | Ruse | 2
|
279
|
BMU | Hamilton | 2
|
280
|
BRA | Mato Grosso | 2
|
281
|
KAZ | Almaty | 2
|
282
|
KHM | Phnom Penh | 2
|
283
|
RUS | Altay | 2
|
284
|
RUS | Moskva | 2
|
285
|
(8 rows)
|
286
|
*/
|
287
|
|