Project

General

Profile

1 10707 aaronmk
/*
2
  SQL statements to generate tables mapping geonames features at each
3
  relevant administrative level (country, state/province, county/parish)
4
  to GADM2 features.
5
6
  Todo:
7
  * Be tolerant to presence/absence of administrative qualifiers in
8
    names? (Already doing this mildly for County/Co/Co.)
9
    - Municipality
10
    - Barrio
11
    - District
12
13
  * Get rid of redundancy
14
  * Try matching against GADM2 varname_* columns?
15
16
  Jim Regetz
17
  NCEAS
18
  Created Nov 2012
19
*/
20
21
-----------------------
22
-- Level 0 (Country) --
23
-----------------------
24
25
-- Lookup table: geonames country id -> GADM2 level-0 (country) name.
-- countryid is both the primary key and a foreign key into geonames;
-- name_0 is nullable and is filled in by a separate UPDATE.
CREATE TABLE gadm_country_lookup (
    countryid integer,
    name_0    text,
    PRIMARY KEY (countryid),
    FOREIGN KEY (countryid) REFERENCES geonames (geonameid)
);
30
-- Seed the lookup with every distinct, non-NULL geonameid listed in
-- countries. Only countryid is populated here; name_0 is left NULL.
INSERT INTO gadm_country_lookup (countryid)
SELECT DISTINCT c.geonameid
FROM countries AS c
WHERE c.geonameid IS NOT NULL;
-- INSERT 0 250
-- Time: 20.770 ms
36
-- Attach the GADM2 country name to each lookup row: the lookup row is tied
-- to countries by geonameid, and countries is tied to GADM2 by comparing
-- countries.iso3 with gadm2.iso.
UPDATE gadm_country_lookup AS lu
SET name_0 = gadm.name_0
FROM countries AS c
INNER JOIN (
    -- one row per (iso, name_0) pair present in gadm2
    SELECT DISTINCT iso, name_0
    FROM gadm2
) AS gadm
    ON gadm.iso = c.iso3
WHERE c.geonameid = lu.countryid;
-- UPDATE 248
-- Time: 3562.601 ms
44
45
------------------------------
46
-- Level 1 (State/Province) --
47
------------------------------
48
49
-- Lookup table: geonames ADM1 id -> GADM2 level-1 (state/province) names.
-- stateprovinceid is both the primary key and a foreign key into geonames;
-- name_0 / name_1 are nullable and are filled in by the matching passes.
CREATE TABLE gadm_stateprovince_lookup (
    stateprovinceid integer,
    name_0          text,
    name_1          text,
    PRIMARY KEY (stateprovinceid),
    FOREIGN KEY (stateprovinceid) REFERENCES geonames (geonameid)
);
55
-- Seed the lookup with the geonameid of every geonames row whose
-- featurecode is 'ADM1'. The name columns are left NULL.
INSERT INTO gadm_stateprovince_lookup (stateprovinceid)
SELECT DISTINCT g.geonameid
FROM geonames AS g
WHERE g.featurecode = 'ADM1';
-- INSERT 0 3841
-- Time: 7085.635 ms
61
62
63
-- Matching pass: alternatenames table.
-- A still-unmatched ADM1 row gets GADM2 names when
--   * one of its alternatenames entries equals gadm2.name_1, ignoring case, and
--   * its hierarchy parent is a country whose gadm_country_lookup.name_0
--     equals the GADM2 name_0.
-- Conditions that mention the UPDATE target (gs) have to stay in WHERE.
UPDATE gadm_stateprovince_lookup AS gs
SET
    name_0 = gadm.name_0,
    name_1 = gadm.name_1
FROM alternatenames AS a
INNER JOIN geonames AS g
    ON g.geonameid = a.geonameid
INNER JOIN (
    SELECT DISTINCT iso, name_0, name_1
    FROM gadm2
) AS gadm
    ON lower(gadm.name_1) = lower(a.alternatename)
INNER JOIN gadm_country_lookup AS gc
    ON gc.name_0 = gadm.name_0
INNER JOIN hierarchy AS h
    ON h.parentid = gc.countryid
WHERE gs.name_1 IS NULL              -- only rows not matched yet
    AND g.featurecode = 'ADM1'
    AND a.geonameid = gs.stateprovinceid
    AND h.childid = gs.stateprovinceid;
-- UPDATE 2145
-- Time: 3453.568 ms
82
83
-- Matching pass: geonames.name and the comma-separated
-- geonames.alternatenames column.
-- A still-unmatched ADM1 row gets GADM2 names when gadm2.name_1 equals,
-- ignoring case, either the geonames name or one element of the split
-- alternatenames list, and the row's hierarchy parent is a country whose
-- gadm_country_lookup.name_0 equals the GADM2 name_0.
UPDATE gadm_stateprovince_lookup AS gs
SET
    name_0 = gadm.name_0,
    name_1 = gadm.name_1
FROM (
    SELECT DISTINCT iso, name_0, name_1
    FROM gadm2
) AS gadm
INNER JOIN gadm_country_lookup AS gc
    ON gc.name_0 = gadm.name_0
INNER JOIN hierarchy AS h
    ON h.parentid = gc.countryid
INNER JOIN geonames AS g
    ON (
        lower(gadm.name_1) = lower(g.name)
        OR lower(gadm.name_1) = ANY (string_to_array(lower(g.alternatenames), ','))
    )
WHERE gs.name_1 IS NULL              -- only rows not matched yet
    AND g.featurecode = 'ADM1'
    AND g.geonameid = gs.stateprovinceid
    AND h.childid = gs.stateprovinceid;
-- UPDATE 319
-- Time: 857.885 ms
102
103
104
-- Matching pass: manual mapping table (alt_stateprovince).
-- A row whose name_0 is still NULL gets GADM2 names when alt_stateprovince
-- links its geonames name (asp.stateprovince) to a GADM2 name_1
-- (asp.alternatename) within the same country. The country is resolved
-- through countries: asp.country = c.country, gadm2.iso = c.iso3 and
-- geonames.countrycode = c.iso. Comparisons here are case-sensitive.
UPDATE gadm_stateprovince_lookup AS gs
SET
    name_0 = gadm.name_0,
    name_1 = gadm.name_1
FROM alt_stateprovince AS asp
INNER JOIN countries AS c
    ON c.country = asp.country
INNER JOIN (
    SELECT DISTINCT iso, name_0, name_1
    FROM gadm2
) AS gadm
    ON gadm.iso = c.iso3
    AND gadm.name_1 = asp.alternatename
INNER JOIN geonames AS g
    ON g.countrycode = c.iso
    AND g.name = asp.stateprovince
WHERE gs.name_0 IS NULL              -- only rows not matched yet
    AND g.geonameid = gs.stateprovinceid;
-- UPDATE 29
-- Time: 777.376 ms
121
122
----------------------
123
-- Level 2 (County) --
124
----------------------
125
126
-- Lookup table: geonames ADM2 id -> GADM2 level-2 (county) names.
-- countyid is both the primary key and a foreign key into geonames;
-- name_0 / name_1 / name_2 are nullable and are filled in by the
-- matching passes.
CREATE TABLE gadm_county_lookup (
    countyid integer,
    name_0   text,
    name_1   text,
    name_2   text,
    PRIMARY KEY (countyid),
    FOREIGN KEY (countyid) REFERENCES geonames (geonameid)
);
133
-- Seed the lookup with the geonameid of every geonames row whose
-- featurecode is 'ADM2'. The name columns are left NULL.
INSERT INTO gadm_county_lookup (countyid)
SELECT DISTINCT g.geonameid
FROM geonames AS g
WHERE g.featurecode = 'ADM2';
-- INSERT 0 32374
-- Time: 13177.539 ms
139
140
-- Matching pass: geonames.name and the comma-separated
-- geonames.alternatenames column.
-- An ADM2 row gets GADM2 names when gadm2.name_2 equals, ignoring case,
-- either the geonames name or one element of the split alternatenames list,
-- and the row's hierarchy parent is a state/province already mapped in
-- gadm_stateprovince_lookup to the same GADM2 (name_0, name_1).
--
-- Like the other matching passes, this one is restricted to rows that are
-- still unmatched (name_2 IS NULL). On a freshly seeded table every row
-- qualifies, so the result is unchanged; on a re-run it no longer overwrites
-- matches written by the later passes.
UPDATE gadm_county_lookup AS gcl
SET
    name_0 = gadm.name_0,
    name_1 = gadm.name_1,
    name_2 = gadm.name_2
FROM (
    SELECT DISTINCT name_0, name_1, name_2
    FROM gadm2
) AS gadm
INNER JOIN gadm_stateprovince_lookup AS gsl
    ON gsl.name_0 = gadm.name_0
    AND gsl.name_1 = gadm.name_1
INNER JOIN hierarchy AS h
    ON h.parentid = gsl.stateprovinceid
INNER JOIN geonames AS g
    ON (
        lower(gadm.name_2) = lower(g.name)
        OR lower(gadm.name_2) = ANY (string_to_array(lower(g.alternatenames), ','))
    )
-- Conditions that mention the UPDATE target (gcl) have to stay in WHERE.
WHERE gcl.name_2 IS NULL             -- only rows not matched yet
    AND g.featurecode = 'ADM2'
    AND g.geonameid = gcl.countyid
    AND h.childid = gcl.countyid;
-- UPDATE 12352
-- Time: 27390.357 ms
160
161
-- Matching pass: alternatenames table.
-- A still-unmatched ADM2 row gets GADM2 names when one of its
-- alternatenames entries equals gadm2.name_2, ignoring case, and its
-- hierarchy parent is a state/province already mapped in
-- gadm_stateprovince_lookup to the same GADM2 (name_0, name_1).
UPDATE gadm_county_lookup AS gcl
SET
    name_0 = gadm.name_0,
    name_1 = gadm.name_1,
    name_2 = gadm.name_2
FROM (
    SELECT DISTINCT name_0, name_1, name_2
    FROM gadm2
) AS gadm
INNER JOIN gadm_stateprovince_lookup AS gsl
    ON gsl.name_0 = gadm.name_0
    AND gsl.name_1 = gadm.name_1
INNER JOIN hierarchy AS h
    ON h.parentid = gsl.stateprovinceid
INNER JOIN alternatenames AS a
    ON lower(a.alternatename) = lower(gadm.name_2)
INNER JOIN geonames AS g
    ON g.geonameid = a.geonameid
WHERE gcl.name_2 IS NULL             -- only rows not matched yet
    AND g.featurecode = 'ADM2'
    AND a.geonameid = gcl.countyid
    AND h.childid = gcl.countyid;
-- UPDATE 0
-- Time: 6340.441 ms
182
183
-- Matching pass: map geonames '/Foo/ County' to gadm2 '/Foo/'.
-- A still-unmatched ADM2 row gets GADM2 names when its geonames name
-- equals gadm2.name_2 with ' County' appended, ignoring case, and its
-- hierarchy parent is a state/province already mapped in
-- gadm_stateprovince_lookup to the same GADM2 (name_0, name_1).
-- todo: other mappings like this???
UPDATE gadm_county_lookup AS gcl
SET
    name_0 = gadm.name_0,
    name_1 = gadm.name_1,
    name_2 = gadm.name_2
FROM (
    SELECT DISTINCT name_0, name_1, name_2
    FROM gadm2
) AS gadm
INNER JOIN gadm_stateprovince_lookup AS gsl
    ON gsl.name_0 = gadm.name_0
    AND gsl.name_1 = gadm.name_1
INNER JOIN hierarchy AS h
    ON h.parentid = gsl.stateprovinceid
INNER JOIN geonames AS g
    ON lower(g.name) = lower(gadm.name_2 || ' County')
WHERE gcl.name_2 IS NULL             -- only rows not matched yet
    AND g.featurecode = 'ADM2'
    AND g.geonameid = gcl.countyid
    AND h.childid = gcl.countyid;
-- UPDATE 3000
-- Time: 22248.393 ms
203
204
205
/*
206
-- another way to create gadm_stateprovince_lookup, though this won't
207
-- pick up multiple matches from geonames (good or bad??)
208
CREATE TABLE gadm_stateprovince_lookup AS
209
SELECT DISTINCT name_0, name_1
210
  FROM gadm2;
211
ALTER TABLE gadm_stateprovince_lookup ADD COLUMN stateprovinceid integer;
212
UPDATE gadm_stateprovince_lookup gs
213
  SET stateprovinceid = g.geonameid
214
  FROM gadm_country_lookup gc,
215
       alternatenames a,
216
       geonames g,
217
       countries c
218
  WHERE gs.name_0=gc.name_0
219
    AND gs.name_1=a.alternatename
220
    AND a.geonameid=g.geonameid
221
    AND g.countrycode=c.iso
222
    AND gc.countryid=c.geonameid
223
    AND g.featurecode='ADM1';
224
-- UPDATE 2137
225
-- Time: 600.345 ms
226
*/
227
228
229
230
/*
231
-- these geonameids match multiple gadm2 state/provinces, and that's bad
232
-- because we don't know which one to use for geovalidation
233
select stateprovinceid, name_0, array_agg(name_1) as name_1
234
  from gadm_stateprovince_lookup
235
  group by stateprovinceid, name_0
236
  having count(*)>1;
237
238
 stateprovinceid |  name_0  |                name_1
239
-----------------+----------+--------------------------------------
240
         3653890 | Ecuador  | {Orellana,Orellana}
241
          453751 | Bulgaria | {Razgrad,Ruse}
242
         1831095 | Cambodia | {"Phnom Penh",Kândal}
243
         1506272 | Russia   | {Altay,Gorno-Altay}
244
          128222 | Iran     | {Kermanshah,Kordestan}
245
         3457415 | Brazil   | {"Mato Grosso do Sul","Mato Grosso"}
246
          170652 | Syria    | {Damascus,"Rif Dimashq"}
247
(7 rows)
248
249
-- for now, manually clean up after these ambiguous cases
250
-- ... this stateprovinceid should only apply to Mato Grosso do Sul
251
DELETE FROM gadm_stateprovince_lookup
252
  WHERE name_0 = 'Brazil'
253
    AND name_1 = 'Mato Grosso'
254
    AND stateprovinceid = '3457415';
255
*/
256
257
/*
258
-- these gadm2 state/provinces match multiple geonameids, but i'm pretty sure
259
-- we don't really care in this direction
260
select iso, name_1, count(*)
261
  from (select distinct iso, name_1 from gadm2) gadm2
262
  left join (
263
      select distinct gadm2.iso,
264
             name_1,
265
             alternatenames.geonameid,
266
             geonames.name
267
        from (select distinct iso, name_1 from gadm2) gadm2
268
        join alternatenames on name_1=alternatename
269
        join geonames using (geonameid)
270
        join countries on countrycode=countries.iso
271
        where featurecode='ADM1'
272
          and gadm2.iso=countries.iso3) foo
273
    using (iso, name_1) group by iso, name_1 having count(*)>1;
274
275
 iso |   name_1    | count
276
-----+-------------+-------
277
 AZE | Yevlax      |     2
278
 BGR | Ruse        |     2
279
 BMU | Hamilton    |     2
280
 BRA | Mato Grosso |     2
281
 KAZ | Almaty      |     2
282
 KHM | Phnom Penh  |     2
283
 RUS | Altay       |     2
284
 RUS | Moskva      |     2
285
(8 rows)
286
*/