Project

General

Profile

1
/*
2
  SQL statements to generate tables mapping geonames features at each
3
  relevant administative level (country, state/province, county/parish)
4
  to GADM2 features.
5

    
6
  Todo:
7
  * Be tolerant to presence/absence of administrative qualifiers in
8
    names? (Already doing this mildly for County/Co/Co.)
9
    - Municipality
10
    - Barrio
11
    - District
12

    
13
  * Get rid of rudundancy
14
  * Try matching against GADM2 varname_* columns?
15

    
16
  Jim Regetz
17
  NCEAS
18
  Created Nov 2012
19
*/
20

    
21
-----------------------
22
-- Level 0 (Country) --
23
-----------------------
24

    
25
-- map gadm2 level0 to geonames countries
26
CREATE TABLE gadm_country_lookup (
27
        countryid integer primary key references geonames (geonameid),
28
        name_0 text
29
);
30
INSERT INTO gadm_country_lookup (countryid)
31
  SELECT DISTINCT geonameid
32
    FROM countries
33
    WHERE geonameid IS NOT NULL;
34
-- INSERT 0 250
35
-- Time: 20.770 ms
36
UPDATE gadm_country_lookup lu
37
  SET name_0 = gadm.name_0
38
  FROM (SELECT DISTINCT iso, name_0 FROM gadm2) gadm,
39
       countries c
40
  WHERE lu.countryid = c.geonameid
41
    AND c.iso3 = gadm.iso;
42
-- UPDATE 248
43
-- Time: 3562.601 ms
44

    
45
CREATE INDEX "gadm_country_lookup_name_0_idx" ON gadm_country_lookup (name_0);
46

    
47
------------------------------
48
-- Level 1 (State/Province) --
49
------------------------------
50

    
51
-- map gadm2 level1 to geonames ADM1
52
CREATE TABLE gadm_stateprovince_lookup (
53
        stateprovinceid integer primary key references geonames (geonameid),
54
        name_0 text,
55
        name_1 text
56
);
57
INSERT INTO gadm_stateprovince_lookup (stateprovinceid)
58
  SELECT DISTINCT geonameid
59
    FROM geonames
60
    WHERE featurecode='ADM1';
61
-- INSERT 0 3841
62
-- Time: 7085.635 ms
63

    
64

    
65
-- try matching against alternatenames table
66
UPDATE gadm_stateprovince_lookup gs
67
  SET name_0 = gadm.name_0,
68
      name_1 = gadm.name_1
69
  FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm,
70
       alternatenames a,
71
       geonames g,
72
       hierarchy h,
73
       gadm_country_lookup gc
74
  WHERE lower(gadm.name_1)=lower(alternatename)
75
    AND gs.stateprovinceid = a.geonameid
76
    AND a.geonameid = g.geonameid
77
    AND gs.stateprovinceid = h.childid
78
    AND h.parentid = gc.countryid
79
    AND gc.name_0 = gadm.name_0
80
    AND g.featurecode='ADM1'
81
    AND gs.name_1 IS NULL;
82
-- UPDATE 2145
83
-- Time: 3453.568 ms
84

    
85
-- try matching against geonames (names and alternatenames)
86
UPDATE gadm_stateprovince_lookup gs
87
  SET name_0 = gadm.name_0,
88
      name_1 = gadm.name_1
89
  FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm,
90
       geonames g,
91
       hierarchy h,
92
       gadm_country_lookup gc
93
  WHERE (lower(gadm.name_1)=lower(g.name)
94
         OR lower(gadm.name_1) =
95
            ANY (string_to_array(lower(g.alternatenames), ',')))
96
    AND gs.stateprovinceid = h.childid
97
    AND h.parentid = gc.countryid
98
    AND gc.name_0 = gadm.name_0
99
    AND gs.stateprovinceid = g.geonameid
100
    AND g.featurecode='ADM1'
101
    AND gs.name_1 IS NULL;
102
-- UPDATE 319
103
-- Time: 857.885 ms
104

    
105

    
106
-- now again but against our manual mapping
107
UPDATE gadm_stateprovince_lookup gs
108
  SET name_0 = gadm.name_0,
109
      name_1 = gadm.name_1
110
  FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm,
111
       alt_stateprovince asp,
112
       geonames g,
113
       countries c
114
  WHERE gadm.name_1=asp.alternatename
115
    AND gadm.iso=c.iso3
116
    AND asp.country=c.country
117
    AND gs.stateprovinceid = g.geonameid
118
    AND g.countrycode=c.iso
119
    AND g.name=asp.stateprovince
120
    AND gs.name_0 IS NULL;
121
-- UPDATE 29
122
-- Time: 777.376 ms
123

    
124
----------------------
125
-- Level 2 (County) --
126
----------------------
127

    
128
-- map gadm2 level2 to geonames ADM2
129
CREATE TABLE gadm_county_lookup (
130
        countyid integer primary key references geonames (geonameid),
131
        name_0 text,
132
        name_1 text,
133
        name_2 text
134
);
135
INSERT INTO gadm_county_lookup (countyid)
136
  SELECT DISTINCT geonameid
137
    FROM geonames
138
    WHERE featurecode='ADM2';
139
-- INSERT 0 32374
140
-- Time: 13177.539 ms
141

    
142
-- try matching against geonames (names and alternatenames)
143
UPDATE gadm_county_lookup gcl
144
  SET name_0 = gadm.name_0,
145
      name_1 = gadm.name_1,
146
      name_2 = gadm.name_2
147
  FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
148
       geonames g,
149
       hierarchy h,
150
       gadm_stateprovince_lookup gsl
151
  WHERE (lower(gadm.name_2)=lower(g.name)
152
         OR lower(gadm.name_2) =
153
            ANY (string_to_array(lower(g.alternatenames), ',')))
154
    AND gcl.countyid = g.geonameid
155
    AND gcl.countyid = h.childid
156
    AND h.parentid = gsl.stateprovinceid
157
    AND gsl.name_0 = gadm.name_0
158
    AND gsl.name_1 = gadm.name_1
159
    AND g.featurecode='ADM2';
160
-- UPDATE 12352
161
-- Time: 27390.357 ms
162

    
163
-- try matching against alternatenames table
164
UPDATE gadm_county_lookup gcl
165
  SET name_0 = gadm.name_0,
166
      name_1 = gadm.name_1,
167
      name_2 = gadm.name_2
168
  FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
169
       alternatenames a,
170
       hierarchy h,
171
       geonames g,
172
       gadm_stateprovince_lookup gsl
173
  WHERE lower(gadm.name_2)=lower(a.alternatename)
174
    AND gcl.countyid = a.geonameid
175
    AND a.geonameid = g.geonameid
176
    AND gcl.countyid = h.childid
177
    AND h.parentid = gsl.stateprovinceid
178
    AND gsl.name_0 = gadm.name_0
179
    AND gsl.name_1 = gadm.name_1
180
    AND g.featurecode='ADM2'
181
    AND gcl.name_2 IS NULL;
182
-- UPDATE 0
183
-- Time: 6340.441 ms
184

    
185
-- map geonames '/Foo/ County' to gadm2 '/Foo/'
186
-- todo: other mappings like this???
187
UPDATE gadm_county_lookup gcl
188
  SET name_0 = gadm.name_0,
189
      name_1 = gadm.name_1,
190
      name_2 = gadm.name_2
191
  FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
192
       geonames g,
193
       hierarchy h,
194
       gadm_stateprovince_lookup gsl
195
  WHERE lower(gadm.name_2||' County')=lower(g.name)
196
    AND gcl.countyid = g.geonameid
197
    AND gcl.countyid = h.childid
198
    AND h.parentid = gsl.stateprovinceid
199
    AND gsl.name_0 = gadm.name_0
200
    AND gsl.name_1 = gadm.name_1
201
    AND g.featurecode='ADM2'
202
    AND gcl.name_2 IS NULL;
203
-- UPDATE 3000
204
-- Time: 22248.393 ms
205

    
206

    
207
/*
208
-- another way to create gadm_stateprovince_lookup, though this won't
209
-- pick up multiple matches from geonames (good or bad??)
210
CREATE TABLE gadm_stateprovince_lookup AS
211
SELECT DISTINCT name_0, name_1
212
  FROM gadm2;
213
ALTER TABLE gadm_stateprovince_lookup ADD COLUMN stateprovinceid integer;
214
UPDATE gadm_stateprovince_lookup gs
215
  SET stateprovinceid = g.geonameid
216
  FROM gadm_country_lookup gc,
217
       alternatenames a,
218
       geonames g,
219
       countries c
220
  WHERE gs.name_0=gc.name_0
221
    AND gs.name_1=a.alternatename
222
    AND a.geonameid=g.geonameid
223
    AND g.countrycode=c.iso
224
    AND gc.countryid=c.geonameid
225
    AND g.featurecode='ADM1';
226
-- UPDATE 2137
227
-- Time: 600.345 ms
228
*/
229

    
230

    
231

    
232
/*
233
-- these geonameids match multiple gadm2 state/provinces, and that's bad
234
-- because we don't know which one to use for geovalidation
235
select stateprovinceid, name_0, array_agg(name_1) as name_1
236
  from gadm_stateprovince_lookup
237
  group by stateprovinceid, name_0
238
  having count(*)>1;
239

    
240
 stateprovinceid |  name_0  |                name_1
241
-----------------+----------+--------------------------------------
242
         3653890 | Ecuador  | {Orellana,Orellana}
243
          453751 | Bulgaria | {Razgrad,Ruse}
244
         1831095 | Cambodia | {"Phnom Penh",Kândal}
245
         1506272 | Russia   | {Altay,Gorno-Altay}
246
          128222 | Iran     | {Kermanshah,Kordestan}
247
         3457415 | Brazil   | {"Mato Grosso do Sul","Mato Grosso"}
248
          170652 | Syria    | {Damascus,"Rif Dimashq"}
249
(7 rows)
250

    
251
-- for now, manually clean up after these ambiguous cases
252
-- ... this countryid should only apply to Mato Grosso do Sul
253
DELETE FROM gadm_stateprovince_lookup
254
  WHERE name_0 = 'Brazil'
255
    AND name_1 = 'Mato Grosso'
256
    AND stateprovinceid = '3457415';
257
*/
258

    
259
/*
260
-- these gadm2 state/provinces match multiple geonameids, but i'm pretty sure
261
-- we don't really care in this direction
262
select iso, name_1, count(*)
263
  from (select distinct iso, name_1 from gadm2) gadm2
264
  left join (
265
      select distinct gadm2.iso,
266
             name_1,
267
             alternatenames.geonameid,
268
             geonames.name
269
        from (select distinct iso, name_1 from gadm2) gadm2
270
        join alternatenames on name_1=alternatename
271
        join geonames using (geonameid)
272
        join countries on countrycode=countries.iso
273
        where featurecode='ADM1'
274
          and gadm2.iso=countries.iso3) foo
275
    using (iso, name_1) group by iso, name_1 having count(*)>1;
276

    
277
 iso |   name_1    | count
278
-----+-------------+-------
279
 AZE | Yevlax      |     2
280
 BGR | Ruse        |     2
281
 BMU | Hamilton    |     2
282
 BRA | Mato Grosso |     2
283
 KAZ | Almaty      |     2
284
 KHM | Phnom Penh  |     2
285
 RUS | Altay       |     2
286
 RUS | Moskva      |     2
287
(8 rows)
288
*/
289

    
(4-4/9)