1
|
/*
|
2
|
SQL statements carrying out country name, stateprovince name, and
|
3
|
county name scrubbing on vegbien location data. Currently hard-coded
|
4
|
to operate on a 'vegbien_geoscrub' table that contains the input for
|
5
|
geoscrubbing, with the following columns:
|
6
|
decimallatitude (latitude, WGS84 decimal degrees)
|
7
|
decimallongitude (longitude, WGS84 decimal degrees)
|
8
|
country (original asserted country name)
|
9
|
stateprovince (original asserted stateprovince name)
|
10
|
county (original asserted county name)
|
11
|
(Note that the lat/lon columns are not needed for scrubbing, but are
|
12
|
used later for geovalidation.)
|
13
|
|
14
|
Misc issues to contemplate and possibly address:
|
15
|
|
16
|
* Congo can be either "Democratic Republic of the Congo" (COD) or
|
17
|
"Republic of the Congo" (COG), eh?
|
18
|
* Vietnam: gadm2 seems to treat provinces as level2 units
|
19
|
* Yugoslavia: no longer exists, not in geonames nor gadm2
|
20
|
|
21
|
* Azerbaijan: Yevlakh City and Yevlakh Rayon both ADM1 in geonames,
|
22
|
but just Yevlax in gadm2
|
23
|
|
24
|
* England is not a geoname country, only the UK is
|
25
|
|
26
|
* the general problem of historical names in cases where countries
|
27
|
have either been split, combined, removed (?), or otherwise
|
28
|
redefined
|
29
|
|
30
|
* Cases with multiple alternativename matches???
|
31
|
|
32
|
Notes:
|
33
|
- the vegbien_geoscrub input used during development and testing this
|
34
|
scrubbing procedure came from a select distinct on (lat, lon,
|
35
|
country, state, province) from analytical_aggregate on vegbiendev
|
36
|
|
37
|
Jim Regetz
|
38
|
NCEAS
|
39
|
Created Nov 2012
|
40
|
*/
|
41
|
|
42
|
BEGIN;
|
43
|
|
44
|
-------------------
|
45
|
-- scrub country --
|
46
|
-------------------
|
47
|
|
48
|
INSERT INTO vcountry
|
49
|
SELECT DISTINCT country
|
50
|
FROM vegbien_geoscrub
|
51
|
WHERE country IS NOT NULL;
|
52
|
-- SELECT 439
|
53
|
-- Time: 862.413 ms
|
54
|
|
55
|
-- try to clean up character representations
|
56
|
UPDATE vcountry
|
57
|
SET countryutf8 = u2L2u(country);
|
58
|
-- UPDATE 439
|
59
|
-- Time: 24.706 ms
|
60
|
|
61
|
-- try matching against direct country names
|
62
|
UPDATE vcountry vc
|
63
|
SET countryid = gc.geonameid
|
64
|
FROM countries gc
|
65
|
WHERE lower(vc.countryutf8) = lower(gc.country);
|
66
|
-- UPDATE 219
|
67
|
-- Time: 12.384 ms
|
68
|
|
69
|
-- try matching against ISO 3166-1 alpha-3
|
70
|
UPDATE vcountry vc
|
71
|
SET countryid = gc.geonameid
|
72
|
FROM countries gc
|
73
|
WHERE vc.countryutf8=gc.iso3
|
74
|
AND vc.countryid IS NULL;
|
75
|
-- UPDATE 1
|
76
|
-- Time: 20.146 ms
|
77
|
|
78
|
-- try matching against ISO 3166-1 alpha-2
|
79
|
UPDATE vcountry vc
|
80
|
SET countryid = gc.geonameid
|
81
|
FROM countries gc
|
82
|
WHERE vc.countryutf8 = iso
|
83
|
AND vc.countryid IS NULL;
|
84
|
-- UPDATE 64
|
85
|
-- Time: 3.378 ms
|
86
|
|
87
|
/*
|
88
|
-- try matching against FIPS, but not if already matched (i.e.,
|
89
|
-- the 2-digit ISO match takes preference)
|
90
|
UPDATE vcountry vc
|
91
|
SET countryid = gc.geonameid
|
92
|
FROM countries gc
|
93
|
WHERE vc.countryutf8 = fips
|
94
|
AND vc.countryid IS NULL;
|
95
|
-- UPDATE 1
|
96
|
-- Time: 2.514 ms
|
97
|
*/
|
98
|
|
99
|
-- try matching against BIEN alt country names
|
100
|
UPDATE vcountry vc
|
101
|
SET countryid = c.geonameid
|
102
|
FROM alt_country a, countries c
|
103
|
WHERE vc.countryutf8 = a.alternatename
|
104
|
AND a.country = c.country
|
105
|
AND vc.countryid IS NULL;
|
106
|
-- UPDATE 21
|
107
|
-- Time: 4.769 ms
|
108
|
|
109
|
-- try case-insensitive matching on geonames.org 'alternatenames'
|
110
|
-- note: this is a little sloppy because the set condition will return
|
111
|
-- more than one row if there are more than one matched alternate
|
112
|
-- country names, and which one gets used is non-deterministic; however,
|
113
|
-- we'll go back and remove those in the next update (specifically for
|
114
|
-- the cases where there are more than one *unique* matched countries)
|
115
|
UPDATE vcountry vc
|
116
|
SET countryid = gc.geonameid
|
117
|
FROM alternatenames a, countries gc
|
118
|
WHERE lower(vc.countryutf8) = lower(a.alternatename)
|
119
|
AND a.geonameid = gc.geonameid
|
120
|
AND vc.countryid IS NULL;
|
121
|
-- UPDATE 76
|
122
|
-- Time: 22019.592 ms
|
123
|
|
124
|
-- go back and remove if altname lookup was ambiguous
|
125
|
UPDATE vcountry vc
|
126
|
SET countryid = NULL
|
127
|
WHERE countryutf8 IN (
|
128
|
SELECT countryutf8
|
129
|
FROM (
|
130
|
SELECT DISTINCT vc.countryutf8, gc.iso3
|
131
|
FROM vcountry vc,
|
132
|
alternatenames a,
|
133
|
countries gc
|
134
|
WHERE lower(vc.countryutf8) = lower(a.alternatename)
|
135
|
AND a.geonameid = gc.geonameid) lookup
|
136
|
GROUP BY countryutf8
|
137
|
HAVING COUNT(*) > 1);
|
138
|
-- UPDATE 2
|
139
|
-- Time: 2296.257 ms
|
140
|
-- Removes: Congo, CONGO
|
141
|
|
142
|
-- merge in GADM country names (our standard names)
|
143
|
UPDATE vcountry vc
|
144
|
SET countrystd = name_0
|
145
|
FROM gadm_country_lookup gadm0
|
146
|
WHERE vc.countryid=gadm0.countryid;
|
147
|
-- UPDATE 377
|
148
|
-- Time: 6.254 ms
|
149
|
|
150
|
|
151
|
-------------------------
|
152
|
-- scrub stateprovince --
|
153
|
-------------------------
|
154
|
|
155
|
INSERT INTO vstate
|
156
|
SELECT DISTINCT countryid, stateprovince
|
157
|
FROM vegbien_geoscrub join vcountry using (country)
|
158
|
WHERE countryid IS NOT NULL
|
159
|
AND stateprovince IS NOT NULL;
|
160
|
-- SELECT 3312
|
161
|
-- Time: 4164.948 ms
|
162
|
|
163
|
-- try to clean up character representations
|
164
|
UPDATE vstate
|
165
|
SET stateprovinceutf8 = u2L2u(stateprovince);
|
166
|
-- UPDATE 3312
|
167
|
-- Time: 92.702 ms
|
168
|
|
169
|
-- replace common mojibake with valid utf8 replacement character
|
170
|
-- note: second pass of u2L2u would do the same thing, but this is a
|
171
|
-- little more explicit about the intended effect
|
172
|
UPDATE vstate
|
173
|
SET stateprovinceutf8 = replace(stateprovinceutf8, '�', chr(65533));
|
174
|
|
175
|
-- manually replace html character codes found in the names
|
176
|
-- todo: use some library to do this exhaustively rather than using
|
177
|
-- manually specified updates
|
178
|
UPDATE vstate
|
179
|
SET stateprovinceutf8 = replace(stateprovinceutf8, ''', chr(39));
|
180
|
UPDATE vstate
|
181
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'á', chr(225));
|
182
|
UPDATE vstate
|
183
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ã', chr(227));
|
184
|
UPDATE vstate
|
185
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'é', chr(233));
|
186
|
UPDATE vstate
|
187
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'é', chr(233));
|
188
|
UPDATE vstate
|
189
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'í', chr(237));
|
190
|
UPDATE vstate
|
191
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ñ', chr(241));
|
192
|
UPDATE vstate
|
193
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ó', chr(243));
|
194
|
UPDATE vstate
|
195
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ô', chr(244));
|
196
|
UPDATE vstate
|
197
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ü', chr(252));
|
198
|
|
199
|
-- try matching against direct stateprovince names
|
200
|
UPDATE vstate vs
|
201
|
SET stateprovinceid = gn.geonameid
|
202
|
FROM geonames gn,
|
203
|
countries c
|
204
|
WHERE lower(vs.stateprovinceutf8) = lower(gn.name)
|
205
|
AND gn.countrycode=c.iso
|
206
|
AND c.geonameid=vs.countryid
|
207
|
AND gn.featurecode='ADM1';
|
208
|
-- UPDATE 238
|
209
|
-- Time: 2919.512 ms
|
210
|
|
211
|
-- try case-insensitive matching on geonames.org 'alternatenames'
|
212
|
-- note: this is a little sloppy because the set condition will return
|
213
|
-- more than one row if there are more than one matched alternate
|
214
|
-- stateprovince names, and which one gets used is non-deterministic
|
215
|
UPDATE vstate vs
|
216
|
SET stateprovinceid = gn.geonameid
|
217
|
FROM alternatenames a,
|
218
|
geonames gn,
|
219
|
countries c
|
220
|
WHERE lower(vs.stateprovinceutf8) = lower(a.alternatename)
|
221
|
AND a.geonameid = gn.geonameid
|
222
|
AND gn.countrycode=c.iso
|
223
|
AND c.geonameid=vs.countryid
|
224
|
AND gn.featurecode='ADM1'
|
225
|
AND vs.stateprovinceid IS NULL;
|
226
|
-- UPDATE 1281
|
227
|
-- Time: 5229.274 ms
|
228
|
|
229
|
-- try doing a 'like' query that ignores replacement characters (�)
|
230
|
UPDATE vstate vs
|
231
|
SET stateprovinceid = gn.geonameid
|
232
|
FROM alternatenames a,
|
233
|
geonames gn,
|
234
|
countries c
|
235
|
WHERE a.alternatename ILIKE
|
236
|
regexp_replace(stateprovinceutf8, chr(65533), '_')
|
237
|
AND stateprovinceutf8 LIKE '%'||chr(65533)||'%'
|
238
|
AND a.geonameid = gn.geonameid
|
239
|
AND gn.countrycode=c.iso
|
240
|
AND c.geonameid=vs.countryid
|
241
|
AND gn.featurecode='ADM1'
|
242
|
AND vs.stateprovinceid IS NULL;
|
243
|
-- UPDATE 112
|
244
|
-- Time: 6232.106 ms
|
245
|
|
246
|
-- try matching against alternatenames column in geonames table
|
247
|
UPDATE vstate vs
|
248
|
SET stateprovinceid = gn.geonameid
|
249
|
FROM geonames gn,
|
250
|
countries c
|
251
|
WHERE lower(vs.stateprovinceutf8) =
|
252
|
ANY (string_to_array(lower(gn.alternatenames), ','))
|
253
|
AND gn.countrycode=c.iso
|
254
|
AND c.geonameid=vs.countryid
|
255
|
AND gn.featurecode='ADM1'
|
256
|
AND vs.stateprovinceid IS NULL;
|
257
|
-- UPDATE 220
|
258
|
-- Time: 5053.520 ms
|
259
|
|
260
|
-- try matching against BIEN alt stateprovince names
|
261
|
UPDATE vstate vs
|
262
|
SET stateprovinceid = g.geonameid
|
263
|
FROM alt_stateprovince asp,
|
264
|
countries c,
|
265
|
geonames g
|
266
|
WHERE lower(vs.stateprovinceutf8) = lower(asp.alternatename)
|
267
|
AND asp.country = c.country
|
268
|
AND c.iso = g.countrycode
|
269
|
AND asp.stateprovince = g.name
|
270
|
AND g.featurecode='ADM1'
|
271
|
AND vs.stateprovinceid IS NULL;
|
272
|
-- UPDATE 45
|
273
|
-- Time: 3452.274 ms
|
274
|
|
275
|
-- todo: the same stuff done for country to deal with Congo-type case
|
276
|
|
277
|
-- merge in GADM stateprovince names (our standard names)
|
278
|
UPDATE vstate vs
|
279
|
SET stateprovincestd = name_1
|
280
|
FROM gadm_stateprovince_lookup gadm1
|
281
|
WHERE vs.stateprovinceid=gadm1.stateprovinceid;
|
282
|
-- UPDATE 1896
|
283
|
-- Time: 23.946 ms
|
284
|
|
285
|
-- match any remaining unknowns directly to gadm
|
286
|
UPDATE vstate vs
|
287
|
SET stateprovincestd = name_1
|
288
|
FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm,
|
289
|
vcountry vc
|
290
|
WHERE vc.countryid = vs.countryid
|
291
|
AND countrystd = name_0
|
292
|
AND lower(stateprovinceutf8) = lower(name_1)
|
293
|
AND stateprovincestd IS NULL;
|
294
|
-- UPDATE 55
|
295
|
-- Time: 4799.837 ms
|
296
|
|
297
|
-- try to recover geonameids for any of these so we can drill down to
|
298
|
-- the next level
|
299
|
UPDATE vstate vs
|
300
|
SET stateprovinceid = gadm1.stateprovinceid
|
301
|
FROM gadm_country_lookup gadm0,
|
302
|
gadm_stateprovince_lookup gadm1
|
303
|
WHERE vs.countryid = gadm0.countryid
|
304
|
AND gadm0.name_0 = gadm1.name_0
|
305
|
AND vs.stateprovincestd = gadm1.name_1
|
306
|
AND vs.stateprovinceid IS NULL;
|
307
|
-- UPDATE 0
|
308
|
-- Time: 13.882 ms
|
309
|
|
310
|
|
311
|
/* TODO: is this still needed???
|
312
|
-- now reverse map
|
313
|
INSERT INTO gadm_stateprovince_lookup
|
314
|
select name_0, alternatename as name_1, g.geonameid
|
315
|
from vstate vs,
|
316
|
alt_stateprovince asp,
|
317
|
geonames g,
|
318
|
countries c,
|
319
|
gadm_country_lookup gc
|
320
|
where vs.stateprovinceid=g.geonameid
|
321
|
AND asp.stateprovince=g.name
|
322
|
AND vs.countryid=c.geonameid
|
323
|
AND vs.countryid=gc.countryid
|
324
|
AND asp.country=c.country
|
325
|
AND vs.stateprovinceid not in (select stateprovinceid from gadm_stateprovince_lookup);
|
326
|
-- INSERT 0 4
|
327
|
-- Time: 20.228 ms
|
328
|
*/
|
329
|
|
330
|
|
331
|
------------------
|
332
|
-- scrub county --
|
333
|
------------------
|
334
|
|
335
|
INSERT INTO vcounty
|
336
|
SELECT DISTINCT countryid, stateprovinceid, county
|
337
|
FROM vegbien_geoscrub
|
338
|
JOIN vcountry USING (country)
|
339
|
JOIN vstate USING (countryid, stateprovince)
|
340
|
WHERE countryid IS NOT NULL
|
341
|
AND stateprovinceid IS NOT NULL
|
342
|
AND county IS NOT NULL;
|
343
|
-- SELECT 18715
|
344
|
-- Time: 3590.725 ms
|
345
|
|
346
|
-- try to clean up character representations
|
347
|
UPDATE vcounty
|
348
|
SET countyutf8 = u2L2u(county);
|
349
|
-- UPDATE 18715
|
350
|
-- Time: 683.781 ms
|
351
|
|
352
|
-- replace common mojibake with valid utf8 replacement character
|
353
|
-- note: second pass of u2L2u would do the same thing, but this is a
|
354
|
-- little more explicit about the intended effect
|
355
|
UPDATE vcounty
|
356
|
SET countyutf8 = replace(countyutf8, '�', chr(65533));
|
357
|
-- UPDATE 18715
|
358
|
-- Time: 153.395 ms
|
359
|
|
360
|
-- clean up some common municipality label abbreviations
|
361
|
-- todo: other fixes like this; check what brad did for bien2 on this front?
|
362
|
UPDATE vcounty
|
363
|
SET countyutf8 = regexp_replace(countyutf8, ' Co[.]?$', ' County', 'i');
|
364
|
|
365
|
-- try matching against geonames (names and alternatenames)
|
366
|
UPDATE vcounty vc
|
367
|
SET countyid = gn.geonameid
|
368
|
FROM geonames gn,
|
369
|
hierarchy h
|
370
|
WHERE vc.stateprovinceid = h.parentid
|
371
|
AND h.childid=gn.geonameid
|
372
|
AND (lower(vc.countyutf8) = lower(gn.name)
|
373
|
OR lower(vc.countyutf8) =
|
374
|
ANY (string_to_array(lower(gn.alternatenames), ',')))
|
375
|
AND gn.featurecode='ADM2';
|
376
|
-- UPDATE 6673
|
377
|
-- Time: 8369.737 ms
|
378
|
|
379
|
-- use alternatenames
|
380
|
UPDATE vcounty vc
|
381
|
SET countyid = gn.geonameid
|
382
|
FROM geonames gn,
|
383
|
alternatenames a,
|
384
|
hierarchy h
|
385
|
WHERE vc.stateprovinceid = h.parentid
|
386
|
AND h.childid = gn.geonameid
|
387
|
AND a.geonameid = gn.geonameid
|
388
|
AND lower(vc.countyutf8) = lower(a.alternatename)
|
389
|
AND gn.featurecode='ADM2'
|
390
|
AND vc.countyid IS NULL;
|
391
|
-- UPDATE 0
|
392
|
-- Time: 11255.926 ms
|
393
|
|
394
|
-- merge in GADM county names (our standard names)
|
395
|
-- for cases where we have geonameid
|
396
|
UPDATE vcounty vs
|
397
|
SET countystd = name_2
|
398
|
FROM gadm_county_lookup gadm1
|
399
|
WHERE vs.countyid=gadm1.countyid
|
400
|
AND countystd IS NULL;
|
401
|
-- UPDATE 6673
|
402
|
-- Time: 118.378 ms
|
403
|
|
404
|
-- match directly to gadm
|
405
|
UPDATE vcounty vco
|
406
|
SET countystd = name_2
|
407
|
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
|
408
|
vcountry vc,
|
409
|
vstate vs
|
410
|
WHERE vco.countryid = vc.countryid
|
411
|
AND vco.stateprovinceid = vs.stateprovinceid
|
412
|
AND countrystd = name_0
|
413
|
AND stateprovincestd = name_1
|
414
|
AND lower(countyutf8) = lower(name_2)
|
415
|
AND countystd IS NULL;
|
416
|
-- UPDATE 5027
|
417
|
-- Time: 1410.377 ms
|
418
|
|
419
|
COMMIT;
|
420
|
|