1
|
/*
|
2
|
SQL statements carrying out country name, stateprovince name, and
|
3
|
county name scrubbing on vegbien location data. Currently hard-coded
|
4
|
to operate on a 'vegbien_geoscrub' table that contains the input for
|
5
|
geoscrubbing, with the following columns:
|
6
|
decimallatitude (latitude, WGS84 decimal degrees)
|
7
|
decimallongitude (longitude, WGS84 decimal degrees)
|
8
|
country (original asserted country name)
|
9
|
stateprovince (original asserted stateprovince name)
|
10
|
county (original asserted county name)
|
11
|
(Note that the lat/lon columns are not needed for scrubbing, but are
|
12
|
used later for geovalidation.)
|
13
|
|
14
|
Misc issues to contemplate and possibly address:
|
15
|
|
16
|
* Congo can be either "Democratic Republic of the Congo" (COD) or
|
17
|
"Republic of the Congo" (COG), eh?
|
18
|
* Vietnam: gadm2 seems to treat provinces as level2 units
|
19
|
* Yugoslavia: no longer exists, not in geonames nor gadm2
|
20
|
|
21
|
* Azerbaijan: Yevlakh City and Yevlakh Rayon both ADM1 in geonames,
|
22
|
but just Yevlax in gadm2
|
23
|
|
24
|
* England is not a geoname country, only the UK is
|
25
|
|
26
|
* the general problem of historical names in cases where countries
|
27
|
have either been split, combined, removed (?), or otherwise
|
28
|
redefined
|
29
|
|
30
|
* Cases with multiple alternativename matches???
|
31
|
|
32
|
Notes:
|
33
|
- the vegbien_geoscrub input used during development and testing this
|
34
|
scrubbing procedure came from a select distinct on (lat, lon,
|
35
|
country, state, province) from analytical_aggregate on vegbiendev
|
36
|
|
37
|
Jim Regetz
|
38
|
NCEAS
|
39
|
Created Nov 2012
|
40
|
*/
|
41
|
|
42
|
-- define custom function to pass characters back through latin-1
|
43
|
-- encoding to revert errors; in principle this could introduce new
|
44
|
-- errors, but it seems to do more good than harm
|
45
|
CREATE OR REPLACE FUNCTION u2L2u(text) RETURNS text AS $$
|
46
|
DECLARE
|
47
|
string text;
|
48
|
BEGIN
|
49
|
string :=
|
50
|
convert_from(convert($1::bytea, 'UTF8', 'LATIN1'), 'UTF8');
|
51
|
RETURN string;
|
52
|
EXCEPTION
|
53
|
WHEN data_exception THEN
|
54
|
RETURN $1;
|
55
|
END;
|
56
|
$$ LANGUAGE plpgsql;
|
57
|
|
58
|
-------------------
|
59
|
-- scrub country --
|
60
|
-------------------
|
61
|
|
62
|
CREATE TABLE vcountry AS
|
63
|
SELECT DISTINCT country
|
64
|
FROM vegbien_geoscrub
|
65
|
WHERE country IS NOT NULL;
|
66
|
-- SELECT 439
|
67
|
-- Time: 862.413 ms
|
68
|
ALTER TABLE vcountry ADD COLUMN countryutf8 text;
|
69
|
ALTER TABLE vcountry ADD COLUMN countryid integer;
|
70
|
|
71
|
-- try to clean up character representations
|
72
|
UPDATE vcountry
|
73
|
SET countryutf8 = u2L2u(country);
|
74
|
-- UPDATE 439
|
75
|
-- Time: 24.706 ms
|
76
|
|
77
|
-- try matching against direct country names
|
78
|
UPDATE vcountry vc
|
79
|
SET countryid = gc.geonameid
|
80
|
FROM countries gc
|
81
|
WHERE lower(vc.countryutf8) = lower(gc.country);
|
82
|
-- UPDATE 219
|
83
|
-- Time: 12.384 ms
|
84
|
|
85
|
-- try matching against ISO 3166-1 alpha-3
|
86
|
UPDATE vcountry vc
|
87
|
SET countryid = gc.geonameid
|
88
|
FROM countries gc
|
89
|
WHERE vc.countryutf8=gc.iso3
|
90
|
AND vc.countryid IS NULL;
|
91
|
-- UPDATE 1
|
92
|
-- Time: 20.146 ms
|
93
|
|
94
|
-- try matching against ISO 3166-1 alpha-2
|
95
|
UPDATE vcountry vc
|
96
|
SET countryid = gc.geonameid
|
97
|
FROM countries gc
|
98
|
WHERE vc.countryutf8 = iso
|
99
|
AND vc.countryid IS NULL;
|
100
|
-- UPDATE 64
|
101
|
-- Time: 3.378 ms
|
102
|
|
103
|
/*
|
104
|
-- try matching against FIPS, but not if already matched (i.e.,
|
105
|
-- the 2-digit ISO match takes preference)
|
106
|
UPDATE vcountry vc
|
107
|
SET countryid = gc.geonameid
|
108
|
FROM countries gc
|
109
|
WHERE vc.countryutf8 = fips
|
110
|
AND vc.countryid IS NULL;
|
111
|
-- UPDATE 1
|
112
|
-- Time: 2.514 ms
|
113
|
*/
|
114
|
|
115
|
-- try matching against BIEN alt country names
|
116
|
UPDATE vcountry vc
|
117
|
SET countryid = c.geonameid
|
118
|
FROM alt_country a, countries c
|
119
|
WHERE vc.countryutf8 = a.alternatename
|
120
|
AND a.country = c.country
|
121
|
AND vc.countryid IS NULL;
|
122
|
-- UPDATE 21
|
123
|
-- Time: 4.769 ms
|
124
|
|
125
|
-- try case-insensitive matching on geonames.org 'alternatenames'
|
126
|
-- note: this is a little sloppy because the set condition will return
|
127
|
-- more than one row if there are more than one matched alternate
|
128
|
-- country names, and which one gets used is non-deterministic; however,
|
129
|
-- we'll go back and remove those in the next update (specifically for
|
130
|
-- the cases where there are more than one *unique* matched countries)
|
131
|
UPDATE vcountry vc
|
132
|
SET countryid = gc.geonameid
|
133
|
FROM alternatenames a, countries gc
|
134
|
WHERE lower(vc.countryutf8) = lower(a.alternatename)
|
135
|
AND a.geonameid = gc.geonameid
|
136
|
AND vc.countryid IS NULL;
|
137
|
-- UPDATE 76
|
138
|
-- Time: 22019.592 ms
|
139
|
|
140
|
-- go back and remove if altname lookup was ambiguous
|
141
|
UPDATE vcountry vc
|
142
|
SET countryid = NULL
|
143
|
WHERE countryutf8 IN (
|
144
|
SELECT countryutf8
|
145
|
FROM (
|
146
|
SELECT DISTINCT vc.countryutf8, gc.iso3
|
147
|
FROM vcountry vc,
|
148
|
alternatenames a,
|
149
|
countries gc
|
150
|
WHERE lower(vc.countryutf8) = lower(a.alternatename)
|
151
|
AND a.geonameid = gc.geonameid) lookup
|
152
|
GROUP BY countryutf8
|
153
|
HAVING COUNT(*) > 1);
|
154
|
-- UPDATE 2
|
155
|
-- Time: 2296.257 ms
|
156
|
-- Removes: Congo, CONGO
|
157
|
|
158
|
-- merge in GADM country names (our standard names)
|
159
|
ALTER TABLE vcountry ADD COLUMN countrystd text;
|
160
|
UPDATE vcountry vc
|
161
|
SET countrystd = name_0
|
162
|
FROM gadm_country_lookup gadm0
|
163
|
WHERE vc.countryid=gadm0.countryid;
|
164
|
-- UPDATE 377
|
165
|
-- Time: 6.254 ms
|
166
|
|
167
|
|
168
|
-------------------------
|
169
|
-- scrub stateprovince --
|
170
|
-------------------------
|
171
|
|
172
|
CREATE TABLE vstate AS
|
173
|
SELECT DISTINCT countryid, stateprovince
|
174
|
FROM vegbien_geoscrub join vcountry using (country)
|
175
|
WHERE countryid IS NOT NULL
|
176
|
AND stateprovince IS NOT NULL;
|
177
|
-- SELECT 3312
|
178
|
-- Time: 4164.948 ms
|
179
|
ALTER TABLE vstate ADD COLUMN stateprovinceutf8 text;
|
180
|
ALTER TABLE vstate ADD COLUMN stateprovinceid integer;
|
181
|
|
182
|
-- try to clean up character representations
|
183
|
UPDATE vstate
|
184
|
SET stateprovinceutf8 = u2L2u(stateprovince);
|
185
|
-- UPDATE 3312
|
186
|
-- Time: 92.702 ms
|
187
|
|
188
|
-- replace common mojibake with valid utf8 replacement character
|
189
|
-- note: second pass of u2L2u would do the same thing, but this is a
|
190
|
-- little more explicit about the intended effect
|
191
|
UPDATE vstate
|
192
|
SET stateprovinceutf8 = replace(stateprovinceutf8, '�', chr(65533));
|
193
|
|
194
|
-- manually replace html character codes found in the names
|
195
|
-- todo: use some library to do this exhaustively rather than using
|
196
|
-- manually specified updates
|
197
|
UPDATE vstate
|
198
|
SET stateprovinceutf8 = replace(stateprovinceutf8, ''', chr(39));
|
199
|
UPDATE vstate
|
200
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'á', chr(225));
|
201
|
UPDATE vstate
|
202
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ã', chr(227));
|
203
|
UPDATE vstate
|
204
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'é', chr(233));
|
205
|
UPDATE vstate
|
206
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'é', chr(233));
|
207
|
UPDATE vstate
|
208
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'í', chr(237));
|
209
|
UPDATE vstate
|
210
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ñ', chr(241));
|
211
|
UPDATE vstate
|
212
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ó', chr(243));
|
213
|
UPDATE vstate
|
214
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ô', chr(244));
|
215
|
UPDATE vstate
|
216
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ü', chr(252));
|
217
|
|
218
|
-- try matching against direct stateprovince names
|
219
|
UPDATE vstate vs
|
220
|
SET stateprovinceid = gn.geonameid
|
221
|
FROM geonames gn,
|
222
|
countries c
|
223
|
WHERE lower(vs.stateprovinceutf8) = lower(gn.name)
|
224
|
AND gn.countrycode=c.iso
|
225
|
AND c.geonameid=vs.countryid
|
226
|
AND gn.featurecode='ADM1';
|
227
|
-- UPDATE 238
|
228
|
-- Time: 2919.512 ms
|
229
|
|
230
|
-- try case-insensitive matching on geonames.org 'alternatenames'
|
231
|
-- note: this is a little sloppy because the set condition will return
|
232
|
-- more than one row if there are more than one matched alternate
|
233
|
-- stateprovince names, and which one gets used is non-deterministic
|
234
|
UPDATE vstate vs
|
235
|
SET stateprovinceid = gn.geonameid
|
236
|
FROM alternatenames a,
|
237
|
geonames gn,
|
238
|
countries c
|
239
|
WHERE lower(vs.stateprovinceutf8) = lower(a.alternatename)
|
240
|
AND a.geonameid = gn.geonameid
|
241
|
AND gn.countrycode=c.iso
|
242
|
AND c.geonameid=vs.countryid
|
243
|
AND gn.featurecode='ADM1'
|
244
|
AND vs.stateprovinceid IS NULL;
|
245
|
-- UPDATE 1281
|
246
|
-- Time: 5229.274 ms
|
247
|
|
248
|
-- try doing a 'like' query that ignores replacement characters (�)
|
249
|
UPDATE vstate vs
|
250
|
SET stateprovinceid = gn.geonameid
|
251
|
FROM alternatenames a,
|
252
|
geonames gn,
|
253
|
countries c
|
254
|
WHERE a.alternatename ILIKE
|
255
|
regexp_replace(stateprovinceutf8, chr(65533), '_')
|
256
|
AND stateprovinceutf8 LIKE '%'||chr(65533)||'%'
|
257
|
AND a.geonameid = gn.geonameid
|
258
|
AND gn.countrycode=c.iso
|
259
|
AND c.geonameid=vs.countryid
|
260
|
AND gn.featurecode='ADM1'
|
261
|
AND vs.stateprovinceid IS NULL;
|
262
|
-- UPDATE 112
|
263
|
-- Time: 6232.106 ms
|
264
|
|
265
|
-- try matching against alternatenames column in geonames table
|
266
|
UPDATE vstate vs
|
267
|
SET stateprovinceid = gn.geonameid
|
268
|
FROM geonames gn,
|
269
|
countries c
|
270
|
WHERE lower(vs.stateprovinceutf8) =
|
271
|
ANY (string_to_array(lower(gn.alternatenames), ','))
|
272
|
AND gn.countrycode=c.iso
|
273
|
AND c.geonameid=vs.countryid
|
274
|
AND gn.featurecode='ADM1'
|
275
|
AND vs.stateprovinceid IS NULL;
|
276
|
-- UPDATE 220
|
277
|
-- Time: 5053.520 ms
|
278
|
|
279
|
-- try matching against BIEN alt stateprovince names
|
280
|
UPDATE vstate vs
|
281
|
SET stateprovinceid = g.geonameid
|
282
|
FROM alt_stateprovince asp,
|
283
|
countries c,
|
284
|
geonames g
|
285
|
WHERE lower(vs.stateprovinceutf8) = lower(asp.alternatename)
|
286
|
AND asp.country = c.country
|
287
|
AND c.iso = g.countrycode
|
288
|
AND asp.stateprovince = g.name
|
289
|
AND g.featurecode='ADM1'
|
290
|
AND vs.stateprovinceid IS NULL;
|
291
|
-- UPDATE 45
|
292
|
-- Time: 3452.274 ms
|
293
|
|
294
|
-- todo: the same stuff done for country to deal with Congo-type case
|
295
|
|
296
|
-- merge in GADM stateprovince names (our standard names)
|
297
|
ALTER TABLE vstate ADD COLUMN stateprovincestd text;
|
298
|
UPDATE vstate vs
|
299
|
SET stateprovincestd = name_1
|
300
|
FROM gadm_stateprovince_lookup gadm1
|
301
|
WHERE vs.stateprovinceid=gadm1.stateprovinceid;
|
302
|
-- UPDATE 1896
|
303
|
-- Time: 23.946 ms
|
304
|
|
305
|
-- match any remaining unknowns directly to gadm
|
306
|
UPDATE vstate vs
|
307
|
SET stateprovincestd = name_1
|
308
|
FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm,
|
309
|
vcountry vc
|
310
|
WHERE vc.countryid = vs.countryid
|
311
|
AND countrystd = name_0
|
312
|
AND lower(stateprovinceutf8) = lower(name_1)
|
313
|
AND stateprovincestd IS NULL;
|
314
|
-- UPDATE 55
|
315
|
-- Time: 4799.837 ms
|
316
|
|
317
|
-- try to recover geonameids for any of these so we can drill down to
|
318
|
-- the next level
|
319
|
UPDATE vstate vs
|
320
|
SET stateprovinceid = gadm1.stateprovinceid
|
321
|
FROM gadm_country_lookup gadm0,
|
322
|
gadm_stateprovince_lookup gadm1
|
323
|
WHERE vs.countryid = gadm0.countryid
|
324
|
AND gadm0.name_0 = gadm1.name_0
|
325
|
AND vs.stateprovincestd = gadm1.name_1
|
326
|
AND vs.stateprovinceid IS NULL;
|
327
|
-- UPDATE 0
|
328
|
-- Time: 13.882 ms
|
329
|
|
330
|
|
331
|
/* TODO: is this still needed???
|
332
|
-- now reverse map
|
333
|
INSERT INTO gadm_stateprovince_lookup
|
334
|
select name_0, alternatename as name_1, g.geonameid
|
335
|
from vstate vs,
|
336
|
alt_stateprovince asp,
|
337
|
geonames g,
|
338
|
countries c,
|
339
|
gadm_country_lookup gc
|
340
|
where vs.stateprovinceid=g.geonameid
|
341
|
AND asp.stateprovince=g.name
|
342
|
AND vs.countryid=c.geonameid
|
343
|
AND vs.countryid=gc.countryid
|
344
|
AND asp.country=c.country
|
345
|
AND vs.stateprovinceid not in (select stateprovinceid from gadm_stateprovince_lookup);
|
346
|
-- INSERT 0 4
|
347
|
-- Time: 20.228 ms
|
348
|
*/
|
349
|
|
350
|
|
351
|
------------------
|
352
|
-- scrub county --
|
353
|
------------------
|
354
|
|
355
|
CREATE TABLE vcounty AS
|
356
|
SELECT DISTINCT countryid, stateprovinceid, county
|
357
|
FROM vegbien_geoscrub
|
358
|
JOIN vcountry USING (country)
|
359
|
JOIN vstate USING (countryid, stateprovince)
|
360
|
WHERE countryid IS NOT NULL
|
361
|
AND stateprovinceid IS NOT NULL
|
362
|
AND county IS NOT NULL;
|
363
|
-- SELECT 18715
|
364
|
-- Time: 3590.725 ms
|
365
|
ALTER TABLE vcounty ADD COLUMN countyutf8 text;
|
366
|
ALTER TABLE vcounty ADD COLUMN countyid integer;
|
367
|
|
368
|
-- try to clean up character representations
|
369
|
UPDATE vcounty
|
370
|
SET countyutf8 = u2L2u(county);
|
371
|
-- UPDATE 18715
|
372
|
-- Time: 683.781 ms
|
373
|
|
374
|
-- replace common mojibake with valid utf8 replacement character
|
375
|
-- note: second pass of u2L2u would do the same thing, but this is a
|
376
|
-- little more explicit about the intended effect
|
377
|
UPDATE vcounty
|
378
|
SET countyutf8 = replace(countyutf8, '�', chr(65533));
|
379
|
-- UPDATE 18715
|
380
|
-- Time: 153.395 ms
|
381
|
|
382
|
-- clean up some common municipality label abbreviations
|
383
|
-- todo: other fixes like this; check what brad did for bien2 on this front?
|
384
|
UPDATE vcounty
|
385
|
SET countyutf8 = regexp_replace(countyutf8, ' Co[.]?$', ' County', 'i');
|
386
|
|
387
|
-- try matching against geonames (names and alternatenames)
|
388
|
UPDATE vcounty vc
|
389
|
SET countyid = gn.geonameid
|
390
|
FROM geonames gn,
|
391
|
hierarchy h
|
392
|
WHERE vc.stateprovinceid = h.parentid
|
393
|
AND h.childid=gn.geonameid
|
394
|
AND (lower(vc.countyutf8) = lower(gn.name)
|
395
|
OR lower(vc.countyutf8) =
|
396
|
ANY (string_to_array(lower(gn.alternatenames), ',')))
|
397
|
AND gn.featurecode='ADM2';
|
398
|
-- UPDATE 6673
|
399
|
-- Time: 8369.737 ms
|
400
|
|
401
|
-- use alternatenames
|
402
|
UPDATE vcounty vc
|
403
|
SET countyid = gn.geonameid
|
404
|
FROM geonames gn,
|
405
|
alternatenames a,
|
406
|
hierarchy h
|
407
|
WHERE vc.stateprovinceid = h.parentid
|
408
|
AND h.childid = gn.geonameid
|
409
|
AND a.geonameid = gn.geonameid
|
410
|
AND lower(vc.countyutf8) = lower(a.alternatename)
|
411
|
AND gn.featurecode='ADM2'
|
412
|
AND vc.countyid IS NULL;
|
413
|
-- UPDATE 0
|
414
|
-- Time: 11255.926 ms
|
415
|
|
416
|
ALTER TABLE vcounty ADD COLUMN countystd text;
|
417
|
|
418
|
-- merge in GADM county names (our standard names)
|
419
|
-- for cases where we have geonameid
|
420
|
UPDATE vcounty vs
|
421
|
SET countystd = name_2
|
422
|
FROM gadm_county_lookup gadm1
|
423
|
WHERE vs.countyid=gadm1.countyid
|
424
|
AND countystd IS NULL;
|
425
|
-- UPDATE 6673
|
426
|
-- Time: 118.378 ms
|
427
|
|
428
|
-- match directly to gadm
|
429
|
UPDATE vcounty vco
|
430
|
SET countystd = name_2
|
431
|
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
|
432
|
vcountry vc,
|
433
|
vstate vs
|
434
|
WHERE vco.countryid = vc.countryid
|
435
|
AND vco.stateprovinceid = vs.stateprovinceid
|
436
|
AND countrystd = name_0
|
437
|
AND stateprovincestd = name_1
|
438
|
AND lower(countyutf8) = lower(name_2)
|
439
|
AND countystd IS NULL;
|
440
|
-- UPDATE 5027
|
441
|
-- Time: 1410.377 ms
|
442
|
|
443
|
-----------------------------
|
444
|
-- put everything together --
|
445
|
-----------------------------
|
446
|
|
447
|
-- reconstitute the whole dang thang
|
448
|
|
449
|
-- new approach
|
450
|
CREATE TABLE geoscrub AS
|
451
|
SELECT decimallatitude, decimallongitude,
|
452
|
country, stateprovince, county,
|
453
|
countryid, stateprovinceid, countyid,
|
454
|
countrystd, stateprovincestd, countystd
|
455
|
FROM vegbien_geoscrub v
|
456
|
LEFT JOIN vcountry USING (country)
|
457
|
LEFT JOIN vstate USING (countryid, stateprovince)
|
458
|
LEFT JOIN vcounty USING (countryid, stateprovinceid, county);
|
459
|
-- SELECT 1707970
|
460
|
-- Time: 26172.154 ms
|
461
|
|
462
|
-- try to squeeze out a few more direct stateprovince mappings to gadm2
|
463
|
UPDATE geoscrub
|
464
|
SET stateprovincestd = name_1
|
465
|
FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm
|
466
|
WHERE country IS NOT NULL
|
467
|
AND stateprovince IS NOT NULL
|
468
|
AND countrystd = name_0
|
469
|
AND stateprovincestd IS NULL
|
470
|
AND stateprovince = name_1;
|
471
|
-- UPDATE 0
|
472
|
-- Time: 3630.973 ms
|
473
|
|
474
|
-- try to squeeze out a few more direct county mappings to gadm2
|
475
|
UPDATE geoscrub
|
476
|
SET countystd = name_2
|
477
|
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm
|
478
|
WHERE country IS NOT NULL
|
479
|
AND stateprovince IS NOT NULL
|
480
|
AND county IS NOT NULL
|
481
|
AND countrystd = name_0
|
482
|
AND stateprovincestd = name_1
|
483
|
AND countystd IS NULL
|
484
|
AND county = name_2;
|
485
|
-- UPDATE 69
|
486
|
-- Time: 3982.715 ms
|