Project

General

Profile

1
/*
2
  SQL statements carrying out country name, stateprovince name, and
3
  county name scrubbing on vegbien location data. Currently hard-coded
4
  to operate on a 'vegbien_geoscrub' table that contains the input for
5
  geoscrubbing, with the following columns:
6
     decimallatitude  (latitude, WGS84 decimal degrees)
7
     decimallongitude (longitude, WGS84 decimal degrees)
8
     country          (original asserted country name)
9
     stateprovince    (original asserted stateprovince name)
10
     county           (original asserted county name)
11
  (Note that the lat/lon columns are not needed for scrubbing, but are
12
  used later for geovalidation.)
13

    
14
 Misc issues to contemplate and possibly address:
15

    
16
  * Congo can be either "Democratic Republic of the Congo" (COD) or
17
    "Republic of the Congo" (COG), eh?
18
  * Vietnam: gadm2 seems to treat provinces as level2 units
19
  * Yugoslavia: no longer exists, not in geonames nor gadm2
20

    
21
  * Azerbaijan: Yevlakh City and Yevlakh Rayon both ADM1 in geonames,
22
    but just Yevlax in gadm2
23

    
24
  * England is not a geoname country, only the UK is
25

    
26
  * the general problem of historical names in cases where countries
27
    have either been split, combined, removed (?), or otherwise
28
    redefined
29

    
30
  * Cases with multiple alternativename matches???
31

    
32
 Notes:
33
 - the vegbien_geoscrub input used during development and testing this
34
   scrubbing procedure came from a select distinct on (lat, lon,
35
   country, state, province) from analytical_aggregate on vegbiendev
36

    
37
  Jim Regetz
38
  NCEAS
39
  Created Nov 2012
40
*/
41

    
42
-- define custom function to pass characters back through latin-1
43
-- encoding to revert errors; in principle this could introduce new
44
-- errors, but it seems to do more good than harm
45
CREATE OR REPLACE FUNCTION u2L2u(text) RETURNS text AS $$
46
DECLARE
47
    string text;
48
BEGIN
49
  string :=
50
    convert_from(convert($1::bytea, 'UTF8', 'LATIN1'), 'UTF8');
51
  RETURN string;
52
EXCEPTION
53
  WHEN data_exception THEN
54
    RETURN $1;
55
END;
56
$$ LANGUAGE plpgsql;
57

    
58
-------------------
59
-- scrub country --
60
-------------------
61

    
62
CREATE TABLE vcountry AS
63
  SELECT DISTINCT country
64
    FROM vegbien_geoscrub
65
    WHERE country IS NOT NULL;
66
-- SELECT 439
67
-- Time: 862.413 ms
68
ALTER TABLE vcountry ADD COLUMN countryutf8 text;
69
ALTER TABLE vcountry ADD COLUMN countryid integer;
70

    
71
-- try to clean up character representations
72
UPDATE vcountry
73
    SET countryutf8 = u2L2u(country);
74
-- UPDATE 439
75
-- Time: 24.706 ms
76

    
77
-- try matching against direct country names
78
UPDATE vcountry vc
79
    SET countryid = gc.geonameid
80
    FROM countries gc
81
    WHERE lower(vc.countryutf8) = lower(gc.country);
82
-- UPDATE 219
83
-- Time: 12.384 ms
84

    
85
-- try matching against ISO 3166-1 alpha-3
86
UPDATE vcountry vc
87
    SET countryid = gc.geonameid
88
    FROM countries gc
89
    WHERE vc.countryutf8=gc.iso3
90
      AND vc.countryid IS NULL;
91
-- UPDATE 1
92
-- Time: 20.146 ms
93

    
94
-- try matching against ISO 3166-1 alpha-2
95
UPDATE vcountry vc
96
    SET countryid = gc.geonameid
97
    FROM countries gc
98
    WHERE vc.countryutf8 = iso
99
      AND vc.countryid IS NULL;
100
-- UPDATE 64
101
-- Time: 3.378 ms
102

    
103
/*
104
-- try matching against FIPS, but not if already matched (i.e.,
105
-- the 2-digit ISO match takes preference)
106
UPDATE vcountry vc
107
    SET countryid = gc.geonameid
108
    FROM countries gc
109
    WHERE vc.countryutf8 = fips
110
      AND vc.countryid IS NULL;
111
-- UPDATE 1
112
-- Time: 2.514 ms
113
*/
114

    
115
-- try matching against BIEN alt country names
116
UPDATE vcountry vc
117
    SET countryid = c.geonameid
118
    FROM alt_country a, countries c
119
    WHERE vc.countryutf8 = a.alternatename
120
      AND a.country = c.country
121
      AND vc.countryid IS NULL;
122
-- UPDATE 21
123
-- Time: 4.769 ms
124

    
125
-- try case-insensitive matching on geonames.org 'alternatenames'
126
-- note: this is a little sloppy because the set condition will return
127
-- more than one row if there are more than one matched alternate
128
-- country names, and which one gets used is non-deterministic; however,
129
-- we'll go back and remove those in the next update (specifically for
130
-- the cases where there are more than one *unique* matched countries)
131
UPDATE vcountry vc
132
    SET countryid = gc.geonameid
133
    FROM alternatenames a, countries gc
134
    WHERE lower(vc.countryutf8) = lower(a.alternatename)
135
      AND a.geonameid = gc.geonameid
136
      AND vc.countryid IS NULL;
137
-- UPDATE 76
138
-- Time: 22019.592 ms
139

    
140
-- go back and remove if altname lookup was ambiguous
141
UPDATE vcountry vc
142
    SET countryid = NULL
143
    WHERE countryutf8 IN (
144
        SELECT countryutf8
145
          FROM (
146
            SELECT DISTINCT vc.countryutf8, gc.iso3
147
              FROM vcountry vc,
148
                   alternatenames a,
149
                   countries gc
150
              WHERE lower(vc.countryutf8) = lower(a.alternatename)
151
                AND a.geonameid = gc.geonameid) lookup
152
          GROUP BY countryutf8
153
          HAVING COUNT(*) > 1);
154
-- UPDATE 2
155
-- Time: 2296.257 ms
156
-- Removes: Congo, CONGO
157

    
158
-- merge in GADM country names (our standard names)
159
ALTER TABLE vcountry ADD COLUMN countrystd text;
160
UPDATE vcountry vc
161
  SET countrystd = name_0
162
  FROM gadm_country_lookup gadm0
163
  WHERE vc.countryid=gadm0.countryid;
164
-- UPDATE 377
165
-- Time: 6.254 ms
166

    
167

    
168
-------------------------
169
-- scrub stateprovince --
170
-------------------------
171

    
172
CREATE TABLE vstate AS
173
  SELECT DISTINCT countryid, stateprovince
174
  FROM vegbien_geoscrub join vcountry using (country)
175
  WHERE countryid IS NOT NULL
176
    AND stateprovince IS NOT NULL;
177
-- SELECT 3312
178
-- Time: 4164.948 ms
179
ALTER TABLE vstate ADD COLUMN stateprovinceutf8 text;
180
ALTER TABLE vstate ADD COLUMN stateprovinceid integer;
181

    
182
-- try to clean up character representations
183
UPDATE vstate
184
    SET stateprovinceutf8 = u2L2u(stateprovince);
185
-- UPDATE 3312
186
-- Time: 92.702 ms
187

    
188
-- replace common mojibake with valid utf8 replacement character
189
-- note: second pass of u2L2u would do the same thing, but this is a
190
-- little more explicit about the intended effect
191
UPDATE vstate
192
    SET stateprovinceutf8 = replace(stateprovinceutf8, '�', chr(65533));
193

    
194
-- manually replace html character codes found in the names
195
-- todo: use some library to do this exhaustively rather than using
196
-- manually specified updates
197
UPDATE vstate
198
    SET stateprovinceutf8 = replace(stateprovinceutf8, ''', chr(39));
199
UPDATE vstate
200
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'á', chr(225));
201
UPDATE vstate
202
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'ã', chr(227));
203
UPDATE vstate
204
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'é', chr(233));
205
UPDATE vstate
206
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'é', chr(233));
207
UPDATE vstate
208
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'í', chr(237));
209
UPDATE vstate
210
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'ñ', chr(241));
211
UPDATE vstate
212
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'ó', chr(243));
213
UPDATE vstate
214
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'ô', chr(244));
215
UPDATE vstate
216
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'ü', chr(252));
217

    
218
-- try matching against direct stateprovince names
219
UPDATE vstate vs
220
    SET stateprovinceid = gn.geonameid
221
    FROM geonames gn,
222
         countries c
223
    WHERE lower(vs.stateprovinceutf8) = lower(gn.name)
224
      AND gn.countrycode=c.iso
225
      AND c.geonameid=vs.countryid
226
      AND gn.featurecode='ADM1';
227
-- UPDATE 238
228
-- Time: 2919.512 ms
229

    
230
-- try case-insensitive matching on geonames.org 'alternatenames'
231
-- note: this is a little sloppy because the set condition will return
232
-- more than one row if there are more than one matched alternate
233
-- stateprovince names, and which one gets used is non-deterministic
234
UPDATE vstate vs
235
    SET stateprovinceid = gn.geonameid
236
    FROM alternatenames a,
237
         geonames gn,
238
         countries c
239
    WHERE lower(vs.stateprovinceutf8) = lower(a.alternatename)
240
      AND a.geonameid = gn.geonameid
241
      AND gn.countrycode=c.iso
242
      AND c.geonameid=vs.countryid
243
      AND gn.featurecode='ADM1'
244
      AND vs.stateprovinceid IS NULL;
245
-- UPDATE 1281
246
-- Time: 5229.274 ms
247

    
248
-- try doing a 'like' query that ignores replacement characters (�)
249
UPDATE vstate vs
250
    SET stateprovinceid = gn.geonameid
251
    FROM alternatenames a,
252
         geonames gn,
253
         countries c
254
    WHERE a.alternatename ILIKE
255
          regexp_replace(stateprovinceutf8, chr(65533), '_')
256
      AND stateprovinceutf8 LIKE '%'||chr(65533)||'%'
257
      AND a.geonameid = gn.geonameid
258
      AND gn.countrycode=c.iso
259
      AND c.geonameid=vs.countryid
260
      AND gn.featurecode='ADM1'
261
      AND vs.stateprovinceid IS NULL;
262
-- UPDATE 112
263
-- Time: 6232.106 ms
264

    
265
-- try matching against alternatenames column in geonames table
266
UPDATE vstate vs
267
    SET stateprovinceid = gn.geonameid
268
    FROM geonames gn,
269
         countries c
270
    WHERE lower(vs.stateprovinceutf8) =
271
          ANY (string_to_array(lower(gn.alternatenames), ','))
272
      AND gn.countrycode=c.iso
273
      AND c.geonameid=vs.countryid
274
      AND gn.featurecode='ADM1'
275
      AND vs.stateprovinceid IS NULL;
276
-- UPDATE 220
277
-- Time: 5053.520 ms
278

    
279
-- try matching against BIEN alt stateprovince names
280
UPDATE vstate vs
281
    SET stateprovinceid = g.geonameid
282
    FROM alt_stateprovince asp,
283
         countries c,
284
         geonames g
285
    WHERE lower(vs.stateprovinceutf8) = lower(asp.alternatename)
286
      AND asp.country = c.country
287
      AND c.iso = g.countrycode
288
      AND asp.stateprovince = g.name
289
      AND g.featurecode='ADM1'
290
      AND vs.stateprovinceid IS NULL;
291
-- UPDATE 45
292
-- Time: 3452.274 ms
293

    
294
-- todo: the same stuff done for country to deal with Congo-type case
295

    
296
-- merge in GADM stateprovince names (our standard names)
297
ALTER TABLE vstate ADD COLUMN stateprovincestd text;
298
UPDATE vstate vs
299
  SET stateprovincestd = name_1
300
  FROM gadm_stateprovince_lookup gadm1
301
  WHERE vs.stateprovinceid=gadm1.stateprovinceid;
302
-- UPDATE 1896
303
-- Time: 23.946 ms
304

    
305
-- match any remaining unknowns directly to gadm
306
UPDATE vstate vs
307
  SET stateprovincestd = name_1
308
  FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm,
309
       vcountry vc
310
  WHERE vc.countryid = vs.countryid
311
    AND countrystd = name_0
312
    AND lower(stateprovinceutf8) = lower(name_1)
313
    AND stateprovincestd IS NULL;
314
-- UPDATE 55
315
-- Time: 4799.837 ms
316

    
317
-- try to recover geonameids for any of these so we can drill down to
318
-- the next level
319
UPDATE vstate vs
320
  SET stateprovinceid = gadm1.stateprovinceid
321
  FROM gadm_country_lookup gadm0,
322
       gadm_stateprovince_lookup gadm1
323
  WHERE vs.countryid = gadm0.countryid
324
    AND gadm0.name_0 = gadm1.name_0
325
    AND vs.stateprovincestd = gadm1.name_1
326
    AND vs.stateprovinceid IS NULL;
327
-- UPDATE 0
328
-- Time: 13.882 ms
329

    
330

    
331
/* TODO: is this still needed???
332
-- now reverse map
333
INSERT INTO gadm_stateprovince_lookup
334
select name_0, alternatename as name_1, g.geonameid
335
  from vstate vs,
336
       alt_stateprovince asp,
337
       geonames g,
338
       countries c,
339
       gadm_country_lookup gc
340
  where vs.stateprovinceid=g.geonameid
341
    AND asp.stateprovince=g.name
342
    AND vs.countryid=c.geonameid
343
    AND vs.countryid=gc.countryid
344
    AND asp.country=c.country
345
    AND vs.stateprovinceid not in (select stateprovinceid from gadm_stateprovince_lookup);
346
-- INSERT 0 4
347
-- Time: 20.228 ms
348
*/
349

    
350

    
351
------------------
352
-- scrub county --
353
------------------
354

    
355
CREATE TABLE vcounty AS
356
  SELECT DISTINCT countryid, stateprovinceid, county
357
  FROM vegbien_geoscrub
358
  JOIN vcountry USING (country)
359
  JOIN vstate USING (countryid, stateprovince)
360
  WHERE countryid IS NOT NULL
361
    AND stateprovinceid IS NOT NULL
362
    AND county IS NOT NULL;
363
-- SELECT 18715
364
-- Time: 3590.725 ms
365
ALTER TABLE vcounty ADD COLUMN countyutf8 text;
366
ALTER TABLE vcounty ADD COLUMN countyid integer;
367

    
368
-- try to clean up character representations
369
UPDATE vcounty
370
    SET countyutf8 = u2L2u(county);
371
-- UPDATE 18715
372
-- Time: 683.781 ms
373

    
374
-- replace common mojibake with valid utf8 replacement character
375
-- note: second pass of u2L2u would do the same thing, but this is a
376
-- little more explicit about the intended effect
377
UPDATE vcounty
378
    SET countyutf8 = replace(countyutf8, '�', chr(65533));
379
-- UPDATE 18715
380
-- Time: 153.395 ms
381

    
382
-- clean up some common municipality label abbreviations
383
-- todo: other fixes like this; check what brad did for bien2 on this front?
384
UPDATE vcounty
385
    SET countyutf8 = regexp_replace(countyutf8, ' Co[.]?$', ' County', 'i');
386

    
387
-- try matching against geonames (names and alternatenames)
388
UPDATE vcounty vc
389
  SET countyid = gn.geonameid
390
  FROM geonames gn,
391
       hierarchy h
392
  WHERE vc.stateprovinceid = h.parentid
393
    AND h.childid=gn.geonameid
394
    AND (lower(vc.countyutf8) = lower(gn.name)
395
        OR lower(vc.countyutf8) =
396
          ANY (string_to_array(lower(gn.alternatenames), ',')))
397
    AND gn.featurecode='ADM2';
398
-- UPDATE 6673
399
-- Time: 8369.737 ms
400

    
401
-- use alternatenames
402
UPDATE vcounty vc
403
    SET countyid = gn.geonameid
404
  FROM geonames gn,
405
       alternatenames a,
406
       hierarchy h
407
  WHERE vc.stateprovinceid = h.parentid
408
    AND h.childid = gn.geonameid
409
    AND a.geonameid = gn.geonameid
410
    AND lower(vc.countyutf8) = lower(a.alternatename)
411
    AND gn.featurecode='ADM2'
412
    AND vc.countyid IS NULL;
413
-- UPDATE 0
414
-- Time: 11255.926 ms
415

    
416
ALTER TABLE vcounty ADD COLUMN countystd text;
417

    
418
-- merge in GADM county names (our standard names)
419
-- for cases where we have geonameid
420
UPDATE vcounty vs
421
  SET countystd = name_2
422
  FROM gadm_county_lookup gadm1
423
  WHERE vs.countyid=gadm1.countyid
424
    AND countystd IS NULL;
425
-- UPDATE 6673
426
-- Time: 118.378 ms
427

    
428
-- match directly to gadm
429
UPDATE vcounty vco
430
  SET countystd = name_2
431
  FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
432
       vcountry vc,
433
       vstate vs
434
  WHERE vco.countryid = vc.countryid
435
    AND vco.stateprovinceid = vs.stateprovinceid
436
    AND countrystd = name_0
437
    AND stateprovincestd = name_1
438
    AND lower(countyutf8) = lower(name_2)
439
    AND countystd IS NULL;
440
-- UPDATE 5027
441
-- Time: 1410.377 ms
442

    
443
-----------------------------
444
-- put everything together --
445
-----------------------------
446

    
447
-- reconstitute the whole dang thang
448

    
449
-- new approach
450
CREATE TABLE geoscrub AS
451
SELECT decimallatitude, decimallongitude,
452
       country, stateprovince, county,
453
       countryid, stateprovinceid, countyid,
454
       countrystd, stateprovincestd, countystd
455
  FROM vegbien_geoscrub v
456
  LEFT JOIN vcountry USING (country)
457
  LEFT JOIN vstate USING (countryid, stateprovince)
458
  LEFT JOIN vcounty USING (countryid, stateprovinceid, county);
459
-- SELECT 1707970
460
-- Time: 26172.154 ms
461

    
462
-- try to squeeze out a few more direct stateprovince mappings to gadm2
463
UPDATE geoscrub
464
  SET stateprovincestd = name_1
465
  FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm
466
  WHERE country IS NOT NULL
467
    AND stateprovince IS NOT NULL
468
    AND countrystd = name_0
469
    AND stateprovincestd IS NULL
470
    AND stateprovince = name_1;
471
-- UPDATE 0
472
-- Time: 3630.973 ms
473

    
474
-- try to squeeze out a few more direct county mappings to gadm2
475
UPDATE geoscrub
476
  SET countystd = name_2
477
  FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm
478
  WHERE country IS NOT NULL
479
    AND stateprovince IS NOT NULL
480
    AND county IS NOT NULL
481
    AND countrystd = name_0
482
    AND stateprovincestd = name_1
483
    AND countystd IS NULL
484
    AND county = name_2;
485
-- UPDATE 69
486
-- Time: 3982.715 ms
(6-6/9)