Project

General

Profile

1
/*
2
  SQL statements carrying out country name, stateprovince name, and
3
  county name scrubbing on vegbien location data. Currently hard-coded
4
  to operate on a 'vegbien_geoscrub' table that contains the input for
5
  geoscrubbing, with the following columns:
6
     decimallatitude  (latitude, WGS84 decimal degrees)
7
     decimallongitude (longitude, WGS84 decimal degrees)
8
     country          (original asserted country name)
9
     stateprovince    (original asserted stateprovince name)
10
     county           (original asserted county name)
11
  (Note that the lat/lon columns are not needed for scrubbing, but are
12
  used later for geovalidation.)
13

    
14
 Misc issues to contemplate and possibly address:
15

    
16
  * Congo can be either "Democratic Republic of the Congo" (COD) or
17
    "Republic of the Congo" (COG), eh?
18
  * Vietnam: gadm2 seems to treat provinces as level2 units
19
  * Yugoslavia: no longer exists, not in geonames nor gadm2
20

    
21
  * Azerbaijan: Yevlakh City and Yevlakh Rayon both ADM1 in geonames,
22
    but just Yevlax in gadm2
23

    
24
  * England is not a geoname country, only the UK is
25

    
26
  * the general problem of historical names in cases where countries
27
    have either been split, combined, removed (?), or otherwise
28
    redefined
29

    
30
  * Cases with multiple alternativename matches???
31

    
32
 Notes:
33
 - the vegbien_geoscrub input used during development and testing this
34
   scrubbing procedure came from a select distinct on (lat, lon,
35
   country, state, province) from analytical_aggregate on vegbiendev
36

    
37
  Jim Regetz
38
  NCEAS
39
  Created Nov 2012
40
*/
41

    
42
BEGIN;
43

    
44
-------------------
45
-- scrub country --
46
-------------------
47

    
48
INSERT INTO vcountry
49
  SELECT DISTINCT country
50
    FROM vegbien_geoscrub
51
    WHERE country IS NOT NULL;
52
-- SELECT 439
53
-- Time: 862.413 ms
54

    
55
-- try to clean up character representations
56
UPDATE vcountry
57
    SET countryutf8 = u2L2u(country);
58
-- UPDATE 439
59
-- Time: 24.706 ms
60

    
61
-- try matching against direct country names
62
UPDATE vcountry vc
63
    SET countryid = gc.geonameid
64
    FROM countries gc
65
    WHERE lower(vc.countryutf8) = lower(gc.country);
66
-- UPDATE 219
67
-- Time: 12.384 ms
68

    
69
-- try matching against ISO 3166-1 alpha-3
70
UPDATE vcountry vc
71
    SET countryid = gc.geonameid
72
    FROM countries gc
73
    WHERE vc.countryutf8=gc.iso3
74
      AND vc.countryid IS NULL;
75
-- UPDATE 1
76
-- Time: 20.146 ms
77

    
78
-- try matching against ISO 3166-1 alpha-2
79
UPDATE vcountry vc
80
    SET countryid = gc.geonameid
81
    FROM countries gc
82
    WHERE vc.countryutf8 = iso
83
      AND vc.countryid IS NULL;
84
-- UPDATE 64
85
-- Time: 3.378 ms
86

    
87
/*
88
-- try matching against FIPS, but not if already matched (i.e.,
89
-- the 2-digit ISO match takes preference)
90
UPDATE vcountry vc
91
    SET countryid = gc.geonameid
92
    FROM countries gc
93
    WHERE vc.countryutf8 = fips
94
      AND vc.countryid IS NULL;
95
-- UPDATE 1
96
-- Time: 2.514 ms
97
*/
98

    
99
-- try matching against BIEN alt country names
100
UPDATE vcountry vc
101
    SET countryid = c.geonameid
102
    FROM alt_country a, countries c
103
    WHERE vc.countryutf8 = a.alternatename
104
      AND a.country = c.country
105
      AND vc.countryid IS NULL;
106
-- UPDATE 21
107
-- Time: 4.769 ms
108

    
109
-- try case-insensitive matching on geonames.org 'alternatenames'
110
-- note: this is a little sloppy because the set condition will return
111
-- more than one row if there are more than one matched alternate
112
-- country names, and which one gets used is non-deterministic; however,
113
-- we'll go back and remove those in the next update (specifically for
114
-- the cases where there are more than one *unique* matched countries)
115
UPDATE vcountry vc
116
    SET countryid = gc.geonameid
117
    FROM alternatenames a, countries gc
118
    WHERE lower(vc.countryutf8) = lower(a.alternatename)
119
      AND a.geonameid = gc.geonameid
120
      AND vc.countryid IS NULL;
121
-- UPDATE 76
122
-- Time: 22019.592 ms
123

    
124
-- go back and remove if altname lookup was ambiguous
125
UPDATE vcountry vc
126
    SET countryid = NULL
127
    WHERE countryutf8 IN (
128
        SELECT countryutf8
129
          FROM (
130
            SELECT DISTINCT vc.countryutf8, gc.iso3
131
              FROM vcountry vc,
132
                   alternatenames a,
133
                   countries gc
134
              WHERE lower(vc.countryutf8) = lower(a.alternatename)
135
                AND a.geonameid = gc.geonameid) lookup
136
          GROUP BY countryutf8
137
          HAVING COUNT(*) > 1);
138
-- UPDATE 2
139
-- Time: 2296.257 ms
140
-- Removes: Congo, CONGO
141

    
142
-- merge in GADM country names (our standard names)
143
UPDATE vcountry vc
144
  SET countrystd = name_0
145
  FROM gadm_country_lookup gadm0
146
  WHERE vc.countryid=gadm0.countryid;
147
-- UPDATE 377
148
-- Time: 6.254 ms
149

    
150

    
151
-------------------------
152
-- scrub stateprovince --
153
-------------------------
154

    
155
INSERT INTO vstate
156
  SELECT DISTINCT countryid, stateprovince
157
  FROM vegbien_geoscrub join vcountry using (country)
158
  WHERE countryid IS NOT NULL
159
    AND stateprovince IS NOT NULL;
160
-- SELECT 3312
161
-- Time: 4164.948 ms
162

    
163
-- try to clean up character representations
164
UPDATE vstate
165
    SET stateprovinceutf8 = u2L2u(stateprovince);
166
-- UPDATE 3312
167
-- Time: 92.702 ms
168

    
169
-- replace common mojibake with valid utf8 replacement character
170
-- note: second pass of u2L2u would do the same thing, but this is a
171
-- little more explicit about the intended effect
172
UPDATE vstate
173
    SET stateprovinceutf8 = replace(stateprovinceutf8, '�', chr(65533));
174

    
175
-- manually replace html character codes found in the names
176
-- todo: use some library to do this exhaustively rather than using
177
-- manually specified updates
178
UPDATE vstate
179
    SET stateprovinceutf8 = replace(stateprovinceutf8, ''', chr(39));
180
UPDATE vstate
181
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'á', chr(225));
182
UPDATE vstate
183
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'ã', chr(227));
184
UPDATE vstate
185
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'é', chr(233));
186
UPDATE vstate
187
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'é', chr(233));
188
UPDATE vstate
189
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'í', chr(237));
190
UPDATE vstate
191
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'ñ', chr(241));
192
UPDATE vstate
193
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'ó', chr(243));
194
UPDATE vstate
195
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'ô', chr(244));
196
UPDATE vstate
197
    SET stateprovinceutf8 = replace(stateprovinceutf8, 'ü', chr(252));
198

    
199
-- try matching against direct stateprovince names
200
UPDATE vstate vs
201
    SET stateprovinceid = gn.geonameid
202
    FROM geonames gn,
203
         countries c
204
    WHERE lower(vs.stateprovinceutf8) = lower(gn.name)
205
      AND gn.countrycode=c.iso
206
      AND c.geonameid=vs.countryid
207
      AND gn.featurecode='ADM1';
208
-- UPDATE 238
209
-- Time: 2919.512 ms
210

    
211
-- try case-insensitive matching on geonames.org 'alternatenames'
212
-- note: this is a little sloppy because the set condition will return
213
-- more than one row if there are more than one matched alternate
214
-- stateprovince names, and which one gets used is non-deterministic
215
UPDATE vstate vs
216
    SET stateprovinceid = gn.geonameid
217
    FROM alternatenames a,
218
         geonames gn,
219
         countries c
220
    WHERE lower(vs.stateprovinceutf8) = lower(a.alternatename)
221
      AND a.geonameid = gn.geonameid
222
      AND gn.countrycode=c.iso
223
      AND c.geonameid=vs.countryid
224
      AND gn.featurecode='ADM1'
225
      AND vs.stateprovinceid IS NULL;
226
-- UPDATE 1281
227
-- Time: 5229.274 ms
228

    
229
-- try doing a 'like' query that ignores replacement characters (�)
230
UPDATE vstate vs
231
    SET stateprovinceid = gn.geonameid
232
    FROM alternatenames a,
233
         geonames gn,
234
         countries c
235
    WHERE a.alternatename ILIKE
236
          regexp_replace(stateprovinceutf8, chr(65533), '_')
237
      AND stateprovinceutf8 LIKE '%'||chr(65533)||'%'
238
      AND a.geonameid = gn.geonameid
239
      AND gn.countrycode=c.iso
240
      AND c.geonameid=vs.countryid
241
      AND gn.featurecode='ADM1'
242
      AND vs.stateprovinceid IS NULL;
243
-- UPDATE 112
244
-- Time: 6232.106 ms
245

    
246
-- try matching against alternatenames column in geonames table
247
UPDATE vstate vs
248
    SET stateprovinceid = gn.geonameid
249
    FROM geonames gn,
250
         countries c
251
    WHERE lower(vs.stateprovinceutf8) =
252
          ANY (string_to_array(lower(gn.alternatenames), ','))
253
      AND gn.countrycode=c.iso
254
      AND c.geonameid=vs.countryid
255
      AND gn.featurecode='ADM1'
256
      AND vs.stateprovinceid IS NULL;
257
-- UPDATE 220
258
-- Time: 5053.520 ms
259

    
260
-- try matching against BIEN alt stateprovince names
261
UPDATE vstate vs
262
    SET stateprovinceid = g.geonameid
263
    FROM alt_stateprovince asp,
264
         countries c,
265
         geonames g
266
    WHERE lower(vs.stateprovinceutf8) = lower(asp.alternatename)
267
      AND asp.country = c.country
268
      AND c.iso = g.countrycode
269
      AND asp.stateprovince = g.name
270
      AND g.featurecode='ADM1'
271
      AND vs.stateprovinceid IS NULL;
272
-- UPDATE 45
273
-- Time: 3452.274 ms
274

    
275
-- todo: the same stuff done for country to deal with Congo-type case
276

    
277
-- merge in GADM stateprovince names (our standard names)
278
UPDATE vstate vs
279
  SET stateprovincestd = name_1
280
  FROM gadm_stateprovince_lookup gadm1
281
  WHERE vs.stateprovinceid=gadm1.stateprovinceid;
282
-- UPDATE 1896
283
-- Time: 23.946 ms
284

    
285
-- match any remaining unknowns directly to gadm
286
UPDATE vstate vs
287
  SET stateprovincestd = name_1
288
  FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm,
289
       vcountry vc
290
  WHERE vc.countryid = vs.countryid
291
    AND countrystd = name_0
292
    AND lower(stateprovinceutf8) = lower(name_1)
293
    AND stateprovincestd IS NULL;
294
-- UPDATE 55
295
-- Time: 4799.837 ms
296

    
297
-- try to recover geonameids for any of these so we can drill down to
298
-- the next level
299
UPDATE vstate vs
300
  SET stateprovinceid = gadm1.stateprovinceid
301
  FROM gadm_country_lookup gadm0,
302
       gadm_stateprovince_lookup gadm1
303
  WHERE vs.countryid = gadm0.countryid
304
    AND gadm0.name_0 = gadm1.name_0
305
    AND vs.stateprovincestd = gadm1.name_1
306
    AND vs.stateprovinceid IS NULL;
307
-- UPDATE 0
308
-- Time: 13.882 ms
309

    
310

    
311
/* TODO: is this still needed???
312
-- now reverse map
313
INSERT INTO gadm_stateprovince_lookup
314
select name_0, alternatename as name_1, g.geonameid
315
  from vstate vs,
316
       alt_stateprovince asp,
317
       geonames g,
318
       countries c,
319
       gadm_country_lookup gc
320
  where vs.stateprovinceid=g.geonameid
321
    AND asp.stateprovince=g.name
322
    AND vs.countryid=c.geonameid
323
    AND vs.countryid=gc.countryid
324
    AND asp.country=c.country
325
    AND vs.stateprovinceid not in (select stateprovinceid from gadm_stateprovince_lookup);
326
-- INSERT 0 4
327
-- Time: 20.228 ms
328
*/
329

    
330

    
331
------------------
332
-- scrub county --
333
------------------
334

    
335
INSERT INTO vcounty
336
  SELECT DISTINCT countryid, stateprovinceid, county
337
  FROM vegbien_geoscrub
338
  JOIN vcountry USING (country)
339
  JOIN vstate USING (countryid, stateprovince)
340
  WHERE countryid IS NOT NULL
341
    AND stateprovinceid IS NOT NULL
342
    AND county IS NOT NULL;
343
-- SELECT 18715
344
-- Time: 3590.725 ms
345

    
346
-- try to clean up character representations
347
UPDATE vcounty
348
    SET countyutf8 = u2L2u(county);
349
-- UPDATE 18715
350
-- Time: 683.781 ms
351

    
352
-- replace common mojibake with valid utf8 replacement character
353
-- note: second pass of u2L2u would do the same thing, but this is a
354
-- little more explicit about the intended effect
355
UPDATE vcounty
356
    SET countyutf8 = replace(countyutf8, '�', chr(65533));
357
-- UPDATE 18715
358
-- Time: 153.395 ms
359

    
360
-- clean up some common municipality label abbreviations
361
-- todo: other fixes like this; check what brad did for bien2 on this front?
362
UPDATE vcounty
363
    SET countyutf8 = regexp_replace(countyutf8, ' Co[.]?$', ' County', 'i');
364

    
365
-- try matching against geonames (names and alternatenames)
366
UPDATE vcounty vc
367
  SET countyid = gn.geonameid
368
  FROM geonames gn,
369
       hierarchy h
370
  WHERE vc.stateprovinceid = h.parentid
371
    AND h.childid=gn.geonameid
372
    AND (lower(vc.countyutf8) = lower(gn.name)
373
        OR lower(vc.countyutf8) =
374
          ANY (string_to_array(lower(gn.alternatenames), ',')))
375
    AND gn.featurecode='ADM2';
376
-- UPDATE 6673
377
-- Time: 8369.737 ms
378

    
379
-- use alternatenames
380
UPDATE vcounty vc
381
    SET countyid = gn.geonameid
382
  FROM geonames gn,
383
       alternatenames a,
384
       hierarchy h
385
  WHERE vc.stateprovinceid = h.parentid
386
    AND h.childid = gn.geonameid
387
    AND a.geonameid = gn.geonameid
388
    AND lower(vc.countyutf8) = lower(a.alternatename)
389
    AND gn.featurecode='ADM2'
390
    AND vc.countyid IS NULL;
391
-- UPDATE 0
392
-- Time: 11255.926 ms
393

    
394
-- merge in GADM county names (our standard names)
395
-- for cases where we have geonameid
396
UPDATE vcounty vs
397
  SET countystd = name_2
398
  FROM gadm_county_lookup gadm1
399
  WHERE vs.countyid=gadm1.countyid
400
    AND countystd IS NULL;
401
-- UPDATE 6673
402
-- Time: 118.378 ms
403

    
404
-- match directly to gadm
405
UPDATE vcounty vco
406
  SET countystd = name_2
407
  FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
408
       vcountry vc,
409
       vstate vs
410
  WHERE vco.countryid = vc.countryid
411
    AND vco.stateprovinceid = vs.stateprovinceid
412
    AND countrystd = name_0
413
    AND stateprovincestd = name_1
414
    AND lower(countyutf8) = lower(name_2)
415
    AND countystd IS NULL;
416
-- UPDATE 5027
417
-- Time: 1410.377 ms
418

    
419
COMMIT;
420

    
(6-6/13)