Project

General

Profile

« Previous | Next » 

Revision 10707

View differences:

derived/biengeo/geonames-assessment.sql
1
/*
2

  
3
  Misc notes on how well we did scrubbing names. And lots of misc SQL
4
  statements for examining various scrubbing outcomes, some of which may
5
  not be so relevant anymore as a consequences of other changes.
6

  
7
Provisional (and not necessarily fully updated) summary of geoname
8
scrubbing performance on vegbien_geoscrub
9
(= geoscrub_input from vegbien schema public.2012-11-04-07-34-10.r5984)
10
========================================================================
11
1707970 total unique location determinations
12
  32541 distinct on country, stateprovince, county (including all nulls)
13
========================================================================
14
* Country *
15
1485885 assertions
16
1479338 mapped to geonames
17
        (6547 not mapped to geonames)
18
1479338 mapped to gadm2
19
        (6547 not mapped to gadm2)
20
        (0 cases of failed mapping to gadm2 name_0)
21
1479338 mapped both to geonames and to gadm2
22
========================================================================
23
* Country with State/Province *
24
1408955 assertions
25
1388525 mapped to geonames
26
        (20430 not mapped to geonames)
27
        (5576 of these had unmapped country)
28
1389239 mapped to gadm2
29
        (19716 not mapped to gadm2)
30
        (5576 of these had unmapped country)
31
1387915 mapped both to geonames and to gadm2
32
========================================================================
33
* Country with State/Province with County/Parish*
34
 276809 assertions
35
 145767 mapped to geonames
36
 213795 mapped to gadm2
37
 140983 mapped both to geonames and to gadm2
38
========================================================================
39

  
40
========================================================================
41
Distinct country names:
42
439 total unique country names (non-null)
43
377 recognized unique country names (non-null)
44
 62 unrecognized unique country names (non-null)
45
========================================================================
46
Distinct country+stateprovince names:
47
4764 total unique country and stateprovince (stateprovince non-null)
48
4704 total unique country and stateprovince (both non-null)
49
2944 mapped to geonames
50
     (1760 not mapped to geonames)
51
2950 mapped to gadm2
52
     (1754 not mapped to gadm2)
53
2885 mapped both to geonames and to gadm2
54
     (1819 not mapped to both)
55
------------------------------------------------------------------------
56
* Distinct mappedcountry+stateprovince names *
57
3312 total unique
58
1895 mapped to geonames
59
     (1417 not mapped to geonames)
60
1901 mapped to gadm2
61
     (1411 not mapped to gadm2)
62
========================================================================
63
Distinct country+stateprovince+county names:
64
27440 total unique country and stateprovince and county (county non-null)
65
25764 total unique country and stateprovince and county (all non-null)
66
8507 mapped to geonames
67
     (17257 not mapped to geonames)
68
12610 mapped to gadm2
69
     (13154 not mapped to gadm2)
70
4917 mapped both to geonames and to gadm2
71
     (20847 not mapped to both)
72
------------------------------------------------------------------------
73
* Distinct mappedcountry+mappedstateprovince+county names *
74
18715 total unique
75
6673 mapped to geonames
76
     (12042 not mapped to geonames)
77
11310 mapped to gadm2
78
     (7405 not mapped to gadm2)
79
========================================================================
80

  
81

  
82
for comparison, here are bien2 stats based on boyle/donoghue scrubbing:
83
753302 total unique location determinations
84
 have asserted country
85
745321 have scrubbed country
86
 have asserted country and stateprovince
87
 have scrubbed country and stateprovince
88
 have asserted country and stateprovince and county
89
 have scrubbed country and stateprovince and county
90

  
91
  Jim Regetz
92
  NCEAS
93
  Created Nov 2012
94
*/
95

  
96
-- Generate human-readable view of all distinct cases where we haven't
97
-- obtained a match for the asserted country
98
CREATE TEMP VIEW unmatched_country AS
99
SELECT count(*), country FROM
100
  (SELECT
101
      CASE WHEN length(countryutf8)>4
102
           THEN initcap(countryutf8)
103
           ELSE countryutf8
104
       END AS country
105
    FROM scrubbed
106
    JOIN vcountry USING (country)
107
    WHERE scrubbed.country IS NOT NULL
108
      AND scrubbed.countryid IS NULL) summary
109
  GROUP BY country
110
  ORDER BY count DESC;
111
-- Generate human-readable view of all distinct cases where we have
112
-- obtained match for the asserted country, but not stateprovince
113
CREATE TEMP VIEW unmatched_stateprovince AS
114
SELECT count(*), country, stateprovince FROM
115
  (SELECT countries.country AS country,
116
      CASE WHEN length(stateprovinceutf8)>3
117
           THEN initcap(stateprovinceutf8)
118
           ELSE stateprovinceutf8
119
       END AS stateprovince
120
    FROM scrubbed
121
    LEFT JOIN vstate USING (countryid, stateprovince)
122
    LEFT JOIN countries ON vstate.countryid=geonameid
123
    WHERE scrubbed.stateprovince IS NOT NULL
124
      AND scrubbed.stateprovinceid IS NULL
125
      AND scrubbed.country IS NOT NULL) summary
126
  GROUP BY country, stateprovince
127
  ORDER BY count DESC;
128
/*
129
SELECT DISTINCT countries.country,
130
    CASE WHEN length(stateprovinceutf8)>3
131
         THEN initcap(stateprovinceutf8)
132
         ELSE stateprovinceutf8
133
     END AS stateprovince
134
  FROM scrubbed
135
  JOIN vstate USING (countryid, stateprovince)
136
  JOIN countries ON vstate.countryid=geonameid
137
  WHERE scrubbed.stateprovince IS NOT NULL
138
    AND scrubbed.stateprovinceid IS NULL
139
  ORDER BY country, stateprovince;
140
*/
141

  
142
-- Generate human-readable view of all distinct cases where we have
143
-- obtained match for the asserted country and stateprovince, but not
144
-- county
145
/*
146
CREATE TEMP VIEW unmatched_county AS
147
SELECT DISTINCT countries.country,
148
    CASE WHEN length(stateprovinceutf8)>3
149
         THEN initcap(stateprovinceutf8)
150
         ELSE stateprovinceutf8
151
     END AS stateprovince
152
  FROM scrubbed
153
  JOIN vstate USING (countryid, stateprovince)
154
  JOIN countries ON vstate.countryid=geonameid
155
  WHERE scrubbed.stateprovince IS NOT NULL
156
    AND scrubbed.stateprovinceid IS NULL
157
  ORDER BY country, stateprovince;
158
*/
159

  
160
/*
161
-- which states aren't in gadm2?
162
select distinct iso3, s.name state
163
      from scrubbed
164
      join countries c on countryid=c.geonameid
165
      join geonames s on stateprovinceid=s.geonameid
166
except select iso, name_1 state from gadm2;
167

  
168
select country, count(*)
169
  from scrubbed s
170
  left join gadm_stateprovince_lookup using (stateprovinceid)
171
  where name_0 is null
172
    and stateprovinceid is not null
173
  group by country
174
  order by count desc;
175

  
176

  
177
select name
178
  from gadm2
179
  join alternatenames on name_1=alternatename
180
  join geonames using (geonameid)
181
  join countries on countrycode=countries.iso
182
  where featurecode='ADM1'
183
    and name_1='North Solomons';
184
*/
185

  
186

  
187
/*
188
-- report distinct geonames-scrubbed country+stateprovince pairs that
189
-- we didn't match to any gadm polygons
190
SELECT DISTINCT c.country, g.name
191
 FROM countries c
192
 JOIN vstate v ON geonameid=countryid
193
 JOIN geonames g ON g.geonameid=v.stateprovinceid
194
 WHERE stateprovinceid NOT IN (
195
           SELECT stateprovinceid FROM gadm_stateprovince_lookup
196
       )
197
 ORDER BY c.country, g.name;
198

  
199
-- report failed stateprovince mappings from gadm2 to geonames
200
SELECT countries.country AS countryname,
201
       name AS stateprovincename,
202
       count(*)
203
  FROM scrubbed
204
  LEFT JOIN gadm_stateprovince_lookup USING (stateprovinceid)
205
  LEFT JOIN countries ON countryid=countries.geonameid
206
  LEFT JOIN geonames ON stateprovinceid=geonames.geonameid
207
  WHERE stateprovinceid IS NOT NULL
208
    AND name_1 IS NULL
209
  GROUP BY countryname, stateprovincename
210
  ORDER BY countryname, stateprovincename;
211
*/
212

  
213
/*
214
select iso, name_1, geonameid as stateid
215
  from (select distinct iso, name_1 from gadm2) gadm2
216
  left join (
217
      select distinct gadm2.iso,
218
             name_1,
219
             alternatenames.geonameid,
220
             geonames.name
221
        from (select distinct iso, name_1 from gadm2) gadm2
222
        join alternatenames on name_1=alternatename
223
        join geonames using (geonameid)
224
        join countries on countrycode=countries.iso
225
        where featurecode='ADM1'
226
          and gadm2.iso=countries.iso3) foo
227
    using (iso, name_1);
228
*/
229

  
230
-- how did we do?
231

  
232
-- country
233

  
234
-- percent of unique locations with asserted country names
235
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
236
  FROM scrubbed WHERE country IS NOT NULL;
237
-- 87.0%
238
-- percent of unique locations with geoname-scrubbed country names
239
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
240
  FROM scrubbed WHERE countryid IS NOT NULL;
241
-- 86.6%
242
-- percent of unique locations with gadm-matchedl country names
243
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
244
  FROM scrubbed WHERE countryid IN (SELECT countryid FROM gadm_country_lookup);
245
-- 86.6%
246

  
247
-- stateprovince
248

  
249
-- note: 212221 records have a stateprovince but no country! those are
250
--       considered missing below
251

  
252
-- percent of unique locations with asserted stateprovince names
253
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
254
  FROM scrubbed WHERE country IS NOT NULL AND stateprovince IS NOT NULL;
255
-- 82.5%
256
-- percent of unique locations with geoname-scrubbed stateprovince names
257
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
258
  FROM scrubbed WHERE stateprovinceid IS NOT NULL;
259
-- 80.2%
260
-- percent of unique locations with gadm-matchedl stateprovince names
261
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
262
  FROM scrubbed
263
  WHERE stateprovinceid IN (SELECT stateprovinceid FROM gadm_stateprovince_lookup);
264
-- 79.5%
265

  
266
-- stateprovince
267

  
268
-- percent of unique locations with asserted county names
269
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
270
  FROM scrubbed
271
  WHERE country IS NOT NULL AND stateprovince IS NOT NULL AND county IS NOT NULL;
272
-- 16.2%
273
-- percent of unique locations with geoname-scrubbed county names
274
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
275
  FROM scrubbed WHERE countyid IS NOT NULL;
276
-- 6.3%
277
-- percent of unique locations with gadm-matchedl county names
278
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
279
  FROM scrubbed
280
  WHERE countyid IN (SELECT countyid FROM gadm_county_lookup);
281
-- tbd
282

  
283
SELECT CASE WHEN countryid IS NOT NULL
284
            THEN 'matched'
285
            ELSE 'unmatched' END AS matches,
286
       COUNT(*)
287
  FROM vcountry
288
  GROUP BY matches;
289

  
290
SELECT CASE WHEN stateprovinceid IS NOT NULL
291
            THEN 'matched'
292
            ELSE 'unmatched' END AS matches,
293
       COUNT(*)
294
  FROM vstate
295
  GROUP BY matches;
296
--  matched   |  1770
297
--  unmatched |  1541
298

  
299
SELECT CASE WHEN countyid IS NOT NULL
300
            THEN 'matched'
301
            ELSE 'unmatched' END AS matches,
302
       COUNT(*)
303
  FROM vcounty
304
  GROUP BY matches;
305
--  matched   |  4295
306
--  unmatched | 14354
307

  
308
SELECT CASE WHEN country IS NULL
309
                 THEN 'no country provided'
310
            WHEN country IS NOT NULL AND countryid IS NULL
311
                 THEN 'unrecognized country'
312
            ELSE
313
                 'scrubbed country'
314
            END results,
315
       COUNT(*)
316
  FROM scrubbed
317
  GROUP BY results;
318

  
319
SELECT CASE WHEN country IS NULL
320
                 THEN 'no fully qualified stateprovince provided'
321
            WHEN country IS NOT NULL AND stateprovince IS NOT NULL AND stateprovinceid IS NULL
322
                 THEN 'unrecognized stateprovince'
323
            ELSE
324
                 'scrubbed stateprovince'
325
            END results,
326
       COUNT(*)
327
  FROM scrubbed
328
  GROUP BY results;
329

  
330
-- file output
331
SELECT DISTINCT countryid, country,
332
    stateprovinceid, stateprovince,
333
    countyid, county
334
  FROM scrubbed
335
  ORDER BY country, stateprovince, county;
336

  
337
/*
338
-- look at the whole iso2 vs fips thing
339
SELECT DISTINCT vc.country, i.country iso, f.country fips, stateprovince
340
  FROM vcountry vc
341
  LEFT JOIN countries i ON vc.country=i.iso
342
  LEFT JOIN countries f ON vc.country=f.fips
343
  LEFT JOIN scrubbed s ON s.country=vc.country
344
  WHERE vc.countryid IS NULL
345
    AND i.country IS NOT NULL
346
    AND f.country IS NOT NULL
347
    ORDER BY vc.country;
348
*/
derived/biengeo/geonames-to-gadm.sql
1
/*
2
  SQL statements to generate tables mapping geonames features at each
3
  relevant administative level (country, state/province, county/parish)
4
  to GADM2 features.
5

  
6
  Todo:
7
  * Be tolerant to presence/absence of administrative qualifiers in
8
    names? (Already doing this mildly for County/Co/Co.)
9
    - Municipality
10
    - Barrio
11
    - District
12

  
13
  * Get rid of rudundancy
14
  * Try matching against GADM2 varname_* columns?
15

  
16
  Jim Regetz
17
  NCEAS
18
  Created Nov 2012
19
*/
20

  
21
-----------------------
22
-- Level 0 (Country) --
23
-----------------------
24

  
25
-- map gadm2 level0 to geonames countries
26
CREATE TABLE gadm_country_lookup (
27
        countryid integer primary key references geonames (geonameid),
28
        name_0 text
29
);
30
INSERT INTO gadm_country_lookup (countryid)
31
  SELECT DISTINCT geonameid
32
    FROM countries
33
    WHERE geonameid IS NOT NULL;
34
-- INSERT 0 250
35
-- Time: 20.770 ms
36
UPDATE gadm_country_lookup lu
37
  SET name_0 = gadm.name_0
38
  FROM (SELECT DISTINCT iso, name_0 FROM gadm2) gadm,
39
       countries c
40
  WHERE lu.countryid = c.geonameid
41
    AND c.iso3 = gadm.iso;
42
-- UPDATE 248
43
-- Time: 3562.601 ms
44

  
45
------------------------------
46
-- Level 1 (State/Province) --
47
------------------------------
48

  
49
-- map gadm2 level1 to geonames ADM1
50
CREATE TABLE gadm_stateprovince_lookup (
51
        stateprovinceid integer primary key references geonames (geonameid),
52
        name_0 text,
53
        name_1 text
54
);
55
INSERT INTO gadm_stateprovince_lookup (stateprovinceid)
56
  SELECT DISTINCT geonameid
57
    FROM geonames
58
    WHERE featurecode='ADM1';
59
-- INSERT 0 3841
60
-- Time: 7085.635 ms
61

  
62

  
63
-- try matching against alternatenames table
64
UPDATE gadm_stateprovince_lookup gs
65
  SET name_0 = gadm.name_0,
66
      name_1 = gadm.name_1
67
  FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm,
68
       alternatenames a,
69
       geonames g,
70
       hierarchy h,
71
       gadm_country_lookup gc
72
  WHERE lower(gadm.name_1)=lower(alternatename)
73
    AND gs.stateprovinceid = a.geonameid
74
    AND a.geonameid = g.geonameid
75
    AND gs.stateprovinceid = h.childid
76
    AND h.parentid = gc.countryid
77
    AND gc.name_0 = gadm.name_0
78
    AND g.featurecode='ADM1'
79
    AND gs.name_1 IS NULL;
80
-- UPDATE 2145
81
-- Time: 3453.568 ms
82

  
83
-- try matching against geonames (names and alternatenames)
84
UPDATE gadm_stateprovince_lookup gs
85
  SET name_0 = gadm.name_0,
86
      name_1 = gadm.name_1
87
  FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm,
88
       geonames g,
89
       hierarchy h,
90
       gadm_country_lookup gc
91
  WHERE (lower(gadm.name_1)=lower(g.name)
92
         OR lower(gadm.name_1) =
93
            ANY (string_to_array(lower(g.alternatenames), ',')))
94
    AND gs.stateprovinceid = h.childid
95
    AND h.parentid = gc.countryid
96
    AND gc.name_0 = gadm.name_0
97
    AND gs.stateprovinceid = g.geonameid
98
    AND g.featurecode='ADM1'
99
    AND gs.name_1 IS NULL;
100
-- UPDATE 319
101
-- Time: 857.885 ms
102

  
103

  
104
-- now again but against our manual mapping
105
UPDATE gadm_stateprovince_lookup gs
106
  SET name_0 = gadm.name_0,
107
      name_1 = gadm.name_1
108
  FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm,
109
       alt_stateprovince asp,
110
       geonames g,
111
       countries c
112
  WHERE gadm.name_1=asp.alternatename
113
    AND gadm.iso=c.iso3
114
    AND asp.country=c.country
115
    AND gs.stateprovinceid = g.geonameid
116
    AND g.countrycode=c.iso
117
    AND g.name=asp.stateprovince
118
    AND gs.name_0 IS NULL;
119
-- UPDATE 29
120
-- Time: 777.376 ms
121

  
122
----------------------
123
-- Level 2 (County) --
124
----------------------
125

  
126
-- map gadm2 level2 to geonames ADM2
127
CREATE TABLE gadm_county_lookup (
128
        countyid integer primary key references geonames (geonameid),
129
        name_0 text,
130
        name_1 text,
131
        name_2 text
132
);
133
INSERT INTO gadm_county_lookup (countyid)
134
  SELECT DISTINCT geonameid
135
    FROM geonames
136
    WHERE featurecode='ADM2';
137
-- INSERT 0 32374
138
-- Time: 13177.539 ms
139

  
140
-- try matching against geonames (names and alternatenames)
141
UPDATE gadm_county_lookup gcl
142
  SET name_0 = gadm.name_0,
143
      name_1 = gadm.name_1,
144
      name_2 = gadm.name_2
145
  FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
146
       geonames g,
147
       hierarchy h,
148
       gadm_stateprovince_lookup gsl
149
  WHERE (lower(gadm.name_2)=lower(g.name)
150
         OR lower(gadm.name_2) =
151
            ANY (string_to_array(lower(g.alternatenames), ',')))
152
    AND gcl.countyid = g.geonameid
153
    AND gcl.countyid = h.childid
154
    AND h.parentid = gsl.stateprovinceid
155
    AND gsl.name_0 = gadm.name_0
156
    AND gsl.name_1 = gadm.name_1
157
    AND g.featurecode='ADM2';
158
-- UPDATE 12352
159
-- Time: 27390.357 ms
160

  
161
-- try matching against alternatenames table
162
UPDATE gadm_county_lookup gcl
163
  SET name_0 = gadm.name_0,
164
      name_1 = gadm.name_1,
165
      name_2 = gadm.name_2
166
  FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
167
       alternatenames a,
168
       hierarchy h,
169
       geonames g,
170
       gadm_stateprovince_lookup gsl
171
  WHERE lower(gadm.name_2)=lower(a.alternatename)
172
    AND gcl.countyid = a.geonameid
173
    AND a.geonameid = g.geonameid
174
    AND gcl.countyid = h.childid
175
    AND h.parentid = gsl.stateprovinceid
176
    AND gsl.name_0 = gadm.name_0
177
    AND gsl.name_1 = gadm.name_1
178
    AND g.featurecode='ADM2'
179
    AND gcl.name_2 IS NULL;
180
-- UPDATE 0
181
-- Time: 6340.441 ms
182

  
183
-- map geonames '/Foo/ County' to gadm2 '/Foo/'
184
-- todo: other mappings like this???
185
UPDATE gadm_county_lookup gcl
186
  SET name_0 = gadm.name_0,
187
      name_1 = gadm.name_1,
188
      name_2 = gadm.name_2
189
  FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
190
       geonames g,
191
       hierarchy h,
192
       gadm_stateprovince_lookup gsl
193
  WHERE lower(gadm.name_2||' County')=lower(g.name)
194
    AND gcl.countyid = g.geonameid
195
    AND gcl.countyid = h.childid
196
    AND h.parentid = gsl.stateprovinceid
197
    AND gsl.name_0 = gadm.name_0
198
    AND gsl.name_1 = gadm.name_1
199
    AND g.featurecode='ADM2'
200
    AND gcl.name_2 IS NULL;
201
-- UPDATE 3000
202
-- Time: 22248.393 ms
203

  
204

  
205
/*
206
-- another way to create gadm_stateprovince_lookup, though this won't
207
-- pick up multiple matches from geonames (good or bad??)
208
CREATE TABLE gadm_stateprovince_lookup AS
209
SELECT DISTINCT name_0, name_1
210
  FROM gadm2;
211
ALTER TABLE gadm_stateprovince_lookup ADD COLUMN stateprovinceid integer;
212
UPDATE gadm_stateprovince_lookup gs
213
  SET stateprovinceid = g.geonameid
214
  FROM gadm_country_lookup gc,
215
       alternatenames a,
216
       geonames g,
217
       countries c
218
  WHERE gs.name_0=gc.name_0
219
    AND gs.name_1=a.alternatename
220
    AND a.geonameid=g.geonameid
221
    AND g.countrycode=c.iso
222
    AND gc.countryid=c.geonameid
223
    AND g.featurecode='ADM1';
224
-- UPDATE 2137
225
-- Time: 600.345 ms
226
*/
227

  
228

  
229

  
230
/*
231
-- these geonameids match multiple gadm2 state/provinces, and that's bad
232
-- because we don't know which one to use for geovalidation
233
select stateprovinceid, name_0, array_agg(name_1) as name_1
234
  from gadm_stateprovince_lookup
235
  group by stateprovinceid, name_0
236
  having count(*)>1;
237

  
238
 stateprovinceid |  name_0  |                name_1
239
-----------------+----------+--------------------------------------
240
         3653890 | Ecuador  | {Orellana,Orellana}
241
          453751 | Bulgaria | {Razgrad,Ruse}
242
         1831095 | Cambodia | {"Phnom Penh",Kândal}
243
         1506272 | Russia   | {Altay,Gorno-Altay}
244
          128222 | Iran     | {Kermanshah,Kordestan}
245
         3457415 | Brazil   | {"Mato Grosso do Sul","Mato Grosso"}
246
          170652 | Syria    | {Damascus,"Rif Dimashq"}
247
(7 rows)
248

  
249
-- for now, manually clean up after these ambiguous cases
250
-- ... this countryid should only apply to Mato Grosso do Sul
251
DELETE FROM gadm_stateprovince_lookup
252
  WHERE name_0 = 'Brazil'
253
    AND name_1 = 'Mato Grosso'
254
    AND stateprovinceid = '3457415';
255
*/
256

  
257
/*
258
-- these gadm2 state/provinces match multiple geonameids, but i'm pretty sure
259
-- we don't really care in this direction
260
select iso, name_1, count(*)
261
  from (select distinct iso, name_1 from gadm2) gadm2
262
  left join (
263
      select distinct gadm2.iso,
264
             name_1,
265
             alternatenames.geonameid,
266
             geonames.name
267
        from (select distinct iso, name_1 from gadm2) gadm2
268
        join alternatenames on name_1=alternatename
269
        join geonames using (geonameid)
270
        join countries on countrycode=countries.iso
271
        where featurecode='ADM1'
272
          and gadm2.iso=countries.iso3) foo
273
    using (iso, name_1) group by iso, name_1 having count(*)>1;
274

  
275
 iso |   name_1    | count
276
-----+-------------+-------
277
 AZE | Yevlax      |     2
278
 BGR | Ruse        |     2
279
 BMU | Hamilton    |     2
280
 BRA | Mato Grosso |     2
281
 KAZ | Almaty      |     2
282
 KHM | Phnom Penh  |     2
283
 RUS | Altay       |     2
284
 RUS | Moskva      |     2
285
(8 rows)
286
*/
287

  
derived/biengeo/geovalidate.sql
1
/*
2
  SQL statements carrying out geovalidation on vegbien location data.
3
  Currently hard-coded to operate on the 'geoscrub' table that is
4
  assumed to have been generated by the geonames scrubbing process
5
  (see geonames.sql). This table should minimally contain the following
6
  columns:
7
     decimallatitude  (latitude, WGS84 decimal degrees)
8
     decimallongitude (longitude, WGS84 decimal degrees)
9
     country          (original asserted country name)
10
     stateprovince    (original asserted stateprovince name)
11
     county           (original asserted county name)
12
     countrystd       (GADM2-mapped country name)
13
     stateprovincestd (GADM2-mapped state/province name)
14
     countystd        (GADM2-mapped county name)
15
  (Note that the geonames.sql script also currently creates columns with
16
  geonames.org IDs for country, stateprovince, and county.)
17

  
18
  After execution of the statements below, the table will contain the
19
  following new columns (along with geom and geog columns used by
20
  postgis):
21
     latlonvalidity        (validity score for lat/lon coordinate)
22
     countryvalidity       (validity score for country)
23
     stateprovincevalidity (validity score for stateprovince)
24
     countyvalidity        (validity score for county)
25
     pointcountry          (GADM2 country containing the point)
26
     pointstateprovince    (GADM2 stateprovince containing the point)
27
     pointcounty           (GADM2 county containing the point)
28

  
29
  Workflow with actual times (as executed 15-Nov-2012)
30
     12s create point geoms
31
     16s create point geogs
32
     23s index geoms
33
     26s index geogs
34
     37s index countrystd
35
     25s index stateprovincestd
36
      5s vacuum analyze
37
      1s mark missing coords (UPDATE 0)
38
    133s mark valid coords (UPDATE 1702989)
39
      2s mark invalid coords (UPDATE 4981)
40
     17s vacuum analyze
41
     27s mark missing countries (UPDATE 222085)
42
     36s mark non-gadm countries (UPDATE 6547)
43
   1277s mark point in country (UPDATE 1400627)
44
   1491s mark point near country (UPDATE 24626)
45
    573s mark point near country accurate (UPDATE 1182)
46
      0s mark point near Antarctica (UPDATE 0)
47
      2s mark point not in country (UPDATE 54085)
48
      8s vacuum analyze
49
     15s mark missing stateprovinces (UPDATE 299015)
50
     12s mark non-gadm stateprovinces (UPDATE 19716)
51
    920s mark point in stateprovince (UPDATE 1241068)
52
    864s mark point near stateprovince (UPDATE 35685)
53
    424s mark point near stateprovince accurate (UPDATE 1051)
54
      7s mark point not in stateprovince (UPDATE 111282)
55
       s vacuum analyze
56
    128s mark missing counties (UPDATE 1431161)
57
     29s mark non-gadm counties (UPDATE 63014)
58
     86s mark point in county (UPDATE 174469)
59
     65s mark point near county (UPDATE 12477)
60
      ?s mark point near county accurate (UPDATE ?)
61
      3s mark point not in county (UPDATE 26849)
62
      ?s vacuum analyze
63
    890s look up GADM location using given lat/lon (UPDATE 1656709)
64

  
65
  todo:
66
  * make firm decision about whether we should assume here that *std
67
    columns contain valid GADM names (or are NULL), or whether we should
68
    check that here
69

  
70
  Jim Regetz
71
  NCEAS
72
  Created Nov 2012
73
*/
74

  
75
-- generate point geometries from coordinates
76
SELECT AddGeometryColumn('public', 'geoscrub', 'geom', 4326, 'POINT', 2 );
77
UPDATE geoscrub
78
   SET geom = ST_SetSRID(ST_Point(decimallongitude, decimallatitude), 4326);
79
-- ... create point geographies too ...
80
-- note that this excludes (leaves null) any occurrences with coordinates
81
-- that are not valid decimal degrees
82
ALTER TABLE geoscrub ADD COLUMN geog geography(POINT, 4326);
83
UPDATE geoscrub
84
  SET geog = geom::geography
85
  WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
86

  
87
-- create indexes
88
CREATE INDEX "geoscrub_geom_gist" ON geoscrub USING GIST (geom);
89
CREATE INDEX "geoscrub_geog_gist" ON geoscrub USING GIST (geog);
90
CREATE INDEX "geoscrub_countrystd_idx" ON geoscrub (countrystd);
91
CREATE INDEX "geoscrub_stateprovincestd_idx" ON geoscrub (stateprovincestd);
92
CREATE INDEX "geoscrub_countystd_idx" ON geoscrub (countystd);
93

  
94
-- may want to do this first, otherwise the query planner wants to do a
95
-- hash join instead of nested loop join, and at least when I use a limit
96
-- statement to time queries on either side of the threshold where it
97
-- switches, the hash join does waaaaay worse
98
SET enable_hashjoin = false;
99

  
100
--
101
-- validate lat/lon coordinates
102
--
103

  
104
VACUUM ANALYZE geoscrub;
105

  
106
ALTER TABLE geoscrub ADD COLUMN latlonvalidity int;
107

  
108
-- -1 if missing one or both coordinates
109
UPDATE geoscrub o
110
    SET latlonvalidity = -1
111
    WHERE (
112
        decimallatitude IS NULL
113
        OR decimallongitude IS NULL
114
    );
115

  
116
-- 1 if the coordinates falls within the global bounding box
117
UPDATE geoscrub o
118
    SET latlonvalidity = 1
119
    WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
120

  
121
-- 0 otherwise (have coordinates, but not valid)
122
UPDATE geoscrub o
123
    SET latlonvalidity = 0
124
    WHERE o.latlonvalidity IS NULL;
125

  
126
--
127
-- validate country
128
--
129

  
130
VACUUM ANALYZE geoscrub;
131

  
132
ALTER TABLE geoscrub ADD COLUMN countryvalidity int;
133

  
134
-- -1 if missing country name
135
UPDATE geoscrub o
136
    SET countryvalidity = -1
137
    WHERE o.country IS NULL;
138

  
139
-- 0 if a country name is asserted, but not recognized among GADM
140
-- country names
141
UPDATE geoscrub o
142
    SET countryvalidity = 0
143
    WHERE (o.countrystd IS NULL
144
        OR NOT EXISTS (
145
           SELECT 1
146
             FROM gadm2 c
147
             WHERE o.countrystd=c.name_0
148
         )
149
     ) AND o.countryvalidity IS NULL;
150

  
151
-- 3 if the point is in (or on the border of) the asserted country
152
UPDATE geoscrub AS o
153
    SET countryvalidity = 3
154
    WHERE EXISTS (
155
        SELECT 1
156
            FROM gadm2 c
157
            WHERE o.countrystd=c.name_0
158
              AND ST_Intersects(o.geom, c.simple_geom)
159
    ) AND o.countryvalidity IS NULL;
160

  
161
-- 2 for those that aren't 3's, but the point is in within 5km of
162
-- the asserted country
163
UPDATE geoscrub o
164
    SET countryvalidity = 2
165
    WHERE EXISTS (
166
        SELECT 1
167
            FROM gadm2 c
168
            WHERE o.countrystd=c.name_0
169
              AND ST_DWithin(o.geog, c.simple_geog, 5000)
170
    ) AND o.countryvalidity IS NULL;
171
-- now recheck the 2's to see if any are within the country based on the
172
-- original (unsimplified) geometry, and if so, upgrade to 3
173
UPDATE geoscrub AS o
174
    SET countryvalidity = 3
175
    WHERE EXISTS (
176
        SELECT 1
177
            FROM gadm2 c
178
            WHERE o.countrystd=c.name_0
179
              AND ST_Intersects(o.geom, c.geom)
180
    ) AND o.countryvalidity = 2;
181
-- also handle special case Antartica, which has invalid geography in
182
-- gadm2 and thus wasn't included in the within-5km check
183
UPDATE geoscrub AS o
184
    SET countryvalidity = 3
185
    WHERE EXISTS (
186
        SELECT 1
187
            FROM gadm2 c
188
            WHERE o.countrystd=c.name_0
189
              AND ST_Intersects(o.geom, c.geom)
190
    ) AND o.countryvalidity < 3
191
    AND o.countrystd = 'Antarctica';
192

  
193
-- 1 otherwise (have recognized country name, but point is not within
194
-- 5km of the asserted country)
195
UPDATE geoscrub o
196
    SET countryvalidity = 1
197
    WHERE o.countryvalidity IS NULL;
198

  
199

  
200
-----------------------------
201
-- validate state/province --
202
-----------------------------
203

  
204
VACUUM ANALYZE geoscrub;
205

  
206
ALTER TABLE geoscrub ADD COLUMN stateprovincevalidity int;
207

  
208
-- -1 if missing stateprovince name
209
UPDATE geoscrub o
210
    SET stateprovincevalidity = -1
211
    WHERE o.country IS NULL
212
       OR o.stateprovince IS NULL;
213

  
214
-- 0 if a stateprovince name is asserted, but not recognized among GADM
215
-- stateprovince names
216
UPDATE geoscrub o
217
    SET stateprovincevalidity = 0
218
    WHERE (o.countrystd IS NULL
219
        OR o.stateprovincestd IS NULL
220
        OR NOT EXISTS (
221
           SELECT 1
222
             FROM gadm2 c
223
             WHERE o.countrystd=c.name_0
224
               AND o.stateprovincestd=c.name_1
225
        )
226
    ) AND o.stateprovincevalidity IS NULL;
227

  
228
-- 3 if the point is in (or on the border of) the asserted stateprovince
229
UPDATE geoscrub AS o
230
    SET stateprovincevalidity = 3
231
    WHERE EXISTS (
232
        SELECT 1
233
            FROM gadm2 c
234
            WHERE o.countrystd=c.name_0
235
              AND o.stateprovincestd=c.name_1
236
              AND ST_Intersects(o.geom, c.simple_geom)
237
    ) AND o.countryvalidity = 3
238
    AND o.stateprovincevalidity IS NULL;
239

  
240
-- for those that aren't 3's, set to 2 if the point is in within 5km of
241
-- the asserted stateprovince
242
UPDATE geoscrub o
243
    SET stateprovincevalidity = 2
244
    WHERE EXISTS (
245
        SELECT 1
246
            FROM gadm2 c
247
            WHERE o.countrystd=c.name_0
248
              AND o.stateprovincestd=c.name_1
249
              AND ST_DWithin(o.geog, c.simple_geog, 5000)
250
    ) AND (o.countryvalidity = 2 OR o.countryvalidity = 3)
251
    AND o.stateprovincevalidity IS NULL;
252
-- now recheck the 2's to see if any are within the stateprovince based
253
-- on the original (unsimplified) geometry, and if so, upgrade to 3
254
UPDATE geoscrub AS o
255
    SET stateprovincevalidity = 3
256
    WHERE EXISTS (
257
        SELECT 1
258
            FROM gadm2 c
259
            WHERE o.countrystd=c.name_0
260
              AND o.stateprovincestd=c.name_1
261
              AND ST_Intersects(o.geom, c.geom)
262
    ) AND o.stateprovincevalidity = 2;
263

  
264
-- 1 otherwise (have recognized stateprovince name, but point is not within
265
-- 5km of the asserted stateprovince)
266
UPDATE geoscrub o
267
    SET stateprovincevalidity = 1
268
    WHERE o.stateprovincevalidity IS NULL;
269

  
270
-----------------------------
271
-- validate county/parish --
272
-----------------------------
273

  
274
ALTER TABLE geoscrub ADD COLUMN countyvalidity int;
275

  
276
-- -1 if missing county name
277
UPDATE geoscrub o
278
    SET countyvalidity = -1
279
    WHERE o.country IS NULL
280
       OR o.stateprovince IS NULL
281
       OR o.county IS NULL;
282

  
283
-- 0 if a county name is asserted, but not recognized among GADM
284
-- county names
285
UPDATE geoscrub o
286
    SET countyvalidity = 0
287
    WHERE (o.countrystd IS NULL
288
        OR o.stateprovincestd IS NULL
289
        OR o.countystd IS NULL
290
        OR NOT EXISTS (
291
           SELECT 1
292
             FROM gadm2 c
293
             WHERE o.countrystd=c.name_0
294
               AND o.stateprovincestd=c.name_1
295
               AND o.countystd=c.name_2
296
        )
297
    ) AND o.countyvalidity IS NULL;
298

  
299
-- 3 if the point is in (or on the border of) the asserted county
300
UPDATE geoscrub AS o
301
    SET countyvalidity = 3
302
    WHERE EXISTS (
303
        SELECT 1
304
            FROM gadm2 c
305
            WHERE o.countrystd=c.name_0
306
              AND o.stateprovincestd=c.name_1
307
              AND o.countystd=c.name_2
308
              AND ST_Intersects(o.geom, c.simple_geom)
309
    ) AND o.stateprovincevalidity = 3
310
    AND o.countyvalidity IS NULL;
311

  
312
-- for those that aren't 3's, set to 2 if the point is in within 5km of
313
-- the asserted county
314
UPDATE geoscrub o
315
    SET countyvalidity = 2
316
    WHERE EXISTS (
317
        SELECT 1
318
            FROM gadm2 c
319
            WHERE o.countrystd=c.name_0
320
              AND o.stateprovincestd=c.name_1
321
              AND o.countystd=c.name_2
322
              AND ST_DWithin(o.geog, c.simple_geog, 5000)
323
    ) AND (o.stateprovincevalidity = 2 OR o.stateprovincevalidity = 3)
324
    AND o.countyvalidity IS NULL;
325

  
326
-- 1 otherwise (have recognized county name, but point is not within
327
-- 5km of the asserted county)
328
UPDATE geoscrub o
329
    SET countyvalidity = 1
330
    WHERE o.countyvalidity IS NULL;
331

  
332
--
333
-- Look up GADM place names based purely on coordinates for all points
334
--
335

  
336
ALTER TABLE geoscrub ADD COLUMN pointcountry text;
337
ALTER TABLE geoscrub ADD COLUMN pointstateprovince text;
338
ALTER TABLE geoscrub ADD COLUMN pointcounty text;
339
-- note: only using simplified geometry now, for speed reasons
340
UPDATE geoscrub o
341
    SET pointcountry = c.name_0,
342
        pointstateprovince = c.name_1,
343
        pointcounty = c.name_2
344
    FROM gadm2 c
345
    WHERE ST_Intersects(o.geom, c.simple_geom)
346
      AND o.latlonvalidity = 1;
347

  
348
SET enable_hashjoin = true;
derived/biengeo/geonames.sh
1
# Script to geoscrub-enable a postgres database, developed in the
2
# context of BIEN3 geoscrubbing but with intentions of generality.
3
#
4
# Basic workflow:
5
#  1. Import table dumps obtained from geonames.org
6
#  2. Add additional custom mapping info, including:
7
#     a. Madagascar provinces used in GADM2 but not geonames
8
#     b. extra country name mappings/fixes specific to BIEN3 data
9
#     c. extra state/prov name mappings/fixes specific to BIEN3 data
10
#     d. custom state/prov name mappings binding GADM2 to geonames
11
#
12
# Notes:
13
#  * One of the INSERT statements below contains the string 'Hawai\`i'.
14
#    The backslash is there to escape the backtick for bash, so that
15
#    what gets inserted into the database is 'Hawai`i'. If manually
16
#    copy-and-pasting the statement into psql rather than running as a
17
#    script (i.e., not having bash interpret the command), the backslash
18
#    must not be included.
19
#
20
# Jim Regetz
21
# NCEAS
22
# Created Nov 2012
23

  
24
# !! this is specific to regetz's testing environment !!
25
DATADIR="$HOME/biengeo/geonames"
26
cd ${DATADIR}
27

  
28
#
29
# load geonames.org geonames
30
#
31

  
32
wget http://download.geonames.org/export/dump/allCountries.zip
33
unzip allCountries.zip
34

  
35
psql -c \
36
   'CREATE TABLE geonames (
37
        geonameid serial primary key,
38
        name text,
39
        asciiname text,
40
        alternatenames text,
41
        latitude numeric,
42
        longitude numeric,
43
        featureclass char(1),
44
        featurecode text,
45
        countrycode char(2),
46
        cc2 char(60),
47
        admin1code text,
48
        admin2code text,
49
        admin3code text,
50
        admin4code text,
51
        population bigint,
52
        elevation int,
53
        dem int,
54
        timezone text,
55
        modification date
56
    )' geoscrub
57

  
58
# import data
59
psql -c "COPY geonames FROM '${DATADIR}/allCountries.txt' DELIMITER E'\t' NULL ''" geoscrub
60
psql -c \ "SELECT setval('geonames_geonameid_seq', (SELECT max(geonameid) FROM geonames))" geoscrub
61
psql -c \ "CREATE INDEX geonames_countrycode_idx ON geonames (countrycode)" geoscrub
62

  
63
#
64
# load geonames.org alternate names
65
#
66

  
67
wget http://download.geonames.org/export/dump/alternateNames.zip
68
unzip alternateNames.zip
69

  
70
psql -c \
71
   'CREATE TABLE alternateNames (
72
        alternateNameId serial primary key,
73
        geonameid int references geonames (geonameid),
74
        isolanguage varchar(7),
75
        alternateName varchar(200),
76
        isPreferredName char(1),
77
        isShortName char(1),
78
        isColloquial char(1),
79
        isHistoric char(1)
80
    )' geoscrub
81

  
82
# import data
83
psql -c "COPY alternateNames FROM '${DATADIR}/alternateNames.txt' DELIMITER E'\t' NULL ''" geoscrub
84
psql -c \ "SELECT setval('alternatenames_alternatenameid_seq', (SELECT max(alternatenameid) FROM alternatenames))" geoscrub
85
# 4m45.368s
86

  
87
# index alternatenames
88
psql -c 'CREATE INDEX "alternatenames_alternatename_idx" ON alternateNames (alternateName)' geoscrub
89
# 10m42.827s
90

  
91
#
92
# load geonames.org country info
93
#
94

  
95
psql -c \
96
   'CREATE TABLE countries (
97
        iso char(2),
98
        iso3 char(3),
99
        isonumeric text,
100
        fips text,
101
        country text UNIQUE,
102
        capital text,
103
        area text,
104
        population text,
105
        continent text,
106
        tld text,
107
        currencyCode text,
108
        currencyName text,
109
        phone text,
110
        postalCodeFormat text,
111
        postalCodeRegex text,
112
        languages text,
113
        geonameid int,
114
        neighbours text,
115
        equivalentFipsCode text
116
    )' geoscrub
117

  
118
# import geonames country table
119
grep -ve '^#' $DATADIR/countryInfo.txt | \
120
  psql -c "COPY countries FROM STDIN DELIMITER E'\t' NULL ''" geoscrub
121

  
122
#
123
# load geonames.org hierarchy info
124
#
125

  
126
wget http://download.geonames.org/export/dump/hierarchy.zip
127
unzip hierarchy.zip
128

  
129
psql -c \
130
   'CREATE TABLE hierarchy (
131
        parentId int,-- references geonames (geonameid),
132
        childId int,-- references geonames (geonameid),
133
        type text
134
    )' geoscrub
135

  
136
# import geonames country table
137
psql -c "COPY hierarchy FROM '${DATADIR}/hierarchy.txt' DELIMITER E'\t' NULL ''" geoscrub
138

  
139
#
140
# insert additional custom name-scrub mappings
141
#
142

  
143
# augment geonames with some additional places we need, and update
144
# hierarchy table accordingly
145
psql -c \
146
"WITH newnames AS (
147
  INSERT INTO geonames
148
    (name, alternatenames, featurecode, countrycode )
149
    VALUES
150
    ('Antananarivo', NULL, 'ADM1', 'MG'),
151
    ('Antsiranana', NULL, 'ADM1', 'MG'),
152
    ('Fianarantsoa', NULL, 'ADM1', 'MG'),
153
    ('Mahajanga', NULL, 'ADM1', 'MG'),
154
    ('Toamasina', NULL, 'ADM1', 'MG'),
155
    ('Toliara', 'Toliary', 'ADM1', 'MG')
156
  RETURNING geonameid, countrycode
157
)
158
INSERT INTO hierarchy (parentid, childid)
159
  SELECT c.geonameid AS parentid,
160
         n.geonameid AS childid
161
    FROM newnames n,
162
         countries c
163
    WHERE n.countrycode = c.iso;
164
" geoscrub
165

  
166
# add some custom country alternate names that we need
167
psql -c "
168
CREATE TABLE alt_country (
169
    country text references countries (country),
170
    alternatename text
171
);
172
INSERT INTO alt_country
173
    (country, alternatename)
174
    VALUES
175
    ('Central African Republic', 'Central African Repu')
176
  , ('British Virgin Islands', 'Virgin Islands (U.K.)')
177
  , ('Democratic Republic of the Congo', 'Congo, The Democratic Republic of the')
178
  , ('Falkland Islands', 'Falkland Isl')
179
  , ('Falkland Islands', 'Falkland Islands (Malvinas)')
180
  , ('Faroe Islands', 'Faeroe Isl.')
181
  , ('French Guiana', 'Guyana Francesa')
182
  , ('Iran', 'Iran, Islamic Republic of')
183
  , ('Mexico', E'Mexico\rMexico')
184
  , ('Nicaragua', 'Ncaragua')
185
  , ('Norfolk Island', 'Norfolk Isl')
186
  , ('North Korea', 'Korea, Democratic People''s Republic of')
187
  , ('Solomon Islands', 'Solomon Isl')
188
  , ('South Georgia and the South Sandwich Islands', 'South Georgia and the South Sand')
189
  , ('South Korea', 'Korea (Republic of)')
190
  , ('South Korea', 'Korea, Republic of')
191
  , ('U.S. Virgin Islands', 'US Virgin Islands')
192
  , ('U.S. Virgin Islands', 'Virgin Islands (U.S.)')
193
  , ('United States', 'ESTADOS UNIDOS DE AMERICA')
194
  , ('United States', 'EUA')
195
  , ('Vietnam', 'VIETNAM [Socialist Republic of V')
196
;" geoscrub
197

  
198
# add some custom stateprovince alternate names that we need
199
psql -c "
200
CREATE TABLE alt_stateprovince (
201
    country text references countries (country),
202
    stateprovince text,
203
    alternatename text
204
);
205
INSERT INTO alt_stateprovince
206
    (country, stateprovince, alternatename)
207
    VALUES
208
    ('Canada', 'British Columbia', 'B.C.')
209
  , ('Canada', 'Newfoundland and Labrador', 'Newfoundland & Labrador')
210
  , ('Canada', 'Newfoundland and Labrador', 'New Foundland And Labrador')
211
  , ('Canada', 'Newfoundland and Labrador', 'Labrador & Newfoundland')
212
  , ('Canada', 'Northwest Territories', 'North West Territories')
213
  , ('Canada', 'Northwest Territories', 'Northwest Territorie')
214
  , ('Canada', 'Northwest Territories', 'NWT')
215
  , ('Canada', 'Québec', 'Quebéc')
216
  , ('Canada', 'Québec', 'Pq')
217
  , ('Canada', 'Québec', 'PQ')
218
  , ('United States', 'Hawaii', 'Hawai\`i')
219
  , ('United States', 'North Carolina', 'N. Carolina')
220
  , ('United States', 'North Dakota', 'N. Dakota')
221
  , ('United States', 'South Dakota', 'S. Dakota')
222
  , ('Kenya', 'Coast Province', 'Coast')
223
  , ('Mexico', 'Estado de Baja California', 'Baja California Norte')
224
  , ('Mexico', 'Estado de Baja California', 'Baja California (Norte)')
225
  , ('Mexico', 'Estado de Baja California', 'Baja California, Norte de')
226
  , ('Mexico', 'Estado de Baja California', 'Baja California (state)')
227
  , ('Mexico', 'Estado de Baja California Sur', 'Baja California, Sur de')
228
  , ('Mexico', 'Estado de Baja California Sur', 'Baja California Sur (state)')
229
  , ('Mexico', 'Estado de Baja California Sur', 'Baja Califronia')
230
  , ('Mexico', 'Estado de Baja California Sur', 'Baja Cal. Sur')
231
  , ('Ecuador', 'Provincia de Sucumbíos', 'Sucumbíos')
232
;
233
" geoscrub
234

  
235
# additions for mapping to gadm2 level 1 units
236
#  - Los Lagos -> Los Rios is imperfect but needed for gadm
237
#  - Orellana -> Napo is imperfect (wrong??) but needed for gadm
238
#  - Mapping both Concelho de Macau and Concelho das Ilhas to Ilhas
239
#  - Nassa seems to be a gadm misspelling? (Niassa)
240
# not addressed:
241
#  - Bahamas: unmapped gadm level1 entities may be obsolete?
242
#  - French Guiana has arrondissements as adm0 in gadm, low level?
243
#  - French Polynesia has no adm1 in gadm
244
#  - Madagascar: Sava is (was?) inside gadm's Antsiranana province
245
#  - Philippines: GN has 17 regions at level1, gadm has 82 provinces
246
#  - Sri Lanka: GN ADM2 matches gadm level1; bien data side with GN
247
#  - Venezuela: Dependencias Federales don't seem to be in GADM
248
#  - Vietnam: GN ADM1 matches gadm level2; bien data side with GN?
249
psql -c "
250
INSERT INTO alt_stateprovince
251
    (country, stateprovince, alternatename)
252
    VALUES
253
    ('Cambodia', 'Khétt Siĕm Réab', 'Siemréab')
254
  , ('Chile', 'Región del Biobío', 'Bío-Bío')
255
  , ('Chile', 'Región del Libertador General Bernardo O’Higgins', 'Libertador General Bernardo O''Higgins')
256
  , ('Chile', 'Región de Los Ríos', 'Los Lagos')
257
  , ('Chile', 'Providencia y Santa Catalina, Departamento de Archipiélago de San Andrés', 'San Andrés y Providencia')
258
  , ('Dominican Republic', 'Provincia de Baoruco', 'Bahoruco')
259
  , ('Ecuador', 'Provincia de Morona Santiago', 'Morona Santiago')
260
  , ('Ecuador', 'Provincia de Napo', 'Orellana')
261
  , ('Ecuador', 'Provincia de Zamora Chinchipe', 'Zamora Chinchipe')
262
  , ('Gabon', 'Province du Woleu-Ntem', 'Wouleu-Ntem')
263
  , ('Ghana', 'Brong-Ahafo Region', 'Brong Ahafo')
264
  , ('Guatemala', 'Departamento de Quetzaltenango', 'Quezaltenango')
265
  , ('Haiti', 'Département de l''Artibonite', 'L''Artibonite')
266
  , ('Liberia', 'Grand Bassa County', 'GrandBassa')
267
  , ('Liberia', 'Grand Gedeh County', 'GrandGedeh')
268
  , ('Macao', 'Concelho de Macau', 'Ilhas')
269
  , ('Mozambique', 'Niassa Province', 'Nassa')
270
  , ('Norway', 'Østfold fylke', 'Ãstfold')
271
  , ('Peru', 'Región de Huánuco', 'Huánuco')
272
  , ('Puerto Rico', 'Guánica Municipio', 'Guánica')
273
  , ('Puerto Rico', 'Loíza Municipio', 'Loíza')
274
  , ('Puerto Rico', 'Manatí Municipio', 'Manatí')
275
  , ('Puerto Rico', 'Peñuelas Municipio', 'Peñuelas')
276
  , ('Puerto Rico', 'Río Grande Municipio', 'Río Grande')
277
  , ('Puerto Rico', 'San Germán Municipio', 'San Germán')
278
  , ('Puerto Rico', 'San Sebastián Municipio', 'San Sebastián')
279
  , ('Russia', 'Karachayevo-Cherkesskaya Respublika', 'Karachay-Cherkess')
280
  , ('Syria', 'Muḩāfaz̧at al Lādhiqīyah', 'Lattakia')
281
  , ('Togo', 'Région Centrale', 'Centre')
282
;" geoscrub
derived/biengeo/README.txt
1
BIEN geovalidation notes
2
========================
3

  
4
[Also see comments embedded in specific scripts in this directory.]
5

  
6
The bash and SQL statements contained in the files as ordered below
7
should be applied to carry out geographic name scrubbing and
8
geovalidation on a given corpus of BIEN location records.
9

  
10
That said, given the tight deadline under which this was done in order
11
to produced a geovalidated BIEN3 corpus in advance of the Nov 2013
12
working group meeting, and the corresponding manner in which much of
13
this was actually executed piecemeal in an iterative and interactive
14
fashion within a bash shell and psql session, I can't guarantee that the
15
code in its current state could be run end-to-end without intervention.
16
It's close, but probably not bulletproof.
17

  
18
1. geovalidate.sh
19
   - creates postgis DB and loads GADM2 data
20
2. geonames.sh
21
   - loads geonames.org data and adds some custom mapping logic
22
3. geonames-to-gadm.sql
23
   - contains SQL statements that build linkages between geonames.org
24
     names and GADM2 names
25
4. load-geoscrub-input.sh
26
   - dumps geoscrub_input from vegbien and loads it into the geoscrub db
27
5. geonames.sql
28
   - contains SQL statements that scrub asserted names and (to the
29
     extent possible) map them to GADM2
30
6. geovalidate.sql
31
   - contains (postgis-extended) SQL statements that score the validity
32
     of GADM2-scrubbed names against given point coordinates
33

  
34
The resulting 'geoscrub' table is what contains the scrubbed (i.e.,
35
GADM2-matched) names and various geovalidation scores.
36

  
37
Notes/Caveats/Todos:
38
* Clearly the SQL statements used in this procedure suffer from a lot of
39
  redundancy, and it might be worth trying to refactor once we're happy
40
  with the particular approach taken.
41
* Need to pull out more known notes/caveats/todos and highlight them :)
derived/biengeo/geonames.sql
1
/*
2
  SQL statements carrying out country name, stateprovince name, and
3
  county name scrubbing on vegbien location data. Currently hard-coded
4
  to operate on a 'vegbien_geoscrub' table that contains the input for
5
  geoscrubbing, with the following columns:
6
     decimallatitude  (latitude, WGS84 decimal degrees)
7
     decimallongitude (longitude, WGS84 decimal degrees)
8
     country          (original asserted country name)
9
     stateprovince    (original asserted stateprovince name)
10
     county           (original asserted county name)
11
  (Note that the lat/lon columns are not needed for scrubbing, but are
12
  used later for geovalidation.)
13

  
14
 Misc issues to contemplate and possibly address:
15

  
16
  * Congo can be either "Democratic Republic of the Congo" (COD) or
17
    "Republic of the Congo" (COG), eh?
18
  * Vietnam: gadm2 seems to treat provinces as level2 units
19
  * Yugoslavia: no longer exists, not in geonames nor gadm2
20

  
21
  * Azerbaijan: Yevlakh City and Yevlakh Rayon both ADM1 in geonames,
22
    but just Yevlax in gadm2
23

  
24
  * England is not a geoname country, only the UK is
25

  
26
  * the general problem of historical names in cases where countries
27
    have either been split, combined, removed (?), or otherwise
28
    redefined
29

  
30
  * Cases with multiple alternativename matches???
31

  
32
 Notes:
33
 - the vegbien_geoscrub input used during development and testing this
34
   scrubbing procedure came from a select distinct on (lat, lon,
35
   country, state, province) from analytical_aggregate on vegbiendev
36

  
37
  Jim Regetz
38
  NCEAS
39
  Created Nov 2012
40
*/
41

  
42
-- define custom function to pass characters back through latin-1
43
-- encoding to revert errors; in principle this could introduce new
44
-- errors, but it seems to do more good than harm
45
CREATE OR REPLACE FUNCTION u2L2u(text) RETURNS text AS $$
46
DECLARE
47
    string text;
48
BEGIN
49
  string :=
50
    convert_from(convert($1::bytea, 'UTF8', 'LATIN1'), 'UTF8');
51
  RETURN string;
52
EXCEPTION
53
  WHEN data_exception THEN
54
    RETURN $1;
55
END;
56
$$ LANGUAGE plpgsql;
57

  
58
-------------------
59
-- scrub country --
60
-------------------
61

  
62
CREATE TABLE vcountry AS
63
  SELECT DISTINCT country
64
    FROM vegbien_geoscrub
65
    WHERE country IS NOT NULL;
66
-- SELECT 439
67
-- Time: 862.413 ms
68
ALTER TABLE vcountry ADD COLUMN countryutf8 text;
69
ALTER TABLE vcountry ADD COLUMN countryid integer;
70

  
71
-- try to clean up character representations
72
UPDATE vcountry
73
    SET countryutf8 = u2L2u(country);
74
-- UPDATE 439
75
-- Time: 24.706 ms
76

  
77
-- try matching against direct country names
78
UPDATE vcountry vc
79
    SET countryid = gc.geonameid
80
    FROM countries gc
81
    WHERE lower(vc.countryutf8) = lower(gc.country);
82
-- UPDATE 219
83
-- Time: 12.384 ms
84

  
85
-- try matching against ISO 3166-1 alpha-3
86
UPDATE vcountry vc
87
    SET countryid = gc.geonameid
88
    FROM countries gc
89
    WHERE vc.countryutf8=gc.iso3
90
      AND vc.countryid IS NULL;
91
-- UPDATE 1
92
-- Time: 20.146 ms
93

  
94
-- try matching against ISO 3166-1 alpha-2
95
UPDATE vcountry vc
96
    SET countryid = gc.geonameid
97
    FROM countries gc
98
    WHERE vc.countryutf8 = iso
99
      AND vc.countryid IS NULL;
100
-- UPDATE 64
101
-- Time: 3.378 ms
102

  
103
/*
104
-- try matching against FIPS, but not if already matched (i.e.,
105
-- the 2-digit ISO match takes preference)
106
UPDATE vcountry vc
107
    SET countryid = gc.geonameid
108
    FROM countries gc
109
    WHERE vc.countryutf8 = fips
110
      AND vc.countryid IS NULL;
111
-- UPDATE 1
112
-- Time: 2.514 ms
113
*/
114

  
115
-- try matching against BIEN alt country names
116
UPDATE vcountry vc
117
    SET countryid = c.geonameid
118
    FROM alt_country a, countries c
119
    WHERE vc.countryutf8 = a.alternatename
120
      AND a.country = c.country
121
      AND vc.countryid IS NULL;
122
-- UPDATE 21
123
-- Time: 4.769 ms
124

  
125
-- try case-insensitive matching on geonames.org 'alternatenames'
126
-- note: this is a little sloppy because the set condition will return
127
-- more than one row if there are more than one matched alternate
128
-- country names, and which one gets used is non-deterministic; however,
129
-- we'll go back and remove those in the next update (specifically for
130
-- the cases where there are more than one *unique* matched countries)
131
UPDATE vcountry vc
132
    SET countryid = gc.geonameid
133
    FROM alternatenames a, countries gc
134
    WHERE lower(vc.countryutf8) = lower(a.alternatename)
135
      AND a.geonameid = gc.geonameid
136
      AND vc.countryid IS NULL;
137
-- UPDATE 76
138
-- Time: 22019.592 ms
139

  
140
-- go back and remove if altname lookup was ambiguous
141
UPDATE vcountry vc
142
    SET countryid = NULL
143
    WHERE countryutf8 IN (
144
        SELECT countryutf8
145
          FROM (
146
            SELECT DISTINCT vc.countryutf8, gc.iso3
147
              FROM vcountry vc,
148
                   alternatenames a,
149
                   countries gc
150
              WHERE lower(vc.countryutf8) = lower(a.alternatename)
151
                AND a.geonameid = gc.geonameid) lookup
152
          GROUP BY countryutf8
153
          HAVING COUNT(*) > 1);
154
-- UPDATE 2
155
-- Time: 2296.257 ms
156
-- Removes: Congo, CONGO
157

  
158
-- merge in GADM country names (our standard names)
159
ALTER TABLE vcountry ADD COLUMN countrystd text;
160
UPDATE vcountry vc
161
  SET countrystd = name_0
162
  FROM gadm_country_lookup gadm0
163
  WHERE vc.countryid=gadm0.countryid;
164
-- UPDATE 377
165
-- Time: 6.254 ms
... This diff was truncated because it exceeds the maximum size that can be displayed.

Also available in: Unified diff