Project

General

Profile

1 10707 aaronmk
/*
2
  SQL statements carrying out geovalidation on vegbien location data.
3
  Currently hard-coded to operate on the 'geoscrub' table that is
4
  assumed to have been generated by the geonames scrubbing process
5
  (see geonames.sql). This table should minimally contain the following
6
  columns:
7
     decimallatitude  (latitude, WGS84 decimal degrees)
8
     decimallongitude (longitude, WGS84 decimal degrees)
9
     country          (original asserted country name)
10
     stateprovince    (original asserted stateprovince name)
11
     county           (original asserted county name)
12
     countrystd       (GADM2-mapped country name)
13
     stateprovincestd (GADM2-mapped state/province name)
14
     countystd        (GADM2-mapped county name)
15
  (Note that the geonames.sql script also currently creates columns with
16
  geonames.org IDs for country, stateprovince, and county.)
17
18
  After execution of the statements below, the table will contain the
19
  following new columns (along with geom and geog columns used by
20
  postgis):
21
     latlonvalidity        (validity score for lat/lon coordinate)
22
     countryvalidity       (validity score for country)
23
     stateprovincevalidity (validity score for stateprovince)
24
     countyvalidity        (validity score for county)
25
     pointcountry          (GADM2 country containing the point)
26
     pointstateprovince    (GADM2 stateprovince containing the point)
27
     pointcounty           (GADM2 county containing the point)
28
29
  Workflow with actual times (as executed 15-Nov-2012)
30
     12s create point geoms
31
     16s create point geogs
32
     23s index geoms
33
     26s index geogs
34
     37s index countrystd
35
     25s index stateprovincestd
36
      5s vacuum analyze
37
      1s mark missing coords (UPDATE 0)
38
    133s mark valid coords (UPDATE 1702989)
39
      2s mark invalid coords (UPDATE 4981)
40
     17s vacuum analyze
41
     27s mark missing countries (UPDATE 222085)
42
     36s mark non-gadm countries (UPDATE 6547)
43
   1277s mark point in country (UPDATE 1400627)
44
   1491s mark point near country (UPDATE 24626)
45
    573s mark point near country accurate (UPDATE 1182)
46
      0s mark point near Antarctica (UPDATE 0)
47
      2s mark point not in country (UPDATE 54085)
48
      8s vacuum analyze
49
     15s mark missing stateprovinces (UPDATE 299015)
50
     12s mark non-gadm stateprovinces (UPDATE 19716)
51
    920s mark point in stateprovince (UPDATE 1241068)
52
    864s mark point near stateprovince (UPDATE 35685)
53
    424s mark point near stateprovince accurate (UPDATE 1051)
54
      7s mark point not in stateprovince (UPDATE 111282)
55
       s vacuum analyze
56
    128s mark missing counties (UPDATE 1431161)
57
     29s mark non-gadm counties (UPDATE 63014)
58
     86s mark point in county (UPDATE 174469)
59
     65s mark point near county (UPDATE 12477)
60
      ?s mark point near county accurate (UPDATE ?)
61
      3s mark point not in county (UPDATE 26849)
62
      ?s vacuum analyze
63
    890s look up GADM location using given lat/lon (UPDATE 1656709)
64
65
  todo:
66
  * make firm decision about whether we should assume here that *std
67
    columns contain valid GADM names (or are NULL), or whether we should
68
    check that here
69
70
  Jim Regetz
71
  NCEAS
72
  Created Nov 2012
73
*/
74
75 11443 psarando
BEGIN;
76
77
TRUNCATE geoscrub;
78
79
DROP INDEX IF EXISTS "geoscrub_geom_gist";
80
DROP INDEX IF EXISTS "geoscrub_geog_gist";
81
DROP INDEX IF EXISTS "geoscrub_countrystd_idx";
82
DROP INDEX IF EXISTS "geoscrub_stateprovincestd_idx";
83
DROP INDEX IF EXISTS "geoscrub_countystd_idx";
84
85
-----------------------------
86
-- put everything together --
87
-----------------------------
88
89
-- reconstitute the whole dang thang
90
91
-- populate geoscrub from vegbien_geoscrub data
92
INSERT INTO geoscrub
93
SELECT decimallatitude, decimallongitude,
94
       country, stateprovince, county,
95
       countryid, stateprovinceid, countyid,
96
       countrystd, stateprovincestd, countystd
97
  FROM vegbien_geoscrub v
98
  LEFT JOIN vcountry USING (country)
99
  LEFT JOIN vstate USING (countryid, stateprovince)
100
  LEFT JOIN vcounty USING (countryid, stateprovinceid, county);
101
-- SELECT 1707970
102
-- Time: 26172.154 ms
103
104
-- try to squeeze out a few more direct stateprovince mappings to gadm2
105
UPDATE geoscrub
106
  SET stateprovincestd = name_1
107
  FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm
108
  WHERE country IS NOT NULL
109
    AND stateprovince IS NOT NULL
110
    AND countrystd = name_0
111
    AND stateprovincestd IS NULL
112
    AND stateprovince = name_1;
113
-- UPDATE 0
114
-- Time: 3630.973 ms
115
116
-- try to squeeze out a few more direct county mappings to gadm2
117
UPDATE geoscrub
118
  SET countystd = name_2
119
  FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm
120
  WHERE country IS NOT NULL
121
    AND stateprovince IS NOT NULL
122
    AND county IS NOT NULL
123
    AND countrystd = name_0
124
    AND stateprovincestd = name_1
125
    AND countystd IS NULL
126
    AND county = name_2;
127
-- UPDATE 69
128
-- Time: 3982.715 ms
129
130
COMMIT;
131
132 10707 aaronmk
-- generate point geometries from coordinates
133
UPDATE geoscrub
134
   SET geom = ST_SetSRID(ST_Point(decimallongitude, decimallatitude), 4326);
135
-- ... create point geographies too ...
136
-- note that this excludes (leaves null) any occurrences with coordinates
137
-- that are not valid decimal degrees
138
UPDATE geoscrub
139
  SET geog = geom::geography
140
  WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
141
142
-- create indexes
143
CREATE INDEX "geoscrub_geom_gist" ON geoscrub USING GIST (geom);
144
CREATE INDEX "geoscrub_geog_gist" ON geoscrub USING GIST (geog);
145
CREATE INDEX "geoscrub_countrystd_idx" ON geoscrub (countrystd);
146
CREATE INDEX "geoscrub_stateprovincestd_idx" ON geoscrub (stateprovincestd);
147
CREATE INDEX "geoscrub_countystd_idx" ON geoscrub (countystd);
148
149
-- may want to do this first, otherwise the query planner wants to do a
150
-- hash join instead of nested loop join, and at least when I use a limit
151
-- statement to time queries on either side of the threshold where it
152
-- switches, the hash join does waaaaay worse
153
SET enable_hashjoin = false;
154
155
--
156
-- validate lat/lon coordinates
157
--
158
159
VACUUM ANALYZE geoscrub;
160
161
-- -1 if missing one or both coordinates
162
UPDATE geoscrub o
163
    SET latlonvalidity = -1
164
    WHERE (
165
        decimallatitude IS NULL
166
        OR decimallongitude IS NULL
167
    );
168
169
-- 1 if the coordinates falls within the global bounding box
170
UPDATE geoscrub o
171
    SET latlonvalidity = 1
172
    WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
173
174
-- 0 otherwise (have coordinates, but not valid)
175
UPDATE geoscrub o
176
    SET latlonvalidity = 0
177
    WHERE o.latlonvalidity IS NULL;
178
179
--
180
-- validate country
181
--
182
183
VACUUM ANALYZE geoscrub;
184
185
-- -1 if missing country name
186
UPDATE geoscrub o
187
    SET countryvalidity = -1
188
    WHERE o.country IS NULL;
189
190
-- 0 if a country name is asserted, but not recognized among GADM
191
-- country names
192
UPDATE geoscrub o
193
    SET countryvalidity = 0
194
    WHERE (o.countrystd IS NULL
195
        OR NOT EXISTS (
196
           SELECT 1
197
             FROM gadm2 c
198
             WHERE o.countrystd=c.name_0
199
         )
200
     ) AND o.countryvalidity IS NULL;
201
202
-- 3 if the point is in (or on the border of) the asserted country
203
UPDATE geoscrub AS o
204
    SET countryvalidity = 3
205
    WHERE EXISTS (
206
        SELECT 1
207
            FROM gadm2 c
208
            WHERE o.countrystd=c.name_0
209
              AND ST_Intersects(o.geom, c.simple_geom)
210
    ) AND o.countryvalidity IS NULL;
211
212
-- 2 for those that aren't 3's, but the point is in within 5km of
213
-- the asserted country
214
UPDATE geoscrub o
215
    SET countryvalidity = 2
216
    WHERE EXISTS (
217
        SELECT 1
218
            FROM gadm2 c
219
            WHERE o.countrystd=c.name_0
220
              AND ST_DWithin(o.geog, c.simple_geog, 5000)
221
    ) AND o.countryvalidity IS NULL;
222
-- now recheck the 2's to see if any are within the country based on the
223
-- original (unsimplified) geometry, and if so, upgrade to 3
224
UPDATE geoscrub AS o
225
    SET countryvalidity = 3
226
    WHERE EXISTS (
227
        SELECT 1
228
            FROM gadm2 c
229
            WHERE o.countrystd=c.name_0
230
              AND ST_Intersects(o.geom, c.geom)
231
    ) AND o.countryvalidity = 2;
232
-- also handle special case Antartica, which has invalid geography in
233
-- gadm2 and thus wasn't included in the within-5km check
234
UPDATE geoscrub AS o
235
    SET countryvalidity = 3
236
    WHERE EXISTS (
237
        SELECT 1
238
            FROM gadm2 c
239
            WHERE o.countrystd=c.name_0
240
              AND ST_Intersects(o.geom, c.geom)
241
    ) AND o.countryvalidity < 3
242
    AND o.countrystd = 'Antarctica';
243
244
-- 1 otherwise (have recognized country name, but point is not within
245
-- 5km of the asserted country)
246
UPDATE geoscrub o
247
    SET countryvalidity = 1
248
    WHERE o.countryvalidity IS NULL;
249
250
251
-----------------------------
252
-- validate state/province --
253
-----------------------------
254
255
VACUUM ANALYZE geoscrub;
256
257
-- -1 if missing stateprovince name
258
UPDATE geoscrub o
259
    SET stateprovincevalidity = -1
260
    WHERE o.country IS NULL
261
       OR o.stateprovince IS NULL;
262
263
-- 0 if a stateprovince name is asserted, but not recognized among GADM
264
-- stateprovince names
265
UPDATE geoscrub o
266
    SET stateprovincevalidity = 0
267
    WHERE (o.countrystd IS NULL
268
        OR o.stateprovincestd IS NULL
269
        OR NOT EXISTS (
270
           SELECT 1
271
             FROM gadm2 c
272
             WHERE o.countrystd=c.name_0
273
               AND o.stateprovincestd=c.name_1
274
        )
275
    ) AND o.stateprovincevalidity IS NULL;
276
277
-- 3 if the point is in (or on the border of) the asserted stateprovince
278
UPDATE geoscrub AS o
279
    SET stateprovincevalidity = 3
280
    WHERE EXISTS (
281
        SELECT 1
282
            FROM gadm2 c
283
            WHERE o.countrystd=c.name_0
284
              AND o.stateprovincestd=c.name_1
285
              AND ST_Intersects(o.geom, c.simple_geom)
286
    ) AND o.countryvalidity = 3
287
    AND o.stateprovincevalidity IS NULL;
288
289
-- for those that aren't 3's, set to 2 if the point is in within 5km of
290
-- the asserted stateprovince
291
UPDATE geoscrub o
292
    SET stateprovincevalidity = 2
293
    WHERE EXISTS (
294
        SELECT 1
295
            FROM gadm2 c
296
            WHERE o.countrystd=c.name_0
297
              AND o.stateprovincestd=c.name_1
298
              AND ST_DWithin(o.geog, c.simple_geog, 5000)
299
    ) AND (o.countryvalidity = 2 OR o.countryvalidity = 3)
300
    AND o.stateprovincevalidity IS NULL;
301
-- now recheck the 2's to see if any are within the stateprovince based
302
-- on the original (unsimplified) geometry, and if so, upgrade to 3
303
UPDATE geoscrub AS o
304
    SET stateprovincevalidity = 3
305
    WHERE EXISTS (
306
        SELECT 1
307
            FROM gadm2 c
308
            WHERE o.countrystd=c.name_0
309
              AND o.stateprovincestd=c.name_1
310
              AND ST_Intersects(o.geom, c.geom)
311
    ) AND o.stateprovincevalidity = 2;
312
313
-- 1 otherwise (have recognized stateprovince name, but point is not within
314
-- 5km of the asserted stateprovince)
315
UPDATE geoscrub o
316
    SET stateprovincevalidity = 1
317
    WHERE o.stateprovincevalidity IS NULL;
318
319
-----------------------------
320
-- validate county/parish --
321
-----------------------------
322
323
-- -1 if missing county name
324
UPDATE geoscrub o
325
    SET countyvalidity = -1
326
    WHERE o.country IS NULL
327
       OR o.stateprovince IS NULL
328
       OR o.county IS NULL;
329
330
-- 0 if a county name is asserted, but not recognized among GADM
331
-- county names
332
UPDATE geoscrub o
333
    SET countyvalidity = 0
334
    WHERE (o.countrystd IS NULL
335
        OR o.stateprovincestd IS NULL
336
        OR o.countystd IS NULL
337
        OR NOT EXISTS (
338
           SELECT 1
339
             FROM gadm2 c
340
             WHERE o.countrystd=c.name_0
341
               AND o.stateprovincestd=c.name_1
342
               AND o.countystd=c.name_2
343
        )
344
    ) AND o.countyvalidity IS NULL;
345
346
-- 3 if the point is in (or on the border of) the asserted county
347
UPDATE geoscrub AS o
348
    SET countyvalidity = 3
349
    WHERE EXISTS (
350
        SELECT 1
351
            FROM gadm2 c
352
            WHERE o.countrystd=c.name_0
353
              AND o.stateprovincestd=c.name_1
354
              AND o.countystd=c.name_2
355
              AND ST_Intersects(o.geom, c.simple_geom)
356
    ) AND o.stateprovincevalidity = 3
357
    AND o.countyvalidity IS NULL;
358
359
-- for those that aren't 3's, set to 2 if the point is in within 5km of
360
-- the asserted county
361
UPDATE geoscrub o
362
    SET countyvalidity = 2
363
    WHERE EXISTS (
364
        SELECT 1
365
            FROM gadm2 c
366
            WHERE o.countrystd=c.name_0
367
              AND o.stateprovincestd=c.name_1
368
              AND o.countystd=c.name_2
369
              AND ST_DWithin(o.geog, c.simple_geog, 5000)
370
    ) AND (o.stateprovincevalidity = 2 OR o.stateprovincevalidity = 3)
371
    AND o.countyvalidity IS NULL;
372
373
-- 1 otherwise (have recognized county name, but point is not within
374
-- 5km of the asserted county)
375
UPDATE geoscrub o
376
    SET countyvalidity = 1
377
    WHERE o.countyvalidity IS NULL;
378
379
--
380
-- Look up GADM place names based purely on coordinates for all points
381
--
382
383
-- note: only using simplified geometry now, for speed reasons
384
UPDATE geoscrub o
385
    SET pointcountry = c.name_0,
386
        pointstateprovince = c.name_1,
387
        pointcounty = c.name_2
388
    FROM gadm2 c
389
    WHERE ST_Intersects(o.geom, c.simple_geom)
390
      AND o.latlonvalidity = 1;
391
392
SET enable_hashjoin = true;