1
|
/*
|
2
|
SQL statements carrying out geovalidation on vegbien location data.
|
3
|
Currently hard-coded to operate on the 'geoscrub' table that is
|
4
|
assumed to have been generated by the geonames scrubbing process
|
5
|
(see geonames.sql). This table should minimally contain the following
|
6
|
columns:
|
7
|
decimallatitude (latitude, WGS84 decimal degrees)
|
8
|
decimallongitude (longitude, WGS84 decimal degrees)
|
9
|
country (original asserted country name)
|
10
|
stateprovince (original asserted stateprovince name)
|
11
|
county (original asserted county name)
|
12
|
countrystd (GADM2-mapped country name)
|
13
|
stateprovincestd (GADM2-mapped state/province name)
|
14
|
countystd (GADM2-mapped county name)
|
15
|
(Note that the geonames.sql script also currently creates columns with
|
16
|
geonames.org IDs for country, stateprovince, and county.)
|
17
|
|
18
|
After execution of the statements below, the table will contain the
|
19
|
following new columns (along with geom and geog columns used by
|
20
|
postgis):
|
21
|
latlonvalidity (validity score for lat/lon coordinate)
|
22
|
countryvalidity (validity score for country)
|
23
|
stateprovincevalidity (validity score for stateprovince)
|
24
|
countyvalidity (validity score for county)
|
25
|
pointcountry (GADM2 country containing the point)
|
26
|
pointstateprovince (GADM2 stateprovince containing the point)
|
27
|
pointcounty (GADM2 county containing the point)
|
28
|
|
29
|
Workflow with actual times (as executed 15-Nov-2012)
|
30
|
12s create point geoms
|
31
|
16s create point geogs
|
32
|
23s index geoms
|
33
|
26s index geogs
|
34
|
37s index countrystd
|
35
|
25s index stateprovincestd
|
36
|
5s vacuum analyze
|
37
|
1s mark missing coords (UPDATE 0)
|
38
|
133s mark valid coords (UPDATE 1702989)
|
39
|
2s mark invalid coords (UPDATE 4981)
|
40
|
17s vacuum analyze
|
41
|
27s mark missing countries (UPDATE 222085)
|
42
|
36s mark non-gadm countries (UPDATE 6547)
|
43
|
1277s mark point in country (UPDATE 1400627)
|
44
|
1491s mark point near country (UPDATE 24626)
|
45
|
573s mark point near country accurate (UPDATE 1182)
|
46
|
0s mark point near Antarctica (UPDATE 0)
|
47
|
2s mark point not in country (UPDATE 54085)
|
48
|
8s vacuum analyze
|
49
|
15s mark missing stateprovinces (UPDATE 299015)
|
50
|
12s mark non-gadm stateprovinces (UPDATE 19716)
|
51
|
920s mark point in stateprovince (UPDATE 1241068)
|
52
|
864s mark point near stateprovince (UPDATE 35685)
|
53
|
424s mark point near stateprovince accurate (UPDATE 1051)
|
54
|
7s mark point not in stateprovince (UPDATE 111282)
|
55
|
s vacuum analyze
|
56
|
128s mark missing counties (UPDATE 1431161)
|
57
|
29s mark non-gadm counties (UPDATE 63014)
|
58
|
86s mark point in county (UPDATE 174469)
|
59
|
65s mark point near county (UPDATE 12477)
|
60
|
?s mark point near county accurate (UPDATE ?)
|
61
|
3s mark point not in county (UPDATE 26849)
|
62
|
?s vacuum analyze
|
63
|
890s look up GADM location using given lat/lon (UPDATE 1656709)
|
64
|
|
65
|
todo:
|
66
|
* make firm decision about whether we should assume here that *std
|
67
|
columns contain valid GADM names (or are NULL), or whether we should
|
68
|
check that here
|
69
|
|
70
|
Jim Regetz
|
71
|
NCEAS
|
72
|
Created Nov 2012
|
73
|
*/
|
74
|
|
75
|
BEGIN;
|
76
|
|
77
|
TRUNCATE geoscrub;
|
78
|
|
79
|
DROP INDEX IF EXISTS "geoscrub_geom_gist";
|
80
|
DROP INDEX IF EXISTS "geoscrub_geog_gist";
|
81
|
DROP INDEX IF EXISTS "geoscrub_countrystd_idx";
|
82
|
DROP INDEX IF EXISTS "geoscrub_stateprovincestd_idx";
|
83
|
DROP INDEX IF EXISTS "geoscrub_countystd_idx";
|
84
|
|
85
|
-----------------------------
|
86
|
-- put everything together --
|
87
|
-----------------------------
|
88
|
|
89
|
-- reconstitute the whole dang thang
|
90
|
|
91
|
-- populate geoscrub from vegbien_geoscrub data
|
92
|
INSERT INTO geoscrub
|
93
|
SELECT decimallatitude, decimallongitude,
|
94
|
country, stateprovince, county,
|
95
|
countryid, stateprovinceid, countyid,
|
96
|
countrystd, stateprovincestd, countystd
|
97
|
FROM vegbien_geoscrub v
|
98
|
LEFT JOIN vcountry USING (country)
|
99
|
LEFT JOIN vstate USING (countryid, stateprovince)
|
100
|
LEFT JOIN vcounty USING (countryid, stateprovinceid, county);
|
101
|
-- SELECT 1707970
|
102
|
-- Time: 26172.154 ms
|
103
|
|
104
|
-- try to squeeze out a few more direct stateprovince mappings to gadm2
|
105
|
UPDATE geoscrub
|
106
|
SET stateprovincestd = name_1
|
107
|
FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm
|
108
|
WHERE country IS NOT NULL
|
109
|
AND stateprovince IS NOT NULL
|
110
|
AND countrystd = name_0
|
111
|
AND stateprovincestd IS NULL
|
112
|
AND stateprovince = name_1;
|
113
|
-- UPDATE 0
|
114
|
-- Time: 3630.973 ms
|
115
|
|
116
|
-- try to squeeze out a few more direct county mappings to gadm2
|
117
|
UPDATE geoscrub
|
118
|
SET countystd = name_2
|
119
|
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm
|
120
|
WHERE country IS NOT NULL
|
121
|
AND stateprovince IS NOT NULL
|
122
|
AND county IS NOT NULL
|
123
|
AND countrystd = name_0
|
124
|
AND stateprovincestd = name_1
|
125
|
AND countystd IS NULL
|
126
|
AND county = name_2;
|
127
|
-- UPDATE 69
|
128
|
-- Time: 3982.715 ms
|
129
|
|
130
|
COMMIT;
|
131
|
|
132
|
-- generate point geometries from coordinates
|
133
|
UPDATE geoscrub
|
134
|
SET geom = ST_SetSRID(ST_Point(decimallongitude, decimallatitude), 4326);
|
135
|
-- ... create point geographies too ...
|
136
|
-- note that this excludes (leaves null) any occurrences with coordinates
|
137
|
-- that are not valid decimal degrees
|
138
|
UPDATE geoscrub
|
139
|
SET geog = geom::geography
|
140
|
WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
|
141
|
|
142
|
-- create indexes
|
143
|
CREATE INDEX "geoscrub_geom_gist" ON geoscrub USING GIST (geom);
|
144
|
CREATE INDEX "geoscrub_geog_gist" ON geoscrub USING GIST (geog);
|
145
|
CREATE INDEX "geoscrub_countrystd_idx" ON geoscrub (countrystd);
|
146
|
CREATE INDEX "geoscrub_stateprovincestd_idx" ON geoscrub (stateprovincestd);
|
147
|
CREATE INDEX "geoscrub_countystd_idx" ON geoscrub (countystd);
|
148
|
|
149
|
-- may want to do this first, otherwise the query planner wants to do a
|
150
|
-- hash join instead of nested loop join, and at least when I use a limit
|
151
|
-- statement to time queries on either side of the threshold where it
|
152
|
-- switches, the hash join does waaaaay worse
|
153
|
SET enable_hashjoin = false;
|
154
|
|
155
|
--
|
156
|
-- validate lat/lon coordinates
|
157
|
--
|
158
|
|
159
|
VACUUM ANALYZE geoscrub;
|
160
|
|
161
|
-- -1 if missing one or both coordinates
|
162
|
UPDATE geoscrub o
|
163
|
SET latlonvalidity = -1
|
164
|
WHERE (
|
165
|
decimallatitude IS NULL
|
166
|
OR decimallongitude IS NULL
|
167
|
);
|
168
|
|
169
|
-- 1 if the coordinates falls within the global bounding box
|
170
|
UPDATE geoscrub o
|
171
|
SET latlonvalidity = 1
|
172
|
WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
|
173
|
|
174
|
-- 0 otherwise (have coordinates, but not valid)
|
175
|
UPDATE geoscrub o
|
176
|
SET latlonvalidity = 0
|
177
|
WHERE o.latlonvalidity IS NULL;
|
178
|
|
179
|
--
|
180
|
-- validate country
|
181
|
--
|
182
|
|
183
|
VACUUM ANALYZE geoscrub;
|
184
|
|
185
|
-- -1 if missing country name
|
186
|
UPDATE geoscrub o
|
187
|
SET countryvalidity = -1
|
188
|
WHERE o.country IS NULL;
|
189
|
|
190
|
-- 0 if a country name is asserted, but not recognized among GADM
|
191
|
-- country names
|
192
|
UPDATE geoscrub o
|
193
|
SET countryvalidity = 0
|
194
|
WHERE (o.countrystd IS NULL
|
195
|
OR NOT EXISTS (
|
196
|
SELECT 1
|
197
|
FROM gadm2 c
|
198
|
WHERE o.countrystd=c.name_0
|
199
|
)
|
200
|
) AND o.countryvalidity IS NULL;
|
201
|
|
202
|
-- 3 if the point is in (or on the border of) the asserted country
|
203
|
UPDATE geoscrub AS o
|
204
|
SET countryvalidity = 3
|
205
|
WHERE EXISTS (
|
206
|
SELECT 1
|
207
|
FROM gadm2 c
|
208
|
WHERE o.countrystd=c.name_0
|
209
|
AND ST_Intersects(o.geom, c.simple_geom)
|
210
|
) AND o.countryvalidity IS NULL;
|
211
|
|
212
|
-- 2 for those that aren't 3's, but the point is in within 5km of
|
213
|
-- the asserted country
|
214
|
UPDATE geoscrub o
|
215
|
SET countryvalidity = 2
|
216
|
WHERE EXISTS (
|
217
|
SELECT 1
|
218
|
FROM gadm2 c
|
219
|
WHERE o.countrystd=c.name_0
|
220
|
AND ST_DWithin(o.geog, c.simple_geog, 5000)
|
221
|
) AND o.countryvalidity IS NULL;
|
222
|
-- now recheck the 2's to see if any are within the country based on the
|
223
|
-- original (unsimplified) geometry, and if so, upgrade to 3
|
224
|
UPDATE geoscrub AS o
|
225
|
SET countryvalidity = 3
|
226
|
WHERE EXISTS (
|
227
|
SELECT 1
|
228
|
FROM gadm2 c
|
229
|
WHERE o.countrystd=c.name_0
|
230
|
AND ST_Intersects(o.geom, c.geom)
|
231
|
) AND o.countryvalidity = 2;
|
232
|
-- also handle special case Antartica, which has invalid geography in
|
233
|
-- gadm2 and thus wasn't included in the within-5km check
|
234
|
UPDATE geoscrub AS o
|
235
|
SET countryvalidity = 3
|
236
|
WHERE EXISTS (
|
237
|
SELECT 1
|
238
|
FROM gadm2 c
|
239
|
WHERE o.countrystd=c.name_0
|
240
|
AND ST_Intersects(o.geom, c.geom)
|
241
|
) AND o.countryvalidity < 3
|
242
|
AND o.countrystd = 'Antarctica';
|
243
|
|
244
|
-- 1 otherwise (have recognized country name, but point is not within
|
245
|
-- 5km of the asserted country)
|
246
|
UPDATE geoscrub o
|
247
|
SET countryvalidity = 1
|
248
|
WHERE o.countryvalidity IS NULL;
|
249
|
|
250
|
|
251
|
-----------------------------
|
252
|
-- validate state/province --
|
253
|
-----------------------------
|
254
|
|
255
|
VACUUM ANALYZE geoscrub;
|
256
|
|
257
|
-- -1 if missing stateprovince name
|
258
|
UPDATE geoscrub o
|
259
|
SET stateprovincevalidity = -1
|
260
|
WHERE o.country IS NULL
|
261
|
OR o.stateprovince IS NULL;
|
262
|
|
263
|
-- 0 if a stateprovince name is asserted, but not recognized among GADM
|
264
|
-- stateprovince names
|
265
|
UPDATE geoscrub o
|
266
|
SET stateprovincevalidity = 0
|
267
|
WHERE (o.countrystd IS NULL
|
268
|
OR o.stateprovincestd IS NULL
|
269
|
OR NOT EXISTS (
|
270
|
SELECT 1
|
271
|
FROM gadm2 c
|
272
|
WHERE o.countrystd=c.name_0
|
273
|
AND o.stateprovincestd=c.name_1
|
274
|
)
|
275
|
) AND o.stateprovincevalidity IS NULL;
|
276
|
|
277
|
-- 3 if the point is in (or on the border of) the asserted stateprovince
|
278
|
UPDATE geoscrub AS o
|
279
|
SET stateprovincevalidity = 3
|
280
|
WHERE EXISTS (
|
281
|
SELECT 1
|
282
|
FROM gadm2 c
|
283
|
WHERE o.countrystd=c.name_0
|
284
|
AND o.stateprovincestd=c.name_1
|
285
|
AND ST_Intersects(o.geom, c.simple_geom)
|
286
|
) AND o.countryvalidity = 3
|
287
|
AND o.stateprovincevalidity IS NULL;
|
288
|
|
289
|
-- for those that aren't 3's, set to 2 if the point is in within 5km of
|
290
|
-- the asserted stateprovince
|
291
|
UPDATE geoscrub o
|
292
|
SET stateprovincevalidity = 2
|
293
|
WHERE EXISTS (
|
294
|
SELECT 1
|
295
|
FROM gadm2 c
|
296
|
WHERE o.countrystd=c.name_0
|
297
|
AND o.stateprovincestd=c.name_1
|
298
|
AND ST_DWithin(o.geog, c.simple_geog, 5000)
|
299
|
) AND (o.countryvalidity = 2 OR o.countryvalidity = 3)
|
300
|
AND o.stateprovincevalidity IS NULL;
|
301
|
-- now recheck the 2's to see if any are within the stateprovince based
|
302
|
-- on the original (unsimplified) geometry, and if so, upgrade to 3
|
303
|
UPDATE geoscrub AS o
|
304
|
SET stateprovincevalidity = 3
|
305
|
WHERE EXISTS (
|
306
|
SELECT 1
|
307
|
FROM gadm2 c
|
308
|
WHERE o.countrystd=c.name_0
|
309
|
AND o.stateprovincestd=c.name_1
|
310
|
AND ST_Intersects(o.geom, c.geom)
|
311
|
) AND o.stateprovincevalidity = 2;
|
312
|
|
313
|
-- 1 otherwise (have recognized stateprovince name, but point is not within
|
314
|
-- 5km of the asserted stateprovince)
|
315
|
UPDATE geoscrub o
|
316
|
SET stateprovincevalidity = 1
|
317
|
WHERE o.stateprovincevalidity IS NULL;
|
318
|
|
319
|
-----------------------------
|
320
|
-- validate county/parish --
|
321
|
-----------------------------
|
322
|
|
323
|
-- -1 if missing county name
|
324
|
UPDATE geoscrub o
|
325
|
SET countyvalidity = -1
|
326
|
WHERE o.country IS NULL
|
327
|
OR o.stateprovince IS NULL
|
328
|
OR o.county IS NULL;
|
329
|
|
330
|
-- 0 if a county name is asserted, but not recognized among GADM
|
331
|
-- county names
|
332
|
UPDATE geoscrub o
|
333
|
SET countyvalidity = 0
|
334
|
WHERE (o.countrystd IS NULL
|
335
|
OR o.stateprovincestd IS NULL
|
336
|
OR o.countystd IS NULL
|
337
|
OR NOT EXISTS (
|
338
|
SELECT 1
|
339
|
FROM gadm2 c
|
340
|
WHERE o.countrystd=c.name_0
|
341
|
AND o.stateprovincestd=c.name_1
|
342
|
AND o.countystd=c.name_2
|
343
|
)
|
344
|
) AND o.countyvalidity IS NULL;
|
345
|
|
346
|
-- 3 if the point is in (or on the border of) the asserted county
|
347
|
UPDATE geoscrub AS o
|
348
|
SET countyvalidity = 3
|
349
|
WHERE EXISTS (
|
350
|
SELECT 1
|
351
|
FROM gadm2 c
|
352
|
WHERE o.countrystd=c.name_0
|
353
|
AND o.stateprovincestd=c.name_1
|
354
|
AND o.countystd=c.name_2
|
355
|
AND ST_Intersects(o.geom, c.simple_geom)
|
356
|
) AND o.stateprovincevalidity = 3
|
357
|
AND o.countyvalidity IS NULL;
|
358
|
|
359
|
-- for those that aren't 3's, set to 2 if the point is in within 5km of
|
360
|
-- the asserted county
|
361
|
UPDATE geoscrub o
|
362
|
SET countyvalidity = 2
|
363
|
WHERE EXISTS (
|
364
|
SELECT 1
|
365
|
FROM gadm2 c
|
366
|
WHERE o.countrystd=c.name_0
|
367
|
AND o.stateprovincestd=c.name_1
|
368
|
AND o.countystd=c.name_2
|
369
|
AND ST_DWithin(o.geog, c.simple_geog, 5000)
|
370
|
) AND (o.stateprovincevalidity = 2 OR o.stateprovincevalidity = 3)
|
371
|
AND o.countyvalidity IS NULL;
|
372
|
|
373
|
-- 1 otherwise (have recognized county name, but point is not within
|
374
|
-- 5km of the asserted county)
|
375
|
UPDATE geoscrub o
|
376
|
SET countyvalidity = 1
|
377
|
WHERE o.countyvalidity IS NULL;
|
378
|
|
379
|
--
|
380
|
-- Look up GADM place names based purely on coordinates for all points
|
381
|
--
|
382
|
|
383
|
-- note: only using simplified geometry now, for speed reasons
|
384
|
UPDATE geoscrub o
|
385
|
SET pointcountry = c.name_0,
|
386
|
pointstateprovince = c.name_1,
|
387
|
pointcounty = c.name_2
|
388
|
FROM gadm2 c
|
389
|
WHERE ST_Intersects(o.geom, c.simple_geom)
|
390
|
AND o.latlonvalidity = 1;
|
391
|
|
392
|
SET enable_hashjoin = true;
|