1
|
/*
|
2
|
SQL statements carrying out geovalidation on vegbien location data.
|
3
|
Currently hard-coded to operate on the 'geoscrub' table that is
|
4
|
assumed to have been generated by the geonames scrubbing process
|
5
|
(see geonames.sql). This table should minimally contain the following
|
6
|
columns:
|
7
|
decimallatitude (latitude, WGS84 decimal degrees)
|
8
|
decimallongitude (longitude, WGS84 decimal degrees)
|
9
|
country (original asserted country name)
|
10
|
stateprovince (original asserted stateprovince name)
|
11
|
county (original asserted county name)
|
12
|
countrystd (GADM2-mapped country name)
|
13
|
stateprovincestd (GADM2-mapped state/province name)
|
14
|
countystd (GADM2-mapped county name)
|
15
|
(Note that the geonames.sql script also currently creates columns with
|
16
|
geonames.org IDs for country, stateprovince, and county.)
|
17
|
|
18
|
After execution of the statements below, the table will contain the
|
19
|
following new columns (along with geom and geog columns used by
|
20
|
postgis):
|
21
|
latlonvalidity (validity score for lat/lon coordinate)
|
22
|
countryvalidity (validity score for country)
|
23
|
stateprovincevalidity (validity score for stateprovince)
|
24
|
countyvalidity (validity score for county)
|
25
|
pointcountry (GADM2 country containing the point)
|
26
|
pointstateprovince (GADM2 stateprovince containing the point)
|
27
|
pointcounty (GADM2 county containing the point)
|
28
|
|
29
|
Workflow with actual times (as executed 15-Nov-2012)
|
30
|
12s create point geoms
|
31
|
16s create point geogs
|
32
|
23s index geoms
|
33
|
26s index geogs
|
34
|
37s index countrystd
|
35
|
25s index stateprovincestd
|
36
|
5s vacuum analyze
|
37
|
1s mark missing coords (UPDATE 0)
|
38
|
133s mark valid coords (UPDATE 1702989)
|
39
|
2s mark invalid coords (UPDATE 4981)
|
40
|
17s vacuum analyze
|
41
|
27s mark missing countries (UPDATE 222085)
|
42
|
36s mark non-gadm countries (UPDATE 6547)
|
43
|
1277s mark point in country (UPDATE 1400627)
|
44
|
1491s mark point near country (UPDATE 24626)
|
45
|
573s mark point near country accurate (UPDATE 1182)
|
46
|
0s mark point near Antarctica (UPDATE 0)
|
47
|
2s mark point not in country (UPDATE 54085)
|
48
|
8s vacuum analyze
|
49
|
15s mark missing stateprovinces (UPDATE 299015)
|
50
|
12s mark non-gadm stateprovinces (UPDATE 19716)
|
51
|
920s mark point in stateprovince (UPDATE 1241068)
|
52
|
864s mark point near stateprovince (UPDATE 35685)
|
53
|
424s mark point near stateprovince accurate (UPDATE 1051)
|
54
|
7s mark point not in stateprovince (UPDATE 111282)
|
55
|
s vacuum analyze
|
56
|
128s mark missing counties (UPDATE 1431161)
|
57
|
29s mark non-gadm counties (UPDATE 63014)
|
58
|
86s mark point in county (UPDATE 174469)
|
59
|
65s mark point near county (UPDATE 12477)
|
60
|
?s mark point near county accurate (UPDATE ?)
|
61
|
3s mark point not in county (UPDATE 26849)
|
62
|
?s vacuum analyze
|
63
|
890s look up GADM location using given lat/lon (UPDATE 1656709)
|
64
|
|
65
|
todo:
|
66
|
* make firm decision about whether we should assume here that *std
|
67
|
columns contain valid GADM names (or are NULL), or whether we should
|
68
|
check that here
|
69
|
|
70
|
Jim Regetz
|
71
|
NCEAS
|
72
|
Created Nov 2012
|
73
|
*/
|
74
|
|
75
|
-- generate point geometries from coordinates
|
76
|
SELECT AddGeometryColumn('public', 'geoscrub', 'geom', 4326, 'POINT', 2 );
|
77
|
UPDATE geoscrub
|
78
|
SET geom = ST_SetSRID(ST_Point(decimallongitude, decimallatitude), 4326);
|
79
|
-- ... create point geographies too ...
|
80
|
-- note that this excludes (leaves null) any occurrences with coordinates
|
81
|
-- that are not valid decimal degrees
|
82
|
ALTER TABLE geoscrub ADD COLUMN geog geography(POINT, 4326);
|
83
|
UPDATE geoscrub
|
84
|
SET geog = geom::geography
|
85
|
WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
|
86
|
|
87
|
-- create indexes
|
88
|
CREATE INDEX "geoscrub_geom_gist" ON geoscrub USING GIST (geom);
|
89
|
CREATE INDEX "geoscrub_geog_gist" ON geoscrub USING GIST (geog);
|
90
|
CREATE INDEX "geoscrub_countrystd_idx" ON geoscrub (countrystd);
|
91
|
CREATE INDEX "geoscrub_stateprovincestd_idx" ON geoscrub (stateprovincestd);
|
92
|
CREATE INDEX "geoscrub_countystd_idx" ON geoscrub (countystd);
|
93
|
|
94
|
-- may want to do this first, otherwise the query planner wants to do a
|
95
|
-- hash join instead of nested loop join, and at least when I use a limit
|
96
|
-- statement to time queries on either side of the threshold where it
|
97
|
-- switches, the hash join does waaaaay worse
|
98
|
SET enable_hashjoin = false;
|
99
|
|
100
|
--
|
101
|
-- validate lat/lon coordinates
|
102
|
--
|
103
|
|
104
|
VACUUM ANALYZE geoscrub;
|
105
|
|
106
|
ALTER TABLE geoscrub ADD COLUMN latlonvalidity int;
|
107
|
|
108
|
-- -1 if missing one or both coordinates
|
109
|
UPDATE geoscrub o
|
110
|
SET latlonvalidity = -1
|
111
|
WHERE (
|
112
|
decimallatitude IS NULL
|
113
|
OR decimallongitude IS NULL
|
114
|
);
|
115
|
|
116
|
-- 1 if the coordinates falls within the global bounding box
|
117
|
UPDATE geoscrub o
|
118
|
SET latlonvalidity = 1
|
119
|
WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
|
120
|
|
121
|
-- 0 otherwise (have coordinates, but not valid)
|
122
|
UPDATE geoscrub o
|
123
|
SET latlonvalidity = 0
|
124
|
WHERE o.latlonvalidity IS NULL;
|
125
|
|
126
|
--
|
127
|
-- validate country
|
128
|
--
|
129
|
|
130
|
VACUUM ANALYZE geoscrub;
|
131
|
|
132
|
ALTER TABLE geoscrub ADD COLUMN countryvalidity int;
|
133
|
|
134
|
-- -1 if missing country name
|
135
|
UPDATE geoscrub o
|
136
|
SET countryvalidity = -1
|
137
|
WHERE o.country IS NULL;
|
138
|
|
139
|
-- 0 if a country name is asserted, but not recognized among GADM
|
140
|
-- country names
|
141
|
UPDATE geoscrub o
|
142
|
SET countryvalidity = 0
|
143
|
WHERE (o.countrystd IS NULL
|
144
|
OR NOT EXISTS (
|
145
|
SELECT 1
|
146
|
FROM gadm2 c
|
147
|
WHERE o.countrystd=c.name_0
|
148
|
)
|
149
|
) AND o.countryvalidity IS NULL;
|
150
|
|
151
|
-- 3 if the point is in (or on the border of) the asserted country
|
152
|
UPDATE geoscrub AS o
|
153
|
SET countryvalidity = 3
|
154
|
WHERE EXISTS (
|
155
|
SELECT 1
|
156
|
FROM gadm2 c
|
157
|
WHERE o.countrystd=c.name_0
|
158
|
AND ST_Intersects(o.geom, c.simple_geom)
|
159
|
) AND o.countryvalidity IS NULL;
|
160
|
|
161
|
-- 2 for those that aren't 3's, but the point is in within 5km of
|
162
|
-- the asserted country
|
163
|
UPDATE geoscrub o
|
164
|
SET countryvalidity = 2
|
165
|
WHERE EXISTS (
|
166
|
SELECT 1
|
167
|
FROM gadm2 c
|
168
|
WHERE o.countrystd=c.name_0
|
169
|
AND ST_DWithin(o.geog, c.simple_geog, 5000)
|
170
|
) AND o.countryvalidity IS NULL;
|
171
|
-- now recheck the 2's to see if any are within the country based on the
|
172
|
-- original (unsimplified) geometry, and if so, upgrade to 3
|
173
|
UPDATE geoscrub AS o
|
174
|
SET countryvalidity = 3
|
175
|
WHERE EXISTS (
|
176
|
SELECT 1
|
177
|
FROM gadm2 c
|
178
|
WHERE o.countrystd=c.name_0
|
179
|
AND ST_Intersects(o.geom, c.geom)
|
180
|
) AND o.countryvalidity = 2;
|
181
|
-- also handle special case Antartica, which has invalid geography in
|
182
|
-- gadm2 and thus wasn't included in the within-5km check
|
183
|
UPDATE geoscrub AS o
|
184
|
SET countryvalidity = 3
|
185
|
WHERE EXISTS (
|
186
|
SELECT 1
|
187
|
FROM gadm2 c
|
188
|
WHERE o.countrystd=c.name_0
|
189
|
AND ST_Intersects(o.geom, c.geom)
|
190
|
) AND o.countryvalidity < 3
|
191
|
AND o.countrystd = 'Antarctica';
|
192
|
|
193
|
-- 1 otherwise (have recognized country name, but point is not within
|
194
|
-- 5km of the asserted country)
|
195
|
UPDATE geoscrub o
|
196
|
SET countryvalidity = 1
|
197
|
WHERE o.countryvalidity IS NULL;
|
198
|
|
199
|
|
200
|
-----------------------------
|
201
|
-- validate state/province --
|
202
|
-----------------------------
|
203
|
|
204
|
VACUUM ANALYZE geoscrub;
|
205
|
|
206
|
ALTER TABLE geoscrub ADD COLUMN stateprovincevalidity int;
|
207
|
|
208
|
-- -1 if missing stateprovince name
|
209
|
UPDATE geoscrub o
|
210
|
SET stateprovincevalidity = -1
|
211
|
WHERE o.country IS NULL
|
212
|
OR o.stateprovince IS NULL;
|
213
|
|
214
|
-- 0 if a stateprovince name is asserted, but not recognized among GADM
|
215
|
-- stateprovince names
|
216
|
UPDATE geoscrub o
|
217
|
SET stateprovincevalidity = 0
|
218
|
WHERE (o.countrystd IS NULL
|
219
|
OR o.stateprovincestd IS NULL
|
220
|
OR NOT EXISTS (
|
221
|
SELECT 1
|
222
|
FROM gadm2 c
|
223
|
WHERE o.countrystd=c.name_0
|
224
|
AND o.stateprovincestd=c.name_1
|
225
|
)
|
226
|
) AND o.stateprovincevalidity IS NULL;
|
227
|
|
228
|
-- 3 if the point is in (or on the border of) the asserted stateprovince
|
229
|
UPDATE geoscrub AS o
|
230
|
SET stateprovincevalidity = 3
|
231
|
WHERE EXISTS (
|
232
|
SELECT 1
|
233
|
FROM gadm2 c
|
234
|
WHERE o.countrystd=c.name_0
|
235
|
AND o.stateprovincestd=c.name_1
|
236
|
AND ST_Intersects(o.geom, c.simple_geom)
|
237
|
) AND o.countryvalidity = 3
|
238
|
AND o.stateprovincevalidity IS NULL;
|
239
|
|
240
|
-- for those that aren't 3's, set to 2 if the point is in within 5km of
|
241
|
-- the asserted stateprovince
|
242
|
UPDATE geoscrub o
|
243
|
SET stateprovincevalidity = 2
|
244
|
WHERE EXISTS (
|
245
|
SELECT 1
|
246
|
FROM gadm2 c
|
247
|
WHERE o.countrystd=c.name_0
|
248
|
AND o.stateprovincestd=c.name_1
|
249
|
AND ST_DWithin(o.geog, c.simple_geog, 5000)
|
250
|
) AND (o.countryvalidity = 2 OR o.countryvalidity = 3)
|
251
|
AND o.stateprovincevalidity IS NULL;
|
252
|
-- now recheck the 2's to see if any are within the stateprovince based
|
253
|
-- on the original (unsimplified) geometry, and if so, upgrade to 3
|
254
|
UPDATE geoscrub AS o
|
255
|
SET stateprovincevalidity = 3
|
256
|
WHERE EXISTS (
|
257
|
SELECT 1
|
258
|
FROM gadm2 c
|
259
|
WHERE o.countrystd=c.name_0
|
260
|
AND o.stateprovincestd=c.name_1
|
261
|
AND ST_Intersects(o.geom, c.geom)
|
262
|
) AND o.stateprovincevalidity = 2;
|
263
|
|
264
|
-- 1 otherwise (have recognized stateprovince name, but point is not within
|
265
|
-- 5km of the asserted stateprovince)
|
266
|
UPDATE geoscrub o
|
267
|
SET stateprovincevalidity = 1
|
268
|
WHERE o.stateprovincevalidity IS NULL;
|
269
|
|
270
|
-----------------------------
|
271
|
-- validate county/parish --
|
272
|
-----------------------------
|
273
|
|
274
|
ALTER TABLE geoscrub ADD COLUMN countyvalidity int;
|
275
|
|
276
|
-- -1 if missing county name
|
277
|
UPDATE geoscrub o
|
278
|
SET countyvalidity = -1
|
279
|
WHERE o.country IS NULL
|
280
|
OR o.stateprovince IS NULL
|
281
|
OR o.county IS NULL;
|
282
|
|
283
|
-- 0 if a county name is asserted, but not recognized among GADM
|
284
|
-- county names
|
285
|
UPDATE geoscrub o
|
286
|
SET countyvalidity = 0
|
287
|
WHERE (o.countrystd IS NULL
|
288
|
OR o.stateprovincestd IS NULL
|
289
|
OR o.countystd IS NULL
|
290
|
OR NOT EXISTS (
|
291
|
SELECT 1
|
292
|
FROM gadm2 c
|
293
|
WHERE o.countrystd=c.name_0
|
294
|
AND o.stateprovincestd=c.name_1
|
295
|
AND o.countystd=c.name_2
|
296
|
)
|
297
|
) AND o.countyvalidity IS NULL;
|
298
|
|
299
|
-- 3 if the point is in (or on the border of) the asserted county
|
300
|
UPDATE geoscrub AS o
|
301
|
SET countyvalidity = 3
|
302
|
WHERE EXISTS (
|
303
|
SELECT 1
|
304
|
FROM gadm2 c
|
305
|
WHERE o.countrystd=c.name_0
|
306
|
AND o.stateprovincestd=c.name_1
|
307
|
AND o.countystd=c.name_2
|
308
|
AND ST_Intersects(o.geom, c.simple_geom)
|
309
|
) AND o.stateprovincevalidity = 3
|
310
|
AND o.countyvalidity IS NULL;
|
311
|
|
312
|
-- for those that aren't 3's, set to 2 if the point is in within 5km of
|
313
|
-- the asserted county
|
314
|
UPDATE geoscrub o
|
315
|
SET countyvalidity = 2
|
316
|
WHERE EXISTS (
|
317
|
SELECT 1
|
318
|
FROM gadm2 c
|
319
|
WHERE o.countrystd=c.name_0
|
320
|
AND o.stateprovincestd=c.name_1
|
321
|
AND o.countystd=c.name_2
|
322
|
AND ST_DWithin(o.geog, c.simple_geog, 5000)
|
323
|
) AND (o.stateprovincevalidity = 2 OR o.stateprovincevalidity = 3)
|
324
|
AND o.countyvalidity IS NULL;
|
325
|
|
326
|
-- 1 otherwise (have recognized county name, but point is not within
|
327
|
-- 5km of the asserted county)
|
328
|
UPDATE geoscrub o
|
329
|
SET countyvalidity = 1
|
330
|
WHERE o.countyvalidity IS NULL;
|
331
|
|
332
|
--
|
333
|
-- Look up GADM place names based purely on coordinates for all points
|
334
|
--
|
335
|
|
336
|
ALTER TABLE geoscrub ADD COLUMN pointcountry text;
|
337
|
ALTER TABLE geoscrub ADD COLUMN pointstateprovince text;
|
338
|
ALTER TABLE geoscrub ADD COLUMN pointcounty text;
|
339
|
-- note: only using simplified geometry now, for speed reasons
|
340
|
UPDATE geoscrub o
|
341
|
SET pointcountry = c.name_0,
|
342
|
pointstateprovince = c.name_1,
|
343
|
pointcounty = c.name_2
|
344
|
FROM gadm2 c
|
345
|
WHERE ST_Intersects(o.geom, c.simple_geom)
|
346
|
AND o.latlonvalidity = 1;
|
347
|
|
348
|
SET enable_hashjoin = true;
|