1 |
10707
|
aaronmk
|
/*
|
2 |
|
|
SQL statements carrying out geovalidation on vegbien location data.
|
3 |
|
|
Currently hard-coded to operate on the 'geoscrub' table that is
|
4 |
|
|
assumed to have been generated by the geonames scrubbing process
|
5 |
|
|
(see geonames.sql). This table should minimally contain the following
|
6 |
|
|
columns:
|
7 |
|
|
decimallatitude (latitude, WGS84 decimal degrees)
|
8 |
|
|
decimallongitude (longitude, WGS84 decimal degrees)
|
9 |
|
|
country (original asserted country name)
|
10 |
|
|
stateprovince (original asserted stateprovince name)
|
11 |
|
|
county (original asserted county name)
|
12 |
|
|
countrystd (GADM2-mapped country name)
|
13 |
|
|
stateprovincestd (GADM2-mapped state/province name)
|
14 |
|
|
countystd (GADM2-mapped county name)
|
15 |
|
|
(Note that the geonames.sql script also currently creates columns with
|
16 |
|
|
geonames.org IDs for country, stateprovince, and county.)
|
17 |
|
|
|
18 |
|
|
After execution of the statements below, the table will contain the
|
19 |
|
|
following new columns (along with geom and geog columns used by
|
20 |
|
|
postgis):
|
21 |
|
|
latlonvalidity (validity score for lat/lon coordinate)
|
22 |
|
|
countryvalidity (validity score for country)
|
23 |
|
|
stateprovincevalidity (validity score for stateprovince)
|
24 |
|
|
countyvalidity (validity score for county)
|
25 |
|
|
pointcountry (GADM2 country containing the point)
|
26 |
|
|
pointstateprovince (GADM2 stateprovince containing the point)
|
27 |
|
|
pointcounty (GADM2 county containing the point)
|
28 |
|
|
|
29 |
|
|
Workflow with actual times (as executed 15-Nov-2012)
|
30 |
|
|
12s create point geoms
|
31 |
|
|
16s create point geogs
|
32 |
|
|
23s index geoms
|
33 |
|
|
26s index geogs
|
34 |
|
|
37s index countrystd
|
35 |
|
|
25s index stateprovincestd
|
36 |
|
|
5s vacuum analyze
|
37 |
|
|
1s mark missing coords (UPDATE 0)
|
38 |
|
|
133s mark valid coords (UPDATE 1702989)
|
39 |
|
|
2s mark invalid coords (UPDATE 4981)
|
40 |
|
|
17s vacuum analyze
|
41 |
|
|
27s mark missing countries (UPDATE 222085)
|
42 |
|
|
36s mark non-gadm countries (UPDATE 6547)
|
43 |
|
|
1277s mark point in country (UPDATE 1400627)
|
44 |
|
|
1491s mark point near country (UPDATE 24626)
|
45 |
|
|
573s mark point near country accurate (UPDATE 1182)
|
46 |
|
|
0s mark point near Antarctica (UPDATE 0)
|
47 |
|
|
2s mark point not in country (UPDATE 54085)
|
48 |
|
|
8s vacuum analyze
|
49 |
|
|
15s mark missing stateprovinces (UPDATE 299015)
|
50 |
|
|
12s mark non-gadm stateprovinces (UPDATE 19716)
|
51 |
|
|
920s mark point in stateprovince (UPDATE 1241068)
|
52 |
|
|
864s mark point near stateprovince (UPDATE 35685)
|
53 |
|
|
424s mark point near stateprovince accurate (UPDATE 1051)
|
54 |
|
|
7s mark point not in stateprovince (UPDATE 111282)
|
55 |
|
|
s vacuum analyze
|
56 |
|
|
128s mark missing counties (UPDATE 1431161)
|
57 |
|
|
29s mark non-gadm counties (UPDATE 63014)
|
58 |
|
|
86s mark point in county (UPDATE 174469)
|
59 |
|
|
65s mark point near county (UPDATE 12477)
|
60 |
|
|
?s mark point near county accurate (UPDATE ?)
|
61 |
|
|
3s mark point not in county (UPDATE 26849)
|
62 |
|
|
?s vacuum analyze
|
63 |
|
|
890s look up GADM location using given lat/lon (UPDATE 1656709)
|
64 |
|
|
|
65 |
|
|
todo:
|
66 |
|
|
* make firm decision about whether we should assume here that *std
|
67 |
|
|
columns contain valid GADM names (or are NULL), or whether we should
|
68 |
|
|
check that here
|
69 |
|
|
|
70 |
|
|
Jim Regetz
|
71 |
|
|
NCEAS
|
72 |
|
|
Created Nov 2012
|
73 |
|
|
*/
|
74 |
|
|
|
75 |
11443
|
psarando
|
BEGIN;
|
76 |
|
|
|
77 |
|
|
TRUNCATE geoscrub;
|
78 |
|
|
|
79 |
|
|
DROP INDEX IF EXISTS "geoscrub_geom_gist";
|
80 |
|
|
DROP INDEX IF EXISTS "geoscrub_geog_gist";
|
81 |
|
|
DROP INDEX IF EXISTS "geoscrub_countrystd_idx";
|
82 |
|
|
DROP INDEX IF EXISTS "geoscrub_stateprovincestd_idx";
|
83 |
|
|
DROP INDEX IF EXISTS "geoscrub_countystd_idx";
|
84 |
|
|
|
85 |
|
|
-----------------------------
|
86 |
|
|
-- put everything together --
|
87 |
|
|
-----------------------------
|
88 |
|
|
|
89 |
|
|
-- reconstitute the whole dang thang
|
90 |
|
|
|
91 |
|
|
-- populate geoscrub from vegbien_geoscrub data
|
92 |
|
|
INSERT INTO geoscrub
|
93 |
|
|
SELECT decimallatitude, decimallongitude,
|
94 |
|
|
country, stateprovince, county,
|
95 |
|
|
countryid, stateprovinceid, countyid,
|
96 |
|
|
countrystd, stateprovincestd, countystd
|
97 |
|
|
FROM vegbien_geoscrub v
|
98 |
|
|
LEFT JOIN vcountry USING (country)
|
99 |
|
|
LEFT JOIN vstate USING (countryid, stateprovince)
|
100 |
|
|
LEFT JOIN vcounty USING (countryid, stateprovinceid, county);
|
101 |
|
|
-- SELECT 1707970
|
102 |
|
|
-- Time: 26172.154 ms
|
103 |
|
|
|
104 |
|
|
-- try to squeeze out a few more direct stateprovince mappings to gadm2
|
105 |
|
|
UPDATE geoscrub
|
106 |
|
|
SET stateprovincestd = name_1
|
107 |
|
|
FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm
|
108 |
|
|
WHERE country IS NOT NULL
|
109 |
|
|
AND stateprovince IS NOT NULL
|
110 |
|
|
AND countrystd = name_0
|
111 |
|
|
AND stateprovincestd IS NULL
|
112 |
|
|
AND stateprovince = name_1;
|
113 |
|
|
-- UPDATE 0
|
114 |
|
|
-- Time: 3630.973 ms
|
115 |
|
|
|
116 |
|
|
-- try to squeeze out a few more direct county mappings to gadm2
|
117 |
|
|
UPDATE geoscrub
|
118 |
|
|
SET countystd = name_2
|
119 |
|
|
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm
|
120 |
|
|
WHERE country IS NOT NULL
|
121 |
|
|
AND stateprovince IS NOT NULL
|
122 |
|
|
AND county IS NOT NULL
|
123 |
|
|
AND countrystd = name_0
|
124 |
|
|
AND stateprovincestd = name_1
|
125 |
|
|
AND countystd IS NULL
|
126 |
|
|
AND county = name_2;
|
127 |
|
|
-- UPDATE 69
|
128 |
|
|
-- Time: 3982.715 ms
|
129 |
|
|
|
130 |
|
|
COMMIT;
|
131 |
|
|
|
132 |
10707
|
aaronmk
|
-- generate point geometries from coordinates
|
133 |
|
|
UPDATE geoscrub
|
134 |
|
|
SET geom = ST_SetSRID(ST_Point(decimallongitude, decimallatitude), 4326);
|
135 |
|
|
-- ... create point geographies too ...
|
136 |
|
|
-- note that this excludes (leaves null) any occurrences with coordinates
|
137 |
|
|
-- that are not valid decimal degrees
|
138 |
|
|
UPDATE geoscrub
|
139 |
|
|
SET geog = geom::geography
|
140 |
|
|
WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
|
141 |
|
|
|
142 |
|
|
-- create indexes
|
143 |
|
|
CREATE INDEX "geoscrub_geom_gist" ON geoscrub USING GIST (geom);
|
144 |
|
|
CREATE INDEX "geoscrub_geog_gist" ON geoscrub USING GIST (geog);
|
145 |
|
|
CREATE INDEX "geoscrub_countrystd_idx" ON geoscrub (countrystd);
|
146 |
|
|
CREATE INDEX "geoscrub_stateprovincestd_idx" ON geoscrub (stateprovincestd);
|
147 |
|
|
CREATE INDEX "geoscrub_countystd_idx" ON geoscrub (countystd);
|
148 |
|
|
|
149 |
|
|
-- may want to do this first, otherwise the query planner wants to do a
|
150 |
|
|
-- hash join instead of nested loop join, and at least when I use a limit
|
151 |
|
|
-- statement to time queries on either side of the threshold where it
|
152 |
|
|
-- switches, the hash join does waaaaay worse
|
153 |
|
|
SET enable_hashjoin = false;
|
154 |
|
|
|
155 |
|
|
--
|
156 |
|
|
-- validate lat/lon coordinates
|
157 |
|
|
--
|
158 |
|
|
|
159 |
|
|
VACUUM ANALYZE geoscrub;
|
160 |
|
|
|
161 |
|
|
-- -1 if missing one or both coordinates
|
162 |
|
|
UPDATE geoscrub o
|
163 |
|
|
SET latlonvalidity = -1
|
164 |
|
|
WHERE (
|
165 |
|
|
decimallatitude IS NULL
|
166 |
|
|
OR decimallongitude IS NULL
|
167 |
|
|
);
|
168 |
|
|
|
169 |
|
|
-- 1 if the coordinates falls within the global bounding box
|
170 |
|
|
UPDATE geoscrub o
|
171 |
|
|
SET latlonvalidity = 1
|
172 |
|
|
WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
|
173 |
|
|
|
174 |
|
|
-- 0 otherwise (have coordinates, but not valid)
|
175 |
|
|
UPDATE geoscrub o
|
176 |
|
|
SET latlonvalidity = 0
|
177 |
|
|
WHERE o.latlonvalidity IS NULL;
|
178 |
|
|
|
179 |
|
|
--
|
180 |
|
|
-- validate country
|
181 |
|
|
--
|
182 |
|
|
|
183 |
|
|
VACUUM ANALYZE geoscrub;
|
184 |
|
|
|
185 |
|
|
-- -1 if missing country name
|
186 |
|
|
UPDATE geoscrub o
|
187 |
|
|
SET countryvalidity = -1
|
188 |
|
|
WHERE o.country IS NULL;
|
189 |
|
|
|
190 |
|
|
-- 0 if a country name is asserted, but not recognized among GADM
|
191 |
|
|
-- country names
|
192 |
|
|
UPDATE geoscrub o
|
193 |
|
|
SET countryvalidity = 0
|
194 |
|
|
WHERE (o.countrystd IS NULL
|
195 |
|
|
OR NOT EXISTS (
|
196 |
|
|
SELECT 1
|
197 |
|
|
FROM gadm2 c
|
198 |
|
|
WHERE o.countrystd=c.name_0
|
199 |
|
|
)
|
200 |
|
|
) AND o.countryvalidity IS NULL;
|
201 |
|
|
|
202 |
|
|
-- 3 if the point is in (or on the border of) the asserted country
|
203 |
|
|
UPDATE geoscrub AS o
|
204 |
|
|
SET countryvalidity = 3
|
205 |
|
|
WHERE EXISTS (
|
206 |
|
|
SELECT 1
|
207 |
|
|
FROM gadm2 c
|
208 |
|
|
WHERE o.countrystd=c.name_0
|
209 |
|
|
AND ST_Intersects(o.geom, c.simple_geom)
|
210 |
|
|
) AND o.countryvalidity IS NULL;
|
211 |
|
|
|
212 |
|
|
-- 2 for those that aren't 3's, but the point is in within 5km of
|
213 |
|
|
-- the asserted country
|
214 |
|
|
UPDATE geoscrub o
|
215 |
|
|
SET countryvalidity = 2
|
216 |
|
|
WHERE EXISTS (
|
217 |
|
|
SELECT 1
|
218 |
|
|
FROM gadm2 c
|
219 |
|
|
WHERE o.countrystd=c.name_0
|
220 |
|
|
AND ST_DWithin(o.geog, c.simple_geog, 5000)
|
221 |
|
|
) AND o.countryvalidity IS NULL;
|
222 |
|
|
-- now recheck the 2's to see if any are within the country based on the
|
223 |
|
|
-- original (unsimplified) geometry, and if so, upgrade to 3
|
224 |
|
|
UPDATE geoscrub AS o
|
225 |
|
|
SET countryvalidity = 3
|
226 |
|
|
WHERE EXISTS (
|
227 |
|
|
SELECT 1
|
228 |
|
|
FROM gadm2 c
|
229 |
|
|
WHERE o.countrystd=c.name_0
|
230 |
|
|
AND ST_Intersects(o.geom, c.geom)
|
231 |
|
|
) AND o.countryvalidity = 2;
|
232 |
|
|
-- also handle special case Antartica, which has invalid geography in
|
233 |
|
|
-- gadm2 and thus wasn't included in the within-5km check
|
234 |
|
|
UPDATE geoscrub AS o
|
235 |
|
|
SET countryvalidity = 3
|
236 |
|
|
WHERE EXISTS (
|
237 |
|
|
SELECT 1
|
238 |
|
|
FROM gadm2 c
|
239 |
|
|
WHERE o.countrystd=c.name_0
|
240 |
|
|
AND ST_Intersects(o.geom, c.geom)
|
241 |
|
|
) AND o.countryvalidity < 3
|
242 |
|
|
AND o.countrystd = 'Antarctica';
|
243 |
|
|
|
244 |
|
|
-- 1 otherwise (have recognized country name, but point is not within
|
245 |
|
|
-- 5km of the asserted country)
|
246 |
|
|
UPDATE geoscrub o
|
247 |
|
|
SET countryvalidity = 1
|
248 |
|
|
WHERE o.countryvalidity IS NULL;
|
249 |
|
|
|
250 |
|
|
|
251 |
|
|
-----------------------------
|
252 |
|
|
-- validate state/province --
|
253 |
|
|
-----------------------------
|
254 |
|
|
|
255 |
|
|
VACUUM ANALYZE geoscrub;
|
256 |
|
|
|
257 |
|
|
-- -1 if missing stateprovince name
|
258 |
|
|
UPDATE geoscrub o
|
259 |
|
|
SET stateprovincevalidity = -1
|
260 |
|
|
WHERE o.country IS NULL
|
261 |
|
|
OR o.stateprovince IS NULL;
|
262 |
|
|
|
263 |
|
|
-- 0 if a stateprovince name is asserted, but not recognized among GADM
|
264 |
|
|
-- stateprovince names
|
265 |
|
|
UPDATE geoscrub o
|
266 |
|
|
SET stateprovincevalidity = 0
|
267 |
|
|
WHERE (o.countrystd IS NULL
|
268 |
|
|
OR o.stateprovincestd IS NULL
|
269 |
|
|
OR NOT EXISTS (
|
270 |
|
|
SELECT 1
|
271 |
|
|
FROM gadm2 c
|
272 |
|
|
WHERE o.countrystd=c.name_0
|
273 |
|
|
AND o.stateprovincestd=c.name_1
|
274 |
|
|
)
|
275 |
|
|
) AND o.stateprovincevalidity IS NULL;
|
276 |
|
|
|
277 |
|
|
-- 3 if the point is in (or on the border of) the asserted stateprovince
|
278 |
|
|
UPDATE geoscrub AS o
|
279 |
|
|
SET stateprovincevalidity = 3
|
280 |
|
|
WHERE EXISTS (
|
281 |
|
|
SELECT 1
|
282 |
|
|
FROM gadm2 c
|
283 |
|
|
WHERE o.countrystd=c.name_0
|
284 |
|
|
AND o.stateprovincestd=c.name_1
|
285 |
|
|
AND ST_Intersects(o.geom, c.simple_geom)
|
286 |
|
|
) AND o.countryvalidity = 3
|
287 |
|
|
AND o.stateprovincevalidity IS NULL;
|
288 |
|
|
|
289 |
|
|
-- for those that aren't 3's, set to 2 if the point is in within 5km of
|
290 |
|
|
-- the asserted stateprovince
|
291 |
|
|
UPDATE geoscrub o
|
292 |
|
|
SET stateprovincevalidity = 2
|
293 |
|
|
WHERE EXISTS (
|
294 |
|
|
SELECT 1
|
295 |
|
|
FROM gadm2 c
|
296 |
|
|
WHERE o.countrystd=c.name_0
|
297 |
|
|
AND o.stateprovincestd=c.name_1
|
298 |
|
|
AND ST_DWithin(o.geog, c.simple_geog, 5000)
|
299 |
|
|
) AND (o.countryvalidity = 2 OR o.countryvalidity = 3)
|
300 |
|
|
AND o.stateprovincevalidity IS NULL;
|
301 |
|
|
-- now recheck the 2's to see if any are within the stateprovince based
|
302 |
|
|
-- on the original (unsimplified) geometry, and if so, upgrade to 3
|
303 |
|
|
UPDATE geoscrub AS o
|
304 |
|
|
SET stateprovincevalidity = 3
|
305 |
|
|
WHERE EXISTS (
|
306 |
|
|
SELECT 1
|
307 |
|
|
FROM gadm2 c
|
308 |
|
|
WHERE o.countrystd=c.name_0
|
309 |
|
|
AND o.stateprovincestd=c.name_1
|
310 |
|
|
AND ST_Intersects(o.geom, c.geom)
|
311 |
|
|
) AND o.stateprovincevalidity = 2;
|
312 |
|
|
|
313 |
|
|
-- 1 otherwise (have recognized stateprovince name, but point is not within
|
314 |
|
|
-- 5km of the asserted stateprovince)
|
315 |
|
|
UPDATE geoscrub o
|
316 |
|
|
SET stateprovincevalidity = 1
|
317 |
|
|
WHERE o.stateprovincevalidity IS NULL;
|
318 |
|
|
|
319 |
|
|
-----------------------------
|
320 |
|
|
-- validate county/parish --
|
321 |
|
|
-----------------------------
|
322 |
|
|
|
323 |
|
|
-- -1 if missing county name
|
324 |
|
|
UPDATE geoscrub o
|
325 |
|
|
SET countyvalidity = -1
|
326 |
|
|
WHERE o.country IS NULL
|
327 |
|
|
OR o.stateprovince IS NULL
|
328 |
|
|
OR o.county IS NULL;
|
329 |
|
|
|
330 |
|
|
-- 0 if a county name is asserted, but not recognized among GADM
|
331 |
|
|
-- county names
|
332 |
|
|
UPDATE geoscrub o
|
333 |
|
|
SET countyvalidity = 0
|
334 |
|
|
WHERE (o.countrystd IS NULL
|
335 |
|
|
OR o.stateprovincestd IS NULL
|
336 |
|
|
OR o.countystd IS NULL
|
337 |
|
|
OR NOT EXISTS (
|
338 |
|
|
SELECT 1
|
339 |
|
|
FROM gadm2 c
|
340 |
|
|
WHERE o.countrystd=c.name_0
|
341 |
|
|
AND o.stateprovincestd=c.name_1
|
342 |
|
|
AND o.countystd=c.name_2
|
343 |
|
|
)
|
344 |
|
|
) AND o.countyvalidity IS NULL;
|
345 |
|
|
|
346 |
|
|
-- 3 if the point is in (or on the border of) the asserted county
|
347 |
|
|
UPDATE geoscrub AS o
|
348 |
|
|
SET countyvalidity = 3
|
349 |
|
|
WHERE EXISTS (
|
350 |
|
|
SELECT 1
|
351 |
|
|
FROM gadm2 c
|
352 |
|
|
WHERE o.countrystd=c.name_0
|
353 |
|
|
AND o.stateprovincestd=c.name_1
|
354 |
|
|
AND o.countystd=c.name_2
|
355 |
|
|
AND ST_Intersects(o.geom, c.simple_geom)
|
356 |
|
|
) AND o.stateprovincevalidity = 3
|
357 |
|
|
AND o.countyvalidity IS NULL;
|
358 |
|
|
|
359 |
|
|
-- for those that aren't 3's, set to 2 if the point is in within 5km of
|
360 |
|
|
-- the asserted county
|
361 |
|
|
UPDATE geoscrub o
|
362 |
|
|
SET countyvalidity = 2
|
363 |
|
|
WHERE EXISTS (
|
364 |
|
|
SELECT 1
|
365 |
|
|
FROM gadm2 c
|
366 |
|
|
WHERE o.countrystd=c.name_0
|
367 |
|
|
AND o.stateprovincestd=c.name_1
|
368 |
|
|
AND o.countystd=c.name_2
|
369 |
|
|
AND ST_DWithin(o.geog, c.simple_geog, 5000)
|
370 |
|
|
) AND (o.stateprovincevalidity = 2 OR o.stateprovincevalidity = 3)
|
371 |
|
|
AND o.countyvalidity IS NULL;
|
372 |
|
|
|
373 |
|
|
-- 1 otherwise (have recognized county name, but point is not within
|
374 |
|
|
-- 5km of the asserted county)
|
375 |
|
|
UPDATE geoscrub o
|
376 |
|
|
SET countyvalidity = 1
|
377 |
|
|
WHERE o.countyvalidity IS NULL;
|
378 |
|
|
|
379 |
|
|
--
|
380 |
|
|
-- Look up GADM place names based purely on coordinates for all points
|
381 |
|
|
--
|
382 |
|
|
|
383 |
|
|
-- note: only using simplified geometry now, for speed reasons
|
384 |
|
|
UPDATE geoscrub o
|
385 |
|
|
SET pointcountry = c.name_0,
|
386 |
|
|
pointstateprovince = c.name_1,
|
387 |
|
|
pointcounty = c.name_2
|
388 |
|
|
FROM gadm2 c
|
389 |
|
|
WHERE ST_Intersects(o.geom, c.simple_geom)
|
390 |
|
|
AND o.latlonvalidity = 1;
|
391 |
|
|
|
392 |
|
|
SET enable_hashjoin = true;
|