1 |
10707
|
aaronmk
|
/*
|
2 |
|
|
SQL statements carrying out geovalidation on vegbien location data.
|
3 |
|
|
Currently hard-coded to operate on the 'geoscrub' table that is
|
4 |
|
|
assumed to have been generated by the geonames scrubbing process
|
5 |
|
|
(see geonames.sql). This table should minimally contain the following
|
6 |
|
|
columns:
|
7 |
|
|
decimallatitude (latitude, WGS84 decimal degrees)
|
8 |
|
|
decimallongitude (longitude, WGS84 decimal degrees)
|
9 |
|
|
country (original asserted country name)
|
10 |
|
|
stateprovince (original asserted stateprovince name)
|
11 |
|
|
county (original asserted county name)
|
12 |
|
|
countrystd (GADM2-mapped country name)
|
13 |
|
|
stateprovincestd (GADM2-mapped state/province name)
|
14 |
|
|
countystd (GADM2-mapped county name)
|
15 |
|
|
(Note that the geonames.sql script also currently creates columns with
|
16 |
|
|
geonames.org IDs for country, stateprovince, and county.)
|
17 |
|
|
|
18 |
|
|
After execution of the statements below, the table will contain the
|
19 |
|
|
following new columns (along with geom and geog columns used by
|
20 |
|
|
postgis):
|
21 |
|
|
latlonvalidity (validity score for lat/lon coordinate)
|
22 |
|
|
countryvalidity (validity score for country)
|
23 |
|
|
stateprovincevalidity (validity score for stateprovince)
|
24 |
|
|
countyvalidity (validity score for county)
|
25 |
|
|
pointcountry (GADM2 country containing the point)
|
26 |
|
|
pointstateprovince (GADM2 stateprovince containing the point)
|
27 |
|
|
pointcounty (GADM2 county containing the point)
|
28 |
|
|
|
29 |
|
|
Workflow with actual times (as executed 15-Nov-2012)
|
30 |
|
|
12s create point geoms
|
31 |
|
|
16s create point geogs
|
32 |
|
|
23s index geoms
|
33 |
|
|
26s index geogs
|
34 |
|
|
37s index countrystd
|
35 |
|
|
25s index stateprovincestd
|
36 |
|
|
5s vacuum analyze
|
37 |
|
|
1s mark missing coords (UPDATE 0)
|
38 |
|
|
133s mark valid coords (UPDATE 1702989)
|
39 |
|
|
2s mark invalid coords (UPDATE 4981)
|
40 |
|
|
17s vacuum analyze
|
41 |
|
|
27s mark missing countries (UPDATE 222085)
|
42 |
|
|
36s mark non-gadm countries (UPDATE 6547)
|
43 |
|
|
1277s mark point in country (UPDATE 1400627)
|
44 |
|
|
1491s mark point near country (UPDATE 24626)
|
45 |
|
|
573s mark point near country accurate (UPDATE 1182)
|
46 |
|
|
0s mark point near Antarctica (UPDATE 0)
|
47 |
|
|
2s mark point not in country (UPDATE 54085)
|
48 |
|
|
8s vacuum analyze
|
49 |
|
|
15s mark missing stateprovinces (UPDATE 299015)
|
50 |
|
|
12s mark non-gadm stateprovinces (UPDATE 19716)
|
51 |
|
|
920s mark point in stateprovince (UPDATE 1241068)
|
52 |
|
|
864s mark point near stateprovince (UPDATE 35685)
|
53 |
|
|
424s mark point near stateprovince accurate (UPDATE 1051)
|
54 |
|
|
7s mark point not in stateprovince (UPDATE 111282)
|
55 |
|
|
s vacuum analyze
|
56 |
|
|
128s mark missing counties (UPDATE 1431161)
|
57 |
|
|
29s mark non-gadm counties (UPDATE 63014)
|
58 |
|
|
86s mark point in county (UPDATE 174469)
|
59 |
|
|
65s mark point near county (UPDATE 12477)
|
60 |
|
|
?s mark point near county accurate (UPDATE ?)
|
61 |
|
|
3s mark point not in county (UPDATE 26849)
|
62 |
|
|
?s vacuum analyze
|
63 |
|
|
890s look up GADM location using given lat/lon (UPDATE 1656709)
|
64 |
|
|
|
65 |
|
|
todo:
|
66 |
|
|
* make firm decision about whether we should assume here that *std
|
67 |
|
|
columns contain valid GADM names (or are NULL), or whether we should
|
68 |
|
|
check that here
|
69 |
|
|
|
70 |
|
|
Jim Regetz
|
71 |
|
|
NCEAS
|
72 |
|
|
Created Nov 2012
|
73 |
|
|
*/
|
74 |
|
|
|
75 |
|
|
-- generate point geometries from coordinates
|
76 |
|
|
SELECT AddGeometryColumn('public', 'geoscrub', 'geom', 4326, 'POINT', 2 );
|
77 |
|
|
UPDATE geoscrub
|
78 |
|
|
SET geom = ST_SetSRID(ST_Point(decimallongitude, decimallatitude), 4326);
|
79 |
|
|
-- ... create point geographies too ...
|
80 |
|
|
-- note that this excludes (leaves null) any occurrences with coordinates
|
81 |
|
|
-- that are not valid decimal degrees
|
82 |
|
|
ALTER TABLE geoscrub ADD COLUMN geog geography(POINT, 4326);
|
83 |
|
|
UPDATE geoscrub
|
84 |
|
|
SET geog = geom::geography
|
85 |
|
|
WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
|
86 |
|
|
|
87 |
|
|
-- create indexes
|
88 |
|
|
CREATE INDEX "geoscrub_geom_gist" ON geoscrub USING GIST (geom);
|
89 |
|
|
CREATE INDEX "geoscrub_geog_gist" ON geoscrub USING GIST (geog);
|
90 |
|
|
CREATE INDEX "geoscrub_countrystd_idx" ON geoscrub (countrystd);
|
91 |
|
|
CREATE INDEX "geoscrub_stateprovincestd_idx" ON geoscrub (stateprovincestd);
|
92 |
|
|
CREATE INDEX "geoscrub_countystd_idx" ON geoscrub (countystd);
|
93 |
|
|
|
94 |
|
|
-- may want to do this first, otherwise the query planner wants to do a
|
95 |
|
|
-- hash join instead of nested loop join, and at least when I use a limit
|
96 |
|
|
-- statement to time queries on either side of the threshold where it
|
97 |
|
|
-- switches, the hash join does waaaaay worse
|
98 |
|
|
SET enable_hashjoin = false;
|
99 |
|
|
|
100 |
|
|
--
|
101 |
|
|
-- validate lat/lon coordinates
|
102 |
|
|
--
|
103 |
|
|
|
104 |
|
|
VACUUM ANALYZE geoscrub;
|
105 |
|
|
|
106 |
|
|
ALTER TABLE geoscrub ADD COLUMN latlonvalidity int;
|
107 |
|
|
|
108 |
|
|
-- -1 if missing one or both coordinates
|
109 |
|
|
UPDATE geoscrub o
|
110 |
|
|
SET latlonvalidity = -1
|
111 |
|
|
WHERE (
|
112 |
|
|
decimallatitude IS NULL
|
113 |
|
|
OR decimallongitude IS NULL
|
114 |
|
|
);
|
115 |
|
|
|
116 |
|
|
-- 1 if the coordinates falls within the global bounding box
|
117 |
|
|
UPDATE geoscrub o
|
118 |
|
|
SET latlonvalidity = 1
|
119 |
|
|
WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
|
120 |
|
|
|
121 |
|
|
-- 0 otherwise (have coordinates, but not valid)
|
122 |
|
|
UPDATE geoscrub o
|
123 |
|
|
SET latlonvalidity = 0
|
124 |
|
|
WHERE o.latlonvalidity IS NULL;
|
125 |
|
|
|
126 |
|
|
--
|
127 |
|
|
-- validate country
|
128 |
|
|
--
|
129 |
|
|
|
130 |
|
|
VACUUM ANALYZE geoscrub;
|
131 |
|
|
|
132 |
|
|
ALTER TABLE geoscrub ADD COLUMN countryvalidity int;
|
133 |
|
|
|
134 |
|
|
-- -1 if missing country name
|
135 |
|
|
UPDATE geoscrub o
|
136 |
|
|
SET countryvalidity = -1
|
137 |
|
|
WHERE o.country IS NULL;
|
138 |
|
|
|
139 |
|
|
-- 0 if a country name is asserted, but not recognized among GADM
|
140 |
|
|
-- country names
|
141 |
|
|
UPDATE geoscrub o
|
142 |
|
|
SET countryvalidity = 0
|
143 |
|
|
WHERE (o.countrystd IS NULL
|
144 |
|
|
OR NOT EXISTS (
|
145 |
|
|
SELECT 1
|
146 |
|
|
FROM gadm2 c
|
147 |
|
|
WHERE o.countrystd=c.name_0
|
148 |
|
|
)
|
149 |
|
|
) AND o.countryvalidity IS NULL;
|
150 |
|
|
|
151 |
|
|
-- 3 if the point is in (or on the border of) the asserted country
|
152 |
|
|
UPDATE geoscrub AS o
|
153 |
|
|
SET countryvalidity = 3
|
154 |
|
|
WHERE EXISTS (
|
155 |
|
|
SELECT 1
|
156 |
|
|
FROM gadm2 c
|
157 |
|
|
WHERE o.countrystd=c.name_0
|
158 |
|
|
AND ST_Intersects(o.geom, c.simple_geom)
|
159 |
|
|
) AND o.countryvalidity IS NULL;
|
160 |
|
|
|
161 |
|
|
-- 2 for those that aren't 3's, but the point is in within 5km of
|
162 |
|
|
-- the asserted country
|
163 |
|
|
UPDATE geoscrub o
|
164 |
|
|
SET countryvalidity = 2
|
165 |
|
|
WHERE EXISTS (
|
166 |
|
|
SELECT 1
|
167 |
|
|
FROM gadm2 c
|
168 |
|
|
WHERE o.countrystd=c.name_0
|
169 |
|
|
AND ST_DWithin(o.geog, c.simple_geog, 5000)
|
170 |
|
|
) AND o.countryvalidity IS NULL;
|
171 |
|
|
-- now recheck the 2's to see if any are within the country based on the
|
172 |
|
|
-- original (unsimplified) geometry, and if so, upgrade to 3
|
173 |
|
|
UPDATE geoscrub AS o
|
174 |
|
|
SET countryvalidity = 3
|
175 |
|
|
WHERE EXISTS (
|
176 |
|
|
SELECT 1
|
177 |
|
|
FROM gadm2 c
|
178 |
|
|
WHERE o.countrystd=c.name_0
|
179 |
|
|
AND ST_Intersects(o.geom, c.geom)
|
180 |
|
|
) AND o.countryvalidity = 2;
|
181 |
|
|
-- also handle special case Antartica, which has invalid geography in
|
182 |
|
|
-- gadm2 and thus wasn't included in the within-5km check
|
183 |
|
|
UPDATE geoscrub AS o
|
184 |
|
|
SET countryvalidity = 3
|
185 |
|
|
WHERE EXISTS (
|
186 |
|
|
SELECT 1
|
187 |
|
|
FROM gadm2 c
|
188 |
|
|
WHERE o.countrystd=c.name_0
|
189 |
|
|
AND ST_Intersects(o.geom, c.geom)
|
190 |
|
|
) AND o.countryvalidity < 3
|
191 |
|
|
AND o.countrystd = 'Antarctica';
|
192 |
|
|
|
193 |
|
|
-- 1 otherwise (have recognized country name, but point is not within
|
194 |
|
|
-- 5km of the asserted country)
|
195 |
|
|
UPDATE geoscrub o
|
196 |
|
|
SET countryvalidity = 1
|
197 |
|
|
WHERE o.countryvalidity IS NULL;
|
198 |
|
|
|
199 |
|
|
|
200 |
|
|
-----------------------------
|
201 |
|
|
-- validate state/province --
|
202 |
|
|
-----------------------------
|
203 |
|
|
|
204 |
|
|
VACUUM ANALYZE geoscrub;
|
205 |
|
|
|
206 |
|
|
ALTER TABLE geoscrub ADD COLUMN stateprovincevalidity int;
|
207 |
|
|
|
208 |
|
|
-- -1 if missing stateprovince name
|
209 |
|
|
UPDATE geoscrub o
|
210 |
|
|
SET stateprovincevalidity = -1
|
211 |
|
|
WHERE o.country IS NULL
|
212 |
|
|
OR o.stateprovince IS NULL;
|
213 |
|
|
|
214 |
|
|
-- 0 if a stateprovince name is asserted, but not recognized among GADM
|
215 |
|
|
-- stateprovince names
|
216 |
|
|
UPDATE geoscrub o
|
217 |
|
|
SET stateprovincevalidity = 0
|
218 |
|
|
WHERE (o.countrystd IS NULL
|
219 |
|
|
OR o.stateprovincestd IS NULL
|
220 |
|
|
OR NOT EXISTS (
|
221 |
|
|
SELECT 1
|
222 |
|
|
FROM gadm2 c
|
223 |
|
|
WHERE o.countrystd=c.name_0
|
224 |
|
|
AND o.stateprovincestd=c.name_1
|
225 |
|
|
)
|
226 |
|
|
) AND o.stateprovincevalidity IS NULL;
|
227 |
|
|
|
228 |
|
|
-- 3 if the point is in (or on the border of) the asserted stateprovince
|
229 |
|
|
UPDATE geoscrub AS o
|
230 |
|
|
SET stateprovincevalidity = 3
|
231 |
|
|
WHERE EXISTS (
|
232 |
|
|
SELECT 1
|
233 |
|
|
FROM gadm2 c
|
234 |
|
|
WHERE o.countrystd=c.name_0
|
235 |
|
|
AND o.stateprovincestd=c.name_1
|
236 |
|
|
AND ST_Intersects(o.geom, c.simple_geom)
|
237 |
|
|
) AND o.countryvalidity = 3
|
238 |
|
|
AND o.stateprovincevalidity IS NULL;
|
239 |
|
|
|
240 |
|
|
-- for those that aren't 3's, set to 2 if the point is in within 5km of
|
241 |
|
|
-- the asserted stateprovince
|
242 |
|
|
UPDATE geoscrub o
|
243 |
|
|
SET stateprovincevalidity = 2
|
244 |
|
|
WHERE EXISTS (
|
245 |
|
|
SELECT 1
|
246 |
|
|
FROM gadm2 c
|
247 |
|
|
WHERE o.countrystd=c.name_0
|
248 |
|
|
AND o.stateprovincestd=c.name_1
|
249 |
|
|
AND ST_DWithin(o.geog, c.simple_geog, 5000)
|
250 |
|
|
) AND (o.countryvalidity = 2 OR o.countryvalidity = 3)
|
251 |
|
|
AND o.stateprovincevalidity IS NULL;
|
252 |
|
|
-- now recheck the 2's to see if any are within the stateprovince based
|
253 |
|
|
-- on the original (unsimplified) geometry, and if so, upgrade to 3
|
254 |
|
|
UPDATE geoscrub AS o
|
255 |
|
|
SET stateprovincevalidity = 3
|
256 |
|
|
WHERE EXISTS (
|
257 |
|
|
SELECT 1
|
258 |
|
|
FROM gadm2 c
|
259 |
|
|
WHERE o.countrystd=c.name_0
|
260 |
|
|
AND o.stateprovincestd=c.name_1
|
261 |
|
|
AND ST_Intersects(o.geom, c.geom)
|
262 |
|
|
) AND o.stateprovincevalidity = 2;
|
263 |
|
|
|
264 |
|
|
-- 1 otherwise (have recognized stateprovince name, but point is not within
|
265 |
|
|
-- 5km of the asserted stateprovince)
|
266 |
|
|
UPDATE geoscrub o
|
267 |
|
|
SET stateprovincevalidity = 1
|
268 |
|
|
WHERE o.stateprovincevalidity IS NULL;
|
269 |
|
|
|
270 |
|
|
-----------------------------
|
271 |
|
|
-- validate county/parish --
|
272 |
|
|
-----------------------------
|
273 |
|
|
|
274 |
|
|
ALTER TABLE geoscrub ADD COLUMN countyvalidity int;
|
275 |
|
|
|
276 |
|
|
-- -1 if missing county name
|
277 |
|
|
UPDATE geoscrub o
|
278 |
|
|
SET countyvalidity = -1
|
279 |
|
|
WHERE o.country IS NULL
|
280 |
|
|
OR o.stateprovince IS NULL
|
281 |
|
|
OR o.county IS NULL;
|
282 |
|
|
|
283 |
|
|
-- 0 if a county name is asserted, but not recognized among GADM
|
284 |
|
|
-- county names
|
285 |
|
|
UPDATE geoscrub o
|
286 |
|
|
SET countyvalidity = 0
|
287 |
|
|
WHERE (o.countrystd IS NULL
|
288 |
|
|
OR o.stateprovincestd IS NULL
|
289 |
|
|
OR o.countystd IS NULL
|
290 |
|
|
OR NOT EXISTS (
|
291 |
|
|
SELECT 1
|
292 |
|
|
FROM gadm2 c
|
293 |
|
|
WHERE o.countrystd=c.name_0
|
294 |
|
|
AND o.stateprovincestd=c.name_1
|
295 |
|
|
AND o.countystd=c.name_2
|
296 |
|
|
)
|
297 |
|
|
) AND o.countyvalidity IS NULL;
|
298 |
|
|
|
299 |
|
|
-- 3 if the point is in (or on the border of) the asserted county
|
300 |
|
|
UPDATE geoscrub AS o
|
301 |
|
|
SET countyvalidity = 3
|
302 |
|
|
WHERE EXISTS (
|
303 |
|
|
SELECT 1
|
304 |
|
|
FROM gadm2 c
|
305 |
|
|
WHERE o.countrystd=c.name_0
|
306 |
|
|
AND o.stateprovincestd=c.name_1
|
307 |
|
|
AND o.countystd=c.name_2
|
308 |
|
|
AND ST_Intersects(o.geom, c.simple_geom)
|
309 |
|
|
) AND o.stateprovincevalidity = 3
|
310 |
|
|
AND o.countyvalidity IS NULL;
|
311 |
|
|
|
312 |
|
|
-- for those that aren't 3's, set to 2 if the point is in within 5km of
|
313 |
|
|
-- the asserted county
|
314 |
|
|
UPDATE geoscrub o
|
315 |
|
|
SET countyvalidity = 2
|
316 |
|
|
WHERE EXISTS (
|
317 |
|
|
SELECT 1
|
318 |
|
|
FROM gadm2 c
|
319 |
|
|
WHERE o.countrystd=c.name_0
|
320 |
|
|
AND o.stateprovincestd=c.name_1
|
321 |
|
|
AND o.countystd=c.name_2
|
322 |
|
|
AND ST_DWithin(o.geog, c.simple_geog, 5000)
|
323 |
|
|
) AND (o.stateprovincevalidity = 2 OR o.stateprovincevalidity = 3)
|
324 |
|
|
AND o.countyvalidity IS NULL;
|
325 |
|
|
|
326 |
|
|
-- 1 otherwise (have recognized county name, but point is not within
|
327 |
|
|
-- 5km of the asserted county)
|
328 |
|
|
UPDATE geoscrub o
|
329 |
|
|
SET countyvalidity = 1
|
330 |
|
|
WHERE o.countyvalidity IS NULL;
|
331 |
|
|
|
332 |
|
|
--
|
333 |
|
|
-- Look up GADM place names based purely on coordinates for all points
|
334 |
|
|
--
|
335 |
|
|
|
336 |
|
|
ALTER TABLE geoscrub ADD COLUMN pointcountry text;
|
337 |
|
|
ALTER TABLE geoscrub ADD COLUMN pointstateprovince text;
|
338 |
|
|
ALTER TABLE geoscrub ADD COLUMN pointcounty text;
|
339 |
|
|
-- note: only using simplified geometry now, for speed reasons
|
340 |
|
|
UPDATE geoscrub o
|
341 |
|
|
SET pointcountry = c.name_0,
|
342 |
|
|
pointstateprovince = c.name_1,
|
343 |
|
|
pointcounty = c.name_2
|
344 |
|
|
FROM gadm2 c
|
345 |
|
|
WHERE ST_Intersects(o.geom, c.simple_geom)
|
346 |
|
|
AND o.latlonvalidity = 1;
|
347 |
|
|
|
348 |
|
|
SET enable_hashjoin = true;
|