Project

General

Profile

1 10707 aaronmk
/*
2
  SQL statements carrying out geovalidation on vegbien location data.
3
  Currently hard-coded to operate on the 'geoscrub' table that is
4
  assumed to have been generated by the geonames scrubbing process
5
  (see geonames.sql). This table should minimally contain the following
6
  columns:
7
     decimallatitude  (latitude, WGS84 decimal degrees)
8
     decimallongitude (longitude, WGS84 decimal degrees)
9
     country          (original asserted country name)
10
     stateprovince    (original asserted stateprovince name)
11
     county           (original asserted county name)
12
     countrystd       (GADM2-mapped country name)
13
     stateprovincestd (GADM2-mapped state/province name)
14
     countystd        (GADM2-mapped county name)
15
  (Note that the geonames.sql script also currently creates columns with
16
  geonames.org IDs for country, stateprovince, and county.)
17
18
  After execution of the statements below, the table will contain the
19
  following new columns (along with geom and geog columns used by
20
  postgis):
21
     latlonvalidity        (validity score for lat/lon coordinate)
22
     countryvalidity       (validity score for country)
23
     stateprovincevalidity (validity score for stateprovince)
24
     countyvalidity        (validity score for county)
25
     pointcountry          (GADM2 country containing the point)
26
     pointstateprovince    (GADM2 stateprovince containing the point)
27
     pointcounty           (GADM2 county containing the point)
28
29
  Workflow with actual times (as executed 15-Nov-2012)
30
     12s create point geoms
31
     16s create point geogs
32
     23s index geoms
33
     26s index geogs
34
     37s index countrystd
35
     25s index stateprovincestd
36
      5s vacuum analyze
37
      1s mark missing coords (UPDATE 0)
38
    133s mark valid coords (UPDATE 1702989)
39
      2s mark invalid coords (UPDATE 4981)
40
     17s vacuum analyze
41
     27s mark missing countries (UPDATE 222085)
42
     36s mark non-gadm countries (UPDATE 6547)
43
   1277s mark point in country (UPDATE 1400627)
44
   1491s mark point near country (UPDATE 24626)
45
    573s mark point near country accurate (UPDATE 1182)
46
      0s mark point near Antarctica (UPDATE 0)
47
      2s mark point not in country (UPDATE 54085)
48
      8s vacuum analyze
49
     15s mark missing stateprovinces (UPDATE 299015)
50
     12s mark non-gadm stateprovinces (UPDATE 19716)
51
    920s mark point in stateprovince (UPDATE 1241068)
52
    864s mark point near stateprovince (UPDATE 35685)
53
    424s mark point near stateprovince accurate (UPDATE 1051)
54
      7s mark point not in stateprovince (UPDATE 111282)
55
       s vacuum analyze
56
    128s mark missing counties (UPDATE 1431161)
57
     29s mark non-gadm counties (UPDATE 63014)
58
     86s mark point in county (UPDATE 174469)
59
     65s mark point near county (UPDATE 12477)
60
      ?s mark point near county accurate (UPDATE ?)
61
      3s mark point not in county (UPDATE 26849)
62
      ?s vacuum analyze
63
    890s look up GADM location using given lat/lon (UPDATE 1656709)
64
65
  todo:
66
  * make firm decision about whether we should assume here that *std
67
    columns contain valid GADM names (or are NULL), or whether we should
68
    check that here
69
70
  Jim Regetz
71
  NCEAS
72
  Created Nov 2012
73
*/
74
75
-- generate point geometries from coordinates
76
SELECT AddGeometryColumn('public', 'geoscrub', 'geom', 4326, 'POINT', 2 );
77
UPDATE geoscrub
78
   SET geom = ST_SetSRID(ST_Point(decimallongitude, decimallatitude), 4326);
79
-- ... create point geographies too ...
80
-- note that this excludes (leaves null) any occurrences with coordinates
81
-- that are not valid decimal degrees
82
ALTER TABLE geoscrub ADD COLUMN geog geography(POINT, 4326);
83
UPDATE geoscrub
84
  SET geog = geom::geography
85
  WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
86
87
-- create indexes
88
CREATE INDEX "geoscrub_geom_gist" ON geoscrub USING GIST (geom);
89
CREATE INDEX "geoscrub_geog_gist" ON geoscrub USING GIST (geog);
90
CREATE INDEX "geoscrub_countrystd_idx" ON geoscrub (countrystd);
91
CREATE INDEX "geoscrub_stateprovincestd_idx" ON geoscrub (stateprovincestd);
92
CREATE INDEX "geoscrub_countystd_idx" ON geoscrub (countystd);
93
94
-- may want to do this first, otherwise the query planner wants to do a
95
-- hash join instead of nested loop join, and at least when I use a limit
96
-- statement to time queries on either side of the threshold where it
97
-- switches, the hash join does waaaaay worse
98
SET enable_hashjoin = false;
99
100
--
101
-- validate lat/lon coordinates
102
--
103
104
VACUUM ANALYZE geoscrub;
105
106
ALTER TABLE geoscrub ADD COLUMN latlonvalidity int;
107
108
-- -1 if missing one or both coordinates
109
UPDATE geoscrub o
110
    SET latlonvalidity = -1
111
    WHERE (
112
        decimallatitude IS NULL
113
        OR decimallongitude IS NULL
114
    );
115
116
-- 1 if the coordinates falls within the global bounding box
117
UPDATE geoscrub o
118
    SET latlonvalidity = 1
119
    WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
120
121
-- 0 otherwise (have coordinates, but not valid)
122
UPDATE geoscrub o
123
    SET latlonvalidity = 0
124
    WHERE o.latlonvalidity IS NULL;
125
126
--
127
-- validate country
128
--
129
130
VACUUM ANALYZE geoscrub;
131
132
ALTER TABLE geoscrub ADD COLUMN countryvalidity int;
133
134
-- -1 if missing country name
135
UPDATE geoscrub o
136
    SET countryvalidity = -1
137
    WHERE o.country IS NULL;
138
139
-- 0 if a country name is asserted, but not recognized among GADM
140
-- country names
141
UPDATE geoscrub o
142
    SET countryvalidity = 0
143
    WHERE (o.countrystd IS NULL
144
        OR NOT EXISTS (
145
           SELECT 1
146
             FROM gadm2 c
147
             WHERE o.countrystd=c.name_0
148
         )
149
     ) AND o.countryvalidity IS NULL;
150
151
-- 3 if the point is in (or on the border of) the asserted country
152
UPDATE geoscrub AS o
153
    SET countryvalidity = 3
154
    WHERE EXISTS (
155
        SELECT 1
156
            FROM gadm2 c
157
            WHERE o.countrystd=c.name_0
158
              AND ST_Intersects(o.geom, c.simple_geom)
159
    ) AND o.countryvalidity IS NULL;
160
161
-- 2 for those that aren't 3's, but the point is in within 5km of
162
-- the asserted country
163
UPDATE geoscrub o
164
    SET countryvalidity = 2
165
    WHERE EXISTS (
166
        SELECT 1
167
            FROM gadm2 c
168
            WHERE o.countrystd=c.name_0
169
              AND ST_DWithin(o.geog, c.simple_geog, 5000)
170
    ) AND o.countryvalidity IS NULL;
171
-- now recheck the 2's to see if any are within the country based on the
172
-- original (unsimplified) geometry, and if so, upgrade to 3
173
UPDATE geoscrub AS o
174
    SET countryvalidity = 3
175
    WHERE EXISTS (
176
        SELECT 1
177
            FROM gadm2 c
178
            WHERE o.countrystd=c.name_0
179
              AND ST_Intersects(o.geom, c.geom)
180
    ) AND o.countryvalidity = 2;
181
-- also handle special case Antartica, which has invalid geography in
182
-- gadm2 and thus wasn't included in the within-5km check
183
UPDATE geoscrub AS o
184
    SET countryvalidity = 3
185
    WHERE EXISTS (
186
        SELECT 1
187
            FROM gadm2 c
188
            WHERE o.countrystd=c.name_0
189
              AND ST_Intersects(o.geom, c.geom)
190
    ) AND o.countryvalidity < 3
191
    AND o.countrystd = 'Antarctica';
192
193
-- 1 otherwise (have recognized country name, but point is not within
194
-- 5km of the asserted country)
195
UPDATE geoscrub o
196
    SET countryvalidity = 1
197
    WHERE o.countryvalidity IS NULL;
198
199
200
-----------------------------
201
-- validate state/province --
202
-----------------------------
203
204
VACUUM ANALYZE geoscrub;
205
206
ALTER TABLE geoscrub ADD COLUMN stateprovincevalidity int;
207
208
-- -1 if missing stateprovince name
209
UPDATE geoscrub o
210
    SET stateprovincevalidity = -1
211
    WHERE o.country IS NULL
212
       OR o.stateprovince IS NULL;
213
214
-- 0 if a stateprovince name is asserted, but not recognized among GADM
215
-- stateprovince names
216
UPDATE geoscrub o
217
    SET stateprovincevalidity = 0
218
    WHERE (o.countrystd IS NULL
219
        OR o.stateprovincestd IS NULL
220
        OR NOT EXISTS (
221
           SELECT 1
222
             FROM gadm2 c
223
             WHERE o.countrystd=c.name_0
224
               AND o.stateprovincestd=c.name_1
225
        )
226
    ) AND o.stateprovincevalidity IS NULL;
227
228
-- 3 if the point is in (or on the border of) the asserted stateprovince
229
UPDATE geoscrub AS o
230
    SET stateprovincevalidity = 3
231
    WHERE EXISTS (
232
        SELECT 1
233
            FROM gadm2 c
234
            WHERE o.countrystd=c.name_0
235
              AND o.stateprovincestd=c.name_1
236
              AND ST_Intersects(o.geom, c.simple_geom)
237
    ) AND o.countryvalidity = 3
238
    AND o.stateprovincevalidity IS NULL;
239
240
-- for those that aren't 3's, set to 2 if the point is in within 5km of
241
-- the asserted stateprovince
242
UPDATE geoscrub o
243
    SET stateprovincevalidity = 2
244
    WHERE EXISTS (
245
        SELECT 1
246
            FROM gadm2 c
247
            WHERE o.countrystd=c.name_0
248
              AND o.stateprovincestd=c.name_1
249
              AND ST_DWithin(o.geog, c.simple_geog, 5000)
250
    ) AND (o.countryvalidity = 2 OR o.countryvalidity = 3)
251
    AND o.stateprovincevalidity IS NULL;
252
-- now recheck the 2's to see if any are within the stateprovince based
253
-- on the original (unsimplified) geometry, and if so, upgrade to 3
254
UPDATE geoscrub AS o
255
    SET stateprovincevalidity = 3
256
    WHERE EXISTS (
257
        SELECT 1
258
            FROM gadm2 c
259
            WHERE o.countrystd=c.name_0
260
              AND o.stateprovincestd=c.name_1
261
              AND ST_Intersects(o.geom, c.geom)
262
    ) AND o.stateprovincevalidity = 2;
263
264
-- 1 otherwise (have recognized stateprovince name, but point is not within
265
-- 5km of the asserted stateprovince)
266
UPDATE geoscrub o
267
    SET stateprovincevalidity = 1
268
    WHERE o.stateprovincevalidity IS NULL;
269
270
-----------------------------
271
-- validate county/parish --
272
-----------------------------
273
274
ALTER TABLE geoscrub ADD COLUMN countyvalidity int;
275
276
-- -1 if missing county name
277
UPDATE geoscrub o
278
    SET countyvalidity = -1
279
    WHERE o.country IS NULL
280
       OR o.stateprovince IS NULL
281
       OR o.county IS NULL;
282
283
-- 0 if a county name is asserted, but not recognized among GADM
284
-- county names
285
UPDATE geoscrub o
286
    SET countyvalidity = 0
287
    WHERE (o.countrystd IS NULL
288
        OR o.stateprovincestd IS NULL
289
        OR o.countystd IS NULL
290
        OR NOT EXISTS (
291
           SELECT 1
292
             FROM gadm2 c
293
             WHERE o.countrystd=c.name_0
294
               AND o.stateprovincestd=c.name_1
295
               AND o.countystd=c.name_2
296
        )
297
    ) AND o.countyvalidity IS NULL;
298
299
-- 3 if the point is in (or on the border of) the asserted county
300
UPDATE geoscrub AS o
301
    SET countyvalidity = 3
302
    WHERE EXISTS (
303
        SELECT 1
304
            FROM gadm2 c
305
            WHERE o.countrystd=c.name_0
306
              AND o.stateprovincestd=c.name_1
307
              AND o.countystd=c.name_2
308
              AND ST_Intersects(o.geom, c.simple_geom)
309
    ) AND o.stateprovincevalidity = 3
310
    AND o.countyvalidity IS NULL;
311
312
-- for those that aren't 3's, set to 2 if the point is in within 5km of
313
-- the asserted county
314
UPDATE geoscrub o
315
    SET countyvalidity = 2
316
    WHERE EXISTS (
317
        SELECT 1
318
            FROM gadm2 c
319
            WHERE o.countrystd=c.name_0
320
              AND o.stateprovincestd=c.name_1
321
              AND o.countystd=c.name_2
322
              AND ST_DWithin(o.geog, c.simple_geog, 5000)
323
    ) AND (o.stateprovincevalidity = 2 OR o.stateprovincevalidity = 3)
324
    AND o.countyvalidity IS NULL;
325
326
-- 1 otherwise (have recognized county name, but point is not within
327
-- 5km of the asserted county)
328
UPDATE geoscrub o
329
    SET countyvalidity = 1
330
    WHERE o.countyvalidity IS NULL;
331
332
--
333
-- Look up GADM place names based purely on coordinates for all points
334
--
335
336
ALTER TABLE geoscrub ADD COLUMN pointcountry text;
337
ALTER TABLE geoscrub ADD COLUMN pointstateprovince text;
338
ALTER TABLE geoscrub ADD COLUMN pointcounty text;
339
-- note: only using simplified geometry now, for speed reasons
340
UPDATE geoscrub o
341
    SET pointcountry = c.name_0,
342
        pointstateprovince = c.name_1,
343
        pointcounty = c.name_2
344
    FROM gadm2 c
345
    WHERE ST_Intersects(o.geom, c.simple_geom)
346
      AND o.latlonvalidity = 1;
347
348
SET enable_hashjoin = true;