1 |
10707
|
aaronmk
|
/*
|
2 |
|
|
SQL statements carrying out country name, stateprovince name, and
|
3 |
|
|
county name scrubbing on vegbien location data. Currently hard-coded
|
4 |
|
|
to operate on a 'vegbien_geoscrub' table that contains the input for
|
5 |
|
|
geoscrubbing, with the following columns:
|
6 |
|
|
decimallatitude (latitude, WGS84 decimal degrees)
|
7 |
|
|
decimallongitude (longitude, WGS84 decimal degrees)
|
8 |
|
|
country (original asserted country name)
|
9 |
|
|
stateprovince (original asserted stateprovince name)
|
10 |
|
|
county (original asserted county name)
|
11 |
|
|
(Note that the lat/lon columns are not needed for scrubbing, but are
|
12 |
|
|
used later for geovalidation.)
|
13 |
|
|
|
14 |
|
|
Misc issues to contemplate and possibly address:
|
15 |
|
|
|
16 |
|
|
* Congo can be either "Democratic Republic of the Congo" (COD) or
|
17 |
|
|
"Republic of the Congo" (COG), eh?
|
18 |
|
|
* Vietnam: gadm2 seems to treat provinces as level2 units
|
19 |
|
|
* Yugoslavia: no longer exists, not in geonames nor gadm2
|
20 |
|
|
|
21 |
|
|
* Azerbaijan: Yevlakh City and Yevlakh Rayon both ADM1 in geonames,
|
22 |
|
|
but just Yevlax in gadm2
|
23 |
|
|
|
24 |
|
|
* England is not a geoname country, only the UK is
|
25 |
|
|
|
26 |
|
|
* the general problem of historical names in cases where countries
|
27 |
|
|
have either been split, combined, removed (?), or otherwise
|
28 |
|
|
redefined
|
29 |
|
|
|
30 |
|
|
* Cases with multiple alternativename matches???
|
31 |
|
|
|
32 |
|
|
Notes:
|
33 |
|
|
- the vegbien_geoscrub input used during development and testing this
|
34 |
|
|
scrubbing procedure came from a select distinct on (lat, lon,
|
35 |
|
|
country, state, province) from analytical_aggregate on vegbiendev
|
36 |
|
|
|
37 |
|
|
Jim Regetz
|
38 |
|
|
NCEAS
|
39 |
|
|
Created Nov 2012
|
40 |
|
|
*/
|
41 |
|
|
|
42 |
11443
|
psarando
|
BEGIN;
|
43 |
10707
|
aaronmk
|
|
44 |
|
|
-------------------
|
45 |
|
|
-- scrub country --
|
46 |
|
|
-------------------
|
47 |
|
|
|
48 |
11443
|
psarando
|
INSERT INTO vcountry
|
49 |
10707
|
aaronmk
|
SELECT DISTINCT country
|
50 |
|
|
FROM vegbien_geoscrub
|
51 |
|
|
WHERE country IS NOT NULL;
|
52 |
|
|
-- SELECT 439
|
53 |
|
|
-- Time: 862.413 ms
|
54 |
|
|
|
55 |
|
|
-- try to clean up character representations
|
56 |
|
|
UPDATE vcountry
|
57 |
|
|
SET countryutf8 = u2L2u(country);
|
58 |
|
|
-- UPDATE 439
|
59 |
|
|
-- Time: 24.706 ms
|
60 |
|
|
|
61 |
|
|
-- try matching against direct country names
|
62 |
|
|
UPDATE vcountry vc
|
63 |
|
|
SET countryid = gc.geonameid
|
64 |
|
|
FROM countries gc
|
65 |
|
|
WHERE lower(vc.countryutf8) = lower(gc.country);
|
66 |
|
|
-- UPDATE 219
|
67 |
|
|
-- Time: 12.384 ms
|
68 |
|
|
|
69 |
|
|
-- try matching against ISO 3166-1 alpha-3
|
70 |
|
|
UPDATE vcountry vc
|
71 |
|
|
SET countryid = gc.geonameid
|
72 |
|
|
FROM countries gc
|
73 |
|
|
WHERE vc.countryutf8=gc.iso3
|
74 |
|
|
AND vc.countryid IS NULL;
|
75 |
|
|
-- UPDATE 1
|
76 |
|
|
-- Time: 20.146 ms
|
77 |
|
|
|
78 |
|
|
-- try matching against ISO 3166-1 alpha-2
|
79 |
|
|
UPDATE vcountry vc
|
80 |
|
|
SET countryid = gc.geonameid
|
81 |
|
|
FROM countries gc
|
82 |
|
|
WHERE vc.countryutf8 = iso
|
83 |
|
|
AND vc.countryid IS NULL;
|
84 |
|
|
-- UPDATE 64
|
85 |
|
|
-- Time: 3.378 ms
|
86 |
|
|
|
87 |
|
|
/*
|
88 |
|
|
-- try matching against FIPS, but not if already matched (i.e.,
|
89 |
|
|
-- the 2-digit ISO match takes preference)
|
90 |
|
|
UPDATE vcountry vc
|
91 |
|
|
SET countryid = gc.geonameid
|
92 |
|
|
FROM countries gc
|
93 |
|
|
WHERE vc.countryutf8 = fips
|
94 |
|
|
AND vc.countryid IS NULL;
|
95 |
|
|
-- UPDATE 1
|
96 |
|
|
-- Time: 2.514 ms
|
97 |
|
|
*/
|
98 |
|
|
|
99 |
|
|
-- try matching against BIEN alt country names
|
100 |
|
|
UPDATE vcountry vc
|
101 |
|
|
SET countryid = c.geonameid
|
102 |
|
|
FROM alt_country a, countries c
|
103 |
|
|
WHERE vc.countryutf8 = a.alternatename
|
104 |
|
|
AND a.country = c.country
|
105 |
|
|
AND vc.countryid IS NULL;
|
106 |
|
|
-- UPDATE 21
|
107 |
|
|
-- Time: 4.769 ms
|
108 |
|
|
|
109 |
|
|
-- try case-insensitive matching on geonames.org 'alternatenames'
|
110 |
|
|
-- note: this is a little sloppy because the set condition will return
|
111 |
|
|
-- more than one row if there are more than one matched alternate
|
112 |
|
|
-- country names, and which one gets used is non-deterministic; however,
|
113 |
|
|
-- we'll go back and remove those in the next update (specifically for
|
114 |
|
|
-- the cases where there are more than one *unique* matched countries)
|
115 |
|
|
UPDATE vcountry vc
|
116 |
|
|
SET countryid = gc.geonameid
|
117 |
|
|
FROM alternatenames a, countries gc
|
118 |
|
|
WHERE lower(vc.countryutf8) = lower(a.alternatename)
|
119 |
|
|
AND a.geonameid = gc.geonameid
|
120 |
|
|
AND vc.countryid IS NULL;
|
121 |
|
|
-- UPDATE 76
|
122 |
|
|
-- Time: 22019.592 ms
|
123 |
|
|
|
124 |
|
|
-- go back and remove if altname lookup was ambiguous
|
125 |
|
|
UPDATE vcountry vc
|
126 |
|
|
SET countryid = NULL
|
127 |
|
|
WHERE countryutf8 IN (
|
128 |
|
|
SELECT countryutf8
|
129 |
|
|
FROM (
|
130 |
|
|
SELECT DISTINCT vc.countryutf8, gc.iso3
|
131 |
|
|
FROM vcountry vc,
|
132 |
|
|
alternatenames a,
|
133 |
|
|
countries gc
|
134 |
|
|
WHERE lower(vc.countryutf8) = lower(a.alternatename)
|
135 |
|
|
AND a.geonameid = gc.geonameid) lookup
|
136 |
|
|
GROUP BY countryutf8
|
137 |
|
|
HAVING COUNT(*) > 1);
|
138 |
|
|
-- UPDATE 2
|
139 |
|
|
-- Time: 2296.257 ms
|
140 |
|
|
-- Removes: Congo, CONGO
|
141 |
|
|
|
142 |
|
|
-- merge in GADM country names (our standard names)
|
143 |
|
|
UPDATE vcountry vc
|
144 |
|
|
SET countrystd = name_0
|
145 |
|
|
FROM gadm_country_lookup gadm0
|
146 |
|
|
WHERE vc.countryid=gadm0.countryid;
|
147 |
|
|
-- UPDATE 377
|
148 |
|
|
-- Time: 6.254 ms
|
149 |
|
|
|
150 |
|
|
|
151 |
|
|
-------------------------
|
152 |
|
|
-- scrub stateprovince --
|
153 |
|
|
-------------------------
|
154 |
|
|
|
155 |
11443
|
psarando
|
INSERT INTO vstate
|
156 |
10707
|
aaronmk
|
SELECT DISTINCT countryid, stateprovince
|
157 |
|
|
FROM vegbien_geoscrub join vcountry using (country)
|
158 |
|
|
WHERE countryid IS NOT NULL
|
159 |
|
|
AND stateprovince IS NOT NULL;
|
160 |
|
|
-- SELECT 3312
|
161 |
|
|
-- Time: 4164.948 ms
|
162 |
|
|
|
163 |
|
|
-- try to clean up character representations
|
164 |
|
|
UPDATE vstate
|
165 |
|
|
SET stateprovinceutf8 = u2L2u(stateprovince);
|
166 |
|
|
-- UPDATE 3312
|
167 |
|
|
-- Time: 92.702 ms
|
168 |
|
|
|
169 |
|
|
-- replace common mojibake with valid utf8 replacement character
|
170 |
|
|
-- note: second pass of u2L2u would do the same thing, but this is a
|
171 |
|
|
-- little more explicit about the intended effect
|
172 |
|
|
UPDATE vstate
|
173 |
|
|
SET stateprovinceutf8 = replace(stateprovinceutf8, '�', chr(65533));
|
174 |
|
|
|
175 |
|
|
-- manually replace html character codes found in the names
|
176 |
|
|
-- todo: use some library to do this exhaustively rather than using
|
177 |
|
|
-- manually specified updates
|
178 |
|
|
UPDATE vstate
|
179 |
|
|
SET stateprovinceutf8 = replace(stateprovinceutf8, ''', chr(39));
|
180 |
|
|
UPDATE vstate
|
181 |
|
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'á', chr(225));
|
182 |
|
|
UPDATE vstate
|
183 |
|
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ã', chr(227));
|
184 |
|
|
UPDATE vstate
|
185 |
|
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'é', chr(233));
|
186 |
|
|
UPDATE vstate
|
187 |
|
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'é', chr(233));
|
188 |
|
|
UPDATE vstate
|
189 |
|
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'í', chr(237));
|
190 |
|
|
UPDATE vstate
|
191 |
|
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ñ', chr(241));
|
192 |
|
|
UPDATE vstate
|
193 |
|
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ó', chr(243));
|
194 |
|
|
UPDATE vstate
|
195 |
|
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ô', chr(244));
|
196 |
|
|
UPDATE vstate
|
197 |
|
|
SET stateprovinceutf8 = replace(stateprovinceutf8, 'ü', chr(252));
|
198 |
|
|
|
199 |
|
|
-- try matching against direct stateprovince names
|
200 |
|
|
UPDATE vstate vs
|
201 |
|
|
SET stateprovinceid = gn.geonameid
|
202 |
|
|
FROM geonames gn,
|
203 |
|
|
countries c
|
204 |
|
|
WHERE lower(vs.stateprovinceutf8) = lower(gn.name)
|
205 |
|
|
AND gn.countrycode=c.iso
|
206 |
|
|
AND c.geonameid=vs.countryid
|
207 |
|
|
AND gn.featurecode='ADM1';
|
208 |
|
|
-- UPDATE 238
|
209 |
|
|
-- Time: 2919.512 ms
|
210 |
|
|
|
211 |
|
|
-- try case-insensitive matching on geonames.org 'alternatenames'
|
212 |
|
|
-- note: this is a little sloppy because the set condition will return
|
213 |
|
|
-- more than one row if there are more than one matched alternate
|
214 |
|
|
-- stateprovince names, and which one gets used is non-deterministic
|
215 |
|
|
UPDATE vstate vs
|
216 |
|
|
SET stateprovinceid = gn.geonameid
|
217 |
|
|
FROM alternatenames a,
|
218 |
|
|
geonames gn,
|
219 |
|
|
countries c
|
220 |
|
|
WHERE lower(vs.stateprovinceutf8) = lower(a.alternatename)
|
221 |
|
|
AND a.geonameid = gn.geonameid
|
222 |
|
|
AND gn.countrycode=c.iso
|
223 |
|
|
AND c.geonameid=vs.countryid
|
224 |
|
|
AND gn.featurecode='ADM1'
|
225 |
|
|
AND vs.stateprovinceid IS NULL;
|
226 |
|
|
-- UPDATE 1281
|
227 |
|
|
-- Time: 5229.274 ms
|
228 |
|
|
|
229 |
|
|
-- try doing a 'like' query that ignores replacement characters (�)
|
230 |
|
|
UPDATE vstate vs
|
231 |
|
|
SET stateprovinceid = gn.geonameid
|
232 |
|
|
FROM alternatenames a,
|
233 |
|
|
geonames gn,
|
234 |
|
|
countries c
|
235 |
|
|
WHERE a.alternatename ILIKE
|
236 |
|
|
regexp_replace(stateprovinceutf8, chr(65533), '_')
|
237 |
|
|
AND stateprovinceutf8 LIKE '%'||chr(65533)||'%'
|
238 |
|
|
AND a.geonameid = gn.geonameid
|
239 |
|
|
AND gn.countrycode=c.iso
|
240 |
|
|
AND c.geonameid=vs.countryid
|
241 |
|
|
AND gn.featurecode='ADM1'
|
242 |
|
|
AND vs.stateprovinceid IS NULL;
|
243 |
|
|
-- UPDATE 112
|
244 |
|
|
-- Time: 6232.106 ms
|
245 |
|
|
|
246 |
|
|
-- try matching against alternatenames column in geonames table
|
247 |
|
|
UPDATE vstate vs
|
248 |
|
|
SET stateprovinceid = gn.geonameid
|
249 |
|
|
FROM geonames gn,
|
250 |
|
|
countries c
|
251 |
|
|
WHERE lower(vs.stateprovinceutf8) =
|
252 |
|
|
ANY (string_to_array(lower(gn.alternatenames), ','))
|
253 |
|
|
AND gn.countrycode=c.iso
|
254 |
|
|
AND c.geonameid=vs.countryid
|
255 |
|
|
AND gn.featurecode='ADM1'
|
256 |
|
|
AND vs.stateprovinceid IS NULL;
|
257 |
|
|
-- UPDATE 220
|
258 |
|
|
-- Time: 5053.520 ms
|
259 |
|
|
|
260 |
|
|
-- try matching against BIEN alt stateprovince names
|
261 |
|
|
UPDATE vstate vs
|
262 |
|
|
SET stateprovinceid = g.geonameid
|
263 |
|
|
FROM alt_stateprovince asp,
|
264 |
|
|
countries c,
|
265 |
|
|
geonames g
|
266 |
|
|
WHERE lower(vs.stateprovinceutf8) = lower(asp.alternatename)
|
267 |
|
|
AND asp.country = c.country
|
268 |
|
|
AND c.iso = g.countrycode
|
269 |
|
|
AND asp.stateprovince = g.name
|
270 |
|
|
AND g.featurecode='ADM1'
|
271 |
|
|
AND vs.stateprovinceid IS NULL;
|
272 |
|
|
-- UPDATE 45
|
273 |
|
|
-- Time: 3452.274 ms
|
274 |
|
|
|
275 |
|
|
-- todo: the same stuff done for country to deal with Congo-type case
|
276 |
|
|
|
277 |
|
|
-- merge in GADM stateprovince names (our standard names)
|
278 |
|
|
UPDATE vstate vs
|
279 |
|
|
SET stateprovincestd = name_1
|
280 |
|
|
FROM gadm_stateprovince_lookup gadm1
|
281 |
|
|
WHERE vs.stateprovinceid=gadm1.stateprovinceid;
|
282 |
|
|
-- UPDATE 1896
|
283 |
|
|
-- Time: 23.946 ms
|
284 |
|
|
|
285 |
|
|
-- match any remaining unknowns directly to gadm
|
286 |
|
|
UPDATE vstate vs
|
287 |
|
|
SET stateprovincestd = name_1
|
288 |
|
|
FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm,
|
289 |
|
|
vcountry vc
|
290 |
|
|
WHERE vc.countryid = vs.countryid
|
291 |
|
|
AND countrystd = name_0
|
292 |
|
|
AND lower(stateprovinceutf8) = lower(name_1)
|
293 |
|
|
AND stateprovincestd IS NULL;
|
294 |
|
|
-- UPDATE 55
|
295 |
|
|
-- Time: 4799.837 ms
|
296 |
|
|
|
297 |
|
|
-- try to recover geonameids for any of these so we can drill down to
|
298 |
|
|
-- the next level
|
299 |
|
|
UPDATE vstate vs
|
300 |
|
|
SET stateprovinceid = gadm1.stateprovinceid
|
301 |
|
|
FROM gadm_country_lookup gadm0,
|
302 |
|
|
gadm_stateprovince_lookup gadm1
|
303 |
|
|
WHERE vs.countryid = gadm0.countryid
|
304 |
|
|
AND gadm0.name_0 = gadm1.name_0
|
305 |
|
|
AND vs.stateprovincestd = gadm1.name_1
|
306 |
|
|
AND vs.stateprovinceid IS NULL;
|
307 |
|
|
-- UPDATE 0
|
308 |
|
|
-- Time: 13.882 ms
|
309 |
|
|
|
310 |
|
|
|
311 |
|
|
/* TODO: is this still needed???
|
312 |
|
|
-- now reverse map
|
313 |
|
|
INSERT INTO gadm_stateprovince_lookup
|
314 |
|
|
select name_0, alternatename as name_1, g.geonameid
|
315 |
|
|
from vstate vs,
|
316 |
|
|
alt_stateprovince asp,
|
317 |
|
|
geonames g,
|
318 |
|
|
countries c,
|
319 |
|
|
gadm_country_lookup gc
|
320 |
|
|
where vs.stateprovinceid=g.geonameid
|
321 |
|
|
AND asp.stateprovince=g.name
|
322 |
|
|
AND vs.countryid=c.geonameid
|
323 |
|
|
AND vs.countryid=gc.countryid
|
324 |
|
|
AND asp.country=c.country
|
325 |
|
|
AND vs.stateprovinceid not in (select stateprovinceid from gadm_stateprovince_lookup);
|
326 |
|
|
-- INSERT 0 4
|
327 |
|
|
-- Time: 20.228 ms
|
328 |
|
|
*/
|
329 |
|
|
|
330 |
|
|
|
331 |
|
|
------------------
|
332 |
|
|
-- scrub county --
|
333 |
|
|
------------------
|
334 |
|
|
|
335 |
11443
|
psarando
|
INSERT INTO vcounty
|
336 |
10707
|
aaronmk
|
SELECT DISTINCT countryid, stateprovinceid, county
|
337 |
|
|
FROM vegbien_geoscrub
|
338 |
|
|
JOIN vcountry USING (country)
|
339 |
|
|
JOIN vstate USING (countryid, stateprovince)
|
340 |
|
|
WHERE countryid IS NOT NULL
|
341 |
|
|
AND stateprovinceid IS NOT NULL
|
342 |
|
|
AND county IS NOT NULL;
|
343 |
|
|
-- SELECT 18715
|
344 |
|
|
-- Time: 3590.725 ms
|
345 |
|
|
|
346 |
|
|
-- try to clean up character representations
|
347 |
|
|
UPDATE vcounty
|
348 |
|
|
SET countyutf8 = u2L2u(county);
|
349 |
|
|
-- UPDATE 18715
|
350 |
|
|
-- Time: 683.781 ms
|
351 |
|
|
|
352 |
|
|
-- replace common mojibake with valid utf8 replacement character
|
353 |
|
|
-- note: second pass of u2L2u would do the same thing, but this is a
|
354 |
|
|
-- little more explicit about the intended effect
|
355 |
|
|
UPDATE vcounty
|
356 |
|
|
SET countyutf8 = replace(countyutf8, '�', chr(65533));
|
357 |
|
|
-- UPDATE 18715
|
358 |
|
|
-- Time: 153.395 ms
|
359 |
|
|
|
360 |
|
|
-- clean up some common municipality label abbreviations
|
361 |
|
|
-- todo: other fixes like this; check what brad did for bien2 on this front?
|
362 |
|
|
UPDATE vcounty
|
363 |
|
|
SET countyutf8 = regexp_replace(countyutf8, ' Co[.]?$', ' County', 'i');
|
364 |
|
|
|
365 |
|
|
-- try matching against geonames (names and alternatenames)
|
366 |
|
|
UPDATE vcounty vc
|
367 |
|
|
SET countyid = gn.geonameid
|
368 |
|
|
FROM geonames gn,
|
369 |
|
|
hierarchy h
|
370 |
|
|
WHERE vc.stateprovinceid = h.parentid
|
371 |
|
|
AND h.childid=gn.geonameid
|
372 |
|
|
AND (lower(vc.countyutf8) = lower(gn.name)
|
373 |
|
|
OR lower(vc.countyutf8) =
|
374 |
|
|
ANY (string_to_array(lower(gn.alternatenames), ',')))
|
375 |
|
|
AND gn.featurecode='ADM2';
|
376 |
|
|
-- UPDATE 6673
|
377 |
|
|
-- Time: 8369.737 ms
|
378 |
|
|
|
379 |
|
|
-- use alternatenames
|
380 |
|
|
UPDATE vcounty vc
|
381 |
|
|
SET countyid = gn.geonameid
|
382 |
|
|
FROM geonames gn,
|
383 |
|
|
alternatenames a,
|
384 |
|
|
hierarchy h
|
385 |
|
|
WHERE vc.stateprovinceid = h.parentid
|
386 |
|
|
AND h.childid = gn.geonameid
|
387 |
|
|
AND a.geonameid = gn.geonameid
|
388 |
|
|
AND lower(vc.countyutf8) = lower(a.alternatename)
|
389 |
|
|
AND gn.featurecode='ADM2'
|
390 |
|
|
AND vc.countyid IS NULL;
|
391 |
|
|
-- UPDATE 0
|
392 |
|
|
-- Time: 11255.926 ms
|
393 |
|
|
|
394 |
|
|
-- merge in GADM county names (our standard names)
|
395 |
|
|
-- for cases where we have geonameid
|
396 |
|
|
UPDATE vcounty vs
|
397 |
|
|
SET countystd = name_2
|
398 |
|
|
FROM gadm_county_lookup gadm1
|
399 |
|
|
WHERE vs.countyid=gadm1.countyid
|
400 |
|
|
AND countystd IS NULL;
|
401 |
|
|
-- UPDATE 6673
|
402 |
|
|
-- Time: 118.378 ms
|
403 |
|
|
|
404 |
|
|
-- match directly to gadm
|
405 |
|
|
UPDATE vcounty vco
|
406 |
|
|
SET countystd = name_2
|
407 |
|
|
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm,
|
408 |
|
|
vcountry vc,
|
409 |
|
|
vstate vs
|
410 |
|
|
WHERE vco.countryid = vc.countryid
|
411 |
|
|
AND vco.stateprovinceid = vs.stateprovinceid
|
412 |
|
|
AND countrystd = name_0
|
413 |
|
|
AND stateprovincestd = name_1
|
414 |
|
|
AND lower(countyutf8) = lower(name_2)
|
415 |
|
|
AND countystd IS NULL;
|
416 |
|
|
-- UPDATE 5027
|
417 |
|
|
-- Time: 1410.377 ms
|
418 |
|
|
|
419 |
11443
|
psarando
|
COMMIT;
|