Revision 10707
Added by Aaron Marcuse-Kubitza over 11 years ago
derived/biengeo/geonames-assessment.sql | ||
---|---|---|
1 |
/* |
|
2 |
|
|
3 |
Misc notes on how well we did scrubbing names. And lots of misc SQL |
|
4 |
statements for examining various scrubbing outcomes, some of which may |
|
5 |
not be so relevant anymore as a consequences of other changes. |
|
6 |
|
|
7 |
Provisional (and not necessarily fully updated) summary of geoname |
|
8 |
scrubbing performance on vegbien_geoscrub |
|
9 |
(= geoscrub_input from vegbien schema public.2012-11-04-07-34-10.r5984) |
|
10 |
======================================================================== |
|
11 |
1707970 total unique location determinations |
|
12 |
32541 distinct on country, stateprovince, county (including all nulls) |
|
13 |
======================================================================== |
|
14 |
* Country * |
|
15 |
1485885 assertions |
|
16 |
1479338 mapped to geonames |
|
17 |
(6547 not mapped to geonames) |
|
18 |
1479338 mapped to gadm2 |
|
19 |
(6547 not mapped to gadm2) |
|
20 |
(0 cases of failed mapping to gadm2 name_0) |
|
21 |
1479338 mapped both to geonames and to gadm2 |
|
22 |
======================================================================== |
|
23 |
* Country with State/Province * |
|
24 |
1408955 assertions |
|
25 |
1388525 mapped to geonames |
|
26 |
(20430 not mapped to geonames) |
|
27 |
(5576 of these had unmapped country) |
|
28 |
1389239 mapped to gadm2 |
|
29 |
(19716 not mapped to gadm2) |
|
30 |
(5576 of these had unmapped country) |
|
31 |
1387915 mapped both to geonames and to gadm2 |
|
32 |
======================================================================== |
|
33 |
* Country with State/Province with County/Parish* |
|
34 |
276809 assertions |
|
35 |
145767 mapped to geonames |
|
36 |
213795 mapped to gadm2 |
|
37 |
140983 mapped both to geonames and to gadm2 |
|
38 |
======================================================================== |
|
39 |
|
|
40 |
======================================================================== |
|
41 |
Distinct country names: |
|
42 |
439 total unique country names (non-null) |
|
43 |
377 recognized unique country names (non-null) |
|
44 |
62 unrecognized unique country names (non-null) |
|
45 |
======================================================================== |
|
46 |
Distinct country+stateprovince names: |
|
47 |
4764 total unique country and stateprovince (stateprovince non-null) |
|
48 |
4704 total unique country and stateprovince (both non-null) |
|
49 |
2944 mapped to geonames |
|
50 |
(1760 not mapped to geonames) |
|
51 |
2950 mapped to gadm2 |
|
52 |
(1754 not mapped to gadm2) |
|
53 |
2885 mapped both to geonames and to gadm2 |
|
54 |
(1819 not mapped to both) |
|
55 |
------------------------------------------------------------------------ |
|
56 |
* Distinct mappedcountry+stateprovince names * |
|
57 |
3312 total unique |
|
58 |
1895 mapped to geonames |
|
59 |
(1417 not mapped to geonames) |
|
60 |
1901 mapped to gadm2 |
|
61 |
(1411 not mapped to gadm2) |
|
62 |
======================================================================== |
|
63 |
Distinct country+stateprovince+county names: |
|
64 |
27440 total unique country and stateprovince and county (county non-null) |
|
65 |
25764 total unique country and stateprovince and county (all non-null) |
|
66 |
8507 mapped to geonames |
|
67 |
(17257 not mapped to geonames) |
|
68 |
12610 mapped to gadm2 |
|
69 |
(13154 not mapped to gadm2) |
|
70 |
4917 mapped both to geonames and to gadm2 |
|
71 |
(20847 not mapped to both) |
|
72 |
------------------------------------------------------------------------ |
|
73 |
* Distinct mappedcountry+mappedstateprovince+county names * |
|
74 |
18715 total unique |
|
75 |
6673 mapped to geonames |
|
76 |
(12042 not mapped to geonames) |
|
77 |
11310 mapped to gadm2 |
|
78 |
(7405 not mapped to gadm2) |
|
79 |
======================================================================== |
|
80 |
|
|
81 |
|
|
82 |
for comparison, here are bien2 stats based on boyle/donoghue scrubbing: |
|
83 |
753302 total unique location determinations |
|
84 |
have asserted country |
|
85 |
745321 have scrubbed country |
|
86 |
have asserted country and stateprovince |
|
87 |
have scrubbed country and stateprovince |
|
88 |
have asserted country and stateprovince and county |
|
89 |
have scrubbed country and stateprovince and county |
|
90 |
|
|
91 |
Jim Regetz |
|
92 |
NCEAS |
|
93 |
Created Nov 2012 |
|
94 |
*/ |
|
95 |
|
|
96 |
-- Generate human-readable view of all distinct cases where we haven't |
|
97 |
-- obtained a match for the asserted country |
|
98 |
CREATE TEMP VIEW unmatched_country AS |
|
99 |
SELECT count(*), country FROM |
|
100 |
(SELECT |
|
101 |
CASE WHEN length(countryutf8)>4 |
|
102 |
THEN initcap(countryutf8) |
|
103 |
ELSE countryutf8 |
|
104 |
END AS country |
|
105 |
FROM scrubbed |
|
106 |
JOIN vcountry USING (country) |
|
107 |
WHERE scrubbed.country IS NOT NULL |
|
108 |
AND scrubbed.countryid IS NULL) summary |
|
109 |
GROUP BY country |
|
110 |
ORDER BY count DESC; |
|
111 |
-- Generate human-readable view of all distinct cases where we have |
|
112 |
-- obtained match for the asserted country, but not stateprovince |
|
113 |
CREATE TEMP VIEW unmatched_stateprovince AS |
|
114 |
SELECT count(*), country, stateprovince FROM |
|
115 |
(SELECT countries.country AS country, |
|
116 |
CASE WHEN length(stateprovinceutf8)>3 |
|
117 |
THEN initcap(stateprovinceutf8) |
|
118 |
ELSE stateprovinceutf8 |
|
119 |
END AS stateprovince |
|
120 |
FROM scrubbed |
|
121 |
LEFT JOIN vstate USING (countryid, stateprovince) |
|
122 |
LEFT JOIN countries ON vstate.countryid=geonameid |
|
123 |
WHERE scrubbed.stateprovince IS NOT NULL |
|
124 |
AND scrubbed.stateprovinceid IS NULL |
|
125 |
AND scrubbed.country IS NOT NULL) summary |
|
126 |
GROUP BY country, stateprovince |
|
127 |
ORDER BY count DESC; |
|
128 |
/* |
|
129 |
SELECT DISTINCT countries.country, |
|
130 |
CASE WHEN length(stateprovinceutf8)>3 |
|
131 |
THEN initcap(stateprovinceutf8) |
|
132 |
ELSE stateprovinceutf8 |
|
133 |
END AS stateprovince |
|
134 |
FROM scrubbed |
|
135 |
JOIN vstate USING (countryid, stateprovince) |
|
136 |
JOIN countries ON vstate.countryid=geonameid |
|
137 |
WHERE scrubbed.stateprovince IS NOT NULL |
|
138 |
AND scrubbed.stateprovinceid IS NULL |
|
139 |
ORDER BY country, stateprovince; |
|
140 |
*/ |
|
141 |
|
|
142 |
-- Generate human-readable view of all distinct cases where we have |
|
143 |
-- obtained match for the asserted country and stateprovince, but not |
|
144 |
-- county |
|
145 |
/* |
|
146 |
CREATE TEMP VIEW unmatched_county AS |
|
147 |
SELECT DISTINCT countries.country, |
|
148 |
CASE WHEN length(stateprovinceutf8)>3 |
|
149 |
THEN initcap(stateprovinceutf8) |
|
150 |
ELSE stateprovinceutf8 |
|
151 |
END AS stateprovince |
|
152 |
FROM scrubbed |
|
153 |
JOIN vstate USING (countryid, stateprovince) |
|
154 |
JOIN countries ON vstate.countryid=geonameid |
|
155 |
WHERE scrubbed.stateprovince IS NOT NULL |
|
156 |
AND scrubbed.stateprovinceid IS NULL |
|
157 |
ORDER BY country, stateprovince; |
|
158 |
*/ |
|
159 |
|
|
160 |
/* |
|
161 |
-- which states aren't in gadm2? |
|
162 |
select distinct iso3, s.name state |
|
163 |
from scrubbed |
|
164 |
join countries c on countryid=c.geonameid |
|
165 |
join geonames s on stateprovinceid=s.geonameid |
|
166 |
except select iso, name_1 state from gadm2; |
|
167 |
|
|
168 |
select country, count(*) |
|
169 |
from scrubbed s |
|
170 |
left join gadm_stateprovince_lookup using (stateprovinceid) |
|
171 |
where name_0 is null |
|
172 |
and stateprovinceid is not null |
|
173 |
group by country |
|
174 |
order by count desc; |
|
175 |
|
|
176 |
|
|
177 |
select name |
|
178 |
from gadm2 |
|
179 |
join alternatenames on name_1=alternatename |
|
180 |
join geonames using (geonameid) |
|
181 |
join countries on countrycode=countries.iso |
|
182 |
where featurecode='ADM1' |
|
183 |
and name_1='North Solomons'; |
|
184 |
*/ |
|
185 |
|
|
186 |
|
|
187 |
/* |
|
188 |
-- report distinct geonames-scrubbed country+stateprovince pairs that |
|
189 |
-- we didn't match to any gadm polygons |
|
190 |
SELECT DISTINCT c.country, g.name |
|
191 |
FROM countries c |
|
192 |
JOIN vstate v ON geonameid=countryid |
|
193 |
JOIN geonames g ON g.geonameid=v.stateprovinceid |
|
194 |
WHERE stateprovinceid NOT IN ( |
|
195 |
SELECT stateprovinceid FROM gadm_stateprovince_lookup |
|
196 |
) |
|
197 |
ORDER BY c.country, g.name; |
|
198 |
|
|
199 |
-- report failed stateprovince mappings from gadm2 to geonames |
|
200 |
SELECT countries.country AS countryname, |
|
201 |
name AS stateprovincename, |
|
202 |
count(*) |
|
203 |
FROM scrubbed |
|
204 |
LEFT JOIN gadm_stateprovince_lookup USING (stateprovinceid) |
|
205 |
LEFT JOIN countries ON countryid=countries.geonameid |
|
206 |
LEFT JOIN geonames ON stateprovinceid=geonames.geonameid |
|
207 |
WHERE stateprovinceid IS NOT NULL |
|
208 |
AND name_1 IS NULL |
|
209 |
GROUP BY countryname, stateprovincename |
|
210 |
ORDER BY countryname, stateprovincename; |
|
211 |
*/ |
|
212 |
|
|
213 |
/* |
|
214 |
select iso, name_1, geonameid as stateid |
|
215 |
from (select distinct iso, name_1 from gadm2) gadm2 |
|
216 |
left join ( |
|
217 |
select distinct gadm2.iso, |
|
218 |
name_1, |
|
219 |
alternatenames.geonameid, |
|
220 |
geonames.name |
|
221 |
from (select distinct iso, name_1 from gadm2) gadm2 |
|
222 |
join alternatenames on name_1=alternatename |
|
223 |
join geonames using (geonameid) |
|
224 |
join countries on countrycode=countries.iso |
|
225 |
where featurecode='ADM1' |
|
226 |
and gadm2.iso=countries.iso3) foo |
|
227 |
using (iso, name_1); |
|
228 |
*/ |
|
229 |
|
|
230 |
-- how did we do? |
|
231 |
|
|
232 |
-- country |
|
233 |
|
|
234 |
-- percent of unique locations with asserted country names |
|
235 |
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct |
|
236 |
FROM scrubbed WHERE country IS NOT NULL; |
|
237 |
-- 87.0% |
|
238 |
-- percent of unique locations with geoname-scrubbed country names |
|
239 |
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct |
|
240 |
FROM scrubbed WHERE countryid IS NOT NULL; |
|
241 |
-- 86.6% |
|
242 |
-- percent of unique locations with gadm-matchedl country names |
|
243 |
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct |
|
244 |
FROM scrubbed WHERE countryid IN (SELECT countryid FROM gadm_country_lookup); |
|
245 |
-- 86.6% |
|
246 |
|
|
247 |
-- stateprovince |
|
248 |
|
|
249 |
-- note: 212221 records have a stateprovince but no country! those are |
|
250 |
-- considered missing below |
|
251 |
|
|
252 |
-- percent of unique locations with asserted stateprovince names |
|
253 |
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct |
|
254 |
FROM scrubbed WHERE country IS NOT NULL AND stateprovince IS NOT NULL; |
|
255 |
-- 82.5% |
|
256 |
-- percent of unique locations with geoname-scrubbed stateprovince names |
|
257 |
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct |
|
258 |
FROM scrubbed WHERE stateprovinceid IS NOT NULL; |
|
259 |
-- 80.2% |
|
260 |
-- percent of unique locations with gadm-matchedl stateprovince names |
|
261 |
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct |
|
262 |
FROM scrubbed |
|
263 |
WHERE stateprovinceid IN (SELECT stateprovinceid FROM gadm_stateprovince_lookup); |
|
264 |
-- 79.5% |
|
265 |
|
|
266 |
-- stateprovince |
|
267 |
|
|
268 |
-- percent of unique locations with asserted county names |
|
269 |
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct |
|
270 |
FROM scrubbed |
|
271 |
WHERE country IS NOT NULL AND stateprovince IS NOT NULL AND county IS NOT NULL; |
|
272 |
-- 16.2% |
|
273 |
-- percent of unique locations with geoname-scrubbed county names |
|
274 |
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct |
|
275 |
FROM scrubbed WHERE countyid IS NOT NULL; |
|
276 |
-- 6.3% |
|
277 |
-- percent of unique locations with gadm-matchedl county names |
|
278 |
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct |
|
279 |
FROM scrubbed |
|
280 |
WHERE countyid IN (SELECT countyid FROM gadm_county_lookup); |
|
281 |
-- tbd |
|
282 |
|
|
283 |
SELECT CASE WHEN countryid IS NOT NULL |
|
284 |
THEN 'matched' |
|
285 |
ELSE 'unmatched' END AS matches, |
|
286 |
COUNT(*) |
|
287 |
FROM vcountry |
|
288 |
GROUP BY matches; |
|
289 |
|
|
290 |
SELECT CASE WHEN stateprovinceid IS NOT NULL |
|
291 |
THEN 'matched' |
|
292 |
ELSE 'unmatched' END AS matches, |
|
293 |
COUNT(*) |
|
294 |
FROM vstate |
|
295 |
GROUP BY matches; |
|
296 |
-- matched | 1770 |
|
297 |
-- unmatched | 1541 |
|
298 |
|
|
299 |
SELECT CASE WHEN countyid IS NOT NULL |
|
300 |
THEN 'matched' |
|
301 |
ELSE 'unmatched' END AS matches, |
|
302 |
COUNT(*) |
|
303 |
FROM vcounty |
|
304 |
GROUP BY matches; |
|
305 |
-- matched | 4295 |
|
306 |
-- unmatched | 14354 |
|
307 |
|
|
308 |
SELECT CASE WHEN country IS NULL |
|
309 |
THEN 'no country provided' |
|
310 |
WHEN country IS NOT NULL AND countryid IS NULL |
|
311 |
THEN 'unrecognized country' |
|
312 |
ELSE |
|
313 |
'scrubbed country' |
|
314 |
END results, |
|
315 |
COUNT(*) |
|
316 |
FROM scrubbed |
|
317 |
GROUP BY results; |
|
318 |
|
|
319 |
SELECT CASE WHEN country IS NULL |
|
320 |
THEN 'no fully qualified stateprovince provided' |
|
321 |
WHEN country IS NOT NULL AND stateprovince IS NOT NULL AND stateprovinceid IS NULL |
|
322 |
THEN 'unrecognized stateprovince' |
|
323 |
ELSE |
|
324 |
'scrubbed stateprovince' |
|
325 |
END results, |
|
326 |
COUNT(*) |
|
327 |
FROM scrubbed |
|
328 |
GROUP BY results; |
|
329 |
|
|
330 |
-- file output |
|
331 |
SELECT DISTINCT countryid, country, |
|
332 |
stateprovinceid, stateprovince, |
|
333 |
countyid, county |
|
334 |
FROM scrubbed |
|
335 |
ORDER BY country, stateprovince, county; |
|
336 |
|
|
337 |
/* |
|
338 |
-- look at the whole iso2 vs fips thing |
|
339 |
SELECT DISTINCT vc.country, i.country iso, f.country fips, stateprovince |
|
340 |
FROM vcountry vc |
|
341 |
LEFT JOIN countries i ON vc.country=i.iso |
|
342 |
LEFT JOIN countries f ON vc.country=f.fips |
|
343 |
LEFT JOIN scrubbed s ON s.country=vc.country |
|
344 |
WHERE vc.countryid IS NULL |
|
345 |
AND i.country IS NOT NULL |
|
346 |
AND f.country IS NOT NULL |
|
347 |
ORDER BY vc.country; |
|
348 |
*/ |
derived/biengeo/geonames-to-gadm.sql | ||
---|---|---|
1 |
/* |
|
2 |
SQL statements to generate tables mapping geonames features at each |
|
3 |
relevant administative level (country, state/province, county/parish) |
|
4 |
to GADM2 features. |
|
5 |
|
|
6 |
Todo: |
|
7 |
* Be tolerant to presence/absence of administrative qualifiers in |
|
8 |
names? (Already doing this mildly for County/Co/Co.) |
|
9 |
- Municipality |
|
10 |
- Barrio |
|
11 |
- District |
|
12 |
|
|
13 |
* Get rid of rudundancy |
|
14 |
* Try matching against GADM2 varname_* columns? |
|
15 |
|
|
16 |
Jim Regetz |
|
17 |
NCEAS |
|
18 |
Created Nov 2012 |
|
19 |
*/ |
|
20 |
|
|
21 |
----------------------- |
|
22 |
-- Level 0 (Country) -- |
|
23 |
----------------------- |
|
24 |
|
|
25 |
-- map gadm2 level0 to geonames countries |
|
26 |
CREATE TABLE gadm_country_lookup ( |
|
27 |
countryid integer primary key references geonames (geonameid), |
|
28 |
name_0 text |
|
29 |
); |
|
30 |
INSERT INTO gadm_country_lookup (countryid) |
|
31 |
SELECT DISTINCT geonameid |
|
32 |
FROM countries |
|
33 |
WHERE geonameid IS NOT NULL; |
|
34 |
-- INSERT 0 250 |
|
35 |
-- Time: 20.770 ms |
|
36 |
UPDATE gadm_country_lookup lu |
|
37 |
SET name_0 = gadm.name_0 |
|
38 |
FROM (SELECT DISTINCT iso, name_0 FROM gadm2) gadm, |
|
39 |
countries c |
|
40 |
WHERE lu.countryid = c.geonameid |
|
41 |
AND c.iso3 = gadm.iso; |
|
42 |
-- UPDATE 248 |
|
43 |
-- Time: 3562.601 ms |
|
44 |
|
|
45 |
------------------------------ |
|
46 |
-- Level 1 (State/Province) -- |
|
47 |
------------------------------ |
|
48 |
|
|
49 |
-- map gadm2 level1 to geonames ADM1 |
|
50 |
CREATE TABLE gadm_stateprovince_lookup ( |
|
51 |
stateprovinceid integer primary key references geonames (geonameid), |
|
52 |
name_0 text, |
|
53 |
name_1 text |
|
54 |
); |
|
55 |
INSERT INTO gadm_stateprovince_lookup (stateprovinceid) |
|
56 |
SELECT DISTINCT geonameid |
|
57 |
FROM geonames |
|
58 |
WHERE featurecode='ADM1'; |
|
59 |
-- INSERT 0 3841 |
|
60 |
-- Time: 7085.635 ms |
|
61 |
|
|
62 |
|
|
63 |
-- try matching against alternatenames table |
|
64 |
UPDATE gadm_stateprovince_lookup gs |
|
65 |
SET name_0 = gadm.name_0, |
|
66 |
name_1 = gadm.name_1 |
|
67 |
FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm, |
|
68 |
alternatenames a, |
|
69 |
geonames g, |
|
70 |
hierarchy h, |
|
71 |
gadm_country_lookup gc |
|
72 |
WHERE lower(gadm.name_1)=lower(alternatename) |
|
73 |
AND gs.stateprovinceid = a.geonameid |
|
74 |
AND a.geonameid = g.geonameid |
|
75 |
AND gs.stateprovinceid = h.childid |
|
76 |
AND h.parentid = gc.countryid |
|
77 |
AND gc.name_0 = gadm.name_0 |
|
78 |
AND g.featurecode='ADM1' |
|
79 |
AND gs.name_1 IS NULL; |
|
80 |
-- UPDATE 2145 |
|
81 |
-- Time: 3453.568 ms |
|
82 |
|
|
83 |
-- try matching against geonames (names and alternatenames) |
|
84 |
UPDATE gadm_stateprovince_lookup gs |
|
85 |
SET name_0 = gadm.name_0, |
|
86 |
name_1 = gadm.name_1 |
|
87 |
FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm, |
|
88 |
geonames g, |
|
89 |
hierarchy h, |
|
90 |
gadm_country_lookup gc |
|
91 |
WHERE (lower(gadm.name_1)=lower(g.name) |
|
92 |
OR lower(gadm.name_1) = |
|
93 |
ANY (string_to_array(lower(g.alternatenames), ','))) |
|
94 |
AND gs.stateprovinceid = h.childid |
|
95 |
AND h.parentid = gc.countryid |
|
96 |
AND gc.name_0 = gadm.name_0 |
|
97 |
AND gs.stateprovinceid = g.geonameid |
|
98 |
AND g.featurecode='ADM1' |
|
99 |
AND gs.name_1 IS NULL; |
|
100 |
-- UPDATE 319 |
|
101 |
-- Time: 857.885 ms |
|
102 |
|
|
103 |
|
|
104 |
-- now again but against our manual mapping |
|
105 |
UPDATE gadm_stateprovince_lookup gs |
|
106 |
SET name_0 = gadm.name_0, |
|
107 |
name_1 = gadm.name_1 |
|
108 |
FROM (SELECT DISTINCT iso, name_0, name_1 FROM gadm2) gadm, |
|
109 |
alt_stateprovince asp, |
|
110 |
geonames g, |
|
111 |
countries c |
|
112 |
WHERE gadm.name_1=asp.alternatename |
|
113 |
AND gadm.iso=c.iso3 |
|
114 |
AND asp.country=c.country |
|
115 |
AND gs.stateprovinceid = g.geonameid |
|
116 |
AND g.countrycode=c.iso |
|
117 |
AND g.name=asp.stateprovince |
|
118 |
AND gs.name_0 IS NULL; |
|
119 |
-- UPDATE 29 |
|
120 |
-- Time: 777.376 ms |
|
121 |
|
|
122 |
---------------------- |
|
123 |
-- Level 2 (County) -- |
|
124 |
---------------------- |
|
125 |
|
|
126 |
-- map gadm2 level2 to geonames ADM2 |
|
127 |
CREATE TABLE gadm_county_lookup ( |
|
128 |
countyid integer primary key references geonames (geonameid), |
|
129 |
name_0 text, |
|
130 |
name_1 text, |
|
131 |
name_2 text |
|
132 |
); |
|
133 |
INSERT INTO gadm_county_lookup (countyid) |
|
134 |
SELECT DISTINCT geonameid |
|
135 |
FROM geonames |
|
136 |
WHERE featurecode='ADM2'; |
|
137 |
-- INSERT 0 32374 |
|
138 |
-- Time: 13177.539 ms |
|
139 |
|
|
140 |
-- try matching against geonames (names and alternatenames) |
|
141 |
UPDATE gadm_county_lookup gcl |
|
142 |
SET name_0 = gadm.name_0, |
|
143 |
name_1 = gadm.name_1, |
|
144 |
name_2 = gadm.name_2 |
|
145 |
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm, |
|
146 |
geonames g, |
|
147 |
hierarchy h, |
|
148 |
gadm_stateprovince_lookup gsl |
|
149 |
WHERE (lower(gadm.name_2)=lower(g.name) |
|
150 |
OR lower(gadm.name_2) = |
|
151 |
ANY (string_to_array(lower(g.alternatenames), ','))) |
|
152 |
AND gcl.countyid = g.geonameid |
|
153 |
AND gcl.countyid = h.childid |
|
154 |
AND h.parentid = gsl.stateprovinceid |
|
155 |
AND gsl.name_0 = gadm.name_0 |
|
156 |
AND gsl.name_1 = gadm.name_1 |
|
157 |
AND g.featurecode='ADM2'; |
|
158 |
-- UPDATE 12352 |
|
159 |
-- Time: 27390.357 ms |
|
160 |
|
|
161 |
-- try matching against alternatenames table |
|
162 |
UPDATE gadm_county_lookup gcl |
|
163 |
SET name_0 = gadm.name_0, |
|
164 |
name_1 = gadm.name_1, |
|
165 |
name_2 = gadm.name_2 |
|
166 |
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm, |
|
167 |
alternatenames a, |
|
168 |
hierarchy h, |
|
169 |
geonames g, |
|
170 |
gadm_stateprovince_lookup gsl |
|
171 |
WHERE lower(gadm.name_2)=lower(a.alternatename) |
|
172 |
AND gcl.countyid = a.geonameid |
|
173 |
AND a.geonameid = g.geonameid |
|
174 |
AND gcl.countyid = h.childid |
|
175 |
AND h.parentid = gsl.stateprovinceid |
|
176 |
AND gsl.name_0 = gadm.name_0 |
|
177 |
AND gsl.name_1 = gadm.name_1 |
|
178 |
AND g.featurecode='ADM2' |
|
179 |
AND gcl.name_2 IS NULL; |
|
180 |
-- UPDATE 0 |
|
181 |
-- Time: 6340.441 ms |
|
182 |
|
|
183 |
-- map geonames '/Foo/ County' to gadm2 '/Foo/' |
|
184 |
-- todo: other mappings like this??? |
|
185 |
UPDATE gadm_county_lookup gcl |
|
186 |
SET name_0 = gadm.name_0, |
|
187 |
name_1 = gadm.name_1, |
|
188 |
name_2 = gadm.name_2 |
|
189 |
FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm, |
|
190 |
geonames g, |
|
191 |
hierarchy h, |
|
192 |
gadm_stateprovince_lookup gsl |
|
193 |
WHERE lower(gadm.name_2||' County')=lower(g.name) |
|
194 |
AND gcl.countyid = g.geonameid |
|
195 |
AND gcl.countyid = h.childid |
|
196 |
AND h.parentid = gsl.stateprovinceid |
|
197 |
AND gsl.name_0 = gadm.name_0 |
|
198 |
AND gsl.name_1 = gadm.name_1 |
|
199 |
AND g.featurecode='ADM2' |
|
200 |
AND gcl.name_2 IS NULL; |
|
201 |
-- UPDATE 3000 |
|
202 |
-- Time: 22248.393 ms |
|
203 |
|
|
204 |
|
|
205 |
/* |
|
206 |
-- another way to create gadm_stateprovince_lookup, though this won't |
|
207 |
-- pick up multiple matches from geonames (good or bad??) |
|
208 |
CREATE TABLE gadm_stateprovince_lookup AS |
|
209 |
SELECT DISTINCT name_0, name_1 |
|
210 |
FROM gadm2; |
|
211 |
ALTER TABLE gadm_stateprovince_lookup ADD COLUMN stateprovinceid integer; |
|
212 |
UPDATE gadm_stateprovince_lookup gs |
|
213 |
SET stateprovinceid = g.geonameid |
|
214 |
FROM gadm_country_lookup gc, |
|
215 |
alternatenames a, |
|
216 |
geonames g, |
|
217 |
countries c |
|
218 |
WHERE gs.name_0=gc.name_0 |
|
219 |
AND gs.name_1=a.alternatename |
|
220 |
AND a.geonameid=g.geonameid |
|
221 |
AND g.countrycode=c.iso |
|
222 |
AND gc.countryid=c.geonameid |
|
223 |
AND g.featurecode='ADM1'; |
|
224 |
-- UPDATE 2137 |
|
225 |
-- Time: 600.345 ms |
|
226 |
*/ |
|
227 |
|
|
228 |
|
|
229 |
|
|
230 |
/* |
|
231 |
-- these geonameids match multiple gadm2 state/provinces, and that's bad |
|
232 |
-- because we don't know which one to use for geovalidation |
|
233 |
select stateprovinceid, name_0, array_agg(name_1) as name_1 |
|
234 |
from gadm_stateprovince_lookup |
|
235 |
group by stateprovinceid, name_0 |
|
236 |
having count(*)>1; |
|
237 |
|
|
238 |
stateprovinceid | name_0 | name_1 |
|
239 |
-----------------+----------+-------------------------------------- |
|
240 |
3653890 | Ecuador | {Orellana,Orellana} |
|
241 |
453751 | Bulgaria | {Razgrad,Ruse} |
|
242 |
1831095 | Cambodia | {"Phnom Penh",Kândal} |
|
243 |
1506272 | Russia | {Altay,Gorno-Altay} |
|
244 |
128222 | Iran | {Kermanshah,Kordestan} |
|
245 |
3457415 | Brazil | {"Mato Grosso do Sul","Mato Grosso"} |
|
246 |
170652 | Syria | {Damascus,"Rif Dimashq"} |
|
247 |
(7 rows) |
|
248 |
|
|
249 |
-- for now, manually clean up after these ambiguous cases |
|
250 |
-- ... this countryid should only apply to Mato Grosso do Sul |
|
251 |
DELETE FROM gadm_stateprovince_lookup |
|
252 |
WHERE name_0 = 'Brazil' |
|
253 |
AND name_1 = 'Mato Grosso' |
|
254 |
AND stateprovinceid = '3457415'; |
|
255 |
*/ |
|
256 |
|
|
257 |
/* |
|
258 |
-- these gadm2 state/provinces match multiple geonameids, but i'm pretty sure |
|
259 |
-- we don't really care in this direction |
|
260 |
select iso, name_1, count(*) |
|
261 |
from (select distinct iso, name_1 from gadm2) gadm2 |
|
262 |
left join ( |
|
263 |
select distinct gadm2.iso, |
|
264 |
name_1, |
|
265 |
alternatenames.geonameid, |
|
266 |
geonames.name |
|
267 |
from (select distinct iso, name_1 from gadm2) gadm2 |
|
268 |
join alternatenames on name_1=alternatename |
|
269 |
join geonames using (geonameid) |
|
270 |
join countries on countrycode=countries.iso |
|
271 |
where featurecode='ADM1' |
|
272 |
and gadm2.iso=countries.iso3) foo |
|
273 |
using (iso, name_1) group by iso, name_1 having count(*)>1; |
|
274 |
|
|
275 |
iso | name_1 | count |
|
276 |
-----+-------------+------- |
|
277 |
AZE | Yevlax | 2 |
|
278 |
BGR | Ruse | 2 |
|
279 |
BMU | Hamilton | 2 |
|
280 |
BRA | Mato Grosso | 2 |
|
281 |
KAZ | Almaty | 2 |
|
282 |
KHM | Phnom Penh | 2 |
|
283 |
RUS | Altay | 2 |
|
284 |
RUS | Moskva | 2 |
|
285 |
(8 rows) |
|
286 |
*/ |
|
287 |
|
derived/biengeo/geovalidate.sql | ||
---|---|---|
1 |
/* |
|
2 |
SQL statements carrying out geovalidation on vegbien location data. |
|
3 |
Currently hard-coded to operate on the 'geoscrub' table that is |
|
4 |
assumed to have been generated by the geonames scrubbing process |
|
5 |
(see geonames.sql). This table should minimally contain the following |
|
6 |
columns: |
|
7 |
decimallatitude (latitude, WGS84 decimal degrees) |
|
8 |
decimallongitude (longitude, WGS84 decimal degrees) |
|
9 |
country (original asserted country name) |
|
10 |
stateprovince (original asserted stateprovince name) |
|
11 |
county (original asserted county name) |
|
12 |
countrystd (GADM2-mapped country name) |
|
13 |
stateprovincestd (GADM2-mapped state/province name) |
|
14 |
countystd (GADM2-mapped county name) |
|
15 |
(Note that the geonames.sql script also currently creates columns with |
|
16 |
geonames.org IDs for country, stateprovince, and county.) |
|
17 |
|
|
18 |
After execution of the statements below, the table will contain the |
|
19 |
following new columns (along with geom and geog columns used by |
|
20 |
postgis): |
|
21 |
latlonvalidity (validity score for lat/lon coordinate) |
|
22 |
countryvalidity (validity score for country) |
|
23 |
stateprovincevalidity (validity score for stateprovince) |
|
24 |
countyvalidity (validity score for county) |
|
25 |
pointcountry (GADM2 country containing the point) |
|
26 |
pointstateprovince (GADM2 stateprovince containing the point) |
|
27 |
pointcounty (GADM2 county containing the point) |
|
28 |
|
|
29 |
Workflow with actual times (as executed 15-Nov-2012) |
|
30 |
12s create point geoms |
|
31 |
16s create point geogs |
|
32 |
23s index geoms |
|
33 |
26s index geogs |
|
34 |
37s index countrystd |
|
35 |
25s index stateprovincestd |
|
36 |
5s vacuum analyze |
|
37 |
1s mark missing coords (UPDATE 0) |
|
38 |
133s mark valid coords (UPDATE 1702989) |
|
39 |
2s mark invalid coords (UPDATE 4981) |
|
40 |
17s vacuum analyze |
|
41 |
27s mark missing countries (UPDATE 222085) |
|
42 |
36s mark non-gadm countries (UPDATE 6547) |
|
43 |
1277s mark point in country (UPDATE 1400627) |
|
44 |
1491s mark point near country (UPDATE 24626) |
|
45 |
573s mark point near country accurate (UPDATE 1182) |
|
46 |
0s mark point near Antarctica (UPDATE 0) |
|
47 |
2s mark point not in country (UPDATE 54085) |
|
48 |
8s vacuum analyze |
|
49 |
15s mark missing stateprovinces (UPDATE 299015) |
|
50 |
12s mark non-gadm stateprovinces (UPDATE 19716) |
|
51 |
920s mark point in stateprovince (UPDATE 1241068) |
|
52 |
864s mark point near stateprovince (UPDATE 35685) |
|
53 |
424s mark point near stateprovince accurate (UPDATE 1051) |
|
54 |
7s mark point not in stateprovince (UPDATE 111282) |
|
55 |
s vacuum analyze |
|
56 |
128s mark missing counties (UPDATE 1431161) |
|
57 |
29s mark non-gadm counties (UPDATE 63014) |
|
58 |
86s mark point in county (UPDATE 174469) |
|
59 |
65s mark point near county (UPDATE 12477) |
|
60 |
?s mark point near county accurate (UPDATE ?) |
|
61 |
3s mark point not in county (UPDATE 26849) |
|
62 |
?s vacuum analyze |
|
63 |
890s look up GADM location using given lat/lon (UPDATE 1656709) |
|
64 |
|
|
65 |
todo: |
|
66 |
* make firm decision about whether we should assume here that *std |
|
67 |
columns contain valid GADM names (or are NULL), or whether we should |
|
68 |
check that here |
|
69 |
|
|
70 |
Jim Regetz |
|
71 |
NCEAS |
|
72 |
Created Nov 2012 |
|
73 |
*/ |
|
74 |
|
|
75 |
-- generate point geometries from coordinates |
|
76 |
SELECT AddGeometryColumn('public', 'geoscrub', 'geom', 4326, 'POINT', 2 ); |
|
77 |
UPDATE geoscrub |
|
78 |
SET geom = ST_SetSRID(ST_Point(decimallongitude, decimallatitude), 4326); |
|
79 |
-- ... create point geographies too ... |
|
80 |
-- note that this excludes (leaves null) any occurrences with coordinates |
|
81 |
-- that are not valid decimal degrees |
|
82 |
ALTER TABLE geoscrub ADD COLUMN geog geography(POINT, 4326); |
|
83 |
UPDATE geoscrub |
|
84 |
SET geog = geom::geography |
|
85 |
WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90)); |
|
86 |
|
|
87 |
-- create indexes |
|
88 |
CREATE INDEX "geoscrub_geom_gist" ON geoscrub USING GIST (geom); |
|
89 |
CREATE INDEX "geoscrub_geog_gist" ON geoscrub USING GIST (geog); |
|
90 |
CREATE INDEX "geoscrub_countrystd_idx" ON geoscrub (countrystd); |
|
91 |
CREATE INDEX "geoscrub_stateprovincestd_idx" ON geoscrub (stateprovincestd); |
|
92 |
CREATE INDEX "geoscrub_countystd_idx" ON geoscrub (countystd); |
|
93 |
|
|
94 |
-- may want to do this first, otherwise the query planner wants to do a |
|
95 |
-- hash join instead of nested loop join, and at least when I use a limit |
|
96 |
-- statement to time queries on either side of the threshold where it |
|
97 |
-- switches, the hash join does waaaaay worse |
|
98 |
SET enable_hashjoin = false; |
|
99 |
|
|
100 |
-- |
|
101 |
-- validate lat/lon coordinates |
|
102 |
-- |
|
103 |
|
|
104 |
VACUUM ANALYZE geoscrub; |
|
105 |
|
|
106 |
ALTER TABLE geoscrub ADD COLUMN latlonvalidity int; |
|
107 |
|
|
108 |
-- -1 if missing one or both coordinates |
|
109 |
UPDATE geoscrub o |
|
110 |
SET latlonvalidity = -1 |
|
111 |
WHERE ( |
|
112 |
decimallatitude IS NULL |
|
113 |
OR decimallongitude IS NULL |
|
114 |
); |
|
115 |
|
|
116 |
-- 1 if the coordinates falls within the global bounding box |
|
117 |
UPDATE geoscrub o |
|
118 |
SET latlonvalidity = 1 |
|
119 |
WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90)); |
|
120 |
|
|
121 |
-- 0 otherwise (have coordinates, but not valid) |
|
122 |
UPDATE geoscrub o |
|
123 |
SET latlonvalidity = 0 |
|
124 |
WHERE o.latlonvalidity IS NULL; |
|
125 |
|
|
126 |
-- |
|
127 |
-- validate country |
|
128 |
-- |
|
129 |
|
|
130 |
VACUUM ANALYZE geoscrub; |
|
131 |
|
|
132 |
ALTER TABLE geoscrub ADD COLUMN countryvalidity int; |
|
133 |
|
|
134 |
-- -1 if missing country name |
|
135 |
UPDATE geoscrub o |
|
136 |
SET countryvalidity = -1 |
|
137 |
WHERE o.country IS NULL; |
|
138 |
|
|
139 |
-- 0 if a country name is asserted, but not recognized among GADM |
|
140 |
-- country names |
|
141 |
UPDATE geoscrub o |
|
142 |
SET countryvalidity = 0 |
|
143 |
WHERE (o.countrystd IS NULL |
|
144 |
OR NOT EXISTS ( |
|
145 |
SELECT 1 |
|
146 |
FROM gadm2 c |
|
147 |
WHERE o.countrystd=c.name_0 |
|
148 |
) |
|
149 |
) AND o.countryvalidity IS NULL; |
|
150 |
|
|
151 |
-- 3 if the point is in (or on the border of) the asserted country |
|
152 |
UPDATE geoscrub AS o |
|
153 |
SET countryvalidity = 3 |
|
154 |
WHERE EXISTS ( |
|
155 |
SELECT 1 |
|
156 |
FROM gadm2 c |
|
157 |
WHERE o.countrystd=c.name_0 |
|
158 |
AND ST_Intersects(o.geom, c.simple_geom) |
|
159 |
) AND o.countryvalidity IS NULL; |
|
160 |
|
|
161 |
-- 2 for those that aren't 3's, but the point is in within 5km of |
|
162 |
-- the asserted country |
|
163 |
UPDATE geoscrub o |
|
164 |
SET countryvalidity = 2 |
|
165 |
WHERE EXISTS ( |
|
166 |
SELECT 1 |
|
167 |
FROM gadm2 c |
|
168 |
WHERE o.countrystd=c.name_0 |
|
169 |
AND ST_DWithin(o.geog, c.simple_geog, 5000) |
|
170 |
) AND o.countryvalidity IS NULL; |
|
171 |
-- now recheck the 2's to see if any are within the country based on the |
|
172 |
-- original (unsimplified) geometry, and if so, upgrade to 3 |
|
173 |
UPDATE geoscrub AS o |
|
174 |
SET countryvalidity = 3 |
|
175 |
WHERE EXISTS ( |
|
176 |
SELECT 1 |
|
177 |
FROM gadm2 c |
|
178 |
WHERE o.countrystd=c.name_0 |
|
179 |
AND ST_Intersects(o.geom, c.geom) |
|
180 |
) AND o.countryvalidity = 2; |
|
181 |
-- also handle special case Antartica, which has invalid geography in |
|
182 |
-- gadm2 and thus wasn't included in the within-5km check |
|
183 |
UPDATE geoscrub AS o |
|
184 |
SET countryvalidity = 3 |
|
185 |
WHERE EXISTS ( |
|
186 |
SELECT 1 |
|
187 |
FROM gadm2 c |
|
188 |
WHERE o.countrystd=c.name_0 |
|
189 |
AND ST_Intersects(o.geom, c.geom) |
|
190 |
) AND o.countryvalidity < 3 |
|
191 |
AND o.countrystd = 'Antarctica'; |
|
192 |
|
|
193 |
-- 1 otherwise (have recognized country name, but point is not within |
|
194 |
-- 5km of the asserted country) |
|
195 |
UPDATE geoscrub o |
|
196 |
SET countryvalidity = 1 |
|
197 |
WHERE o.countryvalidity IS NULL; |
|
198 |
|
|
199 |
|
|
200 |
----------------------------- |
|
201 |
-- validate state/province -- |
|
202 |
----------------------------- |
|
203 |
|
|
204 |
VACUUM ANALYZE geoscrub; |
|
205 |
|
|
206 |
ALTER TABLE geoscrub ADD COLUMN stateprovincevalidity int; |
|
207 |
|
|
208 |
-- -1 if missing stateprovince name |
|
209 |
UPDATE geoscrub o |
|
210 |
SET stateprovincevalidity = -1 |
|
211 |
WHERE o.country IS NULL |
|
212 |
OR o.stateprovince IS NULL; |
|
213 |
|
|
214 |
-- 0 if a stateprovince name is asserted, but not recognized among GADM |
|
215 |
-- stateprovince names |
|
216 |
UPDATE geoscrub o |
|
217 |
SET stateprovincevalidity = 0 |
|
218 |
WHERE (o.countrystd IS NULL |
|
219 |
OR o.stateprovincestd IS NULL |
|
220 |
OR NOT EXISTS ( |
|
221 |
SELECT 1 |
|
222 |
FROM gadm2 c |
|
223 |
WHERE o.countrystd=c.name_0 |
|
224 |
AND o.stateprovincestd=c.name_1 |
|
225 |
) |
|
226 |
) AND o.stateprovincevalidity IS NULL; |
|
227 |
|
|
228 |
-- 3 if the point is in (or on the border of) the asserted stateprovince |
|
229 |
UPDATE geoscrub AS o |
|
230 |
SET stateprovincevalidity = 3 |
|
231 |
WHERE EXISTS ( |
|
232 |
SELECT 1 |
|
233 |
FROM gadm2 c |
|
234 |
WHERE o.countrystd=c.name_0 |
|
235 |
AND o.stateprovincestd=c.name_1 |
|
236 |
AND ST_Intersects(o.geom, c.simple_geom) |
|
237 |
) AND o.countryvalidity = 3 |
|
238 |
AND o.stateprovincevalidity IS NULL; |
|
239 |
|
|
240 |
-- for those that aren't 3's, set to 2 if the point is in within 5km of |
|
241 |
-- the asserted stateprovince |
|
242 |
UPDATE geoscrub o |
|
243 |
SET stateprovincevalidity = 2 |
|
244 |
WHERE EXISTS ( |
|
245 |
SELECT 1 |
|
246 |
FROM gadm2 c |
|
247 |
WHERE o.countrystd=c.name_0 |
|
248 |
AND o.stateprovincestd=c.name_1 |
|
249 |
AND ST_DWithin(o.geog, c.simple_geog, 5000) |
|
250 |
) AND (o.countryvalidity = 2 OR o.countryvalidity = 3) |
|
251 |
AND o.stateprovincevalidity IS NULL; |
|
252 |
-- now recheck the 2's to see if any are within the stateprovince based |
|
253 |
-- on the original (unsimplified) geometry, and if so, upgrade to 3 |
|
254 |
UPDATE geoscrub AS o |
|
255 |
SET stateprovincevalidity = 3 |
|
256 |
WHERE EXISTS ( |
|
257 |
SELECT 1 |
|
258 |
FROM gadm2 c |
|
259 |
WHERE o.countrystd=c.name_0 |
|
260 |
AND o.stateprovincestd=c.name_1 |
|
261 |
AND ST_Intersects(o.geom, c.geom) |
|
262 |
) AND o.stateprovincevalidity = 2; |
|
263 |
|
|
264 |
-- 1 otherwise (have recognized stateprovince name, but point is not within |
|
265 |
-- 5km of the asserted stateprovince) |
|
266 |
UPDATE geoscrub o |
|
267 |
SET stateprovincevalidity = 1 |
|
268 |
WHERE o.stateprovincevalidity IS NULL; |
|
269 |
|
|
270 |
----------------------------- |
|
271 |
-- validate county/parish -- |
|
272 |
----------------------------- |
|
273 |
|
|
274 |
ALTER TABLE geoscrub ADD COLUMN countyvalidity int; |
|
275 |
|
|
276 |
-- -1 if missing county name |
|
277 |
UPDATE geoscrub o |
|
278 |
SET countyvalidity = -1 |
|
279 |
WHERE o.country IS NULL |
|
280 |
OR o.stateprovince IS NULL |
|
281 |
OR o.county IS NULL; |
|
282 |
|
|
283 |
-- 0 if a county name is asserted, but not recognized among GADM |
|
284 |
-- county names |
|
285 |
UPDATE geoscrub o |
|
286 |
SET countyvalidity = 0 |
|
287 |
WHERE (o.countrystd IS NULL |
|
288 |
OR o.stateprovincestd IS NULL |
|
289 |
OR o.countystd IS NULL |
|
290 |
OR NOT EXISTS ( |
|
291 |
SELECT 1 |
|
292 |
FROM gadm2 c |
|
293 |
WHERE o.countrystd=c.name_0 |
|
294 |
AND o.stateprovincestd=c.name_1 |
|
295 |
AND o.countystd=c.name_2 |
|
296 |
) |
|
297 |
) AND o.countyvalidity IS NULL; |
|
298 |
|
|
299 |
-- 3 if the point is in (or on the border of) the asserted county |
|
300 |
UPDATE geoscrub AS o |
|
301 |
SET countyvalidity = 3 |
|
302 |
WHERE EXISTS ( |
|
303 |
SELECT 1 |
|
304 |
FROM gadm2 c |
|
305 |
WHERE o.countrystd=c.name_0 |
|
306 |
AND o.stateprovincestd=c.name_1 |
|
307 |
AND o.countystd=c.name_2 |
|
308 |
AND ST_Intersects(o.geom, c.simple_geom) |
|
309 |
) AND o.stateprovincevalidity = 3 |
|
310 |
AND o.countyvalidity IS NULL; |
|
311 |
|
|
312 |
-- for those that aren't 3's, set to 2 if the point is in within 5km of |
|
313 |
-- the asserted county |
|
314 |
UPDATE geoscrub o |
|
315 |
SET countyvalidity = 2 |
|
316 |
WHERE EXISTS ( |
|
317 |
SELECT 1 |
|
318 |
FROM gadm2 c |
|
319 |
WHERE o.countrystd=c.name_0 |
|
320 |
AND o.stateprovincestd=c.name_1 |
|
321 |
AND o.countystd=c.name_2 |
|
322 |
AND ST_DWithin(o.geog, c.simple_geog, 5000) |
|
323 |
) AND (o.stateprovincevalidity = 2 OR o.stateprovincevalidity = 3) |
|
324 |
AND o.countyvalidity IS NULL; |
|
325 |
|
|
326 |
-- 1 otherwise (have recognized county name, but point is not within |
|
327 |
-- 5km of the asserted county) |
|
328 |
UPDATE geoscrub o |
|
329 |
SET countyvalidity = 1 |
|
330 |
WHERE o.countyvalidity IS NULL; |
|
331 |
|
|
332 |
-- |
|
333 |
-- Look up GADM place names based purely on coordinates for all points |
|
334 |
-- |
|
335 |
|
|
336 |
ALTER TABLE geoscrub ADD COLUMN pointcountry text; |
|
337 |
ALTER TABLE geoscrub ADD COLUMN pointstateprovince text; |
|
338 |
ALTER TABLE geoscrub ADD COLUMN pointcounty text; |
|
339 |
-- note: only using simplified geometry now, for speed reasons |
|
340 |
UPDATE geoscrub o |
|
341 |
SET pointcountry = c.name_0, |
|
342 |
pointstateprovince = c.name_1, |
|
343 |
pointcounty = c.name_2 |
|
344 |
FROM gadm2 c |
|
345 |
WHERE ST_Intersects(o.geom, c.simple_geom) |
|
346 |
AND o.latlonvalidity = 1; |
|
347 |
|
|
348 |
SET enable_hashjoin = true; |
derived/biengeo/geonames.sh | ||
---|---|---|
1 |
# Script to geoscrub-enable a postgres database, developed in the |
|
2 |
# context of BIEN3 geoscrubbing but with intentions of generality. |
|
3 |
# |
|
4 |
# Basic workflow: |
|
5 |
# 1. Import table dumps obtained from geonames.org |
|
6 |
# 2. Add additional custom mapping info, including: |
|
7 |
# a. Madagascar provinces used in GADM2 but not geonames |
|
8 |
# b. extra country name mappings/fixes specific to BIEN3 data |
|
9 |
# c. extra state/prov name mappings/fixes specific to BIEN3 data |
|
10 |
# d. custom state/prov name mappings binding GADM2 to geonames |
|
11 |
# |
|
12 |
# Notes: |
|
13 |
# * One of the INSERT statements below contains the string 'Hawai\`i'. |
|
14 |
# The backslash is there to escape the backtick for bash, so that |
|
15 |
# what gets inserted into the database is 'Hawai`i'. If manually |
|
16 |
# copy-and-pasting the statement into psql rather than running as a |
|
17 |
# script (i.e., not having bash interpret the command), the backslash |
|
18 |
# must not be included. |
|
19 |
# |
|
20 |
# Jim Regetz |
|
21 |
# NCEAS |
|
22 |
# Created Nov 2012 |
|
23 |
|
|
24 |
# !! this is specific to regetz's testing environment !! |
|
25 |
DATADIR="$HOME/biengeo/geonames" |
|
26 |
cd ${DATADIR} |
|
27 |
|
|
28 |
# |
|
29 |
# load geonames.org geonames |
|
30 |
# |
|
31 |
|
|
32 |
wget http://download.geonames.org/export/dump/allCountries.zip |
|
33 |
unzip allCountries.zip |
|
34 |
|
|
35 |
psql -c \ |
|
36 |
'CREATE TABLE geonames ( |
|
37 |
geonameid serial primary key, |
|
38 |
name text, |
|
39 |
asciiname text, |
|
40 |
alternatenames text, |
|
41 |
latitude numeric, |
|
42 |
longitude numeric, |
|
43 |
featureclass char(1), |
|
44 |
featurecode text, |
|
45 |
countrycode char(2), |
|
46 |
cc2 char(60), |
|
47 |
admin1code text, |
|
48 |
admin2code text, |
|
49 |
admin3code text, |
|
50 |
admin4code text, |
|
51 |
population bigint, |
|
52 |
elevation int, |
|
53 |
dem int, |
|
54 |
timezone text, |
|
55 |
modification date |
|
56 |
)' geoscrub |
|
57 |
|
|
58 |
# import data |
|
59 |
psql -c "COPY geonames FROM '${DATADIR}/allCountries.txt' DELIMITER E'\t' NULL ''" geoscrub |
|
60 |
psql -c \ "SELECT setval('geonames_geonameid_seq', (SELECT max(geonameid) FROM geonames))" geoscrub |
|
61 |
psql -c \ "CREATE INDEX geonames_countrycode_idx ON geonames (countrycode)" geoscrub |
|
62 |
|
|
63 |
# |
|
64 |
# load geonames.org alternate names |
|
65 |
# |
|
66 |
|
|
67 |
wget http://download.geonames.org/export/dump/alternateNames.zip |
|
68 |
unzip alternateNames.zip |
|
69 |
|
|
70 |
psql -c \ |
|
71 |
'CREATE TABLE alternateNames ( |
|
72 |
alternateNameId serial primary key, |
|
73 |
geonameid int references geonames (geonameid), |
|
74 |
isolanguage varchar(7), |
|
75 |
alternateName varchar(200), |
|
76 |
isPreferredName char(1), |
|
77 |
isShortName char(1), |
|
78 |
isColloquial char(1), |
|
79 |
isHistoric char(1) |
|
80 |
)' geoscrub |
|
81 |
|
|
82 |
# import data |
|
83 |
psql -c "COPY alternateNames FROM '${DATADIR}/alternateNames.txt' DELIMITER E'\t' NULL ''" geoscrub |
|
84 |
psql -c \ "SELECT setval('alternatenames_alternatenameid_seq', (SELECT max(alternatenameid) FROM alternatenames))" geoscrub |
|
85 |
# 4m45.368s |
|
86 |
|
|
87 |
# index alternatenames |
|
88 |
psql -c 'CREATE INDEX "alternatenames_alternatename_idx" ON alternateNames (alternateName)' geoscrub |
|
89 |
# 10m42.827s |
|
90 |
|
|
91 |
# |
|
92 |
# load geonames.org country info |
|
93 |
# |
|
94 |
|
|
95 |
psql -c \ |
|
96 |
'CREATE TABLE countries ( |
|
97 |
iso char(2), |
|
98 |
iso3 char(3), |
|
99 |
isonumeric text, |
|
100 |
fips text, |
|
101 |
country text UNIQUE, |
|
102 |
capital text, |
|
103 |
area text, |
|
104 |
population text, |
|
105 |
continent text, |
|
106 |
tld text, |
|
107 |
currencyCode text, |
|
108 |
currencyName text, |
|
109 |
phone text, |
|
110 |
postalCodeFormat text, |
|
111 |
postalCodeRegex text, |
|
112 |
languages text, |
|
113 |
geonameid int, |
|
114 |
neighbours text, |
|
115 |
equivalentFipsCode text |
|
116 |
)' geoscrub |
|
117 |
|
|
118 |
# import geonames country table |
|
119 |
grep -ve '^#' $DATADIR/countryInfo.txt | \ |
|
120 |
psql -c "COPY countries FROM STDIN DELIMITER E'\t' NULL ''" geoscrub |
|
121 |
|
|
122 |
# |
|
123 |
# load geonames.org hierarchy info |
|
124 |
# |
|
125 |
|
|
126 |
wget http://download.geonames.org/export/dump/hierarchy.zip |
|
127 |
unzip hierarchy.zip |
|
128 |
|
|
129 |
psql -c \ |
|
130 |
'CREATE TABLE hierarchy ( |
|
131 |
parentId int,-- references geonames (geonameid), |
|
132 |
childId int,-- references geonames (geonameid), |
|
133 |
type text |
|
134 |
)' geoscrub |
|
135 |
|
|
136 |
# import geonames country table |
|
137 |
psql -c "COPY hierarchy FROM '${DATADIR}/hierarchy.txt' DELIMITER E'\t' NULL ''" geoscrub |
|
138 |
|
|
139 |
# |
|
140 |
# insert additional custom name-scrub mappings |
|
141 |
# |
|
142 |
|
|
143 |
# augment geonames with some additional places we need, and update |
|
144 |
# hierarchy table accordingly |
|
145 |
psql -c \ |
|
146 |
"WITH newnames AS ( |
|
147 |
INSERT INTO geonames |
|
148 |
(name, alternatenames, featurecode, countrycode ) |
|
149 |
VALUES |
|
150 |
('Antananarivo', NULL, 'ADM1', 'MG'), |
|
151 |
('Antsiranana', NULL, 'ADM1', 'MG'), |
|
152 |
('Fianarantsoa', NULL, 'ADM1', 'MG'), |
|
153 |
('Mahajanga', NULL, 'ADM1', 'MG'), |
|
154 |
('Toamasina', NULL, 'ADM1', 'MG'), |
|
155 |
('Toliara', 'Toliary', 'ADM1', 'MG') |
|
156 |
RETURNING geonameid, countrycode |
|
157 |
) |
|
158 |
INSERT INTO hierarchy (parentid, childid) |
|
159 |
SELECT c.geonameid AS parentid, |
|
160 |
n.geonameid AS childid |
|
161 |
FROM newnames n, |
|
162 |
countries c |
|
163 |
WHERE n.countrycode = c.iso; |
|
164 |
" geoscrub |
|
165 |
|
|
166 |
# add some custom country alternate names that we need |
|
167 |
psql -c " |
|
168 |
CREATE TABLE alt_country ( |
|
169 |
country text references countries (country), |
|
170 |
alternatename text |
|
171 |
); |
|
172 |
INSERT INTO alt_country |
|
173 |
(country, alternatename) |
|
174 |
VALUES |
|
175 |
('Central African Republic', 'Central African Repu') |
|
176 |
, ('British Virgin Islands', 'Virgin Islands (U.K.)') |
|
177 |
, ('Democratic Republic of the Congo', 'Congo, The Democratic Republic of the') |
|
178 |
, ('Falkland Islands', 'Falkland Isl') |
|
179 |
, ('Falkland Islands', 'Falkland Islands (Malvinas)') |
|
180 |
, ('Faroe Islands', 'Faeroe Isl.') |
|
181 |
, ('French Guiana', 'Guyana Francesa') |
|
182 |
, ('Iran', 'Iran, Islamic Republic of') |
|
183 |
, ('Mexico', E'Mexico\rMexico') |
|
184 |
, ('Nicaragua', 'Ncaragua') |
|
185 |
, ('Norfolk Island', 'Norfolk Isl') |
|
186 |
, ('North Korea', 'Korea, Democratic People''s Republic of') |
|
187 |
, ('Solomon Islands', 'Solomon Isl') |
|
188 |
, ('South Georgia and the South Sandwich Islands', 'South Georgia and the South Sand') |
|
189 |
, ('South Korea', 'Korea (Republic of)') |
|
190 |
, ('South Korea', 'Korea, Republic of') |
|
191 |
, ('U.S. Virgin Islands', 'US Virgin Islands') |
|
192 |
, ('U.S. Virgin Islands', 'Virgin Islands (U.S.)') |
|
193 |
, ('United States', 'ESTADOS UNIDOS DE AMERICA') |
|
194 |
, ('United States', 'EUA') |
|
195 |
, ('Vietnam', 'VIETNAM [Socialist Republic of V') |
|
196 |
;" geoscrub |
|
197 |
|
|
198 |
# add some custom stateprovince alternate names that we need |
|
199 |
psql -c " |
|
200 |
CREATE TABLE alt_stateprovince ( |
|
201 |
country text references countries (country), |
|
202 |
stateprovince text, |
|
203 |
alternatename text |
|
204 |
); |
|
205 |
INSERT INTO alt_stateprovince |
|
206 |
(country, stateprovince, alternatename) |
|
207 |
VALUES |
|
208 |
('Canada', 'British Columbia', 'B.C.') |
|
209 |
, ('Canada', 'Newfoundland and Labrador', 'Newfoundland & Labrador') |
|
210 |
, ('Canada', 'Newfoundland and Labrador', 'New Foundland And Labrador') |
|
211 |
, ('Canada', 'Newfoundland and Labrador', 'Labrador & Newfoundland') |
|
212 |
, ('Canada', 'Northwest Territories', 'North West Territories') |
|
213 |
, ('Canada', 'Northwest Territories', 'Northwest Territorie') |
|
214 |
, ('Canada', 'Northwest Territories', 'NWT') |
|
215 |
, ('Canada', 'Québec', 'Quebéc') |
|
216 |
, ('Canada', 'Québec', 'Pq') |
|
217 |
, ('Canada', 'Québec', 'PQ') |
|
218 |
, ('United States', 'Hawaii', 'Hawai\`i') |
|
219 |
, ('United States', 'North Carolina', 'N. Carolina') |
|
220 |
, ('United States', 'North Dakota', 'N. Dakota') |
|
221 |
, ('United States', 'South Dakota', 'S. Dakota') |
|
222 |
, ('Kenya', 'Coast Province', 'Coast') |
|
223 |
, ('Mexico', 'Estado de Baja California', 'Baja California Norte') |
|
224 |
, ('Mexico', 'Estado de Baja California', 'Baja California (Norte)') |
|
225 |
, ('Mexico', 'Estado de Baja California', 'Baja California, Norte de') |
|
226 |
, ('Mexico', 'Estado de Baja California', 'Baja California (state)') |
|
227 |
, ('Mexico', 'Estado de Baja California Sur', 'Baja California, Sur de') |
|
228 |
, ('Mexico', 'Estado de Baja California Sur', 'Baja California Sur (state)') |
|
229 |
, ('Mexico', 'Estado de Baja California Sur', 'Baja Califronia') |
|
230 |
, ('Mexico', 'Estado de Baja California Sur', 'Baja Cal. Sur') |
|
231 |
, ('Ecuador', 'Provincia de Sucumbíos', 'Sucumbíos') |
|
232 |
; |
|
233 |
" geoscrub |
|
234 |
|
|
235 |
# additions for mapping to gadm2 level 1 units |
|
236 |
# - Los Lagos -> Los Rios is imperfect but needed for gadm |
|
237 |
# - Orellana -> Napo is imperfect (wrong??) but needed for gadm |
|
238 |
# - Mapping both Concelho de Macau and Concelho das Ilhas to Ilhas |
|
239 |
# - Nassa seems to be a gadm misspelling? (Niassa) |
|
240 |
# not addressed: |
|
241 |
# - Bahamas: unmapped gadm level1 entities may be obsolete? |
|
242 |
# - French Guiana has arrondissements as adm0 in gadm, low level? |
|
243 |
# - French Polynesia has no adm1 in gadm |
|
244 |
# - Madagascar: Sava is (was?) inside gadm's Antsiranana province |
|
245 |
# - Philippines: GN has 17 regions at level1, gadm has 82 provinces |
|
246 |
# - Sri Lanka: GN ADM2 matches gadm level1; bien data side with GN |
|
247 |
# - Venezuela: Dependencias Federales don't seem to be in GADM |
|
248 |
# - Vietnam: GN ADM1 matches gadm level2; bien data side with GN? |
|
249 |
psql -c " |
|
250 |
INSERT INTO alt_stateprovince |
|
251 |
(country, stateprovince, alternatename) |
|
252 |
VALUES |
|
253 |
('Cambodia', 'Khétt Siĕm Réab', 'Siemréab') |
|
254 |
, ('Chile', 'Región del Biobío', 'Bío-Bío') |
|
255 |
, ('Chile', 'Región del Libertador General Bernardo O’Higgins', 'Libertador General Bernardo O''Higgins') |
|
256 |
, ('Chile', 'Región de Los Ríos', 'Los Lagos') |
|
257 |
, ('Chile', 'Providencia y Santa Catalina, Departamento de Archipiélago de San Andrés', 'San Andrés y Providencia') |
|
258 |
, ('Dominican Republic', 'Provincia de Baoruco', 'Bahoruco') |
|
259 |
, ('Ecuador', 'Provincia de Morona Santiago', 'Morona Santiago') |
|
260 |
, ('Ecuador', 'Provincia de Napo', 'Orellana') |
|
261 |
, ('Ecuador', 'Provincia de Zamora Chinchipe', 'Zamora Chinchipe') |
|
262 |
, ('Gabon', 'Province du Woleu-Ntem', 'Wouleu-Ntem') |
|
263 |
, ('Ghana', 'Brong-Ahafo Region', 'Brong Ahafo') |
|
264 |
, ('Guatemala', 'Departamento de Quetzaltenango', 'Quezaltenango') |
|
265 |
, ('Haiti', 'Département de l''Artibonite', 'L''Artibonite') |
|
266 |
, ('Liberia', 'Grand Bassa County', 'GrandBassa') |
|
267 |
, ('Liberia', 'Grand Gedeh County', 'GrandGedeh') |
|
268 |
, ('Macao', 'Concelho de Macau', 'Ilhas') |
|
269 |
, ('Mozambique', 'Niassa Province', 'Nassa') |
|
270 |
, ('Norway', 'Østfold fylke', 'Ãstfold') |
|
271 |
, ('Peru', 'Región de Huánuco', 'Huánuco') |
|
272 |
, ('Puerto Rico', 'Guánica Municipio', 'Guánica') |
|
273 |
, ('Puerto Rico', 'Loíza Municipio', 'Loíza') |
|
274 |
, ('Puerto Rico', 'Manatí Municipio', 'Manatí') |
|
275 |
, ('Puerto Rico', 'Peñuelas Municipio', 'Peñuelas') |
|
276 |
, ('Puerto Rico', 'Río Grande Municipio', 'Río Grande') |
|
277 |
, ('Puerto Rico', 'San Germán Municipio', 'San Germán') |
|
278 |
, ('Puerto Rico', 'San Sebastián Municipio', 'San Sebastián') |
|
279 |
, ('Russia', 'Karachayevo-Cherkesskaya Respublika', 'Karachay-Cherkess') |
|
280 |
, ('Syria', 'Muḩāfaz̧at al Lādhiqīyah', 'Lattakia') |
|
281 |
, ('Togo', 'Région Centrale', 'Centre') |
|
282 |
;" geoscrub |
derived/biengeo/README.txt | ||
---|---|---|
1 |
BIEN geovalidation notes |
|
2 |
======================== |
|
3 |
|
|
4 |
[Also see comments embedded in specific scripts in this directory.] |
|
5 |
|
|
6 |
The bash and SQL statements contained in the files as ordered below |
|
7 |
should be applied to carry out geographic name scrubbing and |
|
8 |
geovalidation on a given corpus of BIEN location records. |
|
9 |
|
|
10 |
That said, given the tight deadline under which this was done in order |
|
11 |
to produced a geovalidated BIEN3 corpus in advance of the Nov 2013 |
|
12 |
working group meeting, and the corresponding manner in which much of |
|
13 |
this was actually executed piecemeal in an iterative and interactive |
|
14 |
fashion within a bash shell and psql session, I can't guarantee that the |
|
15 |
code in its current state could be run end-to-end without intervention. |
|
16 |
It's close, but probably not bulletproof. |
|
17 |
|
|
18 |
1. geovalidate.sh |
|
19 |
- creates postgis DB and loads GADM2 data |
|
20 |
2. geonames.sh |
|
21 |
- loads geonames.org data and adds some custom mapping logic |
|
22 |
3. geonames-to-gadm.sql |
|
23 |
- contains SQL statements that build linkages between geonames.org |
|
24 |
names and GADM2 names |
|
25 |
4. load-geoscrub-input.sh |
|
26 |
- dumps geoscrub_input from vegbien and loads it into the geoscrub db |
|
27 |
5. geonames.sql |
|
28 |
- contains SQL statements that scrub asserted names and (to the |
|
29 |
extent possible) map them to GADM2 |
|
30 |
6. geovalidate.sql |
|
31 |
- contains (postgis-extended) SQL statements that score the validity |
|
32 |
of GADM2-scrubbed names against given point coordinates |
|
33 |
|
|
34 |
The resulting 'geoscrub' table is what contains the scrubbed (i.e., |
|
35 |
GADM2-matched) names and various geovalidation scores. |
|
36 |
|
|
37 |
Notes/Caveats/Todos: |
|
38 |
* Clearly the SQL statements used in this procedure suffer from a lot of |
|
39 |
redundancy, and it might be worth trying to refactor once we're happy |
|
40 |
with the particular approach taken. |
|
41 |
* Need to pull out more known notes/caveats/todos and highlight them :) |
derived/biengeo/geonames.sql | ||
---|---|---|
1 |
/* |
|
2 |
SQL statements carrying out country name, stateprovince name, and |
|
3 |
county name scrubbing on vegbien location data. Currently hard-coded |
|
4 |
to operate on a 'vegbien_geoscrub' table that contains the input for |
|
5 |
geoscrubbing, with the following columns: |
|
6 |
decimallatitude (latitude, WGS84 decimal degrees) |
|
7 |
decimallongitude (longitude, WGS84 decimal degrees) |
|
8 |
country (original asserted country name) |
|
9 |
stateprovince (original asserted stateprovince name) |
|
10 |
county (original asserted county name) |
|
11 |
(Note that the lat/lon columns are not needed for scrubbing, but are |
|
12 |
used later for geovalidation.) |
|
13 |
|
|
14 |
Misc issues to contemplate and possibly address: |
|
15 |
|
|
16 |
* Congo can be either "Democratic Republic of the Congo" (COD) or |
|
17 |
"Republic of the Congo" (COG), eh? |
|
18 |
* Vietnam: gadm2 seems to treat provinces as level2 units |
|
19 |
* Yugoslavia: no longer exists, not in geonames nor gadm2 |
|
20 |
|
|
21 |
* Azerbaijan: Yevlakh City and Yevlakh Rayon both ADM1 in geonames, |
|
22 |
but just Yevlax in gadm2 |
|
23 |
|
|
24 |
* England is not a geoname country, only the UK is |
|
25 |
|
|
26 |
* the general problem of historical names in cases where countries |
|
27 |
have either been split, combined, removed (?), or otherwise |
|
28 |
redefined |
|
29 |
|
|
30 |
* Cases with multiple alternativename matches??? |
|
31 |
|
|
32 |
Notes: |
|
33 |
- the vegbien_geoscrub input used during development and testing this |
|
34 |
scrubbing procedure came from a select distinct on (lat, lon, |
|
35 |
country, state, province) from analytical_aggregate on vegbiendev |
|
36 |
|
|
37 |
Jim Regetz |
|
38 |
NCEAS |
|
39 |
Created Nov 2012 |
|
40 |
*/ |
|
41 |
|
|
42 |
-- define custom function to pass characters back through latin-1 |
|
43 |
-- encoding to revert errors; in principle this could introduce new |
|
44 |
-- errors, but it seems to do more good than harm |
|
45 |
CREATE OR REPLACE FUNCTION u2L2u(text) RETURNS text AS $$ |
|
46 |
DECLARE |
|
47 |
string text; |
|
48 |
BEGIN |
|
49 |
string := |
|
50 |
convert_from(convert($1::bytea, 'UTF8', 'LATIN1'), 'UTF8'); |
|
51 |
RETURN string; |
|
52 |
EXCEPTION |
|
53 |
WHEN data_exception THEN |
|
54 |
RETURN $1; |
|
55 |
END; |
|
56 |
$$ LANGUAGE plpgsql; |
|
57 |
|
|
58 |
------------------- |
|
59 |
-- scrub country -- |
|
60 |
------------------- |
|
61 |
|
|
62 |
CREATE TABLE vcountry AS |
|
63 |
SELECT DISTINCT country |
|
64 |
FROM vegbien_geoscrub |
|
65 |
WHERE country IS NOT NULL; |
|
66 |
-- SELECT 439 |
|
67 |
-- Time: 862.413 ms |
|
68 |
ALTER TABLE vcountry ADD COLUMN countryutf8 text; |
|
69 |
ALTER TABLE vcountry ADD COLUMN countryid integer; |
|
70 |
|
|
71 |
-- try to clean up character representations |
|
72 |
UPDATE vcountry |
|
73 |
SET countryutf8 = u2L2u(country); |
|
74 |
-- UPDATE 439 |
|
75 |
-- Time: 24.706 ms |
|
76 |
|
|
77 |
-- try matching against direct country names |
|
78 |
UPDATE vcountry vc |
|
79 |
SET countryid = gc.geonameid |
|
80 |
FROM countries gc |
|
81 |
WHERE lower(vc.countryutf8) = lower(gc.country); |
|
82 |
-- UPDATE 219 |
|
83 |
-- Time: 12.384 ms |
|
84 |
|
|
85 |
-- try matching against ISO 3166-1 alpha-3 |
|
86 |
UPDATE vcountry vc |
|
87 |
SET countryid = gc.geonameid |
|
88 |
FROM countries gc |
|
89 |
WHERE vc.countryutf8=gc.iso3 |
|
90 |
AND vc.countryid IS NULL; |
|
91 |
-- UPDATE 1 |
|
92 |
-- Time: 20.146 ms |
|
93 |
|
|
94 |
-- try matching against ISO 3166-1 alpha-2 |
|
95 |
UPDATE vcountry vc |
|
96 |
SET countryid = gc.geonameid |
|
97 |
FROM countries gc |
|
98 |
WHERE vc.countryutf8 = iso |
|
99 |
AND vc.countryid IS NULL; |
|
100 |
-- UPDATE 64 |
|
101 |
-- Time: 3.378 ms |
|
102 |
|
|
103 |
/* |
|
104 |
-- try matching against FIPS, but not if already matched (i.e., |
|
105 |
-- the 2-digit ISO match takes preference) |
|
106 |
UPDATE vcountry vc |
|
107 |
SET countryid = gc.geonameid |
|
108 |
FROM countries gc |
|
109 |
WHERE vc.countryutf8 = fips |
|
110 |
AND vc.countryid IS NULL; |
|
111 |
-- UPDATE 1 |
|
112 |
-- Time: 2.514 ms |
|
113 |
*/ |
|
114 |
|
|
115 |
-- try matching against BIEN alt country names |
|
116 |
UPDATE vcountry vc |
|
117 |
SET countryid = c.geonameid |
|
118 |
FROM alt_country a, countries c |
|
119 |
WHERE vc.countryutf8 = a.alternatename |
|
120 |
AND a.country = c.country |
|
121 |
AND vc.countryid IS NULL; |
|
122 |
-- UPDATE 21 |
|
123 |
-- Time: 4.769 ms |
|
124 |
|
|
125 |
-- try case-insensitive matching on geonames.org 'alternatenames' |
|
126 |
-- note: this is a little sloppy because the set condition will return |
|
127 |
-- more than one row if there are more than one matched alternate |
|
128 |
-- country names, and which one gets used is non-deterministic; however, |
|
129 |
-- we'll go back and remove those in the next update (specifically for |
|
130 |
-- the cases where there are more than one *unique* matched countries) |
|
131 |
UPDATE vcountry vc |
|
132 |
SET countryid = gc.geonameid |
|
133 |
FROM alternatenames a, countries gc |
|
134 |
WHERE lower(vc.countryutf8) = lower(a.alternatename) |
|
135 |
AND a.geonameid = gc.geonameid |
|
136 |
AND vc.countryid IS NULL; |
|
137 |
-- UPDATE 76 |
|
138 |
-- Time: 22019.592 ms |
|
139 |
|
|
140 |
-- go back and remove if altname lookup was ambiguous |
|
141 |
UPDATE vcountry vc |
|
142 |
SET countryid = NULL |
|
143 |
WHERE countryutf8 IN ( |
|
144 |
SELECT countryutf8 |
|
145 |
FROM ( |
|
146 |
SELECT DISTINCT vc.countryutf8, gc.iso3 |
|
147 |
FROM vcountry vc, |
|
148 |
alternatenames a, |
|
149 |
countries gc |
|
150 |
WHERE lower(vc.countryutf8) = lower(a.alternatename) |
|
151 |
AND a.geonameid = gc.geonameid) lookup |
|
152 |
GROUP BY countryutf8 |
|
153 |
HAVING COUNT(*) > 1); |
|
154 |
-- UPDATE 2 |
|
155 |
-- Time: 2296.257 ms |
|
156 |
-- Removes: Congo, CONGO |
|
157 |
|
|
158 |
-- merge in GADM country names (our standard names) |
|
159 |
ALTER TABLE vcountry ADD COLUMN countrystd text; |
|
160 |
UPDATE vcountry vc |
|
161 |
SET countrystd = name_0 |
|
162 |
FROM gadm_country_lookup gadm0 |
|
163 |
WHERE vc.countryid=gadm0.countryid; |
|
164 |
-- UPDATE 377 |
|
165 |
-- Time: 6.254 ms |
Also available in: Unified diff
added derived/biengeo/ from https://projects.nceas.ucsb.edu/nceas/projects/biengeo/repository/