1 |
10707
|
aaronmk
|
/*
|
2 |
|
|
|
3 |
|
|
Misc notes on how well we did scrubbing names. And lots of misc SQL
|
4 |
|
|
statements for examining various scrubbing outcomes, some of which may
|
5 |
|
|
not be so relevant anymore as a consequences of other changes.
|
6 |
|
|
|
7 |
|
|
Provisional (and not necessarily fully updated) summary of geoname
|
8 |
|
|
scrubbing performance on vegbien_geoscrub
|
9 |
|
|
(= geoscrub_input from vegbien schema public.2012-11-04-07-34-10.r5984)
|
10 |
|
|
========================================================================
|
11 |
|
|
1707970 total unique location determinations
|
12 |
|
|
32541 distinct on country, stateprovince, county (including all nulls)
|
13 |
|
|
========================================================================
|
14 |
|
|
* Country *
|
15 |
|
|
1485885 assertions
|
16 |
|
|
1479338 mapped to geonames
|
17 |
|
|
(6547 not mapped to geonames)
|
18 |
|
|
1479338 mapped to gadm2
|
19 |
|
|
(6547 not mapped to gadm2)
|
20 |
|
|
(0 cases of failed mapping to gadm2 name_0)
|
21 |
|
|
1479338 mapped both to geonames and to gadm2
|
22 |
|
|
========================================================================
|
23 |
|
|
* Country with State/Province *
|
24 |
|
|
1408955 assertions
|
25 |
|
|
1388525 mapped to geonames
|
26 |
|
|
(20430 not mapped to geonames)
|
27 |
|
|
(5576 of these had unmapped country)
|
28 |
|
|
1389239 mapped to gadm2
|
29 |
|
|
(19716 not mapped to gadm2)
|
30 |
|
|
(5576 of these had unmapped country)
|
31 |
|
|
1387915 mapped both to geonames and to gadm2
|
32 |
|
|
========================================================================
|
33 |
|
|
* Country with State/Province with County/Parish*
|
34 |
|
|
276809 assertions
|
35 |
|
|
145767 mapped to geonames
|
36 |
|
|
213795 mapped to gadm2
|
37 |
|
|
140983 mapped both to geonames and to gadm2
|
38 |
|
|
========================================================================
|
39 |
|
|
|
40 |
|
|
========================================================================
|
41 |
|
|
Distinct country names:
|
42 |
|
|
439 total unique country names (non-null)
|
43 |
|
|
377 recognized unique country names (non-null)
|
44 |
|
|
62 unrecognized unique country names (non-null)
|
45 |
|
|
========================================================================
|
46 |
|
|
Distinct country+stateprovince names:
|
47 |
|
|
4764 total unique country and stateprovince (stateprovince non-null)
|
48 |
|
|
4704 total unique country and stateprovince (both non-null)
|
49 |
|
|
2944 mapped to geonames
|
50 |
|
|
(1760 not mapped to geonames)
|
51 |
|
|
2950 mapped to gadm2
|
52 |
|
|
(1754 not mapped to gadm2)
|
53 |
|
|
2885 mapped both to geonames and to gadm2
|
54 |
|
|
(1819 not mapped to both)
|
55 |
|
|
------------------------------------------------------------------------
|
56 |
|
|
* Distinct mappedcountry+stateprovince names *
|
57 |
|
|
3312 total unique
|
58 |
|
|
1895 mapped to geonames
|
59 |
|
|
(1417 not mapped to geonames)
|
60 |
|
|
1901 mapped to gadm2
|
61 |
|
|
(1411 not mapped to gadm2)
|
62 |
|
|
========================================================================
|
63 |
|
|
Distinct country+stateprovince+county names:
|
64 |
|
|
27440 total unique country and stateprovince and county (county non-null)
|
65 |
|
|
25764 total unique country and stateprovince and county (all non-null)
|
66 |
|
|
8507 mapped to geonames
|
67 |
|
|
(17257 not mapped to geonames)
|
68 |
|
|
12610 mapped to gadm2
|
69 |
|
|
(13154 not mapped to gadm2)
|
70 |
|
|
4917 mapped both to geonames and to gadm2
|
71 |
|
|
(20847 not mapped to both)
|
72 |
|
|
------------------------------------------------------------------------
|
73 |
|
|
* Distinct mappedcountry+mappedstateprovince+county names *
|
74 |
|
|
18715 total unique
|
75 |
|
|
6673 mapped to geonames
|
76 |
|
|
(12042 not mapped to geonames)
|
77 |
|
|
11310 mapped to gadm2
|
78 |
|
|
(7405 not mapped to gadm2)
|
79 |
|
|
========================================================================
|
80 |
|
|
|
81 |
|
|
|
82 |
|
|
for comparison, here are bien2 stats based on boyle/donoghue scrubbing:
|
83 |
|
|
753302 total unique location determinations
|
84 |
|
|
have asserted country
|
85 |
|
|
745321 have scrubbed country
|
86 |
|
|
have asserted country and stateprovince
|
87 |
|
|
have scrubbed country and stateprovince
|
88 |
|
|
have asserted country and stateprovince and county
|
89 |
|
|
have scrubbed country and stateprovince and county
|
90 |
|
|
|
91 |
|
|
Jim Regetz
|
92 |
|
|
NCEAS
|
93 |
|
|
Created Nov 2012
|
94 |
|
|
*/
|
95 |
|
|
|
96 |
|
|
-- Generate human-readable view of all distinct cases where we haven't
|
97 |
|
|
-- obtained a match for the asserted country
|
98 |
|
|
CREATE TEMP VIEW unmatched_country AS
|
99 |
|
|
SELECT count(*), country FROM
|
100 |
|
|
(SELECT
|
101 |
|
|
CASE WHEN length(countryutf8)>4
|
102 |
|
|
THEN initcap(countryutf8)
|
103 |
|
|
ELSE countryutf8
|
104 |
|
|
END AS country
|
105 |
|
|
FROM scrubbed
|
106 |
|
|
JOIN vcountry USING (country)
|
107 |
|
|
WHERE scrubbed.country IS NOT NULL
|
108 |
|
|
AND scrubbed.countryid IS NULL) summary
|
109 |
|
|
GROUP BY country
|
110 |
|
|
ORDER BY count DESC;
|
111 |
|
|
-- Generate human-readable view of all distinct cases where we have
|
112 |
|
|
-- obtained match for the asserted country, but not stateprovince
|
113 |
|
|
CREATE TEMP VIEW unmatched_stateprovince AS
|
114 |
|
|
SELECT count(*), country, stateprovince FROM
|
115 |
|
|
(SELECT countries.country AS country,
|
116 |
|
|
CASE WHEN length(stateprovinceutf8)>3
|
117 |
|
|
THEN initcap(stateprovinceutf8)
|
118 |
|
|
ELSE stateprovinceutf8
|
119 |
|
|
END AS stateprovince
|
120 |
|
|
FROM scrubbed
|
121 |
|
|
LEFT JOIN vstate USING (countryid, stateprovince)
|
122 |
|
|
LEFT JOIN countries ON vstate.countryid=geonameid
|
123 |
|
|
WHERE scrubbed.stateprovince IS NOT NULL
|
124 |
|
|
AND scrubbed.stateprovinceid IS NULL
|
125 |
|
|
AND scrubbed.country IS NOT NULL) summary
|
126 |
|
|
GROUP BY country, stateprovince
|
127 |
|
|
ORDER BY count DESC;
|
128 |
|
|
/*
|
129 |
|
|
SELECT DISTINCT countries.country,
|
130 |
|
|
CASE WHEN length(stateprovinceutf8)>3
|
131 |
|
|
THEN initcap(stateprovinceutf8)
|
132 |
|
|
ELSE stateprovinceutf8
|
133 |
|
|
END AS stateprovince
|
134 |
|
|
FROM scrubbed
|
135 |
|
|
JOIN vstate USING (countryid, stateprovince)
|
136 |
|
|
JOIN countries ON vstate.countryid=geonameid
|
137 |
|
|
WHERE scrubbed.stateprovince IS NOT NULL
|
138 |
|
|
AND scrubbed.stateprovinceid IS NULL
|
139 |
|
|
ORDER BY country, stateprovince;
|
140 |
|
|
*/
|
141 |
|
|
|
142 |
|
|
-- Generate human-readable view of all distinct cases where we have
|
143 |
|
|
-- obtained match for the asserted country and stateprovince, but not
|
144 |
|
|
-- county
|
145 |
|
|
/*
|
146 |
|
|
CREATE TEMP VIEW unmatched_county AS
|
147 |
|
|
SELECT DISTINCT countries.country,
|
148 |
|
|
CASE WHEN length(stateprovinceutf8)>3
|
149 |
|
|
THEN initcap(stateprovinceutf8)
|
150 |
|
|
ELSE stateprovinceutf8
|
151 |
|
|
END AS stateprovince
|
152 |
|
|
FROM scrubbed
|
153 |
|
|
JOIN vstate USING (countryid, stateprovince)
|
154 |
|
|
JOIN countries ON vstate.countryid=geonameid
|
155 |
|
|
WHERE scrubbed.stateprovince IS NOT NULL
|
156 |
|
|
AND scrubbed.stateprovinceid IS NULL
|
157 |
|
|
ORDER BY country, stateprovince;
|
158 |
|
|
*/
|
159 |
|
|
|
160 |
|
|
/*
|
161 |
|
|
-- which states aren't in gadm2?
|
162 |
|
|
select distinct iso3, s.name state
|
163 |
|
|
from scrubbed
|
164 |
|
|
join countries c on countryid=c.geonameid
|
165 |
|
|
join geonames s on stateprovinceid=s.geonameid
|
166 |
|
|
except select iso, name_1 state from gadm2;
|
167 |
|
|
|
168 |
|
|
select country, count(*)
|
169 |
|
|
from scrubbed s
|
170 |
|
|
left join gadm_stateprovince_lookup using (stateprovinceid)
|
171 |
|
|
where name_0 is null
|
172 |
|
|
and stateprovinceid is not null
|
173 |
|
|
group by country
|
174 |
|
|
order by count desc;
|
175 |
|
|
|
176 |
|
|
|
177 |
|
|
select name
|
178 |
|
|
from gadm2
|
179 |
|
|
join alternatenames on name_1=alternatename
|
180 |
|
|
join geonames using (geonameid)
|
181 |
|
|
join countries on countrycode=countries.iso
|
182 |
|
|
where featurecode='ADM1'
|
183 |
|
|
and name_1='North Solomons';
|
184 |
|
|
*/
|
185 |
|
|
|
186 |
|
|
|
187 |
|
|
/*
|
188 |
|
|
-- report distinct geonames-scrubbed country+stateprovince pairs that
|
189 |
|
|
-- we didn't match to any gadm polygons
|
190 |
|
|
SELECT DISTINCT c.country, g.name
|
191 |
|
|
FROM countries c
|
192 |
|
|
JOIN vstate v ON geonameid=countryid
|
193 |
|
|
JOIN geonames g ON g.geonameid=v.stateprovinceid
|
194 |
|
|
WHERE stateprovinceid NOT IN (
|
195 |
|
|
SELECT stateprovinceid FROM gadm_stateprovince_lookup
|
196 |
|
|
)
|
197 |
|
|
ORDER BY c.country, g.name;
|
198 |
|
|
|
199 |
|
|
-- report failed stateprovince mappings from gadm2 to geonames
|
200 |
|
|
SELECT countries.country AS countryname,
|
201 |
|
|
name AS stateprovincename,
|
202 |
|
|
count(*)
|
203 |
|
|
FROM scrubbed
|
204 |
|
|
LEFT JOIN gadm_stateprovince_lookup USING (stateprovinceid)
|
205 |
|
|
LEFT JOIN countries ON countryid=countries.geonameid
|
206 |
|
|
LEFT JOIN geonames ON stateprovinceid=geonames.geonameid
|
207 |
|
|
WHERE stateprovinceid IS NOT NULL
|
208 |
|
|
AND name_1 IS NULL
|
209 |
|
|
GROUP BY countryname, stateprovincename
|
210 |
|
|
ORDER BY countryname, stateprovincename;
|
211 |
|
|
*/
|
212 |
|
|
|
213 |
|
|
/*
|
214 |
|
|
select iso, name_1, geonameid as stateid
|
215 |
|
|
from (select distinct iso, name_1 from gadm2) gadm2
|
216 |
|
|
left join (
|
217 |
|
|
select distinct gadm2.iso,
|
218 |
|
|
name_1,
|
219 |
|
|
alternatenames.geonameid,
|
220 |
|
|
geonames.name
|
221 |
|
|
from (select distinct iso, name_1 from gadm2) gadm2
|
222 |
|
|
join alternatenames on name_1=alternatename
|
223 |
|
|
join geonames using (geonameid)
|
224 |
|
|
join countries on countrycode=countries.iso
|
225 |
|
|
where featurecode='ADM1'
|
226 |
|
|
and gadm2.iso=countries.iso3) foo
|
227 |
|
|
using (iso, name_1);
|
228 |
|
|
*/
|
229 |
|
|
|
230 |
|
|
-- how did we do?
|
231 |
|
|
|
232 |
|
|
-- country
|
233 |
|
|
|
234 |
|
|
-- percent of unique locations with asserted country names
|
235 |
|
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
236 |
|
|
FROM scrubbed WHERE country IS NOT NULL;
|
237 |
|
|
-- 87.0%
|
238 |
|
|
-- percent of unique locations with geoname-scrubbed country names
|
239 |
|
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
240 |
|
|
FROM scrubbed WHERE countryid IS NOT NULL;
|
241 |
|
|
-- 86.6%
|
242 |
|
|
-- percent of unique locations with gadm-matchedl country names
|
243 |
|
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
244 |
|
|
FROM scrubbed WHERE countryid IN (SELECT countryid FROM gadm_country_lookup);
|
245 |
|
|
-- 86.6%
|
246 |
|
|
|
247 |
|
|
-- stateprovince
|
248 |
|
|
|
249 |
|
|
-- note: 212221 records have a stateprovince but no country! those are
|
250 |
|
|
-- considered missing below
|
251 |
|
|
|
252 |
|
|
-- percent of unique locations with asserted stateprovince names
|
253 |
|
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
254 |
|
|
FROM scrubbed WHERE country IS NOT NULL AND stateprovince IS NOT NULL;
|
255 |
|
|
-- 82.5%
|
256 |
|
|
-- percent of unique locations with geoname-scrubbed stateprovince names
|
257 |
|
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
258 |
|
|
FROM scrubbed WHERE stateprovinceid IS NOT NULL;
|
259 |
|
|
-- 80.2%
|
260 |
|
|
-- percent of unique locations with gadm-matchedl stateprovince names
|
261 |
|
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
262 |
|
|
FROM scrubbed
|
263 |
|
|
WHERE stateprovinceid IN (SELECT stateprovinceid FROM gadm_stateprovince_lookup);
|
264 |
|
|
-- 79.5%
|
265 |
|
|
|
266 |
|
|
-- stateprovince
|
267 |
|
|
|
268 |
|
|
-- percent of unique locations with asserted county names
|
269 |
|
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
270 |
|
|
FROM scrubbed
|
271 |
|
|
WHERE country IS NOT NULL AND stateprovince IS NOT NULL AND county IS NOT NULL;
|
272 |
|
|
-- 16.2%
|
273 |
|
|
-- percent of unique locations with geoname-scrubbed county names
|
274 |
|
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
275 |
|
|
FROM scrubbed WHERE countyid IS NOT NULL;
|
276 |
|
|
-- 6.3%
|
277 |
|
|
-- percent of unique locations with gadm-matchedl county names
|
278 |
|
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
279 |
|
|
FROM scrubbed
|
280 |
|
|
WHERE countyid IN (SELECT countyid FROM gadm_county_lookup);
|
281 |
|
|
-- tbd
|
282 |
|
|
|
283 |
|
|
SELECT CASE WHEN countryid IS NOT NULL
|
284 |
|
|
THEN 'matched'
|
285 |
|
|
ELSE 'unmatched' END AS matches,
|
286 |
|
|
COUNT(*)
|
287 |
|
|
FROM vcountry
|
288 |
|
|
GROUP BY matches;
|
289 |
|
|
|
290 |
|
|
SELECT CASE WHEN stateprovinceid IS NOT NULL
|
291 |
|
|
THEN 'matched'
|
292 |
|
|
ELSE 'unmatched' END AS matches,
|
293 |
|
|
COUNT(*)
|
294 |
|
|
FROM vstate
|
295 |
|
|
GROUP BY matches;
|
296 |
|
|
-- matched | 1770
|
297 |
|
|
-- unmatched | 1541
|
298 |
|
|
|
299 |
|
|
SELECT CASE WHEN countyid IS NOT NULL
|
300 |
|
|
THEN 'matched'
|
301 |
|
|
ELSE 'unmatched' END AS matches,
|
302 |
|
|
COUNT(*)
|
303 |
|
|
FROM vcounty
|
304 |
|
|
GROUP BY matches;
|
305 |
|
|
-- matched | 4295
|
306 |
|
|
-- unmatched | 14354
|
307 |
|
|
|
308 |
|
|
SELECT CASE WHEN country IS NULL
|
309 |
|
|
THEN 'no country provided'
|
310 |
|
|
WHEN country IS NOT NULL AND countryid IS NULL
|
311 |
|
|
THEN 'unrecognized country'
|
312 |
|
|
ELSE
|
313 |
|
|
'scrubbed country'
|
314 |
|
|
END results,
|
315 |
|
|
COUNT(*)
|
316 |
|
|
FROM scrubbed
|
317 |
|
|
GROUP BY results;
|
318 |
|
|
|
319 |
|
|
SELECT CASE WHEN country IS NULL
|
320 |
|
|
THEN 'no fully qualified stateprovince provided'
|
321 |
|
|
WHEN country IS NOT NULL AND stateprovince IS NOT NULL AND stateprovinceid IS NULL
|
322 |
|
|
THEN 'unrecognized stateprovince'
|
323 |
|
|
ELSE
|
324 |
|
|
'scrubbed stateprovince'
|
325 |
|
|
END results,
|
326 |
|
|
COUNT(*)
|
327 |
|
|
FROM scrubbed
|
328 |
|
|
GROUP BY results;
|
329 |
|
|
|
330 |
|
|
-- file output
|
331 |
|
|
SELECT DISTINCT countryid, country,
|
332 |
|
|
stateprovinceid, stateprovince,
|
333 |
|
|
countyid, county
|
334 |
|
|
FROM scrubbed
|
335 |
|
|
ORDER BY country, stateprovince, county;
|
336 |
|
|
|
337 |
|
|
/*
|
338 |
|
|
-- look at the whole iso2 vs fips thing
|
339 |
|
|
SELECT DISTINCT vc.country, i.country iso, f.country fips, stateprovince
|
340 |
|
|
FROM vcountry vc
|
341 |
|
|
LEFT JOIN countries i ON vc.country=i.iso
|
342 |
|
|
LEFT JOIN countries f ON vc.country=f.fips
|
343 |
|
|
LEFT JOIN scrubbed s ON s.country=vc.country
|
344 |
|
|
WHERE vc.countryid IS NULL
|
345 |
|
|
AND i.country IS NOT NULL
|
346 |
|
|
AND f.country IS NOT NULL
|
347 |
|
|
ORDER BY vc.country;
|
348 |
|
|
*/
|