1
|
/*
|
2
|
|
3
|
Misc notes on how well we did scrubbing names. And lots of misc SQL
|
4
|
statements for examining various scrubbing outcomes, some of which may
|
5
|
not be so relevant anymore as a consequences of other changes.
|
6
|
|
7
|
Provisional (and not necessarily fully updated) summary of geoname
|
8
|
scrubbing performance on vegbien_geoscrub
|
9
|
(= geoscrub_input from vegbien schema public.2012-11-04-07-34-10.r5984)
|
10
|
========================================================================
|
11
|
1707970 total unique location determinations
|
12
|
32541 distinct on country, stateprovince, county (including all nulls)
|
13
|
========================================================================
|
14
|
* Country *
|
15
|
1485885 assertions
|
16
|
1479338 mapped to geonames
|
17
|
(6547 not mapped to geonames)
|
18
|
1479338 mapped to gadm2
|
19
|
(6547 not mapped to gadm2)
|
20
|
(0 cases of failed mapping to gadm2 name_0)
|
21
|
1479338 mapped both to geonames and to gadm2
|
22
|
========================================================================
|
23
|
* Country with State/Province *
|
24
|
1408955 assertions
|
25
|
1388525 mapped to geonames
|
26
|
(20430 not mapped to geonames)
|
27
|
(5576 of these had unmapped country)
|
28
|
1389239 mapped to gadm2
|
29
|
(19716 not mapped to gadm2)
|
30
|
(5576 of these had unmapped country)
|
31
|
1387915 mapped both to geonames and to gadm2
|
32
|
========================================================================
|
33
|
* Country with State/Province with County/Parish*
|
34
|
276809 assertions
|
35
|
145767 mapped to geonames
|
36
|
213795 mapped to gadm2
|
37
|
140983 mapped both to geonames and to gadm2
|
38
|
========================================================================
|
39
|
|
40
|
========================================================================
|
41
|
Distinct country names:
|
42
|
439 total unique country names (non-null)
|
43
|
377 recognized unique country names (non-null)
|
44
|
62 unrecognized unique country names (non-null)
|
45
|
========================================================================
|
46
|
Distinct country+stateprovince names:
|
47
|
4764 total unique country and stateprovince (stateprovince non-null)
|
48
|
4704 total unique country and stateprovince (both non-null)
|
49
|
2944 mapped to geonames
|
50
|
(1760 not mapped to geonames)
|
51
|
2950 mapped to gadm2
|
52
|
(1754 not mapped to gadm2)
|
53
|
2885 mapped both to geonames and to gadm2
|
54
|
(1819 not mapped to both)
|
55
|
------------------------------------------------------------------------
|
56
|
* Distinct mappedcountry+stateprovince names *
|
57
|
3312 total unique
|
58
|
1895 mapped to geonames
|
59
|
(1417 not mapped to geonames)
|
60
|
1901 mapped to gadm2
|
61
|
(1411 not mapped to gadm2)
|
62
|
========================================================================
|
63
|
Distinct country+stateprovince+county names:
|
64
|
27440 total unique country and stateprovince and county (county non-null)
|
65
|
25764 total unique country and stateprovince and county (all non-null)
|
66
|
8507 mapped to geonames
|
67
|
(17257 not mapped to geonames)
|
68
|
12610 mapped to gadm2
|
69
|
(13154 not mapped to gadm2)
|
70
|
4917 mapped both to geonames and to gadm2
|
71
|
(20847 not mapped to both)
|
72
|
------------------------------------------------------------------------
|
73
|
* Distinct mappedcountry+mappedstateprovince+county names *
|
74
|
18715 total unique
|
75
|
6673 mapped to geonames
|
76
|
(12042 not mapped to geonames)
|
77
|
11310 mapped to gadm2
|
78
|
(7405 not mapped to gadm2)
|
79
|
========================================================================
|
80
|
|
81
|
|
82
|
for comparison, here are bien2 stats based on boyle/donoghue scrubbing:
|
83
|
753302 total unique location determinations
|
84
|
have asserted country
|
85
|
745321 have scrubbed country
|
86
|
have asserted country and stateprovince
|
87
|
have scrubbed country and stateprovince
|
88
|
have asserted country and stateprovince and county
|
89
|
have scrubbed country and stateprovince and county
|
90
|
|
91
|
Jim Regetz
|
92
|
NCEAS
|
93
|
Created Nov 2012
|
94
|
*/
|
95
|
|
96
|
-- Generate human-readable view of all distinct cases where we haven't
|
97
|
-- obtained a match for the asserted country
|
98
|
CREATE TEMP VIEW unmatched_country AS
|
99
|
SELECT count(*), country FROM
|
100
|
(SELECT
|
101
|
CASE WHEN length(countryutf8)>4
|
102
|
THEN initcap(countryutf8)
|
103
|
ELSE countryutf8
|
104
|
END AS country
|
105
|
FROM scrubbed
|
106
|
JOIN vcountry USING (country)
|
107
|
WHERE scrubbed.country IS NOT NULL
|
108
|
AND scrubbed.countryid IS NULL) summary
|
109
|
GROUP BY country
|
110
|
ORDER BY count DESC;
|
111
|
-- Generate human-readable view of all distinct cases where we have
|
112
|
-- obtained match for the asserted country, but not stateprovince
|
113
|
CREATE TEMP VIEW unmatched_stateprovince AS
|
114
|
SELECT count(*), country, stateprovince FROM
|
115
|
(SELECT countries.country AS country,
|
116
|
CASE WHEN length(stateprovinceutf8)>3
|
117
|
THEN initcap(stateprovinceutf8)
|
118
|
ELSE stateprovinceutf8
|
119
|
END AS stateprovince
|
120
|
FROM scrubbed
|
121
|
LEFT JOIN vstate USING (countryid, stateprovince)
|
122
|
LEFT JOIN countries ON vstate.countryid=geonameid
|
123
|
WHERE scrubbed.stateprovince IS NOT NULL
|
124
|
AND scrubbed.stateprovinceid IS NULL
|
125
|
AND scrubbed.country IS NOT NULL) summary
|
126
|
GROUP BY country, stateprovince
|
127
|
ORDER BY count DESC;
|
128
|
/*
|
129
|
SELECT DISTINCT countries.country,
|
130
|
CASE WHEN length(stateprovinceutf8)>3
|
131
|
THEN initcap(stateprovinceutf8)
|
132
|
ELSE stateprovinceutf8
|
133
|
END AS stateprovince
|
134
|
FROM scrubbed
|
135
|
JOIN vstate USING (countryid, stateprovince)
|
136
|
JOIN countries ON vstate.countryid=geonameid
|
137
|
WHERE scrubbed.stateprovince IS NOT NULL
|
138
|
AND scrubbed.stateprovinceid IS NULL
|
139
|
ORDER BY country, stateprovince;
|
140
|
*/
|
141
|
|
142
|
-- Generate human-readable view of all distinct cases where we have
|
143
|
-- obtained match for the asserted country and stateprovince, but not
|
144
|
-- county
|
145
|
/*
|
146
|
CREATE TEMP VIEW unmatched_county AS
|
147
|
SELECT DISTINCT countries.country,
|
148
|
CASE WHEN length(stateprovinceutf8)>3
|
149
|
THEN initcap(stateprovinceutf8)
|
150
|
ELSE stateprovinceutf8
|
151
|
END AS stateprovince
|
152
|
FROM scrubbed
|
153
|
JOIN vstate USING (countryid, stateprovince)
|
154
|
JOIN countries ON vstate.countryid=geonameid
|
155
|
WHERE scrubbed.stateprovince IS NOT NULL
|
156
|
AND scrubbed.stateprovinceid IS NULL
|
157
|
ORDER BY country, stateprovince;
|
158
|
*/
|
159
|
|
160
|
/*
|
161
|
-- which states aren't in gadm2?
|
162
|
select distinct iso3, s.name state
|
163
|
from scrubbed
|
164
|
join countries c on countryid=c.geonameid
|
165
|
join geonames s on stateprovinceid=s.geonameid
|
166
|
except select iso, name_1 state from gadm2;
|
167
|
|
168
|
select country, count(*)
|
169
|
from scrubbed s
|
170
|
left join gadm_stateprovince_lookup using (stateprovinceid)
|
171
|
where name_0 is null
|
172
|
and stateprovinceid is not null
|
173
|
group by country
|
174
|
order by count desc;
|
175
|
|
176
|
|
177
|
select name
|
178
|
from gadm2
|
179
|
join alternatenames on name_1=alternatename
|
180
|
join geonames using (geonameid)
|
181
|
join countries on countrycode=countries.iso
|
182
|
where featurecode='ADM1'
|
183
|
and name_1='North Solomons';
|
184
|
*/
|
185
|
|
186
|
|
187
|
/*
|
188
|
-- report distinct geonames-scrubbed country+stateprovince pairs that
|
189
|
-- we didn't match to any gadm polygons
|
190
|
SELECT DISTINCT c.country, g.name
|
191
|
FROM countries c
|
192
|
JOIN vstate v ON geonameid=countryid
|
193
|
JOIN geonames g ON g.geonameid=v.stateprovinceid
|
194
|
WHERE stateprovinceid NOT IN (
|
195
|
SELECT stateprovinceid FROM gadm_stateprovince_lookup
|
196
|
)
|
197
|
ORDER BY c.country, g.name;
|
198
|
|
199
|
-- report failed stateprovince mappings from gadm2 to geonames
|
200
|
SELECT countries.country AS countryname,
|
201
|
name AS stateprovincename,
|
202
|
count(*)
|
203
|
FROM scrubbed
|
204
|
LEFT JOIN gadm_stateprovince_lookup USING (stateprovinceid)
|
205
|
LEFT JOIN countries ON countryid=countries.geonameid
|
206
|
LEFT JOIN geonames ON stateprovinceid=geonames.geonameid
|
207
|
WHERE stateprovinceid IS NOT NULL
|
208
|
AND name_1 IS NULL
|
209
|
GROUP BY countryname, stateprovincename
|
210
|
ORDER BY countryname, stateprovincename;
|
211
|
*/
|
212
|
|
213
|
/*
|
214
|
select iso, name_1, geonameid as stateid
|
215
|
from (select distinct iso, name_1 from gadm2) gadm2
|
216
|
left join (
|
217
|
select distinct gadm2.iso,
|
218
|
name_1,
|
219
|
alternatenames.geonameid,
|
220
|
geonames.name
|
221
|
from (select distinct iso, name_1 from gadm2) gadm2
|
222
|
join alternatenames on name_1=alternatename
|
223
|
join geonames using (geonameid)
|
224
|
join countries on countrycode=countries.iso
|
225
|
where featurecode='ADM1'
|
226
|
and gadm2.iso=countries.iso3) foo
|
227
|
using (iso, name_1);
|
228
|
*/
|
229
|
|
230
|
-- how did we do?
|
231
|
|
232
|
-- country
|
233
|
|
234
|
-- percent of unique locations with asserted country names
|
235
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
236
|
FROM scrubbed WHERE country IS NOT NULL;
|
237
|
-- 87.0%
|
238
|
-- percent of unique locations with geoname-scrubbed country names
|
239
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
240
|
FROM scrubbed WHERE countryid IS NOT NULL;
|
241
|
-- 86.6%
|
242
|
-- percent of unique locations with gadm-matchedl country names
|
243
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
244
|
FROM scrubbed WHERE countryid IN (SELECT countryid FROM gadm_country_lookup);
|
245
|
-- 86.6%
|
246
|
|
247
|
-- stateprovince
|
248
|
|
249
|
-- note: 212221 records have a stateprovince but no country! those are
|
250
|
-- considered missing below
|
251
|
|
252
|
-- percent of unique locations with asserted stateprovince names
|
253
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
254
|
FROM scrubbed WHERE country IS NOT NULL AND stateprovince IS NOT NULL;
|
255
|
-- 82.5%
|
256
|
-- percent of unique locations with geoname-scrubbed stateprovince names
|
257
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
258
|
FROM scrubbed WHERE stateprovinceid IS NOT NULL;
|
259
|
-- 80.2%
|
260
|
-- percent of unique locations with gadm-matchedl stateprovince names
|
261
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
262
|
FROM scrubbed
|
263
|
WHERE stateprovinceid IN (SELECT stateprovinceid FROM gadm_stateprovince_lookup);
|
264
|
-- 79.5%
|
265
|
|
266
|
-- stateprovince
|
267
|
|
268
|
-- percent of unique locations with asserted county names
|
269
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
270
|
FROM scrubbed
|
271
|
WHERE country IS NOT NULL AND stateprovince IS NOT NULL AND county IS NOT NULL;
|
272
|
-- 16.2%
|
273
|
-- percent of unique locations with geoname-scrubbed county names
|
274
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
275
|
FROM scrubbed WHERE countyid IS NOT NULL;
|
276
|
-- 6.3%
|
277
|
-- percent of unique locations with gadm-matchedl county names
|
278
|
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
|
279
|
FROM scrubbed
|
280
|
WHERE countyid IN (SELECT countyid FROM gadm_county_lookup);
|
281
|
-- tbd
|
282
|
|
283
|
SELECT CASE WHEN countryid IS NOT NULL
|
284
|
THEN 'matched'
|
285
|
ELSE 'unmatched' END AS matches,
|
286
|
COUNT(*)
|
287
|
FROM vcountry
|
288
|
GROUP BY matches;
|
289
|
|
290
|
SELECT CASE WHEN stateprovinceid IS NOT NULL
|
291
|
THEN 'matched'
|
292
|
ELSE 'unmatched' END AS matches,
|
293
|
COUNT(*)
|
294
|
FROM vstate
|
295
|
GROUP BY matches;
|
296
|
-- matched | 1770
|
297
|
-- unmatched | 1541
|
298
|
|
299
|
SELECT CASE WHEN countyid IS NOT NULL
|
300
|
THEN 'matched'
|
301
|
ELSE 'unmatched' END AS matches,
|
302
|
COUNT(*)
|
303
|
FROM vcounty
|
304
|
GROUP BY matches;
|
305
|
-- matched | 4295
|
306
|
-- unmatched | 14354
|
307
|
|
308
|
SELECT CASE WHEN country IS NULL
|
309
|
THEN 'no country provided'
|
310
|
WHEN country IS NOT NULL AND countryid IS NULL
|
311
|
THEN 'unrecognized country'
|
312
|
ELSE
|
313
|
'scrubbed country'
|
314
|
END results,
|
315
|
COUNT(*)
|
316
|
FROM scrubbed
|
317
|
GROUP BY results;
|
318
|
|
319
|
SELECT CASE WHEN country IS NULL
|
320
|
THEN 'no fully qualified stateprovince provided'
|
321
|
WHEN country IS NOT NULL AND stateprovince IS NOT NULL AND stateprovinceid IS NULL
|
322
|
THEN 'unrecognized stateprovince'
|
323
|
ELSE
|
324
|
'scrubbed stateprovince'
|
325
|
END results,
|
326
|
COUNT(*)
|
327
|
FROM scrubbed
|
328
|
GROUP BY results;
|
329
|
|
330
|
-- file output
|
331
|
SELECT DISTINCT countryid, country,
|
332
|
stateprovinceid, stateprovince,
|
333
|
countyid, county
|
334
|
FROM scrubbed
|
335
|
ORDER BY country, stateprovince, county;
|
336
|
|
337
|
/*
|
338
|
-- look at the whole iso2 vs fips thing
|
339
|
SELECT DISTINCT vc.country, i.country iso, f.country fips, stateprovince
|
340
|
FROM vcountry vc
|
341
|
LEFT JOIN countries i ON vc.country=i.iso
|
342
|
LEFT JOIN countries f ON vc.country=f.fips
|
343
|
LEFT JOIN scrubbed s ON s.country=vc.country
|
344
|
WHERE vc.countryid IS NULL
|
345
|
AND i.country IS NOT NULL
|
346
|
AND f.country IS NOT NULL
|
347
|
ORDER BY vc.country;
|
348
|
*/
|