Project

General

Profile

1 10707 aaronmk
/*
2
3
  Misc notes on how well we did scrubbing names. And lots of misc SQL
4
  statements for examining various scrubbing outcomes, some of which may
5
  not be so relevant anymore as a consequences of other changes.
6
7
Provisional (and not necessarily fully updated) summary of geoname
8
scrubbing performance on vegbien_geoscrub
9
(= geoscrub_input from vegbien schema public.2012-11-04-07-34-10.r5984)
10
========================================================================
11
1707970 total unique location determinations
12
  32541 distinct on country, stateprovince, county (including all nulls)
13
========================================================================
14
* Country *
15
1485885 assertions
16
1479338 mapped to geonames
17
        (6547 not mapped to geonames)
18
1479338 mapped to gadm2
19
        (6547 not mapped to gadm2)
20
        (0 cases of failed mapping to gadm2 name_0)
21
1479338 mapped both to geonames and to gadm2
22
========================================================================
23
* Country with State/Province *
24
1408955 assertions
25
1388525 mapped to geonames
26
        (20430 not mapped to geonames)
27
        (5576 of these had unmapped country)
28
1389239 mapped to gadm2
29
        (19716 not mapped to gadm2)
30
        (5576 of these had unmapped country)
31
1387915 mapped both to geonames and to gadm2
32
========================================================================
33
* Country with State/Province with County/Parish*
34
 276809 assertions
35
 145767 mapped to geonames
36
 213795 mapped to gadm2
37
 140983 mapped both to geonames and to gadm2
38
========================================================================
39
40
========================================================================
41
Distinct country names:
42
439 total unique country names (non-null)
43
377 recognized unique country names (non-null)
44
 62 unrecognized unique country names (non-null)
45
========================================================================
46
Distinct country+stateprovince names:
47
4764 total unique country and stateprovince (stateprovince non-null)
48
4704 total unique country and stateprovince (both non-null)
49
2944 mapped to geonames
50
     (1760 not mapped to geonames)
51
2950 mapped to gadm2
52
     (1754 not mapped to gadm2)
53
2885 mapped both to geonames and to gadm2
54
     (1819 not mapped to both)
55
------------------------------------------------------------------------
56
* Distinct mappedcountry+stateprovince names *
57
3312 total unique
58
1895 mapped to geonames
59
     (1417 not mapped to geonames)
60
1901 mapped to gadm2
61
     (1411 not mapped to gadm2)
62
========================================================================
63
Distinct country+stateprovince+county names:
64
27440 total unique country and stateprovince and county (county non-null)
65
25764 total unique country and stateprovince and county (all non-null)
66
8507 mapped to geonames
67
     (17257 not mapped to geonames)
68
12610 mapped to gadm2
69
     (13154 not mapped to gadm2)
70
4917 mapped both to geonames and to gadm2
71
     (20847 not mapped to both)
72
------------------------------------------------------------------------
73
* Distinct mappedcountry+mappedstateprovince+county names *
74
18715 total unique
75
6673 mapped to geonames
76
     (12042 not mapped to geonames)
77
11310 mapped to gadm2
78
     (7405 not mapped to gadm2)
79
========================================================================
80
81
82
for comparison, here are bien2 stats based on boyle/donoghue scrubbing:
83
753302 total unique location determinations
84
 have asserted country
85
745321 have scrubbed country
86
 have asserted country and stateprovince
87
 have scrubbed country and stateprovince
88
 have asserted country and stateprovince and county
89
 have scrubbed country and stateprovince and county
90
91
  Jim Regetz
92
  NCEAS
93
  Created Nov 2012
94
*/
95
96
-- Generate human-readable view of all distinct cases where we haven't
97
-- obtained a match for the asserted country
98
CREATE TEMP VIEW unmatched_country AS
99
SELECT count(*), country FROM
100
  (SELECT
101
      CASE WHEN length(countryutf8)>4
102
           THEN initcap(countryutf8)
103
           ELSE countryutf8
104
       END AS country
105
    FROM scrubbed
106
    JOIN vcountry USING (country)
107
    WHERE scrubbed.country IS NOT NULL
108
      AND scrubbed.countryid IS NULL) summary
109
  GROUP BY country
110
  ORDER BY count DESC;
111
-- Generate human-readable view of all distinct cases where we have
112
-- obtained match for the asserted country, but not stateprovince
113
CREATE TEMP VIEW unmatched_stateprovince AS
114
SELECT count(*), country, stateprovince FROM
115
  (SELECT countries.country AS country,
116
      CASE WHEN length(stateprovinceutf8)>3
117
           THEN initcap(stateprovinceutf8)
118
           ELSE stateprovinceutf8
119
       END AS stateprovince
120
    FROM scrubbed
121
    LEFT JOIN vstate USING (countryid, stateprovince)
122
    LEFT JOIN countries ON vstate.countryid=geonameid
123
    WHERE scrubbed.stateprovince IS NOT NULL
124
      AND scrubbed.stateprovinceid IS NULL
125
      AND scrubbed.country IS NOT NULL) summary
126
  GROUP BY country, stateprovince
127
  ORDER BY count DESC;
128
/*
129
SELECT DISTINCT countries.country,
130
    CASE WHEN length(stateprovinceutf8)>3
131
         THEN initcap(stateprovinceutf8)
132
         ELSE stateprovinceutf8
133
     END AS stateprovince
134
  FROM scrubbed
135
  JOIN vstate USING (countryid, stateprovince)
136
  JOIN countries ON vstate.countryid=geonameid
137
  WHERE scrubbed.stateprovince IS NOT NULL
138
    AND scrubbed.stateprovinceid IS NULL
139
  ORDER BY country, stateprovince;
140
*/
141
142
-- Generate human-readable view of all distinct cases where we have
143
-- obtained match for the asserted country and stateprovince, but not
144
-- county
145
/*
146
CREATE TEMP VIEW unmatched_county AS
147
SELECT DISTINCT countries.country,
148
    CASE WHEN length(stateprovinceutf8)>3
149
         THEN initcap(stateprovinceutf8)
150
         ELSE stateprovinceutf8
151
     END AS stateprovince
152
  FROM scrubbed
153
  JOIN vstate USING (countryid, stateprovince)
154
  JOIN countries ON vstate.countryid=geonameid
155
  WHERE scrubbed.stateprovince IS NOT NULL
156
    AND scrubbed.stateprovinceid IS NULL
157
  ORDER BY country, stateprovince;
158
*/
159
160
/*
161
-- which states aren't in gadm2?
162
select distinct iso3, s.name state
163
      from scrubbed
164
      join countries c on countryid=c.geonameid
165
      join geonames s on stateprovinceid=s.geonameid
166
except select iso, name_1 state from gadm2;
167
168
select country, count(*)
169
  from scrubbed s
170
  left join gadm_stateprovince_lookup using (stateprovinceid)
171
  where name_0 is null
172
    and stateprovinceid is not null
173
  group by country
174
  order by count desc;
175
176
177
select name
178
  from gadm2
179
  join alternatenames on name_1=alternatename
180
  join geonames using (geonameid)
181
  join countries on countrycode=countries.iso
182
  where featurecode='ADM1'
183
    and name_1='North Solomons';
184
*/
185
186
187
/*
188
-- report distinct geonames-scrubbed country+stateprovince pairs that
189
-- we didn't match to any gadm polygons
190
SELECT DISTINCT c.country, g.name
191
 FROM countries c
192
 JOIN vstate v ON geonameid=countryid
193
 JOIN geonames g ON g.geonameid=v.stateprovinceid
194
 WHERE stateprovinceid NOT IN (
195
           SELECT stateprovinceid FROM gadm_stateprovince_lookup
196
       )
197
 ORDER BY c.country, g.name;
198
199
-- report failed stateprovince mappings from gadm2 to geonames
200
SELECT countries.country AS countryname,
201
       name AS stateprovincename,
202
       count(*)
203
  FROM scrubbed
204
  LEFT JOIN gadm_stateprovince_lookup USING (stateprovinceid)
205
  LEFT JOIN countries ON countryid=countries.geonameid
206
  LEFT JOIN geonames ON stateprovinceid=geonames.geonameid
207
  WHERE stateprovinceid IS NOT NULL
208
    AND name_1 IS NULL
209
  GROUP BY countryname, stateprovincename
210
  ORDER BY countryname, stateprovincename;
211
*/
212
213
/*
214
select iso, name_1, geonameid as stateid
215
  from (select distinct iso, name_1 from gadm2) gadm2
216
  left join (
217
      select distinct gadm2.iso,
218
             name_1,
219
             alternatenames.geonameid,
220
             geonames.name
221
        from (select distinct iso, name_1 from gadm2) gadm2
222
        join alternatenames on name_1=alternatename
223
        join geonames using (geonameid)
224
        join countries on countrycode=countries.iso
225
        where featurecode='ADM1'
226
          and gadm2.iso=countries.iso3) foo
227
    using (iso, name_1);
228
*/
229
230
-- how did we do?
231
232
-- country
233
234
-- percent of unique locations with asserted country names
235
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
236
  FROM scrubbed WHERE country IS NOT NULL;
237
-- 87.0%
238
-- percent of unique locations with geoname-scrubbed country names
239
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
240
  FROM scrubbed WHERE countryid IS NOT NULL;
241
-- 86.6%
242
-- percent of unique locations with gadm-matchedl country names
243
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
244
  FROM scrubbed WHERE countryid IN (SELECT countryid FROM gadm_country_lookup);
245
-- 86.6%
246
247
-- stateprovince
248
249
-- note: 212221 records have a stateprovince but no country! those are
250
--       considered missing below
251
252
-- percent of unique locations with asserted stateprovince names
253
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
254
  FROM scrubbed WHERE country IS NOT NULL AND stateprovince IS NOT NULL;
255
-- 82.5%
256
-- percent of unique locations with geoname-scrubbed stateprovince names
257
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
258
  FROM scrubbed WHERE stateprovinceid IS NOT NULL;
259
-- 80.2%
260
-- percent of unique locations with gadm-matchedl stateprovince names
261
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
262
  FROM scrubbed
263
  WHERE stateprovinceid IN (SELECT stateprovinceid FROM gadm_stateprovince_lookup);
264
-- 79.5%
265
266
-- stateprovince
267
268
-- percent of unique locations with asserted county names
269
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
270
  FROM scrubbed
271
  WHERE country IS NOT NULL AND stateprovince IS NOT NULL AND county IS NOT NULL;
272
-- 16.2%
273
-- percent of unique locations with geoname-scrubbed county names
274
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
275
  FROM scrubbed WHERE countyid IS NOT NULL;
276
-- 6.3%
277
-- percent of unique locations with gadm-matchedl county names
278
SELECT ROUND(100*COUNT(*)/(SELECT COUNT(*)::numeric FROM scrubbed), 1) pct
279
  FROM scrubbed
280
  WHERE countyid IN (SELECT countyid FROM gadm_county_lookup);
281
-- tbd
282
283
SELECT CASE WHEN countryid IS NOT NULL
284
            THEN 'matched'
285
            ELSE 'unmatched' END AS matches,
286
       COUNT(*)
287
  FROM vcountry
288
  GROUP BY matches;
289
290
SELECT CASE WHEN stateprovinceid IS NOT NULL
291
            THEN 'matched'
292
            ELSE 'unmatched' END AS matches,
293
       COUNT(*)
294
  FROM vstate
295
  GROUP BY matches;
296
--  matched   |  1770
297
--  unmatched |  1541
298
299
SELECT CASE WHEN countyid IS NOT NULL
300
            THEN 'matched'
301
            ELSE 'unmatched' END AS matches,
302
       COUNT(*)
303
  FROM vcounty
304
  GROUP BY matches;
305
--  matched   |  4295
306
--  unmatched | 14354
307
308
SELECT CASE WHEN country IS NULL
309
                 THEN 'no country provided'
310
            WHEN country IS NOT NULL AND countryid IS NULL
311
                 THEN 'unrecognized country'
312
            ELSE
313
                 'scrubbed country'
314
            END results,
315
       COUNT(*)
316
  FROM scrubbed
317
  GROUP BY results;
318
319
SELECT CASE WHEN country IS NULL
320
                 THEN 'no fully qualified stateprovince provided'
321
            WHEN country IS NOT NULL AND stateprovince IS NOT NULL AND stateprovinceid IS NULL
322
                 THEN 'unrecognized stateprovince'
323
            ELSE
324
                 'scrubbed stateprovince'
325
            END results,
326
       COUNT(*)
327
  FROM scrubbed
328
  GROUP BY results;
329
330
-- file output
331
SELECT DISTINCT countryid, country,
332
    stateprovinceid, stateprovince,
333
    countyid, county
334
  FROM scrubbed
335
  ORDER BY country, stateprovince, county;
336
337
/*
338
-- look at the whole iso2 vs fips thing
339
SELECT DISTINCT vc.country, i.country iso, f.country fips, stateprovince
340
  FROM vcountry vc
341
  LEFT JOIN countries i ON vc.country=i.iso
342
  LEFT JOIN countries f ON vc.country=f.fips
343
  LEFT JOIN scrubbed s ON s.country=vc.country
344
  WHERE vc.countryid IS NULL
345
    AND i.country IS NOT NULL
346
    AND f.country IS NOT NULL
347
    ORDER BY vc.country;
348
*/