Project

General

Profile

« Previous | Next » 

Revision 11443

Initial checkin of geoscrub install SQL files.

Added install.*.sql files that will do initial table creation for all
required tables.
Added a truncate.vegbien_geoscrub.sql script that will clear tables related to
data downloaded in load-geoscrub-input.sh.
Moved all SQL statements that modify the final geoscrub table into the
geovalidate.sql script.
Modified the geonames.sql and geovalidate.sql scripts to assume all
tables have already been created by the install scripts.

View differences:

derived/biengeo/install.geonames.sql
1
BEGIN;
2

  
3
-- define custom function to pass characters back through latin-1
4
-- encoding to revert errors; in principle this could introduce new
5
-- errors, but it seems to do more good than harm
6
CREATE OR REPLACE FUNCTION u2L2u(text) RETURNS text AS $$
7
DECLARE
8
    string text;
9
BEGIN
10
  string :=
11
    convert_from(convert($1::bytea, 'UTF8', 'LATIN1'), 'UTF8');
12
  RETURN string;
13
EXCEPTION
14
  WHEN data_exception THEN
15
    RETURN $1;
16
END;
17
$$ LANGUAGE plpgsql;
18

  
19
CREATE TABLE geonames (
20
    geonameid serial primary key,
21
    name text,
22
    asciiname text,
23
    alternatenames text,
24
    latitude numeric,
25
    longitude numeric,
26
    featureclass char(1),
27
    featurecode text,
28
    countrycode char(2),
29
    cc2 char(60),
30
    admin1code text,
31
    admin2code text,
32
    admin3code text,
33
    admin4code text,
34
    population bigint,
35
    elevation int,
36
    dem int,
37
    timezone text,
38
    modification date
39
);
40

  
41
--
42
-- load geonames.org alternate names
43
--
44

  
45
CREATE TABLE alternateNames (
46
    alternateNameId serial primary key,
47
    geonameid int references geonames (geonameid),
48
    isolanguage varchar(7),
49
    alternateName varchar(200),
50
    isPreferredName char(1),
51
    isShortName char(1),
52
    isColloquial char(1),
53
    isHistoric char(1)
54
);
55

  
56
--
57
-- load geonames.org country info
58
--
59

  
60
CREATE TABLE countries (
61
    iso char(2),
62
    iso3 char(3),
63
    isonumeric text,
64
    fips text,
65
    country text UNIQUE,
66
    capital text,
67
    area text,
68
    population text,
69
    continent text,
70
    tld text,
71
    currencyCode text,
72
    currencyName text,
73
    phone text,
74
    postalCodeFormat text,
75
    postalCodeRegex text,
76
    languages text,
77
    geonameid int,
78
    neighbours text,
79
    equivalentFipsCode text
80
);
81

  
82
--
83
-- load geonames.org hierarchy info
84
--
85

  
86
CREATE TABLE hierarchy (
87
    parentId int,-- references geonames (geonameid),
88
    childId int,-- references geonames (geonameid),
89
    type text
90
);
91

  
92
--
93
-- tables for additional custom name-scrub mappings
94
--
95

  
96
-- table for custom country alternate names that we need
97
CREATE TABLE alt_country (
98
    country text references countries (country),
99
    alternatename text
100
);
101

  
102
-- table for custom stateprovince alternate names that we need
103
CREATE TABLE alt_stateprovince (
104
    country text references countries (country),
105
    stateprovince text,
106
    alternatename text
107
);
108

  
109
COMMIT;
110

  
derived/biengeo/geovalidate.sql
72 72
  Created Nov 2012
73 73
*/
74 74

  
75
BEGIN;
76

  
77
TRUNCATE geoscrub;
78

  
79
DROP INDEX IF EXISTS "geoscrub_geom_gist";
80
DROP INDEX IF EXISTS "geoscrub_geog_gist";
81
DROP INDEX IF EXISTS "geoscrub_countrystd_idx";
82
DROP INDEX IF EXISTS "geoscrub_stateprovincestd_idx";
83
DROP INDEX IF EXISTS "geoscrub_countystd_idx";
84

  
85
-----------------------------
86
-- put everything together --
87
-----------------------------
88

  
89
-- reconstitute the whole dang thang
90

  
91
-- populate geoscrub from vegbien_geoscrub data
92
INSERT INTO geoscrub
93
SELECT decimallatitude, decimallongitude,
94
       country, stateprovince, county,
95
       countryid, stateprovinceid, countyid,
96
       countrystd, stateprovincestd, countystd
97
  FROM vegbien_geoscrub v
98
  LEFT JOIN vcountry USING (country)
99
  LEFT JOIN vstate USING (countryid, stateprovince)
100
  LEFT JOIN vcounty USING (countryid, stateprovinceid, county);
101
-- SELECT 1707970
102
-- Time: 26172.154 ms
103

  
104
-- try to squeeze out a few more direct stateprovince mappings to gadm2
105
UPDATE geoscrub
106
  SET stateprovincestd = name_1
107
  FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm
108
  WHERE country IS NOT NULL
109
    AND stateprovince IS NOT NULL
110
    AND countrystd = name_0
111
    AND stateprovincestd IS NULL
112
    AND stateprovince = name_1;
113
-- UPDATE 0
114
-- Time: 3630.973 ms
115

  
116
-- try to squeeze out a few more direct county mappings to gadm2
117
UPDATE geoscrub
118
  SET countystd = name_2
119
  FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm
120
  WHERE country IS NOT NULL
121
    AND stateprovince IS NOT NULL
122
    AND county IS NOT NULL
123
    AND countrystd = name_0
124
    AND stateprovincestd = name_1
125
    AND countystd IS NULL
126
    AND county = name_2;
127
-- UPDATE 69
128
-- Time: 3982.715 ms
129

  
130
COMMIT;
131

  
75 132
-- generate point geometries from coordinates
76
SELECT AddGeometryColumn('public', 'geoscrub', 'geom', 4326, 'POINT', 2 );
77 133
UPDATE geoscrub
78 134
   SET geom = ST_SetSRID(ST_Point(decimallongitude, decimallatitude), 4326);
79 135
-- ... create point geographies too ...
80 136
-- note that this excludes (leaves null) any occurrences with coordinates
81 137
-- that are not valid decimal degrees
82
ALTER TABLE geoscrub ADD COLUMN geog geography(POINT, 4326);
83 138
UPDATE geoscrub
84 139
  SET geog = geom::geography
85 140
  WHERE geom && ST_MakeBox2D(ST_Point(-180, -90), ST_Point(180, 90));
......
103 158

  
104 159
VACUUM ANALYZE geoscrub;
105 160

  
106
ALTER TABLE geoscrub ADD COLUMN latlonvalidity int;
107

  
108 161
-- -1 if missing one or both coordinates
109 162
UPDATE geoscrub o
110 163
    SET latlonvalidity = -1
......
129 182

  
130 183
VACUUM ANALYZE geoscrub;
131 184

  
132
ALTER TABLE geoscrub ADD COLUMN countryvalidity int;
133

  
134 185
-- -1 if missing country name
135 186
UPDATE geoscrub o
136 187
    SET countryvalidity = -1
......
203 254

  
204 255
VACUUM ANALYZE geoscrub;
205 256

  
206
ALTER TABLE geoscrub ADD COLUMN stateprovincevalidity int;
207

  
208 257
-- -1 if missing stateprovince name
209 258
UPDATE geoscrub o
210 259
    SET stateprovincevalidity = -1
......
271 320
-- validate county/parish --
272 321
-----------------------------
273 322

  
274
ALTER TABLE geoscrub ADD COLUMN countyvalidity int;
275

  
276 323
-- -1 if missing county name
277 324
UPDATE geoscrub o
278 325
    SET countyvalidity = -1
......
333 380
-- Look up GADM place names based purely on coordinates for all points
334 381
--
335 382

  
336
ALTER TABLE geoscrub ADD COLUMN pointcountry text;
337
ALTER TABLE geoscrub ADD COLUMN pointstateprovince text;
338
ALTER TABLE geoscrub ADD COLUMN pointcounty text;
339 383
-- note: only using simplified geometry now, for speed reasons
340 384
UPDATE geoscrub o
341 385
    SET pointcountry = c.name_0,
derived/biengeo/install.geonames-to-gadm.sql
1
BEGIN;
2

  
3
-----------------------
4
-- Level 0 (Country) --
5
-----------------------
6

  
7
-- maps gadm2 level0 to geonames countries
8
CREATE TABLE gadm_country_lookup (
9
        countryid integer primary key references geonames (geonameid),
10
        name_0 text
11
);
12

  
13
------------------------------
14
-- Level 1 (State/Province) --
15
------------------------------
16

  
17
-- maps gadm2 level1 to geonames ADM1
18
CREATE TABLE gadm_stateprovince_lookup (
19
        stateprovinceid integer primary key references geonames (geonameid),
20
        name_0 text,
21
        name_1 text
22
);
23

  
24
/*
25
-- another way to create gadm_stateprovince_lookup, though this won't
26
-- pick up multiple matches from geonames (good or bad??)
27
CREATE TABLE gadm_stateprovince_lookup AS
28
SELECT DISTINCT name_0, name_1
29
  FROM gadm2;
30
ALTER TABLE gadm_stateprovince_lookup ADD COLUMN stateprovinceid integer;
31
UPDATE gadm_stateprovince_lookup gs
32
  SET stateprovinceid = g.geonameid
33
  FROM gadm_country_lookup gc,
34
       alternatenames a,
35
       geonames g,
36
       countries c
37
  WHERE gs.name_0=gc.name_0
38
    AND gs.name_1=a.alternatename
39
    AND a.geonameid=g.geonameid
40
    AND g.countrycode=c.iso
41
    AND gc.countryid=c.geonameid
42
    AND g.featurecode='ADM1';
43
-- UPDATE 2137
44
-- Time: 600.345 ms
45
*/
46

  
47
----------------------
48
-- Level 2 (County) --
49
----------------------
50

  
51
-- maps gadm2 level2 to geonames ADM2
52
CREATE TABLE gadm_county_lookup (
53
        countyid integer primary key references geonames (geonameid),
54
        name_0 text,
55
        name_1 text,
56
        name_2 text
57
);
58

  
59
COMMIT;
60

  
derived/biengeo/truncate.vegbien_geoscrub.sql
1
BEGIN;
2

  
3
TRUNCATE vegbien_geoscrub;
4
TRUNCATE vcountry;
5
TRUNCATE vstate;
6
TRUNCATE vcounty;
7

  
8
COMMIT;
9

  
derived/biengeo/geonames.sql
39 39
  Created Nov 2012
40 40
*/
41 41

  
42
-- define custom function to pass characters back through latin-1
43
-- encoding to revert errors; in principle this could introduce new
44
-- errors, but it seems to do more good than harm
45
CREATE OR REPLACE FUNCTION u2L2u(text) RETURNS text AS $$
46
DECLARE
47
    string text;
48
BEGIN
49
  string :=
50
    convert_from(convert($1::bytea, 'UTF8', 'LATIN1'), 'UTF8');
51
  RETURN string;
52
EXCEPTION
53
  WHEN data_exception THEN
54
    RETURN $1;
55
END;
56
$$ LANGUAGE plpgsql;
42
BEGIN;
57 43

  
58 44
-------------------
59 45
-- scrub country --
60 46
-------------------
61 47

  
62
CREATE TABLE vcountry AS
48
INSERT INTO vcountry
63 49
  SELECT DISTINCT country
64 50
    FROM vegbien_geoscrub
65 51
    WHERE country IS NOT NULL;
66 52
-- SELECT 439
67 53
-- Time: 862.413 ms
68
ALTER TABLE vcountry ADD COLUMN countryutf8 text;
69
ALTER TABLE vcountry ADD COLUMN countryid integer;
70 54

  
71 55
-- try to clean up character representations
72 56
UPDATE vcountry
......
156 140
-- Removes: Congo, CONGO
157 141

  
158 142
-- merge in GADM country names (our standard names)
159
ALTER TABLE vcountry ADD COLUMN countrystd text;
160 143
UPDATE vcountry vc
161 144
  SET countrystd = name_0
162 145
  FROM gadm_country_lookup gadm0
......
169 152
-- scrub stateprovince --
170 153
-------------------------
171 154

  
172
CREATE TABLE vstate AS
155
INSERT INTO vstate
173 156
  SELECT DISTINCT countryid, stateprovince
174 157
  FROM vegbien_geoscrub join vcountry using (country)
175 158
  WHERE countryid IS NOT NULL
176 159
    AND stateprovince IS NOT NULL;
177 160
-- SELECT 3312
178 161
-- Time: 4164.948 ms
179
ALTER TABLE vstate ADD COLUMN stateprovinceutf8 text;
180
ALTER TABLE vstate ADD COLUMN stateprovinceid integer;
181 162

  
182 163
-- try to clean up character representations
183 164
UPDATE vstate
......
294 275
-- todo: the same stuff done for country to deal with Congo-type case
295 276

  
296 277
-- merge in GADM stateprovince names (our standard names)
297
ALTER TABLE vstate ADD COLUMN stateprovincestd text;
298 278
UPDATE vstate vs
299 279
  SET stateprovincestd = name_1
300 280
  FROM gadm_stateprovince_lookup gadm1
......
352 332
-- scrub county --
353 333
------------------
354 334

  
355
CREATE TABLE vcounty AS
335
INSERT INTO vcounty
356 336
  SELECT DISTINCT countryid, stateprovinceid, county
357 337
  FROM vegbien_geoscrub
358 338
  JOIN vcountry USING (country)
......
362 342
    AND county IS NOT NULL;
363 343
-- SELECT 18715
364 344
-- Time: 3590.725 ms
365
ALTER TABLE vcounty ADD COLUMN countyutf8 text;
366
ALTER TABLE vcounty ADD COLUMN countyid integer;
367 345

  
368 346
-- try to clean up character representations
369 347
UPDATE vcounty
......
413 391
-- UPDATE 0
414 392
-- Time: 11255.926 ms
415 393

  
416
ALTER TABLE vcounty ADD COLUMN countystd text;
417

  
418 394
-- merge in GADM county names (our standard names)
419 395
-- for cases where we have geonameid
420 396
UPDATE vcounty vs
......
440 416
-- UPDATE 5027
441 417
-- Time: 1410.377 ms
442 418

  
443
-----------------------------
444
-- put everything together --
445
-----------------------------
419
COMMIT;
446 420

  
447
-- reconstitute the whole dang thang
448

  
449
-- new approach
450
CREATE TABLE geoscrub AS
451
SELECT decimallatitude, decimallongitude,
452
       country, stateprovince, county,
453
       countryid, stateprovinceid, countyid,
454
       countrystd, stateprovincestd, countystd
455
  FROM vegbien_geoscrub v
456
  LEFT JOIN vcountry USING (country)
457
  LEFT JOIN vstate USING (countryid, stateprovince)
458
  LEFT JOIN vcounty USING (countryid, stateprovinceid, county);
459
-- SELECT 1707970
460
-- Time: 26172.154 ms
461

  
462
-- try to squeeze out a few more direct stateprovince mappings to gadm2
463
UPDATE geoscrub
464
  SET stateprovincestd = name_1
465
  FROM (SELECT DISTINCT name_0, name_1 FROM gadm2) gadm
466
  WHERE country IS NOT NULL
467
    AND stateprovince IS NOT NULL
468
    AND countrystd = name_0
469
    AND stateprovincestd IS NULL
470
    AND stateprovince = name_1;
471
-- UPDATE 0
472
-- Time: 3630.973 ms
473

  
474
-- try to squeeze out a few more direct county mappings to gadm2
475
UPDATE geoscrub
476
  SET countystd = name_2
477
  FROM (SELECT DISTINCT name_0, name_1, name_2 FROM gadm2) gadm
478
  WHERE country IS NOT NULL
479
    AND stateprovince IS NOT NULL
480
    AND county IS NOT NULL
481
    AND countrystd = name_0
482
    AND stateprovincestd = name_1
483
    AND countystd IS NULL
484
    AND county = name_2;
485
-- UPDATE 69
486
-- Time: 3982.715 ms
derived/biengeo/load-geoscrub-input.sh
18 18

  
19 19
# Note, to force data to download from DATA_URL, ensure the DATAFILE is deleted
20 20
# before running this script.
21

  
22
SCRIPT_DIR=$(dirname $0)
21 23
DATA_URL="http://fs.vegpath.org/exports/geoscrub_input.no_header.cols=country,stateProvince,county,decimalLatitude,decimalLongitude.csv"
22
DATADIR="$(dirname $0)/input"
24
DATADIR="${SCRIPT_DIR}/input"
23 25
DATAFILE="${DATADIR}/geoscrub-corpus.csv"
24 26

  
25 27
if [[ ! -d "$DATADIR" ]]; then
......
42 44
    fi
43 45
fi
44 46

  
45
# generate table
46
psql -c \
47
   'CREATE TABLE vegbien_geoscrub (
48
        country text,
49
        stateProvince text,
50
        county text,
51
        decimalLatitude double precision,
52
        decimalLongitude double precision
53
    )' geoscrub
47
echo "Loading vegbien data from ${DATAFILE}"
54 48

  
49
# clear previous data
50
psql -e -d geoscrub --set ON_ERROR_STOP=1 < "${SCRIPT_DIR}/truncate.vegbien_geoscrub.sql"
51
if [[ $? != 0 ]]; then
52
    echo "Could not clear data from vegbien_geoscrub tables."
53
    exit 1
54
fi
55

  
55 56
# load
56 57
psql -c "COPY vegbien_geoscrub FROM '${DATAFILE}' WITH CSV" geoscrub
57 58

  
derived/biengeo/install.geoscrub.sql
1
/*
2
  SQL statements to create tables for country name, stateprovince name, and
3
  county name scrubbing on vegbien location data. Currently hard-coded
4
  to operate on a 'vegbien_geoscrub' table that contains the input for
5
  geoscrubbing, with the following columns:
6
     decimallatitude  (latitude, WGS84 decimal degrees)
7
     decimallongitude (longitude, WGS84 decimal degrees)
8
     country          (original asserted country name)
9
     stateprovince    (original asserted stateprovince name)
10
     county           (original asserted county name)
11
  (Note that the lat/lon columns are not needed for scrubbing, but are
12
  used later for geovalidation.)
13

  
14
  Jim Regetz
15
  NCEAS
16
  Created Nov 2012
17
*/
18

  
19
BEGIN;
20

  
21
CREATE TABLE geoscrub (
22
    decimallatitude double precision,
23
    decimallongitude double precision,
24
    country text,
25
    stateprovince text,
26
    county text,
27
    countryid integer,
28
    stateprovinceid integer,
29
    countyid integer,
30
    countrystd text,
31
    stateprovincestd text,
32
    countystd text,
33
    latlonvalidity int,
34
    countryvalidity int,
35
    stateprovincevalidity int,
36
    countyvalidity int,
37
    pointcountry text,
38
    pointstateprovince text,
39
    pointcounty text
40
);
41
SELECT AddGeometryColumn('public', 'geoscrub', 'geom', 4326, 'POINT', 2 );
42
ALTER TABLE geoscrub ADD COLUMN geog geography(POINT, 4326);
43

  
44
CREATE TABLE vegbien_geoscrub (
45
    country text,
46
    stateprovince text,
47
    county text,
48
    decimalLatitude double precision,
49
    decimalLongitude double precision
50
);
51

  
52
CREATE TABLE vcountry (
53
    country text,
54
    countryutf8 text,
55
    countryid integer,
56
    countrystd text
57
);
58

  
59
CREATE TABLE vstate (
60
    countryid integer,
61
    stateProvince text,
62
    stateprovinceutf8 text,
63
    stateprovinceid integer,
64
    stateprovincestd text
65
);
66

  
67
CREATE TABLE vcounty (
68
    countryid integer,
69
    stateprovinceid integer,
70
    county text,
71
    countyutf8 text,
72
    countyid integer,
73
    countystd text
74
);
75

  
76
COMMIT;
77

  

Also available in: Unified diff