Project

General

Profile

1 10707 aaronmk
# Script to geoscrub-enable a postgres database, developed in the
2
# context of BIEN3 geoscrubbing but with intentions of generality.
3
#
4
# Basic workflow:
5
#  1. Import table dumps obtained from geonames.org
6
#  2. Add additional custom mapping info, including:
7
#     a. Madagascar provinces used in GADM2 but not geonames
8
#     b. extra country name mappings/fixes specific to BIEN3 data
9
#     c. extra state/prov name mappings/fixes specific to BIEN3 data
10
#     d. custom state/prov name mappings binding GADM2 to geonames
11
#
12
# Notes:
13
#  * One of the INSERT statements below contains the string 'Hawai\`i'.
14
#    The backslash is there to escape the backtick for bash, so that
15
#    what gets inserted into the database is 'Hawai`i'. If manually
16
#    copy-and-pasting the statement into psql rather than running as a
17
#    script (i.e., not having bash interpret the command), the backslash
18
#    must not be included.
19
#
20
# Jim Regetz
21
# NCEAS
22
# Created Nov 2012
23
24
# !! this is specific to regetz's testing environment !!
25
DATADIR="$HOME/biengeo/geonames"
26
cd ${DATADIR}
27
28
#
29
# load geonames.org geonames
30
#
31
32
wget http://download.geonames.org/export/dump/allCountries.zip
33
unzip allCountries.zip
34
35
psql -c \
36
   'CREATE TABLE geonames (
37
        geonameid serial primary key,
38
        name text,
39
        asciiname text,
40
        alternatenames text,
41
        latitude numeric,
42
        longitude numeric,
43
        featureclass char(1),
44
        featurecode text,
45
        countrycode char(2),
46
        cc2 char(60),
47
        admin1code text,
48
        admin2code text,
49
        admin3code text,
50
        admin4code text,
51
        population bigint,
52
        elevation int,
53
        dem int,
54
        timezone text,
55
        modification date
56
    )' geoscrub
57
58
# import data
59
psql -c "COPY geonames FROM '${DATADIR}/allCountries.txt' DELIMITER E'\t' NULL ''" geoscrub
60
psql -c \ "SELECT setval('geonames_geonameid_seq', (SELECT max(geonameid) FROM geonames))" geoscrub
61
psql -c \ "CREATE INDEX geonames_countrycode_idx ON geonames (countrycode)" geoscrub
62 11346 psarando
psql -c "CREATE INDEX geonames_featurecode_idx ON geonames (featurecode)" geoscrub
63
psql -c "CREATE INDEX geonames_name_idx ON geonames (name)" geoscrub
64 10707 aaronmk
65
#
66
# load geonames.org alternate names
67
#
68
69
wget http://download.geonames.org/export/dump/alternateNames.zip
70
unzip alternateNames.zip
71
72
psql -c \
73
   'CREATE TABLE alternateNames (
74
        alternateNameId serial primary key,
75
        geonameid int references geonames (geonameid),
76
        isolanguage varchar(7),
77
        alternateName varchar(200),
78
        isPreferredName char(1),
79
        isShortName char(1),
80
        isColloquial char(1),
81
        isHistoric char(1)
82
    )' geoscrub
83
84
# import data
85
psql -c "COPY alternateNames FROM '${DATADIR}/alternateNames.txt' DELIMITER E'\t' NULL ''" geoscrub
86
psql -c \ "SELECT setval('alternatenames_alternatenameid_seq', (SELECT max(alternatenameid) FROM alternatenames))" geoscrub
87
# 4m45.368s
88
89
# index alternatenames
90
psql -c 'CREATE INDEX "alternatenames_alternatename_idx" ON alternateNames (alternateName)' geoscrub
91
# 10m42.827s
92 11346 psarando
psql -c 'CREATE INDEX "alternatenames_geonameid_idx" ON alternateNames (geonameid)' geoscrub
93 10707 aaronmk
94
#
95
# load geonames.org country info
96
#
97
98
psql -c \
99
   'CREATE TABLE countries (
100
        iso char(2),
101
        iso3 char(3),
102
        isonumeric text,
103
        fips text,
104
        country text UNIQUE,
105
        capital text,
106
        area text,
107
        population text,
108
        continent text,
109
        tld text,
110
        currencyCode text,
111
        currencyName text,
112
        phone text,
113
        postalCodeFormat text,
114
        postalCodeRegex text,
115
        languages text,
116
        geonameid int,
117
        neighbours text,
118
        equivalentFipsCode text
119
    )' geoscrub
120
121
# import geonames country table
122
grep -ve '^#' $DATADIR/countryInfo.txt | \
123
  psql -c "COPY countries FROM STDIN DELIMITER E'\t' NULL ''" geoscrub
124
125
#
126
# load geonames.org hierarchy info
127
#
128
129
wget http://download.geonames.org/export/dump/hierarchy.zip
130
unzip hierarchy.zip
131
132
psql -c \
133
   'CREATE TABLE hierarchy (
134
        parentId int,-- references geonames (geonameid),
135
        childId int,-- references geonames (geonameid),
136
        type text
137
    )' geoscrub
138
139
# import geonames country table
140
psql -c "COPY hierarchy FROM '${DATADIR}/hierarchy.txt' DELIMITER E'\t' NULL ''" geoscrub
141 11346 psarando
psql -c "CREATE INDEX hierarchy_parentid_idx ON hierarchy (parentId)" geoscrub
142
psql -c "CREATE INDEX hierarchy_childid_idx ON hierarchy (childId)" geoscrub
143 10707 aaronmk
144
#
145
# insert additional custom name-scrub mappings
146
#
147
148
# augment geonames with some additional places we need, and update
149
# hierarchy table accordingly
150
psql -c \
151
"WITH newnames AS (
152
  INSERT INTO geonames
153
    (name, alternatenames, featurecode, countrycode )
154
    VALUES
155
    ('Antananarivo', NULL, 'ADM1', 'MG'),
156
    ('Antsiranana', NULL, 'ADM1', 'MG'),
157
    ('Fianarantsoa', NULL, 'ADM1', 'MG'),
158
    ('Mahajanga', NULL, 'ADM1', 'MG'),
159
    ('Toamasina', NULL, 'ADM1', 'MG'),
160
    ('Toliara', 'Toliary', 'ADM1', 'MG')
161
  RETURNING geonameid, countrycode
162
)
163
INSERT INTO hierarchy (parentid, childid)
164
  SELECT c.geonameid AS parentid,
165
         n.geonameid AS childid
166
    FROM newnames n,
167
         countries c
168
    WHERE n.countrycode = c.iso;
169
" geoscrub
170
171
# add some custom country alternate names that we need
172
psql -c "
173
CREATE TABLE alt_country (
174
    country text references countries (country),
175
    alternatename text
176
);
177
INSERT INTO alt_country
178
    (country, alternatename)
179
    VALUES
180
    ('Central African Republic', 'Central African Repu')
181
  , ('British Virgin Islands', 'Virgin Islands (U.K.)')
182
  , ('Democratic Republic of the Congo', 'Congo, The Democratic Republic of the')
183
  , ('Falkland Islands', 'Falkland Isl')
184
  , ('Falkland Islands', 'Falkland Islands (Malvinas)')
185
  , ('Faroe Islands', 'Faeroe Isl.')
186
  , ('French Guiana', 'Guyana Francesa')
187
  , ('Iran', 'Iran, Islamic Republic of')
188
  , ('Mexico', E'Mexico\rMexico')
189
  , ('Nicaragua', 'Ncaragua')
190
  , ('Norfolk Island', 'Norfolk Isl')
191
  , ('North Korea', 'Korea, Democratic People''s Republic of')
192
  , ('Solomon Islands', 'Solomon Isl')
193
  , ('South Georgia and the South Sandwich Islands', 'South Georgia and the South Sand')
194
  , ('South Korea', 'Korea (Republic of)')
195
  , ('South Korea', 'Korea, Republic of')
196
  , ('U.S. Virgin Islands', 'US Virgin Islands')
197
  , ('U.S. Virgin Islands', 'Virgin Islands (U.S.)')
198
  , ('United States', 'ESTADOS UNIDOS DE AMERICA')
199
  , ('United States', 'EUA')
200
  , ('Vietnam', 'VIETNAM [Socialist Republic of V')
201
;" geoscrub
202
203
# add some custom stateprovince alternate names that we need
204
psql -c "
205
CREATE TABLE alt_stateprovince (
206
    country text references countries (country),
207
    stateprovince text,
208
    alternatename text
209
);
210
INSERT INTO alt_stateprovince
211
    (country, stateprovince, alternatename)
212
    VALUES
213
    ('Canada', 'British Columbia', 'B.C.')
214
  , ('Canada', 'Newfoundland and Labrador', 'Newfoundland & Labrador')
215
  , ('Canada', 'Newfoundland and Labrador', 'New Foundland And Labrador')
216
  , ('Canada', 'Newfoundland and Labrador', 'Labrador & Newfoundland')
217
  , ('Canada', 'Northwest Territories', 'North West Territories')
218
  , ('Canada', 'Northwest Territories', 'Northwest Territorie')
219
  , ('Canada', 'Northwest Territories', 'NWT')
220
  , ('Canada', 'Québec', 'Quebéc')
221
  , ('Canada', 'Québec', 'Pq')
222
  , ('Canada', 'Québec', 'PQ')
223
  , ('United States', 'Hawaii', 'Hawai\`i')
224
  , ('United States', 'North Carolina', 'N. Carolina')
225
  , ('United States', 'North Dakota', 'N. Dakota')
226
  , ('United States', 'South Dakota', 'S. Dakota')
227
  , ('Kenya', 'Coast Province', 'Coast')
228
  , ('Mexico', 'Estado de Baja California', 'Baja California Norte')
229
  , ('Mexico', 'Estado de Baja California', 'Baja California (Norte)')
230
  , ('Mexico', 'Estado de Baja California', 'Baja California, Norte de')
231
  , ('Mexico', 'Estado de Baja California', 'Baja California (state)')
232
  , ('Mexico', 'Estado de Baja California Sur', 'Baja California, Sur de')
233
  , ('Mexico', 'Estado de Baja California Sur', 'Baja California Sur (state)')
234
  , ('Mexico', 'Estado de Baja California Sur', 'Baja Califronia')
235
  , ('Mexico', 'Estado de Baja California Sur', 'Baja Cal. Sur')
236
  , ('Ecuador', 'Provincia de Sucumbíos', 'Sucumbíos')
237
;
238
" geoscrub
239
240
# additions for mapping to gadm2 level 1 units
241
#  - Los Lagos -> Los Rios is imperfect but needed for gadm
242
#  - Orellana -> Napo is imperfect (wrong??) but needed for gadm
243
#  - Mapping both Concelho de Macau and Concelho das Ilhas to Ilhas
244
#  - Nassa seems to be a gadm misspelling? (Niassa)
245
# not addressed:
246
#  - Bahamas: unmapped gadm level1 entities may be obsolete?
247
#  - French Guiana has arrondissements as adm0 in gadm, low level?
248
#  - French Polynesia has no adm1 in gadm
249
#  - Madagascar: Sava is (was?) inside gadm's Antsiranana province
250
#  - Philippines: GN has 17 regions at level1, gadm has 82 provinces
251
#  - Sri Lanka: GN ADM2 matches gadm level1; bien data side with GN
252
#  - Venezuela: Dependencias Federales don't seem to be in GADM
253
#  - Vietnam: GN ADM1 matches gadm level2; bien data side with GN?
254
psql -c "
255
INSERT INTO alt_stateprovince
256
    (country, stateprovince, alternatename)
257
    VALUES
258
    ('Cambodia', 'Khétt Siĕm Réab', 'Siemréab')
259
  , ('Chile', 'Región del Biobío', 'Bío-Bío')
260
  , ('Chile', 'Región del Libertador General Bernardo O’Higgins', 'Libertador General Bernardo O''Higgins')
261
  , ('Chile', 'Región de Los Ríos', 'Los Lagos')
262
  , ('Chile', 'Providencia y Santa Catalina, Departamento de Archipiélago de San Andrés', 'San Andrés y Providencia')
263
  , ('Dominican Republic', 'Provincia de Baoruco', 'Bahoruco')
264
  , ('Ecuador', 'Provincia de Morona Santiago', 'Morona Santiago')
265
  , ('Ecuador', 'Provincia de Napo', 'Orellana')
266
  , ('Ecuador', 'Provincia de Zamora Chinchipe', 'Zamora Chinchipe')
267
  , ('Gabon', 'Province du Woleu-Ntem', 'Wouleu-Ntem')
268
  , ('Ghana', 'Brong-Ahafo Region', 'Brong Ahafo')
269
  , ('Guatemala', 'Departamento de Quetzaltenango', 'Quezaltenango')
270
  , ('Haiti', 'Département de l''Artibonite', 'L''Artibonite')
271
  , ('Liberia', 'Grand Bassa County', 'GrandBassa')
272
  , ('Liberia', 'Grand Gedeh County', 'GrandGedeh')
273
  , ('Macao', 'Concelho de Macau', 'Ilhas')
274
  , ('Mozambique', 'Niassa Province', 'Nassa')
275
  , ('Norway', 'Østfold fylke', 'Ãstfold')
276
  , ('Peru', 'Región de Huánuco', 'Huánuco')
277
  , ('Puerto Rico', 'Guánica Municipio', 'Guánica')
278
  , ('Puerto Rico', 'Loíza Municipio', 'Loíza')
279
  , ('Puerto Rico', 'Manatí Municipio', 'Manatí')
280
  , ('Puerto Rico', 'Peñuelas Municipio', 'Peñuelas')
281
  , ('Puerto Rico', 'Río Grande Municipio', 'Río Grande')
282
  , ('Puerto Rico', 'San Germán Municipio', 'San Germán')
283
  , ('Puerto Rico', 'San Sebastián Municipio', 'San Sebastián')
284
  , ('Russia', 'Karachayevo-Cherkesskaya Respublika', 'Karachay-Cherkess')
285
  , ('Syria', 'Muḩāfaz̧at al Lādhiqīyah', 'Lattakia')
286
  , ('Togo', 'Région Centrale', 'Centre')
287
;" geoscrub