Project

General

Profile

1
# Script to geoscrub-enable a postgres database, developed in the
2
# context of BIEN3 geoscrubbing but with intentions of generality.
3
#
4
# Basic workflow:
5
#  1. Import table dumps obtained from geonames.org
6
#  2. Add additional custom mapping info, including:
7
#     a. Madagascar provinces used in GADM2 but not geonames
8
#     b. extra country name mappings/fixes specific to BIEN3 data
9
#     c. extra state/prov name mappings/fixes specific to BIEN3 data
10
#     d. custom state/prov name mappings binding GADM2 to geonames
11
#
12
# Notes:
13
#  * One of the INSERT statements below contains the string 'Hawai\`i'.
14
#    The backslash is there to escape the backtick for bash, so that
15
#    what gets inserted into the database is 'Hawai`i'. If manually
16
#    copy-and-pasting the statement into psql rather than running as a
17
#    script (i.e., not having bash interpret the command), the backslash
18
#    must not be included.
19
#
20
# Jim Regetz
21
# NCEAS
22
# Created Nov 2012
23

    
24
# !! this is specific to regetz's testing environment !!
25
DATADIR="$HOME/biengeo/geonames"
26
cd ${DATADIR}
27

    
28
#
29
# load geonames.org geonames
30
#
31

    
32
wget http://download.geonames.org/export/dump/allCountries.zip
33
unzip allCountries.zip
34

    
35
psql -c \
36
   'CREATE TABLE geonames (
37
        geonameid serial primary key,
38
        name text,
39
        asciiname text,
40
        alternatenames text,
41
        latitude numeric,
42
        longitude numeric,
43
        featureclass char(1),
44
        featurecode text,
45
        countrycode char(2),
46
        cc2 char(60),
47
        admin1code text,
48
        admin2code text,
49
        admin3code text,
50
        admin4code text,
51
        population bigint,
52
        elevation int,
53
        dem int,
54
        timezone text,
55
        modification date
56
    )' geoscrub
57

    
58
# import data
59
psql -c "COPY geonames FROM '${DATADIR}/allCountries.txt' DELIMITER E'\t' NULL ''" geoscrub
60
psql -c \ "SELECT setval('geonames_geonameid_seq', (SELECT max(geonameid) FROM geonames))" geoscrub
61
psql -c \ "CREATE INDEX geonames_countrycode_idx ON geonames (countrycode)" geoscrub
62

    
63
#
64
# load geonames.org alternate names
65
#
66

    
67
wget http://download.geonames.org/export/dump/alternateNames.zip
68
unzip alternateNames.zip
69

    
70
psql -c \
71
   'CREATE TABLE alternateNames (
72
        alternateNameId serial primary key,
73
        geonameid int references geonames (geonameid),
74
        isolanguage varchar(7),
75
        alternateName varchar(200),
76
        isPreferredName char(1),
77
        isShortName char(1),
78
        isColloquial char(1),
79
        isHistoric char(1)
80
    )' geoscrub
81

    
82
# import data
83
psql -c "COPY alternateNames FROM '${DATADIR}/alternateNames.txt' DELIMITER E'\t' NULL ''" geoscrub
84
psql -c \ "SELECT setval('alternatenames_alternatenameid_seq', (SELECT max(alternatenameid) FROM alternatenames))" geoscrub
85
# 4m45.368s
86

    
87
# index alternatenames
88
psql -c 'CREATE INDEX "alternatenames_alternatename_idx" ON alternateNames (alternateName)' geoscrub
89
# 10m42.827s
90

    
91
#
92
# load geonames.org country info
93
#
94

    
95
psql -c \
96
   'CREATE TABLE countries (
97
        iso char(2),
98
        iso3 char(3),
99
        isonumeric text,
100
        fips text,
101
        country text UNIQUE,
102
        capital text,
103
        area text,
104
        population text,
105
        continent text,
106
        tld text,
107
        currencyCode text,
108
        currencyName text,
109
        phone text,
110
        postalCodeFormat text,
111
        postalCodeRegex text,
112
        languages text,
113
        geonameid int,
114
        neighbours text,
115
        equivalentFipsCode text
116
    )' geoscrub
117

    
118
# import geonames country table
119
grep -ve '^#' $DATADIR/countryInfo.txt | \
120
  psql -c "COPY countries FROM STDIN DELIMITER E'\t' NULL ''" geoscrub
121

    
122
#
123
# load geonames.org hierarchy info
124
#
125

    
126
wget http://download.geonames.org/export/dump/hierarchy.zip
127
unzip hierarchy.zip
128

    
129
psql -c \
130
   'CREATE TABLE hierarchy (
131
        parentId int,-- references geonames (geonameid),
132
        childId int,-- references geonames (geonameid),
133
        type text
134
    )' geoscrub
135

    
136
# import geonames country table
137
psql -c "COPY hierarchy FROM '${DATADIR}/hierarchy.txt' DELIMITER E'\t' NULL ''" geoscrub
138

    
139
#
140
# insert additional custom name-scrub mappings
141
#
142

    
143
# augment geonames with some additional places we need, and update
144
# hierarchy table accordingly
145
psql -c \
146
"WITH newnames AS (
147
  INSERT INTO geonames
148
    (name, alternatenames, featurecode, countrycode )
149
    VALUES
150
    ('Antananarivo', NULL, 'ADM1', 'MG'),
151
    ('Antsiranana', NULL, 'ADM1', 'MG'),
152
    ('Fianarantsoa', NULL, 'ADM1', 'MG'),
153
    ('Mahajanga', NULL, 'ADM1', 'MG'),
154
    ('Toamasina', NULL, 'ADM1', 'MG'),
155
    ('Toliara', 'Toliary', 'ADM1', 'MG')
156
  RETURNING geonameid, countrycode
157
)
158
INSERT INTO hierarchy (parentid, childid)
159
  SELECT c.geonameid AS parentid,
160
         n.geonameid AS childid
161
    FROM newnames n,
162
         countries c
163
    WHERE n.countrycode = c.iso;
164
" geoscrub
165

    
166
# add some custom country alternate names that we need
167
psql -c "
168
CREATE TABLE alt_country (
169
    country text references countries (country),
170
    alternatename text
171
);
172
INSERT INTO alt_country
173
    (country, alternatename)
174
    VALUES
175
    ('Central African Republic', 'Central African Repu')
176
  , ('British Virgin Islands', 'Virgin Islands (U.K.)')
177
  , ('Democratic Republic of the Congo', 'Congo, The Democratic Republic of the')
178
  , ('Falkland Islands', 'Falkland Isl')
179
  , ('Falkland Islands', 'Falkland Islands (Malvinas)')
180
  , ('Faroe Islands', 'Faeroe Isl.')
181
  , ('French Guiana', 'Guyana Francesa')
182
  , ('Iran', 'Iran, Islamic Republic of')
183
  , ('Mexico', E'Mexico\rMexico')
184
  , ('Nicaragua', 'Ncaragua')
185
  , ('Norfolk Island', 'Norfolk Isl')
186
  , ('North Korea', 'Korea, Democratic People''s Republic of')
187
  , ('Solomon Islands', 'Solomon Isl')
188
  , ('South Georgia and the South Sandwich Islands', 'South Georgia and the South Sand')
189
  , ('South Korea', 'Korea (Republic of)')
190
  , ('South Korea', 'Korea, Republic of')
191
  , ('U.S. Virgin Islands', 'US Virgin Islands')
192
  , ('U.S. Virgin Islands', 'Virgin Islands (U.S.)')
193
  , ('United States', 'ESTADOS UNIDOS DE AMERICA')
194
  , ('United States', 'EUA')
195
  , ('Vietnam', 'VIETNAM [Socialist Republic of V')
196
;" geoscrub
197

    
198
# add some custom stateprovince alternate names that we need
199
psql -c "
200
CREATE TABLE alt_stateprovince (
201
    country text references countries (country),
202
    stateprovince text,
203
    alternatename text
204
);
205
INSERT INTO alt_stateprovince
206
    (country, stateprovince, alternatename)
207
    VALUES
208
    ('Canada', 'British Columbia', 'B.C.')
209
  , ('Canada', 'Newfoundland and Labrador', 'Newfoundland & Labrador')
210
  , ('Canada', 'Newfoundland and Labrador', 'New Foundland And Labrador')
211
  , ('Canada', 'Newfoundland and Labrador', 'Labrador & Newfoundland')
212
  , ('Canada', 'Northwest Territories', 'North West Territories')
213
  , ('Canada', 'Northwest Territories', 'Northwest Territorie')
214
  , ('Canada', 'Northwest Territories', 'NWT')
215
  , ('Canada', 'Québec', 'Quebéc')
216
  , ('Canada', 'Québec', 'Pq')
217
  , ('Canada', 'Québec', 'PQ')
218
  , ('United States', 'Hawaii', 'Hawai\`i')
219
  , ('United States', 'North Carolina', 'N. Carolina')
220
  , ('United States', 'North Dakota', 'N. Dakota')
221
  , ('United States', 'South Dakota', 'S. Dakota')
222
  , ('Kenya', 'Coast Province', 'Coast')
223
  , ('Mexico', 'Estado de Baja California', 'Baja California Norte')
224
  , ('Mexico', 'Estado de Baja California', 'Baja California (Norte)')
225
  , ('Mexico', 'Estado de Baja California', 'Baja California, Norte de')
226
  , ('Mexico', 'Estado de Baja California', 'Baja California (state)')
227
  , ('Mexico', 'Estado de Baja California Sur', 'Baja California, Sur de')
228
  , ('Mexico', 'Estado de Baja California Sur', 'Baja California Sur (state)')
229
  , ('Mexico', 'Estado de Baja California Sur', 'Baja Califronia')
230
  , ('Mexico', 'Estado de Baja California Sur', 'Baja Cal. Sur')
231
  , ('Ecuador', 'Provincia de Sucumbíos', 'Sucumbíos')
232
;
233
" geoscrub
234

    
235
# additions for mapping to gadm2 level 1 units
236
#  - Los Lagos -> Los Rios is imperfect but needed for gadm
237
#  - Orellana -> Napo is imperfect (wrong??) but needed for gadm
238
#  - Mapping both Concelho de Macau and Concelho das Ilhas to Ilhas
239
#  - Nassa seems to be a gadm misspelling? (Niassa)
240
# not addressed:
241
#  - Bahamas: unmapped gadm level1 entities may be obsolete?
242
#  - French Guiana has arrondissements as adm0 in gadm, low level?
243
#  - French Polynesia has no adm1 in gadm
244
#  - Madagascar: Sava is (was?) inside gadm's Antsiranana province
245
#  - Philippines: GN has 17 regions at level1, gadm has 82 provinces
246
#  - Sri Lanka: GN ADM2 matches gadm level1; bien data side with GN
247
#  - Venezuela: Dependencias Federales don't seem to be in GADM
248
#  - Vietnam: GN ADM1 matches gadm level2; bien data side with GN?
249
psql -c "
250
INSERT INTO alt_stateprovince
251
    (country, stateprovince, alternatename)
252
    VALUES
253
    ('Cambodia', 'Khétt Siĕm Réab', 'Siemréab')
254
  , ('Chile', 'Región del Biobío', 'Bío-Bío')
255
  , ('Chile', 'Región del Libertador General Bernardo O’Higgins', 'Libertador General Bernardo O''Higgins')
256
  , ('Chile', 'Región de Los Ríos', 'Los Lagos')
257
  , ('Chile', 'Providencia y Santa Catalina, Departamento de Archipiélago de San Andrés', 'San Andrés y Providencia')
258
  , ('Dominican Republic', 'Provincia de Baoruco', 'Bahoruco')
259
  , ('Ecuador', 'Provincia de Morona Santiago', 'Morona Santiago')
260
  , ('Ecuador', 'Provincia de Napo', 'Orellana')
261
  , ('Ecuador', 'Provincia de Zamora Chinchipe', 'Zamora Chinchipe')
262
  , ('Gabon', 'Province du Woleu-Ntem', 'Wouleu-Ntem')
263
  , ('Ghana', 'Brong-Ahafo Region', 'Brong Ahafo')
264
  , ('Guatemala', 'Departamento de Quetzaltenango', 'Quezaltenango')
265
  , ('Haiti', 'Département de l''Artibonite', 'L''Artibonite')
266
  , ('Liberia', 'Grand Bassa County', 'GrandBassa')
267
  , ('Liberia', 'Grand Gedeh County', 'GrandGedeh')
268
  , ('Macao', 'Concelho de Macau', 'Ilhas')
269
  , ('Mozambique', 'Niassa Province', 'Nassa')
270
  , ('Norway', 'Østfold fylke', 'Ãstfold')
271
  , ('Peru', 'Región de Huánuco', 'Huánuco')
272
  , ('Puerto Rico', 'Guánica Municipio', 'Guánica')
273
  , ('Puerto Rico', 'Loíza Municipio', 'Loíza')
274
  , ('Puerto Rico', 'Manatí Municipio', 'Manatí')
275
  , ('Puerto Rico', 'Peñuelas Municipio', 'Peñuelas')
276
  , ('Puerto Rico', 'Río Grande Municipio', 'Río Grande')
277
  , ('Puerto Rico', 'San Germán Municipio', 'San Germán')
278
  , ('Puerto Rico', 'San Sebastián Municipio', 'San Sebastián')
279
  , ('Russia', 'Karachayevo-Cherkesskaya Respublika', 'Karachay-Cherkess')
280
  , ('Syria', 'Muḩāfaz̧at al Lādhiqīyah', 'Lattakia')
281
  , ('Togo', 'Région Centrale', 'Centre')
282
;" geoscrub
(5-5/9)