1 |
10707
|
aaronmk
|
# Script to geoscrub-enable a postgres database, developed in the
|
2 |
|
|
# context of BIEN3 geoscrubbing but with intentions of generality.
|
3 |
|
|
#
|
4 |
|
|
# Basic workflow:
|
5 |
|
|
# 1. Import table dumps obtained from geonames.org
|
6 |
|
|
# 2. Add additional custom mapping info, including:
|
7 |
|
|
# a. Madagascar provinces used in GADM2 but not geonames
|
8 |
|
|
# b. extra country name mappings/fixes specific to BIEN3 data
|
9 |
|
|
# c. extra state/prov name mappings/fixes specific to BIEN3 data
|
10 |
|
|
# d. custom state/prov name mappings binding GADM2 to geonames
|
11 |
|
|
#
|
12 |
|
|
# Notes:
|
13 |
|
|
# * One of the INSERT statements below contains the string 'Hawai\`i'.
|
14 |
|
|
# The backslash is there to escape the backtick for bash, so that
|
15 |
|
|
# what gets inserted into the database is 'Hawai`i'. If manually
|
16 |
|
|
# copy-and-pasting the statement into psql rather than running as a
|
17 |
|
|
# script (i.e., not having bash interpret the command), the backslash
|
18 |
|
|
# must not be included.
|
19 |
|
|
#
|
20 |
|
|
# Jim Regetz
|
21 |
|
|
# NCEAS
|
22 |
|
|
# Created Nov 2012
|
23 |
|
|
|
24 |
|
|
# !! this is specific to regetz's testing environment !!
|
25 |
|
|
DATADIR="$HOME/biengeo/geonames"
|
26 |
|
|
cd ${DATADIR}
|
27 |
|
|
|
28 |
|
|
#
|
29 |
|
|
# load geonames.org geonames
|
30 |
|
|
#
|
31 |
|
|
|
32 |
|
|
wget http://download.geonames.org/export/dump/allCountries.zip
|
33 |
|
|
unzip allCountries.zip
|
34 |
|
|
|
35 |
|
|
psql -c \
|
36 |
|
|
'CREATE TABLE geonames (
|
37 |
|
|
geonameid serial primary key,
|
38 |
|
|
name text,
|
39 |
|
|
asciiname text,
|
40 |
|
|
alternatenames text,
|
41 |
|
|
latitude numeric,
|
42 |
|
|
longitude numeric,
|
43 |
|
|
featureclass char(1),
|
44 |
|
|
featurecode text,
|
45 |
|
|
countrycode char(2),
|
46 |
|
|
cc2 char(60),
|
47 |
|
|
admin1code text,
|
48 |
|
|
admin2code text,
|
49 |
|
|
admin3code text,
|
50 |
|
|
admin4code text,
|
51 |
|
|
population bigint,
|
52 |
|
|
elevation int,
|
53 |
|
|
dem int,
|
54 |
|
|
timezone text,
|
55 |
|
|
modification date
|
56 |
|
|
)' geoscrub
|
57 |
|
|
|
58 |
|
|
# import data
|
59 |
|
|
psql -c "COPY geonames FROM '${DATADIR}/allCountries.txt' DELIMITER E'\t' NULL ''" geoscrub
|
60 |
|
|
psql -c \ "SELECT setval('geonames_geonameid_seq', (SELECT max(geonameid) FROM geonames))" geoscrub
|
61 |
|
|
psql -c \ "CREATE INDEX geonames_countrycode_idx ON geonames (countrycode)" geoscrub
|
62 |
|
|
|
63 |
|
|
#
|
64 |
|
|
# load geonames.org alternate names
|
65 |
|
|
#
|
66 |
|
|
|
67 |
|
|
wget http://download.geonames.org/export/dump/alternateNames.zip
|
68 |
|
|
unzip alternateNames.zip
|
69 |
|
|
|
70 |
|
|
psql -c \
|
71 |
|
|
'CREATE TABLE alternateNames (
|
72 |
|
|
alternateNameId serial primary key,
|
73 |
|
|
geonameid int references geonames (geonameid),
|
74 |
|
|
isolanguage varchar(7),
|
75 |
|
|
alternateName varchar(200),
|
76 |
|
|
isPreferredName char(1),
|
77 |
|
|
isShortName char(1),
|
78 |
|
|
isColloquial char(1),
|
79 |
|
|
isHistoric char(1)
|
80 |
|
|
)' geoscrub
|
81 |
|
|
|
82 |
|
|
# import data
|
83 |
|
|
psql -c "COPY alternateNames FROM '${DATADIR}/alternateNames.txt' DELIMITER E'\t' NULL ''" geoscrub
|
84 |
|
|
psql -c \ "SELECT setval('alternatenames_alternatenameid_seq', (SELECT max(alternatenameid) FROM alternatenames))" geoscrub
|
85 |
|
|
# 4m45.368s
|
86 |
|
|
|
87 |
|
|
# index alternatenames
|
88 |
|
|
psql -c 'CREATE INDEX "alternatenames_alternatename_idx" ON alternateNames (alternateName)' geoscrub
|
89 |
|
|
# 10m42.827s
|
90 |
|
|
|
91 |
|
|
#
|
92 |
|
|
# load geonames.org country info
|
93 |
|
|
#
|
94 |
|
|
|
95 |
|
|
psql -c \
|
96 |
|
|
'CREATE TABLE countries (
|
97 |
|
|
iso char(2),
|
98 |
|
|
iso3 char(3),
|
99 |
|
|
isonumeric text,
|
100 |
|
|
fips text,
|
101 |
|
|
country text UNIQUE,
|
102 |
|
|
capital text,
|
103 |
|
|
area text,
|
104 |
|
|
population text,
|
105 |
|
|
continent text,
|
106 |
|
|
tld text,
|
107 |
|
|
currencyCode text,
|
108 |
|
|
currencyName text,
|
109 |
|
|
phone text,
|
110 |
|
|
postalCodeFormat text,
|
111 |
|
|
postalCodeRegex text,
|
112 |
|
|
languages text,
|
113 |
|
|
geonameid int,
|
114 |
|
|
neighbours text,
|
115 |
|
|
equivalentFipsCode text
|
116 |
|
|
)' geoscrub
|
117 |
|
|
|
118 |
|
|
# import geonames country table
|
119 |
|
|
grep -ve '^#' $DATADIR/countryInfo.txt | \
|
120 |
|
|
psql -c "COPY countries FROM STDIN DELIMITER E'\t' NULL ''" geoscrub
|
121 |
|
|
|
122 |
|
|
#
|
123 |
|
|
# load geonames.org hierarchy info
|
124 |
|
|
#
|
125 |
|
|
|
126 |
|
|
wget http://download.geonames.org/export/dump/hierarchy.zip
|
127 |
|
|
unzip hierarchy.zip
|
128 |
|
|
|
129 |
|
|
psql -c \
|
130 |
|
|
'CREATE TABLE hierarchy (
|
131 |
|
|
parentId int,-- references geonames (geonameid),
|
132 |
|
|
childId int,-- references geonames (geonameid),
|
133 |
|
|
type text
|
134 |
|
|
)' geoscrub
|
135 |
|
|
|
136 |
|
|
# import geonames country table
|
137 |
|
|
psql -c "COPY hierarchy FROM '${DATADIR}/hierarchy.txt' DELIMITER E'\t' NULL ''" geoscrub
|
138 |
|
|
|
139 |
|
|
#
|
140 |
|
|
# insert additional custom name-scrub mappings
|
141 |
|
|
#
|
142 |
|
|
|
143 |
|
|
# augment geonames with some additional places we need, and update
|
144 |
|
|
# hierarchy table accordingly
|
145 |
|
|
psql -c \
|
146 |
|
|
"WITH newnames AS (
|
147 |
|
|
INSERT INTO geonames
|
148 |
|
|
(name, alternatenames, featurecode, countrycode )
|
149 |
|
|
VALUES
|
150 |
|
|
('Antananarivo', NULL, 'ADM1', 'MG'),
|
151 |
|
|
('Antsiranana', NULL, 'ADM1', 'MG'),
|
152 |
|
|
('Fianarantsoa', NULL, 'ADM1', 'MG'),
|
153 |
|
|
('Mahajanga', NULL, 'ADM1', 'MG'),
|
154 |
|
|
('Toamasina', NULL, 'ADM1', 'MG'),
|
155 |
|
|
('Toliara', 'Toliary', 'ADM1', 'MG')
|
156 |
|
|
RETURNING geonameid, countrycode
|
157 |
|
|
)
|
158 |
|
|
INSERT INTO hierarchy (parentid, childid)
|
159 |
|
|
SELECT c.geonameid AS parentid,
|
160 |
|
|
n.geonameid AS childid
|
161 |
|
|
FROM newnames n,
|
162 |
|
|
countries c
|
163 |
|
|
WHERE n.countrycode = c.iso;
|
164 |
|
|
" geoscrub
|
165 |
|
|
|
166 |
|
|
# add some custom country alternate names that we need
|
167 |
|
|
psql -c "
|
168 |
|
|
CREATE TABLE alt_country (
|
169 |
|
|
country text references countries (country),
|
170 |
|
|
alternatename text
|
171 |
|
|
);
|
172 |
|
|
INSERT INTO alt_country
|
173 |
|
|
(country, alternatename)
|
174 |
|
|
VALUES
|
175 |
|
|
('Central African Republic', 'Central African Repu')
|
176 |
|
|
, ('British Virgin Islands', 'Virgin Islands (U.K.)')
|
177 |
|
|
, ('Democratic Republic of the Congo', 'Congo, The Democratic Republic of the')
|
178 |
|
|
, ('Falkland Islands', 'Falkland Isl')
|
179 |
|
|
, ('Falkland Islands', 'Falkland Islands (Malvinas)')
|
180 |
|
|
, ('Faroe Islands', 'Faeroe Isl.')
|
181 |
|
|
, ('French Guiana', 'Guyana Francesa')
|
182 |
|
|
, ('Iran', 'Iran, Islamic Republic of')
|
183 |
|
|
, ('Mexico', E'Mexico\rMexico')
|
184 |
|
|
, ('Nicaragua', 'Ncaragua')
|
185 |
|
|
, ('Norfolk Island', 'Norfolk Isl')
|
186 |
|
|
, ('North Korea', 'Korea, Democratic People''s Republic of')
|
187 |
|
|
, ('Solomon Islands', 'Solomon Isl')
|
188 |
|
|
, ('South Georgia and the South Sandwich Islands', 'South Georgia and the South Sand')
|
189 |
|
|
, ('South Korea', 'Korea (Republic of)')
|
190 |
|
|
, ('South Korea', 'Korea, Republic of')
|
191 |
|
|
, ('U.S. Virgin Islands', 'US Virgin Islands')
|
192 |
|
|
, ('U.S. Virgin Islands', 'Virgin Islands (U.S.)')
|
193 |
|
|
, ('United States', 'ESTADOS UNIDOS DE AMERICA')
|
194 |
|
|
, ('United States', 'EUA')
|
195 |
|
|
, ('Vietnam', 'VIETNAM [Socialist Republic of V')
|
196 |
|
|
;" geoscrub
|
197 |
|
|
|
198 |
|
|
# add some custom stateprovince alternate names that we need
|
199 |
|
|
psql -c "
|
200 |
|
|
CREATE TABLE alt_stateprovince (
|
201 |
|
|
country text references countries (country),
|
202 |
|
|
stateprovince text,
|
203 |
|
|
alternatename text
|
204 |
|
|
);
|
205 |
|
|
INSERT INTO alt_stateprovince
|
206 |
|
|
(country, stateprovince, alternatename)
|
207 |
|
|
VALUES
|
208 |
|
|
('Canada', 'British Columbia', 'B.C.')
|
209 |
|
|
, ('Canada', 'Newfoundland and Labrador', 'Newfoundland & Labrador')
|
210 |
|
|
, ('Canada', 'Newfoundland and Labrador', 'New Foundland And Labrador')
|
211 |
|
|
, ('Canada', 'Newfoundland and Labrador', 'Labrador & Newfoundland')
|
212 |
|
|
, ('Canada', 'Northwest Territories', 'North West Territories')
|
213 |
|
|
, ('Canada', 'Northwest Territories', 'Northwest Territorie')
|
214 |
|
|
, ('Canada', 'Northwest Territories', 'NWT')
|
215 |
|
|
, ('Canada', 'Québec', 'Quebéc')
|
216 |
|
|
, ('Canada', 'Québec', 'Pq')
|
217 |
|
|
, ('Canada', 'Québec', 'PQ')
|
218 |
|
|
, ('United States', 'Hawaii', 'Hawai\`i')
|
219 |
|
|
, ('United States', 'North Carolina', 'N. Carolina')
|
220 |
|
|
, ('United States', 'North Dakota', 'N. Dakota')
|
221 |
|
|
, ('United States', 'South Dakota', 'S. Dakota')
|
222 |
|
|
, ('Kenya', 'Coast Province', 'Coast')
|
223 |
|
|
, ('Mexico', 'Estado de Baja California', 'Baja California Norte')
|
224 |
|
|
, ('Mexico', 'Estado de Baja California', 'Baja California (Norte)')
|
225 |
|
|
, ('Mexico', 'Estado de Baja California', 'Baja California, Norte de')
|
226 |
|
|
, ('Mexico', 'Estado de Baja California', 'Baja California (state)')
|
227 |
|
|
, ('Mexico', 'Estado de Baja California Sur', 'Baja California, Sur de')
|
228 |
|
|
, ('Mexico', 'Estado de Baja California Sur', 'Baja California Sur (state)')
|
229 |
|
|
, ('Mexico', 'Estado de Baja California Sur', 'Baja Califronia')
|
230 |
|
|
, ('Mexico', 'Estado de Baja California Sur', 'Baja Cal. Sur')
|
231 |
|
|
, ('Ecuador', 'Provincia de Sucumbíos', 'Sucumbíos')
|
232 |
|
|
;
|
233 |
|
|
" geoscrub
|
234 |
|
|
|
235 |
|
|
# additions for mapping to gadm2 level 1 units
|
236 |
|
|
# - Los Lagos -> Los Rios is imperfect but needed for gadm
|
237 |
|
|
# - Orellana -> Napo is imperfect (wrong??) but needed for gadm
|
238 |
|
|
# - Mapping both Concelho de Macau and Concelho das Ilhas to Ilhas
|
239 |
|
|
# - Nassa seems to be a gadm misspelling? (Niassa)
|
240 |
|
|
# not addressed:
|
241 |
|
|
# - Bahamas: unmapped gadm level1 entities may be obsolete?
|
242 |
|
|
# - French Guiana has arrondissements as adm0 in gadm, low level?
|
243 |
|
|
# - French Polynesia has no adm1 in gadm
|
244 |
|
|
# - Madagascar: Sava is (was?) inside gadm's Antsiranana province
|
245 |
|
|
# - Philippines: GN has 17 regions at level1, gadm has 82 provinces
|
246 |
|
|
# - Sri Lanka: GN ADM2 matches gadm level1; bien data side with GN
|
247 |
|
|
# - Venezuela: Dependencias Federales don't seem to be in GADM
|
248 |
|
|
# - Vietnam: GN ADM1 matches gadm level2; bien data side with GN?
|
249 |
|
|
psql -c "
|
250 |
|
|
INSERT INTO alt_stateprovince
|
251 |
|
|
(country, stateprovince, alternatename)
|
252 |
|
|
VALUES
|
253 |
|
|
('Cambodia', 'Khétt Siĕm Réab', 'Siemréab')
|
254 |
|
|
, ('Chile', 'Región del Biobío', 'Bío-Bío')
|
255 |
|
|
, ('Chile', 'Región del Libertador General Bernardo O’Higgins', 'Libertador General Bernardo O''Higgins')
|
256 |
|
|
, ('Chile', 'Región de Los Ríos', 'Los Lagos')
|
257 |
|
|
, ('Chile', 'Providencia y Santa Catalina, Departamento de Archipiélago de San Andrés', 'San Andrés y Providencia')
|
258 |
|
|
, ('Dominican Republic', 'Provincia de Baoruco', 'Bahoruco')
|
259 |
|
|
, ('Ecuador', 'Provincia de Morona Santiago', 'Morona Santiago')
|
260 |
|
|
, ('Ecuador', 'Provincia de Napo', 'Orellana')
|
261 |
|
|
, ('Ecuador', 'Provincia de Zamora Chinchipe', 'Zamora Chinchipe')
|
262 |
|
|
, ('Gabon', 'Province du Woleu-Ntem', 'Wouleu-Ntem')
|
263 |
|
|
, ('Ghana', 'Brong-Ahafo Region', 'Brong Ahafo')
|
264 |
|
|
, ('Guatemala', 'Departamento de Quetzaltenango', 'Quezaltenango')
|
265 |
|
|
, ('Haiti', 'Département de l''Artibonite', 'L''Artibonite')
|
266 |
|
|
, ('Liberia', 'Grand Bassa County', 'GrandBassa')
|
267 |
|
|
, ('Liberia', 'Grand Gedeh County', 'GrandGedeh')
|
268 |
|
|
, ('Macao', 'Concelho de Macau', 'Ilhas')
|
269 |
|
|
, ('Mozambique', 'Niassa Province', 'Nassa')
|
270 |
|
|
, ('Norway', 'Østfold fylke', 'Ãstfold')
|
271 |
|
|
, ('Peru', 'Región de Huánuco', 'Huánuco')
|
272 |
|
|
, ('Puerto Rico', 'Guánica Municipio', 'Guánica')
|
273 |
|
|
, ('Puerto Rico', 'Loíza Municipio', 'Loíza')
|
274 |
|
|
, ('Puerto Rico', 'Manatí Municipio', 'Manatí')
|
275 |
|
|
, ('Puerto Rico', 'Peñuelas Municipio', 'Peñuelas')
|
276 |
|
|
, ('Puerto Rico', 'Río Grande Municipio', 'Río Grande')
|
277 |
|
|
, ('Puerto Rico', 'San Germán Municipio', 'San Germán')
|
278 |
|
|
, ('Puerto Rico', 'San Sebastián Municipio', 'San Sebastián')
|
279 |
|
|
, ('Russia', 'Karachayevo-Cherkesskaya Respublika', 'Karachay-Cherkess')
|
280 |
|
|
, ('Syria', 'Muḩāfaz̧at al Lādhiqīyah', 'Lattakia')
|
281 |
|
|
, ('Togo', 'Région Centrale', 'Centre')
|
282 |
|
|
;" geoscrub
|