Revision 11495
Added by Paul Sarando about 11 years ago
derived/biengeo/geonames.sh | ||
---|---|---|
1 |
# Script to geoscrub-enable a postgres database, developed in the |
|
2 |
# context of BIEN3 geoscrubbing but with intentions of generality. |
|
3 |
# |
|
4 |
# Basic workflow: |
|
5 |
# 1. Import table dumps obtained from geonames.org |
|
6 |
# 2. Add additional custom mapping info, including: |
|
7 |
# a. Madagascar provinces used in GADM2 but not geonames |
|
8 |
# b. extra country name mappings/fixes specific to BIEN3 data |
|
9 |
# c. extra state/prov name mappings/fixes specific to BIEN3 data |
|
10 |
# d. custom state/prov name mappings binding GADM2 to geonames |
|
11 |
# |
|
12 |
# Notes: |
|
13 |
# * One of the INSERT statements below contains the string 'Hawai\`i'. |
|
14 |
# The backslash is there to escape the backtick for bash, so that |
|
15 |
# what gets inserted into the database is 'Hawai`i'. If manually |
|
16 |
# copy-and-pasting the statement into psql rather than running as a |
|
17 |
# script (i.e., not having bash interpret the command), the backslash |
|
18 |
# must not be included. |
|
19 |
# |
|
20 |
# Jim Regetz |
|
21 |
# NCEAS |
|
22 |
# Created Nov 2012 |
|
23 |
|
|
24 |
# !! this is specific to regetz's testing environment !! |
|
25 |
DATADIR="$HOME/biengeo/geonames" |
|
26 |
cd ${DATADIR} |
|
27 |
|
|
28 |
# |
|
29 |
# load geonames.org geonames |
|
30 |
# |
|
31 |
|
|
32 |
wget http://download.geonames.org/export/dump/allCountries.zip |
|
33 |
unzip allCountries.zip |
|
34 |
|
|
35 |
psql -c \ |
|
36 |
'CREATE TABLE geonames ( |
|
37 |
geonameid serial primary key, |
|
38 |
name text, |
|
39 |
asciiname text, |
|
40 |
alternatenames text, |
|
41 |
latitude numeric, |
|
42 |
longitude numeric, |
|
43 |
featureclass char(1), |
|
44 |
featurecode text, |
|
45 |
countrycode char(2), |
|
46 |
cc2 char(60), |
|
47 |
admin1code text, |
|
48 |
admin2code text, |
|
49 |
admin3code text, |
|
50 |
admin4code text, |
|
51 |
population bigint, |
|
52 |
elevation int, |
|
53 |
dem int, |
|
54 |
timezone text, |
|
55 |
modification date |
|
56 |
)' geoscrub |
|
57 |
|
|
58 |
# import data |
|
59 |
psql -c "COPY geonames FROM '${DATADIR}/allCountries.txt' DELIMITER E'\t' NULL ''" geoscrub |
|
60 |
psql -c \ "SELECT setval('geonames_geonameid_seq', (SELECT max(geonameid) FROM geonames))" geoscrub |
|
61 |
psql -c \ "CREATE INDEX geonames_countrycode_idx ON geonames (countrycode)" geoscrub |
|
62 |
psql -c "CREATE INDEX geonames_featurecode_idx ON geonames (featurecode)" geoscrub |
|
63 |
psql -c "CREATE INDEX geonames_name_idx ON geonames (name)" geoscrub |
|
64 |
|
|
65 |
# |
|
66 |
# load geonames.org alternate names |
|
67 |
# |
|
68 |
|
|
69 |
wget http://download.geonames.org/export/dump/alternateNames.zip |
|
70 |
unzip alternateNames.zip |
|
71 |
|
|
72 |
psql -c \ |
|
73 |
'CREATE TABLE alternateNames ( |
|
74 |
alternateNameId serial primary key, |
|
75 |
geonameid int references geonames (geonameid), |
|
76 |
isolanguage varchar(7), |
|
77 |
alternateName varchar(200), |
|
78 |
isPreferredName char(1), |
|
79 |
isShortName char(1), |
|
80 |
isColloquial char(1), |
|
81 |
isHistoric char(1) |
|
82 |
)' geoscrub |
|
83 |
|
|
84 |
# import data |
|
85 |
psql -c "COPY alternateNames FROM '${DATADIR}/alternateNames.txt' DELIMITER E'\t' NULL ''" geoscrub |
|
86 |
psql -c \ "SELECT setval('alternatenames_alternatenameid_seq', (SELECT max(alternatenameid) FROM alternatenames))" geoscrub |
|
87 |
# 4m45.368s |
|
88 |
|
|
89 |
# index alternatenames |
|
90 |
psql -c 'CREATE INDEX "alternatenames_alternatename_idx" ON alternateNames (alternateName)' geoscrub |
|
91 |
# 10m42.827s |
|
92 |
psql -c 'CREATE INDEX "alternatenames_geonameid_idx" ON alternateNames (geonameid)' geoscrub |
|
93 |
|
|
94 |
# |
|
95 |
# load geonames.org country info |
|
96 |
# |
|
97 |
|
|
98 |
psql -c \ |
|
99 |
'CREATE TABLE countries ( |
|
100 |
iso char(2), |
|
101 |
iso3 char(3), |
|
102 |
isonumeric text, |
|
103 |
fips text, |
|
104 |
country text UNIQUE, |
|
105 |
capital text, |
|
106 |
area text, |
|
107 |
population text, |
|
108 |
continent text, |
|
109 |
tld text, |
|
110 |
currencyCode text, |
|
111 |
currencyName text, |
|
112 |
phone text, |
|
113 |
postalCodeFormat text, |
|
114 |
postalCodeRegex text, |
|
115 |
languages text, |
|
116 |
geonameid int, |
|
117 |
neighbours text, |
|
118 |
equivalentFipsCode text |
|
119 |
)' geoscrub |
|
120 |
|
|
121 |
# import geonames country table |
|
122 |
grep -ve '^#' $DATADIR/countryInfo.txt | \ |
|
123 |
psql -c "COPY countries FROM STDIN DELIMITER E'\t' NULL ''" geoscrub |
|
124 |
|
|
125 |
# |
|
126 |
# load geonames.org hierarchy info |
|
127 |
# |
|
128 |
|
|
129 |
wget http://download.geonames.org/export/dump/hierarchy.zip |
|
130 |
unzip hierarchy.zip |
|
131 |
|
|
132 |
psql -c \ |
|
133 |
'CREATE TABLE hierarchy ( |
|
134 |
parentId int,-- references geonames (geonameid), |
|
135 |
childId int,-- references geonames (geonameid), |
|
136 |
type text |
|
137 |
)' geoscrub |
|
138 |
|
|
139 |
# import geonames country table |
|
140 |
psql -c "COPY hierarchy FROM '${DATADIR}/hierarchy.txt' DELIMITER E'\t' NULL ''" geoscrub |
|
141 |
psql -c "CREATE INDEX hierarchy_parentid_idx ON hierarchy (parentId)" geoscrub |
|
142 |
psql -c "CREATE INDEX hierarchy_childid_idx ON hierarchy (childId)" geoscrub |
|
143 |
|
|
144 |
# |
|
145 |
# insert additional custom name-scrub mappings |
|
146 |
# |
|
147 |
|
|
148 |
# augment geonames with some additional places we need, and update |
|
149 |
# hierarchy table accordingly |
|
150 |
psql -c \ |
|
151 |
"WITH newnames AS ( |
|
152 |
INSERT INTO geonames |
|
153 |
(name, alternatenames, featurecode, countrycode ) |
|
154 |
VALUES |
|
155 |
('Antananarivo', NULL, 'ADM1', 'MG'), |
|
156 |
('Antsiranana', NULL, 'ADM1', 'MG'), |
|
157 |
('Fianarantsoa', NULL, 'ADM1', 'MG'), |
|
158 |
('Mahajanga', NULL, 'ADM1', 'MG'), |
|
159 |
('Toamasina', NULL, 'ADM1', 'MG'), |
|
160 |
('Toliara', 'Toliary', 'ADM1', 'MG') |
|
161 |
RETURNING geonameid, countrycode |
|
162 |
) |
|
163 |
INSERT INTO hierarchy (parentid, childid) |
|
164 |
SELECT c.geonameid AS parentid, |
|
165 |
n.geonameid AS childid |
|
166 |
FROM newnames n, |
|
167 |
countries c |
|
168 |
WHERE n.countrycode = c.iso; |
|
169 |
" geoscrub |
|
170 |
|
|
171 |
# add some custom country alternate names that we need |
|
172 |
psql -c " |
|
173 |
CREATE TABLE alt_country ( |
|
174 |
country text references countries (country), |
|
175 |
alternatename text |
|
176 |
); |
|
177 |
INSERT INTO alt_country |
|
178 |
(country, alternatename) |
|
179 |
VALUES |
|
180 |
('Central African Republic', 'Central African Repu') |
|
181 |
, ('British Virgin Islands', 'Virgin Islands (U.K.)') |
|
182 |
, ('Democratic Republic of the Congo', 'Congo, The Democratic Republic of the') |
|
183 |
, ('Falkland Islands', 'Falkland Isl') |
|
184 |
, ('Falkland Islands', 'Falkland Islands (Malvinas)') |
|
185 |
, ('Faroe Islands', 'Faeroe Isl.') |
|
186 |
, ('French Guiana', 'Guyana Francesa') |
|
187 |
, ('Iran', 'Iran, Islamic Republic of') |
|
188 |
, ('Mexico', E'Mexico\rMexico') |
|
189 |
, ('Nicaragua', 'Ncaragua') |
|
190 |
, ('Norfolk Island', 'Norfolk Isl') |
|
191 |
, ('North Korea', 'Korea, Democratic People''s Republic of') |
|
192 |
, ('Solomon Islands', 'Solomon Isl') |
|
193 |
, ('South Georgia and the South Sandwich Islands', 'South Georgia and the South Sand') |
|
194 |
, ('South Korea', 'Korea (Republic of)') |
|
195 |
, ('South Korea', 'Korea, Republic of') |
|
196 |
, ('U.S. Virgin Islands', 'US Virgin Islands') |
|
197 |
, ('U.S. Virgin Islands', 'Virgin Islands (U.S.)') |
|
198 |
, ('United States', 'ESTADOS UNIDOS DE AMERICA') |
|
199 |
, ('United States', 'EUA') |
|
200 |
, ('Vietnam', 'VIETNAM [Socialist Republic of V') |
|
201 |
;" geoscrub |
|
202 |
|
|
203 |
# add some custom stateprovince alternate names that we need |
|
204 |
psql -c " |
|
205 |
CREATE TABLE alt_stateprovince ( |
|
206 |
country text references countries (country), |
|
207 |
stateprovince text, |
|
208 |
alternatename text |
|
209 |
); |
|
210 |
INSERT INTO alt_stateprovince |
|
211 |
(country, stateprovince, alternatename) |
|
212 |
VALUES |
|
213 |
('Canada', 'British Columbia', 'B.C.') |
|
214 |
, ('Canada', 'Newfoundland and Labrador', 'Newfoundland & Labrador') |
|
215 |
, ('Canada', 'Newfoundland and Labrador', 'New Foundland And Labrador') |
|
216 |
, ('Canada', 'Newfoundland and Labrador', 'Labrador & Newfoundland') |
|
217 |
, ('Canada', 'Northwest Territories', 'North West Territories') |
|
218 |
, ('Canada', 'Northwest Territories', 'Northwest Territorie') |
|
219 |
, ('Canada', 'Northwest Territories', 'NWT') |
|
220 |
, ('Canada', 'Québec', 'Quebéc') |
|
221 |
, ('Canada', 'Québec', 'Pq') |
|
222 |
, ('Canada', 'Québec', 'PQ') |
|
223 |
, ('United States', 'Hawaii', 'Hawai\`i') |
|
224 |
, ('United States', 'North Carolina', 'N. Carolina') |
|
225 |
, ('United States', 'North Dakota', 'N. Dakota') |
|
226 |
, ('United States', 'South Dakota', 'S. Dakota') |
|
227 |
, ('Kenya', 'Coast Province', 'Coast') |
|
228 |
, ('Mexico', 'Estado de Baja California', 'Baja California Norte') |
|
229 |
, ('Mexico', 'Estado de Baja California', 'Baja California (Norte)') |
|
230 |
, ('Mexico', 'Estado de Baja California', 'Baja California, Norte de') |
|
231 |
, ('Mexico', 'Estado de Baja California', 'Baja California (state)') |
|
232 |
, ('Mexico', 'Estado de Baja California Sur', 'Baja California, Sur de') |
|
233 |
, ('Mexico', 'Estado de Baja California Sur', 'Baja California Sur (state)') |
|
234 |
, ('Mexico', 'Estado de Baja California Sur', 'Baja Califronia') |
|
235 |
, ('Mexico', 'Estado de Baja California Sur', 'Baja Cal. Sur') |
|
236 |
, ('Ecuador', 'Provincia de Sucumbíos', 'Sucumbíos') |
|
237 |
; |
|
238 |
" geoscrub |
|
239 |
|
|
240 |
# additions for mapping to gadm2 level 1 units |
|
241 |
# - Los Lagos -> Los Rios is imperfect but needed for gadm |
|
242 |
# - Orellana -> Napo is imperfect (wrong??) but needed for gadm |
|
243 |
# - Mapping both Concelho de Macau and Concelho das Ilhas to Ilhas |
|
244 |
# - Nassa seems to be a gadm misspelling? (Niassa) |
|
245 |
# not addressed: |
|
246 |
# - Bahamas: unmapped gadm level1 entities may be obsolete? |
|
247 |
# - French Guiana has arrondissements as adm0 in gadm, low level? |
|
248 |
# - French Polynesia has no adm1 in gadm |
|
249 |
# - Madagascar: Sava is (was?) inside gadm's Antsiranana province |
|
250 |
# - Philippines: GN has 17 regions at level1, gadm has 82 provinces |
|
251 |
# - Sri Lanka: GN ADM2 matches gadm level1; bien data side with GN |
|
252 |
# - Venezuela: Dependencias Federales don't seem to be in GADM |
|
253 |
# - Vietnam: GN ADM1 matches gadm level2; bien data side with GN? |
|
254 |
psql -c " |
|
255 |
INSERT INTO alt_stateprovince |
|
256 |
(country, stateprovince, alternatename) |
|
257 |
VALUES |
|
258 |
('Cambodia', 'Khétt Siĕm Réab', 'Siemréab') |
|
259 |
, ('Chile', 'Región del Biobío', 'Bío-Bío') |
|
260 |
, ('Chile', 'Región del Libertador General Bernardo O’Higgins', 'Libertador General Bernardo O''Higgins') |
|
261 |
, ('Chile', 'Región de Los Ríos', 'Los Lagos') |
|
262 |
, ('Chile', 'Providencia y Santa Catalina, Departamento de Archipiélago de San Andrés', 'San Andrés y Providencia') |
|
263 |
, ('Dominican Republic', 'Provincia de Baoruco', 'Bahoruco') |
|
264 |
, ('Ecuador', 'Provincia de Morona Santiago', 'Morona Santiago') |
|
265 |
, ('Ecuador', 'Provincia de Napo', 'Orellana') |
|
266 |
, ('Ecuador', 'Provincia de Zamora Chinchipe', 'Zamora Chinchipe') |
|
267 |
, ('Gabon', 'Province du Woleu-Ntem', 'Wouleu-Ntem') |
|
268 |
, ('Ghana', 'Brong-Ahafo Region', 'Brong Ahafo') |
|
269 |
, ('Guatemala', 'Departamento de Quetzaltenango', 'Quezaltenango') |
|
270 |
, ('Haiti', 'Département de l''Artibonite', 'L''Artibonite') |
|
271 |
, ('Liberia', 'Grand Bassa County', 'GrandBassa') |
|
272 |
, ('Liberia', 'Grand Gedeh County', 'GrandGedeh') |
|
273 |
, ('Macao', 'Concelho de Macau', 'Ilhas') |
|
274 |
, ('Mozambique', 'Niassa Province', 'Nassa') |
|
275 |
, ('Norway', 'Østfold fylke', 'Ãstfold') |
|
276 |
, ('Peru', 'Región de Huánuco', 'Huánuco') |
|
277 |
, ('Puerto Rico', 'Guánica Municipio', 'Guánica') |
|
278 |
, ('Puerto Rico', 'Loíza Municipio', 'Loíza') |
|
279 |
, ('Puerto Rico', 'Manatí Municipio', 'Manatí') |
|
280 |
, ('Puerto Rico', 'Peñuelas Municipio', 'Peñuelas') |
|
281 |
, ('Puerto Rico', 'Río Grande Municipio', 'Río Grande') |
|
282 |
, ('Puerto Rico', 'San Germán Municipio', 'San Germán') |
|
283 |
, ('Puerto Rico', 'San Sebastián Municipio', 'San Sebastián') |
|
284 |
, ('Russia', 'Karachayevo-Cherkesskaya Respublika', 'Karachay-Cherkess') |
|
285 |
, ('Syria', 'Muḩāfaz̧at al Lādhiqīyah', 'Lattakia') |
|
286 |
, ('Togo', 'Région Centrale', 'Centre') |
|
287 |
;" geoscrub |
derived/biengeo/truncate.geonames.sql | ||
---|---|---|
1 |
BEGIN; |
|
2 |
|
|
3 |
TRUNCATE geonames; |
|
4 |
TRUNCATE alternateNames; |
|
5 |
TRUNCATE countries; |
|
6 |
TRUNCATE hierarchy; |
|
7 |
|
|
8 |
DROP INDEX IF EXISTS geonames_countrycode_idx; |
|
9 |
DROP INDEX IF EXISTS geonames_featurecode_idx; |
|
10 |
DROP INDEX IF EXISTS geonames_name_idx; |
|
11 |
DROP INDEX IF EXISTS "alternatenames_alternatename_idx"; |
|
12 |
DROP INDEX IF EXISTS "alternatenames_geonameid_idx"; |
|
13 |
DROP INDEX IF EXISTS hierarchy_parentid_idx; |
|
14 |
DROP INDEX IF EXISTS hierarchy_childid_idx; |
|
15 |
|
|
16 |
COMMIT; |
|
17 |
|
derived/biengeo/update.alternateNames.sql | ||
---|---|---|
1 |
BEGIN; |
|
2 |
|
|
3 |
SELECT setval('alternatenames_alternatenameid_seq', (SELECT max(alternatenameid) FROM alternatenames)); |
|
4 |
-- 4m45.368s |
|
5 |
|
|
6 |
-- index alternatenames |
|
7 |
CREATE INDEX "alternatenames_alternatename_idx" ON alternateNames (alternateName); |
|
8 |
-- 10m42.827s |
|
9 |
CREATE INDEX "alternatenames_geonameid_idx" ON alternateNames (geonameid); |
|
10 |
|
|
11 |
COMMIT; |
|
12 |
|
derived/biengeo/update.alt_country.sql | ||
---|---|---|
1 |
BEGIN; |
|
2 |
|
|
3 |
-- |
|
4 |
-- insert additional custom name-scrub mappings |
|
5 |
-- |
|
6 |
|
|
7 |
-- add some custom country alternate names that we need |
|
8 |
INSERT INTO alt_country |
|
9 |
(country, alternatename) |
|
10 |
VALUES |
|
11 |
('Central African Republic', 'Central African Repu') |
|
12 |
, ('British Virgin Islands', 'Virgin Islands (U.K.)') |
|
13 |
, ('Democratic Republic of the Congo', 'Congo, The Democratic Republic of the') |
|
14 |
, ('Falkland Islands', 'Falkland Isl') |
|
15 |
, ('Falkland Islands', 'Falkland Islands (Malvinas)') |
|
16 |
, ('Faroe Islands', 'Faeroe Isl.') |
|
17 |
, ('French Guiana', 'Guyana Francesa') |
|
18 |
, ('Iran', 'Iran, Islamic Republic of') |
|
19 |
, ('Mexico', E'Mexico\rMexico') |
|
20 |
, ('Nicaragua', 'Ncaragua') |
|
21 |
, ('Norfolk Island', 'Norfolk Isl') |
|
22 |
, ('North Korea', 'Korea, Democratic People''s Republic of') |
|
23 |
, ('Solomon Islands', 'Solomon Isl') |
|
24 |
, ('South Georgia and the South Sandwich Islands', 'South Georgia and the South Sand') |
|
25 |
, ('South Korea', 'Korea (Republic of)') |
|
26 |
, ('South Korea', 'Korea, Republic of') |
|
27 |
, ('U.S. Virgin Islands', 'US Virgin Islands') |
|
28 |
, ('U.S. Virgin Islands', 'Virgin Islands (U.S.)') |
|
29 |
, ('United States', 'ESTADOS UNIDOS DE AMERICA') |
|
30 |
, ('United States', 'EUA') |
|
31 |
, ('Vietnam', 'VIETNAM [Socialist Republic of V') |
|
32 |
; |
|
33 |
|
|
34 |
COMMIT; |
|
35 |
|
derived/biengeo/update.hierarchy.sql | ||
---|---|---|
1 |
BEGIN; |
|
2 |
|
|
3 |
CREATE INDEX hierarchy_parentid_idx ON hierarchy (parentId); |
|
4 |
CREATE INDEX hierarchy_childid_idx ON hierarchy (childId); |
|
5 |
|
|
6 |
COMMIT; |
|
7 |
|
derived/biengeo/update_geonames_data.sh | ||
---|---|---|
1 |
#!/bin/bash |
|
2 |
|
|
3 |
# Script to geoscrub-enable a postgres database, developed in the |
|
4 |
# context of BIEN3 geoscrubbing but with intentions of generality. |
|
5 |
# |
|
6 |
# Basic workflow: |
|
7 |
# 1. Import table dumps obtained from geonames.org |
|
8 |
# 2. Add additional custom mapping info, including: |
|
9 |
# a. Madagascar provinces used in GADM2 but not geonames |
|
10 |
# b. extra country name mappings/fixes specific to BIEN3 data |
|
11 |
# c. extra state/prov name mappings/fixes specific to BIEN3 data |
|
12 |
# d. custom state/prov name mappings binding GADM2 to geonames |
|
13 |
# |
|
14 |
# Jim Regetz |
|
15 |
# NCEAS |
|
16 |
# Created Nov 2012 |
|
17 |
# |
|
18 |
# refactored and reorganized by |
|
19 |
# Paul Sarando |
|
20 |
# iPlant Collaborative |
|
21 |
# Updated Oct 2013 |
|
22 |
|
|
23 |
DB_NAME="geoscrub" |
|
24 |
DB_USER="bien" |
|
25 |
DB_HOST="localhost" |
|
26 |
SCRIPT_DIR="$(dirname $0)" |
|
27 |
GEONAMES_DUMP_URL="http://download.geonames.org/export/dump" |
|
28 |
ALL_COUNTRIES_ZIP=allCountries.zip |
|
29 |
ALL_COUNTRIES_TXT=allCountries.txt |
|
30 |
ALTERNATE_NAMES_ZIP=alternateNames.zip |
|
31 |
ALTERNATE_NAMES_TXT=alternateNames.txt |
|
32 |
COUNTRY_INFO_TXT=countryInfo.txt |
|
33 |
HIERARCHY_ZIP=hierarchy.zip |
|
34 |
HIERARCHY_TXT=hierarchy.txt |
|
35 |
|
|
36 |
function run_sql_cmd { |
|
37 |
local SQL_CMD=$1 |
|
38 |
|
|
39 |
psql -e -U "$DB_USER" -h "$DB_HOST" -d "$DB_NAME" -c "$SQL_CMD" |
|
40 |
if [[ $? != 0 ]]; then |
|
41 |
echo "Error while executing SQL command." |
|
42 |
exit 1 |
|
43 |
fi |
|
44 |
} |
|
45 |
|
|
46 |
function run_sql_script { |
|
47 |
local SCRIPT=$1 |
|
48 |
|
|
49 |
psql --set ON_ERROR_STOP=1 -U "$DB_USER" -h "$DB_HOST" -d "$DB_NAME" < "$SCRIPT" |
|
50 |
if [[ $? != 0 ]]; then |
|
51 |
echo "Error while executing SQL script ${SCRIPT}" |
|
52 |
exit 1 |
|
53 |
fi |
|
54 |
} |
|
55 |
|
|
56 |
echo "Updating geonames.org tables..." |
|
57 |
|
|
58 |
# Check for data downloaded into geonames under the current directory. |
|
59 |
DATADIR="$(dirname $0)/geonames" |
|
60 |
if [[ ! -d "${DATADIR}" ]]; then |
|
61 |
echo "making directory ${DATADIR}" |
|
62 |
mkdir -p "${DATADIR}" |
|
63 |
|
|
64 |
if [[ $? != 0 ]]; then |
|
65 |
echo "Could not create directory ${DATADIR}" |
|
66 |
exit 1 |
|
67 |
fi |
|
68 |
fi |
|
69 |
|
|
70 |
run_sql_script "${SCRIPT_DIR}/truncate.geonames.sql" |
|
71 |
|
|
72 |
# |
|
73 |
# load geonames.org geonames |
|
74 |
# |
|
75 |
|
|
76 |
if [[ ! -r "${DATADIR}/${ALL_COUNTRIES_TXT}" ]]; then |
|
77 |
pushd "${DATADIR}" |
|
78 |
wget "${GEONAMES_DUMP_URL}/${ALL_COUNTRIES_ZIP}" |
|
79 |
|
|
80 |
if [[ $? != 0 ]]; then |
|
81 |
echo "Could not download ${ALL_COUNTRIES_ZIP}" |
|
82 |
exit 1 |
|
83 |
fi |
|
84 |
unzip "$ALL_COUNTRIES_ZIP" |
|
85 |
rm -f "$ALL_COUNTRIES_ZIP" |
|
86 |
popd |
|
87 |
fi |
|
88 |
|
|
89 |
# import data |
|
90 |
run_sql_cmd "\COPY geonames FROM '${DATADIR}/${ALL_COUNTRIES_TXT}' DELIMITER E'\t' NULL ''" |
|
91 |
|
|
92 |
# |
|
93 |
# load geonames.org alternate names |
|
94 |
# |
|
95 |
|
|
96 |
if [[ ! -r "${DATADIR}/${ALTERNATE_NAMES_TXT}" ]]; then |
|
97 |
pushd "${DATADIR}" |
|
98 |
wget "${GEONAMES_DUMP_URL}/${ALTERNATE_NAMES_ZIP}" |
|
99 |
|
|
100 |
if [[ $? != 0 ]]; then |
|
101 |
echo "Could not download ${ALTERNATE_NAMES_ZIP}" |
|
102 |
exit 1 |
|
103 |
fi |
|
104 |
unzip "$ALTERNATE_NAMES_ZIP" |
|
105 |
rm -f "$ALTERNATE_NAMES_ZIP" |
|
106 |
popd |
|
107 |
fi |
|
108 |
|
|
109 |
# import data |
|
110 |
run_sql_cmd "\COPY alternateNames FROM '${DATADIR}/${ALTERNATE_NAMES_TXT}' DELIMITER E'\t' NULL ''" |
|
111 |
run_sql_script "${SCRIPT_DIR}/update.alternateNames.sql" |
|
112 |
|
|
113 |
# |
|
114 |
# load geonames.org country info |
|
115 |
# |
|
116 |
|
|
117 |
if [[ ! -r "${DATADIR}/${COUNTRY_INFO_TXT}" ]]; then |
|
118 |
pushd "${DATADIR}" |
|
119 |
wget "${GEONAMES_DUMP_URL}/${COUNTRY_INFO_TXT}" |
|
120 |
|
|
121 |
if [[ $? != 0 ]]; then |
|
122 |
echo "Could not download ${COUNTRY_INFO_TXT}" |
|
123 |
exit 1 |
|
124 |
fi |
|
125 |
popd |
|
126 |
fi |
|
127 |
|
|
128 |
# import geonames country table |
|
129 |
grep -ve '^#' "${DATADIR}/${COUNTRY_INFO_TXT}" | \ |
|
130 |
psql -U "$DB_USER" -h "$DB_HOST" -d "$DB_NAME" -c \ |
|
131 |
"\COPY countries FROM STDIN DELIMITER E'\t' NULL ''" |
|
132 |
if [[ $? != 0 ]]; then |
|
133 |
echo "Could not copy countries table data." |
|
134 |
exit 1 |
|
135 |
fi |
|
136 |
run_sql_script "${SCRIPT_DIR}/update.alt_country.sql" |
|
137 |
run_sql_script "${SCRIPT_DIR}/update.alt_stateprovince.sql" |
|
138 |
|
|
139 |
# |
|
140 |
# load geonames.org hierarchy info |
|
141 |
# |
|
142 |
|
|
143 |
if [[ ! -r "${DATADIR}/${HIERARCHY_TXT}" ]]; then |
|
144 |
pushd "${DATADIR}" |
|
145 |
wget "${GEONAMES_DUMP_URL}/${HIERARCHY_ZIP}" |
|
146 |
|
|
147 |
if [[ $? != 0 ]]; then |
|
148 |
echo "Could not download ${HIERARCHY_ZIP}" |
|
149 |
exit 1 |
|
150 |
fi |
|
151 |
unzip "$HIERARCHY_ZIP" |
|
152 |
rm -f "$HIERARCHY_ZIP" |
|
153 |
popd |
|
154 |
fi |
|
155 |
|
|
156 |
# import geonames country table |
|
157 |
run_sql_cmd "\COPY hierarchy FROM '${DATADIR}/${HIERARCHY_TXT}' DELIMITER E'\t' NULL ''" |
|
158 |
run_sql_script "${SCRIPT_DIR}/update.hierarchy.sql" |
|
159 |
|
|
160 |
# augment geonames with some additional places we need, and update |
|
161 |
# hierarchy table accordingly |
|
162 |
run_sql_script "${SCRIPT_DIR}/update.geonames.sql" |
|
163 |
|
|
164 |
echo "Update geonames.org tables successfully completed." |
|
165 |
|
|
0 | 166 |
derived/biengeo/update.alt_stateprovince.sql | ||
---|---|---|
1 |
BEGIN; |
|
2 |
|
|
3 |
-- |
|
4 |
-- insert additional custom name-scrub mappings |
|
5 |
-- |
|
6 |
|
|
7 |
-- add some custom stateprovince alternate names that we need |
|
8 |
INSERT INTO alt_stateprovince |
|
9 |
(country, stateprovince, alternatename) |
|
10 |
VALUES |
|
11 |
('Canada', 'British Columbia', 'B.C.') |
|
12 |
, ('Canada', 'Newfoundland and Labrador', 'Newfoundland & Labrador') |
|
13 |
, ('Canada', 'Newfoundland and Labrador', 'New Foundland And Labrador') |
|
14 |
, ('Canada', 'Newfoundland and Labrador', 'Labrador & Newfoundland') |
|
15 |
, ('Canada', 'Northwest Territories', 'North West Territories') |
|
16 |
, ('Canada', 'Northwest Territories', 'Northwest Territorie') |
|
17 |
, ('Canada', 'Northwest Territories', 'NWT') |
|
18 |
, ('Canada', 'Québec', 'Quebéc') |
|
19 |
, ('Canada', 'Québec', 'Pq') |
|
20 |
, ('Canada', 'Québec', 'PQ') |
|
21 |
, ('United States', 'Hawaii', 'Hawai`i') |
|
22 |
, ('United States', 'North Carolina', 'N. Carolina') |
|
23 |
, ('United States', 'North Dakota', 'N. Dakota') |
|
24 |
, ('United States', 'South Dakota', 'S. Dakota') |
|
25 |
, ('Kenya', 'Coast Province', 'Coast') |
|
26 |
, ('Mexico', 'Estado de Baja California', 'Baja California Norte') |
|
27 |
, ('Mexico', 'Estado de Baja California', 'Baja California (Norte)') |
|
28 |
, ('Mexico', 'Estado de Baja California', 'Baja California, Norte de') |
|
29 |
, ('Mexico', 'Estado de Baja California', 'Baja California (state)') |
|
30 |
, ('Mexico', 'Estado de Baja California Sur', 'Baja California, Sur de') |
|
31 |
, ('Mexico', 'Estado de Baja California Sur', 'Baja California Sur (state)') |
|
32 |
, ('Mexico', 'Estado de Baja California Sur', 'Baja Califronia') |
|
33 |
, ('Mexico', 'Estado de Baja California Sur', 'Baja Cal. Sur') |
|
34 |
, ('Ecuador', 'Provincia de Sucumbíos', 'Sucumbíos') |
|
35 |
; |
|
36 |
|
|
37 |
-- additions for mapping to gadm2 level 1 units |
|
38 |
-- - Los Lagos -> Los Rios is imperfect but needed for gadm |
|
39 |
-- - Orellana -> Napo is imperfect (wrong??) but needed for gadm |
|
40 |
-- - Mapping both Concelho de Macau and Concelho das Ilhas to Ilhas |
|
41 |
-- - Nassa seems to be a gadm misspelling? (Niassa) |
|
42 |
-- not addressed: |
|
43 |
-- - Bahamas: unmapped gadm level1 entities may be obsolete? |
|
44 |
-- - French Guiana has arrondissements as adm0 in gadm, low level? |
|
45 |
-- - French Polynesia has no adm1 in gadm |
|
46 |
-- - Madagascar: Sava is (was?) inside gadm's Antsiranana province |
|
47 |
-- - Philippines: GN has 17 regions at level1, gadm has 82 provinces |
|
48 |
-- - Sri Lanka: GN ADM2 matches gadm level1; bien data side with GN |
|
49 |
-- - Venezuela: Dependencias Federales don't seem to be in GADM |
|
50 |
-- - Vietnam: GN ADM1 matches gadm level2; bien data side with GN? |
|
51 |
INSERT INTO alt_stateprovince |
|
52 |
(country, stateprovince, alternatename) |
|
53 |
VALUES |
|
54 |
('Cambodia', 'Khétt Siĕm Réab', 'Siemréab') |
|
55 |
, ('Chile', 'Región del Biobío', 'Bío-Bío') |
|
56 |
, ('Chile', 'Región del Libertador General Bernardo O’Higgins', 'Libertador General Bernardo O''Higgins') |
|
57 |
, ('Chile', 'Región de Los Ríos', 'Los Lagos') |
|
58 |
, ('Chile', 'Providencia y Santa Catalina, Departamento de Archipiélago de San Andrés', 'San Andrés y Providencia') |
|
59 |
, ('Dominican Republic', 'Provincia de Baoruco', 'Bahoruco') |
|
60 |
, ('Ecuador', 'Provincia de Morona Santiago', 'Morona Santiago') |
|
61 |
, ('Ecuador', 'Provincia de Napo', 'Orellana') |
|
62 |
, ('Ecuador', 'Provincia de Zamora Chinchipe', 'Zamora Chinchipe') |
|
63 |
, ('Gabon', 'Province du Woleu-Ntem', 'Wouleu-Ntem') |
|
64 |
, ('Ghana', 'Brong-Ahafo Region', 'Brong Ahafo') |
|
65 |
, ('Guatemala', 'Departamento de Quetzaltenango', 'Quezaltenango') |
|
66 |
, ('Haiti', 'Département de l''Artibonite', 'L''Artibonite') |
|
67 |
, ('Liberia', 'Grand Bassa County', 'GrandBassa') |
|
68 |
, ('Liberia', 'Grand Gedeh County', 'GrandGedeh') |
|
69 |
, ('Macao', 'Concelho de Macau', 'Ilhas') |
|
70 |
, ('Mozambique', 'Niassa Province', 'Nassa') |
|
71 |
, ('Norway', 'Østfold fylke', 'Ãstfold') |
|
72 |
, ('Peru', 'Región de Huánuco', 'Huánuco') |
|
73 |
, ('Puerto Rico', 'Guánica Municipio', 'Guánica') |
|
74 |
, ('Puerto Rico', 'Loíza Municipio', 'Loíza') |
|
75 |
, ('Puerto Rico', 'Manatí Municipio', 'Manatí') |
|
76 |
, ('Puerto Rico', 'Peñuelas Municipio', 'Peñuelas') |
|
77 |
, ('Puerto Rico', 'Río Grande Municipio', 'Río Grande') |
|
78 |
, ('Puerto Rico', 'San Germán Municipio', 'San Germán') |
|
79 |
, ('Puerto Rico', 'San Sebastián Municipio', 'San Sebastián') |
|
80 |
, ('Russia', 'Karachayevo-Cherkesskaya Respublika', 'Karachay-Cherkess') |
|
81 |
, ('Syria', 'Muḩāfaz̧at al Lādhiqīyah', 'Lattakia') |
|
82 |
, ('Togo', 'Région Centrale', 'Centre') |
|
83 |
; |
|
84 |
|
|
85 |
COMMIT; |
|
86 |
|
|
87 |
|
derived/biengeo/update.geonames.sql | ||
---|---|---|
1 |
BEGIN; |
|
2 |
|
|
3 |
SELECT setval('geonames_geonameid_seq', (SELECT max(geonameid) FROM geonames)); |
|
4 |
CREATE INDEX geonames_countrycode_idx ON geonames (countrycode); |
|
5 |
CREATE INDEX geonames_featurecode_idx ON geonames (featurecode); |
|
6 |
CREATE INDEX geonames_name_idx ON geonames (name); |
|
7 |
|
|
8 |
-- augment geonames with some additional places we need, and update |
|
9 |
-- hierarchy table accordingly |
|
10 |
WITH newnames AS ( |
|
11 |
INSERT INTO geonames |
|
12 |
(name, alternatenames, featurecode, countrycode ) |
|
13 |
VALUES |
|
14 |
('Antananarivo', NULL, 'ADM1', 'MG'), |
|
15 |
('Antsiranana', NULL, 'ADM1', 'MG'), |
|
16 |
('Fianarantsoa', NULL, 'ADM1', 'MG'), |
|
17 |
('Mahajanga', NULL, 'ADM1', 'MG'), |
|
18 |
('Toamasina', NULL, 'ADM1', 'MG'), |
|
19 |
('Toliara', 'Toliary', 'ADM1', 'MG') |
|
20 |
RETURNING geonameid, countrycode |
|
21 |
) |
|
22 |
INSERT INTO hierarchy (parentid, childid) |
|
23 |
SELECT c.geonameid AS parentid, |
|
24 |
n.geonameid AS childid |
|
25 |
FROM newnames n, |
|
26 |
countries c |
|
27 |
WHERE n.countrycode = c.iso; |
|
28 |
|
|
29 |
COMMIT; |
|
30 |
|
Also available in: Unified diff
Refactored geonames.sh to update_geonames_data.sh
Renamed geonames.sh to update_geonames_data.sh and moved many of the SQL
statements from the bash script into supporting update and truncate sql
scripts.
These sql and update_geonames_data.sh scripts now assume all required
tables have already been created (by install sql scripts added in a
previous commit).