Project

General

Profile

1 11495 psarando
#!/bin/bash
2
3
# Script to geoscrub-enable a postgres database, developed in the
4
# context of BIEN3 geoscrubbing but with intentions of generality.
5
#
6
# Basic workflow:
7
#  1. Import table dumps obtained from geonames.org
8
#  2. Add additional custom mapping info, including:
9
#     a. Madagascar provinces used in GADM2 but not geonames
10
#     b. extra country name mappings/fixes specific to BIEN3 data
11
#     c. extra state/prov name mappings/fixes specific to BIEN3 data
12
#     d. custom state/prov name mappings binding GADM2 to geonames
13
#
14
# Jim Regetz
15
# NCEAS
16
# Created Nov 2012
17
#
18
# refactored and reorganized by
19
# Paul Sarando
20
# iPlant Collaborative
21
# Updated Oct 2013
22
23
DB_NAME="geoscrub"
24
DB_USER="bien"
25 11558 psarando
DB_HOST_OPT=""
26 11495 psarando
SCRIPT_DIR="$(dirname $0)"
27 11561 psarando
DATADIR="${HOME}/geonames"
28 11495 psarando
GEONAMES_DUMP_URL="http://download.geonames.org/export/dump"
29
ALL_COUNTRIES_ZIP=allCountries.zip
30
ALL_COUNTRIES_TXT=allCountries.txt
31
ALTERNATE_NAMES_ZIP=alternateNames.zip
32
ALTERNATE_NAMES_TXT=alternateNames.txt
33
COUNTRY_INFO_TXT=countryInfo.txt
34
HIERARCHY_ZIP=hierarchy.zip
35
HIERARCHY_TXT=hierarchy.txt
36
37 11559 psarando
function usage {
38
    echo "Usage: $0 [OPTIONS]" >&2
39
    echo "Valid Options:" >&2
40
    echo "-d, --dbname=DBNAME      database name psql commands will connect to" >&2
41
    echo "-h, --host=HOSTNAME      database server host or socket directory" >&2
42
    echo "-U, --username=USERNAME  database user name" >&2
43 11561 psarando
    echo ""
44
    echo "Input Data Options:" >&2
45
    echo "-m, --geonames-data      Geonames.org data directory (default: ${HOME}/geonames)" >&2
46
    echo "                         Delete this directory, or the geonames.org data in it," >&2
47
    echo "                         to re-download the data." >&2
48 11559 psarando
    exit 1;
49
}
50
51
while [[ $# -gt 0  ]]; do
52
    case "$1" in
53
        -\? | --help)
54
            usage
55
            ;;
56
        -h)
57
            if [[ -z $2  ]];  then
58
                echo "Option $1 requires an argument." >&2
59
                usage
60
            fi
61
            DB_HOST_OPT="-h $2"
62
            shift 2
63
            ;;
64
        --host=*)
65
            DB_HOST_OPT="-h ${1#*=}"
66
            shift
67
            ;;
68
        -U)
69
            if [[ -z $2  ]];  then
70
                echo "Option $1 requires an argument." >&2
71
                usage
72
            fi
73
            DB_USER="$2"
74
            shift 2
75
            ;;
76
        --username=*)
77
            DB_USER="${1#*=}"
78
            shift
79
            ;;
80
        -d)
81
            if [[ -z $2  ]];  then
82
                echo "Option $1 requires an argument." >&2
83
                usage
84
            fi
85
            DB_NAME="$2"
86
            shift 2
87
            ;;
88
        --dbname=*)
89
            DB_NAME="${1#*=}"
90
            shift
91
            ;;
92 11561 psarando
        -m)
93
            if [[ -z $2  ]];  then
94
                echo "Option $1 requires an argument." >&2
95
                usage
96
            fi
97
            DATADIR="$2"
98
            shift 2
99
            ;;
100
        --geonames-data=*)
101
            DATADIR="${1#*=}"
102
            shift
103
            ;;
104 11559 psarando
        *)
105
            echo "Invalid option: $1" >&2
106
            usage
107
            ;;
108
    esac
109
done
110
111 11495 psarando
function run_sql_cmd {
112
    local SQL_CMD=$1
113
114 11558 psarando
    psql -e -U "$DB_USER" $DB_HOST_OPT -d "$DB_NAME" -c "$SQL_CMD"
115 11495 psarando
    if [[ $? != 0 ]]; then
116
        echo "Error while executing SQL command."
117
        exit 1
118
    fi
119
}
120
121
function run_sql_script {
122
    local SCRIPT=$1
123
124 11558 psarando
    psql -e --set ON_ERROR_STOP=1 -U "$DB_USER" $DB_HOST_OPT -d "$DB_NAME" < "$SCRIPT"
125 11495 psarando
    if [[ $? != 0 ]]; then
126
        echo "Error while executing SQL script ${SCRIPT}"
127
        exit 1
128
    fi
129
}
130
131
echo "Updating geonames.org tables..."
132
133
# Check for data downloaded into geonames under the current directory.
134
if [[ ! -d "${DATADIR}" ]]; then
135
    echo "making directory ${DATADIR}"
136
    mkdir -p "${DATADIR}"
137
138
    if [[ $? != 0 ]]; then
139
        echo "Could not create directory ${DATADIR}"
140
        exit 1
141
    fi
142
fi
143
144
run_sql_script "${SCRIPT_DIR}/truncate.geonames.sql"
145
146
#
147
# load geonames.org geonames
148
#
149
150
if [[ ! -r "${DATADIR}/${ALL_COUNTRIES_TXT}" ]]; then
151
    pushd "${DATADIR}"
152
    wget "${GEONAMES_DUMP_URL}/${ALL_COUNTRIES_ZIP}"
153
154
    if [[ $? != 0 ]]; then
155
        echo "Could not download ${ALL_COUNTRIES_ZIP}"
156
        exit 1
157
    fi
158
    unzip "$ALL_COUNTRIES_ZIP"
159
    rm -f "$ALL_COUNTRIES_ZIP"
160
    popd
161
fi
162
163
# import data
164 11558 psarando
echo "Copying geonames from ${DATADIR}/${ALL_COUNTRIES_TXT}"
165 11495 psarando
run_sql_cmd "\COPY geonames FROM '${DATADIR}/${ALL_COUNTRIES_TXT}' DELIMITER E'\t' NULL ''"
166
167
#
168
# load geonames.org alternate names
169
#
170
171
if [[ ! -r "${DATADIR}/${ALTERNATE_NAMES_TXT}" ]]; then
172
    pushd "${DATADIR}"
173
    wget "${GEONAMES_DUMP_URL}/${ALTERNATE_NAMES_ZIP}"
174
175
    if [[ $? != 0 ]]; then
176
        echo "Could not download ${ALTERNATE_NAMES_ZIP}"
177
        exit 1
178
    fi
179
    unzip "$ALTERNATE_NAMES_ZIP"
180
    rm -f "$ALTERNATE_NAMES_ZIP"
181
    popd
182
fi
183
184
# import data
185 11558 psarando
echo "Copying alternateNames from ${DATADIR}/${ALTERNATE_NAMES_TXT}"
186 11495 psarando
run_sql_cmd "\COPY alternateNames FROM '${DATADIR}/${ALTERNATE_NAMES_TXT}' DELIMITER E'\t' NULL ''"
187
run_sql_script "${SCRIPT_DIR}/update.alternateNames.sql"
188
189
#
190
# load geonames.org country info
191
#
192
193
if [[ ! -r "${DATADIR}/${COUNTRY_INFO_TXT}" ]]; then
194
    pushd "${DATADIR}"
195
    wget "${GEONAMES_DUMP_URL}/${COUNTRY_INFO_TXT}"
196
197
    if [[ $? != 0 ]]; then
198
        echo "Could not download ${COUNTRY_INFO_TXT}"
199
        exit 1
200
    fi
201
    popd
202
fi
203
204
# import geonames country table
205 11558 psarando
echo "Copying countries from ${DATADIR}/${COUNTRY_INFO_TXT}"
206 11495 psarando
grep -ve '^#' "${DATADIR}/${COUNTRY_INFO_TXT}" | \
207 11558 psarando
    psql -U "$DB_USER" $DB_HOST_OPT -d "$DB_NAME" -c \
208 11495 psarando
    "\COPY countries FROM STDIN DELIMITER E'\t' NULL ''"
209
if [[ $? != 0 ]]; then
210
    echo "Could not copy countries table data."
211
    exit 1
212
fi
213
run_sql_script "${SCRIPT_DIR}/update.alt_country.sql"
214
run_sql_script "${SCRIPT_DIR}/update.alt_stateprovince.sql"
215
216
#
217
# load geonames.org hierarchy info
218
#
219
220
if [[ ! -r "${DATADIR}/${HIERARCHY_TXT}" ]]; then
221
    pushd "${DATADIR}"
222
    wget "${GEONAMES_DUMP_URL}/${HIERARCHY_ZIP}"
223
224
    if [[ $? != 0 ]]; then
225
        echo "Could not download ${HIERARCHY_ZIP}"
226
        exit 1
227
    fi
228
    unzip "$HIERARCHY_ZIP"
229
    rm -f "$HIERARCHY_ZIP"
230
    popd
231
fi
232
233
# import geonames country table
234 11558 psarando
echo "Copying hierarchy from ${DATADIR}/${HIERARCHY_TXT}"
235 11495 psarando
run_sql_cmd "\COPY hierarchy FROM '${DATADIR}/${HIERARCHY_TXT}' DELIMITER E'\t' NULL ''"
236
run_sql_script "${SCRIPT_DIR}/update.hierarchy.sql"
237
238
# augment geonames with some additional places we need, and update
239
# hierarchy table accordingly
240 11558 psarando
echo "Updating geonames and hierarchy tables..."
241 11495 psarando
run_sql_script "${SCRIPT_DIR}/update.geonames.sql"
242
243
echo "Update geonames.org tables successfully completed."