Project

General

Profile

1
#!/bin/bash
2

    
3
# Script to geoscrub-enable a postgres database, developed in the
4
# context of BIEN3 geoscrubbing but with intentions of generality.
5
#
6
# Basic workflow:
7
#  1. Import table dumps obtained from geonames.org
8
#  2. Add additional custom mapping info, including:
9
#     a. Madagascar provinces used in GADM2 but not geonames
10
#     b. extra country name mappings/fixes specific to BIEN3 data
11
#     c. extra state/prov name mappings/fixes specific to BIEN3 data
12
#     d. custom state/prov name mappings binding GADM2 to geonames
13
#
14
# Jim Regetz
15
# NCEAS
16
# Created Nov 2012
17
#
18
# refactored and reorganized by
19
# Paul Sarando
20
# iPlant Collaborative
21
# Updated Oct 2013
22

    
23
DB_NAME="geoscrub"
24
DB_USER="bien"
25
DB_HOST_OPT=""
26
SCRIPT_DIR="$(dirname $0)"
27
DATADIR="${HOME}/geonames"
28
GEONAMES_DUMP_URL="http://download.geonames.org/export/dump"
29
ALL_COUNTRIES_ZIP=allCountries.zip
30
ALL_COUNTRIES_TXT=allCountries.txt
31
ALTERNATE_NAMES_ZIP=alternateNames.zip
32
ALTERNATE_NAMES_TXT=alternateNames.txt
33
COUNTRY_INFO_TXT=countryInfo.txt
34
HIERARCHY_ZIP=hierarchy.zip
35
HIERARCHY_TXT=hierarchy.txt
36

    
37
function usage {
38
    echo "Usage: $0 [OPTIONS]" >&2
39
    echo "Valid Options:" >&2
40
    echo "-d, --dbname=DBNAME      database name psql commands will connect to" >&2
41
    echo "-h, --host=HOSTNAME      database server host or socket directory" >&2
42
    echo "-U, --username=USERNAME  database user name" >&2
43
    echo ""
44
    echo "Input Data Options:" >&2
45
    echo "-m, --geonames-data      Geonames.org data directory (default: ${HOME}/geonames)" >&2
46
    echo "                         Delete this directory, or the geonames.org data in it," >&2
47
    echo "                         to re-download the data." >&2
48
    exit 1;
49
}
50

    
51
while [[ $# -gt 0  ]]; do
52
    case "$1" in
53
        -\? | --help)
54
            usage
55
            ;;
56
        -h)
57
            if [[ -z $2  ]];  then
58
                echo "Option $1 requires an argument." >&2
59
                usage
60
            fi
61
            DB_HOST_OPT="-h $2"
62
            shift 2
63
            ;;
64
        --host=*)
65
            DB_HOST_OPT="-h ${1#*=}"
66
            shift
67
            ;;
68
        -U)
69
            if [[ -z $2  ]];  then
70
                echo "Option $1 requires an argument." >&2
71
                usage
72
            fi
73
            DB_USER="$2"
74
            shift 2
75
            ;;
76
        --username=*)
77
            DB_USER="${1#*=}"
78
            shift
79
            ;;
80
        -d)
81
            if [[ -z $2  ]];  then
82
                echo "Option $1 requires an argument." >&2
83
                usage
84
            fi
85
            DB_NAME="$2"
86
            shift 2
87
            ;;
88
        --dbname=*)
89
            DB_NAME="${1#*=}"
90
            shift
91
            ;;
92
        -m)
93
            if [[ -z $2  ]];  then
94
                echo "Option $1 requires an argument." >&2
95
                usage
96
            fi
97
            DATADIR="$2"
98
            shift 2
99
            ;;
100
        --geonames-data=*)
101
            DATADIR="${1#*=}"
102
            shift
103
            ;;
104
        *)
105
            echo "Invalid option: $1" >&2
106
            usage
107
            ;;
108
    esac
109
done
110

    
111
function run_sql_cmd {
112
    local SQL_CMD=$1
113

    
114
    psql -e -U "$DB_USER" $DB_HOST_OPT -d "$DB_NAME" -c "$SQL_CMD"
115
    if [[ $? != 0 ]]; then
116
        echo "Error while executing SQL command."
117
        exit 1
118
    fi
119
}
120

    
121
function run_sql_script {
122
    local SCRIPT=$1
123

    
124
    psql -e --set ON_ERROR_STOP=1 -U "$DB_USER" $DB_HOST_OPT -d "$DB_NAME" < "$SCRIPT"
125
    if [[ $? != 0 ]]; then
126
        echo "Error while executing SQL script ${SCRIPT}"
127
        exit 1
128
    fi
129
}
130

    
131
echo "Updating geonames.org tables..."
132

    
133
# Check for data downloaded into geonames under the current directory.
134
if [[ ! -d "${DATADIR}" ]]; then
135
    echo "making directory ${DATADIR}"
136
    mkdir -p "${DATADIR}"
137

    
138
    if [[ $? != 0 ]]; then
139
        echo "Could not create directory ${DATADIR}"
140
        exit 1
141
    fi
142
fi
143

    
144
run_sql_script "${SCRIPT_DIR}/truncate.geonames.sql"
145

    
146
#
147
# load geonames.org geonames
148
#
149

    
150
if [[ ! -r "${DATADIR}/${ALL_COUNTRIES_TXT}" ]]; then
151
    pushd "${DATADIR}"
152
    wget "${GEONAMES_DUMP_URL}/${ALL_COUNTRIES_ZIP}"
153

    
154
    if [[ $? != 0 ]]; then
155
        echo "Could not download ${ALL_COUNTRIES_ZIP}"
156
        exit 1
157
    fi
158
    unzip "$ALL_COUNTRIES_ZIP"
159
    rm -f "$ALL_COUNTRIES_ZIP"
160
    popd
161
fi
162

    
163
# import data
164
echo "Copying geonames from ${DATADIR}/${ALL_COUNTRIES_TXT}"
165
run_sql_cmd "\COPY geonames FROM '${DATADIR}/${ALL_COUNTRIES_TXT}' DELIMITER E'\t' NULL ''"
166

    
167
#
168
# load geonames.org alternate names
169
#
170

    
171
if [[ ! -r "${DATADIR}/${ALTERNATE_NAMES_TXT}" ]]; then
172
    pushd "${DATADIR}"
173
    wget "${GEONAMES_DUMP_URL}/${ALTERNATE_NAMES_ZIP}"
174

    
175
    if [[ $? != 0 ]]; then
176
        echo "Could not download ${ALTERNATE_NAMES_ZIP}"
177
        exit 1
178
    fi
179
    unzip "$ALTERNATE_NAMES_ZIP"
180
    rm -f "$ALTERNATE_NAMES_ZIP"
181
    popd
182
fi
183

    
184
# import data
185
echo "Copying alternateNames from ${DATADIR}/${ALTERNATE_NAMES_TXT}"
186
run_sql_cmd "\COPY alternateNames FROM '${DATADIR}/${ALTERNATE_NAMES_TXT}' DELIMITER E'\t' NULL ''"
187
run_sql_script "${SCRIPT_DIR}/update.alternateNames.sql"
188

    
189
#
190
# load geonames.org country info
191
#
192

    
193
if [[ ! -r "${DATADIR}/${COUNTRY_INFO_TXT}" ]]; then
194
    pushd "${DATADIR}"
195
    wget "${GEONAMES_DUMP_URL}/${COUNTRY_INFO_TXT}"
196

    
197
    if [[ $? != 0 ]]; then
198
        echo "Could not download ${COUNTRY_INFO_TXT}"
199
        exit 1
200
    fi
201
    popd
202
fi
203

    
204
# import geonames country table
205
echo "Copying countries from ${DATADIR}/${COUNTRY_INFO_TXT}"
206
grep -ve '^#' "${DATADIR}/${COUNTRY_INFO_TXT}" | \
207
    psql -U "$DB_USER" $DB_HOST_OPT -d "$DB_NAME" -c \
208
    "\COPY countries FROM STDIN DELIMITER E'\t' NULL ''"
209
if [[ $? != 0 ]]; then
210
    echo "Could not copy countries table data."
211
    exit 1
212
fi
213
run_sql_script "${SCRIPT_DIR}/update.alt_country.sql"
214
run_sql_script "${SCRIPT_DIR}/update.alt_stateprovince.sql"
215

    
216
#
217
# load geonames.org hierarchy info
218
#
219

    
220
if [[ ! -r "${DATADIR}/${HIERARCHY_TXT}" ]]; then
221
    pushd "${DATADIR}"
222
    wget "${GEONAMES_DUMP_URL}/${HIERARCHY_ZIP}"
223

    
224
    if [[ $? != 0 ]]; then
225
        echo "Could not download ${HIERARCHY_ZIP}"
226
        exit 1
227
    fi
228
    unzip "$HIERARCHY_ZIP"
229
    rm -f "$HIERARCHY_ZIP"
230
    popd
231
fi
232

    
233
# import geonames country table
234
echo "Copying hierarchy from ${DATADIR}/${HIERARCHY_TXT}"
235
run_sql_cmd "\COPY hierarchy FROM '${DATADIR}/${HIERARCHY_TXT}' DELIMITER E'\t' NULL ''"
236
run_sql_script "${SCRIPT_DIR}/update.hierarchy.sql"
237

    
238
# augment geonames with some additional places we need, and update
239
# hierarchy table accordingly
240
echo "Updating geonames and hierarchy tables..."
241
run_sql_script "${SCRIPT_DIR}/update.geonames.sql"
242

    
243
echo "Update geonames.org tables successfully completed."
244

    
(26-26/27)