Project

General

Profile

1
#!/bin/bash
2

    
3
# Script to geoscrub-enable a postgres database, developed in the
4
# context of BIEN3 geoscrubbing but with intentions of generality.
5
#
6
# Basic workflow:
7
#  1. Import table dumps obtained from geonames.org
8
#  2. Add additional custom mapping info, including:
9
#     a. Madagascar provinces used in GADM2 but not geonames
10
#     b. extra country name mappings/fixes specific to BIEN3 data
11
#     c. extra state/prov name mappings/fixes specific to BIEN3 data
12
#     d. custom state/prov name mappings binding GADM2 to geonames
13
#
14
# Jim Regetz
15
# NCEAS
16
# Created Nov 2012
17
#
18
# refactored and reorganized by
19
# Paul Sarando
20
# iPlant Collaborative
21
# Updated Oct 2013
22

    
23
DB_NAME="geoscrub"
24
DB_USER="bien"
25
DB_HOST_OPT=""
26
SCRIPT_DIR="$(dirname $0)"
27
GEONAMES_DUMP_URL="http://download.geonames.org/export/dump"
28
ALL_COUNTRIES_ZIP=allCountries.zip
29
ALL_COUNTRIES_TXT=allCountries.txt
30
ALTERNATE_NAMES_ZIP=alternateNames.zip
31
ALTERNATE_NAMES_TXT=alternateNames.txt
32
COUNTRY_INFO_TXT=countryInfo.txt
33
HIERARCHY_ZIP=hierarchy.zip
34
HIERARCHY_TXT=hierarchy.txt
35

    
36
function usage {
37
    echo "Usage: $0 [OPTIONS]" >&2
38
    echo "Valid Options:" >&2
39
    echo "-d, --dbname=DBNAME      database name psql commands will connect to" >&2
40
    echo "-h, --host=HOSTNAME      database server host or socket directory" >&2
41
    echo "-U, --username=USERNAME  database user name" >&2
42
    exit 1;
43
}
44

    
45
while [[ $# -gt 0  ]]; do
46
    case "$1" in
47
        -\? | --help)
48
            usage
49
            ;;
50
        -h)
51
            if [[ -z $2  ]];  then
52
                echo "Option $1 requires an argument." >&2
53
                usage
54
            fi
55
            DB_HOST_OPT="-h $2"
56
            shift 2
57
            ;;
58
        --host=*)
59
            DB_HOST_OPT="-h ${1#*=}"
60
            shift
61
            ;;
62
        -U)
63
            if [[ -z $2  ]];  then
64
                echo "Option $1 requires an argument." >&2
65
                usage
66
            fi
67
            DB_USER="$2"
68
            shift 2
69
            ;;
70
        --username=*)
71
            DB_USER="${1#*=}"
72
            shift
73
            ;;
74
        -d)
75
            if [[ -z $2  ]];  then
76
                echo "Option $1 requires an argument." >&2
77
                usage
78
            fi
79
            DB_NAME="$2"
80
            shift 2
81
            ;;
82
        --dbname=*)
83
            DB_NAME="${1#*=}"
84
            shift
85
            ;;
86
        *)
87
            echo "Invalid option: $1" >&2
88
            usage
89
            ;;
90
    esac
91
done
92

    
93
function run_sql_cmd {
94
    local SQL_CMD=$1
95

    
96
    psql -e -U "$DB_USER" $DB_HOST_OPT -d "$DB_NAME" -c "$SQL_CMD"
97
    if [[ $? != 0 ]]; then
98
        echo "Error while executing SQL command."
99
        exit 1
100
    fi
101
}
102

    
103
function run_sql_script {
104
    local SCRIPT=$1
105

    
106
    psql -e --set ON_ERROR_STOP=1 -U "$DB_USER" $DB_HOST_OPT -d "$DB_NAME" < "$SCRIPT"
107
    if [[ $? != 0 ]]; then
108
        echo "Error while executing SQL script ${SCRIPT}"
109
        exit 1
110
    fi
111
}
112

    
113
echo "Updating geonames.org tables..."
114

    
115
# Check for data downloaded into geonames under the current directory.
116
DATADIR="$(dirname $0)/geonames"
117
if [[ ! -d "${DATADIR}" ]]; then
118
    echo "making directory ${DATADIR}"
119
    mkdir -p "${DATADIR}"
120

    
121
    if [[ $? != 0 ]]; then
122
        echo "Could not create directory ${DATADIR}"
123
        exit 1
124
    fi
125
fi
126

    
127
run_sql_script "${SCRIPT_DIR}/truncate.geonames.sql"
128

    
129
#
130
# load geonames.org geonames
131
#
132

    
133
if [[ ! -r "${DATADIR}/${ALL_COUNTRIES_TXT}" ]]; then
134
    pushd "${DATADIR}"
135
    wget "${GEONAMES_DUMP_URL}/${ALL_COUNTRIES_ZIP}"
136

    
137
    if [[ $? != 0 ]]; then
138
        echo "Could not download ${ALL_COUNTRIES_ZIP}"
139
        exit 1
140
    fi
141
    unzip "$ALL_COUNTRIES_ZIP"
142
    rm -f "$ALL_COUNTRIES_ZIP"
143
    popd
144
fi
145

    
146
# import data
147
echo "Copying geonames from ${DATADIR}/${ALL_COUNTRIES_TXT}"
148
run_sql_cmd "\COPY geonames FROM '${DATADIR}/${ALL_COUNTRIES_TXT}' DELIMITER E'\t' NULL ''"
149

    
150
#
151
# load geonames.org alternate names
152
#
153

    
154
if [[ ! -r "${DATADIR}/${ALTERNATE_NAMES_TXT}" ]]; then
155
    pushd "${DATADIR}"
156
    wget "${GEONAMES_DUMP_URL}/${ALTERNATE_NAMES_ZIP}"
157

    
158
    if [[ $? != 0 ]]; then
159
        echo "Could not download ${ALTERNATE_NAMES_ZIP}"
160
        exit 1
161
    fi
162
    unzip "$ALTERNATE_NAMES_ZIP"
163
    rm -f "$ALTERNATE_NAMES_ZIP"
164
    popd
165
fi
166

    
167
# import data
168
echo "Copying alternateNames from ${DATADIR}/${ALTERNATE_NAMES_TXT}"
169
run_sql_cmd "\COPY alternateNames FROM '${DATADIR}/${ALTERNATE_NAMES_TXT}' DELIMITER E'\t' NULL ''"
170
run_sql_script "${SCRIPT_DIR}/update.alternateNames.sql"
171

    
172
#
173
# load geonames.org country info
174
#
175

    
176
if [[ ! -r "${DATADIR}/${COUNTRY_INFO_TXT}" ]]; then
177
    pushd "${DATADIR}"
178
    wget "${GEONAMES_DUMP_URL}/${COUNTRY_INFO_TXT}"
179

    
180
    if [[ $? != 0 ]]; then
181
        echo "Could not download ${COUNTRY_INFO_TXT}"
182
        exit 1
183
    fi
184
    popd
185
fi
186

    
187
# import geonames country table
188
echo "Copying countries from ${DATADIR}/${COUNTRY_INFO_TXT}"
189
grep -ve '^#' "${DATADIR}/${COUNTRY_INFO_TXT}" | \
190
    psql -U "$DB_USER" $DB_HOST_OPT -d "$DB_NAME" -c \
191
    "\COPY countries FROM STDIN DELIMITER E'\t' NULL ''"
192
if [[ $? != 0 ]]; then
193
    echo "Could not copy countries table data."
194
    exit 1
195
fi
196
run_sql_script "${SCRIPT_DIR}/update.alt_country.sql"
197
run_sql_script "${SCRIPT_DIR}/update.alt_stateprovince.sql"
198

    
199
#
200
# load geonames.org hierarchy info
201
#
202

    
203
if [[ ! -r "${DATADIR}/${HIERARCHY_TXT}" ]]; then
204
    pushd "${DATADIR}"
205
    wget "${GEONAMES_DUMP_URL}/${HIERARCHY_ZIP}"
206

    
207
    if [[ $? != 0 ]]; then
208
        echo "Could not download ${HIERARCHY_ZIP}"
209
        exit 1
210
    fi
211
    unzip "$HIERARCHY_ZIP"
212
    rm -f "$HIERARCHY_ZIP"
213
    popd
214
fi
215

    
216
# import geonames country table
217
echo "Copying hierarchy from ${DATADIR}/${HIERARCHY_TXT}"
218
run_sql_cmd "\COPY hierarchy FROM '${DATADIR}/${HIERARCHY_TXT}' DELIMITER E'\t' NULL ''"
219
run_sql_script "${SCRIPT_DIR}/update.hierarchy.sql"
220

    
221
# augment geonames with some additional places we need, and update
222
# hierarchy table accordingly
223
echo "Updating geonames and hierarchy tables..."
224
run_sql_script "${SCRIPT_DIR}/update.geonames.sql"
225

    
226
echo "Update geonames.org tables successfully completed."
227

    
(25-25/26)