Revision 11493
Added by Paul Sarando about 11 years ago
derived/biengeo/geoscrub.sh | ||
---|---|---|
1 |
#!/bin/bash |
|
2 |
|
|
3 |
# Bash script to load geoscrub input data into the geoscrub database |
|
4 |
# and scrub it with the geonames.sql and geovalidate.sql scripts. |
|
5 |
# |
|
6 |
# Basic workflow: |
|
7 |
# 1. Load geoscrub input data into database |
|
8 |
# 2. Scrub geoscrub input data with the geonames.sql script |
|
9 |
# 3. Scrub geoscrub input data with the geovalidate.sql script |
|
10 |
# |
|
11 |
# Paul Sarando |
|
12 |
# iPlant Collaborative |
|
13 |
# Oct 2013 |
|
14 |
|
|
15 |
# Connection settings used by every psql invocation in this script.
DB_NAME="geoscrub"
DB_USER="bien"
DB_HOST="localhost"
# Directory containing this script. "$0" is quoted inside the command
# substitution so that a script path containing whitespace is not word-split.
SCRIPT_DIR="$(dirname -- "$0")"
|
19 |
|
|
20 |
#######################################
# Run a SQL script against the configured database using psql.
# Globals:   DB_USER, DB_HOST, DB_NAME (read)
# Arguments: $1 - path to the SQL script; it is fed to psql on stdin
# Outputs:   psql's own output on stdout (-e echoes the statements sent);
#            an error message on stderr if the run fails
# Returns:   0 on success; on failure the whole script exits with status 1
#######################################
function run_sql_script {
    local SCRIPT=$1

    # ON_ERROR_STOP=1 is passed so psql stops at the first failing statement
    # instead of continuing; a non-zero status (including a failed input
    # redirection, e.g. a missing file) is treated as fatal.
    if ! psql -e -U "$DB_USER" -h "$DB_HOST" -d "$DB_NAME" --set ON_ERROR_STOP=1 < "$SCRIPT"; then
        # Diagnostics go to stderr so they do not mix with psql's echoed SQL.
        echo "Error while executing SQL script ${SCRIPT}" >&2
        exit 1
    fi
}
|
29 |
|
|
# Step 1: load the geoscrub input data into the database.
if ! "${SCRIPT_DIR}"/load-geoscrub-input.sh; then
    # The loader handles geoscrub input data (not geonames.org data), so the
    # message names what actually failed; it goes to stderr as a diagnostic.
    echo "Could not load ${DB_NAME} database with geoscrub input data." >&2
    exit 1
fi

# Step 2: scrub the input with geonames.sql.
echo "Scrubbing input with geonames data..."
run_sql_script "${SCRIPT_DIR}/geonames.sql"

# Step 3: scrub the input with geovalidate.sql.
echo "Scrubbing input with geovalidate data..."
run_sql_script "${SCRIPT_DIR}/geovalidate.sql"

echo "Input successfully scrubbed."
echo "Scrubbed input available in the geoscrub table of the ${DB_NAME} database."
|
44 |
|
|
0 | 45 |
derived/biengeo/README.txt | ||
---|---|---|
37 | 37 |
WARNING: deletes any previous geoscrubbing results! |
38 | 38 |
runtime: ~5.5 h |
39 | 39 |
cd <svn_biengeo_root> |
40 |
4. load-geoscrub-input.sh
|
|
41 |
- dumps geoscrub_input from vegbien and loads it into the geoscrub db
|
|
42 |
5. geonames.sql
|
|
43 |
sudo -u postgres psql -e --set ON_ERROR_STOP=1 -d geoscrub < geonames.sql
|
|
44 |
- contains SQL statements that scrub asserted names and (to the
|
|
45 |
extent possible) map them to GADM2
|
|
46 |
6. geovalidate.sql
|
|
47 |
runtime: 5.5 h
|
|
48 |
sudo -u postgres psql -e --set ON_ERROR_STOP=1 -d geoscrub < geovalidate.sql
|
|
49 |
- contains (postgis-extended) SQL statements that score the validity |
|
50 |
of GADM2-scrubbed names against given point coordinates |
|
40 |
4. geoscrub.sh
|
|
41 |
- runs the following scripts in order to load and scrub vegbien input data:
|
|
42 |
* load-geoscrub-input.sh
|
|
43 |
- dumps geoscrub_input from vegbien and loads it into the geoscrub db
|
|
44 |
* geonames.sql
|
|
45 |
- contains SQL statements that scrub asserted names and (to the
|
|
46 |
extent possible) map them to GADM2
|
|
47 |
* geovalidate.sql
|
|
48 |
runtime: 5.5 h
|
|
49 |
- contains (postgis-extended) SQL statements that score the validity
|
|
50 |
of GADM2-scrubbed names against given point coordinates
|
|
51 | 51 |
|
52 | 52 |
[Also see comments embedded in specific scripts in this directory.] |
53 | 53 |
|
54 |
The bash and SQL statements contained in the files as ordered below
|
|
54 |
The bash and SQL statements contained in the files as ordered above
|
|
55 | 55 |
should be applied to carry out geographic name scrubbing and |
56 | 56 |
geovalidation on a given corpus of BIEN location records. |
57 | 57 |
|
derived/biengeo/load-geoscrub-input.sh | ||
---|---|---|
26 | 26 |
DB_NAME="geoscrub" |
27 | 27 |
DB_USER="bien" |
28 | 28 |
DB_HOST="localhost" |
29 |
SCRIPT_DIR=$(dirname $0)
|
|
29 |
SCRIPT_DIR="$(dirname $0)"
|
|
30 | 30 |
DATA_URL="http://fs.vegpath.org/exports/geoscrub_input.no_header.cols=country,stateProvince,county,decimalLatitude,decimalLongitude.csv" |
31 | 31 |
DATADIR="${SCRIPT_DIR}/input" |
32 | 32 |
DATAFILE="${DATADIR}/geoscrub-corpus.csv" |
... | ... | |
60 | 60 |
exit 1 |
61 | 61 |
fi |
62 | 62 |
|
63 |
# load |
|
63 |
# load vegbien_geoscrub table with input data
|
|
64 | 64 |
psql -U "$DB_USER" -h "$DB_HOST" -d "$DB_NAME" -c "\COPY vegbien_geoscrub FROM '${DATAFILE}' WITH CSV" |
65 | 65 |
|
Also available in: Unified diff
Added geoscrub.sh script.
This script runs the load-geoscrub-input.sh, geonames.sql, and
geovalidate.sql scripts in order to load and scrub vegbien input data.
Updated README to explain the new script.
Minor updates to load-geoscrub-input.sh.