1 |
dd85f6d3
|
Jim Regetz
|
# R script for batch parsing and loading GHCN daily station files
|
2 |
|
|
# (*.dly) into a SQLite database
|
3 |
|
|
#
|
4 |
|
|
# As written, the script is intended to create and populate the database
|
5 |
|
|
# from scratch, reporting an error if the script already exists. In
|
6 |
|
|
# principle though, the code that processes and loads a given *.dly file
|
7 |
|
|
# could be run on its own to load additional data into an already
|
8 |
|
|
# existing database file.
|
9 |
|
|
#
|
10 |
|
|
# Jim Regetz
|
11 |
|
|
# NCEAS
|
12 |
|
|
# Created on 09-May-2012
|
13 |
|
|
|
14 |
|
|
require(RSQLite)
|
15 |
|
|
|
16 |
|
|
db.path <- "ghcn_all.db"
|
17 |
|
|
|
18 |
|
|
# define bulk insert helper function (adapted from RSQLite
|
19 |
|
|
# documentation)
|
20 |
|
|
ghcn_bulk_insert <- function(db, sql, dat) {
|
21 |
|
|
dbBeginTransaction(db)
|
22 |
|
|
dbGetPreparedQuery(db, sql, bind.data = dat)
|
23 |
|
|
dbCommit(db)
|
24 |
|
|
dbGetQuery(db, "select count(*) from ghcn")[[1]]
|
25 |
|
|
}
|
26 |
|
|
|
27 |
|
|
# establish database connection
|
28 |
|
|
con <- dbDriver("SQLite")
|
29 |
|
|
if (file.exists(db.path)) {
|
30 |
|
|
stop("database already exists at ", db.path)
|
31 |
|
|
}
|
32 |
|
|
db <- dbConnect(con, dbname=db.path)
|
33 |
|
|
|
34 |
|
|
# create main ghcn table
|
35 |
|
|
sql <- "
|
36 |
|
|
CREATE TABLE ghcn (
|
37 |
|
|
id text,
|
38 |
|
|
year int,
|
39 |
|
|
month int,
|
40 |
|
|
element text,
|
41 |
|
|
day int,
|
42 |
|
|
value int,
|
43 |
|
|
mflag text,
|
44 |
|
|
qflag text,
|
45 |
|
|
sflag text,
|
46 |
|
|
srcrowid int)
|
47 |
|
|
"
|
48 |
|
|
dbGetQuery(db, sql)
|
49 |
|
|
|
50 |
|
|
# prepare sql insert statement
|
51 |
|
|
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))),
|
52 |
|
|
collapse=", ")
|
53 |
|
|
sql <- paste("insert into ghcn values (", params.clist, ")", sep="")
|
54 |
|
|
|
55 |
|
|
# process and insert daily data
|
56 |
2d08ed05
|
Jim Regetz
|
DLY.COLS <- c("character", "integer", "integer", "character",
|
57 |
|
|
rep(c("numeric", "character", "character", "character"), times=31))
|
58 |
|
|
|
59 |
669c150d
|
Jim Regetz
|
# shell out to OS to leverage grep/awk/tr for faster initial parsing and
|
60 |
|
|
# filtering of data; if no data records are read in, this function
|
61 |
|
|
# returns NULL
|
62 |
|
|
loadAsCSV <- function(dly, patt=NULL) {
|
63 |
2d08ed05
|
Jim Regetz
|
awk <- paste(
|
64 |
|
|
"awk -v FIELDWIDTHS='",
|
65 |
|
|
paste(c(11, 4, 2, 4, rep(c(5,1,1,1), times=31)), collapse=" "),
|
66 |
|
|
"' -v OFS=',' '{ $1=$1 \"\"; print }'", sep="")
|
67 |
|
|
tr <- "tr -d ' '"
|
68 |
669c150d
|
Jim Regetz
|
if (is.null(patt)) {
|
69 |
|
|
cmd <- paste(awk, dly, "|", tr)
|
70 |
|
|
} else {
|
71 |
|
|
patt <- shQuote(paste(patt, collapse="\\|"))
|
72 |
|
|
cmd <- paste("grep -e", patt, dly, "|", awk, "|", tr)
|
73 |
|
|
}
|
74 |
|
|
csv <- system(cmd, intern=TRUE)
|
75 |
|
|
if (length(csv)>0) {
|
76 |
|
|
read.csv(textConnection(csv), header=FALSE, colClasses=DLY.COLS)
|
77 |
|
|
} else {
|
78 |
|
|
NULL
|
79 |
|
|
}
|
80 |
2d08ed05
|
Jim Regetz
|
}
|
81 |
|
|
|
82 |
dd85f6d3
|
Jim Regetz
|
dailies <- list.files(pattern="*.dly")
|
83 |
|
|
for (file in dailies) {
|
84 |
669c150d
|
Jim Regetz
|
x <- loadAsCSV(file, c("TMAX", "TMIN"))
|
85 |
|
|
if (is.null(x)) {
|
86 |
|
|
message("no rows imported for ", file)
|
87 |
|
|
} else {
|
88 |
|
|
long <- reshape(x, direction="long",
|
89 |
|
|
varying=matrix(5:ncol(x), nrow=4))
|
90 |
|
|
ghcn_bulk_insert(db, sql, long)
|
91 |
|
|
}
|
92 |
dd85f6d3
|
Jim Regetz
|
}
|