1
|
# R script for batch parsing and loading GHCN daily station files
|
2
|
# (*.dly) into a SQLite database
|
3
|
#
|
4
|
# As written, the script is intended to create and populate the database
|
5
|
# from scratch, reporting an error if the script already exists. In
|
6
|
# principle though, the code that processes and loads a given *.dly file
|
7
|
# could be run on its own to load additional data into an already
|
8
|
# existing database file.
|
9
|
#
|
10
|
# Jim Regetz
|
11
|
# NCEAS
|
12
|
# Created on 09-May-2012
|
13
|
|
14
|
require(RSQLite)
|
15
|
|
16
|
db.path <- "ghcn_all.db"
|
17
|
|
18
|
# define bulk insert helper function (adapted from RSQLite
|
19
|
# documentation)
|
20
|
ghcn_bulk_insert <- function(db, sql, dat) {
|
21
|
dbBeginTransaction(db)
|
22
|
dbGetPreparedQuery(db, sql, bind.data = dat)
|
23
|
dbCommit(db)
|
24
|
dbGetQuery(db, "select count(*) from ghcn")[[1]]
|
25
|
}
|
26
|
|
27
|
# establish database connection
|
28
|
con <- dbDriver("SQLite")
|
29
|
if (file.exists(db.path)) {
|
30
|
stop("database already exists at ", db.path)
|
31
|
}
|
32
|
db <- dbConnect(con, dbname=db.path)
|
33
|
|
34
|
# create main ghcn table
|
35
|
sql <- "
|
36
|
CREATE TABLE ghcn (
|
37
|
id text,
|
38
|
year int,
|
39
|
month int,
|
40
|
element text,
|
41
|
day int,
|
42
|
value int,
|
43
|
mflag text,
|
44
|
qflag text,
|
45
|
sflag text,
|
46
|
srcrowid int)
|
47
|
"
|
48
|
dbGetQuery(db, sql)
|
49
|
|
50
|
# prepare sql insert statement
|
51
|
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))),
|
52
|
collapse=", ")
|
53
|
sql <- paste("insert into ghcn values (", params.clist, ")", sep="")
|
54
|
|
55
|
# process and insert daily data
|
56
|
DLY.COLS <- c("character", "integer", "integer", "character",
|
57
|
rep(c("numeric", "character", "character", "character"), times=31))
|
58
|
|
59
|
# shell out to OS to leverage grep/awk/tr for faster initial parsing and
|
60
|
# filtering of data; if no data records are read in, this function
|
61
|
# returns NULL
|
62
|
loadAsCSV <- function(dly, patt=NULL) {
|
63
|
awk <- paste(
|
64
|
"awk -v FIELDWIDTHS='",
|
65
|
paste(c(11, 4, 2, 4, rep(c(5,1,1,1), times=31)), collapse=" "),
|
66
|
"' -v OFS=',' '{ $1=$1 \"\"; print }'", sep="")
|
67
|
tr <- "tr -d ' '"
|
68
|
if (is.null(patt)) {
|
69
|
cmd <- paste(awk, dly, "|", tr)
|
70
|
} else {
|
71
|
patt <- shQuote(paste(patt, collapse="\\|"))
|
72
|
cmd <- paste("grep -e", patt, dly, "|", awk, "|", tr)
|
73
|
}
|
74
|
csv <- system(cmd, intern=TRUE)
|
75
|
if (length(csv)>0) {
|
76
|
read.csv(textConnection(csv), header=FALSE, colClasses=DLY.COLS)
|
77
|
} else {
|
78
|
NULL
|
79
|
}
|
80
|
}
|
81
|
|
82
|
dailies <- list.files(pattern="*.dly")
|
83
|
for (file in dailies) {
|
84
|
x <- loadAsCSV(file, c("TMAX", "TMIN"))
|
85
|
if (is.null(x)) {
|
86
|
message("no rows imported for ", file)
|
87
|
} else {
|
88
|
long <- reshape(x, direction="long",
|
89
|
varying=matrix(5:ncol(x), nrow=4))
|
90
|
ghcn_bulk_insert(db, sql, long)
|
91
|
}
|
92
|
}
|