1
|
# R script for batch parsing and loading GHCN daily station files
|
2
|
# (*.dly) into a SQLite database. Script will process all such files in
|
3
|
# the current working directory.
|
4
|
#
|
5
|
# As written, the script is intended to create and populate the database
|
6
|
# from scratch, reporting an error if it already exists. In principle
|
7
|
# though, the code that processes and loads a given *.dly file could be
|
8
|
# run on its own to load additional data into an already existing
|
9
|
# database file.
|
10
|
#
|
11
|
# At the moment, only TMIN and TMAX are loaded, but that can easily be
|
12
|
# changed.
|
13
|
#
|
14
|
# Jim Regetz
|
15
|
# NCEAS
|
16
|
# Created on 09-May-2012
|
17
|
|
18
|
require(RSQLite)
|
19
|
|
20
|
#-------------#
|
21
|
# "constants" #
|
22
|
#-------------#
|
23
|
|
24
|
# name of target db
|
25
|
db.path <- "ghcn_all.db"
|
26
|
|
27
|
# variables to keep
|
28
|
VARS <- c("TMIN", "TMAX")
|
29
|
|
30
|
# column characteristics of the *.dly data files
|
31
|
DLY.COLS <- c("character", "integer", "integer", "character",
|
32
|
rep(c("numeric", "character", "character", "character"), times=31))
|
33
|
NUM.WIDE.COLS <- 4 + 4*31
|
34
|
DAYS <- lapply(seq(from=5, to=NUM.WIDE.COLS, by=4), function(i) i:(i+3))
|
35
|
|
36
|
|
37
|
#------------------#
|
38
|
# helper functions #
|
39
|
#------------------#
|
40
|
|
41
|
# bulk insert helper function (adapted from RSQLite documentation)
|
42
|
ghcn_bulk_insert <- function(db, sql, dat) {
|
43
|
dbBeginTransaction(db)
|
44
|
dbGetPreparedQuery(db, sql, bind.data = dat)
|
45
|
dbCommit(db)
|
46
|
}
|
47
|
|
48
|
# shell out to OS to leverage grep/awk/tr for faster initial parsing and
|
49
|
# filtering of data into a temp file; if filtering yields no data
|
50
|
# records, this function returns NULL
|
51
|
loadAsCSV <- function(dly, patt=NULL) {
|
52
|
tmpfile <- tempfile()
|
53
|
awk <- paste(
|
54
|
"awk -v FIELDWIDTHS='",
|
55
|
paste(c(11, 4, 2, 4, rep(c(5,1,1,1), times=31)), collapse=" "),
|
56
|
"' -v OFS=',' '{ $1=$1 \"\"; print }'", sep="")
|
57
|
tr <- "tr -d ' '"
|
58
|
if (is.null(patt)) {
|
59
|
cmd <- paste(awk, dly, "|", tr)
|
60
|
} else {
|
61
|
patt <- shQuote(paste(patt, collapse="\\|"))
|
62
|
cmd <- paste("grep -e", patt, dly, "|", awk, "|", tr)
|
63
|
}
|
64
|
cmd <- paste(cmd, tmpfile, sep=" > ")
|
65
|
# execute command and read from tmpfile if successful
|
66
|
if (system(cmd)==0 & 0<file.info(tmpfile)$size) {
|
67
|
out <- read.csv(tmpfile, header=FALSE, colClasses=DLY.COLS)
|
68
|
file.remove(tmpfile)
|
69
|
} else {
|
70
|
out <- NULL
|
71
|
}
|
72
|
return(out)
|
73
|
}
|
74
|
|
75
|
# function to convert the wide-form (days across columns) GHCN data into
|
76
|
# long form (unique row for each day*element); note that all indexing
|
77
|
# here is hard-coded to work for the *.dly files, and simply assumes
|
78
|
# that they are all consistent
|
79
|
wideToLong <- function(dat, days) {
|
80
|
# convert id vars to long form, relying on R to recycle the first
|
81
|
# four to match the length of the fifth (slightly faster than doing
|
82
|
# this manually)
|
83
|
out <- data.frame(
|
84
|
dat[1:4],
|
85
|
V5=rep(1:31, each=nrow(dat))
|
86
|
)
|
87
|
# now combine and fill in the daily values/flags
|
88
|
for (i in 1:4) {
|
89
|
cols <- sapply(days, function(x) x[[i]])
|
90
|
out[[5+i]] <- as.vector(as.matrix(dat[, cols]))
|
91
|
}
|
92
|
# add original row id
|
93
|
out$id <- 1:nrow(dat)
|
94
|
out
|
95
|
}
|
96
|
|
97
|
|
98
|
#-----------------#
|
99
|
# procedural code #
|
100
|
#-----------------#
|
101
|
|
102
|
# establish database connection
|
103
|
con <- dbDriver("SQLite")
|
104
|
if (file.exists(db.path)) {
|
105
|
stop("database already exists at ", db.path)
|
106
|
}
|
107
|
db <- dbConnect(con, dbname=db.path)
|
108
|
|
109
|
# create main ghcn table
|
110
|
sql <- "
|
111
|
CREATE TABLE ghcn (
|
112
|
id text,
|
113
|
year int,
|
114
|
month int,
|
115
|
element text,
|
116
|
day int,
|
117
|
value int,
|
118
|
mflag text,
|
119
|
qflag text,
|
120
|
sflag text,
|
121
|
srcrowid int)
|
122
|
"
|
123
|
dbGetQuery(db, sql)
|
124
|
|
125
|
# prepare sql insert statement
|
126
|
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))),
|
127
|
collapse=", ")
|
128
|
sql <- paste("insert into ghcn values (", params.clist, ")", sep="")
|
129
|
|
130
|
# process and insert daily data
|
131
|
dailies <- list.files(pattern="*.dly")
|
132
|
for (file in dailies) {
|
133
|
dly <- loadAsCSV(file, VARS)
|
134
|
if (!is.null(dly)) {
|
135
|
long <- wideToLong(dly, DAYS)
|
136
|
ghcn_bulk_insert(db, sql, long)
|
137
|
} else {
|
138
|
message("no rows imported for ", file)
|
139
|
}
|
140
|
}
|