/climate/extra/ghcn-to-sqlite.R - Annotate - Environment and organisms - NCEAS Projects

root/climate/extra/ghcn-to-sqlite.R @ d4e63f5d

# R script for batch parsing and loading GHCN daily station files
# (*.dly) into a SQLite database. Script will process all such files in
# the current working directory.
#
# As written, the script is intended to create and populate the database
# from scratch, reporting an error if it already exists. In principle
# though, the code that processes and loads a given *.dly file could be
# run on its own to load additional data into an already existing
# database file.
#
# At the moment, only TMIN and TMAX are loaded, but that can easily be
# changed.
#
# Jim Regetz
# NCEAS
# Created on 09-May-2012
 require(RSQLite)
#-------------#
# "constants" #
#-------------#

# name of target db (a relative path, so it is resolved against the
# current working directory)
db.path <- "ghcn_all.db"

# variables to keep
VARS <- c("TMIN", "TMAX")

# column characteristics of the *.dly data files: four leading columns
# followed by 31 repeats of a (numeric, character, character, character)
# group, i.e. one group of four columns per day of the month
DLY.COLS <- c("character", "integer", "integer", "character",
    rep(c("numeric", "character", "character", "character"), times=31))

# total number of columns in the wide layout (4 leading + 4 per day)
NUM.WIDE.COLS <- 4 + 4*31

# list of 31 integer vectors; element i holds the four column indices
# belonging to day i (5:8 for day 1, 9:12 for day 2, ...)
DAYS <- lapply(seq(from=5, to=NUM.WIDE.COLS, by=4), function(i) i:(i+3))
 #------------------#
 # helper functions #
 #------------------#
 # bulk insert helper function (adapted from RSQLite documentation)
# Insert the rows of a data frame into the ghcn table inside a single
# transaction.
#
# Args:
#   db:  open database connection
#   sql: insert statement containing the parameter placeholders
#   dat: data frame supplying the values bound to those placeholders
#
# Returns the total number of rows in the ghcn table after the insert.
# If the insert or the commit fails, the transaction is rolled back so
# the connection is not left mid-transaction, and the original error is
# re-raised.
ghcn_bulk_insert <- function(db, sql, dat) {
    dbBeginTransaction(db)
    tryCatch({
        dbGetPreparedQuery(db, sql, bind.data = dat)
        dbCommit(db)
    }, error = function(e) {
        dbRollback(db)
        stop(e)
    })
    dbGetQuery(db, "select count(*) from ghcn")[[1]]
}
# shell out to OS to leverage grep/awk/tr for faster initial parsing and
# filtering of data; if no data records are read in, this function
# returns NULL
#
# Args:
#   dly:  path to a single *.dly file
#   patt: optional character vector of strings; when given, only lines
#         matched by grep against at least one of them are parsed.
#         NULL (the default) parses every line.
#
# Returns a data frame read with column classes DLY.COLS, or NULL when
# the shell pipeline produces no output lines.
#
# NOTE(review): FIELDWIDTHS is a GNU awk feature, so this presumably
# requires `awk` to resolve to gawk -- confirm on the target system.
loadAsCSV <- function(dly, patt=NULL) {
    # split each fixed-width record into fields and re-emit it comma
    # separated; assigning to $1 makes awk rebuild the record with OFS
    awk <- paste(
        "awk -v FIELDWIDTHS='",
        paste(c(11, 4, 2, 4, rep(c(5,1,1,1), times=31)), collapse=" "),
        "' -v OFS=',' '{ $1=$1 \"\"; print }'", sep="")
    # strip the space padding left over from the fixed-width fields
    tr <- "tr -d ' '"
    # quote the file name so that paths containing spaces or shell
    # metacharacters reach grep/awk as a single argument
    dly <- shQuote(dly)
    if (is.null(patt)) {
        cmd <- paste(awk, dly, "|", tr)
    } else {
        patt <- shQuote(paste(patt, collapse="\\|"))
        cmd <- paste("grep -e", patt, dly, "|", awk, "|", tr)
    }
    csv <- system(cmd, intern=TRUE)
    if (length(csv)>0) {
        # close the text connection explicitly; read.csv leaves an
        # already-open connection open
        conn <- textConnection(csv)
        on.exit(close(conn), add=TRUE)
        read.csv(conn, header=FALSE, colClasses=DLY.COLS)
    } else {
        NULL
    }
}
# split data columnwise by day, then recombine into long format; note
# that the indexing here is hard-coded to work for the *.dly files, and
# simply assumes that they are all consistent
#
# Args:
#   dat:  wide data frame; its first four columns are repeated on every
#         output row
#   days: list with one element per day, each holding the column
#         indices of dat that contain that day's data
#
# Returns a data frame with nrow(dat) * length(days) rows whose columns
# are named "1", "2", ...: the four leading columns, the day number, that
# day's columns, and srcrowid (the row's position within dat).
wideToLong <- function(dat, days) {
    daily.data <- lapply(seq_along(days), function(i) {
        day.dat <- data.frame(dat[1:4], day=i, dat[days[[i]]])
        day.dat$srcrowid <- seq_len(nrow(day.dat))
        # positional names let rbind stack the per-day frames even though
        # their source column names differ from day to day
        names(day.dat) <- seq_len(ncol(day.dat))
        day.dat
    })
    do.call("rbind", daily.data)
}
#-----------------#
# procedural code #
#-----------------#

# establish database connection, refusing to touch an existing file so
# that the database is always built from scratch
con <- dbDriver("SQLite")
if (file.exists(db.path)) {
    stop("database already exists at ", db.path)
}
db <- dbConnect(con, dbname=db.path)

# create main ghcn table
sql <- "
    CREATE TABLE ghcn (
        id text,
        year int,
        month int,
        element text,
        day int,
        value int,
        mflag text,
        qflag text,
        sflag text,
        srcrowid int)
"
dbGetQuery(db, sql)

# prepare sql insert statement with one "?" placeholder per table column
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))),
    collapse=", ")
sql <- paste("insert into ghcn values (", params.clist, ")", sep="")

# process and insert daily data; list.files() takes a regular
# expression rather than a glob, so match the ".dly" extension explicitly
dailies <- list.files(pattern="\\.dly$")
for (file in dailies) {
    dly <- loadAsCSV(file, VARS)
    if (!is.null(dly)) {
        long <- wideToLong(dly, DAYS)
        ghcn_bulk_insert(db, sql, long)
    } else {
        message("no rows imported for ", file)
    }
}

# release the database connection once all files have been processed
dbDisconnect(db)

Project

General

Profile