/climate/extra/ghcn-to-sqlite.R - Environment and organisms - NCEAS Projects

root/climate/extra/ghcn-to-sqlite.R @ 38aa5c07

       # R script for batch parsing and loading GHCN daily station files
       # (*.dly) into a SQLite database
       #
       # As written, the script is intended to create and populate the database
       # from scratch, reporting an error if the database already exists. In
       # principle though, the code that processes and loads a given *.dly file
       # could be run on its own to load additional data into an already
       # existing database file.
       #
       # Jim Regetz
       # NCEAS
       # Created on 09-May-2012
       # NOTE(review): require() only warns and returns FALSE when the package
       # is missing, so a missing RSQLite would surface later as a less obvious
       # error; library(RSQLite) would stop here instead
       require(RSQLite)
       # path of the SQLite database file; the script stops if a file already
       # exists at this path and otherwise connects to it as the database
       db.path <- "ghcn_all.db"
       # Bulk insert helper (adapted from the RSQLite documentation).
       #
       # Binds the rows of data frame `dat` to the parameterized statement
       # `sql` and executes it on connection `db` inside a single transaction.
       # If the insert or the commit fails, the transaction is rolled back so
       # that the connection is not left with a transaction open, and the
       # original error is re-raised.
       #
       # Returns the first column of "select count(*) from ghcn", i.e. the
       # number of rows in the ghcn table once the insert has been committed.
       #
       # NOTE(review): dbBeginTransaction()/dbGetPreparedQuery() belong to the
       # RSQLite API this script was written against -- confirm they are
       # still provided by the installed RSQLite version
       ghcn_bulk_insert <- function(db, sql, dat) {
           dbBeginTransaction(db)
           tryCatch({
               dbGetPreparedQuery(db, sql, bind.data = dat)
               dbCommit(db)
           }, error = function(e) {
               # a failing rollback must not mask the error that caused it
               try(dbRollback(db), silent = TRUE)
               stop(e)
           })
           dbGetQuery(db, "select count(*) from ghcn")[[1]]
       }
       # establish database connection; `con` holds the SQLite driver object
       # and `db` the connection used by the rest of the script. The script
       # refuses to run against an existing file because it is written to
       # build the database from scratch.
       con <- dbDriver("SQLite")
       if (file.exists(db.path)) {
           stop("database already exists at ", db.path)
       }
       db <- dbConnect(con, dbname=db.path)
       # create main ghcn table; it holds the data in long format (one row per
       # id/year/month/element/day), with the value and its three flag
       # columns, plus srcrowid
       sql <- "
           CREATE TABLE ghcn (
               id text,
               year int,
               month int,
               element text,
               day int,
               value int,
               mflag text,
               qflag text,
               sflag text,
               srcrowid int)
       "
       dbGetQuery(db, sql)
       # build the parameterized insert statement: one "?" placeholder per
       # column of the ghcn table, separated by ", "
       params.clist <- toString(rep("?", times=length(dbListFields(db, "ghcn"))))
       sql <- paste0("insert into ghcn values (", params.clist, ")")
       # process and insert daily data
       #
       # column classes used when reading the parsed *.dly records: four
       # leading columns, then 31 repeats of one numeric column followed by
       # three character columns
       DLY.COLS <- c(
           c("character", "integer", "integer", "character"),
           rep.int(c("numeric", "character", "character", "character"), 31L)
       )
       # Shell out to the OS to leverage grep/awk/tr for faster initial
       # parsing and filtering of the data.
       #
       # Arguments:
       #   dly   path of the fixed-width *.dly file to read
       #   patt  optional character vector of patterns; when given, only lines
       #         matching at least one of them (grep, joined with "\|") are
       #         kept
       #
       # The file is split into comma-separated fields by awk using the
       # FIELDWIDTHS variable (a GNU awk feature), all spaces are stripped
       # with tr, and the result is parsed with read.csv() using the column
       # classes in DLY.COLS.
       #
       # Returns a data frame, or NULL if no data records are read in.
       loadAsCSV <- function(dly, patt=NULL) {
           widths <- c(11, 4, 2, 4, rep(c(5,1,1,1), times=31))
           awk <- paste(
               "awk -v FIELDWIDTHS='",
               paste(widths, collapse=" "),
               "' -v OFS=',' '{ $1=$1 \"\"; print }'", sep="")
           tr <- "tr -d ' '"
           # quote the file name so that spaces or shell metacharacters in it
           # cannot break (or inject into) the command line
           dly <- shQuote(dly)
           if (is.null(patt)) {
               cmd <- paste(awk, dly, "|", tr)
           } else {
               patt <- shQuote(paste(patt, collapse="\\|"))
               cmd <- paste("grep -e", patt, dly, "|", awk, "|", tr)
           }
           csv <- system(cmd, intern=TRUE)
           if (length(csv) == 0) {
               return(NULL)
           }
           # read.csv() does not close a connection that is already open, so
           # close it explicitly rather than leaking one per file
           tc <- textConnection(csv)
           on.exit(close(tc))
           read.csv(tc, header=FALSE, colClasses=DLY.COLS)
       }
       # Split the data columnwise by day, then recombine into long format.
       #
       # Arguments:
       #   dat   data frame whose first four columns are repeated for every
       #         day and whose remaining columns hold the per-day groups
       #   days  list with one vector of column positions per day, selecting
       #         that day's columns from `dat`
       #
       # Returns a data frame with, in order: the four leading columns, the
       # day (position of the group within `days`), that day's columns, and
       # srcrowid (row number of the record within `dat`). Columns are named
       # "1", "2", ... by position. A zero-row `dat` gives a zero-row result.
       #
       # Note that the indexing here is hard-coded to work for the *.dly
       # files, and simply assumes that they are all consistent.
       wideToLong <- function(dat, days) {
           n <- nrow(dat)
           src.rows <- seq_len(n)
           daily.data <- lapply(seq_along(days), function(i) {
               # rep.int() keeps the day column as long as the other columns,
               # so a data frame without rows does not raise an error
               out <- data.frame(dat[1:4], day=rep.int(i, n), dat[days[[i]]],
                   srcrowid=src.rows)
               names(out) <- seq_len(ncol(out))
               out
           })
           do.call("rbind", daily.data)
       }
       # a wide record has 4 leading columns followed by 31 groups of 4
       # columns; DAYS lists the column positions of each of those groups
       NUM.WIDE.COLS <- 4 + 4*31
       DAYS <- lapply(seq(from=5, to=NUM.WIDE.COLS, by=4), function(i) i:(i+3))
       # list.files() takes a regular expression rather than a shell glob, so
       # match a literal ".dly" at the end of the file name
       dailies <- list.files(pattern="\\.dly$")
       # load each file in the working directory, keeping only the TMAX and
       # TMIN records, and append its rows to the ghcn table
       for (file in dailies) {
           dly <- loadAsCSV(file, c("TMAX", "TMIN"))
           if (!is.null(dly)) {
               long <- wideToLong(dly, DAYS)
               ghcn_bulk_insert(db, sql, long)
           } else {
               message("no rows imported for ", file)
           }
       }

« Previous
1
2
Next »

(2-2/2)

Project

General

Profile