Project

General

Profile

Download (3.52 KB) Statistics
| Branch: | Revision:
1 dd85f6d3 Jim Regetz
# R script for batch parsing and loading GHCN daily station files
2 d4e63f5d Jim Regetz
# (*.dly) into a SQLite database. Script will process all such files in
3
# the current working directory.
4 dd85f6d3 Jim Regetz
#
5
# As written, the script is intended to create and populate the database
6 d4e63f5d Jim Regetz
# from scratch, reporting an error if it already exists. In principle
7
# though, the code that processes and loads a given *.dly file could be
8
# run on its own to load additional data into an already existing
9
# database file.
10
#
11
# At the moment, only TMIN and TMAX are loaded, but that can easily be
12
# changed.
13 dd85f6d3 Jim Regetz
#
14
# Jim Regetz
15
# NCEAS
16
# Created on 09-May-2012
17
18
require(RSQLite)
19
20 d4e63f5d Jim Regetz
#-------------#
21
# "constants" #
22
#-------------#
23
24
# name of target db
25 dd85f6d3 Jim Regetz
db.path <- "ghcn_all.db"
26
27 d4e63f5d Jim Regetz
# variables to keep
28
VARS <- c("TMIN", "TMAX")
29
30
# column characteristics of the *.dly data files
31
DLY.COLS <- c("character", "integer", "integer", "character",
32
    rep(c("numeric", "character", "character", "character"), times=31))
33
NUM.WIDE.COLS <- 4 + 4*31
34
DAYS <- lapply(seq(from=5, to=NUM.WIDE.COLS, by=4), function(i) i:(i+3))
35
36
37
#------------------#
38
# helper functions #
39
#------------------#
40
41
# bulk insert helper function (adapted from RSQLite documentation)
42 dd85f6d3 Jim Regetz
ghcn_bulk_insert <- function(db, sql, dat) {
43
    dbBeginTransaction(db)
44
    dbGetPreparedQuery(db, sql, bind.data = dat)
45
    dbCommit(db)
46
    dbGetQuery(db, "select count(*) from ghcn")[[1]]
47
}
48
49 669c150d Jim Regetz
# shell out to OS to leverage grep/awk/tr for faster initial parsing and
50
# filtering of data; if no data records are read in, this function
51
# returns NULL
52
loadAsCSV <- function(dly, patt=NULL) {
53 2d08ed05 Jim Regetz
    awk <- paste(
54
        "awk -v FIELDWIDTHS='",
55
        paste(c(11, 4, 2, 4, rep(c(5,1,1,1), times=31)), collapse=" "),
56
        "' -v OFS=',' '{ $1=$1 \"\"; print }'", sep="")
57
    tr <- "tr -d ' '"
58 669c150d Jim Regetz
    if (is.null(patt)) {
59
        cmd <- paste(awk, dly, "|", tr)
60
    } else {
61
        patt <- shQuote(paste(patt, collapse="\\|"))
62
        cmd <- paste("grep -e", patt, dly, "|", awk, "|", tr)
63
    }
64
    csv <- system(cmd, intern=TRUE)
65
    if (length(csv)>0) {
66
        read.csv(textConnection(csv), header=FALSE, colClasses=DLY.COLS)
67
    } else {
68
        NULL
69
    }
70 2d08ed05 Jim Regetz
}
71
72 38aa5c07 Jim Regetz
# split data columnwise by day, then recombine into long format; note
73
# that the indexing here is hard-coded to work for the *.dly files, and
74
# simply assumes that they are all consistent
75
wideToLong <- function(dat, days) {
76
    daily.data <- lapply(seq_along(days), function(i) {
77
        dat <- data.frame(dat[1:4], day=i, dat[days[[i]]])
78
        dat$srcrowid <- seq(nrow(dat))
79
        names(dat) <- 1:ncol(dat)
80
        dat
81
        })
82
    do.call("rbind", daily.data)
83
}
84
85 d4e63f5d Jim Regetz
86
#-----------------#
87
# procedural code #
88
#-----------------#
89
90
# establish database connection
91
con <- dbDriver("SQLite")
92
if (file.exists(db.path)) {
93
    stop("database already exists at ", db.path)
94
}
95
db <- dbConnect(con, dbname=db.path)
96
97
# create main ghcn table
98
sql <- "
99
    CREATE TABLE ghcn (
100
        id text,
101
        year int,
102
        month int,
103
        element text,
104
        day int,
105
        value int,
106
        mflag text,
107
        qflag text,
108
        sflag text,
109
        srcrowid int)
110
"
111
dbGetQuery(db, sql)
112
113
# prepare sql insert statement
114
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))),
115
    collapse=", ")
116
sql <- paste("insert into ghcn values (", params.clist, ")", sep="")
117
118
# process and insert daily data
119 dd85f6d3 Jim Regetz
dailies <- list.files(pattern="*.dly")
120
for (file in dailies) {
121 d4e63f5d Jim Regetz
    dly <- loadAsCSV(file, VARS)
122 38aa5c07 Jim Regetz
    if (!is.null(dly)) {
123
        long <- wideToLong(dly, DAYS)
124 669c150d Jim Regetz
        ghcn_bulk_insert(db, sql, long)
125 38aa5c07 Jim Regetz
    } else {
126
        message("no rows imported for ", file)
127 669c150d Jim Regetz
    }
128 dd85f6d3 Jim Regetz
}