Project

General

Profile

Download (3.95 KB) Statistics
| Branch: | Revision:
1
# R script for batch parsing and loading GHCN daily station files
2
# (*.dly) into a SQLite database. Script will process all such files in
3
# the current working directory.
4
#
5
# As written, the script is intended to create and populate the database
6
# from scratch, reporting an error if it already exists. In principle
7
# though, the code that processes and loads a given *.dly file could be
8
# run on its own to load additional data into an already existing
9
# database file.
10
#
11
# At the moment, only TMIN and TMAX are loaded, but that can easily be
12
# changed.
13
#
14
# Jim Regetz
15
# NCEAS
16
# Created on 09-May-2012
17

    
18
require(RSQLite)
19

    
20
#-------------#
21
# "constants" #
22
#-------------#
23

    
24
# name of target db
25
db.path <- "ghcn_all.db"
26

    
27
# variables to keep
28
VARS <- c("TMIN", "TMAX")
29

    
30
# column characteristics of the *.dly data files
31
DLY.COLS <- c("character", "integer", "integer", "character",
32
    rep(c("numeric", "character", "character", "character"), times=31))
33
NUM.WIDE.COLS <- 4 + 4*31
34
DAYS <- lapply(seq(from=5, to=NUM.WIDE.COLS, by=4), function(i) i:(i+3))
35

    
36

    
37
#------------------#
38
# helper functions #
39
#------------------#
40

    
41
# bulk insert helper function (adapted from RSQLite documentation)
42
ghcn_bulk_insert <- function(db, sql, dat) {
43
    dbBeginTransaction(db)
44
    dbGetPreparedQuery(db, sql, bind.data = dat)
45
    dbCommit(db)
46
}
47

    
48
# shell out to OS to leverage grep/awk/tr for faster initial parsing and
49
# filtering of data into a temp file; if filtering yields no data
50
# records, this function returns NULL
51
loadAsCSV <- function(dly, patt=NULL) {
52
    tmpfile <- tempfile()
53
    awk <- paste(
54
        "awk -v FIELDWIDTHS='",
55
        paste(c(11, 4, 2, 4, rep(c(5,1,1,1), times=31)), collapse=" "),
56
        "' -v OFS=',' '{ $1=$1 \"\"; print }'", sep="")
57
    tr <- "tr -d ' '"
58
    if (is.null(patt)) {
59
        cmd <- paste(awk, dly, "|", tr)
60
    } else {
61
        patt <- shQuote(paste(patt, collapse="\\|"))
62
        cmd <- paste("grep -e", patt, dly, "|", awk, "|", tr)
63
    }
64
    cmd <- paste(cmd, tmpfile, sep=" > ")
65
    # execute command and read from tmpfile if successful
66
    if (system(cmd)==0 & 0<file.info(tmpfile)$size) {
67
        out <- read.csv(tmpfile, header=FALSE, colClasses=DLY.COLS)
68
        file.remove(tmpfile)
69
    } else {
70
        out <- NULL
71
    }
72
    return(out)
73
}
74

    
75
# function to convert the wide-form (days across columns) GHCN data into
76
# long form (unique row for each day*element); note that all indexing
77
# here is hard-coded to work for the *.dly files, and simply assumes
78
# that they are all consistent
79
wideToLong <- function(dat, days) {
80
    # convert id vars to long form, relying on R to recycle the first
81
    # four to match the length of the fifth (slightly faster than doing
82
    # this manually)
83
    out <- data.frame(
84
        dat[1:4],
85
        V5=rep(1:31, each=nrow(dat))
86
    )
87
    # now combine and fill in the daily values/flags
88
    for (i in 1:4) {
89
        cols <- sapply(days, function(x) x[[i]])
90
        out[[5+i]] <- as.vector(as.matrix(dat[, cols]))
91
    }
92
    # add original row id
93
    out$id <- 1:nrow(dat)
94
    out
95
}
96

    
97

    
98
#-----------------#
99
# procedural code #
100
#-----------------#
101

    
102
# establish database connection
103
con <- dbDriver("SQLite")
104
if (file.exists(db.path)) {
105
    stop("database already exists at ", db.path)
106
}
107
db <- dbConnect(con, dbname=db.path)
108

    
109
# create main ghcn table
110
sql <- "
111
    CREATE TABLE ghcn (
112
        id text,
113
        year int,
114
        month int,
115
        element text,
116
        day int,
117
        value int,
118
        mflag text,
119
        qflag text,
120
        sflag text,
121
        srcrowid int)
122
"
123
dbGetQuery(db, sql)
124

    
125
# prepare sql insert statement
126
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))),
127
    collapse=", ")
128
sql <- paste("insert into ghcn values (", params.clist, ")", sep="")
129

    
130
# process and insert daily data
131
dailies <- list.files(pattern="*.dly")
132
for (file in dailies) {
133
    dly <- loadAsCSV(file, VARS)
134
    if (!is.null(dly)) {
135
        long <- wideToLong(dly, DAYS)
136
        ghcn_bulk_insert(db, sql, long)
137
    } else {
138
        message("no rows imported for ", file)
139
    }
140
}
(3-3/3)