Project

General

Profile

Download (3.11 KB) Statistics
| Branch: | Revision:
1
# R script for batch parsing and loading GHCN daily station files
2
# (*.dly) into a SQLite database
3
#
4
# As written, the script is intended to create and populate the database
5
# from scratch, reporting an error if the script already exists. In
6
# principle though, the code that processes and loads a given *.dly file
7
# could be run on its own to load additional data into an already
8
# existing database file.
9
#
10
# Jim Regetz
11
# NCEAS
12
# Created on 09-May-2012
13

    
14
require(RSQLite)
15

    
16
db.path <- "ghcn_all.db"
17

    
18
# define bulk insert helper function (adapted from RSQLite
19
# documentation)
20
ghcn_bulk_insert <- function(db, sql, dat) {
21
    dbBeginTransaction(db)
22
    dbGetPreparedQuery(db, sql, bind.data = dat)
23
    dbCommit(db)
24
    dbGetQuery(db, "select count(*) from ghcn")[[1]]
25
}
26

    
27
# establish database connection
28
con <- dbDriver("SQLite")
29
if (file.exists(db.path)) {
30
    stop("database already exists at ", db.path)
31
}
32
db <- dbConnect(con, dbname=db.path)
33

    
34
# create main ghcn table
35
sql <- "
36
    CREATE TABLE ghcn (
37
        id text,
38
        year int,
39
        month int,
40
        element text,
41
        day int,
42
        value int,
43
        mflag text,
44
        qflag text,
45
        sflag text,
46
        srcrowid int)
47
"
48
dbGetQuery(db, sql)
49

    
50
# prepare sql insert statement
51
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))),
52
    collapse=", ")
53
sql <- paste("insert into ghcn values (", params.clist, ")", sep="")
54

    
55
# process and insert daily data
56
DLY.COLS <- c("character", "integer", "integer", "character",
57
    rep(c("numeric", "character", "character", "character"), times=31))
58

    
59
# shell out to OS to leverage grep/awk/tr for faster initial parsing and
60
# filtering of data; if no data records are read in, this function
61
# returns NULL
62
loadAsCSV <- function(dly, patt=NULL) {
63
    awk <- paste(
64
        "awk -v FIELDWIDTHS='",
65
        paste(c(11, 4, 2, 4, rep(c(5,1,1,1), times=31)), collapse=" "),
66
        "' -v OFS=',' '{ $1=$1 \"\"; print }'", sep="")
67
    tr <- "tr -d ' '"
68
    if (is.null(patt)) {
69
        cmd <- paste(awk, dly, "|", tr)
70
    } else {
71
        patt <- shQuote(paste(patt, collapse="\\|"))
72
        cmd <- paste("grep -e", patt, dly, "|", awk, "|", tr)
73
    }
74
    csv <- system(cmd, intern=TRUE)
75
    if (length(csv)>0) {
76
        read.csv(textConnection(csv), header=FALSE, colClasses=DLY.COLS)
77
    } else {
78
        NULL
79
    }
80
}
81

    
82
# split data columnwise by day, then recombine into long format; note
83
# that the indexing here is hard-coded to work for the *.dly files, and
84
# simply assumes that they are all consistent
85
wideToLong <- function(dat, days) {
86
    daily.data <- lapply(seq_along(days), function(i) {
87
        dat <- data.frame(dat[1:4], day=i, dat[days[[i]]])
88
        dat$srcrowid <- seq(nrow(dat))
89
        names(dat) <- 1:ncol(dat)
90
        dat
91
        })
92
    do.call("rbind", daily.data)
93
}
94

    
95
NUM.WIDE.COLS <- 4 + 4*31
96
DAYS <- lapply(seq(from=5, to=NUM.WIDE.COLS, by=4), function(i) i:(i+3))
97
dailies <- list.files(pattern="*.dly")
98
for (file in dailies) {
99
    dly <- loadAsCSV(file, c("TMAX", "TMIN"))
100
    if (!is.null(dly)) {
101
        long <- wideToLong(dly, DAYS)
102
        ghcn_bulk_insert(db, sql, long)
103
    } else {
104
        message("no rows imported for ", file)
105
    }
106
}
(2-2/2)