Project

General

Profile

Download (3.47 KB) Statistics
| Branch: | Revision:
1
# R script for batch parsing and loading GHCN daily station files
2
# (*.dly) into a SQLite database. Script will process all such files in
3
# the current working directory.
4
#
5
# As written, the script is intended to create and populate the database
6
# from scratch, reporting an error if it already exists. In principle
7
# though, the code that processes and loads a given *.dly file could be
8
# run on its own to load additional data into an already existing
9
# database file.
10
#
11
# At the moment, only TMIN and TMAX are loaded, but that can easily be
12
# changed.
13
#
14
# Jim Regetz
15
# NCEAS
16
# Created on 09-May-2012
17

    
18
require(RSQLite)
19

    
20
#-------------#
21
# "constants" #
22
#-------------#
23

    
24
# name of target db
25
db.path <- "ghcn_all.db"
26

    
27
# variables to keep
28
VARS <- c("TMIN", "TMAX")
29

    
30
# column characteristics of the *.dly data files
31
DLY.COLS <- c("character", "integer", "integer", "character",
32
    rep(c("numeric", "character", "character", "character"), times=31))
33
NUM.WIDE.COLS <- 4 + 4*31
34
DAYS <- lapply(seq(from=5, to=NUM.WIDE.COLS, by=4), function(i) i:(i+3))
35

    
36

    
37
#------------------#
38
# helper functions #
39
#------------------#
40

    
41
# bulk insert helper function (adapted from RSQLite documentation)
42
ghcn_bulk_insert <- function(db, sql, dat) {
43
    dbBeginTransaction(db)
44
    dbGetPreparedQuery(db, sql, bind.data = dat)
45
    dbCommit(db)
46
}
47

    
48
# shell out to OS to leverage grep/awk/tr for faster initial parsing and
49
# filtering of data; if no data records are read in, this function
50
# returns NULL
51
loadAsCSV <- function(dly, patt=NULL) {
52
    awk <- paste(
53
        "awk -v FIELDWIDTHS='",
54
        paste(c(11, 4, 2, 4, rep(c(5,1,1,1), times=31)), collapse=" "),
55
        "' -v OFS=',' '{ $1=$1 \"\"; print }'", sep="")
56
    tr <- "tr -d ' '"
57
    if (is.null(patt)) {
58
        cmd <- paste(awk, dly, "|", tr)
59
    } else {
60
        patt <- shQuote(paste(patt, collapse="\\|"))
61
        cmd <- paste("grep -e", patt, dly, "|", awk, "|", tr)
62
    }
63
    csv <- system(cmd, intern=TRUE)
64
    if (length(csv)>0) {
65
        read.csv(textConnection(csv), header=FALSE, colClasses=DLY.COLS)
66
    } else {
67
        NULL
68
    }
69
}
70

    
71
# split data columnwise by day, then recombine into long format; note
72
# that the indexing here is hard-coded to work for the *.dly files, and
73
# simply assumes that they are all consistent
74
wideToLong <- function(dat, days) {
75
    daily.data <- lapply(seq_along(days), function(i) {
76
        dat <- data.frame(dat[1:4], day=i, dat[days[[i]]])
77
        dat$srcrowid <- seq(nrow(dat))
78
        names(dat) <- 1:ncol(dat)
79
        dat
80
        })
81
    do.call("rbind", daily.data)
82
}
83

    
84

    
85
#-----------------#
86
# procedural code #
87
#-----------------#
88

    
89
# establish database connection
90
con <- dbDriver("SQLite")
91
if (file.exists(db.path)) {
92
    stop("database already exists at ", db.path)
93
}
94
db <- dbConnect(con, dbname=db.path)
95

    
96
# create main ghcn table
97
sql <- "
98
    CREATE TABLE ghcn (
99
        id text,
100
        year int,
101
        month int,
102
        element text,
103
        day int,
104
        value int,
105
        mflag text,
106
        qflag text,
107
        sflag text,
108
        srcrowid int)
109
"
110
dbGetQuery(db, sql)
111

    
112
# prepare sql insert statement
113
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))),
114
    collapse=", ")
115
sql <- paste("insert into ghcn values (", params.clist, ")", sep="")
116

    
117
# process and insert daily data
118
dailies <- list.files(pattern="*.dly")
119
for (file in dailies) {
120
    dly <- loadAsCSV(file, VARS)
121
    if (!is.null(dly)) {
122
        long <- wideToLong(dly, DAYS)
123
        ghcn_bulk_insert(db, sql, long)
124
    } else {
125
        message("no rows imported for ", file)
126
    }
127
}
(2-2/2)