Project

General

Profile

Download (2.13 KB) Statistics
| Branch: | Revision:
1
# R script for batch parsing and loading GHCN daily station files
2
# (*.dly) into a SQLite database
3
#
4
# As written, the script is intended to create and populate the database
5
# from scratch, reporting an error if the script already exists. In
6
# principle though, the code that processes and loads a given *.dly file
7
# could be run on its own to load additional data into an already
8
# existing database file.
9
#
10
# Jim Regetz
11
# NCEAS
12
# Created on 09-May-2012
13

    
14
require(RSQLite)
15

    
16
db.path <- "ghcn_all.db"
17

    
18
# define bulk insert helper function (adapted from RSQLite
19
# documentation)
20
ghcn_bulk_insert <- function(db, sql, dat) {
21
    dbBeginTransaction(db)
22
    dbGetPreparedQuery(db, sql, bind.data = dat)
23
    dbCommit(db)
24
    dbGetQuery(db, "select count(*) from ghcn")[[1]]
25
}
26

    
27
# establish database connection
28
con <- dbDriver("SQLite")
29
if (file.exists(db.path)) {
30
    stop("database already exists at ", db.path)
31
}
32
db <- dbConnect(con, dbname=db.path)
33

    
34
# create main ghcn table
35
sql <- "
36
    CREATE TABLE ghcn (
37
        id text,
38
        year int,
39
        month int,
40
        element text,
41
        day int,
42
        value int,
43
        mflag text,
44
        qflag text,
45
        sflag text,
46
        srcrowid int)
47
"
48
dbGetQuery(db, sql)
49

    
50
# prepare sql insert statement
51
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))),
52
    collapse=", ")
53
sql <- paste("insert into ghcn values (", params.clist, ")", sep="")
54

    
55
# process and insert daily data
56
DLY.COLS <- c("character", "integer", "integer", "character",
57
    rep(c("numeric", "character", "character", "character"), times=31))
58

    
59
loadAsCSV <- function(dly) {
60
    awk <- paste(
61
        "awk -v FIELDWIDTHS='",
62
        paste(c(11, 4, 2, 4, rep(c(5,1,1,1), times=31)), collapse=" "),
63
        "' -v OFS=',' '{ $1=$1 \"\"; print }'", sep="")
64
    tr <- "tr -d ' '"
65
    system(paste(awk, dly, "|", tr), intern=TRUE)
66
    read.csv(textConnection(system(paste(awk, dly, "|", tr),
67
        intern=TRUE)), header=FALSE, colClasses=DLY.COLS)
68
}
69

    
70
dailies <- list.files(pattern="*.dly")
71
for (file in dailies) {
72
    x <- loadAsCSV(file)
73
    long <- reshape(x, direction="long",
74
        varying=matrix(5:ncol(x), nrow=4))
75
    ghcn_bulk_insert(db, sql, long)
76
}
(2-2/2)