Project

General

Profile

« Previous | Next » 

Revision d4e63f5d

Added by Jim Regetz over 12 years ago

  • ID d4e63f5df88078f46ea75938d5ed69ecdd060dd8

superficial code reformatting

View differences:

climate/extra/ghcn-to-sqlite.R
1 1
# R script for batch parsing and loading GHCN daily station files
2
# (*.dly) into a SQLite database
2
# (*.dly) into a SQLite database. Script will process all such files in
3
# the current working directory.
3 4
#
4 5
# As written, the script is intended to create and populate the database
5
# from scratch, reporting an error if the script already exists. In
6
# principle though, the code that processes and loads a given *.dly file
7
# could be run on its own to load additional data into an already
8
# existing database file.
6
# from scratch, reporting an error if it already exists. In principle
7
# though, the code that processes and loads a given *.dly file could be
8
# run on its own to load additional data into an already existing
9
# database file.
10
#
11
# At the moment, only TMIN and TMAX are loaded, but that can easily be
12
# changed.
9 13
#
10 14
# Jim Regetz
11 15
# NCEAS
......
13 17

  
14 18
require(RSQLite)
15 19

  
20
#-------------#
21
# "constants" #
22
#-------------#
23

  
24
# name of target db
16 25
db.path <- "ghcn_all.db"
17 26

  
18
# define bulk insert helper function (adapted from RSQLite
19
# documentation)
27
# variables to keep
28
VARS <- c("TMIN", "TMAX")
29

  
30
# column characteristics of the *.dly data files
31
DLY.COLS <- c("character", "integer", "integer", "character",
32
    rep(c("numeric", "character", "character", "character"), times=31))
33
NUM.WIDE.COLS <- 4 + 4*31
34
DAYS <- lapply(seq(from=5, to=NUM.WIDE.COLS, by=4), function(i) i:(i+3))
35

  
36

  
37
#------------------#
38
# helper functions #
39
#------------------#
40

  
41
# bulk insert helper function (adapted from RSQLite documentation)
20 42
ghcn_bulk_insert <- function(db, sql, dat) {
21 43
    dbBeginTransaction(db)
22 44
    dbGetPreparedQuery(db, sql, bind.data = dat)
......
24 46
    dbGetQuery(db, "select count(*) from ghcn")[[1]]
25 47
}
26 48

  
27
# establish database connection
28
con <- dbDriver("SQLite")
29
if (file.exists(db.path)) {
30
    stop("database already exists at ", db.path)
31
}
32
db <- dbConnect(con, dbname=db.path)
33

  
34
# create main ghcn table
35
sql <- "
36
    CREATE TABLE ghcn (
37
        id text,
38
        year int,
39
        month int,
40
        element text,
41
        day int,
42
        value int,
43
        mflag text,
44
        qflag text,
45
        sflag text,
46
        srcrowid int)
47
"
48
dbGetQuery(db, sql)
49

  
50
# prepare sql insert statement
51
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))),
52
    collapse=", ")
53
sql <- paste("insert into ghcn values (", params.clist, ")", sep="")
54

  
55
# process and insert daily data
56
DLY.COLS <- c("character", "integer", "integer", "character",
57
    rep(c("numeric", "character", "character", "character"), times=31))
58

  
59 49
# shell out to OS to leverage grep/awk/tr for faster initial parsing and
60 50
# filtering of data; if no data records are read in, this function
61 51
# returns NULL
......
92 82
    do.call("rbind", daily.data)
93 83
}
94 84

  
95
NUM.WIDE.COLS <- 4 + 4*31
96
DAYS <- lapply(seq(from=5, to=NUM.WIDE.COLS, by=4), function(i) i:(i+3))
85

  
86
#-----------------#
87
# procedural code #
88
#-----------------#
89

  
90
# establish database connection
91
con <- dbDriver("SQLite")
92
if (file.exists(db.path)) {
93
    stop("database already exists at ", db.path)
94
}
95
db <- dbConnect(con, dbname=db.path)
96

  
97
# create main ghcn table
98
sql <- "
99
    CREATE TABLE ghcn (
100
        id text,
101
        year int,
102
        month int,
103
        element text,
104
        day int,
105
        value int,
106
        mflag text,
107
        qflag text,
108
        sflag text,
109
        srcrowid int)
110
"
111
dbGetQuery(db, sql)
112

  
113
# prepare sql insert statement
114
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))),
115
    collapse=", ")
116
sql <- paste("insert into ghcn values (", params.clist, ")", sep="")
117

  
118
# process and insert daily data
97 119
dailies <- list.files(pattern="*.dly")
98 120
for (file in dailies) {
99
    dly <- loadAsCSV(file, c("TMAX", "TMIN"))
121
    dly <- loadAsCSV(file, VARS)
100 122
    if (!is.null(dly)) {
101 123
        long <- wideToLong(dly, DAYS)
102 124
        ghcn_bulk_insert(db, sql, long)

Also available in: Unified diff