Revision d4e63f5d
Added by Jim Regetz over 12 years ago
- ID d4e63f5df88078f46ea75938d5ed69ecdd060dd8
climate/extra/ghcn-to-sqlite.R | ||
---|---|---|
1 | 1 |
# R script for batch parsing and loading GHCN daily station files |
2 |
# (*.dly) into a SQLite database |
|
2 |
# (*.dly) into a SQLite database. Script will process all such files in |
|
3 |
# the current working directory. |
|
3 | 4 |
# |
4 | 5 |
# As written, the script is intended to create and populate the database |
5 |
# from scratch, reporting an error if the script already exists. In |
|
6 |
# principle though, the code that processes and loads a given *.dly file |
|
7 |
# could be run on its own to load additional data into an already |
|
8 |
# existing database file. |
|
6 |
# from scratch, reporting an error if it already exists. In principle |
|
7 |
# though, the code that processes and loads a given *.dly file could be |
|
8 |
# run on its own to load additional data into an already existing |
|
9 |
# database file. |
|
10 |
# |
|
11 |
# At the moment, only TMIN and TMAX are loaded, but that can easily be |
|
12 |
# changed. |
|
9 | 13 |
# |
10 | 14 |
# Jim Regetz |
11 | 15 |
# NCEAS |
... | ... | |
13 | 17 |
|
14 | 18 |
require(RSQLite) |
15 | 19 |
|
20 |
#-------------# |
|
21 |
# "constants" # |
|
22 |
#-------------# |
|
23 |
|
|
24 |
# name of target db |
|
16 | 25 |
db.path <- "ghcn_all.db" |
17 | 26 |
|
18 |
# define bulk insert helper function (adapted from RSQLite |
|
19 |
# documentation) |
|
27 |
# variables to keep |
|
28 |
VARS <- c("TMIN", "TMAX") |
|
29 |
|
|
30 |
# column characteristics of the *.dly data files |
|
31 |
DLY.COLS <- c("character", "integer", "integer", "character", |
|
32 |
rep(c("numeric", "character", "character", "character"), times=31)) |
|
33 |
NUM.WIDE.COLS <- 4 + 4*31 |
|
34 |
DAYS <- lapply(seq(from=5, to=NUM.WIDE.COLS, by=4), function(i) i:(i+3)) |
|
35 |
|
|
36 |
|
|
37 |
#------------------# |
|
38 |
# helper functions # |
|
39 |
#------------------# |
|
40 |
|
|
41 |
# bulk insert helper function (adapted from RSQLite documentation) |
|
20 | 42 |
ghcn_bulk_insert <- function(db, sql, dat) { |
21 | 43 |
dbBeginTransaction(db) |
22 | 44 |
dbGetPreparedQuery(db, sql, bind.data = dat) |
... | ... | |
24 | 46 |
dbGetQuery(db, "select count(*) from ghcn")[[1]] |
25 | 47 |
} |
26 | 48 |
|
27 |
# establish database connection |
|
28 |
con <- dbDriver("SQLite") |
|
29 |
if (file.exists(db.path)) { |
|
30 |
stop("database already exists at ", db.path) |
|
31 |
} |
|
32 |
db <- dbConnect(con, dbname=db.path) |
|
33 |
|
|
34 |
# create main ghcn table |
|
35 |
sql <- " |
|
36 |
CREATE TABLE ghcn ( |
|
37 |
id text, |
|
38 |
year int, |
|
39 |
month int, |
|
40 |
element text, |
|
41 |
day int, |
|
42 |
value int, |
|
43 |
mflag text, |
|
44 |
qflag text, |
|
45 |
sflag text, |
|
46 |
srcrowid int) |
|
47 |
" |
|
48 |
dbGetQuery(db, sql) |
|
49 |
|
|
50 |
# prepare sql insert statement |
|
51 |
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))), |
|
52 |
collapse=", ") |
|
53 |
sql <- paste("insert into ghcn values (", params.clist, ")", sep="") |
|
54 |
|
|
55 |
# process and insert daily data |
|
56 |
DLY.COLS <- c("character", "integer", "integer", "character", |
|
57 |
rep(c("numeric", "character", "character", "character"), times=31)) |
|
58 |
|
|
59 | 49 |
# shell out to OS to leverage grep/awk/tr for faster initial parsing and |
60 | 50 |
# filtering of data; if no data records are read in, this function |
61 | 51 |
# returns NULL |
... | ... | |
92 | 82 |
do.call("rbind", daily.data) |
93 | 83 |
} |
94 | 84 |
|
95 |
NUM.WIDE.COLS <- 4 + 4*31 |
|
96 |
DAYS <- lapply(seq(from=5, to=NUM.WIDE.COLS, by=4), function(i) i:(i+3)) |
|
85 |
|
|
86 |
#-----------------# |
|
87 |
# procedural code # |
|
88 |
#-----------------# |
|
89 |
|
|
90 |
# establish database connection |
|
91 |
con <- dbDriver("SQLite") |
|
92 |
if (file.exists(db.path)) { |
|
93 |
stop("database already exists at ", db.path) |
|
94 |
} |
|
95 |
db <- dbConnect(con, dbname=db.path) |
|
96 |
|
|
97 |
# create main ghcn table |
|
98 |
sql <- " |
|
99 |
CREATE TABLE ghcn ( |
|
100 |
id text, |
|
101 |
year int, |
|
102 |
month int, |
|
103 |
element text, |
|
104 |
day int, |
|
105 |
value int, |
|
106 |
mflag text, |
|
107 |
qflag text, |
|
108 |
sflag text, |
|
109 |
srcrowid int) |
|
110 |
" |
|
111 |
dbGetQuery(db, sql) |
|
112 |
|
|
113 |
# prepare sql insert statement |
|
114 |
params.clist <- paste(rep("?", length(dbListFields(db, "ghcn"))), |
|
115 |
collapse=", ") |
|
116 |
sql <- paste("insert into ghcn values (", params.clist, ")", sep="") |
|
117 |
|
|
118 |
# process and insert daily data |
|
97 | 119 |
dailies <- list.files(pattern="*.dly") |
98 | 120 |
for (file in dailies) { |
99 |
dly <- loadAsCSV(file, c("TMAX", "TMIN"))
|
|
121 |
dly <- loadAsCSV(file, VARS)
|
|
100 | 122 |
if (!is.null(dly)) { |
101 | 123 |
long <- wideToLong(dly, DAYS) |
102 | 124 |
ghcn_bulk_insert(db, sql, long) |
Also available in: Unified diff
superficial code reformatting