1
|
####################Interpolation of Tmax for 10 dates.#####################
|
2
|
#This script interpolates station values for the Oregon case study. This program loads the station data from a csv file
|
3
|
#and perform two types of regression: multiple linear model and general additive model (GAM). Note that this program:
|
4
|
#1)assumes that the csv file is in the current working
|
5
|
#2)extract relevant variables from raster images before performing the regressions.
|
6
|
#The user must provide the list of raster images in a textile.
|
7
|
#Script created by Benoit Parmentier on March 3, 2012.
|
8
|
|
9
|
###Loading r library and packages # loading the raster package
|
10
|
library(gtools) # loading ...
|
11
|
library(mgcv)
|
12
|
|
13
|
###Parameters and arguments
|
14
|
#infile1<-"ghcn_or_b_02122012_OR83M.csv"
|
15
|
infile1<-"ghcn_or_tmax_b_03032012_OR83M.csv"
|
16
|
path<-"C:/Data/Benoit/NCEAS/window_Oregon_data"
|
17
|
setwd(path)
|
18
|
#infile2<-"dates_interpolation_03012012.txt" # list of 10 dates for the regression
|
19
|
infile2<-"dates_interpolation_03052012.txt"
|
20
|
prop<-0.3 #Proportion of testing retained for validation
|
21
|
out_prefix<-"_03042012_r1"
|
22
|
infile3<-"models_interpolation_03052012.txt"
|
23
|
|
24
|
|
25
|
#######START OF THE SCRIPT #############
|
26
|
|
27
|
###Reading the station data and setting up for models' comparison
|
28
|
ghcn<-read.csv(paste(path,"/",infile1, sep=""), header=TRUE) #The "paste" function concatenates the path and file name in common string.
|
29
|
ghcn = transform(ghcn,Northness = cos(ASPECT)) #Adding a variable to the dataframe
|
30
|
ghcn = transform(ghcn,Eastness = sin(ASPECT)) #adding variable to the dataframe.
|
31
|
|
32
|
ghcn = transform(ghcn,Northness_w = sin(slope)*cos(ASPECT)) #Adding a variable to the dataframe
|
33
|
ghcn = transform(ghcn,Eastness_w = sin(slope)*sin(ASPECT)) #adding variable to the dataframe.
|
34
|
set.seed(100)
|
35
|
dates <-readLines(paste(path,"/",infile2, sep=""))
|
36
|
models <-readLines(paste(path,"/",infile3, sep=""))
|
37
|
|
38
|
results <- matrix(1,length(dates),14) #This is a matrix containing the diagnostic measures from the GAM models.
|
39
|
|
40
|
results_AIC<- matrix(1,length(dates),length(models)+3)
|
41
|
results_GCV<- matrix(1,length(dates),length(models)+3)
|
42
|
results_RMSE<- matrix(1,length(dates),length(models)+3)
|
43
|
|
44
|
ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subsets data
|
45
|
#note that compare to the previous version date_ column was changed to date
|
46
|
|
47
|
## looping through the dates...
|
48
|
#Change this into a nested loop, looping through the number of models
|
49
|
|
50
|
for(i in 1:length(dates)){ # start of the for loop #1
|
51
|
|
52
|
###Regression part 1: Creating a validation dataset by creating training and testing datasets
|
53
|
|
54
|
n<-nrow(ghcn.subsets[[i]])
|
55
|
ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows
|
56
|
nv<-n-ns #create a sample for validation with prop of the rows
|
57
|
#ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows
|
58
|
ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
|
59
|
ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
|
60
|
data_s <- ghcn.subsets[[i]][ind.training, ]
|
61
|
data_v <- ghcn.subsets[[i]][ind.testing, ]
|
62
|
|
63
|
####Regression part 2: GAM models
|
64
|
|
65
|
mod1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s)
|
66
|
mod2<- gam(tmax~ s(lat,lon,ELEV_SRTM), data=data_s)
|
67
|
mod3<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (Northness)+ s (Eastness) + s(DISTOC), data=data_s)
|
68
|
mod4<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (Northness_w)+ s (Eastness_w) + s(DISTOC), data=data_s)
|
69
|
mod5<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM, Northness) + s (Eastness) + s(DISTOC), data=data_s)
|
70
|
mod6<- gam(tmax~ s(lat,lon) + s (ELEV_SRTM, Northness) + s (Eastness) + s(DISTOC), data=data_s)
|
71
|
|
72
|
|
73
|
####Regression part 3: Calculating and storing diagnostic measures
|
74
|
|
75
|
results_AIC[i,1]<- dates[i] #storing the interpolation dates in the first column
|
76
|
results_AIC[i,2]<- ns #number of stations used in the training stage
|
77
|
results_AIC[i,3]<- AIC (mod1)
|
78
|
results_AIC[i,4]<- AIC (mod2)
|
79
|
results_AIC[i,5]<- AIC (mod3)
|
80
|
results_AIC[i,6]<- AIC (mod4)
|
81
|
results_AIC[i,7]<- AIC (mod5)
|
82
|
results_AIC[i,8]<- AIC (mod6)
|
83
|
|
84
|
results_GCV[i,1]<- dates[i] #storing the interpolation dates in the first column
|
85
|
results_GCV[i,2]<- ns #number of stations used in the training stage
|
86
|
results_GCV[i,3]<- mod1$gcv.ubre
|
87
|
results_GCV[i,4]<- mod2$gcv.ubre
|
88
|
results_GCV[i,5]<- mod3$gcv.ubre
|
89
|
results_GCV[i,6]<- mod4$gcv.ubre
|
90
|
results_GCV[i,7]<- mod5$gcv.ubre
|
91
|
results_GCV[i,8]<- mod6$gcv.ubre
|
92
|
|
93
|
results_DEV[i,1]<- dates[i] #storing the interpolation dates in the first column
|
94
|
results_DEV[i,2]<- ns #number of stations used in the training stage
|
95
|
results_DEV[i,3]<- mod1$deviance
|
96
|
results_DEV[i,4]<- mod2$deviance
|
97
|
results_DEV[i,5]<- mod3$deviance
|
98
|
results_DEV[i,6]<- mod4$deviance
|
99
|
results_DEV[i,7]<- mod5$deviance
|
100
|
results_DEV[i,8]<- mod6$deviance
|
101
|
|
102
|
#####VALIDATION: Prediction checking the results using the testing data########
|
103
|
|
104
|
y_mod1<- predict(mod1, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values.
|
105
|
y_mod2<- predict(mod2, newdata=data_v, se.fit = TRUE)
|
106
|
y_mod3<- predict(mod3, newdata=data_v, se.fit = TRUE)
|
107
|
y_mod4<- predict(mod4, newdata=data_v, se.fit = TRUE)
|
108
|
y_mod5<- predict(mod5, newdata=data_v, se.fit = TRUE)
|
109
|
y_mod5<- predict(mod6, newdata=data_v, se.fit = TRUE)
|
110
|
|
111
|
res_mod1<- data_v$tmax - y_mod1$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation
|
112
|
res_mod2<- data_v$tmax - y_mod2$fit #Residuals for GAM model that resembles the PRISM interpolation
|
113
|
res_mod3<- data_v$tmax - y_mod3$fit
|
114
|
res_mod4<- data_v$tmax - y_mod4$fit
|
115
|
res_mod5<- data_v$tmax - y_mod5$fit
|
116
|
res_mod6<- data_v$tmax - y_mod6$fit
|
117
|
|
118
|
RMSE_mod1 <- sqrt(sum(res_mod1^2)/nv)
|
119
|
RMSE_mod2 <- sqrt(sum(res_mod2^2)/nv)
|
120
|
RMSE_mod3 <- sqrt(sum(res_mod3^2)/nv)
|
121
|
RMSE_mod4 <- sqrt(sum(res_mod4^2)/nv)
|
122
|
RMSE_mod5 <- sqrt(sum(res_mod5^2)/nv)
|
123
|
RMSE_mod6 <- sqrt(sum(res_mod6^2)/nv)
|
124
|
|
125
|
|
126
|
results_RMSE[i,1]<- dates[i] #storing the interpolation dates in the first column
|
127
|
results_RMSE[i,2]<- ns #number of stations used in the training stage
|
128
|
results_RMSE[i,3]<- RMSE_mod1
|
129
|
results_RMSE[i,4]<- RMSE_mod2
|
130
|
results_RMSE[i,5]<- RMSE_mod3
|
131
|
results_RMSE[i,6]<- RMSE_mod4
|
132
|
results_RMSE[i,7]<- RMSE_mod5
|
133
|
results_RMSE[i,8]<- RMSE_mod6
|
134
|
|
135
|
data_name<-paste("ghcn_v_",dates[[i]],sep="")
|
136
|
assign(data_name,data_v)
|
137
|
#ghcn_v<-ls(pattern="ghcn_v_")
|
138
|
|
139
|
# end of the for loop #1
|
140
|
}
|
141
|
|
142
|
## Plotting and saving diagnostic measures
|
143
|
|
144
|
results_RMSEnum <-results_RMSE
|
145
|
mode(results_RMSEnum)<- "numeric"
|
146
|
# Make it numeric first
|
147
|
# Now turn it into a data.frame...
|
148
|
|
149
|
results_table_RMSE<-as.data.frame(results_RMSEnum)
|
150
|
colnames(results_table_RMSE)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6")
|
151
|
|
152
|
win.graph()
|
153
|
barplot(results_table$RMSE_A1/10,main="RMSE for the A1 models",names.arg=results_table$dates,ylab="Temp (deg. C)",xlab="interpolated date")
|
154
|
savePlot(paste("GAM_ANUSPLIN1_RMSE",out_prefix,".emf", sep=""), type="emf")
|
155
|
win.graph()
|
156
|
barplot(results_table$RMSE_P1/10,main="RMSE for the P1 models",names.arg=results_table$dates,ylab="Temp ( deg. C)",xlab="interpolated date")
|
157
|
savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf")
|
158
|
win.graph()
|
159
|
barplot(results_table$RMSE_P2/10,main="RMSE for the P2 models",names.arg=results_table$dates,ylab="Temp ( deg. C)",xlab="interpolated date")
|
160
|
savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf")
|
161
|
win.graph()
|
162
|
barplot(results_table$AIC_A1,main="AIC for the A1 models",names.arg=results_table$dates,ylab="Temp ( deg. C)",xlab="interpolated date")
|
163
|
savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf")
|
164
|
win.graph()
|
165
|
barplot(results_table$AIC_P1/10,main="AIC for the P1 models",names.arg=results_table$dates,ylab="Temp ( deg. C)",xlab="interpolated date")
|
166
|
savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf")
|
167
|
win.graph()
|
168
|
barplot(results_table$AIC_P2/10,main="AIC for the P2 models",names.arg=results_table$dates,ylab="Temp (10 X deg. C)",xlab="interolated date")
|
169
|
savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf")
|
170
|
win.graph()
|
171
|
barplot(results_table$Deviance_A1/10,main="Deviance for the A1 models",names.arg=results_table$dates,ylab="Temp (10 X deg. C)",xlab="interolated date")
|
172
|
savePlot(paste("GAM_ANUSPLIN1_Deviance",out_prefix,".emf", sep=""), type="emf")
|
173
|
win.graph()
|
174
|
barplot(results_table$Deviance_P1/10,main="Deviance for the P1 models",names.arg=results_table$dates,ylab="Temp (10 X deg. C)",xlab="interolated date")
|
175
|
savePlot(paste("GAM_PRISM1_Deviance",out_prefix,".emf", sep=""), type="emf")
|
176
|
win.graph()
|
177
|
barplot(results_table$Deviance_P2/10,main="Deviance for the P2 models",names.arg=results_table$dates,ylab="Temp (10 X deg. C)",xlab="interolated date")
|
178
|
savePlot(paste("GAM_PRISM2_Deviance",out_prefix,".emf", sep=""), type="emf")
|
179
|
|
180
|
write.csv(results_table, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep=""))
|
181
|
|
182
|
|
183
|
# End of script##########
|
184
|
|
185
|
# ###############################
|
186
|
|
187
|
#
|
188
|
# ############Diagnostic GAM plots#############
|
189
|
# win.graph()
|
190
|
# gam.check(GAM_ANUSPLIN1)
|
191
|
# savePlot("GAM_ANUSPLIN1_diagnostic1.emf", type="emf")
|
192
|
# win.graph()
|
193
|
# gam.check(GAM_PRISM1) #This will produce basic plots of residuals
|
194
|
# savePlot("GAM_PRISM_diagnostic1.emf", type="emf")
|
195
|
# gam.check(GAM_ANUSPLIN1)
|
196
|
# win.graph()
|
197
|
# vis.gam(GAM_ANUSPLIN1)
|
198
|
# savePlot("GAM_ANUSPLIN1_prediction.emf", type="emf")
|
199
|
# win.graph()
|
200
|
# vis.gam(GAM_PRISM1)
|
201
|
# savePlot("GAM_PRISM1_prediction.emf", type="emf")
|
202
|
# win.graph()
|
203
|
# vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM"))
|
204
|
# #vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM", theta=100,phi=200))
|
205
|
# savePlot("GAM_ANUSPLIN1_prediction2.emf", type="emf")
|
206
|
#
|
207
|
|
208
|
#
|
209
|
|
210
|
|
211
|
|
212
|
|
213
|
|