1
|
####################Interpolation of Tmax for 10 dates.#####################
|
2
|
#This script interpolates station tmax values for the Oregon case study.It provides a mean to asssess the effect of random sampling and proportion
|
3
|
# of validation hold out on the RMSE.This program loads the station data from a csv file
|
4
|
#and perform one type of regression: general additive model (GAM) with different variables:
|
5
|
# Lat, long, ELEV_SRTM, Eastness, Northness, DISTOC, mean_LST_monthly, Land Cover proportions.
|
6
|
#Note that this program:
|
7
|
#1)assumes that the csv file is in the current working
|
8
|
#2)extract relevant variables from raster images before performing the regressions.
|
9
|
#3)does not clear memory workspace at the start or end of script.
|
10
|
#This scripts predicts tmax using GAM and LST derived from MOD11A1.
|
11
|
#Interactions terms are also included and assessed using the RMSE from validation dataset.
|
12
|
#There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile.
|
13
|
#Script created by Benoit Parmentier on April 25, 2012.
|
14
|
|
15
|
###Loading r library and packages # loading the raster package
|
16
|
library(gtools) # loading ...
|
17
|
library(mgcv)
|
18
|
library(sp)
|
19
|
library(spdep)
|
20
|
library(rgdal)
|
21
|
|
22
|
###Parameters and arguments
|
23
|
|
24
|
infile1<-"ghcn_or_tmax_b_04142012_OR83M.shp"
|
25
|
path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations"
|
26
|
#path<-"H:/Data/IPLANT_project/data_Oregon_stations"
|
27
|
setwd(path)
|
28
|
infile2<-"dates_interpolation_03052012.txt" #List of 10 dates for the regression
|
29
|
prop<-0.3 #Proportion of testing retained for validation
|
30
|
n_runs<- 30 #Number of runs
|
31
|
out_prefix<-"_05142012_run03_LST"
|
32
|
infile3<-"LST_dates_var_names.txt" #List of LST averages.
|
33
|
infile4<-"models_interpolation_05142012.txt"
|
34
|
|
35
|
#######START OF THE SCRIPT #############
|
36
|
|
37
|
###Reading the station data and setting up for models' comparison
|
38
|
filename<-sub(".shp","",infile1) #Removing the extension from file.
|
39
|
ghcn<-readOGR(".", filename) #reading shapefile
|
40
|
|
41
|
ghcn = transform(ghcn,Northness = cos(ASPECT)) #Adding a variable to the dataframe
|
42
|
ghcn = transform(ghcn,Eastness = sin(ASPECT)) #adding variable to the dataframe.
|
43
|
ghcn = transform(ghcn,Northness_w = sin(slope)*cos(ASPECT)) #Adding a variable to the dataframe
|
44
|
ghcn = transform(ghcn,Eastness_w = sin(slope)*sin(ASPECT)) #adding variable to the dataframe.
|
45
|
#Note that "transform" output is a data.frame not spatial object
|
46
|
#set.seed(100) #modify this to a seed variable allowing different runs.
|
47
|
|
48
|
dates <-readLines(paste(path,"/",infile2, sep=""))
|
49
|
LST_dates <-readLines(paste(path,"/",infile3, sep=""))
|
50
|
models <-readLines(paste(path,"/",infile4, sep=""))
|
51
|
|
52
|
#results <- matrix(1,length(dates),14) #This is a matrix containing the diagnostic measures from the GAM models.
|
53
|
|
54
|
results_AIC<- matrix(1,length(dates),length(models)+3)
|
55
|
results_GCV<- matrix(1,length(dates),length(models)+3)
|
56
|
results_DEV<- matrix(1,length(dates),length(models)+3)
|
57
|
results_RMSE<- matrix(1,length(dates),length(models)+3)
|
58
|
RMSE_run<-matrix(1,length(dates),1)
|
59
|
cor_LST_LC1<-matrix(1,length(dates),1) #correlation LST-LC1
|
60
|
cor_LST_LC3<-matrix(1,length(dates),1) #correlation LST-LC3
|
61
|
cor_LST_tmax<-matrix(1,length(dates),1) #correlation LST-tmax
|
62
|
#Screening for bad values
|
63
|
#RMSE_run<-matrix(1,lenth)
|
64
|
results_RMSE_all<- matrix(1,length(dates)*n_runs,length(models)+3)
|
65
|
results_RMSE_all2<- matrix(1,0,length(models)+3)
|
66
|
ghcn_all<-ghcn
|
67
|
ghcn_test<-subset(ghcn,ghcn$tmax>-150 & ghcn$tmax<400)
|
68
|
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0)
|
69
|
ghcn<-ghcn_test2
|
70
|
|
71
|
#month_var<-c("mm_01","mm_02","mm_03","mm_04","mm_05","mm_06","mm_07","mm_08","mm_09", "mm_10", "mm_11", "mm_12")
|
72
|
|
73
|
## looping through the dates...
|
74
|
#Changed this section into a nested loop, looping through the number of models
|
75
|
|
76
|
for(j in 1:n_runs){
|
77
|
|
78
|
ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subset data
|
79
|
|
80
|
for(i in 1:length(dates)){ # start of the for loop #1
|
81
|
date<-strptime(dates[i], "%Y%m%d")
|
82
|
month<-strftime(date, "%m")
|
83
|
LST_month<-paste("mm_",month,sep="")
|
84
|
###Regression part 1: Creating a validation dataset by creating training and testing datasets
|
85
|
|
86
|
mod <-ghcn.subsets[[i]][,match(LST_month, names(ghcn.subsets[[i]]))]
|
87
|
ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod)
|
88
|
#Screening LST values
|
89
|
#ghcn.subsets[[i]]<-subset(ghcn.subsets[[i]],ghcn.subsets[[i]]$LST> 258 & ghcn.subsets[[i]]$LST<313)
|
90
|
n<-nrow(ghcn.subsets[[i]])
|
91
|
ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows
|
92
|
nv<-n-ns #create a sample for validation with prop of the rows
|
93
|
ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
|
94
|
ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
|
95
|
data_s <- ghcn.subsets[[i]][ind.training, ]
|
96
|
data_v <- ghcn.subsets[[i]][ind.testing, ]
|
97
|
|
98
|
####Regression part 2: GAM models
|
99
|
|
100
|
mod1<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s)
|
101
|
mod2<- gam(tmax~ s(lat,lon,ELEV_SRTM), data=data_s)
|
102
|
mod3<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (Northness)+ s (Eastness) + s(DISTOC), data=data_s)
|
103
|
mod4<- gam(tmax~ s(lat) + s (lon) + s(ELEV_SRTM) + s(Northness) + s (Eastness) + s(DISTOC) + s(LST), data=data_s)
|
104
|
mod5<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST), data=data_s)
|
105
|
mod6<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC1), data=data_s)
|
106
|
mod7<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC3), data=data_s)
|
107
|
mod8<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST) + s(LC1), data=data_s)
|
108
|
|
109
|
####Regression part 3: Calculating and storing diagnostic measures
|
110
|
|
111
|
results_AIC[i,1]<- j
|
112
|
results_AIC[i,2]<- dates[i] #storing the interpolation dates in the first column
|
113
|
results_AIC[i,3]<- ns #number of stations used in the training stage
|
114
|
results_AIC[i,4]<- AIC (mod1)
|
115
|
results_AIC[i,5]<- AIC (mod2)
|
116
|
results_AIC[i,6]<- AIC (mod3)
|
117
|
results_AIC[i,7]<- AIC (mod4)
|
118
|
results_AIC[i,8]<- AIC (mod5)
|
119
|
results_AIC[i,9]<- AIC (mod6)
|
120
|
results_AIC[i,10]<- AIC (mod7)
|
121
|
|
122
|
results_GCV[i,1]<- j # "j" is the counter index for the number of runs.
|
123
|
results_GCV[i,2]<- dates[i] #storing the interpolation dates in the first column
|
124
|
results_GCV[i,3]<- ns #number of stations used in the training stage
|
125
|
results_GCV[i,4]<- mod1$gcv.ubre
|
126
|
results_GCV[i,5]<- mod2$gcv.ubre
|
127
|
results_GCV[i,6]<- mod3$gcv.ubre
|
128
|
results_GCV[i,7]<- mod4$gcv.ubre
|
129
|
results_GCV[i,8]<- mod5$gcv.ubre
|
130
|
results_GCV[i,9]<- mod6$gcv.ubre
|
131
|
results_GCV[i,10]<- mod7$gcv.ubre
|
132
|
|
133
|
results_DEV[i,1]<- j # "j" is the counter index for the number of runs.
|
134
|
results_DEV[i,2]<- dates[i] #storing the interpolation dates in the first column
|
135
|
results_DEV[i,3]<- ns #number of stations used in the training stage
|
136
|
results_DEV[i,4]<- mod1$deviance
|
137
|
results_DEV[i,5]<- mod2$deviance
|
138
|
results_DEV[i,6]<- mod3$deviance
|
139
|
results_DEV[i,7]<- mod4$deviance
|
140
|
results_DEV[i,8]<- mod5$deviance
|
141
|
results_DEV[i,9]<- mod6$deviance
|
142
|
results_DEV[i,10]<- mod7$deviance
|
143
|
|
144
|
#####VALIDATION: Prediction checking the results using the testing data########
|
145
|
|
146
|
y_mod1<- predict(mod1, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values.
|
147
|
y_mod2<- predict(mod2, newdata=data_v, se.fit = TRUE)
|
148
|
y_mod3<- predict(mod3, newdata=data_v, se.fit = TRUE)
|
149
|
y_mod4<- predict(mod4, newdata=data_v, se.fit = TRUE)
|
150
|
y_mod5<- predict(mod5, newdata=data_v, se.fit = TRUE)
|
151
|
y_mod6<- predict(mod6, newdata=data_v, se.fit = TRUE)
|
152
|
y_mod7<- predict(mod7, newdata=data_v, se.fit = TRUE)
|
153
|
|
154
|
res_mod1<- data_v$tmax - y_mod1$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation
|
155
|
res_mod2<- data_v$tmax - y_mod2$fit #Residuals for GAM model that resembles the PRISM interpolation
|
156
|
res_mod3<- data_v$tmax - y_mod3$fit
|
157
|
res_mod4<- data_v$tmax - y_mod4$fit
|
158
|
res_mod5<- data_v$tmax - y_mod5$fit
|
159
|
res_mod6<- data_v$tmax - y_mod6$fit
|
160
|
res_mod7<- data_v$tmax - y_mod7$fit
|
161
|
|
162
|
RMSE_mod1 <- sqrt(sum(res_mod1^2)/nv)
|
163
|
RMSE_mod2 <- sqrt(sum(res_mod2^2)/nv)
|
164
|
RMSE_mod3 <- sqrt(sum(res_mod3^2)/nv)
|
165
|
RMSE_mod4 <- sqrt(sum(res_mod4^2)/nv)
|
166
|
RMSE_mod5 <- sqrt(sum(res_mod5^2)/nv)
|
167
|
RMSE_mod6 <- sqrt(sum(res_mod6^2)/nv)
|
168
|
RMSE_mod7 <- sqrt(sum(res_mod7^2)/nv)
|
169
|
|
170
|
results_RMSE[i,1]<- j
|
171
|
results_RMSE[i,2]<- dates[i] #storing the interpolation dates in the first column
|
172
|
results_RMSE[i,3]<- ns #number of stations used in the training stage
|
173
|
results_RMSE[i,4]<- RMSE_mod1
|
174
|
results_RMSE[i,5]<- RMSE_mod2
|
175
|
results_RMSE[i,6]<- RMSE_mod3
|
176
|
results_RMSE[i,7]<- RMSE_mod4
|
177
|
results_RMSE[i,8]<- RMSE_mod5
|
178
|
results_RMSE[i,9]<- RMSE_mod6
|
179
|
results_RMSE[i,10]<- RMSE_mod7
|
180
|
#Saving dataset in dataframes
|
181
|
data_name<-paste("ghcn_v_",dates[[i]],sep="")
|
182
|
assign(data_name,data_v)
|
183
|
data_name<-paste("ghcn_s_",dates[[i]],sep="")
|
184
|
assign(data_name,data_s)
|
185
|
#ghcn_v<-ls(pattern="ghcn_v_")
|
186
|
# end of the for loop #2 (nested in loop #1)
|
187
|
}
|
188
|
|
189
|
results_RMSEnum <-results_RMSE
|
190
|
results_AICnum <-results_AIC
|
191
|
mode(results_RMSEnum)<- "numeric"
|
192
|
mode(results_AICnum)<- "numeric"
|
193
|
# Make it numeric first
|
194
|
# Now turn it into a data.frame...
|
195
|
|
196
|
results_table_RMSE<-as.data.frame(results_RMSEnum)
|
197
|
results_table_AIC<-as.data.frame(results_AICnum)
|
198
|
colnames(results_table_RMSE)<-c("run","dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6", "mod7")
|
199
|
colnames(results_table_AIC)<-c("run","dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6", "mod7")
|
200
|
|
201
|
write.table(results_table_RMSE, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep=""), sep=",", col.names=FALSE, append=TRUE)
|
202
|
#write.table(results_table_AIC, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep=""),sep=",", append=TRUE)
|
203
|
|
204
|
#results_RMSE_all Need to put all the results in one data.frame...
|
205
|
results_RMSE_all2<-rbind(results_RMSE_all2,results_table_RMSE)
|
206
|
#Print(j) #This is the run umber printed to the console.
|
207
|
} #end of loop 1
|
208
|
|
209
|
write.table(results_RMSE_all2, file= paste(path,"/","results_GAM_Assessment",out_prefix,"all.txt",sep=""), sep=",", col.names=TRUE)
|
210
|
|
211
|
|
212
|
###Analysing the results from the 365 days run: Summarize by month
|
213
|
#
|
214
|
# for(i in 1:nrow(results_table_RMSE)){
|
215
|
# date<-results_table_RMSE$dates[i]
|
216
|
# date<-strptime(date, "%Y%m%d")
|
217
|
# results_table_RMSE$month[i]<-as.integer(strftime(date, "%m"))
|
218
|
# }
|
219
|
#
|
220
|
# average<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE,mean, na.rm=TRUE)
|
221
|
# average<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, FUN=mean)
|
222
|
# #average on all the data.frame
|
223
|
# averaget<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month),FUN=mean, na.rm=TRUE)
|
224
|
# #mediant<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month),FUN=median, na.rm=TRUE)
|
225
|
# #average_lowt<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month), FUN=function(v) t.test(v)$conf.int[1])
|
226
|
# #average_up<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[2])
|
227
|
#
|
228
|
# median<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, median, na.rm=TRUE)
|
229
|
# average_low<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[1])
|
230
|
# average_up<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[2])
|
231
|
#
|
232
|
# mod<-names(averaget)
|
233
|
# mod<-mod[4:11]
|
234
|
# #Saving graphic plots
|
235
|
# for(i in 1:length(mod)){
|
236
|
# X11(width=14,height=10)
|
237
|
# name<-mod[i]
|
238
|
# barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main="Mean RMSE per month", names.arg=c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep","Oct", "Nov", "Dec"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
|
239
|
# #title(paste("Sampling RMSE for mod",i,sep=""))
|
240
|
# savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
|
241
|
# dev.off()
|
242
|
# }
|
243
|
#
|
244
|
#
|
245
|
# for(i in 1:length(mod)){
|
246
|
# X11(width=14,height=10)
|
247
|
# name<-mod[i]
|
248
|
# barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main=paste(" Mean RMSE per month ",name, sep=""), names.arg=c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep","Oct", "Nov", "Dec"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
|
249
|
# #title(paste("Sampling RMSE for mod",i,sep=""))
|
250
|
# savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
|
251
|
# dev.off()
|
252
|
#
|
253
|
# X11(width=14,height=10)
|
254
|
# name<-mod[i]
|
255
|
# hist(results_table_RMSE[[name]],breaks=15, main=paste(" Histogram RMSE_",name, sep=""),xlab=paste("RMSE ",name, sep=""))
|
256
|
# savePlot(paste("Hist_results_RMSE_365_",name,out_prefix,".png", sep=""), type="png")
|
257
|
# dev.off()
|
258
|
#
|
259
|
# }
|
260
|
#
|
261
|
# for(i in 1:length(mod)){
|
262
|
# X11(width=14,height=10)
|
263
|
# name<-mod[i]
|
264
|
# hist(results_table_RMSE[[name]],breaks=15, main=paste(" Histogram RMSE_",name, sep=""),xlab=paste("RMSE ",name, sep=""))
|
265
|
# savePlot(paste("Hist_results_RMSE_365_",name,out_prefix,".png", sep=""), type="png")
|
266
|
# dev.off()
|
267
|
# }
|
268
|
#
|
269
|
# r<-(results_RMSE_all2[,4:10]) #selecting only the columns related to models...
|
270
|
#
|
271
|
# mean_r<-sapply(r, mean)
|
272
|
# median_r<-sapply(r, median)
|
273
|
# sd_r<-sapply(r, sd)
|
274
|
#
|
275
|
# barplot(mean_r,ylim=c(20,26),ylab="RMSE in tenth deg C")
|
276
|
# barplot(median_r,ylim=c(20,26),ylab="RMSE in tenth deg C",add=TRUE,inside=FALSE,beside=TRUE) # put both on the same plot
|
277
|
# barplot(sd_r,ylim=c(6,8),ylab="RMSE in tenth deg C") # put both on the same plot
|
278
|
#
|
279
|
# height<-rbind(mean_r,median_r)
|
280
|
# barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE,legend=rownames(height))
|
281
|
# barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE, col=c("darkblue","red"),legend=rownames(height)) # put both on the same plot
|
282
|
# PNG(paste("Barplot_results_RMSE_sampling_",out_prefix,".png", sep=""))
|
283
|
#
|
284
|
# barplot2(mean_r,median_r,ylim=c(23,26),ylab="RMSE in tenth deg C") # put both on the same plot
|
285
|
# #Collect var explained and p values for each var...
|
286
|
#
|
287
|
# mod<-names(mean_r)
|
288
|
# average<-mean_r
|
289
|
# average_low<-mean_r-sd_r
|
290
|
# average_up<-mean_r+sd_r
|
291
|
#
|
292
|
# for(i in 1:length(mod)){
|
293
|
# #X11(width=14,height=10)
|
294
|
# name<-mod[i]
|
295
|
# barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main=paste(" Mean RMSE per month ",name, sep=""), names.arg=c("mod1", "mod2", "mod3", "mod4", "mod5", "mod6","mod7"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
|
296
|
# #title(paste("Sampling RMSE for mod",i,sep=""))
|
297
|
# #savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
|
298
|
# #dev.off()
|
299
|
# }
|
300
|
|
301
|
# End of script##########
|
302
|
|
303
|
#Selecting dates and files based on names
|
304
|
#cor_LST_LC<-matrix(1,10,1)
|
305
|
# for(i in 1:length(dates)){
|
306
|
# cor_LST_LC1[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC1)
|
307
|
# }
|
308
|
# for(i in 1:length(dates)){
|
309
|
# cor_LST_LC3[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC3)
|
310
|
# }
|
311
|
|