1 |
4c29b739
|
Adam M. Wilson
|
####################Interpolation of Tmax for 10 dates.#####################
|
2 |
|
|
#This script interpolates station tmax values for the Oregon case study.It provides a mean to asssess the effect of random sampling and proportion
|
3 |
|
|
# of validation hold out on the RMSE.This program loads the station data from a csv file
|
4 |
|
|
#and perform one type of regression: general additive model (GAM) with different variables:
|
5 |
|
|
# Lat, long, ELEV_SRTM, Eastness, Northness, DISTOC, mean_LST_monthly, Land Cover proportions.
|
6 |
|
|
#Note that this program:
|
7 |
|
|
#1)assumes that the csv file is in the current working
|
8 |
|
|
#2)extract relevant variables from raster images before performing the regressions.
|
9 |
|
|
#3)does not clear memory workspace at the start or end of script.
|
10 |
|
|
#This scripts predicts tmax using GAM and LST derived from MOD11A1.
|
11 |
|
|
#Interactions terms are also included and assessed using the RMSE from validation dataset.
|
12 |
|
|
#There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile.
|
13 |
|
|
#Script created by Benoit Parmentier on April 25, 2012.
|
14 |
|
|
|
15 |
|
|
###Loading r library and packages # loading the raster package
|
16 |
|
|
library(gtools) # loading ...
|
17 |
|
|
library(mgcv)
|
18 |
|
|
library(sp)
|
19 |
|
|
library(spdep)
|
20 |
|
|
library(rgdal)
|
21 |
|
|
library(multicore) # if installed allows easy parallelization
|
22 |
|
|
library(reshape) # very useful for switching from 'wide' to 'long' data formats
|
23 |
|
|
|
24 |
|
|
###Parameters and arguments
|
25 |
|
|
|
26 |
|
|
infile1<-"ghcn_or_ppt_covariates_20120705_OR83M.shp"
|
27 |
|
|
path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations"
|
28 |
|
|
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations"
|
29 |
|
|
#path<-"H:/Data/IPLANT_project/data_Oregon_stations"
|
30 |
|
|
path="/home/wilson/data"
|
31 |
|
|
setwd(path)
|
32 |
|
|
infile2<-"dates_interpolation_03052012.txt" #List of 10 dates for the regression
|
33 |
|
|
prop<-0.3 #Proportion of testing retained for validation
|
34 |
|
|
n_runs<- 30 #Number of runs
|
35 |
|
|
out_prefix<-"_20120705_run01_LST"
|
36 |
|
|
infile3<-"LST_dates_var_names.txt" #List of LST averages.
|
37 |
|
|
infile4<-"models_interpolation_05142012.txt"
|
38 |
|
|
|
39 |
|
|
#######START OF THE SCRIPT #############
|
40 |
|
|
|
41 |
|
|
###Reading the station data and setting up for models' comparison
|
42 |
|
|
filename<-sub(".shp","",infile1) #Removing the extension from file.
|
43 |
|
|
ghcn<-readOGR(".", filename) #reading shapefile
|
44 |
|
|
|
45 |
|
|
ghcn = transform(ghcn,Northness = cos(ASPECT*pi/180)) #Adding a variable to the dataframe
|
46 |
|
|
ghcn = transform(ghcn,Eastness = sin(ASPECT*pi/180)) #adding variable to the dataframe.
|
47 |
|
|
ghcn = transform(ghcn,Northness_w = sin(slope*pi/180)*cos(ASPECT*pi/180)) #Adding a variable to the dataframe
|
48 |
|
|
ghcn = transform(ghcn,Eastness_w = sin(slope*pi/180)*sin(ASPECT*pi/180)) #adding variable to the dataframe.
|
49 |
|
|
#Note that "transform" output is a data.frame not spatial object
|
50 |
|
|
#set.seed(100) #modify this to a seed variable allowing different runs.
|
51 |
|
|
|
52 |
|
|
dates <-readLines(paste(path,"/",infile2, sep=""))
|
53 |
|
|
LST_dates <-readLines(paste(path,"/",infile3, sep=""))
|
54 |
|
|
models <-readLines(paste(path,"/",infile4, sep=""))
|
55 |
|
|
|
56 |
|
|
#results <- matrix(1,length(dates),14) #This is a matrix containing the diagnostic measures from the GAM models.
|
57 |
|
|
|
58 |
|
|
#### Define GAM models
|
59 |
|
|
var="tmax"
|
60 |
|
|
|
61 |
|
|
mods=data.frame(formula=c(
|
62 |
|
|
paste(var,"~ s(lat) + s (lon) + s (ELEV_SRTM)",sep=""),
|
63 |
|
|
paste(var,"~ s(lat,lon,ELEV_SRTM)",sep=""),
|
64 |
|
|
paste(var,"~ s(lat) + s (lon) + s (ELEV_SRTM) + s (Northness)+ s (Eastness) + s(DISTOC)",sep=""),
|
65 |
|
|
paste(var,"~ s(lat) + s (lon) + s(ELEV_SRTM) + s(Northness) + s (Eastness) + s(DISTOC) + s(LST)",sep=""),
|
66 |
|
|
paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST)",sep=""),
|
67 |
|
|
paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC1)",sep=""),
|
68 |
|
|
paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC3)",sep=""),
|
69 |
|
|
paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST) + s(LC1)",sep="")
|
70 |
|
|
),stringsAsFactors=F)
|
71 |
|
|
mods$model=paste("mod",1:nrow(mods),sep="")
|
72 |
|
|
|
73 |
|
|
|
74 |
|
|
### subset dataset?
|
75 |
|
|
ghcn_all<-ghcn
|
76 |
|
|
ghcn_test<-subset(ghcn,ghcn$tmax>-150 & ghcn$tmax<400)
|
77 |
|
|
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0)
|
78 |
|
|
ghcn<-ghcn_test2
|
79 |
|
|
|
80 |
|
|
|
81 |
|
|
## loop through the dates...
|
82 |
|
|
|
83 |
|
|
ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subset data
|
84 |
|
|
|
85 |
|
|
results=do.call(rbind.data.frame, # Collect the results in a single data.frame
|
86 |
|
|
mclapply(1:length(dates),function(i) { # loop over dates
|
87 |
|
|
date<-strptime(dates[i], "%Y%m%d") # get date
|
88 |
|
|
month<-strftime(date, "%m") # get month
|
89 |
|
|
LST_month<-paste("mm_",month,sep="") # get LST_month name
|
90 |
|
|
|
91 |
|
|
##Regression part 1: Creating a validation dataset by creating training and testing datasets
|
92 |
|
|
mod <-ghcn.subsets[[i]][,match(LST_month, names(ghcn.subsets[[i]]))]
|
93 |
|
|
ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod)
|
94 |
|
|
##Screening LST values
|
95 |
|
|
##ghcn.subsets[[i]]<-subset(ghcn.subsets[[i]],ghcn.subsets[[i]]$LST> 258 & ghcn.subsets[[i]]$LST<313)
|
96 |
|
|
n<-nrow(ghcn.subsets[[i]])
|
97 |
|
|
ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows
|
98 |
|
|
nv<-n-ns #create a sample for validation with prop of the rows
|
99 |
|
|
ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
|
100 |
|
|
ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
|
101 |
|
|
data_s <- ghcn.subsets[[i]][ind.training, ]
|
102 |
|
|
data_v <- ghcn.subsets[[i]][ind.testing, ]
|
103 |
|
|
|
104 |
|
|
## lapply loops through models for the ith day, calculates the validation metrics, and saves them as R objects
|
105 |
|
|
results=do.call(rbind.data.frame,
|
106 |
|
|
lapply(1:nrow(mods),function(m,savemodel=F,saveFullPrediction=T) {
|
107 |
|
|
## run the model
|
108 |
|
|
mod=gam(formula(mods$formula[m]),data=data_s)
|
109 |
|
|
|
110 |
|
|
##VALIDATION: Prediction checking the results using the testing data########
|
111 |
|
|
y_mod<- predict(mod, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values.
|
112 |
|
|
res_mod<- data_v$tmax - y_mod$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation
|
113 |
|
|
|
114 |
|
|
##Regression part 3: Calculating and storing diagnostic measures
|
115 |
|
|
tresults=data.frame( # build 1-row dataframe for this model-date
|
116 |
|
|
date=dates[i], # interpolation date
|
117 |
|
|
model=mods$model[m], # model number
|
118 |
|
|
ns=ns, # number of stations used in the training stage
|
119 |
|
|
AIC=AIC(mod), # AIC
|
120 |
|
|
GCV=mod$gcv.ubre, # GCV
|
121 |
|
|
DEV=mod$deviance, # Deviance
|
122 |
|
|
RMSE=sqrt(sum(res_mod^2)/nv) # RMSE
|
123 |
|
|
)
|
124 |
|
|
|
125 |
|
|
## Save the model object if desired
|
126 |
|
|
if(savemodel) save(mod,file= paste(path,"/","results_GAM_Model","_",out_prefix,"_",dates[i],".Rdata",sep=""))
|
127 |
|
|
|
128 |
|
|
## do the full prediction and save it if desired
|
129 |
|
|
if(saveFullPrediction){
|
130 |
|
|
s_raster=readRaster(filename=paste(path,"covariates.gri"))
|
131 |
|
|
predict(s_raster,mod,filename=paste(outpath,"_",sub("-","",date),"_prediction.tif",sep=""),format="GTiff",progress="text",fun="predict")
|
132 |
|
|
predict(s_raster,mod,filename=paste(outpath,"_",sub("-","",date),"_prediction.se.tif",sep=""),format="GTiff",progress="text",fun="predict.se")
|
133 |
|
|
}
|
134 |
|
|
|
135 |
|
|
## print progress
|
136 |
|
|
print(paste("Finshed Model:",mods$model[m]," for Date:",dates[i]))
|
137 |
|
|
## return the results table
|
138 |
|
|
return(tresults)
|
139 |
|
|
# end of the for loop #2 (nested in loop #1)
|
140 |
|
|
}))
|
141 |
|
|
print(paste("Finshed Date:",dates[i]))
|
142 |
|
|
return(results)
|
143 |
|
|
}
|
144 |
|
|
))
|
145 |
|
|
|
146 |
|
|
|
147 |
|
|
|
148 |
|
|
write.table(results_RMSE_all2, file= paste(path,"/","results_GAM_Assessment",out_prefix,"all.txt",sep=""), sep=",", col.names=TRUE)
|
149 |
|
|
|
150 |
|
|
resl=melt(results,id=c("run","date","model","ns"))
|
151 |
|
|
|
152 |
|
|
xyplot(value~date|variable,groups=model,data=resl,scales=list(y=list(relation="free")),auto.key=list()
|
153 |
|
|
|
154 |
|
|
|
155 |
|
|
|
156 |
|
|
###Analysing the results from the 365 days run: Summarize by month
|
157 |
|
|
#
|
158 |
|
|
# for(i in 1:nrow(results_table_RMSE)){
|
159 |
|
|
# date<-results_table_RMSE$dates[i]
|
160 |
|
|
# date<-strptime(date, "%Y%m%d")
|
161 |
|
|
# results_table_RMSE$month[i]<-as.integer(strftime(date, "%m"))
|
162 |
|
|
# }
|
163 |
|
|
#
|
164 |
|
|
# average<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE,mean, na.rm=TRUE)
|
165 |
|
|
# average<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, FUN=mean)
|
166 |
|
|
# #average on all the data.frame
|
167 |
|
|
# averaget<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month),FUN=mean, na.rm=TRUE)
|
168 |
|
|
# #mediant<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month),FUN=median, na.rm=TRUE)
|
169 |
|
|
# #average_lowt<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month), FUN=function(v) t.test(v)$conf.int[1])
|
170 |
|
|
# #average_up<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[2])
|
171 |
|
|
#
|
172 |
|
|
# median<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, median, na.rm=TRUE)
|
173 |
|
|
# average_low<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[1])
|
174 |
|
|
# average_up<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[2])
|
175 |
|
|
#
|
176 |
|
|
# mod<-names(averaget)
|
177 |
|
|
# mod<-mod[4:11]
|
178 |
|
|
# #Saving graphic plots
|
179 |
|
|
# for(i in 1:length(mod)){
|
180 |
|
|
# X11(width=14,height=10)
|
181 |
|
|
# name<-mod[i]
|
182 |
|
|
# barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main="Mean RMSE per month", names.arg=c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep","Oct", "Nov", "Dec"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
|
183 |
|
|
# #title(paste("Sampling RMSE for mod",i,sep=""))
|
184 |
|
|
# savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
|
185 |
|
|
# dev.off()
|
186 |
|
|
# }
|
187 |
|
|
#
|
188 |
|
|
#
|
189 |
|
|
# for(i in 1:length(mod)){
|
190 |
|
|
# X11(width=14,height=10)
|
191 |
|
|
# name<-mod[i]
|
192 |
|
|
# barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main=paste(" Mean RMSE per month ",name, sep=""), names.arg=c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep","Oct", "Nov", "Dec"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
|
193 |
|
|
# #title(paste("Sampling RMSE for mod",i,sep=""))
|
194 |
|
|
# savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
|
195 |
|
|
# dev.off()
|
196 |
|
|
#
|
197 |
|
|
# X11(width=14,height=10)
|
198 |
|
|
# name<-mod[i]
|
199 |
|
|
# hist(results_table_RMSE[[name]],breaks=15, main=paste(" Histogram RMSE_",name, sep=""),xlab=paste("RMSE ",name, sep=""))
|
200 |
|
|
# savePlot(paste("Hist_results_RMSE_365_",name,out_prefix,".png", sep=""), type="png")
|
201 |
|
|
# dev.off()
|
202 |
|
|
#
|
203 |
|
|
# }
|
204 |
|
|
#
|
205 |
|
|
# for(i in 1:length(mod)){
|
206 |
|
|
# X11(width=14,height=10)
|
207 |
|
|
# name<-mod[i]
|
208 |
|
|
# hist(results_table_RMSE[[name]],breaks=15, main=paste(" Histogram RMSE_",name, sep=""),xlab=paste("RMSE ",name, sep=""))
|
209 |
|
|
# savePlot(paste("Hist_results_RMSE_365_",name,out_prefix,".png", sep=""), type="png")
|
210 |
|
|
# dev.off()
|
211 |
|
|
# }
|
212 |
|
|
#
|
213 |
|
|
# r<-(results_RMSE_all2[,4:10]) #selecting only the columns related to models...
|
214 |
|
|
#
|
215 |
|
|
# mean_r<-sapply(r, mean)
|
216 |
|
|
# median_r<-sapply(r, median)
|
217 |
|
|
# sd_r<-sapply(r, sd)
|
218 |
|
|
#
|
219 |
|
|
# barplot(mean_r,ylim=c(20,26),ylab="RMSE in tenth deg C")
|
220 |
|
|
# barplot(median_r,ylim=c(20,26),ylab="RMSE in tenth deg C",add=TRUE,inside=FALSE,beside=TRUE) # put both on the same plot
|
221 |
|
|
# barplot(sd_r,ylim=c(6,8),ylab="RMSE in tenth deg C") # put both on the same plot
|
222 |
|
|
#
|
223 |
|
|
# height<-rbind(mean_r,median_r)
|
224 |
|
|
# barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE,legend=rownames(height))
|
225 |
|
|
# barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE, col=c("darkblue","red"),legend=rownames(height)) # put both on the same plot
|
226 |
|
|
# PNG(paste("Barplot_results_RMSE_sampling_",out_prefix,".png", sep=""))
|
227 |
|
|
#
|
228 |
|
|
# barplot2(mean_r,median_r,ylim=c(23,26),ylab="RMSE in tenth deg C") # put both on the same plot
|
229 |
|
|
# #Collect var explained and p values for each var...
|
230 |
|
|
#
|
231 |
|
|
# mod<-names(mean_r)
|
232 |
|
|
# average<-mean_r
|
233 |
|
|
# average_low<-mean_r-sd_r
|
234 |
|
|
# average_up<-mean_r+sd_r
|
235 |
|
|
#
|
236 |
|
|
# for(i in 1:length(mod)){
|
237 |
|
|
# #X11(width=14,height=10)
|
238 |
|
|
# name<-mod[i]
|
239 |
|
|
# barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main=paste(" Mean RMSE per month ",name, sep=""), names.arg=c("mod1", "mod2", "mod3", "mod4", "mod5", "mod6","mod7"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
|
240 |
|
|
# #title(paste("Sampling RMSE for mod",i,sep=""))
|
241 |
|
|
# #savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
|
242 |
|
|
# #dev.off()
|
243 |
|
|
# }
|
244 |
|
|
|
245 |
|
|
# End of script##########
|
246 |
|
|
|
247 |
|
|
#Selecting dates and files based on names
|
248 |
|
|
#cor_LST_LC<-matrix(1,10,1)
|
249 |
|
|
# for(i in 1:length(dates)){
|
250 |
|
|
# cor_LST_LC1[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC1)
|
251 |
|
|
# }
|
252 |
|
|
# for(i in 1:length(dates)){
|
253 |
|
|
# cor_LST_LC3[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC3)
|
254 |
|
|
# }
|