Project

General

Profile

Download (12.9 KB) Statistics
| Branch: | Revision:
1
####################Interpolation of Tmax for 10 dates.#####################
2
#This script interpolates station tmax values for the Oregon case study.It provides a mean to asssess the effect of random sampling and proportion
3
# of validation hold out on the RMSE.This program loads the station data from a csv file 
4
#and perform one type of regression:  general additive model (GAM) with different variables: 
5
# Lat, long, ELEV_SRTM, Eastness, Northness, DISTOC, mean_LST_monthly, Land Cover proportions.
6
#Note that this program:
7
#1)assumes that the csv file is in the current working 
8
#2)extract relevant variables from raster images before performing the regressions. 
9
#3)does not clear memory workspace at the start or end of script.
10
#This scripts predicts tmax using GAM and LST derived from MOD11A1.
11
#Interactions terms are also included and assessed using the RMSE from validation dataset.
12
#There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile.
13
#Script created by Benoit Parmentier on April 25, 2012. 
14

    
15
###Loading r library and packages                                                      # loading the raster package
16
library(gtools)                                                                        # loading ...
17
library(mgcv)
18
library(sp)
19
library(spdep)
20
library(rgdal)
21
library(multicore)  # if installed allows easy parallelization
22
library(reshape)    # very useful for switching from 'wide' to 'long' data formats
23

    
24
###Parameters and arguments
25

    
26
infile1<-"ghcn_or_ppt_covariates_20120705_OR83M.shp"
27
path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations"
28
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations"
29
                                        #path<-"H:/Data/IPLANT_project/data_Oregon_stations"
30
path="/home/wilson/data"
31
setwd(path) 
32
infile2<-"dates_interpolation_03052012.txt"               #List of 10 dates for the regression
33
prop<-0.3  #Proportion of testing retained for validation   
34
n_runs<- 30 #Number of runs
35
out_prefix<-"_20120705_run01_LST"
36
infile3<-"LST_dates_var_names.txt"     #List of LST averages.
37
infile4<-"models_interpolation_05142012.txt"
38

    
39
#######START OF THE SCRIPT #############
40

    
41
###Reading the station data and setting up for models' comparison
42
filename<-sub(".shp","",infile1)              #Removing the extension from file.
43
ghcn<-readOGR(".", filename)                  #reading shapefile 
44
                  
45
ghcn = transform(ghcn,Northness = cos(ASPECT*pi/180)) #Adding a variable to the dataframe
46
ghcn = transform(ghcn,Eastness = sin(ASPECT*pi/180))  #adding variable to the dataframe.
47
ghcn = transform(ghcn,Northness_w = sin(slope*pi/180)*cos(ASPECT*pi/180)) #Adding a variable to the dataframe
48
ghcn = transform(ghcn,Eastness_w = sin(slope*pi/180)*sin(ASPECT*pi/180))  #adding variable to the dataframe.
49
                                              #Note that "transform" output is a data.frame not spatial object 
50
#set.seed(100) #modify this to a seed variable allowing different runs.
51

    
52
 dates <-readLines(paste(path,"/",infile2, sep=""))
53
LST_dates <-readLines(paste(path,"/",infile3, sep=""))
54
models <-readLines(paste(path,"/",infile4, sep=""))
55

    
56
#results <- matrix(1,length(dates),14)            #This is a matrix containing the diagnostic measures from the GAM models.
57

    
58
####  Define GAM models
59
var="tmax"
60

    
61
mods=data.frame(formula=c(    
62
                  paste(var,"~ s(lat) + s (lon) + s (ELEV_SRTM)",sep=""),
63
                  paste(var,"~ s(lat,lon,ELEV_SRTM)",sep=""),
64
                  paste(var,"~ s(lat) + s (lon) + s (ELEV_SRTM) +  s (Northness)+ s (Eastness) + s(DISTOC)",sep=""),
65
                  paste(var,"~ s(lat) + s (lon) + s(ELEV_SRTM) + s(Northness) + s (Eastness) + s(DISTOC) + s(LST)",sep=""),
66
                  paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST)",sep=""),
67
                  paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC1)",sep=""),
68
                  paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC3)",sep=""),
69
                  paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST) + s(LC1)",sep="")
70
                  ),stringsAsFactors=F)
71
mods$model=paste("mod",1:nrow(mods),sep="")
72

    
73

    
74
### subset dataset?
75
ghcn_all<-ghcn
76
ghcn_test<-subset(ghcn,ghcn$tmax>-150 & ghcn$tmax<400)
77
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0)
78
ghcn<-ghcn_test2
79

    
80

    
81
## loop through the dates...
82
  
83
ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subset data
84
  
85
 results=do.call(rbind.data.frame,                   # Collect the results in a single data.frame
86
   mclapply(1:length(dates),function(i) {            # loop over dates
87
     date<-strptime(dates[i], "%Y%m%d")              # get date
88
     month<-strftime(date, "%m")                     # get month
89
     LST_month<-paste("mm_",month,sep="")            # get LST_month name
90

    
91
     ##Regression part 1: Creating a validation dataset by creating training and testing datasets
92
     mod <-ghcn.subsets[[i]][,match(LST_month, names(ghcn.subsets[[i]]))]
93
     ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod)
94
     ##Screening LST values
95
     ##ghcn.subsets[[i]]<-subset(ghcn.subsets[[i]],ghcn.subsets[[i]]$LST> 258 & ghcn.subsets[[i]]$LST<313)
96
     n<-nrow(ghcn.subsets[[i]])
97
     ns<-n-round(n*prop)  #Create a sample from the data frame with 70% of the rows
98
     nv<-n-ns             #create a sample for validation with prop of the rows
99
     ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
100
     ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
101
     data_s <- ghcn.subsets[[i]][ind.training, ]
102
     data_v <- ghcn.subsets[[i]][ind.testing, ]
103
     
104
     ## lapply loops through models for the ith day, calculates the validation metrics, and saves them as R objects
105
     results=do.call(rbind.data.frame,
106
       lapply(1:nrow(mods),function(m,savemodel=F,saveFullPrediction=T) {  
107
         ## run the model
108
         mod=gam(formula(mods$formula[m]),data=data_s)
109
         
110
         ##VALIDATION: Prediction checking the results using the testing data########
111
         y_mod<- predict(mod, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values.
112
         res_mod<- data_v$tmax - y_mod$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation
113
         
114
         ##Regression part 3: Calculating and storing diagnostic measures
115
         tresults=data.frame(            # build 1-row dataframe for this model-date
116
           date=dates[i],                # interpolation date
117
           model=mods$model[m],          # model number
118
           ns=ns,                        # number of stations used in the training stage
119
           AIC=AIC(mod),                # AIC
120
           GCV=mod$gcv.ubre,             # GCV
121
           DEV=mod$deviance,             # Deviance
122
           RMSE=sqrt(sum(res_mod^2)/nv)  # RMSE
123
           )
124
         
125
         ## Save the model object if desired
126
         if(savemodel)  save(mod,file= paste(path,"/","results_GAM_Model","_",out_prefix,"_",dates[i],".Rdata",sep=""))
127

    
128
         ## do the full prediction and save it if desired
129
         if(saveFullPrediction){
130
           s_raster=readRaster(filename=paste(path,"covariates.gri"))
131
           predict(s_raster,mod,filename=paste(outpath,"_",sub("-","",date),"_prediction.tif",sep=""),format="GTiff",progress="text",fun="predict")
132
           predict(s_raster,mod,filename=paste(outpath,"_",sub("-","",date),"_prediction.se.tif",sep=""),format="GTiff",progress="text",fun="predict.se")
133
         }
134
         
135
         ## print progress
136
         print(paste("Finshed Model:",mods$model[m]," for Date:",dates[i]))
137
         ## return the results table
138
         return(tresults)
139
                                        # end of the for loop #2 (nested in loop #1)
140
       }))
141
     print(paste("Finshed Date:",dates[i]))
142
     return(results)
143
   }
144
            ))
145

    
146

    
147

    
148
  write.table(results_RMSE_all2, file= paste(path,"/","results_GAM_Assessment",out_prefix,"all.txt",sep=""), sep=",", col.names=TRUE)
149

    
150
resl=melt(results,id=c("run","date","model","ns"))
151

    
152
  xyplot(value~date|variable,groups=model,data=resl,scales=list(y=list(relation="free")),auto.key=list()
153

    
154
  
155

    
156
###Analysing the results from the 365 days run: Summarize by month
157
# 
158
# for(i in 1:nrow(results_table_RMSE)){
159
#   date<-results_table_RMSE$dates[i]
160
#   date<-strptime(date, "%Y%m%d")
161
#   results_table_RMSE$month[i]<-as.integer(strftime(date, "%m"))
162
# }
163
# 
164
# average<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE,mean, na.rm=TRUE)
165
# average<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, FUN=mean)
166
# #average on all the data.frame
167
# averaget<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month),FUN=mean, na.rm=TRUE)
168
# #mediant<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month),FUN=median, na.rm=TRUE)
169
# #average_lowt<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month), FUN=function(v) t.test(v)$conf.int[1])
170
# #average_up<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[2])
171
# 
172
# median<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, median, na.rm=TRUE)
173
# average_low<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[1])
174
# average_up<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[2])
175
# 
176
# mod<-names(averaget)
177
# mod<-mod[4:11]
178
# #Saving graphic plots
179
# for(i in 1:length(mod)){
180
#   X11(width=14,height=10)
181
#   name<-mod[i]
182
#   barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main="Mean RMSE per month", names.arg=c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep","Oct", "Nov", "Dec"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
183
#   #title(paste("Sampling RMSE for mod",i,sep=""))
184
#   savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
185
#   dev.off() 
186
# }
187
# 
188
# 
189
# for(i in 1:length(mod)){
190
#   X11(width=14,height=10)
191
#   name<-mod[i]
192
#   barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main=paste(" Mean RMSE per month ",name, sep=""), names.arg=c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep","Oct", "Nov", "Dec"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
193
#   #title(paste("Sampling RMSE for mod",i,sep=""))
194
#   savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
195
#   dev.off() 
196
#   
197
#   X11(width=14,height=10)
198
#   name<-mod[i]
199
#   hist(results_table_RMSE[[name]],breaks=15, main=paste(" Histogram RMSE_",name, sep=""),xlab=paste("RMSE ",name, sep=""))
200
#   savePlot(paste("Hist_results_RMSE_365_",name,out_prefix,".png", sep=""), type="png")
201
#   dev.off()
202
#   
203
# }
204
# 
205
# for(i in 1:length(mod)){
206
#   X11(width=14,height=10)
207
#   name<-mod[i]
208
#   hist(results_table_RMSE[[name]],breaks=15, main=paste(" Histogram RMSE_",name, sep=""),xlab=paste("RMSE ",name, sep=""))
209
#   savePlot(paste("Hist_results_RMSE_365_",name,out_prefix,".png", sep=""), type="png")
210
#   dev.off()
211
# }
212
# 
213
# r<-(results_RMSE_all2[,4:10]) #selecting only the columns related to models...
214
# 
215
# mean_r<-sapply(r, mean)
216
# median_r<-sapply(r, median)
217
# sd_r<-sapply(r, sd)
218
# 
219
# barplot(mean_r,ylim=c(20,26),ylab="RMSE in tenth deg C")
220
# barplot(median_r,ylim=c(20,26),ylab="RMSE in tenth deg C",add=TRUE,inside=FALSE,beside=TRUE) # put both on the same plot
221
# barplot(sd_r,ylim=c(6,8),ylab="RMSE in tenth deg C") # put both on the same plot
222
# 
223
# height<-rbind(mean_r,median_r)
224
# barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE,legend=rownames(height))
225
# barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE, col=c("darkblue","red"),legend=rownames(height)) # put both on the same plot
226
# PNG(paste("Barplot_results_RMSE_sampling_",out_prefix,".png", sep=""))
227
# 
228
# barplot2(mean_r,median_r,ylim=c(23,26),ylab="RMSE in tenth deg C") # put both on the same plot
229
# #Collect var explained and p values for each var...
230
# 
231
# mod<-names(mean_r)
232
# average<-mean_r
233
# average_low<-mean_r-sd_r
234
# average_up<-mean_r+sd_r
235
# 
236
# for(i in 1:length(mod)){
237
#   #X11(width=14,height=10)
238
#   name<-mod[i]
239
#   barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main=paste(" Mean RMSE per month ",name, sep=""), names.arg=c("mod1", "mod2", "mod3", "mod4", "mod5", "mod6","mod7"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
240
#   #title(paste("Sampling RMSE for mod",i,sep=""))
241
#   #savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
242
#   #dev.off()
243
#   }
244

    
245
# End of script##########
246

    
247
#Selecting dates and files based on names
248
#cor_LST_LC<-matrix(1,10,1)
249
# for(i in 1:length(dates)){
250
#   cor_LST_LC1[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC1)
251
# }
252
# for(i in 1:length(dates)){
253
#   cor_LST_LC3[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC3)
254
# }
255

    
(2-2/9)