Project

General

Profile

Download (15.9 KB) Statistics
| Branch: | Revision:
1
##################    Interpolation of Tmax for 10 dates.  #######################################
2
###########################  TWO-STAGE REGRESSION  ###############################################
3
#This script interpolates station values for the Oregon case study using a two-stage regression. #
4
#For each input dates, it performs 1) Step 1: General Additive Model (GAM)                       #
5
#                                  2) Step 2: Kriging on residuals from step 1                   #
6
#                                                                                                #
7
#The script uses LST monthly averages as input variables and  loads the station data             # 
8
#from a shape file with projection information.                                                  #
9
#Note that this program:                                                                         #
10
#1)assumes that the shape file is in the current working.                                        # 
11
#2)extract relevant variables from raster images before performing the regressions.              #
12
#This scripts predicts tmax using ing GAM and LST derived from MOD11A1.                          #
13
#Interactions terms are also included and assessed using the RMSE from validation dataset.       #
14
#There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile.    #
15
#AUTHOR: Benoit Parmentier                                                                       #
16
#DATE: 05/09/212                                                                                 #
17
#PROJECT: NCEAS INPLAN: Environment and Organisms --TASK#364--                                   #
18
##################################################################################################
19

    
20
###Loading r library and packages                                                      # loading the raster package
21
library(gtools)                                                                        # loading ...
22
library(mgcv)
23
library(sp)
24
library(spdep)
25
library(rgdal)
26
library(gstat)
27

    
28
###Parameters and arguments
29

    
30
infile1<- "ghcn_or_tmax_b_04142012_OR83M.shp"
31
#infile2<-"dates_interpolation_03052012.txt"                                          #List of 10 dates for the regression
32
infile2<-"list_365_dates_04212012.txt"
33
infile3<-"LST_dates_var_names.txt"
34
infile4<-"models_interpolation_05142012.txt"
35
infile5<-"mean_day244_rescaled.rst" #Raster or grid for the locations of predictions
36

    
37
#path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations"         #Jupiter LOCATION on EOS
38
path<-"H:/Data/IPLANT_project/data_Oregon_stations"                                 #Jupiter Location on XANDERS
39
setwd(path) 
40
prop<-0.3                                                                           #Proportion of testing retained for validation   
41
seed_number<-100
42
out_prefix<-"_05142012_365d_Kr_LST"
43

    
44
#######START OF THE SCRIPT #############
45

    
46
###Reading the station data and setting up for models' comparison
47
filename<-sub(".shp","",infile1)            #Removing the extension from file.
48
ghcn<-readOGR(".", filename)                  #reading shapefile 
49

    
50
CRS<-proj4string(ghcn)
51

    
52
mean_LST<- readGDAL(infile5)  #This reads the whole raster in memory and provide a grid for kriging
53
proj4string(mean_LST)<-CRS #Assigning coordinates information
54

    
55
ghcn = transform(ghcn,Northness = cos(ASPECT)) #Adding a variable to the dataframe
56
ghcn = transform(ghcn,Eastness = sin(ASPECT))  #adding variable to the dataframe.
57
ghcn = transform(ghcn,Northness_w = sin(slope)*cos(ASPECT)) #Adding a variable to the dataframe
58
ghcn = transform(ghcn,Eastness_w = sin(slope)*sin(ASPECT))  #adding variable to the dataframe.
59

    
60
set.seed(100)
61
dates <-readLines(paste(path,"/",infile2, sep=""))
62
LST_dates <-readLines(paste(path,"/",infile3, sep=""))
63
models <-readLines(paste(path,"/",infile4, sep=""))
64

    
65
results <- matrix(1,length(dates),14)            #This is a matrix containing the diagnostic measures from the GAM models.
66

    
67
results_AIC<- matrix(1,length(dates),length(models)+2)  #Storing diagnostic statistics
68
results_GCV<- matrix(1,length(dates),length(models)+2)
69
results_DEV<- matrix(1,length(dates),length(models)+2)
70
results_RMSE<- matrix(1,length(dates),length(models)+2)
71
results_RMSE_kr<- matrix(1,length(dates),length(models)+2)
72
cor_LST_LC1<-matrix(1,10,1)      #correlation LST-LC1
73
cor_LST_LC3<-matrix(1,10,1)      #correlation LST-LC3
74
cor_LST_tmax<-matrix(1,10,1)    #correlation LST-tmax
75
#Screening for bad values
76

    
77
ghcn_all<-ghcn
78
ghcn_test<-subset(ghcn,ghcn$tmax>-150 & ghcn$tmax<400)
79
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0)
80
ghcn<-ghcn_test2
81
#coords<- ghcn[,c('x_OR83M','y_OR83M')]
82

    
83
month_var<-c("mm_01","mm_02","mm_03","mm_04","mm_05","mm_06","mm_07","mm_08","mm_09", "mm_10", "mm_11", "mm_12")
84
ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subsets data
85
#note that compare to the previous version date_ column was changed to date
86

    
87
## looping through the dates...
88
#Change this into  a nested loop, looping through the number of models
89

    
90
for(i in 1:length(dates)){            # start of the for loop #1
91
  date<-strptime(dates[i], "%Y%m%d")
92
  month<-strftime(date, "%m")
93
  LST_month<-paste("mm_",month,sep="")
94
  ###Regression part 1: Creating a validation dataset by creating training and testing datasets
95
  
96
  mod_LST <-ghcn.subsets[[i]][,match(LST_month, names(ghcn.subsets[[i]]))]
97
  ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod_LST)
98
  #Screening LST values
99
  #ghcn.subsets[[i]]<-subset(ghcn.subsets[[i]],ghcn.subsets[[i]]$LST> 258 & ghcn.subsets[[i]]$LST<313)
100
  n<-nrow(ghcn.subsets[[i]])
101
  ns<-n-round(n*prop)  #Create a sample from the data frame with 70% of the rows
102
  nv<-n-ns             #create a sample for validation with prop of the rows
103
  #ns<-n-round(n*prop)  #Create a sample from the data frame with 70% of the rows
104
  ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
105
  ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
106
  data_s <- ghcn.subsets[[i]][ind.training, ]
107
  data_v <- ghcn.subsets[[i]][ind.testing, ]
108

    
109
  ####Regression part 2: GAM models (REGRESSION STEP1)
110

    
111
  mod1<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s)
112
  mod2<- gam(tmax~ s(lat,lon,ELEV_SRTM), data=data_s)
113
  mod3<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) +  s (Northness)+ s (Eastness) + s(DISTOC), data=data_s)
114
  mod4<- gam(tmax~ s(lat) + s (lon) + s(ELEV_SRTM) + s(Northness) + s (Eastness) + s(DISTOC) + s(LST), data=data_s)
115
  mod5<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST), data=data_s)
116
  mod6<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC1), data=data_s)
117
  mod7<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC3), data=data_s)
118
  mod8<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST) + s(LC1), data=data_s)
119
  
120
  ####Regression part 3: Calculating and storing diagnostic measures
121
  #listmod can be created and looped over. In this case we loop around the objects..
122
  for (j in 1:length(models)){
123
    name<-paste("mod",j,sep="") #modj is the name of he "j" model (mod1 if j=1) 
124
    mod<-get(name)                   #accessing GAM model ojbect "j"
125
    results_AIC[i,1]<- dates[i]  #storing the interpolation dates in the first column
126
    results_AIC[i,2]<- ns        #number of stations used in the training stage
127
    results_AIC[i,j+2]<- AIC (mod)
128
  
129
    results_GCV[i,1]<- dates[i]  #storing the interpolation dates in the first column
130
    results_GCV[i,2]<- ns        #number of stations used in the training stage
131
    results_GCV[i,j+2]<- mod$gcv.ubre
132
  
133
    results_DEV[i,1]<- dates[i]  #storing the interpolation dates in the first column
134
    results_DEV[i,2]<- ns        #number of stations used in the training stage
135
    results_DEV[i,j+2]<- mod$deviance
136

    
137
    #####VALIDATION: Prediction checking the results using the testing data########
138
 
139
    #Automate this using a data frame of size??
140
    y_mod<- predict(mod, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values.
141
    res_mod<- data_v$tmax - y_mod$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation
142
    RMSE_mod <- sqrt(sum(res_mod^2)/nv) #RMSE FOR REGRESSION STEP 1: GAM     
143

    
144
    results_RMSE[i,1]<- dates[i]  #storing the interpolation dates in the first column
145
    results_RMSE[i,2]<- ns        #number of stations used in the training stage
146
    results_RMSE[i,j+2]<- RMSE_mod
147
  
148
    #Saving residuals and prediction in the dataframes: tmax predicted from GAM
149
    pred<-paste("pred_mod",j,sep="")
150
    data_v[[pred]]<-as.numeric(y_mod$fit)
151
    data_s[[pred]]<-as.numeric(mod$fit) #Storing model fit values (predicted on training sample)
152
   
153
    name2<-paste("res_mod",j,sep="")
154
    data_v[[name2]]<-as.numeric(res_mod)
155
    data_s[[name2]]<-as.numeric(mod$residuals)
156
    #end of loop calculating RMSE
157
    #NEED TO ADD BIAS AND MAE
158
    
159
    }
160
  
161
  ###BEFORE Kringing the data object must be transformed to SDF
162
  
163
  coords<- data_v[,c('x_OR83M','y_OR83M')]
164
  coordinates(data_v)<-coords
165
  proj4string(data_v)<-CRS  #Need to assign coordinates...
166
  coords<- data_s[,c('x_OR83M','y_OR83M')]
167
  coordinates(data_s)<-coords
168
  proj4string(data_s)<-CRS  #Need to assign coordinates..
169
  
170
  #KRIGING ON GAM RESIDUALS: REGRESSION STEP2
171

    
172
  for (j in 1:length(models)){
173
    name<-paste("res_mod",j,sep="")
174
    data_s$residuals<-data_s[[name]]
175
    X11()
176
    hscat(residuals~1,data_s,(0:9)*20000) # 9 lag classes with 20,000m width
177
    v<-variogram(residuals~1, data_s)   
178
    plot(v)                               # This plot may be saved at a later stage...
179
    dev.off()
180
    v.fit<-fit.variogram(v,vgm(1,"Sph", 150000,1))
181
    res_krige<-krige(residuals~1, data_s,mean_LST, v.fit)#mean_LST provides the data grid/raster image for the kriging locations.
182
  
183
    res_krig1_s <- overlay(res_krige,data_s)             #This overlays the kriged surface tmax and the location of weather stations
184
    res_krig1_v <- overlay(res_krige,data_v)             #This overlays the kriged surface tmax and the location of weather stations
185
  
186
    name2<-paste("pred_kr_mod",j,sep="")
187
    #Adding the results back into the original dataframes.
188
    data_s[[name2]]<-res_krig1_s$var1.pred
189
    data_v[[name2]]<-res_krig1_v$var1.pred  
190
    
191
    #NEED TO ADD IT BACK TO THE PREDICTION FROM GAM
192
    gam_kr<-paste("pred_gam_kr",j,sep="")
193
    pred_gam<-paste("pred_mod",j,sep="")
194
    data_s[[gam_kr]]<-data_s[[pred_gam]]+ data_s[[name2]]
195
    data_v[[gam_kr]]<-data_v[[pred_gam]]+ data_v[[name2]]
196
    
197
    #Calculate RMSE and then krig the residuals....!
198
  
199
    res_mod_kr_s<- data_s$tmax - data_s[[gam_kr]]           #Residuals from kriging.
200
    res_mod_kr_v<- data_v$tmax - data_v[[gam_kr]]           #Residuals from cokriging.
201
  
202
    RMSE_mod_kr_s <- sqrt(sum(res_mod_kr_s^2,na.rm=TRUE)/(nv-sum(is.na(res_mod_kr_s))))                  #RMSE from kriged surface.
203
    RMSE_mod_kr_v <- sqrt(sum(res_mod_kr_v^2,na.rm=TRUE)/(nv-sum(is.na(res_mod_kr_v))))                  #RMSE from co-kriged surface.
204
    #(nv-sum(is.na(res_mod2)))
205
    #Writing out results
206
  
207
    results_RMSE_kr[i,1]<- dates[i]  #storing the interpolation dates in the first column
208
    results_RMSE_kr[i,2]<- ns        #number of stations used in the training stage
209
    results_RMSE_kr[i,j+2]<- RMSE_mod_kr_v
210
    #results_RMSE_kr[i,3]<- res_mod_kr_v
211
    name3<-paste("res_kr_mod",j,sep="")
212
    #as.numeric(res_mod)
213
    #data_s[[name3]]<-res_mod_kr_s
214
    data_s[[name3]]<-as.numeric(res_mod_kr_s)
215
    #data_v[[name3]]<-res_mod_kr_v 
216
    data_v[[name3]]<-as.numeric(res_mod_kr_v)
217
    #Writing residuals from kriging
218
    
219
    }
220
  
221
  ###SAVING THE DATA FRAME IN SHAPEFILES AND TEXTFILES
222
  
223
  data_name<-paste("ghcn_v_",out_prefix,"_",dates[[i]],sep="")
224
  assign(data_name,data_v)
225
  #write.table(data_v, file= paste(path,"/",data_name,".txt",sep=""), sep=" ")
226
  #write out a new shapefile (including .prj component)
227
  #outfile<-sub(".shp","",data_name)   #Removing extension if it is present
228
  #writeOGR(data_v,".", outfile, driver ="ESRI Shapefile")
229
  
230
  data_name<-paste("ghcn_s_",out_prefix,"_",dates[[i]],sep="")
231
  assign(data_name,data_s)
232
  #write.table(data_s, file= paste(path,"/",data_name,".txt",sep=""), sep=" ")
233
  #outfile<-sub(".shp","",data_name)   #Removing extension if it is present
234
  #writeOGR(data_s,".", outfile, driver ="ESRI Shapefile")
235
  
236
  # end of the for loop1
237
  
238
  }
239

    
240
## Plotting and saving diagnostic measures
241

    
242
results_RMSEnum <-results_RMSE
243
results_AICnum <-results_AIC
244
mode(results_RMSEnum)<- "numeric"         # Make it numeric first
245
mode(results_AICnum)<- "numeric"          # Now turn it into a data.frame...
246

    
247
results_table_RMSE<-as.data.frame(results_RMSEnum)
248
results_table_AIC<-as.data.frame(results_AICnum)
249
colnames(results_table_RMSE)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6", "mod7")
250
colnames(results_table_AIC)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6", "mod7")
251

    
252
results_RMSE_kr_num <-results_RMSE_kr
253
mode(results_RMSE_kr_num)<- "numeric"         # Make it numeric first
254
results_table_RMSE_kr<-as.data.frame(results_RMSE_kr_num)
255
colnames(results_table_RMSE_kr)<-c("dates","ns","mod1k", "mod2k","mod3k", "mod4k", "mod5k", "mod6k", "mod7k")
256
#results_table_RMSE
257

    
258
write.table(results_table_RMSE, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep=""), sep=",")
259
write.table(results_table_AIC, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep=""),sep=",", append=TRUE)
260
write.table(results_table_RMSE_kr, file= paste(path,"/","results_GAM_Assessment_kr",out_prefix,".txt",sep=""), sep=",")
261

    
262
##Visualization of results##
263

    
264
for(i in 1:length(dates)){
265
  X11()
266
  RMSE_kr<-results_table_RMSE_kr[i,]
267
  RMSE_ga<-results_table_RMSE[i,]
268
  
269
  RMSE_kr<-RMSE_kr[,1:length(models)+2]
270
  RMSE_ga<-RMSE_ga[,1:length(models)+2]
271
  colnames(RMSE_kr)<-names(RMSE_ga)
272
  height<-rbind(RMSE_ga,RMSE_kr)
273
  rownames(height)<-c("GAM","GAM_KR")
274
  height<-as.matrix(height)
275
  barplot(height,ylim=c(14,36),ylab="RMSE in tenth deg C",beside=TRUE,
276
          legend.text=rownames(height),
277
          args.legend=list(x="topright"),
278
          main=paste("RMSE for date ",dates[i], sep=""))
279
  savePlot(paste("Barplot_results_RMSE_GAM_KR_",dates[i],out_prefix,".png", sep=""), type="png")
280
  dev.off()
281
  }
282
  
283
r1<-(results_table_RMSE[,3:10]) #selecting only the columns related to models and method 1
284
r2<-(results_table_RMSE_kr[,3:10]) #selecting only the columns related to models and method 1
285
mean_r1<-mean(r1)
286
mean_r2<-mean(r2)
287
median_r1<-sapply(r1, median)   #Calulcating the mean for every model (median of columns)
288
median_r2<-sapply(r2, median)
289
sd_r1<-sapply(r1, sd)
290
sd_r2<-sapply(r2, sd)
291

    
292
barplot(mean_r1,ylim=c(23,26),ylab="RMSE in tenth deg C")
293
barplot(mean_r2,ylim=c(23,26),ylab="RMSE in tenth deg C")
294
barplot(median_r1,ylim=c(23,26),ylab="RMSE in tenth deg C",add=TRUE,inside=FALSE,beside=TRUE) # put both on the same plot
295
barplot(median_r2,ylim=c(23,26),ylab="RMSE in tenth deg C",add=TRUE,inside=FALSE,beside=TRUE) # put both on the same plot
296

    
297
barplot(sd_r1,ylim=c(6,8),ylab="RMSE in tenth deg C") # put both on the same plot
298
barplot(sd_r2,ylim=c(6,8),ylab="RMSE in tenth deg C") # put both on the same plot
299

    
300
height<-rbind(mean_r1,mean_r2)
301
barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE,legend=rownames(height))
302
barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE, col=c("darkblue","red"),legend=rownames(height)) # put both on the same plot
303

    
304
height<-rbind(median_r1,median_r2)
305
barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE,legend=rownames(height))
306
barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE, col=c("darkblue","red"),legend=rownames(height)) # put both on the same plot
307

    
308
height<-rbind(mean_r2,median_r2)
309
barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE,legend=rownames(height))
310
barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE, col=c("darkblue","red"),legend=rownames(height)) # put both on the same plot
311

    
312
#barplot2(mean_r,median_r,ylim=c(23,26),ylab="RMSE in tenth deg C") # put both on the same plot
313
#Collect var explained and p values for each var...
314

    
315
### End of script  ##########
316

    
317

    
318

    
(4-4/31)