Project

General

Profile

« Previous | Next » 

Revision ab8957ca

Added by Benoit Parmentier over 12 years ago

GAM LST, modify code to generalize number of models and assessment,GAM+Kriging task #364

View differences:

climate/research/oregon/interpolation/GAM_LST_reg.R
1
####################Interpolation of Tmax for 10 dates.#####################
2
#This script interpolates station values for the Oregon case study using a two stage regression.
3
#This program loads the station data from a shape file and perform 8 regressions using general additive model (GAM) followed by kriging on the residuals.
4
#It uses LST monthly averages as input variables.
5
#Note that this program:
6
#1)assumes that the shape file is in the current working 
7
#2)extract relevant variables from raster images before performing the regressions. 
8
#This scripts predicts tmax using ing GAM and LST derived from MOD11A1.
9
#Interactions terms are also included and assessed using the RMSE from validation dataset.
10
#There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile.
11
#Script created by Benoit Parmentier on May 6, 2012. 
1
##################    Interpolation of Tmax for 10 dates.  #######################################
2
###########################  TWO-STAGE REGRESSION  ###############################################
3
#This script interpolates station values for the Oregon case study using a two-stage regression. #
4
#For each input dates, it performs 1) Step 1: General Additive Model (GAM)                       #
5
#                                  2) Step 2: Kriging on residuals from step 1                   #
6
#                                                                                                #
7
#The script uses LST monthly averages as input variables and  loads the station data             # 
8
#from a shape file with projection information.                                                  #
9
#Note that this program:                                                                         #
10
#1)assumes that the shape file is in the current working.                                        # 
11
#2)extract relevant variables from raster images before performing the regressions.              #
12
#This scripts predicts tmax using ing GAM and LST derived from MOD11A1.                          #
13
#Interactions terms are also included and assessed using the RMSE from validation dataset.       #
14
#There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile.    #
15
#AUTHOR: Benoit Parmentier                                                                       #
16
#DATE: 05/09/212                                                                                 #
17
#PROJECT: NCEAS INPLAN: Environment and Organisms --TASK#364--                                   #
18
##################################################################################################
12 19

  
13 20
###Loading r library and packages                                                      # loading the raster package
14 21
library(gtools)                                                                        # loading ...
......
21 28
###Parameters and arguments
22 29

  
23 30
infile1<-"ghcn_or_tmax_b_04142012_OR83M.shp"
24
#path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations"
25
path<-"H:/Data/IPLANT_project/data_Oregon_stations"
26
setwd(path) 
27
infile2<-"dates_interpolation_03052012_2dates_test.txt"                                          #List of 10 dates for the regression
28
prop<-0.3                                                                            #Proportion of testing retained for validation   
29
out_prefix<-"_05062012_Kr_LST"
31
#infile2<-"dates_interpolation_03052012_2dates_test.txt"
32
infile2<-"dates_interpolation_03052012.txt"                                          #List of 10 dates for the regression
30 33
infile3<-"LST_dates_var_names.txt"
31
infile4<-"models_interpolation_04032012b.txt"
34
infile4<-"models_interpolation_04032012.txt"
35
infile5<-"mean_day244_rescaled.rst" #Raster or grid for the locations of predictions
36

  
37
#path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations"         #Jupiter LOCATION on EOS
38
path<-"H:/Data/IPLANT_project/data_Oregon_stations"                                 #Jupiter Location on XANDERS
39
setwd(path) 
40
prop<-0.3                                                                           #Proportion of testing retained for validation   
41
seed_number<-100
42
out_prefix<-"_05062012m_Kr_LST"
32 43

  
33 44
#######START OF THE SCRIPT #############
34 45

  
35 46
###Reading the station data and setting up for models' comparison
36
filename<-sub(".shp","",infile1)              #Removing the extension from file.
47
filename<-sub(".shp","",infile1)            #Removing the extension from file.
37 48
ghcn<-readOGR(".", filename)                  #reading shapefile 
38 49

  
39 50
CRS<-proj4string(ghcn)
40 51

  
41
mean_LST<- readGDAL("mean_day244_rescaled.rst")  #This reads the whole raster in memory and provide a grid for kriging
52
mean_LST<- readGDAL(infile5)  #This reads the whole raster in memory and provide a grid for kriging
42 53
proj4string(mean_LST)<-CRS #Assigning coordinates information
43 54

  
44

  
45 55
ghcn = transform(ghcn,Northness = cos(ASPECT)) #Adding a variable to the dataframe
46 56
ghcn = transform(ghcn,Eastness = sin(ASPECT))  #adding variable to the dataframe.
47 57
ghcn = transform(ghcn,Northness_w = sin(slope)*cos(ASPECT)) #Adding a variable to the dataframe
......
54 64

  
55 65
results <- matrix(1,length(dates),14)            #This is a matrix containing the diagnostic measures from the GAM models.
56 66

  
57
results_AIC<- matrix(1,length(dates),length(models)+2)  
67
results_AIC<- matrix(1,length(dates),length(models)+2)  #Storing diagnostic statistics
58 68
results_GCV<- matrix(1,length(dates),length(models)+2)
59 69
results_DEV<- matrix(1,length(dates),length(models)+2)
60 70
results_RMSE<- matrix(1,length(dates),length(models)+2)
......
83 93
  LST_month<-paste("mm_",month,sep="")
84 94
  ###Regression part 1: Creating a validation dataset by creating training and testing datasets
85 95
  
86
  mod <-ghcn.subsets[[i]][,match(LST_month, names(ghcn.subsets[[i]]))]
87
  ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod)
96
  mod_LST <-ghcn.subsets[[i]][,match(LST_month, names(ghcn.subsets[[i]]))]
97
  ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod_LST)
88 98
  #Screening LST values
89 99
  #ghcn.subsets[[i]]<-subset(ghcn.subsets[[i]],ghcn.subsets[[i]]$LST> 258 & ghcn.subsets[[i]]$LST<313)
90 100
  n<-nrow(ghcn.subsets[[i]])
......
95 105
  ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
96 106
  data_s <- ghcn.subsets[[i]][ind.training, ]
97 107
  data_v <- ghcn.subsets[[i]][ind.testing, ]
98
  #mod <-data_s[,match(LST_dates[i], names(data_s))]
99
  #data_s = transform(data_s,LST = mod)
100
  #data_v = transform(data_v,LST = mod)
101
  ####Regression part 2: GAM models
108

  
109
  ####Regression part 2: GAM models (REGRESSION STEP1)
102 110

  
103 111
  mod1<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s)
104 112
  mod2<- gam(tmax~ s(lat,lon,ELEV_SRTM), data=data_s)
......
109 117
  mod7<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC3), data=data_s)
110 118
  
111 119
  ####Regression part 3: Calculating and storing diagnostic measures
120
  #listmod can be created and looped over. In this case we loop around the objects..
121
  for (j in 1:length(models)){
122
    name<-paste("mod",j,sep="") #modj is the name of he "j" model (mod1 if j=1) 
123
    mod<-get(name)                   #accessing GAM model ojbect "j"
124
    results_AIC[i,1]<- dates[i]  #storing the interpolation dates in the first column
125
    results_AIC[i,2]<- ns        #number of stations used in the training stage
126
    results_AIC[i,j+2]<- AIC (mod)
112 127
  
113
  results_AIC[i,1]<- dates[i]  #storing the interpolation dates in the first column
114
  results_AIC[i,2]<- ns        #number of stations used in the training stage
115
  results_AIC[i,3]<- AIC (mod1)
116
  results_AIC[i,4]<- AIC (mod2)
117
  results_AIC[i,5]<- AIC (mod3)
118
  results_AIC[i,6]<- AIC (mod4)
119
  results_AIC[i,7]<- AIC (mod5)
120
  results_AIC[i,8]<- AIC (mod6)
121
  results_AIC[i,9]<- AIC (mod7)
122
  
123
  results_GCV[i,1]<- dates[i]  #storing the interpolation dates in the first column
124
  results_GCV[i,2]<- ns        #number of stations used in the training stage
125
  results_GCV[i,3]<- mod1$gcv.ubre
126
  results_GCV[i,4]<- mod2$gcv.ubre
127
  results_GCV[i,5]<- mod3$gcv.ubre
128
  results_GCV[i,6]<- mod4$gcv.ubre
129
  results_GCV[i,7]<- mod5$gcv.ubre
130
  results_GCV[i,8]<- mod6$gcv.ubre
131
  results_GCV[i,9]<- mod7$gcv.ubre
132
  
133
  results_DEV[i,1]<- dates[i]  #storing the interpolation dates in the first column
134
  results_DEV[i,2]<- ns        #number of stations used in the training stage
135
  results_DEV[i,3]<- mod1$deviance
136
  results_DEV[i,4]<- mod2$deviance
137
  results_DEV[i,5]<- mod3$deviance
138
  results_DEV[i,6]<- mod4$deviance
139
  results_DEV[i,7]<- mod5$deviance
140
  results_DEV[i,8]<- mod6$deviance
141
  results_DEV[i,9]<- mod7$deviance
128
    results_GCV[i,1]<- dates[i]  #storing the interpolation dates in the first column
129
    results_GCV[i,2]<- ns        #number of stations used in the training stage
130
    results_GCV[i,j+2]<- mod$gcv.ubre
142 131
  
143
  #####VALIDATION: Prediction checking the results using the testing data########
132
    results_DEV[i,1]<- dates[i]  #storing the interpolation dates in the first column
133
    results_DEV[i,2]<- ns        #number of stations used in the training stage
134
    results_DEV[i,j+2]<- mod$deviance
135

  
136
    #####VALIDATION: Prediction checking the results using the testing data########
144 137
 
145
  #Automate this using a data frame of size??
146
  y_mod1<- predict(mod1, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values.
147
  y_mod2<- predict(mod2, newdata=data_v, se.fit = TRUE)            
148
  y_mod3<- predict(mod3, newdata=data_v, se.fit = TRUE) 
149
  y_mod4<- predict(mod4, newdata=data_v, se.fit = TRUE) 
150
  y_mod5<- predict(mod5, newdata=data_v, se.fit = TRUE) 
151
  y_mod6<- predict(mod6, newdata=data_v, se.fit = TRUE)
152
  y_mod7<- predict(mod7, newdata=data_v, se.fit = TRUE)
153
  
154
  res_mod1<- data_v$tmax - y_mod1$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation
155
  res_mod2<- data_v$tmax - y_mod2$fit   #Residuals for GAM model that resembles the PRISM interpolation                               
156
  res_mod3<- data_v$tmax - y_mod3$fit  
157
  res_mod4<- data_v$tmax - y_mod4$fit
158
  res_mod5<- data_v$tmax - y_mod5$fit
159
  res_mod6<- data_v$tmax - y_mod6$fit
160
  res_mod7<- data_v$tmax - y_mod7$fit
161
  
162
  RMSE_mod1 <- sqrt(sum(res_mod1^2)/nv)          
163
  RMSE_mod2 <- sqrt(sum(res_mod2^2)/nv)
164
  RMSE_mod3 <- sqrt(sum(res_mod3^2)/nv)
165
  RMSE_mod4 <- sqrt(sum(res_mod4^2)/nv)
166
  RMSE_mod5 <- sqrt(sum(res_mod5^2)/nv)
167
  RMSE_mod6 <- sqrt(sum(res_mod6^2)/nv)
168
  RMSE_mod7 <- sqrt(sum(res_mod7^2)/nv)
169

  
170
  results_RMSE[i,1]<- dates[i]  #storing the interpolation dates in the first column
171
  results_RMSE[i,2]<- ns        #number of stations used in the training stage
172
  results_RMSE[i,3]<- RMSE_mod1
173
  results_RMSE[i,4]<- RMSE_mod2
174
  results_RMSE[i,5]<- RMSE_mod3
175
  results_RMSE[i,6]<- RMSE_mod4
176
  results_RMSE[i,7]<- RMSE_mod5
177
  results_RMSE[i,8]<- RMSE_mod6
178
  results_RMSE[i,9]<- RMSE_mod7
179
  
180
  #Saving dataset in dataframes: residuals from RMSE
181
  
182
#   data_v$mod1<-y_mod1$fit
183
#   data_v$mod2<-y_mod2$fit
184
#   data_v$mod3<-y_mod3$fit
185
#   data_v$mod4<-y_mod4$fit
186
#   data_v$mod5<-y_mod5$fit
187
#   data_v$mod6<-y_mod6$fit
188
#   data_v$mod7<-y_mod7$fit
189
#   
190
#   data_s$mod1<-mod1$fit
191
#   data_s$mod2<-mod2$fit
192
#   data_s$mod3<-mod3$fit
193
#   data_s$mod4<-mod4$fit
194
#   data_s$mod5<-mod5$fit
195
#   data_s$mod6<-mod6$fit
196
#   data_s$mod7<-mod7$fit
197
#   
198
  data_v$res_mod1<-as.numeric(res_mod1)
199
  data_v$res_mod2<-as.numeric(res_mod2)
200
  data_v$res_mod3<-as.numeric(res_mod3)
201
  data_v$res_mod4<-as.numeric(res_mod4)
202
  data_v$res_mod5<-as.numeric(res_mod5)
203
  data_v$res_mod6<-as.numeric(res_mod6)
204
  data_v$res_mod7<-as.numeric(res_mod7)
138
    #Automate this using a data frame of size??
139
    y_mod<- predict(mod, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values.
140
    res_mod<- data_v$tmax - y_mod$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation
141
    RMSE_mod <- sqrt(sum(res_mod^2)/nv) #RMSE FOR REGRESSION STEP 1: GAM     
142

  
143
    results_RMSE[i,1]<- dates[i]  #storing the interpolation dates in the first column
144
    results_RMSE[i,2]<- ns        #number of stations used in the training stage
145
    results_RMSE[i,j+2]<- RMSE_mod
205 146
  
206
  data_s$res_mod1<-as.numeric(mod1$residuals)
207
  data_s$res_mod2<-as.numeric(mod2$residuals)
208
  data_s$res_mod3<-as.numeric(mod3$residuals)
209
  data_s$res_mod4<-as.numeric(mod4$residuals)
210
  data_s$res_mod5<-as.numeric(mod5$residuals)
211
  data_s$res_mod6<-as.numeric(mod6$residuals)
212
  data_s$res_mod7<-as.numeric(mod7$residuals)
147
    #Saving residuals and prediction in the dataframes: tmax predicted from GAM
148
    pred<-paste("pred_mod",j,sep="")
149
    data_v[[pred]]<-as.numeric(y_mod$fit)
150
    data_s[[pred]]<-as.numeric(mod$fit) #Storing model fit values (predicted on training sample)
151
   
152
    name2<-paste("res_mod",j,sep="")
153
    data_v[[name2]]<-as.numeric(res_mod)
154
    data_s[[name2]]<-as.numeric(mod$residuals)
155
    #end of loop calculating RMSE
156
    #NEED TO ADD BIAS AND MAE
157
    
158
    }
213 159
  
214 160
  ###BEFORE Kringing the data object must be transformed to SDF
215 161
  
......
220 166
  coordinates(data_s)<-coords
221 167
  proj4string(data_s)<-CRS  #Need to assign coordinates..
222 168
  
223
  #Kriging residuals!!
169
  #KRIGING ON GAM RESIDUALS: REGRESSION STEP2
224 170

  
225 171
  for (j in 1:length(models)){
226 172
    name<-paste("res_mod",j,sep="")
227 173
    data_s$residuals<-data_s[[name]]
228 174
    X11()
229 175
    hscat(residuals~1,data_s,(0:9)*20000) # 9 lag classes with 20,000m width
230
    v<-variogram(residuals~1, data_s)
231
    plot(v)
176
    v<-variogram(residuals~1, data_s)   
177
    plot(v)                               # This plot may be saved at a later stage...
178
    dev.off()
232 179
    v.fit<-fit.variogram(v,vgm(1,"Sph", 150000,1))
233 180
    res_krige<-krige(residuals~1, data_s,mean_LST, v.fit)#mean_LST provides the data grid/raster image for the kriging locations.
234 181
  
235
    # Kriging visualization of Residuals fit over space
236
  
237
    #spplot.vcov(co_kriged_surf)                           #Visualizing the covariance structure
238
  
239 182
    res_krig1_s <- overlay(res_krige,data_s)             #This overlays the kriged surface tmax and the location of weather stations
240 183
    res_krig1_v <- overlay(res_krige,data_v)             #This overlays the kriged surface tmax and the location of weather stations
241 184
  
......
243 186
    #Adding the results back into the original dataframes.
244 187
    data_s[[name2]]<-res_krig1_s$var1.pred
245 188
    data_v[[name2]]<-res_krig1_v$var1.pred  
246
  
189
    
190
    #NEED TO ADD IT BACK TO THE PREDICTION FROM GAM
191
    gam_kr<-paste("pred_gam_kr",j,sep="")
192
    pred_gam<-paste("pred_mod",j,sep="")
193
    data_s[[gam_kr]]<-data_s[[pred_gam]]+ data_s[[name2]]
194
    data_v[[gam_kr]]<-data_v[[pred_gam]]+ data_v[[name2]]
195
    
247 196
    #Calculate RMSE and then krig the residuals....!
248 197
  
249
    res_mod_kr_s<- data_s$tmax - data_s[[name2]]           #Residuals from kriging.
250
    res_mod_kr_v<- data_v$tmax - data_v[[name2]]           #Residuals from cokriging.
198
    res_mod_kr_s<- data_s$tmax - data_s[[gam_kr]]           #Residuals from kriging.
199
    res_mod_kr_v<- data_v$tmax - data_v[[gam_kr]]           #Residuals from cokriging.
251 200
  
252 201
    RMSE_mod_kr_s <- sqrt(sum(res_mod_kr_s^2,na.rm=TRUE)/(nv-sum(is.na(res_mod_kr_s))))                  #RMSE from kriged surface.
253 202
    RMSE_mod_kr_v <- sqrt(sum(res_mod_kr_v^2,na.rm=TRUE)/(nv-sum(is.na(res_mod_kr_v))))                  #RMSE from co-kriged surface.
......
259 208
    results_RMSE_kr[i,j+2]<- RMSE_mod_kr_v
260 209
    #results_RMSE_kr[i,3]<- res_mod_kr_v
261 210
    name3<-paste("res_kr_mod",j,sep="")
262
    data_s[[name3]]<-res_mod_kr_s
263
    data_v[[name3]]<-res_mod_kr_v #Writing residuals from kriging
211
    #as.numeric(res_mod)
212
    #data_s[[name3]]<-res_mod_kr_s
213
    data_s[[name3]]<-as.numeric(res_mod_kr_s)
214
    #data_v[[name3]]<-res_mod_kr_v 
215
    data_v[[name3]]<-as.numeric(res_mod_kr_v)
216
    #Writing residuals from kriging
217
    
264 218
    }
219
  
220
  ###SAVING THE DATA FRAME IN SHAPEFILES AND TEXTFILES
221
  
265 222
  data_name<-paste("ghcn_v_",out_prefix,"_",dates[[i]],sep="")
266 223
  assign(data_name,data_v)
267 224
  write.table(data_v, file= paste(path,"/",data_name,".txt",sep=""), sep=" ")
268 225
  #write out a new shapefile (including .prj component)
269
  outfile<-sub(".shp","",data_name)   #Removing extension if it is present
270
  writeOGR(data_v,".", outfile, driver ="ESRI Shapefile")
226
  #outfile<-sub(".shp","",data_name)   #Removing extension if it is present
227
  #writeOGR(data_v,".", outfile, driver ="ESRI Shapefile")
271 228
  
272 229
  data_name<-paste("ghcn_s_",out_prefix,"_",dates[[i]],sep="")
273 230
  assign(data_name,data_s)
274 231
  write.table(data_s, file= paste(path,"/",data_name,".txt",sep=""), sep=" ")
275
  outfile<-sub(".shp","",data_name)   #Removing extension if it is present
276
  writeOGR(data_s,".", outfile, driver ="ESRI Shapefile")
232
  #outfile<-sub(".shp","",data_name)   #Removing extension if it is present
233
  #writeOGR(data_s,".", outfile, driver ="ESRI Shapefile")
234
  
235
  # end of the for loop1
277 236
  
278
  # end of the for loop #1
279 237
  }
280 238

  
281 239
## Plotting and saving diagnostic measures
......
313 271
  height<-rbind(RMSE_ga,RMSE_kr)
314 272
  rownames(height)<-c("GAM","GAM_KR")
315 273
  height<-as.matrix(height)
316
  barplot(height,ylim=c(0,105),ylab="RMSE in tenth deg C",beside=TRUE,
274
  barplot(height,ylim=c(14,36),ylab="RMSE in tenth deg C",beside=TRUE,
317 275
          legend.text=rownames(height),
318 276
          args.legend=list(x="topright"),
319 277
          main=paste("RMSE for date ",dates[i], sep=""))
320 278
  savePlot(paste("Barplot_results_RMSE_GAM_KR_",dates[i],out_prefix,".png", sep=""), type="png")
279
  dev.off()
321 280
  }
322 281
  
323
# End of script##########
282
r1<-(results_table_RMSE[,3:10]) #selecting only the columns related to models and method 1
283
r2<-(results_table_RMSE[,3:10]) #selecting only the columns related to models and method 1
284
mean_r1<-mean(r1)
285
mean_r2<-mean(r2)
286
median_r1<-sapply(r1, median)   #Calulcating the mean for every model (median of columns)
287
median_r2<-sapply(r2, median)
288
sd_r1<-sapply(r1, sd)
289
sd_r2<-sapply(r2, sd)
290

  
291
barplot(mean_r,ylim=c(23,26),ylab="RMSE in tenth deg C")
292
barplot(median_r,ylim=c(23,26),ylab="RMSE in tenth deg C",add=TRUE,inside=FALSE,beside=TRUE) # put both on the same plot
293
barplot(sd_r,ylim=c(6,8),ylab="RMSE in tenth deg C") # put both on the same plot
294

  
295
height<-rbind(mean_r,median_r)
296
barplot(height,ylim=c(23,26),ylab="RMSE in tenth deg C",beside=TRUE,legend=rownames(height))
297
barplot(height,ylim=c(23,26),ylab="RMSE in tenth deg C",beside=TRUE, col=c("darkblue","red"),legend=rownames(height)) # put both on the same plot
298

  
299
barplot2(mean_r,median_r,ylim=c(23,26),ylab="RMSE in tenth deg C") # put both on the same plot
300
#Collect var explained and p values for each var...
301
### End of script  ##########
324 302

  
325 303

  
326 304

  

Also available in: Unified diff