Project

General

Profile

« Previous | Next » 

Revision 24eedf3a

Added by Benoit Parmentier over 12 years ago

GAM using LST as input variable task #361

View differences:

climate/research/oregon/interpolation/GAM_LST_reg.R
1
####################Interpolation of Tmax for 10 dates.#####################
2
#This script interpolates station values for the Oregon case study. This program loads the station data from a csv file 
3
#and perform two types  of regression: multiple linear model and general additive model (GAM). Note that this program:
4
#1)assumes that the csv file is in the current working 
5
#2)extract relevant variables from raster images before performing the regressions. 
6
#This scripts predicts tmas using GAM and LST derived from MOD11A1.
7
#Interactions terms are also included and assessed using the RMSE from validation dataset.
8
#There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile.
9
#Script created by Benoit Parmentier on April 4, 2012. 
10

  
11
###Loading r library and packages                                                                       # loading the raster package
12
library(gtools)                                                                        # loading ...
13
library(mgcv)
14
library(sp)
15
library(spdep)
16
library(rgdal)
17

  
18
###Parameters and arguments
19

  
20
infile1<-"ghcn_or_tmax_b_04032012_OR83M.shp"
21
#path<-"C:/Data/Benoit/NCEAS/window_Oregon_data
22
path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations"
23
setwd(path)
24
#infile2<-"dates_interpolation_03012012.txt"  # list of 10 dates for the regression
25
infile2<-"dates_interpolation_03052012.txt"
26
prop<-0.3                                                                            #Proportion of testing retained for validation   
27
out_prefix<-"_04032012_LST_r1"
28
infile3<-"LST_dates_var_names.txt"
29
infile4<-"models_interpolation_04032012.txt"
30

  
31

  
32
#######START OF THE SCRIPT #############
33

  
34
###Reading the station data and setting up for models' comparison
35
ghcn<-readOGR(".", "ghcn_or_tmax_b_04032012_OR83M")
36
                         
37
ghcn = transform(ghcn,Northness = cos(ASPECT)) #Adding a variable to the dataframe
38
ghcn = transform(ghcn,Eastness = sin(ASPECT))  #adding variable to the dataframe.
39

  
40
ghcn = transform(ghcn,Northness_w = sin(slope)*cos(ASPECT)) #Adding a variable to the dataframe
41
ghcn = transform(ghcn,Eastness_w = sin(slope)*sin(ASPECT))  #adding variable to the dataframe.
42
set.seed(100)
43
dates <-readLines(paste(path,"/",infile2, sep=""))
44
LST_dates <-readLines(paste(path,"/",infile3, sep=""))
45
models <-readLines(paste(path,"/",infile4, sep=""))
46

  
47
results <- matrix(1,length(dates),14)            #This is a matrix containing the diagnostic measures from the GAM models.
48

  
49
results_AIC<- matrix(1,length(dates),length(models)+2)  
50
results_GCV<- matrix(1,length(dates),length(models)+2)
51
results_DEV<- matrix(1,length(dates),length(models)+2)
52
results_RMSE<- matrix(1,length(dates),length(models)+2)
53

  
54
#Screening for bad values
55

  
56
ghcn_all<-ghcn
57
ghcn_test<-subset(ghcn,ghcn$tmax>-150 & ghcn$tmax<400)
58
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0)
59
ghcn<-ghcn_test2
60

  
61
ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subsets data
62
#note that compare to the previous version date_ column was changed to date
63

  
64
## looping through the dates...
65
#Change this into  a nested loop, looping through the number of models
66

  
67
for(i in 1:length(dates)){            # start of the for loop #1
68
  
69
  ###Regression part 1: Creating a validation dataset by creating training and testing datasets
70
  
71
  mod <-ghcn.subsets[[i]][,match(LST_dates[i], names(ghcn.subsets[[i]]))]
72
  ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod)
73
  n<-nrow(ghcn.subsets[[i]])
74
  ns<-n-round(n*prop)  #Create a sample from the data frame with 70% of the rows
75
  nv<-n-ns             #create a sample for validation with prop of the rows
76
  #ns<-n-round(n*prop)  #Create a sample from the data frame with 70% of the rows
77
  ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
78
  ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
79
  data_s <- ghcn.subsets[[i]][ind.training, ]
80
  data_v <- ghcn.subsets[[i]][ind.testing, ]
81
  #mod <-data_s[,match(LST_dates[i], names(data_s))]
82
  #data_s = transform(data_s,LST = mod)
83
  #data_v = transform(data_v,LST = mod)
84
  ####Regression part 2: GAM models
85

  
86
  mod1<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s)
87
  mod2<- gam(tmax~ s(lat,lon,ELEV_SRTM), data=data_s)
88
  mod3<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) +  s (Northness)+ s (Eastness) + s(DISTOC), data=data_s)
89
  mod4<- gam(tmax~ s(lat) + s (lon) + s(ELEV_SRTM) + s(Northness) + s (Eastness) + s(DISTOC) + s(LST), data=data_s)
90
  mod5<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST), data=data_s)
91
  mod6<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC1), data=data_s)
92
  
93
  
94
  ####Regression part 3: Calculating and storing diagnostic measures
95
  results_AIC[i,1]<- dates[i]  #storing the interpolation dates in the first column
96
  results_AIC[i,2]<- ns        #number of stations used in the training stage
97
  results_AIC[i,3]<- AIC (mod1)
98
  results_AIC[i,4]<- AIC (mod2)
99
  results_AIC[i,5]<- AIC (mod3)
100
  results_AIC[i,6]<- AIC (mod4)
101
  results_AIC[i,7]<- AIC (mod5)
102
  results_AIC[i,8]<- AIC (mod6)
103
  
104
  results_GCV[i,1]<- dates[i]  #storing the interpolation dates in the first column
105
  results_GCV[i,2]<- ns        #number of stations used in the training stage
106
  results_GCV[i,3]<- mod1$gcv.ubre
107
  results_GCV[i,4]<- mod2$gcv.ubre
108
  results_GCV[i,5]<- mod3$gcv.ubre
109
  results_GCV[i,6]<- mod4$gcv.ubre
110
  results_GCV[i,7]<- mod5$gcv.ubre
111
  results_GCV[i,8]<- mod6$gcv.ubre
112
  
113
  results_DEV[i,1]<- dates[i]  #storing the interpolation dates in the first column
114
  results_DEV[i,2]<- ns        #number of stations used in the training stage
115
  results_DEV[i,3]<- mod1$deviance
116
  results_DEV[i,4]<- mod2$deviance
117
  results_DEV[i,5]<- mod3$deviance
118
  results_DEV[i,6]<- mod4$deviance
119
  results_DEV[i,7]<- mod5$deviance
120
  results_DEV[i,8]<- mod6$deviance
121
  
122
  #####VALIDATION: Prediction checking the results using the testing data########
123
 
124
  y_mod1<- predict(mod1, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values.
125
  y_mod2<- predict(mod2, newdata=data_v, se.fit = TRUE)            
126
  y_mod3<- predict(mod3, newdata=data_v, se.fit = TRUE) 
127
  y_mod4<- predict(mod4, newdata=data_v, se.fit = TRUE) 
128
  y_mod5<- predict(mod5, newdata=data_v, se.fit = TRUE) 
129
  y_mod6<- predict(mod6, newdata=data_v, se.fit = TRUE)
130
  
131
  res_mod1<- data_v$tmax - y_mod1$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation
132
  res_mod2<- data_v$tmax - y_mod2$fit   #Residuals for GAM model that resembles the PRISM interpolation                               
133
  res_mod3<- data_v$tmax - y_mod3$fit  
134
  res_mod4<- data_v$tmax - y_mod4$fit
135
  res_mod5<- data_v$tmax - y_mod5$fit
136
  res_mod6<- data_v$tmax - y_mod6$fit
137
  
138
  RMSE_mod1 <- sqrt(sum(res_mod1^2)/nv)          
139
  RMSE_mod2 <- sqrt(sum(res_mod2^2)/nv)
140
  RMSE_mod3 <- sqrt(sum(res_mod3^2)/nv)
141
  RMSE_mod4 <- sqrt(sum(res_mod4^2)/nv)
142
  RMSE_mod5 <- sqrt(sum(res_mod5^2)/nv)
143
  RMSE_mod6 <- sqrt(sum(res_mod6^2)/nv)
144
  
145

  
146
  results_RMSE[i,1]<- dates[i]  #storing the interpolation dates in the first column
147
  results_RMSE[i,2]<- ns        #number of stations used in the training stage
148
  results_RMSE[i,3]<- RMSE_mod1
149
  results_RMSE[i,4]<- RMSE_mod2
150
  results_RMSE[i,5]<- RMSE_mod3
151
  results_RMSE[i,6]<- RMSE_mod4
152
  results_RMSE[i,7]<- RMSE_mod5
153
  results_RMSE[i,8]<- RMSE_mod6
154
  
155
  #Saving dataset in dataframes
156
  data_name<-paste("ghcn_v_",dates[[i]],sep="")
157
  assign(data_name,data_v)
158
  data_name<-paste("ghcn_s_",dates[[i]],sep="")
159
  assign(data_name,data_s)
160
  #ghcn_v<-ls(pattern="ghcn_v_")
161
  
162
  # end of the for loop #1
163
  }
164

  
165
## Plotting and saving diagnostic measures
166

  
167
results_RMSEnum <-results_RMSE
168
mode(results_RMSEnum)<- "numeric"
169
# Make it numeric first
170
# Now turn it into a data.frame...
171

  
172
results_table_RMSE<-as.data.frame(results_RMSEnum)
173
colnames(results_table_RMSE)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6")
174

  
175
# win.graph()
176
# barplot(results_table$RMSE_A1/10,main="RMSE for the A1 models",names.arg=results_table$dates,ylab="Temp (deg. C)",xlab="interpolated date")
177
# savePlot(paste("GAM_ANUSPLIN1_RMSE",out_prefix,".emf", sep=""), type="emf")
178
# win.graph()
179
# barplot(results_table$RMSE_P1/10,main="RMSE for the P1 models",names.arg=results_table$dates,ylab="Temp ( deg. C)",xlab="interpolated date")
180
# savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf")
181
# win.graph()
182
# barplot(results_table$RMSE_P2/10,main="RMSE for the P2 models",names.arg=results_table$dates,ylab="Temp ( deg. C)",xlab="interpolated date")
183
# savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf")
184
# win.graph()
185
# barplot(results_table$AIC_A1,main="AIC for the A1 models",names.arg=results_table$dates,ylab="Temp ( deg. C)",xlab="interpolated date")
186
# savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf")
187
# win.graph()
188
# barplot(results_table$AIC_P1/10,main="AIC for the P1 models",names.arg=results_table$dates,ylab="Temp ( deg. C)",xlab="interpolated date")
189
# savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf")
190
# win.graph()
191
# barplot(results_table$AIC_P2/10,main="AIC for the P2 models",names.arg=results_table$dates,ylab="Temp (10 X deg. C)",xlab="interolated date")
192
# savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", s‰‰ep=""), type="emf")
193
# win.graph()
194
# barplot(results_table$Deviance_A1/10,main="Deviance for the A1 models",names.arg=results_table$dates,ylab="Temp (10 X deg. C)",xlab="interolated date")
195
# savePlot(paste("GAM_ANUSPLIN1_Deviance",out_prefix,".emf", sep=""), type="emf")
196
# win.graph()
197
# barplot(results_table$Deviance_P1/10,main="Deviance for the P1 models",names.arg=results_table$dates,ylab="Temp (10 X deg. C)",xlab="interolated date")
198
# savePlot(paste("GAM_PRISM1_Deviance",out_prefix,".emf", sep=""), type="emf")
199
# win.graph()
200
# barplot(results_table$Deviance_P2/10,main="Deviance for the P2 models",names.arg=results_table$dates,ylab="Temp (10 X deg. C)",xlab="interolated date")
201
# savePlot(paste("GAM_PRISM2_Deviance",out_prefix,".emf", sep=""), type="emf")
202

  
203
#results_table_RMSE
204
write.csv(results_table_RMSE, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep=""))
205

  
206

  
207
# End of script##########
208

  
209
# ###############################
210

  
211
# 
212
# ############Diagnostic GAM plots#############
213
# win.graph()
214
# gam.check(GAM_ANUSPLIN1)
215
# savePlot("GAM_ANUSPLIN1_diagnostic1.emf", type="emf")
216
# win.graph()
217
# gam.check(GAM_PRISM1)   #This will produce basic plots of residuals
218
# savePlot("GAM_PRISM_diagnostic1.emf", type="emf")
219
# gam.check(GAM_ANUSPLIN1)
220
# win.graph()
221
# vis.gam(GAM_ANUSPLIN1)
222
# savePlot("GAM_ANUSPLIN1_prediction.emf", type="emf")        
223
# win.graph()
224
# vis.gam(GAM_PRISM1)
225
# savePlot("GAM_PRISM1_prediction.emf", type="emf")
226
# win.graph()
227
# vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM"))
228
# #vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM", theta=100,phi=200))
229
# savePlot("GAM_ANUSPLIN1_prediction2.emf", type="emf")
230
#
231

  
232
#                 
233

  
234

  
235

  
236

  
237
 

Also available in: Unified diff