Revision 24eedf3a
Added by Benoit Parmentier over 12 years ago
climate/research/oregon/interpolation/GAM_LST_reg.R | ||
---|---|---|
1 |
####################Interpolation of Tmax for 10 dates.##################### |
|
2 |
#This script interpolates station values for the Oregon case study. This program loads the station data from a csv file |
|
3 |
#and perform two types of regression: multiple linear model and general additive model (GAM). Note that this program: |
|
4 |
#1)assumes that the csv file is in the current working |
|
5 |
#2)extract relevant variables from raster images before performing the regressions. |
|
6 |
#This scripts predicts tmas using GAM and LST derived from MOD11A1. |
|
7 |
#Interactions terms are also included and assessed using the RMSE from validation dataset. |
|
8 |
#There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile. |
|
9 |
#Script created by Benoit Parmentier on April 4, 2012. |
|
10 |
|
|
11 |
###Loading r library and packages # loading the raster package |
|
12 |
library(gtools) # loading ... |
|
13 |
library(mgcv) |
|
14 |
library(sp) |
|
15 |
library(spdep) |
|
16 |
library(rgdal) |
|
17 |
|
|
18 |
###Parameters and arguments |
|
19 |
|
|
20 |
infile1<-"ghcn_or_tmax_b_04032012_OR83M.shp" |
|
21 |
#path<-"C:/Data/Benoit/NCEAS/window_Oregon_data |
|
22 |
path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations" |
|
23 |
setwd(path) |
|
24 |
#infile2<-"dates_interpolation_03012012.txt" # list of 10 dates for the regression |
|
25 |
infile2<-"dates_interpolation_03052012.txt" |
|
26 |
prop<-0.3 #Proportion of testing retained for validation |
|
27 |
out_prefix<-"_04032012_LST_r1" |
|
28 |
infile3<-"LST_dates_var_names.txt" |
|
29 |
infile4<-"models_interpolation_04032012.txt" |
|
30 |
|
|
31 |
|
|
32 |
#######START OF THE SCRIPT ############# |
|
33 |
|
|
34 |
###Reading the station data and setting up for models' comparison |
|
35 |
ghcn<-readOGR(".", "ghcn_or_tmax_b_04032012_OR83M") |
|
36 |
|
|
37 |
ghcn = transform(ghcn,Northness = cos(ASPECT)) #Adding a variable to the dataframe |
|
38 |
ghcn = transform(ghcn,Eastness = sin(ASPECT)) #adding variable to the dataframe. |
|
39 |
|
|
40 |
ghcn = transform(ghcn,Northness_w = sin(slope)*cos(ASPECT)) #Adding a variable to the dataframe |
|
41 |
ghcn = transform(ghcn,Eastness_w = sin(slope)*sin(ASPECT)) #adding variable to the dataframe. |
|
42 |
set.seed(100) |
|
43 |
dates <-readLines(paste(path,"/",infile2, sep="")) |
|
44 |
LST_dates <-readLines(paste(path,"/",infile3, sep="")) |
|
45 |
models <-readLines(paste(path,"/",infile4, sep="")) |
|
46 |
|
|
47 |
results <- matrix(1,length(dates),14) #This is a matrix containing the diagnostic measures from the GAM models. |
|
48 |
|
|
49 |
results_AIC<- matrix(1,length(dates),length(models)+2) |
|
50 |
results_GCV<- matrix(1,length(dates),length(models)+2) |
|
51 |
results_DEV<- matrix(1,length(dates),length(models)+2) |
|
52 |
results_RMSE<- matrix(1,length(dates),length(models)+2) |
|
53 |
|
|
54 |
#Screening for bad values |
|
55 |
|
|
56 |
ghcn_all<-ghcn |
|
57 |
ghcn_test<-subset(ghcn,ghcn$tmax>-150 & ghcn$tmax<400) |
|
58 |
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0) |
|
59 |
ghcn<-ghcn_test2 |
|
60 |
|
|
61 |
ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subsets data |
|
62 |
#note that compare to the previous version date_ column was changed to date |
|
63 |
|
|
64 |
## looping through the dates... |
|
65 |
#Change this into a nested loop, looping through the number of models |
|
66 |
|
|
67 |
for(i in 1:length(dates)){ # start of the for loop #1 |
|
68 |
|
|
69 |
###Regression part 1: Creating a validation dataset by creating training and testing datasets |
|
70 |
|
|
71 |
mod <-ghcn.subsets[[i]][,match(LST_dates[i], names(ghcn.subsets[[i]]))] |
|
72 |
ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod) |
|
73 |
n<-nrow(ghcn.subsets[[i]]) |
|
74 |
ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
75 |
nv<-n-ns #create a sample for validation with prop of the rows |
|
76 |
#ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
77 |
ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly |
|
78 |
ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training) |
|
79 |
data_s <- ghcn.subsets[[i]][ind.training, ] |
|
80 |
data_v <- ghcn.subsets[[i]][ind.testing, ] |
|
81 |
#mod <-data_s[,match(LST_dates[i], names(data_s))] |
|
82 |
#data_s = transform(data_s,LST = mod) |
|
83 |
#data_v = transform(data_v,LST = mod) |
|
84 |
####Regression part 2: GAM models |
|
85 |
|
|
86 |
mod1<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s) |
|
87 |
mod2<- gam(tmax~ s(lat,lon,ELEV_SRTM), data=data_s) |
|
88 |
mod3<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (Northness)+ s (Eastness) + s(DISTOC), data=data_s) |
|
89 |
mod4<- gam(tmax~ s(lat) + s (lon) + s(ELEV_SRTM) + s(Northness) + s (Eastness) + s(DISTOC) + s(LST), data=data_s) |
|
90 |
mod5<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST), data=data_s) |
|
91 |
mod6<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC1), data=data_s) |
|
92 |
|
|
93 |
|
|
94 |
####Regression part 3: Calculating and storing diagnostic measures |
|
95 |
results_AIC[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
96 |
results_AIC[i,2]<- ns #number of stations used in the training stage |
|
97 |
results_AIC[i,3]<- AIC (mod1) |
|
98 |
results_AIC[i,4]<- AIC (mod2) |
|
99 |
results_AIC[i,5]<- AIC (mod3) |
|
100 |
results_AIC[i,6]<- AIC (mod4) |
|
101 |
results_AIC[i,7]<- AIC (mod5) |
|
102 |
results_AIC[i,8]<- AIC (mod6) |
|
103 |
|
|
104 |
results_GCV[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
105 |
results_GCV[i,2]<- ns #number of stations used in the training stage |
|
106 |
results_GCV[i,3]<- mod1$gcv.ubre |
|
107 |
results_GCV[i,4]<- mod2$gcv.ubre |
|
108 |
results_GCV[i,5]<- mod3$gcv.ubre |
|
109 |
results_GCV[i,6]<- mod4$gcv.ubre |
|
110 |
results_GCV[i,7]<- mod5$gcv.ubre |
|
111 |
results_GCV[i,8]<- mod6$gcv.ubre |
|
112 |
|
|
113 |
results_DEV[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
114 |
results_DEV[i,2]<- ns #number of stations used in the training stage |
|
115 |
results_DEV[i,3]<- mod1$deviance |
|
116 |
results_DEV[i,4]<- mod2$deviance |
|
117 |
results_DEV[i,5]<- mod3$deviance |
|
118 |
results_DEV[i,6]<- mod4$deviance |
|
119 |
results_DEV[i,7]<- mod5$deviance |
|
120 |
results_DEV[i,8]<- mod6$deviance |
|
121 |
|
|
122 |
#####VALIDATION: Prediction checking the results using the testing data######## |
|
123 |
|
|
124 |
y_mod1<- predict(mod1, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values. |
|
125 |
y_mod2<- predict(mod2, newdata=data_v, se.fit = TRUE) |
|
126 |
y_mod3<- predict(mod3, newdata=data_v, se.fit = TRUE) |
|
127 |
y_mod4<- predict(mod4, newdata=data_v, se.fit = TRUE) |
|
128 |
y_mod5<- predict(mod5, newdata=data_v, se.fit = TRUE) |
|
129 |
y_mod6<- predict(mod6, newdata=data_v, se.fit = TRUE) |
|
130 |
|
|
131 |
res_mod1<- data_v$tmax - y_mod1$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation |
|
132 |
res_mod2<- data_v$tmax - y_mod2$fit #Residuals for GAM model that resembles the PRISM interpolation |
|
133 |
res_mod3<- data_v$tmax - y_mod3$fit |
|
134 |
res_mod4<- data_v$tmax - y_mod4$fit |
|
135 |
res_mod5<- data_v$tmax - y_mod5$fit |
|
136 |
res_mod6<- data_v$tmax - y_mod6$fit |
|
137 |
|
|
138 |
RMSE_mod1 <- sqrt(sum(res_mod1^2)/nv) |
|
139 |
RMSE_mod2 <- sqrt(sum(res_mod2^2)/nv) |
|
140 |
RMSE_mod3 <- sqrt(sum(res_mod3^2)/nv) |
|
141 |
RMSE_mod4 <- sqrt(sum(res_mod4^2)/nv) |
|
142 |
RMSE_mod5 <- sqrt(sum(res_mod5^2)/nv) |
|
143 |
RMSE_mod6 <- sqrt(sum(res_mod6^2)/nv) |
|
144 |
|
|
145 |
|
|
146 |
results_RMSE[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
147 |
results_RMSE[i,2]<- ns #number of stations used in the training stage |
|
148 |
results_RMSE[i,3]<- RMSE_mod1 |
|
149 |
results_RMSE[i,4]<- RMSE_mod2 |
|
150 |
results_RMSE[i,5]<- RMSE_mod3 |
|
151 |
results_RMSE[i,6]<- RMSE_mod4 |
|
152 |
results_RMSE[i,7]<- RMSE_mod5 |
|
153 |
results_RMSE[i,8]<- RMSE_mod6 |
|
154 |
|
|
155 |
#Saving dataset in dataframes |
|
156 |
data_name<-paste("ghcn_v_",dates[[i]],sep="") |
|
157 |
assign(data_name,data_v) |
|
158 |
data_name<-paste("ghcn_s_",dates[[i]],sep="") |
|
159 |
assign(data_name,data_s) |
|
160 |
#ghcn_v<-ls(pattern="ghcn_v_") |
|
161 |
|
|
162 |
# end of the for loop #1 |
|
163 |
} |
|
164 |
|
|
165 |
## Plotting and saving diagnostic measures |
|
166 |
|
|
167 |
results_RMSEnum <-results_RMSE |
|
168 |
mode(results_RMSEnum)<- "numeric" |
|
169 |
# Make it numeric first |
|
170 |
# Now turn it into a data.frame... |
|
171 |
|
|
172 |
results_table_RMSE<-as.data.frame(results_RMSEnum) |
|
173 |
colnames(results_table_RMSE)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6") |
|
174 |
|
|
175 |
# win.graph() |
|
176 |
# barplot(results_table$RMSE_A1/10,main="RMSE for the A1 models",names.arg=results_table$dates,ylab="Temp (deg. C)",xlab="interpolated date") |
|
177 |
# savePlot(paste("GAM_ANUSPLIN1_RMSE",out_prefix,".emf", sep=""), type="emf") |
|
178 |
# win.graph() |
|
179 |
# barplot(results_table$RMSE_P1/10,main="RMSE for the P1 models",names.arg=results_table$dates,ylab="Temp ( deg. C)",xlab="interpolated date") |
|
180 |
# savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf") |
|
181 |
# win.graph() |
|
182 |
# barplot(results_table$RMSE_P2/10,main="RMSE for the P2 models",names.arg=results_table$dates,ylab="Temp ( deg. C)",xlab="interpolated date") |
|
183 |
# savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf") |
|
184 |
# win.graph() |
|
185 |
# barplot(results_table$AIC_A1,main="AIC for the A1 models",names.arg=results_table$dates,ylab="Temp ( deg. C)",xlab="interpolated date") |
|
186 |
# savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf") |
|
187 |
# win.graph() |
|
188 |
# barplot(results_table$AIC_P1/10,main="AIC for the P1 models",names.arg=results_table$dates,ylab="Temp ( deg. C)",xlab="interpolated date") |
|
189 |
# savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf") |
|
190 |
# win.graph() |
|
191 |
# barplot(results_table$AIC_P2/10,main="AIC for the P2 models",names.arg=results_table$dates,ylab="Temp (10 X deg. C)",xlab="interolated date") |
|
192 |
# savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", s‰‰ep=""), type="emf") |
|
193 |
# win.graph() |
|
194 |
# barplot(results_table$Deviance_A1/10,main="Deviance for the A1 models",names.arg=results_table$dates,ylab="Temp (10 X deg. C)",xlab="interolated date") |
|
195 |
# savePlot(paste("GAM_ANUSPLIN1_Deviance",out_prefix,".emf", sep=""), type="emf") |
|
196 |
# win.graph() |
|
197 |
# barplot(results_table$Deviance_P1/10,main="Deviance for the P1 models",names.arg=results_table$dates,ylab="Temp (10 X deg. C)",xlab="interolated date") |
|
198 |
# savePlot(paste("GAM_PRISM1_Deviance",out_prefix,".emf", sep=""), type="emf") |
|
199 |
# win.graph() |
|
200 |
# barplot(results_table$Deviance_P2/10,main="Deviance for the P2 models",names.arg=results_table$dates,ylab="Temp (10 X deg. C)",xlab="interolated date") |
|
201 |
# savePlot(paste("GAM_PRISM2_Deviance",out_prefix,".emf", sep=""), type="emf") |
|
202 |
|
|
203 |
#results_table_RMSE |
|
204 |
write.csv(results_table_RMSE, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep="")) |
|
205 |
|
|
206 |
|
|
207 |
# End of script########## |
|
208 |
|
|
209 |
# ############################### |
|
210 |
|
|
211 |
# |
|
212 |
# ############Diagnostic GAM plots############# |
|
213 |
# win.graph() |
|
214 |
# gam.check(GAM_ANUSPLIN1) |
|
215 |
# savePlot("GAM_ANUSPLIN1_diagnostic1.emf", type="emf") |
|
216 |
# win.graph() |
|
217 |
# gam.check(GAM_PRISM1) #This will produce basic plots of residuals |
|
218 |
# savePlot("GAM_PRISM_diagnostic1.emf", type="emf") |
|
219 |
# gam.check(GAM_ANUSPLIN1) |
|
220 |
# win.graph() |
|
221 |
# vis.gam(GAM_ANUSPLIN1) |
|
222 |
# savePlot("GAM_ANUSPLIN1_prediction.emf", type="emf") |
|
223 |
# win.graph() |
|
224 |
# vis.gam(GAM_PRISM1) |
|
225 |
# savePlot("GAM_PRISM1_prediction.emf", type="emf") |
|
226 |
# win.graph() |
|
227 |
# vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM")) |
|
228 |
# #vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM", theta=100,phi=200)) |
|
229 |
# savePlot("GAM_ANUSPLIN1_prediction2.emf", type="emf") |
|
230 |
# |
|
231 |
|
|
232 |
# |
|
233 |
|
|
234 |
|
|
235 |
|
|
236 |
|
|
237 |
|
Also available in: Unified diff
GAM using LST as input variable task #361