Revision 06fb13b9
Added by Benoit Parmentier over 10 years ago
climate/research/oregon/interpolation/nex_run_gam_fitting.R | ||
---|---|---|
52 | 52 |
env[[nm]] |
53 | 53 |
} |
54 | 54 |
|
55 |
|
|
56 |
### Function to fit using the training data from the workflow |
|
57 |
fit_models<-function(list_formulas,data_training){ |
|
58 |
|
|
59 |
#This functions several models and returns model objects. |
|
60 |
#Arguments: - list of formulas for GAM models |
|
61 |
# - fitting data in a data.frame or SpatialPointDataFrame |
|
62 |
|
|
63 |
#Output: list of model objects |
|
64 |
|
|
65 |
list_fitted_models<-vector("list",length(list_formulas)) |
|
66 |
for (k in 1:length(list_formulas)){ |
|
67 |
formula<-list_formulas[[k]] |
|
68 |
mod<- try(gam(formula, data=data_training)) #change to any model!! |
|
69 |
#mod<- try(autoKrige(formula, input_data=data_s,new_data=s_sgdf,data_variogram=data_s)) |
|
70 |
|
|
71 |
model_name<-paste("mod",k,sep="") |
|
72 |
assign(model_name,mod) |
|
73 |
list_fitted_models[[k]]<-mod |
|
74 |
} |
|
75 |
return(list_fitted_models) |
|
76 |
} |
|
77 |
|
|
78 |
##Function to predict using a mod object from the workflow... |
|
55 | 79 |
predict_raster_model<-function(in_models,r_stack,out_filename){ |
56 | 80 |
|
57 | 81 |
#This functions performs predictions on a raster grid given input models. |
... | ... | |
78 | 102 |
return(list_rast_pred) |
79 | 103 |
} |
80 | 104 |
|
81 |
fit_models<-function(list_formulas,data_training){ |
|
82 |
|
|
83 |
#This functions several models and returns model objects. |
|
84 |
#Arguments: - list of formulas for GAM models |
|
85 |
# - fitting data in a data.frame or SpatialPointDataFrame |
|
86 |
|
|
87 |
#Output: list of model objects |
|
88 |
|
|
89 |
list_fitted_models<-vector("list",length(list_formulas)) |
|
90 |
for (k in 1:length(list_formulas)){ |
|
91 |
formula<-list_formulas[[k]] |
|
92 |
mod<- try(gam(formula, data=data_training)) #change to any model!! |
|
93 |
#mod<- try(autoKrige(formula, input_data=data_s,new_data=s_sgdf,data_variogram=data_s)) |
|
94 |
|
|
95 |
model_name<-paste("mod",k,sep="") |
|
96 |
assign(model_name,mod) |
|
97 |
list_fitted_models[[k]]<-mod |
|
98 |
} |
|
99 |
return(list_fitted_models) |
|
100 |
} |
|
101 |
|
|
102 |
|
|
105 |
## Create a new directory |
|
103 | 106 |
create_dir_fun <- function(out_dir,out_suffix){ |
104 | 107 |
if(!is.null(out_suffix)){ |
105 | 108 |
out_name <- paste("output_",out_suffix,sep="") |
... | ... | |
112 | 115 |
return(out_dir) |
113 | 116 |
} |
114 | 117 |
|
118 |
#Extract info from object |
|
115 | 119 |
extract_list_from_list_obj<-function(obj_list,list_name){ |
116 | 120 |
library(plyr) |
117 | 121 |
|
... | ... | |
141 | 145 |
return(tb_list_tmp) #this is a data.frame |
142 | 146 |
} |
143 | 147 |
|
144 |
##############################s |
|
145 |
#### Parameters and constants |
|
146 |
|
|
147 |
#scp -rp raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
|
148 |
#scp -rp reg*.tif parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
|
149 |
|
|
150 |
in_dir <- "/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
|
151 |
raster_obj_infile <- "raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData" |
|
152 |
|
|
153 |
setwd(in_dir) |
|
154 |
|
|
155 |
########################## START SCRIPT ############################## |
|
156 |
|
|
157 |
raster_obj<- load_obj(raster_obj_infile) |
|
158 |
|
|
159 |
#raster_obj <- load_obj(unlist(raster_obj_file)) #may not need unlist |
|
160 |
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas)) |
|
161 |
l_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas) |
|
162 |
|
|
163 |
#y_var ~ s(lat, lon) + s(elev_s) |
|
164 |
#y_var ~ s(lat, lon) + s(elev_s) + s(LST) |
|
165 |
#y_var ~ s(lat, lon) + s(elev_s) + s(N_w, E_w) + s(LST) + ti(LST,LC1) + s(LC1) |
|
166 |
|
|
167 |
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="") |
|
168 |
#we are assuming no monthly hold out... |
|
169 |
#we are assuming only one specific daily prop? |
|
170 |
nb_models <- length(pred_mod) |
|
171 |
|
|
172 |
j <- 7 |
|
173 |
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]] |
|
174 |
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas |
|
175 |
clim_method_mod_obj$clim #file predicted |
|
176 |
|
|
177 |
l_mod <- clim_method_mod_obj$mod #file predicted |
|
178 |
|
|
179 |
reg_rast <- stack(list.files(pattern="*.tif")) |
|
180 |
plot(reg_rast,y=15) |
|
181 |
|
|
182 |
names(clim_method_mod_obj) |
|
183 |
clim_method_mod_obj$data_month |
|
184 |
clim_method_mod_obj$data_month_v |
|
185 |
|
|
186 |
## check for residual pattern, removeable by increasing `k' |
|
187 |
## typically `k', below, chould be substantially larger than |
|
188 |
## the original, `k' but certainly less than n/2. |
|
189 |
vis.gam(mod1) |
|
190 |
vis.gam(mod1,view=c("lat","lon"),theta= 35) # plot against by variable |
|
191 |
#http://stats.stackexchange.com/questions/12223/how-to-tune-smoothing-in-mgcv-gam-model |
|
192 |
mod1<- try(gam(y_var ~ s(lat, lon,k=22) + s(elev_s) + s(LST), data=data_training)) #change to any model!! |
|
193 |
|
|
194 |
##New functions... |
|
195 |
fit_models<-function(list_formulas,data_training){ |
|
196 |
|
|
197 |
#This functions several models and returns model objects. |
|
198 |
#Arguments: - list of formulas for GAM models |
|
199 |
# - fitting data in a data.frame or SpatialPointDataFrame |
|
200 |
|
|
201 |
#Output: list of model objects |
|
202 |
|
|
203 |
list_fitted_models<-vector("list",length(list_formulas)) |
|
204 |
for (k in 1:length(list_formulas)){ |
|
205 |
formula<-list_formulas[[k]] |
|
206 |
mod<- try(gam(formula, data=data_training)) #change to any model!! |
|
207 |
mod<- try(gam(formula, data=data_training,k=5)) #change to any model!! |
|
208 |
mod<- try(gam(y_var ~ s(lat, lon,k=14) + s(elev_s) + s(LST), data=data_training)) #change to any model!! |
|
209 |
|
|
210 |
#mod<- try(autoKrige(formula, input_data=data_s,new_data=s_sgdf,data_variogram=data_s)) |
|
211 |
|
|
212 |
model_name<-paste("mod",k,sep="") |
|
213 |
assign(model_name,mod) |
|
214 |
list_fitted_models[[k]]<-mod |
|
215 |
} |
|
216 |
return(list_fitted_models) |
|
217 |
} |
|
148 |
### New functions to set the k dimension in GAM |
|
218 | 149 |
|
219 | 150 |
##Format formula using prescribed k values for gam |
220 | 151 |
format_formula_k_fun <- function(formula,l_k){ |
... | ... | |
316 | 247 |
return(list_df_k) |
317 | 248 |
} |
318 | 249 |
|
250 |
extract_fitting_mod_gam_stat <- function(mod){ |
|
251 |
#Note that this assumes that we are using a gam mod object |
|
252 |
gcv_val <- mod$gcv.ubre |
|
253 |
aic_val <- AIC(mod) |
|
254 |
rmse_val <- sqrt(mean((residuals(mod))^2)) |
|
255 |
mae_val <- mean(abs(residuals(mod))) |
|
256 |
bias_val <- mean(residuals(mod)) |
|
257 |
n_val <- length(residuals(mod)) |
|
258 |
#Now create a data.frame |
|
259 |
df_val <- data.frame(gcv=gcv_val, |
|
260 |
aic=aic_val, |
|
261 |
rmse=rmse_val, |
|
262 |
mae=mae_val, |
|
263 |
bias=bias_val, |
|
264 |
n=n_val) |
|
265 |
return(df_val) |
|
266 |
} |
|
319 | 267 |
|
320 |
#l_k <- vector("list",3) |
|
321 |
#l_k <- list(25,10,10) |
|
322 |
l_k <- c(30,10,10) |
|
323 |
l_k_obj <- test_k_gam(formula,l_k,data_training) |
|
268 |
##############################s |
|
269 |
#### Parameters and constants |
|
270 |
|
|
271 |
#scp -rp raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
|
272 |
#scp -rp reg*.tif parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
|
273 |
|
|
274 |
## laod data from Northen Africa |
|
275 |
in_dir <- "/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
|
276 |
raster_obj_infile <- "raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData" |
|
277 |
|
|
278 |
setwd(in_dir) |
|
279 |
|
|
280 |
########################## START SCRIPT ############################## |
|
281 |
|
|
282 |
#### PART I: Explore fitting of GAM #### |
|
283 |
|
|
284 |
raster_obj<- load_obj(raster_obj_infile) |
|
285 |
|
|
286 |
#raster_obj <- load_obj(unlist(raster_obj_file)) #may not need unlist |
|
287 |
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas)) |
|
288 |
list_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas) |
|
289 |
|
|
290 |
#Models used: |
|
291 |
#y_var ~ s(lat, lon) + s(elev_s) |
|
292 |
#y_var ~ s(lat, lon) + s(elev_s) + s(LST) |
|
293 |
#y_var ~ s(lat, lon) + s(elev_s) + s(N_w, E_w) + s(LST) + ti(LST,LC1) + s(LC1) |
|
294 |
|
|
295 |
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="") |
|
296 |
#we are assuming no monthly hold out... |
|
297 |
#we are assuming only one specific daily prop? |
|
298 |
nb_models <- length(pred_mod) |
|
299 |
|
|
300 |
#Select one month to play around: |
|
301 |
j <- 7 # July |
|
302 |
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]] |
|
303 |
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas |
|
304 |
clim_method_mod_obj$clim #file predicted |
|
305 |
|
|
306 |
l_mod <- clim_method_mod_obj$mod #file predicted |
|
307 |
|
|
308 |
#Quick look at the study area: Equatorial to Northern Africa |
|
309 |
reg_rast <- stack(list.files(pattern="*.tif")) |
|
310 |
plot(reg_rast,y=15) |
|
311 |
|
|
312 |
names(clim_method_mod_obj) |
|
313 |
clim_method_mod_obj$data_month |
|
314 |
clim_method_mod_obj$data_month_v |
|
315 |
|
|
316 |
|
|
317 |
k <- 2 #select model 2 with LST |
|
318 |
formula <-list_formulas[[k]] |
|
319 |
mod<- try(gam(formula, data=data_training)) #does not fit!! as expected |
|
320 |
|
|
321 |
mod_t1<- try(gam(y_var ~ s(lat, lon,k=14) + s(elev_s) + s(LST), data=data_training)) #change to any model!! |
|
322 |
gam.check(mod_t1) |
|
323 |
mod_t2<- try(gam(y_var ~ s(lat, lon,k=5) + s(elev_s) + s(LST), data=data_training)) #change to any model!! |
|
324 |
gam.check(mod_t2) #in this case k=5 is too small for the interactive term as k-index is less than 1 |
|
325 |
|
|
326 |
## check for residual pattern, removeable by increasing `k' |
|
327 |
## typically `k', below, chould be substantially larger than |
|
328 |
## the original, `k' but certainly less than n/2. |
|
329 |
vis.gam(mod_t1) |
|
330 |
vis.gam(mod_t1,view=c("lat","lon"),theta= 35) # plot against by variable |
|
331 |
#http://stats.stackexchange.com/questions/12223/how-to-tune-smoothing-in-mgcv-gam-model |
|
332 |
|
|
333 |
mod_t3 <- try(gam(y_var ~ s(lat, lon,k=22) + s(elev_s) + s(LST), data=data_training)) #change to any model!! |
|
334 |
gam.check(mod_t3) |
|
335 |
|
|
336 |
#Explore mod object |
|
337 |
mod_t1$gcv.ubre |
|
338 |
mod_t1$aic |
|
339 |
mod_t1$edf |
|
340 |
|
|
341 |
#### PART II: Use the new functions to explore fitting with k-dimension in GAM #### |
|
342 |
|
|
343 |
j <- 7 # July |
|
344 |
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]] |
|
345 |
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas |
|
324 | 346 |
|
347 |
l_mod <- clim_method_mod_obj$mod #file predicted |
|
348 |
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas)) |
|
349 |
list_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas) |
|
350 |
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="") |
|
351 |
#we are assuming no monthly hold out... |
|
352 |
#we are assuming only one specific daily prop? |
|
353 |
nb_models <- length(pred_mod) |
|
354 |
|
|
355 |
data_training <- clim_method_mod_obj$data_month |
|
356 |
|
|
357 |
#list_fitted_models<-vector("list",length(list_formulas)) |
|
358 |
k <-2 #select model 3 with LST |
|
359 |
names_mod <- paste("mod_",k,sep="") |
|
360 |
formula <-list_formulas[[k]] |
|
361 |
|
|
362 |
l_k <- c(30,10,10) #default values |
|
363 |
#Create |
|
364 |
l_k_obj <- test_k_gam(formula,l_k,data_training) |
|
325 | 365 |
#d |
326 | 366 |
#Now get k_index for each model and store it in a table. |
327 | 367 |
#function gam.check with |
368 |
|
|
369 |
list_df_k <- create_gam_check_table(l_k_obj) |
|
370 |
|
|
371 |
list_mod_gam_stat <- lapply(list_mod,FUN=extract_fitting_mod_gam_stat) |
|
372 |
df_mod_stat <- list_mod_gam_stat[[7]] |
|
373 |
#combine tables... |
|
374 |
l_df_diagnostics<- lapply(1:length(list_df_k),FUN=function(i,df_1,df_2){cbind(df_1[[i]],df_2[[2]])},df_1=list_df_k,df_2=list_mod_gam_stat) |
|
375 |
|
|
376 |
df_diagnostics <- do.call(rbind,l_df_diagnostics) |
|
377 |
|
|
378 |
df_diagnostics$date <- rep(unique(data_training$date),nrow(df_diagnostics)) |
|
379 |
df_diagnostics$month <- rep(unique(data_training$month)) |
|
380 |
|
|
381 |
df_diagnostics$pred_mod <-names_mod #defined earlier... |
|
382 |
|
|
328 | 383 |
#select the right mode based on k-index, edf or other criteria? |
329 | 384 |
|
385 |
#choice of the lowest k for fit and k_index > 1 |
|
386 |
#then use model 1 |
|
387 |
|
|
388 |
#choice of the highest k for fit and k_index > 1 |
|
389 |
#then use model 7 |
|
390 |
|
|
391 |
########### Use loop to fit model for for 1 to 12 ?############## |
|
330 | 392 |
#Now function to run mod depending on conditions |
331 | 393 |
|
394 |
#first make a function for one model (could be month) |
|
395 |
j <- 7 # July |
|
396 |
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]] |
|
397 |
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas |
|
398 |
clim_method_mod_obj$clim #file predicted |
|
399 |
|
|
400 |
l_mod <- clim_method_mod_obj$mod #file predicted |
|
401 |
|
|
402 |
names(clim_method_mod_obj) |
|
403 |
data_training <- clim_method_mod_obj$data_month |
|
404 |
clim_method_mod_obj$data_month_v |
|
405 |
|
|
406 |
list_fitted_models<-vector("list",length(list_formulas)) |
|
407 |
k <-3 #select model 3 with LST |
|
408 |
formula <-list_formulas[[k]] |
|
409 |
|
|
410 |
j <- 7 # July |
|
411 |
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]] |
|
412 |
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas |
|
413 |
|
|
414 |
l_mod <- clim_method_mod_obj$mod #file predicted |
|
415 |
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas)) |
|
416 |
list_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas) |
|
417 |
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="") |
|
418 |
#we are assuming no monthly hold out... |
|
419 |
#we are assuming only one specific daily prop? |
|
420 |
nb_models <- length(pred_mod) |
|
421 |
|
|
422 |
data_training <- clim_method_mod_obj$data_month |
|
423 |
|
|
424 |
#list_fitted_models<-vector("list",length(list_formulas)) |
|
425 |
k <-2 #select model 3 with LST |
|
426 |
names_mod <- paste("mod_",k,sep="") |
|
427 |
formula <-list_formulas[[k]] |
|
428 |
|
|
429 |
l_k <- c(30,10,10) #default values |
|
430 |
|
|
431 |
test_df <- fit_gam_model_with_diagnostics(l_k,data_training,formula,names_mod) |
|
432 |
|
|
433 |
fit_gam_model_with_diagnostics <- function(l_k,data_training,formula,names_mod){ |
|
434 |
|
|
435 |
|
|
436 |
#STEP 1: fit with a range for k values |
|
437 |
|
|
438 |
#Fit models using given k values, formula and training dataset |
|
439 |
#This is done starting at l_k/2 e.g. c(30,10,10) becomes c(15,5,5) where k is assigned to each term in the order given |
|
440 |
l_k_obj <- test_k_gam(formula,l_k,data_training) |
|
441 |
|
|
442 |
#STEP 2: get k-index diagnostic for ech model |
|
443 |
|
|
444 |
#Now get k_index for each model and store it in a table. |
|
445 |
#function gam.check is sued |
|
446 |
|
|
447 |
list_df_k <- create_gam_check_table(l_k_obj) |
|
448 |
|
|
449 |
#STEP 3: produce additional metrics for diagnostis of fit |
|
450 |
#this includes the calculation of RMSE, MAE, bias, gcv,aic for fitted model |
|
451 |
list_mod <- l_k_obj$list_mod |
|
452 |
list_mod_gam_stat <- lapply(list_mod,FUN=extract_fitting_mod_gam_stat) |
|
453 |
|
|
454 |
#STEP 4: combine tables of diagnostics, format and add additional information |
|
455 |
|
|
456 |
#combine tables... |
|
457 |
l_df_diagnostics <- lapply(1:length(list_df_k),FUN=function(i,df_1,df_2){cbind(df_1[[i]],df_2[[2]])},df_1=list_df_k,df_2=list_mod_gam_stat) |
|
458 |
df_diagnostics <- do.call(rbind,l_df_diagnostics) |
|
459 |
df_diagnostics$date <- rep(unique(data_training$date),nrow(df_diagnostics)) |
|
460 |
df_diagnostics$month <- rep(unique(data_training$month)) |
|
461 |
df_diagnostics$pred_mod <-names_mod #defined earlier... (input argutment ) |
|
462 |
|
|
463 |
#Return the diagnostic table |
|
464 |
return(df_diagnostics) |
|
465 |
} |
|
466 |
|
|
332 | 467 |
################## END OF SCRIPT ############### |
Also available in: Unified diff
gam fitting diagnostic, adding rmse, mae and other metrics as well as wrap function to run fit at different