Revision d1fde319
Added by Benoit Parmentier over 10 years ago
climate/research/oregon/interpolation/nex_run_gam_fitting.R | ||
---|---|---|
1 | 1 |
#################################### INTERPOLATION OF TEMPERATURES ####################################### |
2 |
############################ Script for experimentation of GAM model fitting ############################## |
|
2 |
############################ Script for the experimentation of GAM model fitting ##############################
|
|
3 | 3 |
#This script uses the worklfow code applied to the globe. Results currently reside on NEX/PLEIADES NASA. |
4 |
#The purpose is to test different parameters for the GAM function to allow . |
|
5 |
## - experiment with gamma |
|
6 |
# - experiment with k |
|
7 |
# A table of diagnostic is created. |
|
4 |
#The purpose is to test different parameters for the GAM function to allow fitting in area with low stations count. |
|
5 |
##We experiment with: |
|
6 |
# - gamma: multiplying factor that acts on the smoothing parameter lambda |
|
7 |
# - k: dimension for each term which affects the number of knots and complextiy of hte relationship |
|
8 |
# |
|
9 |
#The code generate a table of diagnostic that includes: k-index, rmse,mae, aic, gcv. |
|
10 |
# |
|
8 | 11 |
#AUTHOR: Benoit Parmentier |
9 | 12 |
#CREATED ON: 07/30/2014 |
10 |
#MODIFIED ON: 08/06/2014
|
|
13 |
#MODIFIED ON: 08/07/2014
|
|
11 | 14 |
#Version: 1 |
12 |
#PROJECT: Environmental Layers project |
|
15 |
#PROJECT: Environmental Layers project NCEAS-NASA
|
|
13 | 16 |
#TO DO: |
17 |
# - check that step are increased by 1 for k |
|
18 |
# - k starting value can be controled by increase or decrease |
|
19 |
# - k initial value can be set |
|
14 | 20 |
# |
15 | 21 |
################################################################################################# |
16 | 22 |
|
... | ... | |
43 | 49 |
|
44 | 50 |
#### FUNCTION USED IN SCRIPT |
45 | 51 |
|
46 |
function_analyses_paper1 <-"contribution_of_covariates_paper_interpolation_functions_05212014.R" #first interp paper |
|
47 |
function_analyses_paper2 <-"multi_timescales_paper_interpolation_functions_05052014.R" |
|
48 |
|
|
49 | 52 |
load_obj <- function(f) |
50 | 53 |
{ |
51 | 54 |
env <- new.env() |
... | ... | |
53 | 56 |
env[[nm]] |
54 | 57 |
} |
55 | 58 |
|
56 |
|
|
57 | 59 |
### Function to fit using the training data from the workflow |
58 | 60 |
fit_models<-function(list_formulas,data_training){ |
59 | 61 |
|
... | ... | |
170 | 172 |
} |
171 | 173 |
|
172 | 174 |
## Fit model using training data, formula and k dimensions with increment |
173 |
test_k_gam <- function(formula,l_k,data_training){ |
|
175 |
test_k_gam <- function(formula,l_k,data_training,l_k_min=NULL){ |
|
176 |
|
|
177 |
if(is.null(l_k_min)){ |
|
178 |
l_k_tmp <- as.numeric(l_k)/2 #if no starting l_k then use half the l_k value (which correspond to the max) |
|
179 |
} |
|
180 |
|
|
181 |
if(!is.null(l_k_min)){ |
|
182 |
l_k_tmp <- l_k_min #if l_k then use l_k_min |
|
183 |
} |
|
174 | 184 |
|
175 |
l_k_tmp <- as.numeric(l_k)/2 |
|
176 | 185 |
#l_k_tmp <- l_k |
177 | 186 |
#maybe go in a loop to fit? could this in a loop for each term then select the term with highest k and k_index >1 ?? |
178 | 187 |
#list_mod <- vector("list",0) |
... | ... | |
191 | 200 |
#if opt_increase{} |
192 | 201 |
l_k_tmp <- l_k_tmp + rep(1,length(l_k_tmp)) #can modify this option to go -1 instead of plus 1? |
193 | 202 |
#if opt_decrease{} |
194 |
l_k_tmp <- l_k_tmp + rep(1,length(l_k_tmp)) #can modify this option to go -1 instead of plus 1?
|
|
203 |
#l_k_tmp <- l_k_tmp - rep(1,length(l_k_tmp)) #can modify this option to go -1 instead of plus 1?
|
|
195 | 204 |
|
196 | 205 |
j <- j+1 |
197 | 206 |
limit_l_k <- (l_k_tmp > l_k) #now check that c(30,10,10 is not gone above!!) |
... | ... | |
269 | 278 |
} |
270 | 279 |
|
271 | 280 |
## Run whole process of diagnostic creation for given formula (without k), training data and model name |
272 |
fit_gam_model_with_diagnostics <- function(l_k,data_training,formula,names_mod){ |
|
281 |
fit_gam_model_with_diagnostics <- function(l_k,data_training,formula,names_mod,y_var_name,out_suffix,out_path,l_k_min=NULL){
|
|
273 | 282 |
#Input parameters: |
274 | 283 |
#l_k: vector of k value for each smooth term |
284 |
#formula: model formul without the k values!! eg y_var ~ s(lat,lon) + s(elev_s) |
|
275 | 285 |
#data_training: data.frame containing data for fitting of gam |
276 | 286 |
#names_mod: character (string) with name of the model fitted. |
287 |
#y_var_name: variable being model eg dailyTmax |
|
288 |
#out_suffix: ouput suffix added toe the file saved |
|
289 |
#out_path: output directory |
|
290 |
#TODO: |
|
291 |
# -allow default val for out_suffix, y_var_name and out_path |
|
292 |
|
|
293 |
#### library and param |
|
277 | 294 |
|
278 | 295 |
library(mgcv) #to avoid error below?? Maybe an indepent graphic devices needs to be opened?? |
279 | 296 |
#Error in get(name, envir = asNamespace(pkg), inherits = FALSE) : |
... | ... | |
281 | 298 |
#Graphics error: Error in get(name, envir = asNamespace(pkg), inherits = FALSE) : |
282 | 299 |
#object 'rversion' not found |
283 | 300 |
|
301 |
########## START SCRIPT ############# |
|
302 |
|
|
284 | 303 |
#STEP 1: fit with a range for k values |
285 | 304 |
|
286 | 305 |
#Fit models using given k values, formula and training dataset |
287 | 306 |
#This is done starting at l_k/2 e.g. c(30,10,10) becomes c(15,5,5) where k is assigned to each term in the order given |
288 |
l_k_obj <- test_k_gam(formula,l_k,data_training) |
|
307 |
l_k_obj <- test_k_gam(formula,l_k,data_training,l_k_min)
|
|
289 | 308 |
|
290 | 309 |
#STEP 2: get k-index diagnostic for ech model |
291 | 310 |
|
... | ... | |
316 | 335 |
diagnostics_obj <- list(df_diagnostics,list_mod) |
317 | 336 |
names(diagnostics_obj) <- c("df_diagnostics","list_mod") |
318 | 337 |
|
338 |
out_prefix <- out_suffix |
|
339 |
filename_obj <- file.path(out_path,paste("diagnostics_obj_gam_fitting","_", y_var_name,out_prefix,".RData",sep="")) |
|
340 |
|
|
341 |
save(diagnostics_obj,file=filename_obj ) |
|
342 |
|
|
319 | 343 |
return(diagnostics_obj) |
344 |
|
|
320 | 345 |
} |
321 | 346 |
|
322 | 347 |
##############################s |
323 | 348 |
#### Parameters and constants |
324 | 349 |
|
350 |
#First, copy data from NEX to Atlas: |
|
351 |
|
|
325 | 352 |
#scp -rp raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
326 | 353 |
#scp -rp reg*.tif parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
327 | 354 |
|
328 |
## laod data from Northen Africa |
|
329 |
in_dir <- "/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
|
330 |
raster_obj_infile <- "raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData" |
|
355 |
##Second, load data from Northen Africa |
|
356 |
in_dir <- "/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" #Atlas location |
|
357 |
raster_obj_infile <- "raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData" #Raster prediction object |
|
358 |
out_path <- in_dir #output dir can be set |
|
359 |
|
|
360 |
out_suffix <-"_08062014" #is the same as "out_prefix" in the workflow |
|
361 |
y_var_name <- "dailyTmax" #see workflow |
|
331 | 362 |
|
332 |
setwd(in_dir)
|
|
363 |
setwd(out_path)
|
|
333 | 364 |
|
334 | 365 |
########################## START SCRIPT ############################## |
335 | 366 |
|
336 | 367 |
######################################## |
337 | 368 |
#### PART I: Explore fitting of GAM with k and gamma parameters #### |
338 | 369 |
|
339 |
raster_obj<- load_obj(raster_obj_infile)
|
|
370 |
raster_obj <- load_obj(raster_obj_infile) #load raster predction object containing all the information
|
|
340 | 371 |
|
341 | 372 |
#raster_obj <- load_obj(unlist(raster_obj_file)) #may not need unlist |
342 |
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas)) |
|
343 |
list_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas) |
|
373 |
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas)) #number of models predicted
|
|
374 |
list_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas) #list of formula with models predicted
|
|
344 | 375 |
|
345 |
#Models used: |
|
376 |
#Models used/predicted:
|
|
346 | 377 |
#y_var ~ s(lat, lon) + s(elev_s) |
347 | 378 |
#y_var ~ s(lat, lon) + s(elev_s) + s(LST) |
348 | 379 |
#y_var ~ s(lat, lon) + s(elev_s) + s(N_w, E_w) + s(LST) + ti(LST,LC1) + s(LC1) |
349 | 380 |
|
350 |
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="") |
|
381 |
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="") #names of models
|
|
351 | 382 |
#we are assuming no monthly hold out... |
352 | 383 |
#we are assuming only one specific daily prop? |
353 |
nb_models <- length(pred_mod) |
|
384 |
nb_models <- length(pred_mod) #4 instead of 3
|
|
354 | 385 |
|
355 |
#Select one month to play around:
|
|
386 |
#Select one month to test and explored model fitting parameters:
|
|
356 | 387 |
j <- 7 # July |
357 |
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]] |
|
358 |
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas |
|
388 |
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]] #object containing model fitted at the monthly time scale |
|
389 |
#Object is a list made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas |
|
390 |
names(clim_method_mod_obj) |
|
391 |
clim_method_mod_obj$data_month #fitting data |
|
392 |
clim_method_mod_obj$data_month_v #no hold out for monthly data |
|
359 | 393 |
clim_method_mod_obj$clim #file predicted |
360 | 394 |
|
361 |
l_mod <- clim_method_mod_obj$mod #file predicted
|
|
362 |
|
|
395 |
l_mod <- clim_method_mod_obj$mod # R model object for GAM
|
|
396 |
data_training <- clim_method_mod_obj$data_month #no hold out for monthly data |
|
363 | 397 |
#Quick look at the study area: Equatorial to Northern Africa |
364 | 398 |
reg_rast <- stack(list.files(pattern="*.tif")) |
365 |
plot(reg_rast,y=15) |
|
366 |
|
|
367 |
names(clim_method_mod_obj) |
|
368 |
clim_method_mod_obj$data_month |
|
369 |
clim_method_mod_obj$data_month_v |
|
370 |
|
|
399 |
plot(reg_rast,y=15) #plot SRTM (elev) |
|
400 |
dim(reg_rast) #7,200 ; 3,600 ; 15 |
|
371 | 401 |
|
402 |
### Which model to test? Use model 2 because it is not fitted. |
|
372 | 403 |
k <- 2 #select model 2 with LST |
373 |
formula <-list_formulas[[k]] |
|
404 |
formula <-list_formulas[[k]] #model 2: y_var ~ s(lat, lon) + s(elev_s) + s(LST)
|
|
374 | 405 |
mod<- try(gam(formula, data=data_training)) #does not fit!! as expected |
375 | 406 |
|
376 |
### TRY with different gamma
|
|
407 |
#### TRY with different gamma parameter value
|
|
377 | 408 |
|
378 |
mod_g1<- try(gam(y_var ~ s(lat, lon) + s(elev_s),gamma=1.4, data=data_training)) #change to any model!! |
|
409 |
mod_g1 <- try(gam(y_var ~ s(lat, lon) + s(elev_s),gamma=1.4, data=data_training)) #change to any model!!
|
|
379 | 410 |
gam.check(mod_g1) |
380 | 411 |
# k' edf k-index p-value |
381 | 412 |
#s(lat,lon) 29.000 9.470 0.967 0.32 |
... | ... | |
384 | 415 |
mod_g2<- try(gam(y_var ~ s(lat, lon) + s(elev_s),gamma=10, data=data_training)) #change to any model!! |
385 | 416 |
gam.check(mod_g2) #increase in k-index!! |
386 | 417 |
|
418 |
mod_g3<- try(gam(y_var ~ s(lat, lon) + s(elev_s),gamma=100, data=data_training)) #change to any model!! |
|
419 |
gam.check(mod_g3) #increase in k-index!! |
|
420 |
|
|
387 | 421 |
# k' edf k-index p-value |
388 | 422 |
#s(lat,lon) 29.00 29.00 1.50 1.00 |
389 | 423 |
#s(elev_s) 9.00 9.00 1.11 0.74 |
390 | 424 |
|
425 |
#Now try with model2: y_var ~ s(lat, lon) + s(elev_s) + s(LST) |
|
426 |
|
|
391 | 427 |
mod<- try(gam(y_var ~ s(lat, lon) + s(elev_s) + s(LST),gamma=1.4, data=data_training)) #does not fit! |
392 |
mod<- try(gam(y_var ~ s(lat, lon) + s(elev_s) + s(LST),gamma=10, data=data_training)) #does not fitl!! |
|
428 |
mod<- try(gam(y_var ~ s(lat, lon) + s(elev_s) + s(LST),gamma=10, data=data_training)) #does not fit!! |
|
429 |
mod<- try(gam(y_var ~ s(lat, lon) + s(elev_s) + s(LST),gamma=100, data=data_training)) #does not fit!! |
|
393 | 430 |
|
394 | 431 |
### TRY with different k |
395 | 432 |
|
... | ... | |
398 | 435 |
mod_t2<- try(gam(y_var ~ s(lat, lon,k=5) + s(elev_s) + s(LST), data=data_training)) #change to any model!! |
399 | 436 |
gam.check(mod_t2) #in this case k=5 is too small for the interactive term as k-index is less than 1 |
400 | 437 |
|
401 |
## check for residual pattern, removeable by increasing `k' |
|
402 |
## typically `k', below, chould be substantially larger than |
|
403 |
## the original, `k' but certainly less than n/2. |
|
404 |
vis.gam(mod_t1) |
|
405 |
vis.gam(mod_t1,view=c("lat","lon"),theta= 35) # plot against by variable |
|
406 | 438 |
#http://stats.stackexchange.com/questions/12223/how-to-tune-smoothing-in-mgcv-gam-model |
407 | 439 |
|
408 | 440 |
mod_t3 <- try(gam(y_var ~ s(lat, lon,k=22) + s(elev_s) + s(LST), data=data_training)) #change to any model!! |
... | ... | |
413 | 445 |
mod_t1$aic |
414 | 446 |
mod_t1$edf |
415 | 447 |
|
448 |
## check for residual pattern, removeable by increasing `k' |
|
449 |
## typically `k', below, chould be substantially larger than |
|
450 |
## the original, `k' but certainly less than n/2. |
|
451 |
|
|
452 |
### Visualization of predicted values |
|
453 |
vis.gam(mod_t1) |
|
454 |
vis.gam(mod_t1,view=c("lat","lon"),theta= 35) # plot against by variable |
|
455 |
x_data <- mod_t1$model #model matrix also known as design matrix, note that it is stored as a data.frame |
|
456 |
|
|
457 |
plot(mod_t1) |
|
458 |
y<- fitted.values(mod_t1) |
|
459 |
x<- x_data$lat |
|
460 |
plot(y ~ x) #this is predicted versus lat |
|
461 |
points(x_data$y_var ~ x,col="red") |
|
462 |
y<- fitted.values(mod_t1) |
|
463 |
x<- x_data$LST |
|
464 |
plot(y~x) #this is predicted vs LST |
|
465 |
|
|
416 | 466 |
######################################## |
417 | 467 |
#### PART II: Use the new functions to explore fitting with k-dimension in GAM #### |
418 | 468 |
|
... | ... | |
434 | 484 |
|
435 | 485 |
#list_fitted_models<-vector("list",length(list_formulas)) |
436 | 486 |
k <-2 #select model 3 with LST |
437 |
names_mod <- paste("mod_",k,sep="")
|
|
487 |
names_mod <- paste("mod",k,sep="") |
|
438 | 488 |
model_name<-paste("mod",k,sep="") |
439 | 489 |
|
440 | 490 |
formula <-list_formulas[[k]] |
441 | 491 |
|
442 |
l_k <- c(30,10,10) #default values |
|
492 |
l_k <- c(30,10,10) #default values : max val |
|
493 |
l_k_min <- c(10,4,4) #minimum values, default is null |
|
494 |
|
|
443 | 495 |
#Create |
444 |
l_k_obj <- test_k_gam(formula,l_k,data_training) |
|
445 |
#d |
|
496 |
l_k_obj <- test_k_gam(formula,l_k,data_training,l_k_min)
|
|
497 |
|
|
446 | 498 |
#Now get k_index for each model and store it in a table. |
447 | 499 |
#function gam.check with |
448 | 500 |
|
449 | 501 |
list_df_k <- create_gam_check_table(l_k_obj) |
450 | 502 |
|
503 |
list_mod <- l_k_obj$list_mod |
|
504 |
list_mod<- list_mod[unlist(lapply(list_mod,FUN=function(x){!inherits(x,"try-error")}))] |
|
451 | 505 |
list_mod_gam_stat <- lapply(list_mod,FUN=extract_fitting_mod_gam_stat) |
452 | 506 |
df_mod_stat <- list_mod_gam_stat[[7]] |
453 | 507 |
#combine tables... |
... | ... | |
487 | 541 |
clim_method_mod_obj$data_month_v |
488 | 542 |
|
489 | 543 |
list_fitted_models<-vector("list",length(list_formulas)) |
490 |
k <-3 #select model 3 with LST
|
|
544 |
k <- 2 #select model 3 with LST
|
|
491 | 545 |
formula <-list_formulas[[k]] |
492 | 546 |
|
493 | 547 |
j <- 7 # July |
... | ... | |
502 | 556 |
#we are assuming only one specific daily prop? |
503 | 557 |
nb_models <- length(pred_mod) |
504 | 558 |
|
505 |
data_training <- clim_method_mod_obj$data_month |
|
559 |
data_training <- clim_method_mod_obj$data_month #this is for July
|
|
506 | 560 |
|
507 | 561 |
#list_fitted_models<-vector("list",length(list_formulas)) |
508 | 562 |
k <-2 #select model 2 with LST |
... | ... | |
512 | 566 |
formula <-list_formulas[[k]] |
513 | 567 |
|
514 | 568 |
l_k <- c(30,10,10) #default values |
569 |
l_k_min <- c(10,4,4) #minimum values, default is null |
|
515 | 570 |
|
516 | 571 |
#test_df <- fit_gam_model_with_diagnostics(l_k,data_training,formula,model_name) |
517 |
test_obj <- fit_gam_model_with_diagnostics(l_k,data_training,formula,model_name) |
|
572 |
#y_var_name, out_suffix, out_path are all set at the beginning of the script |
|
573 |
#test_obj <- fit_gam_model_with_diagnostics(l_k,data_training,formula,model_name) |
|
574 |
out_suffix_s <- paste(7,"_",out_suffix,sep="") #7 for month of july |
|
575 |
|
|
576 |
#could transform the input arguments to be a list?? |
|
577 |
#use l_k_min=NULL to start with half of the maximum k values... |
|
578 |
test1_gam_fit_obj <- fit_gam_model_with_diagnostics(l_k,data_training,formula,names_mod,y_var_name,out_suffix_s,out_path,l_k_min=NULL) |
|
579 |
#use l_k_min=c(10,4,4) |
|
580 |
out_suffix_s <- paste(7,"_","lk_min_",out_suffix,sep="") #7 for month of july |
|
581 |
test2_gam_fit_obj <- fit_gam_model_with_diagnostics(l_k,data_training,formula,names_mod,y_var_name,out_suffix_s,out_path,l_k_min) |
|
582 |
|
|
583 |
names(test2_gam_fit_obj) |
|
584 |
|
|
585 |
df_diagnostics <- test2_gam_fit_obj$df_diagnostics #extract the diagnostic table |
|
586 |
|
|
587 |
names(df_diagnostics) |
|
588 |
df_diagnostics$fit_no |
|
589 |
df_diagnostics$rmse |
|
590 |
plot(df_diagnostics$rmse) |
|
591 |
#plot(df_diagnostics$rmse~df_diagnostics$fit_no,data=df_diagnostics) |
|
592 |
|
|
593 |
#Now do this over 12 months for each model?? |
|
518 | 594 |
|
519 |
#Now do this over 12 months??
|
|
595 |
#loop here or lapply
|
|
520 | 596 |
|
521 | 597 |
################## END OF SCRIPT ############### |
Also available in: Unified diff
gam fitting, adding option to run fitting with starting list of k values and clean up