Project

General

Profile

« Previous | Next » 

Revision d1fde319

Added by Benoit Parmentier over 10 years ago

gam fitting, adding option to run fitting with starting list of k values and clean up

View differences:

climate/research/oregon/interpolation/nex_run_gam_fitting.R
1 1
####################################  INTERPOLATION OF TEMPERATURES  #######################################
2
############################  Script for experimentation of GAM model fitting ##############################
2
############################  Script for the experimentation of GAM model fitting ##############################
3 3
#This script uses the worklfow code applied to the globe. Results currently reside on NEX/PLEIADES NASA.
4
#The purpose is to test different parameters for the GAM function to allow .
5
## - experiment with gamma
6
# - experiment with k
7
# A table of diagnostic is created.
4
#The purpose is to test different parameters for the GAM function to allow fitting in area with low stations count.
5
##We experiment with:
6
# - gamma:  multiplying factor that acts on the smoothing parameter lambda
7
# - k: dimension for each term which affects the number of knots and complextiy of hte relationship
8
#
9
#The code generate a table of diagnostic that includes: k-index, rmse,mae, aic, gcv.
10
#
8 11
#AUTHOR: Benoit Parmentier 
9 12
#CREATED ON: 07/30/2014  
10
#MODIFIED ON: 08/06/2014            
13
#MODIFIED ON: 08/07/2014            
11 14
#Version: 1
12
#PROJECT: Environmental Layers project  
15
#PROJECT: Environmental Layers project  NCEAS-NASA
13 16
#TO DO:
17
# - check that step are increased by 1 for k
18
# - k starting  value can  be controled by increase or decrease
19
# - k initial value can be set
14 20
#
15 21
#################################################################################################
16 22

  
......
43 49

  
44 50
#### FUNCTION USED IN SCRIPT
45 51

  
46
function_analyses_paper1 <-"contribution_of_covariates_paper_interpolation_functions_05212014.R" #first interp paper
47
function_analyses_paper2 <-"multi_timescales_paper_interpolation_functions_05052014.R"
48

  
49 52
load_obj <- function(f)
50 53
{
51 54
  env <- new.env()
......
53 56
  env[[nm]]
54 57
}
55 58

  
56

  
57 59
### Function to fit using  the training  data  from the  workflow
58 60
fit_models<-function(list_formulas,data_training){
59 61

  
......
170 172
}
171 173

  
172 174
## Fit model using training  data, formula and k dimensions with increment
173
test_k_gam <- function(formula,l_k,data_training){
175
test_k_gam <- function(formula,l_k,data_training,l_k_min=NULL){
176

  
177
  if(is.null(l_k_min)){
178
    l_k_tmp <- as.numeric(l_k)/2  #if  no starting l_k then use half the l_k value (which correspond to the max)
179
  }
180

  
181
  if(!is.null(l_k_min)){
182
    l_k_tmp <- l_k_min  #if l_k then use l_k_min
183
  }
174 184

  
175
  l_k_tmp <- as.numeric(l_k)/2
176 185
  #l_k_tmp <- l_k
177 186
  #maybe go in  a loop to  fit? could this in a loop for each term then select the term with highest k and k_index >1 ??
178 187
  #list_mod <- vector("list",0)
......
191 200
    #if opt_increase{}
192 201
    l_k_tmp <- l_k_tmp + rep(1,length(l_k_tmp)) #can modify this option to go -1 instead of plus 1?
193 202
    #if opt_decrease{}
194
    l_k_tmp <- l_k_tmp + rep(1,length(l_k_tmp)) #can modify this option to go -1 instead of plus 1?
203
    #l_k_tmp <- l_k_tmp - rep(1,length(l_k_tmp)) #can modify this option to go -1 instead of plus 1?
195 204
    
196 205
    j <- j+1
197 206
    limit_l_k <- (l_k_tmp > l_k) #now check that c(30,10,10 is not gone above!!)
......
269 278
} 
270 279

  
271 280
## Run whole process of diagnostic creation for given formula (without k), training data and model name
272
fit_gam_model_with_diagnostics <- function(l_k,data_training,formula,names_mod){
281
fit_gam_model_with_diagnostics <- function(l_k,data_training,formula,names_mod,y_var_name,out_suffix,out_path,l_k_min=NULL){
273 282
  #Input parameters:
274 283
  #l_k: vector of k value for each smooth term
284
  #formula: model formul without the k values!! eg y_var ~ s(lat,lon) + s(elev_s)
275 285
  #data_training: data.frame containing  data for fitting of  gam
276 286
  #names_mod: character (string) with name of the model fitted.
287
  #y_var_name: variable being model eg  dailyTmax
288
  #out_suffix: ouput suffix added toe the file  saved
289
  #out_path: output directory
290
  #TODO: 
291
  # -allow default val for out_suffix, y_var_name and out_path
292
  
293
  #### library and param
277 294
  
278 295
  library(mgcv) #to avoid error below?? Maybe an indepent graphic devices needs to be opened??
279 296
  #Error in get(name, envir = asNamespace(pkg), inherits = FALSE) : 
......
281 298
  #Graphics error: Error in get(name, envir = asNamespace(pkg), inherits = FALSE) : 
282 299
  #object 'rversion' not found
283 300
  
301
  ########## START  SCRIPT #############
302
  
284 303
  #STEP 1: fit with a range for k values
285 304
  
286 305
  #Fit models using given k values, formula and training dataset   
287 306
  #This is done starting at l_k/2 e.g. c(30,10,10) becomes c(15,5,5) where k is assigned to each term in the order given
288
  l_k_obj <- test_k_gam(formula,l_k,data_training)
307
  l_k_obj <- test_k_gam(formula,l_k,data_training,l_k_min)
289 308
  
290 309
  #STEP 2: get  k-index diagnostic for ech model
291 310
  
......
316 335
  diagnostics_obj <- list(df_diagnostics,list_mod)
317 336
  names(diagnostics_obj) <- c("df_diagnostics","list_mod")
318 337
  
338
  out_prefix  <- out_suffix
339
  filename_obj <- file.path(out_path,paste("diagnostics_obj_gam_fitting","_", y_var_name,out_prefix,".RData",sep=""))
340
  
341
  save(diagnostics_obj,file=filename_obj )
342

  
319 343
  return(diagnostics_obj)
344
  
320 345
}
321 346

  
322 347
##############################s
323 348
#### Parameters and constants  
324 349

  
350
#First, copy data from NEX to Atlas:
351

  
325 352
#scp -rp raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
326 353
#scp -rp reg*.tif parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
327 354

  
328
## laod data from Northen Africa
329
in_dir <- "/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
330
raster_obj_infile <- "raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData"
355
##Second, load data from Northen Africa
356
in_dir <- "/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" #Atlas location
357
raster_obj_infile <- "raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData" #Raster prediction object
358
out_path <- in_dir #output dir can  be set
359

  
360
out_suffix  <-"_08062014" #is the same as "out_prefix" in the workflow 
361
y_var_name <- "dailyTmax" #see workflow
331 362

  
332
setwd(in_dir)
363
setwd(out_path)
333 364

  
334 365
########################## START SCRIPT ##############################
335 366

  
336 367
########################################
337 368
#### PART I: Explore fitting of GAM with k and gamma parameters ####
338 369

  
339
raster_obj<- load_obj(raster_obj_infile)
370
raster_obj <- load_obj(raster_obj_infile) #load raster predction object containing all the information
340 371

  
341 372
#raster_obj <- load_obj(unlist(raster_obj_file)) #may not need unlist
342
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas))
343
list_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas)
373
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas)) #number of models predicted
374
list_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas) #list of formula with models predicted
344 375

  
345
#Models used:
376
#Models used/predicted:
346 377
#y_var ~ s(lat, lon) + s(elev_s)
347 378
#y_var ~ s(lat, lon) + s(elev_s) + s(LST)
348 379
#y_var ~ s(lat, lon) + s(elev_s) + s(N_w, E_w) + s(LST) + ti(LST,LC1) + s(LC1)
349 380

  
350
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="")
381
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="") #names of models
351 382
#we are assuming no monthly hold out...
352 383
#we are assuming only one specific daily prop?
353
nb_models <- length(pred_mod)
384
nb_models <- length(pred_mod) #4 instead of 3
354 385

  
355
#Select one month to play around:
386
#Select one month to test and explored model fitting parameters:
356 387
j <- 7 # July
357
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]]
358
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas
388
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]] #object containing model fitted at the monthly time scale
389
#Object is a list made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas
390
names(clim_method_mod_obj)
391
clim_method_mod_obj$data_month #fitting  data
392
clim_method_mod_obj$data_month_v #no hold out for monthly data
359 393
clim_method_mod_obj$clim #file predicted
360 394

  
361
l_mod <- clim_method_mod_obj$mod #file predicted
362

  
395
l_mod <- clim_method_mod_obj$mod # R model object for GAM
396
data_training <- clim_method_mod_obj$data_month #no hold out for monthly data
363 397
#Quick look at the study area: Equatorial to Northern Africa
364 398
reg_rast <- stack(list.files(pattern="*.tif"))
365
plot(reg_rast,y=15)
366

  
367
names(clim_method_mod_obj)
368
clim_method_mod_obj$data_month
369
clim_method_mod_obj$data_month_v
370

  
399
plot(reg_rast,y=15) #plot SRTM (elev)
400
dim(reg_rast) #7,200 ; 3,600 ; 15
371 401

  
402
### Which model to test? Use model 2 because it is not fitted.
372 403
k <- 2 #select model 2 with LST
373
formula <-list_formulas[[k]]
404
formula <-list_formulas[[k]] #model 2: y_var ~ s(lat, lon) + s(elev_s) + s(LST)
374 405
mod<- try(gam(formula, data=data_training)) #does not fit!! as expected
375 406

  
376
### TRY with different gamma
407
#### TRY with different gamma parameter value
377 408

  
378
mod_g1<- try(gam(y_var ~ s(lat, lon) + s(elev_s),gamma=1.4, data=data_training)) #change to any model!!
409
mod_g1 <- try(gam(y_var ~ s(lat, lon) + s(elev_s),gamma=1.4, data=data_training)) #change to any model!!
379 410
gam.check(mod_g1)
380 411
#               k'    edf k-index p-value
381 412
#s(lat,lon) 29.000  9.470   0.967    0.32
......
384 415
mod_g2<- try(gam(y_var ~ s(lat, lon) + s(elev_s),gamma=10, data=data_training)) #change to any model!!
385 416
gam.check(mod_g2) #increase in k-index!!
386 417

  
418
mod_g3<- try(gam(y_var ~ s(lat, lon) + s(elev_s),gamma=100, data=data_training)) #change to any model!!
419
gam.check(mod_g3) #increase in k-index!!
420

  
387 421
#              k'   edf k-index p-value
388 422
#s(lat,lon) 29.00 29.00    1.50    1.00
389 423
#s(elev_s)   9.00  9.00    1.11    0.74
390 424

  
425
#Now try with model2: y_var ~ s(lat, lon) + s(elev_s) + s(LST)
426

  
391 427
mod<- try(gam(y_var ~ s(lat, lon) + s(elev_s) + s(LST),gamma=1.4, data=data_training)) #does not fit!
392
mod<- try(gam(y_var ~ s(lat, lon) + s(elev_s) + s(LST),gamma=10, data=data_training)) #does not fitl!!
428
mod<- try(gam(y_var ~ s(lat, lon) + s(elev_s) + s(LST),gamma=10, data=data_training)) #does not fit!!
429
mod<- try(gam(y_var ~ s(lat, lon) + s(elev_s) + s(LST),gamma=100, data=data_training)) #does not fit!!
393 430

  
394 431
### TRY with different k
395 432

  
......
398 435
mod_t2<- try(gam(y_var ~ s(lat, lon,k=5) + s(elev_s) + s(LST), data=data_training)) #change to any model!!
399 436
gam.check(mod_t2) #in this case k=5 is too small for the interactive  term as k-index is less than 1
400 437

  
401
## check for residual pattern, removeable by increasing `k'
402
## typically `k', below, chould be substantially larger than 
403
## the original, `k' but certainly less than n/2.
404
vis.gam(mod_t1)
405
vis.gam(mod_t1,view=c("lat","lon"),theta= 35) # plot against by variable
406 438
#http://stats.stackexchange.com/questions/12223/how-to-tune-smoothing-in-mgcv-gam-model
407 439

  
408 440
mod_t3 <- try(gam(y_var ~ s(lat, lon,k=22) + s(elev_s) + s(LST), data=data_training)) #change to any model!!
......
413 445
mod_t1$aic
414 446
mod_t1$edf
415 447

  
448
## check for residual pattern, removeable by increasing `k'
449
## typically `k', below, chould be substantially larger than 
450
## the original, `k' but certainly less than n/2.
451

  
452
### Visualization of predicted values
453
vis.gam(mod_t1)
454
vis.gam(mod_t1,view=c("lat","lon"),theta= 35) # plot against by variable
455
x_data <- mod_t1$model #model matrix also known as design matrix, note that it is stored as a data.frame
456

  
457
plot(mod_t1)
458
y<- fitted.values(mod_t1)
459
x<- x_data$lat
460
plot(y ~ x) #this is predicted versus lat
461
points(x_data$y_var ~ x,col="red")
462
y<- fitted.values(mod_t1)
463
x<- x_data$LST
464
plot(y~x) #this is predicted vs LST
465

  
416 466
########################################
417 467
#### PART II: Use the new functions to explore fitting with k-dimension in GAM  ####
418 468

  
......
434 484

  
435 485
#list_fitted_models<-vector("list",length(list_formulas))
436 486
k <-2 #select model 3 with LST
437
names_mod <- paste("mod_",k,sep="")
487
names_mod <- paste("mod",k,sep="")
438 488
model_name<-paste("mod",k,sep="")
439 489

  
440 490
formula <-list_formulas[[k]]
441 491

  
442
l_k <- c(30,10,10) #default values
492
l_k <- c(30,10,10) #default values : max val
493
l_k_min <- c(10,4,4) #minimum values, default is null
494

  
443 495
#Create 
444
l_k_obj <- test_k_gam(formula,l_k,data_training)
445
#d
496
l_k_obj <- test_k_gam(formula,l_k,data_training,l_k_min)
497

  
446 498
#Now get k_index for each model and store it in a table.
447 499
#function gam.check with
448 500

  
449 501
list_df_k <- create_gam_check_table(l_k_obj)
450 502

  
503
list_mod <- l_k_obj$list_mod 
504
list_mod<- list_mod[unlist(lapply(list_mod,FUN=function(x){!inherits(x,"try-error")}))]
451 505
list_mod_gam_stat <- lapply(list_mod,FUN=extract_fitting_mod_gam_stat)
452 506
df_mod_stat <- list_mod_gam_stat[[7]]
453 507
#combine tables...
......
487 541
clim_method_mod_obj$data_month_v
488 542

  
489 543
list_fitted_models<-vector("list",length(list_formulas))
490
k <-3 #select model 3 with LST
544
k <- 2 #select model 3 with LST
491 545
formula <-list_formulas[[k]]
492 546

  
493 547
j <- 7 # July
......
502 556
#we are assuming only one specific daily prop?
503 557
nb_models <- length(pred_mod)
504 558

  
505
data_training <- clim_method_mod_obj$data_month
559
data_training <- clim_method_mod_obj$data_month #this is for July
506 560

  
507 561
#list_fitted_models<-vector("list",length(list_formulas))
508 562
k <-2 #select model 2 with LST
......
512 566
formula <-list_formulas[[k]]
513 567

  
514 568
l_k <- c(30,10,10) #default values
569
l_k_min <- c(10,4,4) #minimum values, default is null
515 570

  
516 571
#test_df <- fit_gam_model_with_diagnostics(l_k,data_training,formula,model_name)
517
test_obj <- fit_gam_model_with_diagnostics(l_k,data_training,formula,model_name)
572
#y_var_name, out_suffix, out_path are all set at the beginning of the script
573
#test_obj <- fit_gam_model_with_diagnostics(l_k,data_training,formula,model_name)
574
out_suffix_s  <- paste(7,"_",out_suffix,sep="") #7 for month of july
575

  
576
#could transform the input arguments to be a list??
577
#use l_k_min=NULL to start with half of the maximum k values...
578
test1_gam_fit_obj <- fit_gam_model_with_diagnostics(l_k,data_training,formula,names_mod,y_var_name,out_suffix_s,out_path,l_k_min=NULL)
579
#use l_k_min=c(10,4,4) 
580
out_suffix_s  <- paste(7,"_","lk_min_",out_suffix,sep="") #7 for month of july
581
test2_gam_fit_obj <- fit_gam_model_with_diagnostics(l_k,data_training,formula,names_mod,y_var_name,out_suffix_s,out_path,l_k_min)
582

  
583
names(test2_gam_fit_obj)
584

  
585
df_diagnostics <- test2_gam_fit_obj$df_diagnostics #extract the diagnostic  table
586

  
587
names(df_diagnostics)
588
df_diagnostics$fit_no
589
df_diagnostics$rmse
590
plot(df_diagnostics$rmse)
591
#plot(df_diagnostics$rmse~df_diagnostics$fit_no,data=df_diagnostics)
592

  
593
#Now do this over 12 months for each model??
518 594

  
519
#Now do this over 12 months??
595
#loop here or lapply
520 596

  
521 597
################## END OF SCRIPT ###############

Also available in: Unified diff