Project

General

Profile

« Previous | Next » 

Revision 06fb13b9

Added by Benoit Parmentier over 10 years ago

gam fitting diagnostic, adding rmse, mae and other metrics as well as wrap function to run fit at different

View differences:

climate/research/oregon/interpolation/nex_run_gam_fitting.R
52 52
  env[[nm]]
53 53
}
54 54

  
55

  
56
### Function to fit using  the training  data  from the  workflow
57
fit_models<-function(list_formulas,data_training){
58

  
59
  #This functions several models and returns model objects.
60
  #Arguments: - list of formulas for GAM models
61
  #           - fitting data in a data.frame or SpatialPointDataFrame
62

  
63
  #Output: list of model objects 
64

  
65
  list_fitted_models<-vector("list",length(list_formulas))
66
  for (k in 1:length(list_formulas)){
67
    formula<-list_formulas[[k]]
68
    mod<- try(gam(formula, data=data_training)) #change to any model!!
69
    #mod<- try(autoKrige(formula, input_data=data_s,new_data=s_sgdf,data_variogram=data_s))
70

  
71
    model_name<-paste("mod",k,sep="")
72
    assign(model_name,mod) 
73
    list_fitted_models[[k]]<-mod
74
  }
75
  return(list_fitted_models) 
76
}
77

  
78
##Function to predict using a mod object from the workflow...
55 79
predict_raster_model<-function(in_models,r_stack,out_filename){
56 80

  
57 81
  #This functions performs predictions on a raster grid given input models.
......
78 102
  return(list_rast_pred)
79 103
}
80 104

  
81
fit_models<-function(list_formulas,data_training){
82

  
83
  #This functions several models and returns model objects.
84
  #Arguments: - list of formulas for GAM models
85
  #           - fitting data in a data.frame or SpatialPointDataFrame
86

  
87
  #Output: list of model objects 
88

  
89
  list_fitted_models<-vector("list",length(list_formulas))
90
  for (k in 1:length(list_formulas)){
91
    formula<-list_formulas[[k]]
92
    mod<- try(gam(formula, data=data_training)) #change to any model!!
93
    #mod<- try(autoKrige(formula, input_data=data_s,new_data=s_sgdf,data_variogram=data_s))
94

  
95
    model_name<-paste("mod",k,sep="")
96
    assign(model_name,mod) 
97
    list_fitted_models[[k]]<-mod
98
  }
99
  return(list_fitted_models) 
100
}
101

  
102

  
105
## Create a new directory
103 106
create_dir_fun <- function(out_dir,out_suffix){
104 107
  if(!is.null(out_suffix)){
105 108
    out_name <- paste("output_",out_suffix,sep="")
......
112 115
  return(out_dir)
113 116
}
114 117

  
118
#Extract info from object
115 119
extract_list_from_list_obj<-function(obj_list,list_name){
116 120
  library(plyr)
117 121
  
......
141 145
  return(tb_list_tmp) #this is  a data.frame
142 146
}
143 147

  
144
##############################s
145
#### Parameters and constants  
146

  
147
#scp -rp raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
148
#scp -rp reg*.tif parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
149

  
150
in_dir <- "/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
151
raster_obj_infile <- "raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData"
152

  
153
setwd(in_dir)
154

  
155
########################## START SCRIPT ##############################
156

  
157
raster_obj<- load_obj(raster_obj_infile)
158

  
159
#raster_obj <- load_obj(unlist(raster_obj_file)) #may not need unlist
160
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas))
161
l_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas)
162

  
163
#y_var ~ s(lat, lon) + s(elev_s)
164
#y_var ~ s(lat, lon) + s(elev_s) + s(LST)
165
#y_var ~ s(lat, lon) + s(elev_s) + s(N_w, E_w) + s(LST) + ti(LST,LC1) + s(LC1)
166

  
167
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="")
168
#we are assuming no monthly hold out...
169
#we are assuming only one specific daily prop?
170
nb_models <- length(pred_mod)
171

  
172
j <- 7
173
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]]
174
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas
175
clim_method_mod_obj$clim #file predicted
176

  
177
l_mod <- clim_method_mod_obj$mod #file predicted
178

  
179
reg_rast <- stack(list.files(pattern="*.tif"))
180
plot(reg_rast,y=15)
181

  
182
names(clim_method_mod_obj)
183
clim_method_mod_obj$data_month
184
clim_method_mod_obj$data_month_v
185

  
186
## check for residual pattern, removeable by increasing `k'
187
## typically `k', below, chould be substantially larger than 
188
## the original, `k' but certainly less than n/2.
189
vis.gam(mod1)
190
vis.gam(mod1,view=c("lat","lon"),theta= 35) # plot against by variable
191
#http://stats.stackexchange.com/questions/12223/how-to-tune-smoothing-in-mgcv-gam-model
192
mod1<- try(gam(y_var ~ s(lat, lon,k=22) + s(elev_s) + s(LST), data=data_training)) #change to any model!!
193

  
194
##New functions...
195
fit_models<-function(list_formulas,data_training){
196

  
197
  #This functions several models and returns model objects.
198
  #Arguments: - list of formulas for GAM models
199
  #           - fitting data in a data.frame or SpatialPointDataFrame
200

  
201
  #Output: list of model objects 
202

  
203
  list_fitted_models<-vector("list",length(list_formulas))
204
  for (k in 1:length(list_formulas)){
205
    formula<-list_formulas[[k]]
206
    mod<- try(gam(formula, data=data_training)) #change to any model!!
207
    mod<- try(gam(formula, data=data_training,k=5)) #change to any model!!
208
    mod<- try(gam(y_var ~ s(lat, lon,k=14) + s(elev_s) + s(LST), data=data_training)) #change to any model!!
209

  
210
    #mod<- try(autoKrige(formula, input_data=data_s,new_data=s_sgdf,data_variogram=data_s))
211

  
212
    model_name<-paste("mod",k,sep="")
213
    assign(model_name,mod) 
214
    list_fitted_models[[k]]<-mod
215
  }
216
  return(list_fitted_models) 
217
}
148
### New functions to set the k dimension in GAM
218 149

  
219 150
##Format formula using prescribed k values for gam
220 151
format_formula_k_fun <- function(formula,l_k){
......
316 247
  return(list_df_k)
317 248
}
318 249

  
250
extract_fitting_mod_gam_stat <- function(mod){
251
  #Note that this assumes that we are using a gam mod object
252
  gcv_val <- mod$gcv.ubre
253
  aic_val <- AIC(mod)
254
  rmse_val <- sqrt(mean((residuals(mod))^2))
255
  mae_val <- mean(abs(residuals(mod)))
256
  bias_val <- mean(residuals(mod))
257
  n_val <- length(residuals(mod))
258
  #Now create a data.frame
259
  df_val <- data.frame(gcv=gcv_val,
260
                   aic=aic_val,
261
                   rmse=rmse_val,
262
                   mae=mae_val,
263
                   bias=bias_val,
264
                   n=n_val)  
265
  return(df_val)
266
} 
319 267

  
320
#l_k <- vector("list",3)
321
#l_k <- list(25,10,10)
322
l_k <- c(30,10,10)
323
l_k_obj <- test_k_gam(formula,l_k,data_training)
268
##############################s
269
#### Parameters and constants  
270

  
271
#scp -rp raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
272
#scp -rp reg*.tif parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
273

  
274
## laod data from Northen Africa
275
in_dir <- "/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
276
raster_obj_infile <- "raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData"
277

  
278
setwd(in_dir)
279

  
280
########################## START SCRIPT ##############################
281

  
282
#### PART I: Explore fitting of GAM  ####
283

  
284
raster_obj<- load_obj(raster_obj_infile)
285

  
286
#raster_obj <- load_obj(unlist(raster_obj_file)) #may not need unlist
287
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas))
288
list_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas)
289

  
290
#Models used:
291
#y_var ~ s(lat, lon) + s(elev_s)
292
#y_var ~ s(lat, lon) + s(elev_s) + s(LST)
293
#y_var ~ s(lat, lon) + s(elev_s) + s(N_w, E_w) + s(LST) + ti(LST,LC1) + s(LC1)
294

  
295
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="")
296
#we are assuming no monthly hold out...
297
#we are assuming only one specific daily prop?
298
nb_models <- length(pred_mod)
299

  
300
#Select one month to play around:
301
j <- 7 # July
302
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]]
303
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas
304
clim_method_mod_obj$clim #file predicted
305

  
306
l_mod <- clim_method_mod_obj$mod #file predicted
307

  
308
#Quick look at the study area: Equatorial to Northern Africa
309
reg_rast <- stack(list.files(pattern="*.tif"))
310
plot(reg_rast,y=15)
311

  
312
names(clim_method_mod_obj)
313
clim_method_mod_obj$data_month
314
clim_method_mod_obj$data_month_v
315

  
316

  
317
k <- 2 #select model 2 with LST
318
formula <-list_formulas[[k]]
319
mod<- try(gam(formula, data=data_training)) #does not fit!! as expected
320

  
321
mod_t1<- try(gam(y_var ~ s(lat, lon,k=14) + s(elev_s) + s(LST), data=data_training)) #change to any model!!
322
gam.check(mod_t1)
323
mod_t2<- try(gam(y_var ~ s(lat, lon,k=5) + s(elev_s) + s(LST), data=data_training)) #change to any model!!
324
gam.check(mod_t2) #in this case k=5 is too small for the interactive  term as k-index is less than 1
325

  
326
## check for residual pattern, removeable by increasing `k'
327
## typically `k', below, chould be substantially larger than 
328
## the original, `k' but certainly less than n/2.
329
vis.gam(mod_t1)
330
vis.gam(mod_t1,view=c("lat","lon"),theta= 35) # plot against by variable
331
#http://stats.stackexchange.com/questions/12223/how-to-tune-smoothing-in-mgcv-gam-model
332

  
333
mod_t3 <- try(gam(y_var ~ s(lat, lon,k=22) + s(elev_s) + s(LST), data=data_training)) #change to any model!!
334
gam.check(mod_t3)
335

  
336
#Explore mod object
337
mod_t1$gcv.ubre
338
mod_t1$aic
339
mod_t1$edf
340

  
341
#### PART II: Use the new functions to explore fitting with k-dimension in GAM  ####
342

  
343
j <- 7 # July
344
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]]
345
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas
324 346

  
347
l_mod <- clim_method_mod_obj$mod #file predicted
348
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas))
349
list_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas)
350
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="")
351
#we are assuming no monthly hold out...
352
#we are assuming only one specific daily prop?
353
nb_models <- length(pred_mod)
354

  
355
data_training <- clim_method_mod_obj$data_month
356

  
357
#list_fitted_models<-vector("list",length(list_formulas))
358
k <-2 #select model 3 with LST
359
names_mod <- paste("mod_",k,sep="")
360
formula <-list_formulas[[k]]
361

  
362
l_k <- c(30,10,10) #default values
363
#Create 
364
l_k_obj <- test_k_gam(formula,l_k,data_training)
325 365
#d
326 366
#Now get k_index for each model and store it in a table.
327 367
#function gam.check with
368

  
369
list_df_k <- create_gam_check_table(l_k_obj)
370

  
371
list_mod_gam_stat <- lapply(list_mod,FUN=extract_fitting_mod_gam_stat)
372
df_mod_stat <- list_mod_gam_stat[[7]]
373
#combine tables...
374
l_df_diagnostics<- lapply(1:length(list_df_k),FUN=function(i,df_1,df_2){cbind(df_1[[i]],df_2[[2]])},df_1=list_df_k,df_2=list_mod_gam_stat)
375

  
376
df_diagnostics <- do.call(rbind,l_df_diagnostics)
377

  
378
df_diagnostics$date <- rep(unique(data_training$date),nrow(df_diagnostics))
379
df_diagnostics$month <- rep(unique(data_training$month))
380

  
381
df_diagnostics$pred_mod <-names_mod #defined earlier... 
382

  
328 383
#select the  right mode based on k-index, edf or other criteria?
329 384

  
385
#choice of the lowest k for fit and k_index > 1
386
#then use model 1
387

  
388
#choice of the highest k for fit and k_index > 1
389
#then use model 7
390

  
391
########### Use loop to fit model for for  1 to  12 ?##############
330 392
#Now function to run mod depending on conditions
331 393

  
394
#first make a function for one model (could be month)
395
j <- 7 # July
396
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]]
397
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas
398
clim_method_mod_obj$clim #file predicted
399

  
400
l_mod <- clim_method_mod_obj$mod #file predicted
401

  
402
names(clim_method_mod_obj)
403
data_training <- clim_method_mod_obj$data_month
404
clim_method_mod_obj$data_month_v
405

  
406
list_fitted_models<-vector("list",length(list_formulas))
407
k <-3 #select model 3 with LST
408
formula <-list_formulas[[k]]
409

  
410
j <- 7 # July
411
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]]
412
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas
413

  
414
l_mod <- clim_method_mod_obj$mod #file predicted
415
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas))
416
list_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas)
417
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="")
418
#we are assuming no monthly hold out...
419
#we are assuming only one specific daily prop?
420
nb_models <- length(pred_mod)
421

  
422
data_training <- clim_method_mod_obj$data_month
423

  
424
#list_fitted_models<-vector("list",length(list_formulas))
425
k <-2 #select model 3 with LST
426
names_mod <- paste("mod_",k,sep="")
427
formula <-list_formulas[[k]]
428

  
429
l_k <- c(30,10,10) #default values
430

  
431
test_df <- fit_gam_model_with_diagnostics(l_k,data_training,formula,names_mod)
432

  
433
fit_gam_model_with_diagnostics <- function(l_k,data_training,formula,names_mod){
434

  
435
  
436
  #STEP 1: fit with a range for k values
437
  
438
  #Fit models using given k values, formula and training dataset   
439
  #This is done starting at l_k/2 e.g. c(30,10,10) becomes c(15,5,5) where k is assigned to each term in the order given
440
  l_k_obj <- test_k_gam(formula,l_k,data_training)
441
  
442
  #STEP 2: get  k-index diagnostic for ech model
443
  
444
  #Now get k_index for each model and store it in a table.
445
  #function gam.check is sued
446
  
447
  list_df_k <- create_gam_check_table(l_k_obj)
448
  
449
  #STEP 3: produce additional metrics for diagnostis of fit
450
  #this includes the calculation of RMSE, MAE, bias, gcv,aic for fitted model
451
  list_mod <- l_k_obj$list_mod
452
  list_mod_gam_stat <- lapply(list_mod,FUN=extract_fitting_mod_gam_stat)
453
  
454
  #STEP 4: combine tables of diagnostics, format and add additional information  
455

  
456
  #combine tables...
457
  l_df_diagnostics <- lapply(1:length(list_df_k),FUN=function(i,df_1,df_2){cbind(df_1[[i]],df_2[[2]])},df_1=list_df_k,df_2=list_mod_gam_stat)
458
  df_diagnostics <- do.call(rbind,l_df_diagnostics)
459
  df_diagnostics$date <- rep(unique(data_training$date),nrow(df_diagnostics))
460
  df_diagnostics$month <- rep(unique(data_training$month))
461
  df_diagnostics$pred_mod <-names_mod #defined earlier... (input argutment )
462
  
463
  #Return the diagnostic table
464
  return(df_diagnostics)
465
}
466

  
332 467
################## END OF SCRIPT ###############

Also available in: Unified diff