Project

General

Profile

« Previous | Next » 

Revision 2977cf79

Added by Benoit Parmentier over 10 years ago

initial commit, script for gam fitting experimentation - fitting in low stations area

View differences:

climate/research/oregon/interpolation/nex_run_gam_fitting.R
1
####################################  INTERPOLATION OF TEMPERATURES  #######################################
2
############################  Script for experimentation of GAM model fitting ##############################
3
#This script uses the worklfow code applied to the globe. Results currently reside on NEX/PLEIADES NASA.
4
#The purpose is to test different parameters for the GAM function to allow . 
5
#AUTHOR: Benoit Parmentier 
6
#CREATED ON: 07/30/2014  
7
#MODIFIED ON: 07/31/2014            
8
#Version: 1
9
#PROJECT: Environmental Layers project  
10
#TO DO:
11
# - experiment with gamma
12
# - experiment with k
13
#
14
#################################################################################################
15

  
16
### Loading R library and packages        
17
#library used in the workflow production:
18
library(gtools)                              # loading some useful tools 
19
library(mgcv)                                # GAM package by Simon Wood
20
library(sp)                                  # Spatial pacakge with class definition by Bivand et al.
21
library(spdep)                               # Spatial pacakge with methods and spatial stat. by Bivand et al.
22
library(rgdal)                               # GDAL wrapper for R, spatial utilities
23
library(gstat)                               # Kriging and co-kriging by Pebesma et al.
24
library(fields)                              # NCAR Spatial Interpolation methods such as kriging, splines
25
library(raster)                              # Hijmans et al. package for raster processing
26
library(gdata)                               # various tools with xls reading, cbindX
27
library(rasterVis)                           # Raster plotting functions
28
library(parallel)                            # Parallelization of processes with multiple cores
29
library(maptools)                            # Tools and functions for sp and other spatial objects e.g. spCbind
30
library(maps)                                # Tools and data for spatial/geographic objects
31
library(reshape)                             # Change shape of object, summarize results 
32
library(plotrix)                             # Additional plotting functions
33
library(plyr)                                # Various tools including rbind.fill
34
library(spgwr)                               # GWR method
35
library(automap)                             # Kriging automatic fitting of variogram using gstat
36
library(rgeos)                               # Geometric, topologic library of functions
37
#RPostgreSQL                                 # Interface R and Postgres, not used in this script
38
library(gridExtra)
39
#Additional libraries not used in workflow
40
library(pgirmess)                            # Krusall Wallis test with mulitple options, Kruskalmc {pgirmess}  
41
library(colorRamps)
42

  
43
#### FUNCTION USED IN SCRIPT
44

  
45
function_analyses_paper1 <-"contribution_of_covariates_paper_interpolation_functions_05212014.R" #first interp paper
46
function_analyses_paper2 <-"multi_timescales_paper_interpolation_functions_05052014.R"
47

  
48
load_obj <- function(f)
49
{
50
  env <- new.env()
51
  nm <- load(f, env)[1]
52
  env[[nm]]
53
}
54

  
55
predict_raster_model<-function(in_models,r_stack,out_filename){
56

  
57
  #This functions performs predictions on a raster grid given input models.
58
  #Arguments: list of fitted models, raster stack of covariates
59
  #Output: spatial grid data frame of the subset of tiles
60

  
61
  list_rast_pred<-vector("list",length(in_models))
62

  
63
  for (i in 1:length(in_models)){
64

  
65
    mod <-in_models[[i]] #accessing GAM model ojbect "j"
66
    raster_name<-out_filename[[i]]
67
    if (inherits(mod,"gam")) {           #change to c("gam","autoKrige")
68
      raster_pred<- predict(object=r_stack,model=mod,na.rm=FALSE) #Using the coeff to predict new values.
69
      names(raster_pred)<-"y_pred"  
70
      writeRaster(raster_pred, filename=raster_name,overwrite=TRUE)  #Writing the data in a raster file format...(IDRISI)
71
      #print(paste("Interpolation:","mod", j ,sep=" "))
72
      list_rast_pred[[i]]<-raster_name
73
    }
74
  }
75
  if (inherits(mod,"try-error")) {
76
    print(paste("no gam model fitted:",mod[1],sep=" ")) #change message for any model type...
77
  }
78
  return(list_rast_pred)
79
}
80

  
81
fit_models<-function(list_formulas,data_training){
82

  
83
  #This functions several models and returns model objects.
84
  #Arguments: - list of formulas for GAM models
85
  #           - fitting data in a data.frame or SpatialPointDataFrame
86

  
87
  #Output: list of model objects 
88

  
89
  list_fitted_models<-vector("list",length(list_formulas))
90
  for (k in 1:length(list_formulas)){
91
    formula<-list_formulas[[k]]
92
    mod<- try(gam(formula, data=data_training)) #change to any model!!
93
    #mod<- try(autoKrige(formula, input_data=data_s,new_data=s_sgdf,data_variogram=data_s))
94

  
95
    model_name<-paste("mod",k,sep="")
96
    assign(model_name,mod) 
97
    list_fitted_models[[k]]<-mod
98
  }
99
  return(list_fitted_models) 
100
}
101

  
102

  
103
create_dir_fun <- function(out_dir,out_suffix){
104
  if(!is.null(out_suffix)){
105
    out_name <- paste("output_",out_suffix,sep="")
106
    out_dir <- file.path(out_dir,out_name)
107
  }
108
  #create if does not exists: create the output dir as defined 
109
  if(!file.exists(out_dir)){
110
    dir.create(out_dir)
111
  }
112
  return(out_dir)
113
}
114

  
115
extract_list_from_list_obj<-function(obj_list,list_name){
116
  library(plyr)
117
  
118
  #Create a list of an object from a given list of object using a name prodived as input
119
  
120
  list_tmp<-vector("list",length(obj_list))
121
  for (i in 1:length(obj_list)){
122
    tmp <- obj_list[[i]][[list_name]] #double bracket to return data.frame
123
    list_tmp[[i]] <- as.data.frame(tmp) #deal with spdf...
124
  }
125
  return(list_tmp) #this is  a data.frame
126
}
127

  
128
#This extract a data.frame object from raster prediction obj and combine them in one data.frame 
129
extract_from_list_obj<-function(obj_list,list_name){
130
  #extract object from list of list. This useful for raster_prediction_obj
131
  library(plyr)
132
  
133
  list_tmp<-vector("list",length(obj_list))
134
  for (i in 1:length(obj_list)){
135
    tmp<-obj_list[[i]][[list_name]] #double bracket to return data.frame
136
    list_tmp[[i]]<- as.data.frame(tmp) #if spdf
137
  }
138
  tb_list_tmp<-do.call(rbind.fill,list_tmp) #long rownames
139
  #tb_list_tmp<-do.call(rbind,list_tmp) #long rownames
140
  
141
  return(tb_list_tmp) #this is  a data.frame
142
}
143

  
144
#scp -rp raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
145
#scp -rp reg*.tif parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
146

  
147
in_dir <- "/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
148
raster_obj_infile <- "raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData"
149

  
150
setwd(in_dir)
151

  
152
raster_obj<- load_obj(raster_obj_infile)
153

  
154
#raster_obj <- load_obj(unlist(raster_obj_file)) #may not need unlist
155
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas))
156
l_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas)
157

  
158
#y_var ~ s(lat, lon) + s(elev_s)
159
#y_var ~ s(lat, lon) + s(elev_s) + s(LST)
160
#y_var ~ s(lat, lon) + s(elev_s) + s(N_w, E_w) + s(LST) + ti(LST,LC1) + s(LC1)
161

  
162
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="")
163
#we are assuming no monthly hold out...
164
#we are assuming only one specific daily prop?
165
nb_models <- length(pred_mod)
166

  
167
j <- 7
168
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]]
169
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas
170
clim_method_mod_obj$clim #file predicted
171

  
172
l_mod <- clim_method_mod_obj$mod #file predicted
173

  
174
reg_rast <- stack(list.files(pattern="*.tif"))
175
plot(reg_rast,y=15)
176

  
177
names(clim_method_mod_obj)
178
clim_method_mod_obj$data_month
179
clim_method_mod_obj$data_month_v
180

  
181
## check for residual pattern, removeable by increasing `k'
182
## typically `k', below, chould be substantially larger than 
183
## the original, `k' but certainly less than n/2.
184
vis.gam(mod1)
185
vis.gam(mod1,view=c("lat","lon"),theta= 35) # plot against by variable
186
#http://stats.stackexchange.com/questions/12223/how-to-tune-smoothing-in-mgcv-gam-model
187

  
188
fit_models<-function(list_formulas,data_training){
189

  
190
  #This functions several models and returns model objects.
191
  #Arguments: - list of formulas for GAM models
192
  #           - fitting data in a data.frame or SpatialPointDataFrame
193

  
194
  #Output: list of model objects 
195

  
196
  list_fitted_models<-vector("list",length(list_formulas))
197
  for (k in 1:length(list_formulas)){
198
    formula<-list_formulas[[k]]
199
    mod<- try(gam(formula, data=data_training)) #change to any model!!
200
    mod<- try(gam(formula, data=data_training,k=5)) #change to any model!!
201
    mod<- try(gam(y_var ~ s(lat, lon,k=14) + s(elev_s) + s(LST), data=data_training)) #change to any model!!
202

  
203
    #mod<- try(autoKrige(formula, input_data=data_s,new_data=s_sgdf,data_variogram=data_s))
204

  
205
    model_name<-paste("mod",k,sep="")
206
    assign(model_name,mod) 
207
    list_fitted_models[[k]]<-mod
208
  }
209
  return(list_fitted_models) 
210
}
211

  
212
mod1<- try(gam(y_var ~ s(lat, lon,k=22) + s(elev_s) + s(LST), data=data_training)) #change to any model!!
213

  
214
> sink("test.txt")
215
> (gam.check(mod1))
216
> getwd(0)
217
Error in getwd(0) : unused argument (0)
218
> getwd()
219
> sink()
220
> getwd(0)
221
Error in getwd(0) : unused argument (0)
222
> getwd()
223
[1] "/data/project/layers/commons/NEX_data/output_regions/15.0_20.0"
224
> system("more test.txt")
225

  
226
format_formula_k_fun <- function(formula,l_k){
227
  #l_k: list of k parameter for GAM method with the value of k and the name of the variable
228
  #split the formula by term 
229
  #add k parameter to formula
230
  names(l_k) <- attr(terms(formula),"term.labels")
231
  var_names <- all.vars(formula)
232
  term_names <- attr(terms(formula),"term.labels")
233
  term_names_tmp <- strsplit(term_names,")")
234
  term_names_tmp <- paste(term_names_tmp,", k=",l_k,")",sep="")
235
  term_names_tmp <- paste(term_names_tmp,collapse= " + ")
236
  formula_k <- as.formula(paste(var_names[1] ,"~",term_names_tmp))
237
  #nb_covar <- length(var_names) - 1
238
  #paste(var_names[2:(nb_covar+1)]),l_k)
239
  return(formula_k)
240
  #mod <- try(gam(y_var ~ s(lat, lon,k=22) + s(elev_s,k=10) + s(LST,k=10), data=data_training)) #change to any model!!
241
  #mod <- try(gam(formula_k, data=data_training)) #change to any model!!
242
}
243

  
244
test_k_gam <- function(formula,l_k){
245

  
246
  l_k_tmp <- as.numeric(l_k)/2
247
  #l_k_tmp <- l_k
248
  #maybe go in  a loop to  fit? could this in a loop for each term then select the term with highest k and k_index >1 ??
249
  #list_mod <- vector("list",0)
250
  #list_l_k <- vector("list",0)
251
  list_mod <- list()
252
  list_l_k <- list()
253

  
254
  j<- 1
255
  formula_k<-format_formula_k_fun(formula,l_k_tmp)
256
  mod <- try(gam(formula_k, data=data_training)) #change to any model!! 
257
  while(!inherits(mod,"try-error")){
258
    formula_k<-format_formula_k_fun(formula,l_k_tmp)
259
    mod <- try(gam(formula_k, data=data_training)) #change to any model!! 
260
    list_l_k[[j]] <- l_k_tmp
261
    list_mod[[j]] <- mod
262
    l_k_tmp <- l_k_tmp + rep(1,length(l_k_tmp))
263
    j <- j+1
264
    limit_l_k <- (l_k_tmp > l_k) #now check that c(30,10,10 is not gone above!!)
265
    for (i in 1:length(limit_l_k)){
266
      if (l_k_tmp[i] >= l_k[i]){
267
        l_k_tmp[i] <- l_k[i]
268
      }
269
    }
270
  }
271
  #store mod and l_k in a object
272
  l_k_obj <- list(list_mod,l_k_tmp)
273
  names(l_k_obj) <- c("list_mod","l_k_tmp")
274
  
275
  return(l_k_obj)
276
}
277

  
278
#d
279
#Now get k_index for each model and store it in a table.
280
#function gam.check with
281
#select the  right mode based on k-index, edf or other criteria?
282

  
283
l_k <- vector("list",3)
284
l_k <- list(25,10,10)
285
l_k <- c(30,10,10)
286

  
287

  
288
#1. Fit with standard settings:
289
#(Start at k=10 or k=30 if 2 interactive term.)
290
#start with the highest k first...
291
#If  does not fit then
292

  
293
#mod <- try(gam(formula_k, data=data_training)) #change to any model!!
294

  
295
#         K=15 for interactive term and k=5 unique term:
296
#         - do gam.check:
297
#                    If k-index < 1
298
#                     Then increase k by 1 for interactive term and  fit again
299
#          If k-index >1 or k=10 for unique term and k=30  
300
#Breakā€¦
301

  
302

  
303
#Now function to run mod depending on conditions

Also available in: Unified diff