Revision 2977cf79
Added by Benoit Parmentier over 10 years ago
climate/research/oregon/interpolation/nex_run_gam_fitting.R | ||
---|---|---|
1 |
#################################### INTERPOLATION OF TEMPERATURES ####################################### |
|
2 |
############################ Script for experimentation of GAM model fitting ############################## |
|
3 |
#This script uses the worklfow code applied to the globe. Results currently reside on NEX/PLEIADES NASA. |
|
4 |
#The purpose is to test different parameters for the GAM function to allow . |
|
5 |
#AUTHOR: Benoit Parmentier |
|
6 |
#CREATED ON: 07/30/2014 |
|
7 |
#MODIFIED ON: 07/31/2014 |
|
8 |
#Version: 1 |
|
9 |
#PROJECT: Environmental Layers project |
|
10 |
#TO DO: |
|
11 |
# - experiment with gamma |
|
12 |
# - experiment with k |
|
13 |
# |
|
14 |
################################################################################################# |
|
15 |
|
|
16 |
### Loading R library and packages |
|
17 |
#library used in the workflow production: |
|
18 |
library(gtools) # loading some useful tools |
|
19 |
library(mgcv) # GAM package by Simon Wood |
|
20 |
library(sp) # Spatial pacakge with class definition by Bivand et al. |
|
21 |
library(spdep) # Spatial pacakge with methods and spatial stat. by Bivand et al. |
|
22 |
library(rgdal) # GDAL wrapper for R, spatial utilities |
|
23 |
library(gstat) # Kriging and co-kriging by Pebesma et al. |
|
24 |
library(fields) # NCAR Spatial Interpolation methods such as kriging, splines |
|
25 |
library(raster) # Hijmans et al. package for raster processing |
|
26 |
library(gdata) # various tools with xls reading, cbindX |
|
27 |
library(rasterVis) # Raster plotting functions |
|
28 |
library(parallel) # Parallelization of processes with multiple cores |
|
29 |
library(maptools) # Tools and functions for sp and other spatial objects e.g. spCbind |
|
30 |
library(maps) # Tools and data for spatial/geographic objects |
|
31 |
library(reshape) # Change shape of object, summarize results |
|
32 |
library(plotrix) # Additional plotting functions |
|
33 |
library(plyr) # Various tools including rbind.fill |
|
34 |
library(spgwr) # GWR method |
|
35 |
library(automap) # Kriging automatic fitting of variogram using gstat |
|
36 |
library(rgeos) # Geometric, topologic library of functions |
|
37 |
#RPostgreSQL # Interface R and Postgres, not used in this script |
|
38 |
library(gridExtra) |
|
39 |
#Additional libraries not used in workflow |
|
40 |
library(pgirmess) # Krusall Wallis test with mulitple options, Kruskalmc {pgirmess} |
|
41 |
library(colorRamps) |
|
42 |
|
|
43 |
#### FUNCTION USED IN SCRIPT |
|
44 |
|
|
45 |
function_analyses_paper1 <-"contribution_of_covariates_paper_interpolation_functions_05212014.R" #first interp paper |
|
46 |
function_analyses_paper2 <-"multi_timescales_paper_interpolation_functions_05052014.R" |
|
47 |
|
|
48 |
load_obj <- function(f) |
|
49 |
{ |
|
50 |
env <- new.env() |
|
51 |
nm <- load(f, env)[1] |
|
52 |
env[[nm]] |
|
53 |
} |
|
54 |
|
|
55 |
predict_raster_model<-function(in_models,r_stack,out_filename){ |
|
56 |
|
|
57 |
#This functions performs predictions on a raster grid given input models. |
|
58 |
#Arguments: list of fitted models, raster stack of covariates |
|
59 |
#Output: spatial grid data frame of the subset of tiles |
|
60 |
|
|
61 |
list_rast_pred<-vector("list",length(in_models)) |
|
62 |
|
|
63 |
for (i in 1:length(in_models)){ |
|
64 |
|
|
65 |
mod <-in_models[[i]] #accessing GAM model ojbect "j" |
|
66 |
raster_name<-out_filename[[i]] |
|
67 |
if (inherits(mod,"gam")) { #change to c("gam","autoKrige") |
|
68 |
raster_pred<- predict(object=r_stack,model=mod,na.rm=FALSE) #Using the coeff to predict new values. |
|
69 |
names(raster_pred)<-"y_pred" |
|
70 |
writeRaster(raster_pred, filename=raster_name,overwrite=TRUE) #Writing the data in a raster file format...(IDRISI) |
|
71 |
#print(paste("Interpolation:","mod", j ,sep=" ")) |
|
72 |
list_rast_pred[[i]]<-raster_name |
|
73 |
} |
|
74 |
} |
|
75 |
if (inherits(mod,"try-error")) { |
|
76 |
print(paste("no gam model fitted:",mod[1],sep=" ")) #change message for any model type... |
|
77 |
} |
|
78 |
return(list_rast_pred) |
|
79 |
} |
|
80 |
|
|
81 |
fit_models<-function(list_formulas,data_training){ |
|
82 |
|
|
83 |
#This functions several models and returns model objects. |
|
84 |
#Arguments: - list of formulas for GAM models |
|
85 |
# - fitting data in a data.frame or SpatialPointDataFrame |
|
86 |
|
|
87 |
#Output: list of model objects |
|
88 |
|
|
89 |
list_fitted_models<-vector("list",length(list_formulas)) |
|
90 |
for (k in 1:length(list_formulas)){ |
|
91 |
formula<-list_formulas[[k]] |
|
92 |
mod<- try(gam(formula, data=data_training)) #change to any model!! |
|
93 |
#mod<- try(autoKrige(formula, input_data=data_s,new_data=s_sgdf,data_variogram=data_s)) |
|
94 |
|
|
95 |
model_name<-paste("mod",k,sep="") |
|
96 |
assign(model_name,mod) |
|
97 |
list_fitted_models[[k]]<-mod |
|
98 |
} |
|
99 |
return(list_fitted_models) |
|
100 |
} |
|
101 |
|
|
102 |
|
|
103 |
create_dir_fun <- function(out_dir,out_suffix){ |
|
104 |
if(!is.null(out_suffix)){ |
|
105 |
out_name <- paste("output_",out_suffix,sep="") |
|
106 |
out_dir <- file.path(out_dir,out_name) |
|
107 |
} |
|
108 |
#create if does not exists: create the output dir as defined |
|
109 |
if(!file.exists(out_dir)){ |
|
110 |
dir.create(out_dir) |
|
111 |
} |
|
112 |
return(out_dir) |
|
113 |
} |
|
114 |
|
|
115 |
extract_list_from_list_obj<-function(obj_list,list_name){ |
|
116 |
library(plyr) |
|
117 |
|
|
118 |
#Create a list of an object from a given list of object using a name prodived as input |
|
119 |
|
|
120 |
list_tmp<-vector("list",length(obj_list)) |
|
121 |
for (i in 1:length(obj_list)){ |
|
122 |
tmp <- obj_list[[i]][[list_name]] #double bracket to return data.frame |
|
123 |
list_tmp[[i]] <- as.data.frame(tmp) #deal with spdf... |
|
124 |
} |
|
125 |
return(list_tmp) #this is a data.frame |
|
126 |
} |
|
127 |
|
|
128 |
#This extract a data.frame object from raster prediction obj and combine them in one data.frame |
|
129 |
extract_from_list_obj<-function(obj_list,list_name){ |
|
130 |
#extract object from list of list. This useful for raster_prediction_obj |
|
131 |
library(plyr) |
|
132 |
|
|
133 |
list_tmp<-vector("list",length(obj_list)) |
|
134 |
for (i in 1:length(obj_list)){ |
|
135 |
tmp<-obj_list[[i]][[list_name]] #double bracket to return data.frame |
|
136 |
list_tmp[[i]]<- as.data.frame(tmp) #if spdf |
|
137 |
} |
|
138 |
tb_list_tmp<-do.call(rbind.fill,list_tmp) #long rownames |
|
139 |
#tb_list_tmp<-do.call(rbind,list_tmp) #long rownames |
|
140 |
|
|
141 |
return(tb_list_tmp) #this is a data.frame |
|
142 |
} |
|
143 |
|
|
144 |
#scp -rp raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
|
145 |
#scp -rp reg*.tif parmentier@atlas.nceas.ucsb.edu:/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
|
146 |
|
|
147 |
in_dir <- "/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
|
148 |
raster_obj_infile <- "raster_prediction_obj_gam_CAI_dailyTmax15.0_20.0.RData" |
|
149 |
|
|
150 |
setwd(in_dir) |
|
151 |
|
|
152 |
raster_obj<- load_obj(raster_obj_infile) |
|
153 |
|
|
154 |
#raster_obj <- load_obj(unlist(raster_obj_file)) #may not need unlist |
|
155 |
nb_models <- length((raster_obj$clim_method_mod_obj[[1]]$formulas)) |
|
156 |
l_formulas <- (raster_obj$clim_method_mod_obj[[1]]$formulas) |
|
157 |
|
|
158 |
#y_var ~ s(lat, lon) + s(elev_s) |
|
159 |
#y_var ~ s(lat, lon) + s(elev_s) + s(LST) |
|
160 |
#y_var ~ s(lat, lon) + s(elev_s) + s(N_w, E_w) + s(LST) + ti(LST,LC1) + s(LC1) |
|
161 |
|
|
162 |
pred_mod <- paste("mod",c(1:nb_models,"_kr"),sep="") |
|
163 |
#we are assuming no monthly hold out... |
|
164 |
#we are assuming only one specific daily prop? |
|
165 |
nb_models <- length(pred_mod) |
|
166 |
|
|
167 |
j <- 7 |
|
168 |
clim_method_mod_obj <- raster_obj$clim_method_mod_obj[[j]] |
|
169 |
#this is made of "clim",data_month, data_month_v , sampling_month_dat, mod and formulas |
|
170 |
clim_method_mod_obj$clim #file predicted |
|
171 |
|
|
172 |
l_mod <- clim_method_mod_obj$mod #file predicted |
|
173 |
|
|
174 |
reg_rast <- stack(list.files(pattern="*.tif")) |
|
175 |
plot(reg_rast,y=15) |
|
176 |
|
|
177 |
names(clim_method_mod_obj) |
|
178 |
clim_method_mod_obj$data_month |
|
179 |
clim_method_mod_obj$data_month_v |
|
180 |
|
|
181 |
## check for residual pattern, removeable by increasing `k' |
|
182 |
## typically `k', below, chould be substantially larger than |
|
183 |
## the original, `k' but certainly less than n/2. |
|
184 |
vis.gam(mod1) |
|
185 |
vis.gam(mod1,view=c("lat","lon"),theta= 35) # plot against by variable |
|
186 |
#http://stats.stackexchange.com/questions/12223/how-to-tune-smoothing-in-mgcv-gam-model |
|
187 |
|
|
188 |
fit_models<-function(list_formulas,data_training){ |
|
189 |
|
|
190 |
#This functions several models and returns model objects. |
|
191 |
#Arguments: - list of formulas for GAM models |
|
192 |
# - fitting data in a data.frame or SpatialPointDataFrame |
|
193 |
|
|
194 |
#Output: list of model objects |
|
195 |
|
|
196 |
list_fitted_models<-vector("list",length(list_formulas)) |
|
197 |
for (k in 1:length(list_formulas)){ |
|
198 |
formula<-list_formulas[[k]] |
|
199 |
mod<- try(gam(formula, data=data_training)) #change to any model!! |
|
200 |
mod<- try(gam(formula, data=data_training,k=5)) #change to any model!! |
|
201 |
mod<- try(gam(y_var ~ s(lat, lon,k=14) + s(elev_s) + s(LST), data=data_training)) #change to any model!! |
|
202 |
|
|
203 |
#mod<- try(autoKrige(formula, input_data=data_s,new_data=s_sgdf,data_variogram=data_s)) |
|
204 |
|
|
205 |
model_name<-paste("mod",k,sep="") |
|
206 |
assign(model_name,mod) |
|
207 |
list_fitted_models[[k]]<-mod |
|
208 |
} |
|
209 |
return(list_fitted_models) |
|
210 |
} |
|
211 |
|
|
212 |
mod1<- try(gam(y_var ~ s(lat, lon,k=22) + s(elev_s) + s(LST), data=data_training)) #change to any model!! |
|
213 |
|
|
214 |
> sink("test.txt") |
|
215 |
> (gam.check(mod1)) |
|
216 |
> getwd(0) |
|
217 |
Error in getwd(0) : unused argument (0) |
|
218 |
> getwd() |
|
219 |
> sink() |
|
220 |
> getwd(0) |
|
221 |
Error in getwd(0) : unused argument (0) |
|
222 |
> getwd() |
|
223 |
[1] "/data/project/layers/commons/NEX_data/output_regions/15.0_20.0" |
|
224 |
> system("more test.txt") |
|
225 |
|
|
226 |
format_formula_k_fun <- function(formula,l_k){ |
|
227 |
#l_k: list of k parameter for GAM method with the value of k and the name of the variable |
|
228 |
#split the formula by term |
|
229 |
#add k parameter to formula |
|
230 |
names(l_k) <- attr(terms(formula),"term.labels") |
|
231 |
var_names <- all.vars(formula) |
|
232 |
term_names <- attr(terms(formula),"term.labels") |
|
233 |
term_names_tmp <- strsplit(term_names,")") |
|
234 |
term_names_tmp <- paste(term_names_tmp,", k=",l_k,")",sep="") |
|
235 |
term_names_tmp <- paste(term_names_tmp,collapse= " + ") |
|
236 |
formula_k <- as.formula(paste(var_names[1] ,"~",term_names_tmp)) |
|
237 |
#nb_covar <- length(var_names) - 1 |
|
238 |
#paste(var_names[2:(nb_covar+1)]),l_k) |
|
239 |
return(formula_k) |
|
240 |
#mod <- try(gam(y_var ~ s(lat, lon,k=22) + s(elev_s,k=10) + s(LST,k=10), data=data_training)) #change to any model!! |
|
241 |
#mod <- try(gam(formula_k, data=data_training)) #change to any model!! |
|
242 |
} |
|
243 |
|
|
244 |
test_k_gam <- function(formula,l_k){ |
|
245 |
|
|
246 |
l_k_tmp <- as.numeric(l_k)/2 |
|
247 |
#l_k_tmp <- l_k |
|
248 |
#maybe go in a loop to fit? could this in a loop for each term then select the term with highest k and k_index >1 ?? |
|
249 |
#list_mod <- vector("list",0) |
|
250 |
#list_l_k <- vector("list",0) |
|
251 |
list_mod <- list() |
|
252 |
list_l_k <- list() |
|
253 |
|
|
254 |
j<- 1 |
|
255 |
formula_k<-format_formula_k_fun(formula,l_k_tmp) |
|
256 |
mod <- try(gam(formula_k, data=data_training)) #change to any model!! |
|
257 |
while(!inherits(mod,"try-error")){ |
|
258 |
formula_k<-format_formula_k_fun(formula,l_k_tmp) |
|
259 |
mod <- try(gam(formula_k, data=data_training)) #change to any model!! |
|
260 |
list_l_k[[j]] <- l_k_tmp |
|
261 |
list_mod[[j]] <- mod |
|
262 |
l_k_tmp <- l_k_tmp + rep(1,length(l_k_tmp)) |
|
263 |
j <- j+1 |
|
264 |
limit_l_k <- (l_k_tmp > l_k) #now check that c(30,10,10 is not gone above!!) |
|
265 |
for (i in 1:length(limit_l_k)){ |
|
266 |
if (l_k_tmp[i] >= l_k[i]){ |
|
267 |
l_k_tmp[i] <- l_k[i] |
|
268 |
} |
|
269 |
} |
|
270 |
} |
|
271 |
#store mod and l_k in a object |
|
272 |
l_k_obj <- list(list_mod,l_k_tmp) |
|
273 |
names(l_k_obj) <- c("list_mod","l_k_tmp") |
|
274 |
|
|
275 |
return(l_k_obj) |
|
276 |
} |
|
277 |
|
|
278 |
#d |
|
279 |
#Now get k_index for each model and store it in a table. |
|
280 |
#function gam.check with |
|
281 |
#select the right mode based on k-index, edf or other criteria? |
|
282 |
|
|
283 |
l_k <- vector("list",3) |
|
284 |
l_k <- list(25,10,10) |
|
285 |
l_k <- c(30,10,10) |
|
286 |
|
|
287 |
|
|
288 |
#1. Fit with standard settings: |
|
289 |
#(Start at k=10 or k=30 if 2 interactive term.) |
|
290 |
#start with the highest k first... |
|
291 |
#If does not fit then |
|
292 |
|
|
293 |
#mod <- try(gam(formula_k, data=data_training)) #change to any model!! |
|
294 |
|
|
295 |
# K=15 for interactive term and k=5 unique term: |
|
296 |
# - do gam.check: |
|
297 |
# If k-index < 1 |
|
298 |
# Then increase k by 1 for interactive term and fit again |
|
299 |
# If k-index >1 or k=10 for unique term and k=30 |
|
300 |
#Breakā¦ |
|
301 |
|
|
302 |
|
|
303 |
#Now function to run mod depending on conditions |
Also available in: Unified diff
initial commit, script for gam fitting experimentation - fitting in low stations area