Revision d13c7dc1
Added by Benoit Parmentier almost 12 years ago
climate/research/oregon/interpolation/GAM_CAI_function_multisampling.R | ||
---|---|---|
1 | 1 |
runGAMCAI <- function(i) { # loop over dates |
2 | 2 |
|
3 | 3 |
#date<-strptime(dates[i], "%Y%m%d") # interpolation date being processed |
4 |
date<-strptime(sampling_dat$date[i], "%Y%m%d") # interpolation date being processed |
|
4 |
date<-strptime(sampling_dat$date[i], "%Y%m%d") # interpolation date being processed, converting the string using specific format
|
|
5 | 5 |
month<-strftime(date, "%m") # current month of the date being processed |
6 |
LST_month<-paste("mm_",month,sep="") # name of LST month to be matched |
|
6 |
LST_month<-paste("mm_",month,sep="") # name of LST month to be matched in the raster stack of covariates and data.frame
|
|
7 | 7 |
|
8 | 8 |
#Adding layer LST to the raster stack |
9 | 9 |
|
10 |
pos<-match("LST",layerNames(s_raster)) #Find the position of the layer with name "LST", if not present pos=NA |
|
11 |
s_raster<-dropLayer(s_raster,pos) # If it exists drop layer |
|
10 | 12 |
pos<-match(LST_month,layerNames(s_raster)) #Find column with the current month for instance mm12 |
11 | 13 |
r1<-raster(s_raster,layer=pos) #Select layer from stack |
12 | 14 |
layerNames(r1)<-"LST" |
... | ... | |
15 | 17 |
###Regression part 1: Creating a validation dataset by creating training and testing datasets |
16 | 18 |
|
17 | 19 |
mod_LST <-ghcn.subsets[[i]][,match(LST_month, names(ghcn.subsets[[i]]))] #Match interpolation date and monthly LST average |
18 |
ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod_LST) #Add the variable LST to the subset dataset |
|
20 |
ghcn.subsets[[i]] <- transform(ghcn.subsets[[i]],LST = mod_LST) #Add the variable LST to the subset dataset |
|
21 |
dst$LST<-dst[[LST_month]] #Add also to monthly dataset |
|
22 |
|
|
19 | 23 |
#n<-nrow(ghcn.subsets[[i]]) |
20 | 24 |
#ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
21 | 25 |
#nv<-n-ns #create a sample for validation with prop of the rows |
... | ... | |
37 | 41 |
datelabel=format(ISOdate(year,mo,day),"%b %d, %Y") |
38 | 42 |
|
39 | 43 |
########### |
40 |
# STEP 1 - LST 10 year monthly averages: THIS IS NOT USED IN CAE
|
|
44 |
# STEP 1 - LST 10 year monthly averages: THIS IS NOT USED IN CAI method
|
|
41 | 45 |
########### |
42 | 46 |
|
43 | 47 |
themolst<-raster(molst,mo) #current month being processed saved in a raster image |
... | ... | |
92 | 96 |
pos<-match("station",names(d)) #Find column with name "value" |
93 | 97 |
names(d)[pos]<-c("id") |
94 | 98 |
names(x)[pos]<-c("id") |
95 |
names(modst)[1]<-c("id") #modst contains the average tmax per month for every stations... |
|
96 |
dmoday=merge(modst,d,by="id") #LOOSING DATA HERE!!! from 113 t0 103 |
|
97 |
xmoday=merge(modst,x,by="id") #LOOSING DATA HERE!!! from 48 t0 43 |
|
98 |
names(dmoday)[4]<-c("lat") |
|
99 |
names(dmoday)[5]<-c("lon") #dmoday contains all the the information: BIAS, monn |
|
100 |
names(xmoday)[4]<-c("lat") |
|
101 |
names(xmoday)[5]<-c("lon") #dmoday contains all the the information: BIAS, monn |
|
99 |
names(modst)[1]<-c("id") #modst contains the average tmax per month for every stations...it has 193 rows |
|
100 |
|
|
101 |
dmoday=merge(modst,d,by="id",suffixes=c("",".y2")) #LOOSING DATA HERE!!! from 113 t0 103 |
|
102 |
xmoday=merge(modst,x,by="id",suffixes=c("",".y2")) #LOOSING DATA HERE!!! from 48 t0 43 |
|
103 |
mod_pat<-glob2rx("*.y2") |
|
104 |
var_pat<-grep(mod_pat,names(dmoday),value=FALSE) # using grep with "value" extracts the matching names |
|
105 |
dmoday<-dmoday[,-var_pat] |
|
106 |
mod_pat<-glob2rx("*.y2") |
|
107 |
var_pat<-grep(mod_pat,names(xmoday),value=FALSE) # using grep with "value" extracts the matching names |
|
108 |
xmoday<-xmoday[,-var_pat] #Removing duplicate columns |
|
109 |
|
|
110 |
#dmoday=merge(modst,d,by="id") #LOOSING DATA HERE!!! from 113 t0 103 |
|
111 |
#xmoday=merge(modst,x,by="id") #LOOSING DATA HERE!!! from 48 t0 43 |
|
112 |
#names(dmoday)[4]<-c("lat") |
|
113 |
#names(dmoday)[5]<-c("lon") #dmoday contains all the the information: BIAS, monn |
|
114 |
#names(xmoday)[4]<-c("lat") |
|
115 |
#names(xmoday)[5]<-c("lon") #dmoday contains all the the information: BIAS, monn |
|
102 | 116 |
|
103 | 117 |
data_v<-xmoday |
104 | 118 |
### |
... | ... | |
128 | 142 |
|
129 | 143 |
#Adding options to use only training stations: 07/11/2012 |
130 | 144 |
bias_xy<-project(as.matrix(sta_lola),proj_str) |
131 |
clim_xy<-project(as.matrix(sta_lola),proj_str) |
|
145 |
clim_xy<-project(as.matrix(sta_lola),proj_str) #This is the coordinates of monthly station location (193)
|
|
132 | 146 |
#bias_xy2=project(as.matrix(c(dmoday$lon,dmoday$lat),proj_str) |
133 | 147 |
if(bias_val==1){ |
134 |
sta_bias<-dmoday$LSTD_bias |
|
135 |
bias_xy<-cbind(dmoday$x_OR83M,dmoday$y_OR83M) |
|
148 |
sta_bias<-dmoday$LSTD_bias
|
|
149 |
bias_xy<-cbind(dmoday$x_OR83M,dmoday$y_OR83M) #This will use only stations from training daily samples for climatology step if bias_val=1
|
|
136 | 150 |
} |
137 | 151 |
|
138 |
sta_clim<-modst$TMax #This contains the monthly climatology... |
|
152 |
sta_clim<-modst$TMax #This contains the monthly climatology...used in the prediction of the monthly surface
|
|
139 | 153 |
|
140 | 154 |
#fitbias<-Krig(bias_xy,sta_bias,theta=1e5) #use TPS or krige |
141 | 155 |
fitclim<-Krig(clim_xy,sta_clim,theta=1e5) |
... | ... | |
156 | 170 |
#US(add=T,col="magenta",lwd=2) |
157 | 171 |
|
158 | 172 |
########## |
159 |
# STEP 7 - interpolate delta across space |
|
173 |
# STEP 7 - interpolate delta across space: this is the daily deviation from the monthly average
|
|
160 | 174 |
########## |
161 | 175 |
|
162 | 176 |
daily_sta_lola=dmoday[,c("lon","lat")] #could be same as before but why assume merge does this - assume not |
163 | 177 |
daily_sta_xy=project(as.matrix(daily_sta_lola),proj_str) |
164 | 178 |
daily_delta=dmoday$dailyTmax-dmoday$TMax |
165 | 179 |
|
166 |
daily_deltaclim<-dmoday$dailyTmax-dmoday$TMax |
|
167 |
daily_deltaclim_v<-data_v$dailyTmax-data_v$TMax #For validation |
|
180 |
daily_deltaclim<-dmoday$dailyTmax-dmoday$TMax #For daily surface interpolation...
|
|
181 |
daily_deltaclim_v<-data_v$dailyTmax-data_v$TMax #For validation...
|
|
168 | 182 |
#dmoday$daily_deltaclim <-daily_deltaclim |
169 | 183 |
#fitdelta<-Tps(daily_sta_xy,daily_delta) #use TPS or krige |
170 | 184 |
fitdelta<-Krig(daily_sta_xy,daily_delta,theta=1e5) #use TPS or krige |
... | ... | |
193 | 207 |
#### Added by Benoit ends |
194 | 208 |
|
195 | 209 |
######### |
196 |
# STEP 8 - assemble final answer - T= LST+Bias(interpolated)+delta(interpolated)
|
|
197 |
# T= clim(interpolated) + deltaclim(interpolated) |
|
210 |
# STEP 8 - assemble final answer - T= LST-Bias(interpolated)+delta(interpolated) (This is for fusion not implemented in this script...)
|
|
211 |
# T= clim(interpolated) + deltaclim(interpolated) (This is for CAI)
|
|
198 | 212 |
######### |
199 | 213 |
|
200 | 214 |
#bias_rast=interpolate(themolst,fitbias) #interpolation using function from raster package |
... | ... | |
263 | 277 |
resid=sta_pred-tmax |
264 | 278 |
#quilt.plot(daily_sta_lola,resid) |
265 | 279 |
|
266 |
|
|
267 |
|
|
280 |
|
|
268 | 281 |
###BEFORE GAM prediction the data object must be transformed to SDF |
269 | 282 |
|
270 | 283 |
coords<- data_v[,c('x_OR83M','y_OR83M')] |
... | ... | |
273 | 286 |
coords<- data_s[,c('x_OR83M','y_OR83M')] |
274 | 287 |
coordinates(data_s)<-coords |
275 | 288 |
proj4string(data_s)<-CRS #Need to assign coordinates.. |
289 |
coords<- modst[,c('x_OR83M','y_OR83M')] |
|
290 |
coordinates(modst)<-coords |
|
291 |
proj4string(modst)<-CRS #Need to assign coordinates.. |
|
276 | 292 |
|
277 | 293 |
ns<-nrow(data_s) #This is added to because some loss of data might have happened because of the averaging... |
278 | 294 |
nv<-nrow(data_v) |
... | ... | |
285 | 301 |
data_s$y_var<-data_s$daily_deltaclim |
286 | 302 |
data_v$y_var<-data_v$daily_deltaclim |
287 | 303 |
|
288 |
if (climgam==1){ |
|
304 |
if (climgam==1){ #This is an option to use covariates in the daily surface...
|
|
289 | 305 |
data_s$y_var<-data_s$TMax |
290 | 306 |
data_v$y_var<-data_v$TMax |
307 |
data_month<-modst |
|
308 |
data_month$y_var<-modst$TMax |
|
291 | 309 |
} |
292 | 310 |
|
293 | 311 |
#Model and response variable can be changed without affecting the script |
... | ... | |
301 | 319 |
formula7 <- as.formula("y_var~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST)+s(LC3)", env=.GlobalEnv) |
302 | 320 |
formula8 <- as.formula("y_var~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST) + s(LC1,LC3)", env=.GlobalEnv) |
303 | 321 |
|
304 |
mod1<- try(gam(formula1, data=data_s)) |
|
305 |
mod2<- try(gam(formula2, data=data_s)) #modified nesting....from 3 to 2 |
|
306 |
mod3<- try(gam(formula3, data=data_s)) |
|
307 |
mod4<- try(gam(formula4, data=data_s)) |
|
308 |
mod5<- try(gam(formula5, data=data_s)) |
|
309 |
mod6<- try(gam(formula6, data=data_s)) |
|
310 |
mod7<- try(gam(formula7, data=data_s)) |
|
311 |
mod8<- try(gam(formula8, data=data_s)) |
|
322 |
#mod1<- try(gam(formula1, data=data_s))
|
|
323 |
#mod2<- try(gam(formula2, data=data_s)) #modified nesting....from 3 to 2
|
|
324 |
#mod3<- try(gam(formula3, data=data_s))
|
|
325 |
#mod4<- try(gam(formula4, data=data_s))
|
|
326 |
#mod5<- try(gam(formula5, data=data_s))
|
|
327 |
#mod6<- try(gam(formula6, data=data_s))
|
|
328 |
#mod7<- try(gam(formula7, data=data_s))
|
|
329 |
#mod8<- try(gam(formula8, data=data_s))
|
|
312 | 330 |
|
331 |
if (climgam==1){ #This will automatically use monthly station data in the second step |
|
332 |
mod1<- try(gam(formula1, data=data_month)) |
|
333 |
mod2<- try(gam(formula2, data=data_month)) #modified nesting....from 3 to 2 |
|
334 |
mod3<- try(gam(formula3, data=data_month)) |
|
335 |
mod4<- try(gam(formula4, data=data_month)) |
|
336 |
mod5<- try(gam(formula5, data=data_month)) |
|
337 |
mod6<- try(gam(formula6, data=data_month)) |
|
338 |
mod7<- try(gam(formula7, data=data_month)) |
|
339 |
mod8<- try(gam(formula8, data=data_month)) |
|
340 |
|
|
341 |
} else if (climgam==0){ #This will use daily delta in the second step |
|
342 |
|
|
343 |
mod1<- try(gam(formula1, data=data_s)) |
|
344 |
mod2<- try(gam(formula2, data=data_s)) #modified nesting....from 3 to 2 |
|
345 |
mod3<- try(gam(formula3, data=data_s)) |
|
346 |
mod4<- try(gam(formula4, data=data_s)) |
|
347 |
mod5<- try(gam(formula5, data=data_s)) |
|
348 |
mod6<- try(gam(formula6, data=data_s)) |
|
349 |
mod7<- try(gam(formula7, data=data_s)) |
|
350 |
mod8<- try(gam(formula8, data=data_s)) |
|
351 |
} |
|
352 |
|
|
313 | 353 |
### Added by benoit |
314 | 354 |
#Store results using TPS |
315 | 355 |
j=nmodels+1 |
... | ... | |
360 | 400 |
|
361 | 401 |
#ns<-nrow(data_s) #This is added to because some loss of data might have happened because of the averaging... |
362 | 402 |
#nv<-nrow(data_v) |
363 |
browser() |
|
403 |
#browser()
|
|
364 | 404 |
|
365 | 405 |
for (j in 1:nmodels){ |
366 | 406 |
|
... | ... | |
426 | 466 |
#If mod "j" is not a model object |
427 | 467 |
if (inherits(mod,"gam")) { |
428 | 468 |
|
469 |
# model specific metrics |
|
429 | 470 |
results_m1[1,1]<- sampling_dat$date[i] #storing the interpolation dates in the first column |
430 | 471 |
results_m1[1,2]<- ns #number of stations used in the training stage |
431 | 472 |
results_m1[1,3]<- "AIC" |
... | ... | |
441 | 482 |
results_m3[1,3]<- "DEV" |
442 | 483 |
results_m3[1,j+3]<- mod$deviance |
443 | 484 |
|
444 |
y_var_fit= mod$fit |
|
445 |
R2_mod_f<- cor(data_s$dailyTmax,y_var_fit, use="complete")^2 |
|
446 |
#RMSE_mod_f<- sqrt(mean(res_mod_s^2,na.rm=TRUE)) |
|
447 |
|
|
448 |
results_RMSE_f[1,1]<- sampling_dat$date[i] #storing the interpolation dates in the first column |
|
449 |
results_RMSE_f[1,2]<- ns #number of stations used in the training stage |
|
450 |
results_RMSE_f[1,3]<- "RSME_f" |
|
451 |
#results_RMSE_f[j+3]<- sqrt(sum((y_var_fit-data_s$y_var)^2)/ns) |
|
452 |
results_RMSE_f[1,j+3]<-sqrt(mean(mod$residuals^2,na.rm=TRUE)) |
|
453 |
|
|
454 |
results_MAE_f[1,1]<- sampling_dat$date[i] #storing the interpolation dates in the first column |
|
455 |
results_MAE_f[1,2]<- ns #number of stations used in the training stage |
|
456 |
results_MAE_f[1,3]<- "MAE_f" |
|
457 |
#results_MAE_f[j+3]<-sum(abs(y_var_fit-data_s$y_var))/ns |
|
458 |
results_MAE_f[1,j+3]<-mean(abs(mod$residuals),na.rm=TRUE) |
|
459 |
|
|
460 |
results_R2_f[1,1]<- sampling_dat$date[i] #storing the interpolation dates in the first column |
|
461 |
results_R2_f[1,2]<- ns #number of stations used in the training stage |
|
462 |
results_R2_f[1,3]<- "R2_f" |
|
463 |
results_R2_f[1,j+3]<- R2_mod_f #Storing R2 for the model j |
|
464 |
|
|
465 | 485 |
##Model assessment: general diagnostic/metrics |
466 | 486 |
##validation: using the testing data |
467 | 487 |
if (predval==1) { |
... | ... | |
483 | 503 |
writeRaster(raster_pred, filename=raster_name,overwrite=TRUE) #Writing the data in a raster file format...(IDRISI) |
484 | 504 |
#writeRaster(r2, filename=raster_name,overwrite=TRUE) #Writing the data in a raster file format...(IDRISI) |
485 | 505 |
|
486 |
tmax_predicted_CAI<-raster_pred + clim_rast #Final surface as a raster layer... |
|
506 |
tmax_predicted_CAI<-raster_pred + clim_rast #Final surface as a raster layer...taht is if daily prediction with GAM
|
|
487 | 507 |
if (climgam==1){ |
488 | 508 |
tmax_predicted_CAI<-raster_pred + daily_deltaclim_rast #Final surface as a raster layer... |
489 | 509 |
} |
490 | 510 |
|
491 | 511 |
layerNames(tmax_predicted_CAI)<-"y_pred" |
492 | 512 |
data_name<-paste("predicted_mod",j,"_",sampling_dat$date[i],"_",sampling_dat$prop[i],"_",sampling_dat$run_samp[i],sep="") |
493 |
raster_name<-paste("GAMCAI_tmax_pred_",data_name,out_prefix,".rst", sep="") |
|
513 |
raster_name<-paste("GAMCAI_tmax_predicted_",data_name,out_prefix,".rst", sep="")
|
|
494 | 514 |
writeRaster(tmax_predicted_CAI, filename=raster_name,overwrite=TRUE) #Writing the data in a raster file format...(IDRISI) |
495 | 515 |
#writeRaster(r2, filename=raster_name,overwrite=TRUE) #Writing the data in a raster file format...(IDRISI) |
496 | 516 |
|
... | ... | |
513 | 533 |
} |
514 | 534 |
|
515 | 535 |
if (predval==0) { |
516 |
|
|
536 |
|
|
517 | 537 |
y_mod<- predict(mod, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values. |
518 | 538 |
|
519 | 539 |
pred_mod<-paste("pred_mod",j,sep="") |
... | ... | |
527 | 547 |
res_mod_v<- data_v$dailyTmax - data_v[[pred_mod]] #Residuals from kriging validation |
528 | 548 |
} |
529 | 549 |
|
530 |
####ADDED ON JULY 20th |
|
550 |
#y_var_fit= mod$fit #move it |
|
551 |
#Use res_mod_s so the R2 is based on daily station training |
|
552 |
R2_mod_f<- cor(data_s$dailyTmax,res_mod_s, use="complete")^2 |
|
553 |
RMSE_mod_f<- sqrt(mean(res_mod_s^2,na.rm=TRUE)) |
|
554 |
|
|
555 |
results_RMSE_f[1,1]<- sampling_dat$date[i] #storing the interpolation dates in the first column |
|
556 |
results_RMSE_f[1,2]<- ns #number of stations used in the training stage |
|
557 |
results_RMSE_f[1,3]<- "RSME_f" |
|
558 |
#results_RMSE_f[1,j+3]<-sqrt(mean(mod$residuals^2,na.rm=TRUE)) |
|
559 |
results_RMSE_f[1,j+3]<-sqrt(mean(res_mod_s^2,na.rm=TRUE)) |
|
560 |
|
|
561 |
results_MAE_f[1,1]<- sampling_dat$date[i] #storing the interpolation dates in the first column |
|
562 |
results_MAE_f[1,2]<- ns #number of stations used in the training stage |
|
563 |
results_MAE_f[1,3]<- "MAE_f" |
|
564 |
#results_MAE_f[j+3]<-sum(abs(y_var_fit-data_s$y_var))/ns |
|
565 |
results_MAE_f[1,j+3]<-mean(abs(res_mod_s),na.rm=TRUE) |
|
566 |
|
|
567 |
results_R2_f[1,1]<- sampling_dat$date[i] #storing the interpolation dates in the first column |
|
568 |
results_R2_f[1,2]<- ns #number of stations used in the training stage |
|
569 |
results_R2_f[1,3]<- "R2_f" |
|
570 |
results_R2_f[1,j+3]<- R2_mod_f #Storing R2 for the model j |
|
571 |
|
|
572 |
#### Now calculate validation metrics |
|
531 | 573 |
res_mod<-res_mod_v |
532 | 574 |
|
533 | 575 |
#RMSE_mod <- sqrt(sum(res_mod^2)/nv) #RMSE FOR REGRESSION STEP 1: GAM |
... | ... | |
604 | 646 |
mod_obj<-list(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8,mod9a,mod9b) |
605 | 647 |
names(mod_obj)<-c("mod1","mod2","mod3","mod4","mod5","mod6","mod7","mod8","mod9a","mod9b") #generate names automatically?? |
606 | 648 |
#results_list<-list(data_s,data_v,tb_metrics1,tb_metrics2) |
607 |
results_list<-list(data_s,data_v,tb_metrics1,tb_metrics2,mod_obj) |
|
608 |
names(results_list)<-c("data_s","data_v","tb_metrics1","tb_metrics2","mod_obj") |
|
649 |
#results_list<-list(data_s,data_v,tb_metrics1,tb_metrics2,mod_obj) |
|
650 |
results_list<-list(data_s,data_v,tb_metrics1,tb_metrics2,mod_obj,data_month) |
|
651 |
names(results_list)<-c("data_s","data_v","tb_metrics1","tb_metrics2","mod_obj","data_month") |
|
609 | 652 |
save(results_list,file= paste(path,"/","results_list_metrics_objects_",sampling_dat$date[i],"_",sampling_dat$prop[i],"_",sampling_dat$run_samp[i], |
610 | 653 |
out_prefix,".RData",sep="")) |
611 | 654 |
return(results_list) |
Also available in: Unified diff
GAM CAI function added climatology GAM models for OR tmax interpolation