Project

General

Profile

« Previous | Next » 

Revision 455b069b

Added by Benoit Parmentier over 11 years ago

analyses paper part 3: comb3, multisampling and distance to closest fitting station

View differences:

climate/research/oregon/interpolation/analyses_papers_methods_comparison_part3.R
1
######################################## Paper Methods_comparison #######################################
2
############################ Scripts for figures and analyses for the the IBS poster #####################################
3
#This script performs analyses and create figures for the FSS paper.
4
#It uses inputs from interpolation objects created at earlier stages...     
5
#Note that this is exploratory code i.e. not part of the worklfow.
6
#AUTHOR: Benoit Parmentier                                                                       #
7
#DATE: 08/02/2013                                                                                #
8
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#491--                                  #
9
###################################################################################################
10

  
11
###Loading R library and packages                                                      
12
#library(gtools)                                        # loading some useful tools 
13
library(mgcv)                   # GAM package by Wood 2006 (version 2012)
14
library(sp)                     # Spatial pacakge with class definition by Bivand et al. 2008
15
library(spdep)                  # Spatial package with methods and spatial stat. by Bivand et al. 2012
16
library(rgdal)                  # GDAL wrapper for R, spatial utilities (Keitt et al. 2012)
17
library(gstat)                  # Kriging and co-kriging by Pebesma et al. 2004
18
library(automap)                # Automated Kriging based on gstat module by Hiemstra et al. 2008
19
library(spgwr)
20
library(maptools)
21
library(graphics)
22
library(parallel)               # Urbanek S. and Ripley B., package for multi cores & parralel processing
23
library(raster)
24
library(rasterVis)
25
library(plotrix)                # Draw circle on graph and additional plotting options
26
library(reshape)                # Data format and type transformation
27

  
28
##################### Function used in the script ##############
29

  
30
## Extract a list of object from an object: Useful to extract information from
31
## RData objects saved in the interpolation phase.
32

  
33
extract_list_from_list_obj<-function(obj_list,list_name){
34
  #Create a list of an object from a given list of object using a name prodived as input
35
  
36
  list_tmp<-vector("list",length(obj_list))
37
  for (i in 1:length(obj_list)){
38
    tmp<-obj_list[[i]][[list_name]] #double bracket to return data.frame
39
    list_tmp[[i]]<-tmp
40
  }
41
  return(list_tmp) #this is  a data.frame
42
}
43

  
44
## Produce data.frame with residuals for models and distance to closest fitting station
45
calc_dist_ref_data_point <- function(i,list_param){
46
  #This function creates a list of data.frame containing the distance to teh closest
47
  # reference point (e.g. fitting station) for a give data frame. 
48
  #Inputs:
49
  #data_s: given data.frame from wich distance is computed
50
  #data_v: reference data.frame, destination, often the fitting points used in analyses
51
  #i: index variable to operate on list
52
  #names_var: 
53
  #Outputs:
54
  #list_dstspat_er
55
  
56
  #Parsing input arguments
57
  data_s<-list_param$data_s[[i]]
58
  data_v<-list_param$data_v[[i]]
59
  
60
  names_var<-list_param$names_var
61
  
62
  ######
63
  
64
  names_var<-intersect(names_var,names(data_v)) #there may be missing columns
65
  #use columns that exists
66
  
67
  d_s_v<-matrix(0,nrow(data_v),nrow(data_s))
68
  for(k in 1:nrow(data_s)){
69
    pt<-data_s[k,]
70
    d_pt<-(spDistsN1(data_v,pt,longlat=FALSE))/1000  #Distance to station k in km
71
    d_s_v[,k]<-d_pt
72
  }
73
  
74
  #Create data.frame with position, ID, dst and residuals...
75
  
76
  pos<-vector("numeric",nrow(data_v))
77
  y<-vector("numeric",nrow(data_v))
78
  dst<-vector("numeric",nrow(data_v))
79
  
80
  for (k in 1:nrow(data_v)){
81
    pos[k]<-match(min(d_s_v[k,]),d_s_v[k,])
82
    dst[k]<-min(d_s_v[k,]) 
83
  }
84
  
85
  dstspat_er<-as.data.frame(cbind(v_id=as.vector(data_v$id),s_id=as.vector(data_s$id[pos]),
86
                                  pos=pos, lat=data_v$lat, lon=data_v$lon, x=data_v$x,y=data_v$y,
87
                                  dst=dst,
88
                                  as.data.frame(data_v[,names_var])))
89
  
90
  return(dstspat_er)  
91
}  
92

  
93
# create plot of accury in term of distance to closest fitting station
94
plot_dst_spat_fun<-function(stat_tb,names_var,cat_val){
95
  
96
  range_y<-range(as.vector(unlist(stat_tb[,names_var])),na.rm=T) #flatten data.frame
97
  col_t<-rainbow(length(names_var))
98
  pch_t<- 1:length(names_var)
99
  plot(stat_tb[,names_var[1]], ylim=range_y,pch=pch_t[1],col=col_t[1],type="b",
100
       yla="MAE (in degree C)",xlab="",xaxt="n")
101
  #points((stat_tb[,names_var[k]], ylim=range_y,pch=pch_t[1],col=col_t[1]),type="p")
102
  for (k in 2:length(names_var)){
103
    lines(stat_tb[,names_var[k]], ylim=range_y,pch=pch_t[k],col=col_t[k],type="b",
104
          xlab="",axes=F)
105
    #points((stat_tb[,names_var[k]], ylim=range_y,pch=pch_t[k],col=col_t[k]),type="p")
106
  }
107
  legend("topleft",legend=names_var, 
108
         cex=1.2, pch=pch_t,col=col_t,lty=1,bty="n")
109
  axis(1,at=1:length(stat_tb[,1]),labels=stat_tb[,1])
110
}
111

  
112
################## PARAMETERS ##########
113

  
114
#in_dir<- "/home/parmentier/Data/IPLANT_project/Oregon_interpolation/Oregon_03142013/output_data_365d_GAM_fus_all_lst_05312013"
115
#in_dir<-"/home/parmentier/Data/IPLANT_project/Oregon_interpolation/Oregon_03142013/output_data_365d_gam_day_lst_comb3_07092013/"
116
in_dir<-"/home/parmentier/Data/IPLANT_project/Oregon_interpolation/Oregon_03142013/output_data_365d_gam_day_lst_comb4_07152013/"
117
in_dir2<-"/home/parmentier/Data/IPLANT_project/Oregon_interpolation/Oregon_03142013/output_data_365d_gam_day_mult_lst_comb3_07232013"
118

  
119
out_dir<-""
120
setwd(in_dir)
121

  
122
out_prefix<-"/home/parmentier/Data/IPLANT_project/Oregon_interpolation/Oregon_03142013/output_data_365d_GAM_fus_all_lst_05312013"
123
method_interpolation <- "gam_daily"
124
covar_obj_file <- "covar_obj__365d_gam_day_lst_comb3_07092013.RData"
125

  
126
raster_obj_file <- "raster_prediction_obj_gam_daily_dailyTmax_365d_gam_day_lst_comb3_07092013.RData"
127
raster_obj_file2 <- "raster_prediction_obj_gam_daily_dailyTmax_365d_gam_day_mults15_lst_comb3_07232013.RData"
128

  
129
raster_prediction_obj <-load_obj(raster_obj_file)
130
names(raster_prediction_obj) #list of two objects
131

  
132
raster_prediction_obj$summary_metrics_v
133

  
134
#################################################################################
135
############ ANALYSES 1: ACCURACY IN TERMS OF DISTANCE TO CLOSEST STATIONS... #######
136

  
137
names(raster_prediction_obj$validation_mod_obj[[1]])
138

  
139
#First extract list of training and testing spdf from interpolation object
140

  
141
list_data_s <- extract_list_from_list_obj(raster_prediction_obj$validation_mod_obj,"data_s")
142
list_data_v <- extract_list_from_list_obj(raster_prediction_obj$validation_mod_obj,"data_v")
143

  
144
#Assign model's names
145
names_mod <- paste("res_mod",1:9,sep="")
146
i<-1
147
names_mod<-c("res_mod1","res_mod2","res_mod3","res_mod4","res_mod5","res_mod6","res_mod7","res_mod8","res_mod9")
148

  
149
## Generate data frame with residuals in term of distance to closest stations
150

  
151
names_var<-c(names_mod,"dates")
152
list_param_dst<-list(i,list_data_s,list_data_v,names_mod)
153
names(list_param_dst) <- c("index","data_s","data_v","names_var")
154

  
155
#call function over 365 dates
156
list_dstspat_er <-lapply(1:length(list_data_v),FUN=calc_dist_ref_data_point,list_param=list_param_dst)
157
#now assemble in one data.frame
158
dstspat_dat<-do.call(rbind.fill,list_dstspat_er)
159

  
160
###
161

  
162
# Plot results: accuracy in term of distance to closest fitting stations...
163

  
164
plot(dstspat_dat$dst,abs(dstspat_dat$res_mod1))
165
limit_val<-seq(0,150, by=10)
166

  
167
limit_val<-seq(5,155, by=10)
168

  
169
dstspat_dat$dst_cat1 <- cut(dstspat_dat$dst,breaks=limit_val)
170
mae_fun<-function(x){mean(abs(x))}
171
sd_abs_fun<-function(x){sd(abs(x))}
172
t<-melt(dstspat_dat,
173
        measure=c("res_mod1","res_mod2","res_mod3","res_mod4","res_mod5","res_mod6","res_mod8"), 
174
        id=c("dst_cat1"),
175
        na.rm=T)
176
mae_tb<-cast(t,dst_cat1~variable,mae_fun)
177
sd_abs_tb<-cast(t,dst_cat1~variable,sd_abs_fun)
178

  
179
avg_tb<-cast(t,dst_cat1~variable,mean)
180
sd_tb<-cast(t,dst_cat1~variable,sd)
181
n_tb<-cast(t,dst_cat1~variable,length)
182
n_NA<-cast(t,dst_cat1~variable,is.na)
183

  
184
mod_name<-"mod1"
185
mod_name<-"mod4"
186
xlab_text<-"distance to fitting station"
187

  
188
n <- unlist(n_tb[1:12,c(mod_name)])
189
y <- unlist(mae_tb[1:12,c(mod_name)])
190

  
191
x<- 1:length(y)
192
y_sd <- unlist(sd_abs_tb[1:12,c(mod_name)])
193

  
194
ciw <-y_sd
195
#ciw2   <- qt(0.975, n) * y_sd2 / sqrt(n)
196

  
197
plotCI(y=y, x=x, uiw=ciw, col="red", main=paste(" MAE for ",mod_name,sep=""), barcol="blue", lwd=1,
198
       ylab="RMSE (C)", xlab=xlab_text)
199

  
200
ciw   <- qt(0.975, n) * y_sd / sqrt(n)
201

  
202
plotCI(y=y, x=x, uiw=ciw, col="red", main=paste(" CI MAE for for ",mod_name,sep=""), barcol="blue", lwd=1,
203
       ylab="RMSE (C)", xlab=xlab_text)
204

  
205
names_var<-intersect(names_var,names(dstspat_dat)) #there may be missing columns
206
range_y<-range(as.vector(dstspat_dat[,names_var]),na.rm=T)
207
dst_dat<- vector("list",length(names_var))
208

  
209
#####################################################
210
### PART II MULTISAMPLING COMPARISON ###############
211

  
212
#Use run of 7 hold out proportions, 10 to 70% with 10 random samples and 12 dates...
213
#Use gam_day method
214
#Use comb3 i.e. using baseline s(lat,lon)+s(elev)
215

  
216
#read in relevant data:
217
raster_prediction_obj <-load_obj(file.path(in_dir2,raster_obj_file2))
218
tb <-raster_prediction_obj$tb_diagnostic_v  #contains the accuracy metrics for each run...
219

  
220
names_mod<-c("mod1","mod2","mod3","mod4","mod5","mod6","mod7","mod8","mod9")
221

  
222
t<-melt(tb,
223
        measure=c("mae","rmse","r","m50"), 
224
        id=c("pred_mod","prop"),
225
        na.rm=T)
226
tbp<- subset(tb,prop!=70) #remove 70% hold out because it is only predicted for mod1 (baseline)
227

  
228
tp<-melt(tbp,
229
        measure=c("mae","rmse","r","m50"), 
230
        id=c("pred_mod","prop"),
231
        na.rm=T)
232

  
233
avg_tp<-cast(tp,pred_mod~variable,mean)
234

  
235
avg_tb<-cast(t,pred_mod+prop~variable,mean)
236
sd_tb<-cast(t,pred_mod+prop~variable,sd)
237

  
238
n_tb<-cast(t,pred_mod+prop~variable,length)
239

  
240
xyplot(avg_tb$rmse~avg_tb$prop,type="b",group=pred_mod,
241
       data=avg_tb,
242
       pch=1:length(avg_tb$pred_mod),
243
       par.settings=list(superpose.symbol = list(
244
         pch=1:length(avg_tb$pred_mod))),
245
       auto.key=list(columns=5))
246

  
247
mod_name<-"mod1"
248
mod_name<-"mod4"
249
xlab_text<-"proportion of hold out"
250

  
251
n <- unlist(subset(n_tb,pred_mod==mod_name,select=c(rmse)))
252
y <- unlist(subset(avg_tb,pred_mod==mod_name,select=c(rmse)))
253

  
254
x<- 1:length(y)
255
y_sd <- unlist(subset(sd_tb,pred_mod==mod_name,select=c(rmse)))
256

  
257
ciw <-y_sd
258
#ciw2   <- qt(0.975, n) * y_sd2 / sqrt(n)
259

  
260
plotCI(y=y, x=x, uiw=ciw, col="red", main=paste(" Mean and Std_dev RMSE for ",mod_name,sep=""), barcol="blue", lwd=1,
261
       ylab="RMSE (C)", xlab=xlab_text)
262

  
263
ciw   <- qt(0.975, n) * y_sd / sqrt(n)
264

  
265
plotCI(y=y, x=x, uiw=ciw, col="red", main=paste(" Mean and CI RMSE for ",mod_name,sep=""), barcol="blue", lwd=1,
266
       ylab="RMSE (C)", xlab=xlab_text)
267

  
268
n=150
269
ciw   <- qt(0.975, n) * y_sd / sqrt(n)
270
ciw2   <- qt(0.975, n) * y_sd2 / sqrt(n)
271

  
272
#Comparison of MAE for different proportions for FUSION and CAI using CI
273
X11()
274
plotCI(y=y, x=x, uiw=ciw, col="red", main=" FUS: RMSE proportion of validation hold out", barcol="blue", lwd=1,
275
       ylab="RMSE (C)", xlab="Proportions of validation hold out (in %)")
276
lines(x,y,col="red")
277
legend("bottomright",legend=c("fus"), cex=1.2, col=c("red"),
278
       lty=1, title="RMSE")
279
savePlot(paste("Comparison_multisampling_fus_RMSE_CI",out_prefix,".png", sep=""), type="png")
280

  
281
plotCI(y=y2, x=x2, uiw=ciw2, col="black", main=" CAI: RMSE proportion of validation hold out", barcol="blue", lwd=1,
282
       ylab="RMSE (C)", xlab="Proportions of validation hold out (in %)")
283
lines(x2,y2,col="grey")
284
legend("bottomright",legend=c("CAI"), cex=1.2, col=c("grey"),
285
       lty=1, title="RMSE")
286
savePlot(paste("Comparison_multisampling_CAI_RMSE_CI",out_prefix,".png", sep=""), type="png")
287
dev.off()
288

  
289
#Comparison of MAE for different proportions for FUSION and CAI
290
X11()
291
plot(x,y,col="red",type="b", ylab="RMSE (C)", xlab="Proportions of validation hold out (in %)")
292
lines(x2,y2,col="grey")
293
points(x2,y2,col="grey")
294
title("MAE in terms of proportions and random sampling")
295
legend("bottomright",legend=c("fus","CAI"), cex=1.2, col=c("red","grey"),
296
       lty=1, title="RMSE")
297
savePlot(paste("Comparison_multisampling_fus_CAI_RMSE_averages",out_prefix,".png", sep=""), type="png")
298
dev.off()

Also available in: Unified diff