
Revision fb039a6b

Added by Benoit Parmentier over 11 years ago

GAM fusion raster prediction tmax OR; added constant sampling, multisampling, GAM bias models


climate/research/oregon/interpolation/GAM_fusion_analysis_raster_prediction_multisampling.R
1 1
##################    MULTI SAMPLING GAM FUSION METHOD ASSESSMENT ####################################
2 2
############################ Merging LST and station data ##########################################
3
#This script interpolates tmax values using MODIS LST and GHCND station data                      #
4
#interpolation area. It requires the text file of stations and a shape file of the study area.    #       
5
#Note that the projection for both GHCND and study area is lonlat WGS84.                          #
6
#AUTHOR: Benoit Parmentier                                                                        #
7
#DATE: 08/15/2012                                                                                 #
8
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363--                                   #
3
#This script interpolates tmax values using MODIS LST and GHCND station data                      
4
#interpolation area. It requires the text file of stations and a shape file of the study area.           
5
#Note that the projection for both GHCND and study area is lonlat WGS84.       
6
#Options to run this program are:
7
#1) Multisampling: vary the proportions of hold-out data and use random samples for each run
8
#2) Constant sampling: use the same sample across all runs
9
#3) Over dates: run over, for example, 365 dates without multisampling
10
#4) Use seed number: set a seed if random samples must be repeatable
11
#5) GAM fusion: possibility of running GAM+FUSION or GAM separately
12
#AUTHOR: Benoit Parmentier                                                                        
13
#DATE: 10/27/2012                                                                                 
14
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363--                                   
9 15
###################################################################################################
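
These options combine into one interpolation run per date, per random repetition, and per hold-out proportion. A minimal sketch of that design, using hypothetical dates and expand.grid rather than the loop the script builds further down:

dates <- c("20100101", "20100102")                 # hypothetical dates
nb_sample <- 2                                     # repetitions per hold-out proportion
prop_min <- 0.1; prop_max <- 0.3; step <- 0.1      # hold-out proportions to explore
prop_range <- seq(from=prop_min, to=prop_max, by=step)*100
design <- expand.grid(date=dates, run=1:nb_sample, prop=prop_range)
nrow(design)                                       # length(dates)*nb_sample*length(prop_range) runs
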
10 16

  
11 17
###Loading R library and packages                                                      
......
19 25
library(raster)                              # Hijmans et al. package for raster processing
20 26
library(rasterVis)
21 27
library(parallel)                            # Urbanek S. and Ripley B., package for multi-core & parallel processing
22

  
28
library(reshape)
29
library(plotrix)
23 30
### Parameters and argument
24 31

  
25 32
infile1<- "ghcn_or_tmax_covariates_06262012_OR83M.shp"             #GHCN shapefile containing variables for modeling 2010                 
26
#tinfile2<-"list_10_dates_04212012.txt"                     #List of 10 dates for the regression
27
infile2<-"list_2_dates_04212012.txt"
28
#infile2<-"list_365_dates_04212012.txt"
33
infile2<-"list_365_dates_04212012.txt"
29 34
infile3<-"LST_dates_var_names.txt"                        #LST dates name
30 35
infile4<-"models_interpolation_05142012.txt"              #Interpolation model names
31 36
infile5<-"mean_day244_rescaled.rst"                       #Raster or grid for the locations of predictions
......
33 38
infile6<-"LST_files_monthly_climatology.txt"
34 39
inlistf<-"list_files_05032012.txt"                        #Stack of images containing the Covariates
35 40

  
36
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations"
37
path<-"/home/parmentier/Data/IPLANT_project/methods_interpolation_comparison"
38
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_GAM"
39
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_07152012"     #Jupiter LOCATION on Atlas for kriging"
40
#path<-"M:/Data/IPLANT_project/data_Oregon_stations"   #Locations on Atlas
41
path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_10242012_GAM"
41 42

  
42 43
setwd(path)
43 44
#Station location of the study area
......
48 49
nmodels<-8   #number of models running
49 50
y_var_name<-"dailyTmax"
50 51
predval<-1
51
prop<-0.3                                                                           #Proportion of data held out for testing/validation
52
prop<-0.3             #Proportion of data held out for testing/validation
52 53
#prop<-0.25
53
seed_number<- 100  #if seed is zero then no seed?                                                                 #Seed number for random sampling
54
out_prefix<-"_365d_GAM_fusion_multisamp2_0823012"                #User defined output prefix
54
seed_number<- 100  #Seed number for random sampling; if set to zero, no seed is used
55
out_prefix<-"_365d_GAM_fusion_const_all_lstd_10282012"                #User defined output prefix
55 56

  
56 57
bias_val<-0            #if value 1 then training data is used in the bias surface rather than all monthly stations
57
nb_sample<-15
58
prop_min<-0.1
59
prop_max<-0.7
60
step<-0.1
61

  
62
#source("fusion_function_07192012.R")
63
source("GAM_fusion_function_multisampling_08232012.R")
64
############ START OF THE SCRIPT ##################
65
#
66
#
67
### Step 0/Step 6 in Brian's code...preparing year 2010 data for modeling 
68
#
58
bias_prediction<-1     #if value 1 then use GAM for the BIAS prediction otherwise GAM direct prediction for y_var (daily tmax)
59
nb_sample<-1           #number of times random sampling must be repeated for every hold-out proportion
60
prop_min<-0.3          #if prop_min=prop_max and step=0 then predictions are done for the number of dates...
61
prop_max<-0.3
62
step<-0         
63
constant<-1             #if value 1 then use the same samples as date one for the whole set of dates
64
#projection used in the interpolation of the study area
65
CRS_interp<-"+proj=lcc +lat_1=43 +lat_2=45.5 +lat_0=41.75 +lon_0=-120.5 +x_0=400000 +y_0=0 +ellps=GRS80 +units=m +no_defs";
66
CRS_locs_WGS84<-CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +towgs84=0,0,0") #Station coords WGS84
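
As a toy check of these two reference systems (hypothetical coordinates; sp/rgdal assumed loaded, as elsewhere in the script), a single lon/lat point can be projected into the interpolation CRS:

pt_wgs84 <- SpatialPoints(cbind(-123.0, 44.0), proj4string=CRS_locs_WGS84)  # hypothetical lon/lat point in Oregon
pt_lcc <- spTransform(pt_wgs84, CRS(CRS_interp))   # reproject to the Lambert Conformal Conic system (metres)
coordinates(pt_lcc)                                # easting/northing in the interpolation CRS
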
67

  
68
source("GAM_fusion_function_multisampling_10272012.R")
69

  
70
###################### START OF THE SCRIPT ########################
69 71

  
70 72
###Reading the station data and setting up for models' comparison
71 73
filename<-sub(".shp","",infile1)             #Removing the extension from file.
......
115 117
LC1<-raster(s_raster,layer=pos)             #Select layer from stack
116 118
s_raster<-dropLayer(s_raster,pos)
117 119
LC1[is.na(LC1)]<-0
120

  
118 121
pos<-match("LC3",layerNames(s_raster)) #Find column with name "value"
119 122
LC3<-raster(s_raster,layer=pos)             #Select layer from stack
120 123
s_raster<-dropLayer(s_raster,pos)
121 124
LC3[is.na(LC3)]<-0
125

  
126
pos<-match("LC4",layerNames(s_raster)) #Find column with name "value"
127
LC4<-raster(s_raster,layer=pos)             #Select layer from stack
128
s_raster<-dropLayer(s_raster,pos)
129
LC4[is.na(LC4)]<-0
130

  
131
pos<-match("LC6",layerNames(s_raster)) #Find column with name "value"
132
LC6<-raster(s_raster,layer=pos)             #Select layer from stack
133
s_raster<-dropLayer(s_raster,pos)
134
LC6[is.na(LC6)]<-0
135

  
136
LC_s<-stack(LC1,LC3,LC4,LC6)
137
layerNames(LC_s)<-c("LC1_forest","LC3_grass","LC4_crop","LC6_urban")
138
plot(LC_s)
139

  
122 140
pos<-match("CANHEIGHT",layerNames(s_raster)) #Find column with name "value"
123 141
CANHEIGHT<-raster(s_raster,layer=pos)             #Select layer from stack
124 142
s_raster<-dropLayer(s_raster,pos)
......
135 153
values(lon)<-xy_latlon[,1]
136 154
values(lat)<-xy_latlon[,2]
137 155

  
138
r<-stack(N,E,Nw,Ew,lon,lat,LC1,LC3,CANHEIGHT)
139
rnames<-c("Northness","Eastness","Northness_w","Eastness_w", "lon","lat","LC1","LC3","CANHEIGHT")
156
r<-stack(N,E,Nw,Ew,lon,lat,LC1,LC3,LC4,LC6, CANHEIGHT)
157
rnames<-c("Northness","Eastness","Northness_w","Eastness_w", "lon","lat","LC1","LC3","LC4","LC6","CANHEIGHT")
140 158
layerNames(r)<-rnames
141 159
s_raster<-addLayer(s_raster, r)
142 160

  
......
171 189
results_RMSE_f<- matrix(1,1,nmodels+4)    #RMSE fit, RMSE for the training dataset
172 190
results_MAE_f <- matrix(1,1,nmodels+4)
173 191

  
174
######## Preparing monthly averages from the PostgreSQL database
192
######## Preparing monthly averages from the PostgreSQL database and extracting covariates from stack
175 193

  
176 194
# do this work outside of (before) this function
177 195
# to avoid making a copy of the data frame inside the function call
......
191 209
dst$TMax<-dst$TMax/10                #TMax is the average max temp for monthly data
192 210
#dstjan=dst[dst$month==9,]  #dst contains the monthly averages for tmax for every station over 2000-2010
193 211

  
212
#Extracting covariates from stack
213
coords<- dst[c('lon','lat')]              #Define coordinates in a data frame
214
coordinates(dst)<-coords                      #Assign coordinates to the data frame
215
proj4string(dst)<-CRS_locs_WGS84                  #Assign coordinate reference system in PROJ4 format
216
dst_month<-spTransform(dst,CRS(CRS_interp))     #Project from WGS84 to new coord. system
217

  
218
stations_val<-extract(s_raster,dst_month)  #extraction of the information at station locations
219
stations_val<-as.data.frame(stations_val)
220
dst_extract<-cbind(dst_month,stations_val)
221
dst<-dst_extract
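
The extract() call above returns one row of covariate values per station. The same pattern on a self-contained toy raster (synthetic values, hypothetical points):

r_toy <- raster(nrows=10, ncols=10, xmn=0, xmx=10, ymn=0, ymx=10)
values(r_toy) <- 1:ncell(r_toy)                    # synthetic covariate values
pts_toy <- SpatialPoints(cbind(c(2.5, 7.5), c(2.5, 7.5)))
extract(r_toy, pts_toy)                            # one value per point; with a stack, one column per layer
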
222

  
194 223
######### Preparing daily values for training and testing
195 224

  
196 225
#Screening for bad values: value is tmax in this case
......
203 232

  
204 233
##Sampling: training and testing sites.
205 234

  
206
#set.seed(seed_number)                        #Using a seed number allows results based on random numbers to be compared...
235
if (seed_number>0) {
236
  set.seed(seed_number)                        #Using a seed number allows results based on random numbers to be compared...
237
}
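
Setting the seed makes the random hold-out draws repeatable from run to run; a quick standalone check:

set.seed(100); draw1 <- sample(1:200, 20)
set.seed(100); draw2 <- sample(1:200, 20)
identical(draw1, draw2)                            # TRUE: the same training indices are drawn each time
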
207 238
nel<-length(dates)
208 239
dates_list<-vector("list",nel) #list of one row data.frame
209 240

  
210
prop_range<-(seq(from=prop_min,to=prop_max,by=step))*100
211
sn<-length(dates)*nb_sample*length(prop_range)
241
prop_range<-(seq(from=prop_min,to=prop_max,by=step))*100     #range of proportion to run
242
sn<-length(dates)*nb_sample*length(prop_range)               #Number of samples to run
212 243

  
213 244
for(i in 1:length(dates)){
214 245
  d_tmp<-rep(dates[i],nb_sample*length(prop_range)) #repeating same date
......
228 259
sampling_dat$date<- as.character(sampling_dat[,1])
229 260
#ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
230 261
ghcn.subsets <-lapply(as.character(sampling_dat$date), function(d) subset(ghcn, date==d)) #this creates one subset of ghcn per sampling run (row of sampling_dat)
231
  
232
sampling<-vector("list",length(ghcn.subsets))
233 262

  
263
## adding choice of constant sample 
264
if (seed_number>0) {
265
  set.seed(seed_number)                        #Using a seed number allows results based on random numbers to be compared...
266
}
267

  
268
sampling<-vector("list",length(ghcn.subsets))
269
sampling_station_id<-vector("list",length(ghcn.subsets))
234 270
for(i in 1:length(ghcn.subsets)){
235 271
  n<-nrow(ghcn.subsets[[i]])
236 272
  prop<-(sampling_dat$prop[i])/100
......
238 274
  nv<-n-ns              #create a sample for validation with prop of the rows
239 275
  ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
240 276
  ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
277
  #Find the corresponding station IDs for the training sample
278
  data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
279
  station_id.training<-data_sampled$station     #selected id for the randomly sampled stations (115)
280
  #Save the information
241 281
  sampling[[i]]<-ind.training
282
  sampling_station_id[[i]]<- station_id.training
283
}
284
## Use same samples across the year...
285
if (constant==1){
286
  sampled<-sampling[[1]]
287
  data_sampled<-ghcn.subsets[[1]][sampled,] #selected the randomly sampled stations
288
  station_sampled<-data_sampled$station     #selected id for the randomly sampled stations (115)
289
  list_const_sampling<-vector("list",sn)
290
  list_const_sampling_station_id<-vector("list",sn)
291
  for(i in 1:sn){
292
    station_id.training<-intersect(station_sampled,ghcn.subsets[[i]]$station)
293
    ind.training<-match(station_id.training,ghcn.subsets[[i]]$station)
294
    list_const_sampling[[i]]<-ind.training
295
    list_const_sampling_station_id[[i]]<-station_id.training
296
  }
297
  sampling<-list_const_sampling 
298
  sampling_station_id<-list_const_sampling_station_id
242 299
}
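
A toy check of the intersect()/match() logic behind the constant-sampling option, with hypothetical station IDs:

day1_stations <- c("USC0001", "USC0002", "USC0005")             # training stations drawn for date one
other_day     <- c("USC0002", "USC0003", "USC0005", "USC0007")  # stations reporting on another date
keep <- intersect(day1_stations, other_day)                     # training stations also present that day
match(keep, other_day)                                          # their row positions in that date's subset: 1, 3
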
243 300

  
301
# sampling<-vector("list",length(ghcn.subsets))
302
# 
303
# for(i in 1:length(ghcn.subsets)){
304
#   n<-nrow(ghcn.subsets[[i]])
305
#   prop<-(sampling_dat$prop[i])/100
306
#   ns<-n-round(n*prop)   #Create a sample from the data frame with 70% of the rows
307
#   nv<-n-ns              #create a sample for validation with prop of the rows
308
#   ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
309
#   ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
310
#   sampling[[i]]<-ind.training
311
# }
312
# 
313
# if (constant==1){
314
#   sampled<-sampling[[1]]
315
#   list_const_sampling<-vector("list",sn)
316
#   for(i in 1:sn){
317
#     list_const_sampling[[i]]<-sampled
318
#   }
319
#   sampling<-list_const_sampling  
320
# }
321

  
244 322
######## Prediction for the range of dates and sampling data
245 323

  
246 324
#gam_fus_mod<-mclapply(1:length(dates), runGAMFusion,mc.preschedule=FALSE,mc.cores = 8) #This is the end bracket from mclapply(...) statement
247
#gam_fus_mod<-mclapply(1:1, runGAMFusion,mc.preschedule=FALSE,mc.cores = 8) #This is the end bracket from mclapply(...) statement
248
gam_fus_mod_s<-mclapply(1:length(ghcn.subsets), runGAMFusion,mc.preschedule=FALSE,mc.cores = 2) #Run runGAMFusion in parallel over every sampling subset
249
#gam_fus_mod2<-mclapply(11:11, runGAMFusion,mc.preschedule=FALSE,mc.cores = 1) #This is the end bracket from mclapply(...) statement
325
#gam_fus_mod_s<-mclapply(1:1, runGAMFusion,mc.preschedule=FALSE,mc.cores = 1) #This is the end bracket from mclapply(...) statement
326
gam_fus_mod_s<-mclapply(1:length(ghcn.subsets), runGAMFusion,mc.preschedule=FALSE,mc.cores = 8) #Run runGAMFusion in parallel over every sampling subset
327
#gam_fus_mod2<-mclapply(4:4, runGAMFusion,mc.preschedule=FALSE,mc.cores = 1) #This is the end bracket from mclapply(...) statement
250 328

  
329
save(gam_fus_mod_s,file= paste(path,"/","results2_fusion_Assessment_measure_all",out_prefix,".RData",sep=""))
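
The mclapply() call runs runGAMFusion once per row of the sampling design, forking mc.cores worker processes (fork-based parallelism, so it falls back to serial execution on Windows). A tiny standalone illustration of the same pattern:

slow_task <- function(i){ Sys.sleep(0.1); i^2 }                 # stand-in for one interpolation run
res <- mclapply(1:8, slow_task, mc.preschedule=FALSE, mc.cores=2)
unlist(res)                                                     # 1 4 9 16 25 36 49 64
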
251 330

  
252 331
## Plotting and saving diagnostic measures
253
accuracy_tab_fun<-function(i,f_list){
254
tb<-f_list[[i]][[3]]
255
return(tb)
256
}
257

  
258 332

  
259 333
tb<-gam_fus_mod_s[[1]][[3]][0,]  #empty data frame with metric table structure that can be used in rbinding...
260 334
tb_tmp<-gam_fus_mod_s #copy
......
269 343
  tb[,i]<-as.numeric(as.character(tb[,i]))  
270 344
}
271 345

  
272
metrics<-as.character(unique(tb$metric))
346
metrics<-as.character(unique(tb$metric))            #Names of accuracy metrics (RMSE, MAE, etc.)
273 347
tb_metric_list<-vector("list",length(metrics))
274 348

  
275
for(i in 1:length(metrics)){            # start of the for loop #1  
349
for(i in 1:length(metrics)){            # Reorganizing information in terms of metrics 
276 350
  metric_name<-paste("tb_",metrics[i],sep="")
277 351
  tb_metric<-subset(tb, metric==metrics[i])
278 352
  tb_metric<-cbind(tb_metric,sampling_dat[,2:3])
......
280 354
  tb_metric_list[[i]]<-tb_metric
281 355
}
282 356

  
283
#tb_diagnostic1<-rbind(tb_RMSE,tb_MAE,tb_ME,tb_R2)
284 357
tb_diagnostic<-do.call(rbind,tb_metric_list)
358
#tb_diagnostic[["prop"]]<-as.factor(tb_diagnostic[["prop"]])
359
sampling_obj<-list(sampling_dat=sampling_dat,training=sampling, training_id=sampling_station_id, tb=tb_diagnostic)
285 360

  
286
avg_list<-vector("list",nmodels+1)
287

  
288
for (i in 1:(nmodels+1)){
289
  formag<-paste("mod",i,sep="")
290
  form<-as.formula(paste(formag,"~prop+metric"))
291
  avg_all1<-aggregate(form, data=tb_diagnostic, mean) 
292
  file<-paste("agg_metrics_",formag,out_prefix,".txt")
293
  write.table(avg_all1,file=file,sep=",")
294
  avg_list[[i]]<-avg_all1
295
}
296

  
297
test<-aggregate(mod9 ~ prop + metric + dates, data=tb_diagnostic, mean)
298
data_plot<-as.matrix(subset(avg_list[[9]],metric=="RMSE" & dates=="20100102"))
299

  
300
#x<- matrix(1,1,nmodels+3)  
301
y<- matrix(1,7,2)  
302

  
303
y[,1]<-as.numeric(data_plot[,4])
304
y[,2]<-as.numeric(data_plot[,5])
305

  
306
x<-cbind(unique(test$prop),unique(test$prop))
307
plot(x,y,col=c("red","blue"))
308
lines(x,y,col=c("red","blue"))
309
plot(data_plot[,4:5]~prop_t)
310

  
311
plot(x,y)
312
plot(prop,mod1,data=subset(test,metric=="RMSE" & dates=="20100101"))
313

  
361
#write.table(avg_tb, file= paste(path,"/","results2_fusion_Assessment_measure_avg_",out_prefix,".txt",sep=""), sep=",")
362
#write.table(median_tb, file= paste(path,"/","results2_fusion_Assessment_measure_median_",out_prefix,".txt",sep=""), sep=",")
314 363
write.table(tb_diagnostic, file= paste(path,"/","results2_fusion_Assessment_measure",out_prefix,".txt",sep=""), sep=",")
315 364
write.table(tb, file= paste(path,"/","results2_fusion_Assessment_measure_all",out_prefix,".txt",sep=""), sep=",")
365
save(sampling_obj, file= paste(path,"/","results2_fusion_sampling_obj",out_prefix,".RData",sep=""))
316 366
save(gam_fus_mod_s,file= paste(path,"/","results2_fusion_Assessment_measure_all",out_prefix,".RData",sep=""))
317
#tb<-as.data.frame(tb_diagnostic1)
318

  
319
#write.table(tb_1, file= paste(path,"/","results2_fusion_Assessment_measure1",out_prefix,".txt",sep=""), sep=",")
320 367

  
321
#write.table(tb_diagnostic2, file= paste(path,"/","results_fusion_Assessment_measure2",out_prefix,".txt",sep=""), sep=",")
368
## Summary of number of files and results
369
#l_f<-list.files(pattern="GAM_bias_tmax_predicted_mod1_20100103_30_1_365d_GAM_fusion_lstd_10062012.rst$")
370
#r6<-raster(l_f)
371
#plot(r6)
372
#results_list_metrics_objects_*_20101231_30_1_365d_GAM_fusion_lstd_10062012.RData
373
#l_f<-list.files(pattern=paste(".*","tmax_predicted_mod1",".*",out_prefix,".rst",sep=""))
322 374

  
323 375
#### END OF SCRIPT
