/climate/research/oregon/interpolation/GAM_fusion_analysis_raster_prediction_multisampling.R - Annotate - Environment and organisms - NCEAS Projects

root/climate/research/oregon/interpolation/GAM_fusion_analysis_raster_prediction_multisampling.R @ 48e49857

1	e7bf2d1b	Benoit Parmentier	################## MULTI SAMPLING GAM FUSION METHOD ASSESSMENT ####################################
2			############################ Merging LST and station data ##########################################
3	fb039a6b	Benoit Parmentier	#This script interpolates tmax values using MODIS LST and GHCND station data
4			#interpolation area. It requires the text file of stations and a shape file of the study area.
5			#Note that the projection for both GHCND and study area is lonlat WGS84.
6			#Options to run this program are:
7			#1) Multisampling: vary the porportions of hold out and use random samples for each run
8			#2)Constant sampling: use the same sample over the runs
9			#3)over dates: run over for example 365 dates without mulitsampling
10			#4)use seed number: use seed if random samples must be repeatable
11			#5)GAM fusion: possibilty of running GAM+FUSION or GAM separately
12			#AUTHOR: Benoit Parmentier
13	760957d7	Benoit Parmentier	#DATE: 02/08/2013
14	fb039a6b	Benoit Parmentier	#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363--
15	e7bf2d1b	Benoit Parmentier	###################################################################################################
16
17			###Loading R library and packages
18			library(gtools) # loading some useful tools
19			library(mgcv) # GAM package by Simon Wood
20			library(sp) # Spatial pacakge with class definition by Bivand et al.
21			library(spdep) # Spatial pacakge with methods and spatial stat. by Bivand et al.
22			library(rgdal) # GDAL wrapper for R, spatial utilities
23			library(gstat) # Kriging and co-kriging by Pebesma et al.
24	0163d0e2	Benoit Parmentier	library(fields) # NCAR Spatial Interpolation methods such as kriging, splines
25	e7bf2d1b	Benoit Parmentier	library(raster) # Hijmans et al. package for raster processing
26			library(rasterVis)
27			library(parallel) # Urbanek S. and Ripley B., package for multi cores & parralel processing
28	fb039a6b	Benoit Parmentier	library(reshape)
29			library(plotrix)
30	e7bf2d1b	Benoit Parmentier	### Parameters and argument
31
32	fb039a6b	Benoit Parmentier	infile2<-"list_365_dates_04212012.txt"
33	760957d7	Benoit Parmentier	infile_monthly<-"monthly_covariates_ghcn_data_TMAXy2010_2010_VE_02082013.shp"
34			infile_daily<-"daily_covariates_ghcn_data_TMAXy2010_2010_VE_02082013.shp"
35			infile_locs<-"stations_venezuela_region_y2010_2010_VE_02082013.shp"
36	96c5053f	Benoit Parmentier	infile3<-"covariates__venezuela_region__VE_01292013.tif" #this is an output from covariate script
37	e7bf2d1b	Benoit Parmentier
38	96c5053f	Benoit Parmentier	in_path<-"/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data"
39			out_path<-"/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/output_data"
40	760957d7	Benoit Parmentier	script_path<-"/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/"
41	96c5053f	Benoit Parmentier	setwd(in_path)
42	e7bf2d1b	Benoit Parmentier
43	0766370d	Benoit Parmentier	nmodels<-9 #number of models running
44	e7bf2d1b	Benoit Parmentier	y_var_name<-"dailyTmax"
45			predval<-1
46	fb039a6b	Benoit Parmentier	seed_number<- 100 #if seed zero then no seed? #Seed number for random sampling
47	760957d7	Benoit Parmentier	out_prefix<-"_10d_GAM_fus5_all_lstd_02082013" #User defined output prefix
48	e7bf2d1b	Benoit Parmentier
49			bias_val<-0 #if value 1 then training data is used in the bias surface rather than the all monthly stations
50	fb039a6b	Benoit Parmentier	bias_prediction<-1 #if value 1 then use GAM for the BIAS prediction otherwise GAM direct repdiction for y_var (daily tmax)
51			nb_sample<-1 #number of time random sampling must be repeated for every hold out proportion
52			prop_min<-0.3 #if prop_min=prop_max and step=0 then predicitons are done for the number of dates...
53			prop_max<-0.3
54			step<-0
55	0766370d	Benoit Parmentier	constant<-0 #if value 1 then use the same samples as date one for the all set of dates
56	96c5053f	Benoit Parmentier	#projection used in the interpolation of the study area: should be read directly from the outline of the study area
57	760957d7	Benoit Parmentier	#CRS_interp<-"+proj=lcc +lat_1=43 +lat_2=45.5 +lat_0=41.75 +lon_0=-120.5 +x_0=400000 +y_0=0 +ellps=GRS80 +units=m +no_defs";
58	fb039a6b	Benoit Parmentier	CRS_locs_WGS84<-CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +towgs84=0,0,0") #Station coords WGS84
59
60	760957d7	Benoit Parmentier	source(file.path(script_path,"GAM_fusion_function_multisampling_02082013.R"))
61	fb039a6b	Benoit Parmentier
62			###################### START OF THE SCRIPT ########################
63	e7bf2d1b	Benoit Parmentier
64	96c5053f	Benoit Parmentier	###Reading the daily station data and setting up for models' comparison
65			ghcn<-readOGR(dsn=in_path,layer=sub(".shp","",infile_daily))
66			CRS_interp<-proj4string(ghcn) #Storing projection information (ellipsoid, datum,etc.)
67	e7bf2d1b	Benoit Parmentier
68	96c5053f	Benoit Parmentier	stat_loc<-readOGR(dsn=in_path,layer=sub(".shp","",infile_locs))
69	e7bf2d1b	Benoit Parmentier
70	96c5053f	Benoit Parmentier	data3<-readOGR(dsn=in_path,layer=sub(".shp","",infile_monthly))
71	e7bf2d1b	Benoit Parmentier
72	760957d7	Benoit Parmentier	#Remove NA for LC and CANHEIGHT: Need to check this part after
73	e7bf2d1b	Benoit Parmentier	ghcn$LC1[is.na(ghcn$LC1)]<-0
74			ghcn$LC3[is.na(ghcn$LC3)]<-0
75			ghcn$CANHEIGHT[is.na(ghcn$CANHEIGHT)]<-0
76	0766370d	Benoit Parmentier	ghcn$LC4[is.na(ghcn$LC4)]<-0
77			ghcn$LC6[is.na(ghcn$LC6)]<-0
78	e7bf2d1b	Benoit Parmentier
79	760957d7	Benoit Parmentier	dates <-readLines(file.path(in_path,infile2)) #dates to be predicted
80	e7bf2d1b	Benoit Parmentier
81			##Extracting the variables values from the raster files
82
83	96c5053f	Benoit Parmentier	#The names of covariates can be changed...
84	760957d7	Benoit Parmentier	rnames <-c("x","y","lon","lat","N","E","N_w","E_w","elev","slope","aspect","CANHEIGHT","DISTOC")
85	96c5053f	Benoit Parmentier	lc_names<-c("LC1","LC2","LC3","LC4","LC5","LC6","LC7","LC8","LC9","LC10","LC11","LC12")
86			lst_names<-c("mm_01","mm_02","mm_03","mm_04","mm_05","mm_06","mm_07","mm_08","mm_09","mm_10","mm_11","mm_12",
87			"nobs_01","nobs_02","nobs_03","nobs_04","nobs_05","nobs_06","nobs_07","nobs_08",
88			"nobs_09","nobs_10","nobs_11","nobs_12")
89
90			covar_names<-c(rnames,lc_names,lst_names)
91
92			s_raster<-stack(infile3) #read in the data stack
93			names(s_raster)<-covar_names #Assigning names to the raster layers: making sure it is included in the extraction
94
95			#Deal with no data value and zero
96	760957d7	Benoit Parmentier	#pos<-match("LC1",layerNames(s_raster)) #Find column with name "value"
97			#LC1<-raster(s_raster,layer=pos) #Select layer from stack
98			#s_raster<-dropLayer(s_raster,pos)
99			#LC1[is.na(LC1)]<-0
100
101			#pos<-match("LC3",layerNames(s_raster)) #Find column with name "value"
102			#LC3<-raster(s_raster,layer=pos) #Select layer from stack
103			#s_raster<-dropLayer(s_raster,pos)
104			#LC3[is.na(LC3)]<-0
105
106			#pos<-match("CANHEIGHT",layerNames(s_raster)) #Find column with name "value"
107			#CANHEIGHT<-raster(s_raster,layer=pos) #Select layer from stack
108			#s_raster<-dropLayer(s_raster,pos)
109			#CANHEIGHT[is.na(CANHEIGHT)]<-0
110			#pos<-match("ELEV_SRTM",layerNames(s_raster)) #Find column with name "ELEV_SRTM"
111			#ELEV_SRTM<-raster(s_raster,layer=pos) #Select layer from stack on 10/30
112			#s_raster<-dropLayer(s_raster,pos)
113			#ELEV_SRTM[ELEV_SRTM <0]<-NA
114	e7bf2d1b	Benoit Parmentier
115			#s_sgdf<-as(s_raster,"SpatialGridDataFrame") #Conversion to spatial grid data frame
116
117			###### Preparing tables for model assessment: specific diagnostic/metrics
118
119			#Model assessment: specific diagnostics/metrics
120			results_AIC<- matrix(1,1,nmodels+3)
121			results_GCV<- matrix(1,1,nmodels+3)
122			results_DEV<- matrix(1,1,nmodels+3)
123			#results_RMSE_f<- matrix(1,length(models)+3)
124
125			#Model assessment: general diagnostic/metrics
126			results_RMSE <- matrix(1,1,nmodels+4)
127			results_MAE <- matrix(1,1,nmodels+4)
128			results_ME <- matrix(1,1,nmodels+4) #There are 8+1 models
129			results_R2 <- matrix(1,1,nmodels+4) #Coef. of determination for the validation dataset
130
131			results_RMSE_f<- matrix(1,1,nmodels+4) #RMSE fit, RMSE for the training dataset
132			results_MAE_f <- matrix(1,1,nmodels+4)
133
134	760957d7	Benoit Parmentier	######### Preparing daily and monthly values for training and testing
135	96c5053f	Benoit Parmentier
136	760957d7	Benoit Parmentier	#Screening for daily bad values: value is tmax in this case
137	e7bf2d1b	Benoit Parmentier	#ghcn$value<-as.numeric(ghcn$value)
138	96c5053f	Benoit Parmentier	#ghcn_all<-ghcn
139			#ghcn_test<-subset(ghcn,ghcn$value>-150 & ghcn$value<400)
140			#ghcn_test<-ghcn
141			#ghcn_test2<-subset(ghcn_test,ghcn_test$elev_1>0)
142			#ghcn<-ghcn_test2
143	e7bf2d1b	Benoit Parmentier	#coords<- ghcn[,c('x_OR83M','y_OR83M')]
144
145	760957d7	Benoit Parmentier	#Now clean and screen monthly values
146			#dst_all<-dst
147			dst_all<-data3
148			dst<-data3
149			#dst<-subset(dst,dst$TMax>-15 & dst$TMax<45) #may choose different threshold??
150			#dst<-subset(dst,dst$ELEV_SRTM>0) #This will drop two stations...or 24 rows
151
152	e7bf2d1b	Benoit Parmentier	##Sampling: training and testing sites.
153
154	96c5053f	Benoit Parmentier	#Make this a a function
155
156	fb039a6b	Benoit Parmentier	if (seed_number>0) {
157			set.seed(seed_number) #Using a seed number allow results based on random number to be compared...
158			}
159	e7bf2d1b	Benoit Parmentier	nel<-length(dates)
160			dates_list<-vector("list",nel) #list of one row data.frame
161
162	fb039a6b	Benoit Parmentier	prop_range<-(seq(from=prop_min,to=prop_max,by=step))*100 #range of proportion to run
163			sn<-length(dates)nb_samplelength(prop_range) #Number of samples to run
164	e7bf2d1b	Benoit Parmentier
165			for(i in 1:length(dates)){
166			d_tmp<-rep(dates[i],nb_sample*length(prop_range)) #repeating same date
167			s_nb<-rep(1:nb_sample,length(prop_range)) #number of random sample per proportion
168			prop_tmp<-sort(rep(prop_range, nb_sample))
169			tab_run_tmp<-cbind(d_tmp,s_nb,prop_tmp)
170			dates_list[[i]]<-tab_run_tmp
171			}
172
173			sampling_dat<-as.data.frame(do.call(rbind,dates_list))
174			names(sampling_dat)<-c("date","run_samp","prop")
175
176			for(i in 2:3){ # start of the for loop #1
177			sampling_dat[,i]<-as.numeric(as.character(sampling_dat[,i]))
178			}
179
180			sampling_dat$date<- as.character(sampling_dat[,1])
181			#ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
182			ghcn.subsets <-lapply(as.character(sampling_dat$date), function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
183
184	fb039a6b	Benoit Parmentier	## adding choice of constant sample
185			if (seed_number>0) {
186			set.seed(seed_number) #Using a seed number allow results based on random number to be compared...
187			}
188
189			sampling<-vector("list",length(ghcn.subsets))
190			sampling_station_id<-vector("list",length(ghcn.subsets))
191	e7bf2d1b	Benoit Parmentier	for(i in 1:length(ghcn.subsets)){
192			n<-nrow(ghcn.subsets[[i]])
193			prop<-(sampling_dat$prop[i])/100
194			ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows
195			nv<-n-ns #create a sample for validation with prop of the rows
196			ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
197			ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
198	fb039a6b	Benoit Parmentier	#Find the corresponding
199			data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
200			station_id.training<-data_sampled$station #selected id for the randomly sampled stations (115)
201			#Save the information
202	e7bf2d1b	Benoit Parmentier	sampling[[i]]<-ind.training
203	fb039a6b	Benoit Parmentier	sampling_station_id[[i]]<- station_id.training
204			}
205			## Use same samples across the year...
206			if (constant==1){
207			sampled<-sampling[[1]]
208			data_sampled<-ghcn.subsets[[1]][sampled,] #selected the randomly sampled stations
209			station_sampled<-data_sampled$station #selected id for the randomly sampled stations (115)
210			list_const_sampling<-vector("list",sn)
211			list_const_sampling_station_id<-vector("list",sn)
212			for(i in 1:sn){
213			station_id.training<-intersect(station_sampled,ghcn.subsets[[i]]$station)
214			ind.training<-match(station_id.training,ghcn.subsets[[i]]$station)
215			list_const_sampling[[i]]<-ind.training
216			list_const_sampling_station_id[[i]]<-station_id.training
217			}
218			sampling<-list_const_sampling
219			sampling_station_id<-list_const_sampling_station_id
220	e7bf2d1b	Benoit Parmentier	}
221
222			######## Prediction for the range of dates and sampling data
223
224			#gam_fus_mod<-mclapply(1:length(dates), runGAMFusion,mc.preschedule=FALSE,mc.cores = 8) #This is the end bracket from mclapply(...) statement
225	fb039a6b	Benoit Parmentier	#gam_fus_mod_s<-mclapply(1:1, runGAMFusion,mc.preschedule=FALSE,mc.cores = 1) #This is the end bracket from mclapply(...) statement
226	0766370d	Benoit Parmentier	gam_fus_mod_s<-mclapply(1:length(ghcn.subsets), runGAMFusion,mc.preschedule=FALSE,mc.cores = 9) #This is the end bracket from mclapply(...) statement
227	fb039a6b	Benoit Parmentier	#gam_fus_mod2<-mclapply(4:4, runGAMFusion,mc.preschedule=FALSE,mc.cores = 1) #This is the end bracket from mclapply(...) statement
228	e7bf2d1b	Benoit Parmentier
229	fb039a6b	Benoit Parmentier	save(gam_fus_mod_s,file= paste(path,"/","results2_fusion_Assessment_measure_all",out_prefix,".RData",sep=""))
230	e7bf2d1b	Benoit Parmentier
231			## Plotting and saving diagnostic measures
232
233			tb<-gam_fus_mod_s[[1]][[3]][0,] #empty data frame with metric table structure that can be used in rbinding...
234			tb_tmp<-gam_fus_mod_s #copy
235
236			for (i in 1:length(tb_tmp)){
237			tmp<-tb_tmp[[i]][[3]]
238			tb<-rbind(tb,tmp)
239			}
240			rm(tb_tmp)
241
242			for(i in 4:ncol(tb)){ # start of the for loop #1
243			tb[,i]<-as.numeric(as.character(tb[,i]))
244			}
245
246	fb039a6b	Benoit Parmentier	metrics<-as.character(unique(tb$metric)) #Name of accuracy metrics (RMSE,MAE etc.)
247	e7bf2d1b	Benoit Parmentier	tb_metric_list<-vector("list",length(metrics))
248
249	fb039a6b	Benoit Parmentier	for(i in 1:length(metrics)){ # Reorganizing information in terms of metrics
250	e7bf2d1b	Benoit Parmentier	metric_name<-paste("tb_",metrics[i],sep="")
251			tb_metric<-subset(tb, metric==metrics[i])
252			tb_metric<-cbind(tb_metric,sampling_dat[,2:3])
253			assign(metric_name,tb_metric)
254			tb_metric_list[[i]]<-tb_metric
255			}
256
257			tb_diagnostic<-do.call(rbind,tb_metric_list)
258	0766370d	Benoit Parmentier	tb_diagnostic[["prop"]]<-as.factor(tb_diagnostic[["prop"]])
259
260			mod_pat<-glob2rx("mod*")
261			mod_var<-grep(mod_pat,names(tb_diagnostic),value=TRUE) # using grep with "value" extracts the matching names
262
263			t<-melt(tb_diagnostic,
264			measure=mod_var,
265			id=c("dates","metric","prop"),
266			na.rm=F)
267			avg_tb<-cast(t,metric+prop~variable,mean)
268			median_tb<-cast(t,metric+prop~variable,median)
269			avg_tb[["prop"]]<-as.numeric(as.character(avg_tb[["prop"]]))
270			avg_RMSE<-subset(avg_tb,metric=="RMSE")
271
272	fb039a6b	Benoit Parmentier	sampling_obj<-list(sampling_dat=sampling_dat,training=sampling, training_id=sampling_station_id, tb=tb_diagnostic)
273	e7bf2d1b	Benoit Parmentier
274	0766370d	Benoit Parmentier	write.table(avg_tb, file= paste(path,"/","results2_fusion_Assessment_measure_avg_",out_prefix,".txt",sep=""), sep=",")
275			write.table(median_tb, file= paste(path,"/","results2_fusion_Assessment_measure_median_",out_prefix,".txt",sep=""), sep=",")
276	e7bf2d1b	Benoit Parmentier	write.table(tb_diagnostic, file= paste(path,"/","results2_fusion_Assessment_measure",out_prefix,".txt",sep=""), sep=",")
277			write.table(tb, file= paste(path,"/","results2_fusion_Assessment_measure_all",out_prefix,".txt",sep=""), sep=",")
278
279	0766370d	Benoit Parmentier	save(sampling_obj, file= paste(path,"/","results2_fusion_sampling_obj",out_prefix,".RData",sep=""))
280			#save(gam_fus_mod_s,file= paste(path,"/","results2_fusion_Assessment_measure_all",out_prefix,".RData",sep=""))
281			gam_fus_mod_obj<-list(gam_fus_mod=gam_fus_mod_s,sampling_obj=sampling_obj)
282			save(gam_fus_mod_obj,file= paste(path,"/","results_mod_obj_",out_prefix,".RData",sep=""))
283	e7bf2d1b	Benoit Parmentier
284			#### END OF SCRIPT

Project

General

Profile