Project

General

Profile

Download (16.3 KB) Statistics
| Branch: | Revision:
1
######################################## METHOD COMPARISON #######################################
2
############################ Multisampling for GAM CAI method #####################################
3
#This script interpolates tmax values using MODIS LST and GHCND station data                     #
4
#interpolation area. It requires the text file of stations and a shape file of the study area.   #       
5
#Note that the projection for both GHCND and study area is lonlat WGS84.                         #
6
#Method is assedsed using multisampling with variation  of validation sample with different      #
7
#hold out proportions.                                                                           #
8
#AUTHOR: Benoit Parmentier                                                                       #
9
#DATE: 09/14/2012                                                                                #
10
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#491--                                  #
11
###################################################################################################
12

    
13
###Loading R library and packages                                                      
14
library(gtools)                                         # loading some useful tools 
15
library(mgcv)                                           # GAM package by Simon Wood
16
library(sp)                                             # Spatial pacakge with class definition by Bivand et al.
17
library(spdep)                               # Spatial pacakge with methods and spatial stat. by Bivand et al.
18
library(rgdal)                               # GDAL wrapper for R, spatial utilities
19
library(gstat)                               # Kriging and co-kriging by Pebesma et al.
20
library(fields)                              # NCAR Spatial Interpolation methods such as kriging, splines
21
library(raster)                              # Hijmans et al. package for raster processing
22
library(rasterVis)
23
library(parallel)                            # Urbanek S. and Ripley B., package for multi cores & parralel processing
24
library(reshape)
25

    
26
### Parameters and argument
27

    
28
infile1<- "ghcn_or_tmax_covariates_06262012_OR83M.shp"             #GHCN shapefile containing variables for modeling 2010                 
29
infile2<-"list_10_dates_04212012.txt"                     #List of 10 dates for the regression
30
#infile2<-"list_2_dates_04212012.txt"
31
#infile2<-"list_365_dates_04212012.txt"
32
infile3<-"LST_dates_var_names.txt"                        #LST dates name
33
infile4<-"models_interpolation_05142012.txt"              #Interpolation model names
34
infile5<-"mean_day244_rescaled.rst"                       #Raster or grid for the locations of predictions
35
#infile6<-"lst_climatology.txt"
36
infile6<-"LST_files_monthly_climatology.txt"
37
inlistf<-"list_files_05032012.txt"                        #Stack of images containing the Covariates
38

    
39

    
40
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations"
41
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_07192012_GAM"
42
path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_07192012_CAI"
43
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_GAM"
44
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_07152012"     #Jupiter LOCATION on Atlas for kriging"
45
#path<-"M:/Data/IPLANT_project/data_Oregon_stations"   #Locations on Atlas
46

    
47
#Station location of the study area
48
stat_loc<-read.table(paste(path,"/","location_study_area_OR_0602012.txt",sep=""),sep=",", header=TRUE)
49
#GHCN Database for 1980-2010 for study area (OR) 
50
data3<-read.table(paste(path,"/","ghcn_data_TMAXy1980_2010_OR_0602012.txt",sep=""),sep=",", header=TRUE)
51

    
52
nmodels<-8   #number of models running
53
y_var_name<-"dailyTmax"
54
climgam=1
55
predval<-1
56
prop<-0.3                                                                           #Proportion of testing retained for validation   
57
#prop<-0.25
58
seed_number<- 100                                                                   #Seed number for random sampling
59
out_prefix<-"_09132012_365d_GAM_CAI2_multisampling2"                                #User defined output prefix
60
setwd(path)
61
bias_val<-0            #if value 1 then training data is used in the bias surface rather than the all monthly stations
62

    
63
nb_sample<-1
64
prop_min<-0.3
65
prop_max<-0.3
66
step<-0
67

    
68
#source("fusion_function_07192012.R")
69
source("GAM_CAI_function_multisampling_09132012.R")
70
############ START OF THE SCRIPT ##################
71

    
72
###Reading the station data and setting up for models' comparison
73
filename<-sub(".shp","",infile1)             #Removing the extension from file.
74
ghcn<-readOGR(".", filename)                 #reading shapefile 
75

    
76
CRS<-proj4string(ghcn)                       #Storing projection information (ellipsoid, datum,etc.)
77

    
78
mean_LST<- readGDAL(infile5)                 #Reading the whole raster in memory. This provides a grid for kriging
79
proj4string(mean_LST)<-CRS                   #Assigning coordinate information to prediction grid.
80

    
81
ghcn <- transform(ghcn,Northness = cos(ASPECT*pi/180)) #Adding a variable to the dataframe
82
ghcn <- transform(ghcn,Eastness = sin(ASPECT*pi/180))  #adding variable to the dataframe.
83
ghcn <- transform(ghcn,Northness_w = sin(slope*pi/180)*cos(ASPECT*pi/180)) #Adding a variable to the dataframe
84
ghcn <- transform(ghcn,Eastness_w = sin(slope*pi/180)*sin(ASPECT*pi/180))  #adding variable to the dataframe.
85

    
86
#Remove NA for LC and CANHEIGHT
87
ghcn$LC1[is.na(ghcn$LC1)]<-0
88
ghcn$LC3[is.na(ghcn$LC3)]<-0
89
ghcn$CANHEIGHT[is.na(ghcn$CANHEIGHT)]<-0
90

    
91
dates <-readLines(paste(path,"/",infile2, sep=""))
92
LST_dates <-readLines(paste(path,"/",infile3, sep=""))
93
models <-readLines(paste(path,"/",infile4, sep=""))
94

    
95
##Extracting the variables values from the raster files                                             
96

    
97
lines<-read.table(paste(path,"/",inlistf,sep=""), sep=" ")                  #Column 1 contains the names of raster files
98
inlistvar<-lines[,1]
99
inlistvar<-paste(path,"/",as.character(inlistvar),sep="")
100
covar_names<-as.character(lines[,2])                                         #Column two contains short names for covaraites
101

    
102
s_raster<- stack(inlistvar)                                                  #Creating a stack of raster images from the list of variables.
103
layerNames(s_raster)<-covar_names                                            #Assigning names to the raster layers
104
projection(s_raster)<-CRS
105

    
106
#stat_val<- extract(s_raster, ghcn3)                                          #Extracting values from the raster stack for every point location in coords data frame.
107
pos<-match("ASPECT",layerNames(s_raster)) #Find column with name "value"
108
r1<-raster(s_raster,layer=pos)             #Select layer from stack
109
pos<-match("slope",layerNames(s_raster)) #Find column with name "value"
110
r2<-raster(s_raster,layer=pos)             #Select layer from stack
111
N<-cos(r1*pi/180)
112
E<-sin(r1*pi/180)
113
Nw<-sin(r2*pi/180)*cos(r1*pi/180)   #Adding a variable to the dataframe
114
Ew<-sin(r2*pi/180)*sin(r1*pi/180)   #Adding variable to the dataframe.
115

    
116
pos<-match("LC1",layerNames(s_raster)) #Find column with name "value"
117
LC1<-raster(s_raster,layer=pos)             #Select layer from stack
118
s_raster<-dropLayer(s_raster,pos)
119
LC1[is.na(LC1)]<-0                      #NA must be set to zero.
120
pos<-match("LC3",layerNames(s_raster)) #Find column with name "value"
121
LC3<-raster(s_raster,layer=pos)             #Select layer from stack
122
s_raster<-dropLayer(s_raster,pos)
123
LC3[is.na(LC3)]<-0
124
pos<-match("CANHEIGHT",layerNames(s_raster)) #Find column with name "value"
125
CANHEIGHT<-raster(s_raster,layer=pos)             #Select layer from stack
126
s_raster<-dropLayer(s_raster,pos)
127
CANHEIGHT[is.na(CANHEIGHT)]<-0
128

    
129
xy<-coordinates(r1)  #get x and y projected coordinates...
130
xy_latlon<-project(xy, CRS, inv=TRUE) # find lat long for projected coordinats (or pixels...)
131
lon<-raster(xy_latlon) #Transform a matrix into a raster object ncol=ncol(r1), nrow=nrow(r1))
132
ncol(lon)<-ncol(r1)
133
nrow(lon)<-nrow(r1)
134
extent(lon)<-extent(r1)
135
projection(lon)<-CRS  #At this stage this is still an empty raster with 536 nrow and 745 ncell 
136
lat<-lon
137
values(lon)<-xy_latlon[,1]
138
values(lat)<-xy_latlon[,2]
139

    
140
r<-stack(N,E,Nw,Ew,lon,lat,LC1,LC3,CANHEIGHT)
141
rnames<-c("Northness","Eastness","Northness_w","Eastness_w", "lon","lat","LC1","LC3","CANHEIGHT")
142
layerNames(r)<-rnames
143
s_raster<-addLayer(s_raster, r)
144

    
145
#s_sgdf<-as(s_raster,"SpatialGridDataFrame") #Conversion to spatial grid data frame
146

    
147
####### Preparing LST stack of climatology...
148

    
149
#l=list.files(pattern="mean_month.*rescaled.rst")
150
l <-readLines(paste(path,"/",infile6, sep=""))
151
molst<-stack(l)  #Creating a raster stack...
152
#setwd(old)
153
molst<-molst-273.16  #K->C          #LST stack of monthly average...
154
idx <- seq(as.Date('2010-01-15'), as.Date('2010-12-15'), 'month')
155
molst <- setZ(molst, idx)
156
layerNames(molst) <- month.abb
157

    
158

    
159
######  Preparing tables for model assessment: specific diagnostic/metrics
160

    
161
#Model assessment: specific diagnostics/metrics
162
results_m1<- matrix(1,1,nmodels+3)  
163
results_m2<- matrix(1,1,nmodels+3)
164
results_m3<- matrix(1,1,nmodels+3)
165
#results_RMSE_f<- matrix(1,length(models)+3)
166

    
167
#Model assessment: general diagnostic/metrics 
168
results_RMSE <- matrix(1,1,nmodels+4)
169
results_MAE <- matrix(1,1,nmodels+4)
170
results_ME <- matrix(1,1,nmodels+4)       #There are 8+1 models
171
results_R2 <- matrix(1,1,nmodels+4)       #Coef. of determination for the validation dataset
172

    
173
results_RMSE_f<- matrix(1,1,nmodels+4)    #RMSE fit, RMSE for the training dataset
174
results_MAE_f <- matrix(1,1,nmodels+4)
175
results_R2_f<-matrix(1,1,nmodels+4)
176
######## Preparing monthly averages from the ProstGres database
177

    
178
# do this work outside of (before) this function
179
# to avoid making a copy of the data frame inside the function call
180
date1<-ISOdate(data3$year,data3$month,data3$day) #Creating a date object from 3 separate column
181
date2<-as.POSIXlt(as.Date(date1))
182
data3$date<-date2
183
d<-subset(data3,year>=2000 & mflag=="0" ) #Selecting dataset 2000-2010 with good quality: 193 stations
184
#May need some screeing??? i.e. range of temp and elevation...
185
d1<-aggregate(value~station+month, data=d, mean)  #Calculate monthly mean for every station in OR
186
id<-as.data.frame(unique(d1$station))     #Unique station in OR for year 2000-2010: 193 but 7 loss of monthly avg    
187

    
188
dst<-merge(d1, stat_loc, by.x="station", by.y="STAT_ID")   #Inner join all columns are retained
189

    
190
#This allows to change only one name of the data.frame
191
pos<-match("value",names(dst)) #Find column with name "value"
192
names(dst)[pos]<-c("TMax")
193
dst$TMax<-dst$TMax/10                #TMax is the average max temp for monthy data
194
#dstjan=dst[dst$month==9,]  #dst contains the monthly averages for tmax for every station over 2000-2010
195

    
196
######### Preparing daily values for training and testing
197

    
198
#Screening for bad values: value is tmax in this case
199
#ghcn$value<-as.numeric(ghcn$value)
200
ghcn_all<-ghcn
201
ghcn_test<-subset(ghcn,ghcn$value>-150 & ghcn$value<400)
202
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0)
203
ghcn<-ghcn_test2
204
#coords<- ghcn[,c('x_OR83M','y_OR83M')]
205

    
206
##Sampling: training and testing sites...
207

    
208
#set.seed(seed_number)                        #Using a seed number allow results based on random number to be compared...
209
nel<-length(dates)
210
dates_list<-vector("list",nel) #list of one row data.frame
211

    
212
prop_range<-(seq(from=prop_min,to=prop_max,by=step))*100
213
sn<-length(dates)*nb_sample*length(prop_range)
214

    
215
for(i in 1:length(dates)){
216
  d_tmp<-rep(dates[i],nb_sample*length(prop_range)) #repeating same date
217
  s_nb<-rep(1:nb_sample,length(prop_range))         #number of random sample per proportion
218
  prop_tmp<-sort(rep(prop_range, nb_sample))
219
  tab_run_tmp<-cbind(d_tmp,s_nb,prop_tmp)
220
  dates_list[[i]]<-tab_run_tmp
221
}
222

    
223
sampling_dat<-as.data.frame(do.call(rbind,dates_list))
224
names(sampling_dat)<-c("date","run_samp","prop")
225

    
226
for(i in 2:3){            # start of the for loop #1
227
  sampling_dat[,i]<-as.numeric(as.character(sampling_dat[,i]))  
228
}
229

    
230
sampling_dat$date<- as.character(sampling_dat[,1])
231
#ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
232
ghcn.subsets <-lapply(as.character(sampling_dat$date), function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
233

    
234
sampling<-vector("list",length(ghcn.subsets))
235

    
236
for(i in 1:length(ghcn.subsets)){
237
  n<-nrow(ghcn.subsets[[i]])
238
  prop<-(sampling_dat$prop[i])/100
239
  ns<-n-round(n*prop)   #Create a sample from the data frame with 70% of the rows
240
  nv<-n-ns              #create a sample for validation with prop of the rows
241
  ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
242
  ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
243
  sampling[[i]]<-ind.training
244
}
245

    
246
######## Prediction for the range of dates
247

    
248
#Start loop here...
249

    
250
## looping through the dates...this is the main part of the code
251
#i=1 #for debugging
252
#j=1 #for debugging
253
#for(i in 1:length(dates)){     [[       # start of the for loop #1
254
#i=1
255

    
256
#mclapply(1:length(dates), runFusion, mc.cores = 8)#This is the end bracket from mclapply(...) statement
257
#source("GAM_fusion_function_07192012d.R")
258

    
259
#gam_CAI_mod<-mclapply(1:length(dates), runGAMCAI,mc.preschedule=FALSE,mc.cores = 8) #This is the end bracket from mclapply(...) statement
260
gam_CAI_mod<-mclapply(1:length(ghcn.subsets), runGAMCAI,mc.preschedule=FALSE,mc.cores = 8) #This is the end bracket from mclapply(...) statement
261
#gam_CAI_mod<-mclapply(1:1, runGAMCAI,mc.preschedule=FALSE,mc.cores = 1) #This is the end bracket from mclapply(...) statement
262

    
263
tb<-gam_CAI_mod[[1]][[3]][0,]  #empty data frame with metric table structure that can be used in rbinding...
264
tb_tmp<-gam_CAI_mod #copy
265

    
266
for (i in 1:length(tb_tmp)){
267
  tmp<-tb_tmp[[i]][[3]]
268
  tb<-rbind(tb,tmp)
269
}
270
rm(tb_tmp)
271

    
272
for(i in 4:(nmodels+4)){            # start of the for loop #1
273
  tb[,i]<-as.numeric(as.character(tb[,i]))  
274
}
275

    
276
metrics<-as.character(unique(tb$metric))            #Name of accuracy metrics (RMSE,MAE etc.)
277
tb_metric_list<-vector("list",length(metrics))
278

    
279
for(i in 1:length(metrics)){            # Reorganizing information in terms of metrics 
280
  metric_name<-paste("tb_",metrics[i],sep="")
281
  tb_metric<-subset(tb, metric==metrics[i])
282
  tb_metric<-cbind(tb_metric,sampling_dat[,2:3])
283
  assign(metric_name,tb_metric)
284
  tb_metric_list[[i]]<-tb_metric
285
}
286

    
287
tb_diagnostic<-do.call(rbind,tb_metric_list)
288
tb_diagnostic[["prop"]]<-as.factor(tb_diagnostic[["prop"]])
289

    
290
t<-melt(tb_diagnostic,
291
        measure=c("mod1","mod2","mod3","mod4", "mod5", "mod6", "mod7", "mod8","mod9"), 
292
        id=c("dates","metric","prop"),
293
        na.rm=F)
294
avg_tb<-cast(t,metric+prop~variable,mean)
295
median_tb<-cast(t,metric+prop~variable,mean)
296
avg_tb[["prop"]]<-as.numeric(as.character(avg_tb[["prop"]]))
297
avg_RMSE<-subset(avg_tb,metric=="RMSE")
298

    
299
# Save before plotting
300
#sampling_obj<-list(sampling_dat=sampling_dat,training=sampling)
301
sampling_obj<-list(sampling_dat=sampling_dat,training=sampling, tb=tb_diagnostic)
302

    
303
write.table(avg_tb, file= paste(path,"/","results2_fusion_Assessment_measure_avg_",out_prefix,".txt",sep=""), sep=",")
304
write.table(median_tb, file= paste(path,"/","results2_fusion_Assessment_measure_median_",out_prefix,".txt",sep=""), sep=",")
305
write.table(tb_diagnostic, file= paste(path,"/","results2_fusion_Assessment_measure",out_prefix,".txt",sep=""), sep=",")
306
write.table(tb, file= paste(path,"/","results2_fusion_Assessment_measure_all",out_prefix,".txt",sep=""), sep=",")
307

    
308

    
309
# tb_RMSE<-subset(tb, metric=="RMSE")
310
# tb_MAE<-subset(tb,metric=="MAE")
311
# tb_ME<-subset(tb,metric=="ME")
312
# tb_R2<-subset(tb,metric=="R2")
313
# tb_RMSE_f<-subset(tb, metric=="RMSE_f")
314
# tb_MAE_f<-subset(tb,metric=="MAE_f")
315
# tb_R2_f<-subset(tb,metric=="R2_f")
316
# 
317
# tb_diagnostic1<-rbind(tb_RMSE,tb_MAE,tb_ME,tb_R2)
318
# #tb_diagnostic2<-rbind(tb_,tb_MAE,tb_ME,tb_R2)
319
# 
320
# mean_RMSE<-sapply(tb_RMSE[,4:(nmodels+4)],mean)
321
# mean_MAE<-sapply(tb_MAE[,4:(nmodels+4)],mean)
322
# mean_R2<-sapply(tb_R2[,4:(nmodels+4)],mean)
323
# mean_ME<-sapply(tb_ME[,4:(nmodels+4)],mean)
324
# mean_MAE_f<-sapply(tb_MAE[,4:(nmodels+4)],mean)
325
# mean_RMSE_f<-sapply(tb_RMSE_f[,4:(nmodels+4)],mean)
326
# 
327
# #Wrting metric results in textfile and model objects in .RData file
328
# write.table(tb_diagnostic1, file= paste(path,"/","results2_CAI_Assessment_measure1",out_prefix,".txt",sep=""), sep=",")
329
# write.table(tb, file= paste(path,"/","results2_CAI_Assessment_measure_all",out_prefix,".txt",sep=""), sep=",")
330
save(sampling_obj, file= paste(path,"/","results2_CAI_sampling_obj",out_prefix,".RData",sep=""))
331
save(gam_CAI_mod,file= paste(path,"/","results2_CAI_Assessment_measure_all",out_prefix,".RData",sep=""))
332

    
333
#### END OF SCRIPT
(2-2/31)