Project

General

Profile

Download (18 KB) Statistics
| Branch: | Revision:
1
######################################## METHOD COMPARISON #######################################
2
############################ Constant sampling for GAM CAI method #####################################
3
#This script interpolates tmax values using MODIS LST and GHCND station data                     #
4
#interpolation area. It requires the text file of stations and a shape file of the study area.   #       
5
#Note that the projection for both GHCND and study area is lonlat WGS84.                         #
6
#Method is assedsed using constant sampling with variation  of validation sample with different  #
7
#hold out proportions.                                                                           #
8
#AUTHOR: Benoit Parmentier                                                                       #
9
#DATE: 10/25/2012                                                                                #
10
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#491--                                  #
11
###################################################################################################
12

    
13
###Loading R library and packages                                                      
14
library(gtools)                                         # loading some useful tools 
15
library(mgcv)                                           # GAM package by Simon Wood
16
library(sp)                                             # Spatial pacakge with class definition by Bivand et al.
17
library(spdep)                               # Spatial pacakge with methods and spatial stat. by Bivand et al.
18
library(rgdal)                               # GDAL wrapper for R, spatial utilities
19
library(gstat)                               # Kriging and co-kriging by Pebesma et al.
20
library(fields)                              # NCAR Spatial Interpolation methods such as kriging, splines
21
library(raster)                              # Hijmans et al. package for raster processing
22
library(rasterVis)
23
library(parallel)                            # Urbanek S. and Ripley B., package for multi cores & parralel processing
24
library(reshape)
25

    
26
### Parameters and argument
27

    
28
infile1<- "ghcn_or_tmax_covariates_06262012_OR83M.shp"             #GHCN shapefile containing variables for modeling 2010                 
29
#infile2<-"list_10_dates_04212012.txt"                     #List of 10 dates for the regression
30
infile2<-"list_365_dates_04212012.txt"
31
infile3<-"LST_dates_var_names.txt"                        #LST dates name
32
infile4<-"models_interpolation_05142012.txt"              #Interpolation model names
33
infile5<-"mean_day244_rescaled.rst"                       #Raster or grid for the locations of predictions
34
#infile6<-"lst_climatology.txt"
35
infile6<-"LST_files_monthly_climatology.txt"
36
inlistf<-"list_files_05032012.txt"                        #Stack of images containing the Covariates
37

    
38
path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_10242012_CAI" #Atlas location
39
setwd(path)
40

    
41
#Station location for the study area
42
stat_loc<-read.table(paste(path,"/","location_study_area_OR_0602012.txt",sep=""),sep=",", header=TRUE)
43
#GHCN Database for 1980-2010 for study area (OR) 
44
data3<-read.table(paste(path,"/","ghcn_data_TMAXy1980_2010_OR_0602012.txt",sep=""),sep=",", header=TRUE)
45

    
46
nmodels<-8   #number of models running
47
y_var_name<-"dailyTmax"
48
climgam=1                                                     #if 1, then GAM is run on the climatology rather than the daily deviation surface...
49
predval<-1
50
prop<-0.3                                                     #Proportion of testing retained for validation   
51

    
52
seed_number<- 100                                             #Seed number for random sampling, if seed_number<0, no seed number is used..
53
#out_prefix<-"_365d_GAM_CAI2_const_10222012_"                  #User defined output prefix
54
out_prefix<-"_365d_GAM_CAI2_all_lstd_10262012"                #User defined output prefix
55

    
56
bias_val<-0            #if value 1 then daily training data is used in the bias surface rather than the all monthly stations (added on 07/11/2012)
57
bias_prediction<-1     #if value 1 then use GAM for the BIAS prediction otherwise GAM direct reprediction for y_var (daily tmax)
58
nb_sample<-1           #number of time random sampling must be repeated for every hold out proportion
59
prop_min<-0.3          #if prop_min=prop_max and step=0 then predicitons are done for the number of dates...
60
prop_max<-0.3
61
step<-0         
62
constant<-0            #if value 1 then use the same samples as date one for the all set of dates
63
#projection used in the interpolation of the study area
64
CRS_interp<-"+proj=lcc +lat_1=43 +lat_2=45.5 +lat_0=41.75 +lon_0=-120.5 +x_0=400000 +y_0=0 +ellps=GRS80 +units=m +no_defs";
65
CRS_locs_WGS84<-CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +towgs84=0,0,0") #Station coords WGS84
66

    
67
source("GAM_CAI_function_multisampling_10252012.R")
68

    
69
############ START OF THE SCRIPT ##################
70

    
71
###Reading the station data and setting up for models' comparison
72
filename<-sub(".shp","",infile1)             #Removing the extension from file.
73
ghcn<-readOGR(".", filename)                 #reading shapefile 
74

    
75
CRS<-proj4string(ghcn)                       #Storing projection information (ellipsoid, datum,etc.)
76

    
77
mean_LST<- readGDAL(infile5)                 #Reading the whole raster in memory. This provides a grid for kriging
78
proj4string(mean_LST)<-CRS                   #Assigning coordinate information to prediction grid.
79

    
80
ghcn <- transform(ghcn,Northness = cos(ASPECT*pi/180)) #Adding a variable to the dataframe
81
ghcn <- transform(ghcn,Eastness = sin(ASPECT*pi/180))  #adding variable to the dataframe.
82
ghcn <- transform(ghcn,Northness_w = sin(slope*pi/180)*cos(ASPECT*pi/180)) #Adding a variable to the dataframe
83
ghcn <- transform(ghcn,Eastness_w = sin(slope*pi/180)*sin(ASPECT*pi/180))  #adding variable to the dataframe.
84

    
85
#Remove NA for LC and CANHEIGHT
86
ghcn$LC1[is.na(ghcn$LC1)]<-0
87
ghcn$LC3[is.na(ghcn$LC3)]<-0
88
ghcn$CANHEIGHT[is.na(ghcn$CANHEIGHT)]<-0
89
ghcn$LC4[is.na(ghcn$LC4)]<-0
90
ghcn$LC6[is.na(ghcn$LC6)]<-0
91

    
92
#Use file.path for to construct pathfor independent os platform? !!!
93
dates<-readLines(file.path(path,infile2))
94
#dates <-readLines(paste(path,"/",infile2, sep=""))
95
LST_dates <-readLines(paste(path,"/",infile3, sep=""))
96
models <-readLines(paste(path,"/",infile4, sep=""))
97

    
98
##Extracting the variables values from the raster files                                             
99

    
100
lines<-read.table(paste(path,"/",inlistf,sep=""), sep=" ")                  #Column 1 contains the names of raster files
101
inlistvar<-lines[,1]
102
inlistvar<-paste(path,"/",as.character(inlistvar),sep="")
103
covar_names<-as.character(lines[,2])                                         #Column two contains short names for covaraites
104

    
105
s_raster<- stack(inlistvar)                                                  #Creating a stack of raster images from the list of variables.
106
layerNames(s_raster)<-covar_names                                            #Assigning names to the raster layers
107
projection(s_raster)<-CRS
108

    
109
#stat_val<- extract(s_raster, ghcn3)                                          #Extracting values from the raster stack for every point location in coords data frame.
110
pos<-match("ASPECT",layerNames(s_raster)) #Find column with name "value"
111
r1<-raster(s_raster,layer=pos)             #Select layer from stack
112
pos<-match("slope",layerNames(s_raster)) #Find column with name "value"
113
r2<-raster(s_raster,layer=pos)             #Select layer from stack
114
N<-cos(r1*pi/180)
115
E<-sin(r1*pi/180)
116
Nw<-sin(r2*pi/180)*cos(r1*pi/180)   #Adding a variable to the dataframe
117
Ew<-sin(r2*pi/180)*sin(r1*pi/180)   #Adding variable to the dataframe.
118

    
119
pos<-match("LC1",layerNames(s_raster)) #Find column with name "value"
120
LC1<-raster(s_raster,layer=pos)             #Select layer from stack
121
s_raster<-dropLayer(s_raster,pos)
122
LC1[is.na(LC1)]<-0                      #NA must be set to zero.
123
pos<-match("LC3",layerNames(s_raster)) #Find column with name "value"
124
LC3<-raster(s_raster,layer=pos)             #Select layer from stack
125
s_raster<-dropLayer(s_raster,pos)
126
LC3[is.na(LC3)]<-0
127

    
128
#Modification added to account for other land cover
129

    
130
pos<-match("LC4",layerNames(s_raster)) #Find column with name "value"
131
LC4<-raster(s_raster,layer=pos)             #Select layer from stack
132
s_raster<-dropLayer(s_raster,pos)
133
LC4[is.na(LC4)]<-0
134

    
135
pos<-match("LC6",layerNames(s_raster)) #Find column with name "value"
136
LC6<-raster(s_raster,layer=pos)             #Select layer from stack
137
s_raster<-dropLayer(s_raster,pos)
138
LC6[is.na(LC6)]<-0
139

    
140
LC_s<-stack(LC1,LC3,LC4,LC6)
141
layerNames(LC_s)<-c("LC1_forest","LC3_grass","LC4_crop","LC6_urban")
142
plot(LC_s)
143

    
144
pos<-match("CANHEIGHT",layerNames(s_raster)) #Find column with name "value"
145
CANHEIGHT<-raster(s_raster,layer=pos)             #Select layer from stack
146
s_raster<-dropLayer(s_raster,pos)
147
CANHEIGHT[is.na(CANHEIGHT)]<-0
148

    
149
xy<-coordinates(r1)  #get x and y projected coordinates...
150
xy_latlon<-project(xy, CRS, inv=TRUE) # find lat long for projected coordinats (or pixels...)
151
lon<-raster(xy_latlon) #Transform a matrix into a raster object ncol=ncol(r1), nrow=nrow(r1))
152
ncol(lon)<-ncol(r1)
153
nrow(lon)<-nrow(r1)
154
extent(lon)<-extent(r1)
155
projection(lon)<-CRS  #At this stage this is still an empty raster with 536 nrow and 745 ncell 
156
lat<-lon
157
values(lon)<-xy_latlon[,1]
158
values(lat)<-xy_latlon[,2]
159

    
160
r<-stack(N,E,Nw,Ew,lon,lat,LC1,LC3,LC4,LC6, CANHEIGHT)
161
rnames<-c("Northness","Eastness","Northness_w","Eastness_w", "lon","lat","LC1","LC3","LC4","LC6","CANHEIGHT")
162
layerNames(r)<-rnames
163
s_raster<-addLayer(s_raster, r)
164

    
165
#s_sgdf<-as(s_raster,"SpatialGridDataFrame") #Conversion to spatial grid data frame
166

    
167
####### Preparing LST stack of climatology...
168

    
169
#l=list.files(pattern="mean_month.*rescaled.rst")
170
l <-readLines(paste(path,"/",infile6, sep=""))
171
molst<-stack(l)  #Creating a raster stack...
172
#setwd(old)
173
molst<-molst-273.16  #K->C          #LST stack of monthly average...
174
idx <- seq(as.Date('2010-01-15'), as.Date('2010-12-15'), 'month')
175
molst <- setZ(molst, idx)
176
layerNames(molst) <- month.abb
177

    
178
######  Preparing tables for model assessment: specific diagnostic/metrics
179

    
180
#Model assessment: specific diagnostics/metrics
181
results_m1<- matrix(1,1,nmodels+3)  
182
results_m2<- matrix(1,1,nmodels+3)
183
results_m3<- matrix(1,1,nmodels+3)
184
#results_RMSE_f<- matrix(1,length(models)+3)
185

    
186
#Model assessment: general diagnostic/metrics 
187
results_RMSE <- matrix(1,1,nmodels+4)
188
results_MAE <- matrix(1,1,nmodels+4)
189
results_ME <- matrix(1,1,nmodels+4)       #There are 8+1 models
190
results_R2 <- matrix(1,1,nmodels+4)       #Coef. of determination for the validation dataset
191

    
192
results_RMSE_f<- matrix(1,1,nmodels+4)    #RMSE fit, RMSE for the training dataset
193
results_MAE_f <- matrix(1,1,nmodels+4)
194
results_R2_f<-matrix(1,1,nmodels+4)
195
######## Preparing monthly averages from the ProstGres database
196

    
197
# do this work outside of (before) this function
198
# to avoid making a copy of the data frame inside the function call
199
date1<-ISOdate(data3$year,data3$month,data3$day) #Creating a date object from 3 separate column
200
date2<-as.POSIXlt(as.Date(date1))
201
data3$date<-date2
202
d<-subset(data3,year>=2000 & mflag=="0" ) #Selecting dataset 2000-2010 with good quality: 193 stations
203
#May need some screeing??? i.e. range of temp and elevation...
204
d1<-aggregate(value~station+month, data=d, mean)  #Calculate monthly mean for every station in OR
205
id<-as.data.frame(unique(d1$station))     #Unique station in OR for year 2000-2010: 193 but 7 loss of monthly avg    
206

    
207
dst<-merge(d1, stat_loc, by.x="station", by.y="STAT_ID")   #Inner join all columns are retained
208

    
209
#This allows to change only one name of the data.frame
210
pos<-match("value",names(dst)) #Find column with name "value"
211
names(dst)[pos]<-c("TMax")
212
dst$TMax<-dst$TMax/10                #TMax is the average max temp for monthy data
213
#dstjan=dst[dst$month==9,]  #dst contains the monthly averages for tmax for every station over 2000-2010
214

    
215
#Extracting covariates from stack for the monthly dataset...
216
coords<- dst[c('lon','lat')]              #Define coordinates in a data frame
217
coordinates(dst)<-coords                      #Assign coordinates to the data frame
218
proj4string(dst)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
219
dst_month<-spTransform(dst,CRS(CRS_interp))     #Project from WGS84 to new coord. system
220

    
221
stations_val<-extract(s_raster,dst_month)  #extraction of the infomration at station location
222
stations_val<-as.data.frame(stations_val)
223
dst_extract<-cbind(dst_month,stations_val)
224
dst<-dst_extract
225

    
226
######### Preparing daily values for training and testing
227

    
228
#Screening for bad values: value is tmax in this case
229
#ghcn$value<-as.numeric(ghcn$value)
230
ghcn_all<-ghcn
231
ghcn_test<-subset(ghcn,ghcn$value>-150 & ghcn$value<400)
232
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0)
233
ghcn<-ghcn_test2
234
#coords<- ghcn[,c('x_OR83M','y_OR83M')]
235

    
236
##Sampling: training and testing sites...
237

    
238
if (seed_number>0) {
239
  set.seed(seed_number)                        #Using a seed number allow results based on random number to be compared...
240
}
241
nel<-length(dates)
242
dates_list<-vector("list",nel) #list of one row data.frame
243

    
244
prop_range<-(seq(from=prop_min,to=prop_max,by=step))*100
245
sn<-length(dates)*nb_sample*length(prop_range)
246

    
247
for(i in 1:length(dates)){
248
  d_tmp<-rep(dates[i],nb_sample*length(prop_range)) #repeating same date
249
  s_nb<-rep(1:nb_sample,length(prop_range))         #number of random sample per proportion
250
  prop_tmp<-sort(rep(prop_range, nb_sample))
251
  tab_run_tmp<-cbind(d_tmp,s_nb,prop_tmp)
252
  dates_list[[i]]<-tab_run_tmp
253
}
254

    
255
sampling_dat<-as.data.frame(do.call(rbind,dates_list))
256
names(sampling_dat)<-c("date","run_samp","prop")
257

    
258
for(i in 2:3){            # start of the for loop #1
259
  sampling_dat[,i]<-as.numeric(as.character(sampling_dat[,i]))  
260
}
261

    
262
sampling_dat$date<- as.character(sampling_dat[,1])
263
#ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
264
ghcn.subsets <-lapply(as.character(sampling_dat$date), function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
265

    
266
sampling<-vector("list",length(ghcn.subsets))
267

    
268
for(i in 1:length(ghcn.subsets)){
269
  n<-nrow(ghcn.subsets[[i]])
270
  prop<-(sampling_dat$prop[i])/100
271
  ns<-n-round(n*prop)   #Create a sample from the data frame with 70% of the rows
272
  nv<-n-ns              #create a sample for validation with prop of the rows
273
  ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
274
  ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
275
  sampling[[i]]<-ind.training
276
}
277

    
278
if (constant==1){
279
  sampled<-sampling[[1]]
280
  list_const_sampling<-vector("list",sn)
281
  for(i in 1:sn){
282
    list_const_sampling[[i]]<-sampled
283
  }
284
  sampling<-list_const_sampling  
285
}
286

    
287
######## Prediction for the range of dates
288

    
289
#Start loop here...
290

    
291
#gam_CAI_mod<-mclapply(1:length(dates), runGAMCAI,mc.preschedule=FALSE,mc.cores = 8) #This is the end bracket from mclapply(...) statement
292
gam_CAI_mod<-mclapply(1:length(ghcn.subsets), runGAMCAI,mc.preschedule=FALSE,mc.cores = 8) #This is the end bracket from mclapply(...) statement
293
#gam_CAI_mod<-mclapply(1:1, runGAMCAI,mc.preschedule=FALSE,mc.cores = 1) #This is the end bracket from mclapply(...) statement
294

    
295
tb<-gam_CAI_mod[[1]][[3]][0,]  #empty data frame with metric table structure that can be used in rbinding...
296
tb_tmp<-gam_CAI_mod #copy
297

    
298
for (i in 1:length(tb_tmp)){
299
  tmp<-tb_tmp[[i]][[3]]
300
  tb<-rbind(tb,tmp)
301
}
302
rm(tb_tmp)
303

    
304
for(i in 4:(nmodels+4)){            # start of the for loop #1
305
  tb[,i]<-as.numeric(as.character(tb[,i]))  
306
}
307

    
308
metrics<-as.character(unique(tb$metric))            #Name of accuracy metrics (RMSE,MAE etc.)
309
tb_metric_list<-vector("list",length(metrics))
310

    
311
for(i in 1:length(metrics)){            # Reorganizing information in terms of metrics 
312
  metric_name<-paste("tb_",metrics[i],sep="")
313
  tb_metric<-subset(tb, metric==metrics[i])
314
  tb_metric<-cbind(tb_metric,sampling_dat[,2:3])
315
  assign(metric_name,tb_metric)
316
  tb_metric_list[[i]]<-tb_metric
317
}
318

    
319
tb_diagnostic<-do.call(rbind,tb_metric_list)  #produce a data.frame from the list ...
320
tb_diagnostic[["prop"]]<-as.factor(tb_diagnostic[["prop"]])
321

    
322
t<-melt(tb_diagnostic,
323
        measure=c("mod1","mod2","mod3","mod4", "mod5", "mod6", "mod7", "mod8","mod9"), 
324
        id=c("dates","metric","prop"),
325
        na.rm=F)
326
avg_tb<-cast(t,metric+prop~variable,mean)
327
median_tb<-cast(t,metric+prop~variable,mean)
328
avg_tb[["prop"]]<-as.numeric(as.character(avg_tb[["prop"]]))
329
avg_RMSE<-subset(avg_tb,metric=="RMSE")
330

    
331
# Save before plotting
332
#sampling_obj<-list(sampling_dat=sampling_dat,training=sampling)
333
sampling_obj<-list(sampling_dat=sampling_dat,training=sampling, tb=tb_diagnostic)
334

    
335
write.table(avg_tb, file= paste(path,"/","results2_fusion_Assessment_measure_avg_",out_prefix,".txt",sep=""), sep=",")
336
write.table(median_tb, file= paste(path,"/","results2_fusion_Assessment_measure_median_",out_prefix,".txt",sep=""), sep=",")
337
write.table(tb_diagnostic, file= paste(path,"/","results2_fusion_Assessment_measure",out_prefix,".txt",sep=""), sep=",")
338
write.table(tb, file= paste(path,"/","results2_fusion_Assessment_measure_all",out_prefix,".txt",sep=""), sep=",")
339

    
340

    
341
# tb_RMSE<-subset(tb, metric=="RMSE")
342
# tb_MAE<-subset(tb,metric=="MAE")
343
# tb_ME<-subset(tb,metric=="ME")
344
# tb_R2<-subset(tb,metric=="R2")
345
# tb_RMSE_f<-subset(tb, metric=="RMSE_f")
346
# tb_MAE_f<-subset(tb,metric=="MAE_f")
347
# tb_R2_f<-subset(tb,metric=="R2_f")
348
# 
349
# tb_diagnostic1<-rbind(tb_RMSE,tb_MAE,tb_ME,tb_R2)
350
# #tb_diagnostic2<-rbind(tb_,tb_MAE,tb_ME,tb_R2)
351
# 
352
# mean_RMSE<-sapply(tb_RMSE[,4:(nmodels+4)],mean)
353
# mean_MAE<-sapply(tb_MAE[,4:(nmodels+4)],mean)
354
# mean_R2<-sapply(tb_R2[,4:(nmodels+4)],mean)
355
# mean_ME<-sapply(tb_ME[,4:(nmodels+4)],mean)
356
# mean_MAE_f<-sapply(tb_MAE[,4:(nmodels+4)],mean)
357
# mean_RMSE_f<-sapply(tb_RMSE_f[,4:(nmodels+4)],mean)
358
# 
359
# #Wrting metric results in textfile and model objects in .RData file
360
# write.table(tb_diagnostic1, file= paste(path,"/","results2_CAI_Assessment_measure1",out_prefix,".txt",sep=""), sep=",")
361
# write.table(tb, file= paste(path,"/","results2_CAI_Assessment_measure_all",out_prefix,".txt",sep=""), sep=",")
362
save(sampling_obj, file= paste(path,"/","results2_CAI_sampling_obj",out_prefix,".RData",sep=""))
363
save(gam_CAI_mod,file= paste(path,"/","results2_CAI_Assessment_measure_all",out_prefix,".RData",sep=""))
364

    
365
#### END OF SCRIPT
(2-2/32)