Revision 7c37315b
Added by Benoit Parmentier about 12 years ago
climate/research/oregon/interpolation/GAM_CAI_analysis_raster_prediction_multisampling.R | ||
---|---|---|
1 | 1 |
######################################## METHOD COMPARISON ####################################### |
2 |
############################ Multisampling for GAM CAI method #####################################
|
|
2 |
############################ Constant sampling for GAM CAI method #####################################
|
|
3 | 3 |
#This script interpolates tmax values using MODIS LST and GHCND station data # |
4 | 4 |
#interpolation area. It requires the text file of stations and a shape file of the study area. # |
5 | 5 |
#Note that the projection for both GHCND and study area is lonlat WGS84. # |
6 |
#Method is assedsed using multisampling with variation of validation sample with different #
|
|
6 |
#Method is assedsed using constant sampling with variation of validation sample with different #
|
|
7 | 7 |
#hold out proportions. # |
8 | 8 |
#AUTHOR: Benoit Parmentier # |
9 |
#DATE: 09/14/2012 #
|
|
9 |
#DATE: 10/25/2012 #
|
|
10 | 10 |
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#491-- # |
11 | 11 |
################################################################################################### |
12 | 12 |
|
... | ... | |
26 | 26 |
### Parameters and argument |
27 | 27 |
|
28 | 28 |
infile1<- "ghcn_or_tmax_covariates_06262012_OR83M.shp" #GHCN shapefile containing variables for modeling 2010 |
29 |
infile2<-"list_10_dates_04212012.txt" #List of 10 dates for the regression |
|
30 |
#infile2<-"list_2_dates_04212012.txt" |
|
31 |
#infile2<-"list_365_dates_04212012.txt" |
|
29 |
#infile2<-"list_10_dates_04212012.txt" #List of 10 dates for the regression |
|
30 |
infile2<-"list_365_dates_04212012.txt" |
|
32 | 31 |
infile3<-"LST_dates_var_names.txt" #LST dates name |
33 | 32 |
infile4<-"models_interpolation_05142012.txt" #Interpolation model names |
34 | 33 |
infile5<-"mean_day244_rescaled.rst" #Raster or grid for the locations of predictions |
... | ... | |
36 | 35 |
infile6<-"LST_files_monthly_climatology.txt" |
37 | 36 |
inlistf<-"list_files_05032012.txt" #Stack of images containing the Covariates |
38 | 37 |
|
38 |
path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_10242012_CAI" #Atlas location |
|
39 |
setwd(path) |
|
39 | 40 |
|
40 |
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations" |
|
41 |
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_07192012_GAM" |
|
42 |
path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_07192012_CAI" |
|
43 |
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_GAM" |
|
44 |
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_07152012" #Jupiter LOCATION on Atlas for kriging" |
|
45 |
#path<-"M:/Data/IPLANT_project/data_Oregon_stations" #Locations on Atlas |
|
46 |
|
|
47 |
#Station location of the study area |
|
41 |
#Station location for the study area |
|
48 | 42 |
stat_loc<-read.table(paste(path,"/","location_study_area_OR_0602012.txt",sep=""),sep=",", header=TRUE) |
49 | 43 |
#GHCN Database for 1980-2010 for study area (OR) |
50 | 44 |
data3<-read.table(paste(path,"/","ghcn_data_TMAXy1980_2010_OR_0602012.txt",sep=""),sep=",", header=TRUE) |
51 | 45 |
|
52 | 46 |
nmodels<-8 #number of models running |
53 | 47 |
y_var_name<-"dailyTmax" |
54 |
climgam=1 |
|
48 |
climgam=1 #if 1, then GAM is run on the climatology rather than the daily deviation surface...
|
|
55 | 49 |
predval<-1 |
56 |
prop<-0.3 #Proportion of testing retained for validation |
|
57 |
#prop<-0.25 |
|
58 |
seed_number<- 100 #Seed number for random sampling |
|
59 |
out_prefix<-"_09132012_365d_GAM_CAI2_multisampling2" #User defined output prefix |
|
60 |
setwd(path) |
|
61 |
bias_val<-0 #if value 1 then training data is used in the bias surface rather than the all monthly stations |
|
50 |
prop<-0.3 #Proportion of testing retained for validation |
|
62 | 51 |
|
63 |
nb_sample<-1 |
|
64 |
prop_min<-0.3 |
|
52 |
seed_number<- 100 #Seed number for random sampling, if seed_number<0, no seed number is used.. |
|
53 |
#out_prefix<-"_365d_GAM_CAI2_const_10222012_" #User defined output prefix |
|
54 |
out_prefix<-"_365d_GAM_CAI2_all_lstd_10262012" #User defined output prefix |
|
55 |
|
|
56 |
bias_val<-0 #if value 1 then daily training data is used in the bias surface rather than the all monthly stations (added on 07/11/2012) |
|
57 |
bias_prediction<-1 #if value 1 then use GAM for the BIAS prediction otherwise GAM direct reprediction for y_var (daily tmax) |
|
58 |
nb_sample<-1 #number of time random sampling must be repeated for every hold out proportion |
|
59 |
prop_min<-0.3 #if prop_min=prop_max and step=0 then predicitons are done for the number of dates... |
|
65 | 60 |
prop_max<-0.3 |
66 |
step<-0 |
|
61 |
step<-0 |
|
62 |
constant<-0 #if value 1 then use the same samples as date one for the all set of dates |
|
63 |
#projection used in the interpolation of the study area |
|
64 |
CRS_interp<-"+proj=lcc +lat_1=43 +lat_2=45.5 +lat_0=41.75 +lon_0=-120.5 +x_0=400000 +y_0=0 +ellps=GRS80 +units=m +no_defs"; |
|
65 |
CRS_locs_WGS84<-CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +towgs84=0,0,0") #Station coords WGS84 |
|
66 |
|
|
67 |
source("GAM_CAI_function_multisampling_10252012.R") |
|
67 | 68 |
|
68 |
#source("fusion_function_07192012.R") |
|
69 |
source("GAM_CAI_function_multisampling_09132012.R") |
|
70 | 69 |
############ START OF THE SCRIPT ################## |
71 | 70 |
|
72 | 71 |
###Reading the station data and setting up for models' comparison |
... | ... | |
87 | 86 |
ghcn$LC1[is.na(ghcn$LC1)]<-0 |
88 | 87 |
ghcn$LC3[is.na(ghcn$LC3)]<-0 |
89 | 88 |
ghcn$CANHEIGHT[is.na(ghcn$CANHEIGHT)]<-0 |
89 |
ghcn$LC4[is.na(ghcn$LC4)]<-0 |
|
90 |
ghcn$LC6[is.na(ghcn$LC6)]<-0 |
|
90 | 91 |
|
91 |
dates <-readLines(paste(path,"/",infile2, sep="")) |
|
92 |
#Use file.path for to construct pathfor independent os platform? !!! |
|
93 |
dates<-readLines(file.path(path,infile2)) |
|
94 |
#dates <-readLines(paste(path,"/",infile2, sep="")) |
|
92 | 95 |
LST_dates <-readLines(paste(path,"/",infile3, sep="")) |
93 | 96 |
models <-readLines(paste(path,"/",infile4, sep="")) |
94 | 97 |
|
... | ... | |
121 | 124 |
LC3<-raster(s_raster,layer=pos) #Select layer from stack |
122 | 125 |
s_raster<-dropLayer(s_raster,pos) |
123 | 126 |
LC3[is.na(LC3)]<-0 |
127 |
|
|
128 |
#Modification added to account for other land cover |
|
129 |
|
|
130 |
pos<-match("LC4",layerNames(s_raster)) #Find column with name "value" |
|
131 |
LC4<-raster(s_raster,layer=pos) #Select layer from stack |
|
132 |
s_raster<-dropLayer(s_raster,pos) |
|
133 |
LC4[is.na(LC4)]<-0 |
|
134 |
|
|
135 |
pos<-match("LC6",layerNames(s_raster)) #Find column with name "value" |
|
136 |
LC6<-raster(s_raster,layer=pos) #Select layer from stack |
|
137 |
s_raster<-dropLayer(s_raster,pos) |
|
138 |
LC6[is.na(LC6)]<-0 |
|
139 |
|
|
140 |
LC_s<-stack(LC1,LC3,LC4,LC6) |
|
141 |
layerNames(LC_s)<-c("LC1_forest","LC3_grass","LC4_crop","LC6_urban") |
|
142 |
plot(LC_s) |
|
143 |
|
|
124 | 144 |
pos<-match("CANHEIGHT",layerNames(s_raster)) #Find column with name "value" |
125 | 145 |
CANHEIGHT<-raster(s_raster,layer=pos) #Select layer from stack |
126 | 146 |
s_raster<-dropLayer(s_raster,pos) |
... | ... | |
137 | 157 |
values(lon)<-xy_latlon[,1] |
138 | 158 |
values(lat)<-xy_latlon[,2] |
139 | 159 |
|
140 |
r<-stack(N,E,Nw,Ew,lon,lat,LC1,LC3,CANHEIGHT) |
|
141 |
rnames<-c("Northness","Eastness","Northness_w","Eastness_w", "lon","lat","LC1","LC3","CANHEIGHT") |
|
160 |
r<-stack(N,E,Nw,Ew,lon,lat,LC1,LC3,LC4,LC6, CANHEIGHT)
|
|
161 |
rnames<-c("Northness","Eastness","Northness_w","Eastness_w", "lon","lat","LC1","LC3","LC4","LC6","CANHEIGHT")
|
|
142 | 162 |
layerNames(r)<-rnames |
143 | 163 |
s_raster<-addLayer(s_raster, r) |
144 | 164 |
|
... | ... | |
155 | 175 |
molst <- setZ(molst, idx) |
156 | 176 |
layerNames(molst) <- month.abb |
157 | 177 |
|
158 |
|
|
159 | 178 |
###### Preparing tables for model assessment: specific diagnostic/metrics |
160 | 179 |
|
161 | 180 |
#Model assessment: specific diagnostics/metrics |
... | ... | |
193 | 212 |
dst$TMax<-dst$TMax/10 #TMax is the average max temp for monthy data |
194 | 213 |
#dstjan=dst[dst$month==9,] #dst contains the monthly averages for tmax for every station over 2000-2010 |
195 | 214 |
|
215 |
#Extracting covariates from stack for the monthly dataset... |
|
216 |
coords<- dst[c('lon','lat')] #Define coordinates in a data frame |
|
217 |
coordinates(dst)<-coords #Assign coordinates to the data frame |
|
218 |
proj4string(dst)<-CRS_locs_WGS84 #Assign coordinates reference system in PROJ4 format |
|
219 |
dst_month<-spTransform(dst,CRS(CRS_interp)) #Project from WGS84 to new coord. system |
|
220 |
|
|
221 |
stations_val<-extract(s_raster,dst_month) #extraction of the infomration at station location |
|
222 |
stations_val<-as.data.frame(stations_val) |
|
223 |
dst_extract<-cbind(dst_month,stations_val) |
|
224 |
dst<-dst_extract |
|
225 |
|
|
196 | 226 |
######### Preparing daily values for training and testing |
197 | 227 |
|
198 | 228 |
#Screening for bad values: value is tmax in this case |
... | ... | |
205 | 235 |
|
206 | 236 |
##Sampling: training and testing sites... |
207 | 237 |
|
208 |
#set.seed(seed_number) #Using a seed number allow results based on random number to be compared... |
|
238 |
if (seed_number>0) { |
|
239 |
set.seed(seed_number) #Using a seed number allow results based on random number to be compared... |
|
240 |
} |
|
209 | 241 |
nel<-length(dates) |
210 | 242 |
dates_list<-vector("list",nel) #list of one row data.frame |
211 | 243 |
|
... | ... | |
243 | 275 |
sampling[[i]]<-ind.training |
244 | 276 |
} |
245 | 277 |
|
278 |
if (constant==1){ |
|
279 |
sampled<-sampling[[1]] |
|
280 |
list_const_sampling<-vector("list",sn) |
|
281 |
for(i in 1:sn){ |
|
282 |
list_const_sampling[[i]]<-sampled |
|
283 |
} |
|
284 |
sampling<-list_const_sampling |
|
285 |
} |
|
286 |
|
|
246 | 287 |
######## Prediction for the range of dates |
247 | 288 |
|
248 | 289 |
#Start loop here... |
249 | 290 |
|
250 |
## looping through the dates...this is the main part of the code |
|
251 |
#i=1 #for debugging |
|
252 |
#j=1 #for debugging |
|
253 |
#for(i in 1:length(dates)){ [[ # start of the for loop #1 |
|
254 |
#i=1 |
|
255 |
|
|
256 |
#mclapply(1:length(dates), runFusion, mc.cores = 8)#This is the end bracket from mclapply(...) statement |
|
257 |
#source("GAM_fusion_function_07192012d.R") |
|
258 |
|
|
259 | 291 |
#gam_CAI_mod<-mclapply(1:length(dates), runGAMCAI,mc.preschedule=FALSE,mc.cores = 8) #This is the end bracket from mclapply(...) statement |
260 | 292 |
gam_CAI_mod<-mclapply(1:length(ghcn.subsets), runGAMCAI,mc.preschedule=FALSE,mc.cores = 8) #This is the end bracket from mclapply(...) statement |
261 | 293 |
#gam_CAI_mod<-mclapply(1:1, runGAMCAI,mc.preschedule=FALSE,mc.cores = 1) #This is the end bracket from mclapply(...) statement |
... | ... | |
284 | 316 |
tb_metric_list[[i]]<-tb_metric |
285 | 317 |
} |
286 | 318 |
|
287 |
tb_diagnostic<-do.call(rbind,tb_metric_list) |
|
319 |
tb_diagnostic<-do.call(rbind,tb_metric_list) #produce a data.frame from the list ...
|
|
288 | 320 |
tb_diagnostic[["prop"]]<-as.factor(tb_diagnostic[["prop"]]) |
289 | 321 |
|
290 | 322 |
t<-melt(tb_diagnostic, |
Also available in: Unified diff
GAM CAI, added constant sampling over year and monthly extraction for climatology fitting