Revision 3ee4fbde
Added by Benoit Parmentier over 12 years ago
climate/research/oregon/interpolation/fusion_reg.R | ||
---|---|---|
35 | 35 |
data3<-read.table(paste(path,"/","ghcn_data_TMAXy1980_2010_OR_0602012.txt",sep=""),sep=",", header=TRUE) |
36 | 36 |
|
37 | 37 |
prop<-0.3 #Proportion of testing retained for validation |
38 |
#prop<-0.25 |
|
38 | 39 |
seed_number<- 100 #Seed number for random sampling |
39 |
out_prefix<-"_07022012_10d_fusion8" #User defined output prefix
|
|
40 |
out_prefix<-"_07022012_10d_fusion12" #User defined output prefix
|
|
40 | 41 |
setwd(path) |
41 | 42 |
############ START OF THE SCRIPT ################## |
42 | 43 |
|
... | ... | |
54 | 55 |
mean_LST<- readGDAL(infile5) #Reading the whole raster in memory. This provides a grid for kriging |
55 | 56 |
proj4string(mean_LST)<-CRS #Assigning coordinate information to prediction grid. |
56 | 57 |
|
57 |
ghcn = transform(ghcn,Northness = cos(ASPECT)) #Adding a variable to the dataframe |
|
58 |
ghcn = transform(ghcn,Eastness = sin(ASPECT)) #adding variable to the dataframe. |
|
59 |
ghcn = transform(ghcn,Northness_w = sin(slope)*cos(ASPECT)) #Adding a variable to the dataframe |
|
60 |
ghcn = transform(ghcn,Eastness_w = sin(slope)*sin(ASPECT)) #adding variable to the dataframe. |
|
58 |
ghcn = transform(ghcn,Northness = cos(ASPECT*pi/180)) #Adding a variable to the dataframe |
|
59 |
#ghcn = transform(ghcn,Northness = cos(ASPECT)) #Adding a variable to the dataframe |
|
60 |
#ghcn = transform(ghcn,Eastness = sin(ASPECT)) #adding variable to the dataframe. |
|
61 |
ghcn = transform(ghcn,Eastness = sin(ASPECT*pi/180)) #adding variable to the dataframe. |
|
62 |
#ghcn = transform(ghcn,Northness_w = sin(slope)*cos(ASPECT)) #Adding a variable to the dataframe |
|
63 |
#ghcn = transform(ghcn,Eastness_w = sin(slope)*sin(ASPECT)) #adding variable to the dataframe. |
|
64 |
ghcn = transform(ghcn,Northness_w = sin(slope*pi/180)*cos(ASPECT*pi/180)) #Adding a variable to the dataframe |
|
65 |
ghcn = transform(ghcn,Eastness_w = sin(slope*pi/180)*sin(ASPECT*pi/180)) #adding variable to the dataframe. |
|
66 |
|
|
67 |
#Remove NA for LC and CANHEIGHT |
|
68 |
ghcn$LC1[is.na(ghcn$LC1)]<-0 |
|
69 |
ghcn$LC3[is.na(ghcn$LC3)]<-0 |
|
70 |
ghcn$CANHEIGHT[is.na(ghcn$CANHEIGHT)]<-0 |
|
71 |
|
|
61 | 72 |
|
62 | 73 |
set.seed(seed_number) #Using a seed number allow results based on random number to be compared... |
63 | 74 |
|
... | ... | |
84 | 95 |
# cor_LST_LC3<-matrix(1,10,1) #correlation LST-LC3 |
85 | 96 |
# cor_LST_tmax<-matrix(1,10,1) #correlation LST-tmax |
86 | 97 |
|
87 |
#Screening for bad values: element is tmax in this case
|
|
88 |
ghcn$element<-as.numeric(ghcn$element)
|
|
98 |
#Screening for bad values: value is tmax in this case
|
|
99 |
#ghcn$value<-as.numeric(ghcn$value)
|
|
89 | 100 |
ghcn_all<-ghcn |
90 |
ghcn_test<-subset(ghcn,ghcn$element>-150 & ghcn$element<400)
|
|
101 |
ghcn_test<-subset(ghcn,ghcn$value>-150 & ghcn$value<400)
|
|
91 | 102 |
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0) |
92 | 103 |
ghcn<-ghcn_test2 |
93 | 104 |
#coords<- ghcn[,c('x_OR83M','y_OR83M')] |
... | ... | |
165 | 176 |
date2<-as.POSIXlt(as.Date(date1)) |
166 | 177 |
data3$date<-date2 |
167 | 178 |
d<-subset(data3,year>=2000 & mflag=="0" ) #Selecting dataset 2000-2010 with good quality |
179 |
#May need some screeing??? i.e. range of temp and elevation... |
|
168 | 180 |
d1<-aggregate(value~station+month, data=d, mean) #Calculate monthly mean for every station in OR |
169 | 181 |
id<-as.data.frame(unique(d1$station)) #Unique station in OR for year 2000-2010 |
170 | 182 |
|
171 |
dst<-merge(d1, stat_loc, by.x="station", by.y="STAT_ID") |
|
183 |
dst<-merge(d1, stat_loc, by.x="station", by.y="STAT_ID") #Inner join all columns are retained |
|
184 |
|
|
172 | 185 |
#This allows to change only one name of the data.frame |
173 |
names(dst)[3]<-c("TMax") |
|
186 |
pos<-match("value",names(dst)) #Find column with name "value" |
|
187 |
names(dst)[pos]<-c("TMax") |
|
174 | 188 |
dst$TMax<-dst$TMax/10 #TMax is the average max temp for months |
175 | 189 |
#dstjan=dst[dst$month==9,] #dst contains the monthly averages for tmax for every station over 2000-2010 |
176 | 190 |
############## |
... | ... | |
196 | 210 |
######### |
197 | 211 |
|
198 | 212 |
sta_bias=sta_tmax_from_lst-modst$TMax; #That is the difference between the monthly LST mean and monthly station mean |
213 |
#Added by Benoit |
|
214 |
modst$LSTD_bias<-sta_bias #Adding bias to data frame modst containning the monthly average for 10 years |
|
199 | 215 |
|
200 | 216 |
bias_xy=project(as.matrix(sta_lola),proj_str) |
201 | 217 |
# windows() |
... | ... | |
247 | 263 |
##########################Commented out by Benoit |
248 | 264 |
|
249 | 265 |
#added by Benoit |
250 |
#d<-ghcn.subsets[[i]] |
|
266 |
#x<-ghcn.subsets[[i]] #Holds both training and testing for instance 161 rows for Jan 1 |
|
267 |
x<-data_v |
|
251 | 268 |
d<-data_s |
252 |
names(d)[5]<-c("dailyTmax") |
|
269 |
|
|
270 |
pos<-match("value",names(d)) #Find column with name "value" |
|
271 |
names(d)[pos]<-c("dailyTmax") |
|
272 |
names(x)[pos]<-c("dailyTmax") |
|
253 | 273 |
d$dailyTmax=(as.numeric(d$dailyTmax))/10 #stored as 1/10 degree C to allow integer storage |
254 |
names(d)[1]<-c("id") |
|
274 |
x$dailyTmax=(as.numeric(x$dailyTmax))/10 #stored as 1/10 degree C to allow integer storage |
|
275 |
pos<-match("station",names(d)) #Find column with name "value" |
|
276 |
names(d)[pos]<-c("id") |
|
277 |
names(x)[pos]<-c("id") |
|
255 | 278 |
names(modst)[1]<-c("id") #modst contains the average tmax per month for every stations... |
256 |
dmoday=merge(modst,d,by="id") #LOOSING DATA HERE!!! from 162 t0 146 |
|
279 |
dmoday=merge(modst,d,by="id") #LOOSING DATA HERE!!! from 113 t0 103 |
|
280 |
xmoday=merge(modst,x,by="id") #LOOSING DATA HERE!!! from 48 t0 43 |
|
257 | 281 |
names(dmoday)[4]<-c("lat") |
258 |
names(dmoday)[5]<-c("lon") |
|
282 |
names(dmoday)[5]<-c("lon") #dmoday contains all the the information: BIAS, monn |
|
283 |
names(xmoday)[4]<-c("lat") |
|
284 |
names(xmoday)[5]<-c("lon") #dmoday contains all the the information: BIAS, monn |
|
285 |
|
|
286 |
data_v<-xmoday |
|
259 | 287 |
### |
260 | 288 |
|
289 |
#dmoday contains the daily tmax values with TMax being the monthly station tmax mean |
|
261 | 290 |
#dmoday contains the daily tmax values with TMax being the monthly station tmax mean |
262 | 291 |
|
263 | 292 |
# windows() |
... | ... | |
290 | 319 |
#### Added by Benoit on 06/19 |
291 | 320 |
data_s<-dmoday #put the |
292 | 321 |
data_s$daily_delta<-daily_delta |
322 |
|
|
323 |
|
|
293 | 324 |
#data_s$y_var<-daily_delta #y_var is the variable currently being modeled, may be better with BIAS!! |
294 |
data_s$y_var<-data_s$dailyTmax |
|
325 |
data_s$y_var<-data_s$LSTD_bias |
|
326 |
#data_s$y_var<-(data_s$dailyTmax)*10 |
|
295 | 327 |
#Model and response variable can be changed without affecting the script |
296 | 328 |
|
297 | 329 |
mod1<- gam(y_var~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s) |
... | ... | |
302 | 334 |
mod6<- gam(y_var~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST)+s(LC1), data=data_s) |
303 | 335 |
mod7<- gam(y_var~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST)+s(LC3), data=data_s) |
304 | 336 |
mod8<- gam(y_var~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST) + s(LC1), data=data_s) |
337 |
|
|
338 |
#Added |
|
339 |
#tmax_predicted=themolst+daily_delta_rast-bias_rast #Final surface?? but daily_rst |
|
340 |
|
|
305 | 341 |
#### Added by Benoit ends |
306 | 342 |
|
307 | 343 |
######### |
... | ... | |
316 | 352 |
|
317 | 353 |
plot(daily_delta_rast,main="Raster Daily Delta") |
318 | 354 |
|
319 |
tmax_predicted=themolst+daily_delta_rast-bias_rast #Final surface?? but daily_rst
|
|
355 |
tmax_predicted=themolst+daily_delta_rast-bias_rast #Final surface as a raster layer...
|
|
320 | 356 |
#tmax_predicted=themolst+daily_delta_rast+bias_rast #Added by Benoit, why is it -bias_rast |
321 | 357 |
plot(tmax_predicted,main="Predicted daily") |
322 | 358 |
|
... | ... | |
332 | 368 |
sta_pred=lookup(tmax_predicted,data_v$lat,data_v$lon) |
333 | 369 |
#sta_pred=lookup(tmax_predicted,daily_sta_lola$lat,daily_sta_lola$lon) |
334 | 370 |
#rmse=RMSE(sta_pred,dmoday$dailyTmax) |
335 |
tmax<-data_v$tmax/10 |
|
371 |
#pos<-match("value",names(data_v)) #Find column with name "value" |
|
372 |
#names(data_v)[pos]<-c("dailyTmax") |
|
373 |
tmax<-data_v$dailyTmax |
|
374 |
#data_v$dailyTmax<-tmax |
|
336 | 375 |
rmse=RMSE(sta_pred,tmax) |
337 | 376 |
#plot(sta_pred~dmoday$dailyTmax,xlab=paste("Actual daily for",datelabel),ylab="Pred daily",main=paste("RMSE=",rmse)) |
338 | 377 |
X11() |
... | ... | |
359 | 398 |
results_RMSE_f[i,3]<- "RMSE" |
360 | 399 |
results_RMSE_f[i,j+3]<- rmse_fit #Storing RMSE for the model j |
361 | 400 |
|
362 |
ns<-nrow(data_s) |
|
363 |
|
|
401 |
ns<-nrow(data_s) #This is added to because some loss of data might have happened because of the averaging...
|
|
402 |
nv<-nrow(data_v) |
|
364 | 403 |
|
365 | 404 |
for (j in 1:length(models)){ |
366 | 405 |
|
... | ... | |
383 | 422 |
results_DEV[i,3]<- "DEV" |
384 | 423 |
results_DEV[i,j+3]<- mod$deviance |
385 | 424 |
|
425 |
sta_LST_s=lookup(themolst,data_s$lat,data_s$lon) |
|
426 |
sta_delta_s=lookup(daily_delta_rast,data_s$lat,data_s$lon) #delta surface has been calculated before!! |
|
427 |
sta_bias_s= mod$fit |
|
428 |
#Need to extract values from the kriged delta surface... |
|
429 |
#sta_delta= lookup(delta_surface,data_v$lat,data_v$lon) |
|
430 |
#tmax_predicted=sta_LST+sta_bias-y_mod$fit |
|
431 |
tmax_predicted_s= sta_LST_s-sta_bias_s+sta_delta_s |
|
432 |
|
|
386 | 433 |
results_RMSE_f[i,1]<- dates[i] #storing the interpolation dates in the first column |
387 | 434 |
results_RMSE_f[i,2]<- ns #number of stations used in the training stage |
388 | 435 |
results_RMSE_f[i,3]<- "RSME" |
389 |
results_RMSE_f[i,j+3]<- sqrt(sum((mod$residuals)^2)/nv)
|
|
436 |
results_RMSE_f[i,j+3]<- sqrt(sum((tmax_predicted_s-data_s$dailyTmax)^2)/ns)
|
|
390 | 437 |
|
391 | 438 |
##Model assessment: general diagnostic/metrics |
392 | 439 |
##validation: using the testing data |
393 | 440 |
|
394 | 441 |
#This was modified on 06192012 |
395 | 442 |
|
396 |
#data_v$y_var<-data_v$tmax/10
|
|
397 |
data_v$y_var<-tmax |
|
443 |
#data_v$y_var<-data_v$LSTD_bias
|
|
444 |
#data_v$y_var<-tmax
|
|
398 | 445 |
y_mod<- predict(mod, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values. |
399 | 446 |
|
400 |
#sta_LST=lookup(themolst,data_v$lat,data_v$lon) |
|
401 |
#sta_bias=lookup(bias_rast,data_v$lat,data_v$lon) |
|
447 |
####ADDED ON JULY 5th |
|
448 |
sta_LST_v=lookup(themolst,data_v$lat,data_v$lon) |
|
449 |
sta_delta_v=lookup(daily_delta_rast,data_v$lat,data_v$lon) #delta surface has been calculated before!! |
|
450 |
sta_bias_v= y_mod$fit |
|
451 |
#Need to extract values from the kriged delta surface... |
|
452 |
#sta_delta= lookup(delta_surface,data_v$lat,data_v$lon) |
|
402 | 453 |
#tmax_predicted=sta_LST+sta_bias-y_mod$fit |
454 |
tmax_predicted_v= sta_LST_v-sta_bias_v+sta_delta_v |
|
403 | 455 |
|
404 | 456 |
#data_v$tmax<-(data_v$tmax)/10 |
405 |
#res_mod<- data_v$tmax - tmax_predicted #Residuals for the model for fusion
|
|
406 |
res_mod<- data_v$y_var - y_mod$fit #Residuals for the model |
|
457 |
res_mod<- data_v$dailyTmax - tmax_predicted_v #Residuals for the model for fusion
|
|
458 |
#res_mod<- data_v$y_var - y_mod$fit #Residuals for the model
|
|
407 | 459 |
|
408 | 460 |
RMSE_mod <- sqrt(sum(res_mod^2)/nv) #RMSE FOR REGRESSION STEP 1: GAM |
409 | 461 |
MAE_mod<- sum(abs(res_mod))/nv #MAE, Mean abs. Error FOR REGRESSION STEP 1: GAM |
410 | 462 |
ME_mod<- sum(res_mod)/nv #ME, Mean Error or bias FOR REGRESSION STEP 1: GAM |
411 |
R2_mod<- cor(data_v$tmax,y_mod$fit)^2 #R2, coef. of var FOR REGRESSION STEP 1: GAM
|
|
463 |
R2_mod<- cor(data_v$dailyTmax,tmax_predicted_v)^2 #R2, coef. of var FOR REGRESSION STEP 1: GAM
|
|
412 | 464 |
|
413 | 465 |
results_RMSE[i,1]<- dates[i] #storing the interpolation dates in the first column |
414 | 466 |
results_RMSE[i,2]<- ns #number of stations used in the training stage |
... | ... | |
429 | 481 |
|
430 | 482 |
#Saving residuals and prediction in the dataframes: tmax predicted from GAM |
431 | 483 |
pred<-paste("pred_mod",j,sep="") |
432 |
data_v[[pred]]<-as.numeric(y_mod$fit) |
|
433 |
data_s[[pred]]<-as.numeric(mod$fit) #Storing model fit values (predicted on training sample) |
|
484 |
#data_v[[pred]]<-as.numeric(y_mod$fit) |
|
485 |
data_v[[pred]]<-as.numeric(tmax_predicted_v) |
|
486 |
data_s[[pred]]<-as.numeric(tmax_predicted_s) #Storing model fit values (predicted on training sample) |
|
487 |
#data_s[[pred]]<-as.numeric(mod$fit) #Storing model fit values (predicted on training sample) |
|
434 | 488 |
|
435 | 489 |
name2<-paste("res_mod",j,sep="") |
436 | 490 |
data_v[[name2]]<-as.numeric(res_mod) |
437 |
data_s[[name2]]<-as.numeric(mod$residuals) |
|
491 |
temp<-tmax_predicted_s-data_s$dailyTmax |
|
492 |
data_s[[name2]]<-as.numeric(temp) |
|
438 | 493 |
#end of loop calculating RMSE |
439 | 494 |
|
440 | 495 |
} |
Also available in: Unified diff
FUSION, using GAM for bias surface, correction aspects