1
|
################## CLIMATE INTERPOLATION FUSION METHOD #######################################
|
2
|
############################ Merging LST and station data ##########################################
|
3
|
#This script interpolates tmax values using MODIS LST and GHCND station data #
|
4
|
#interpolation area. It requires the text file of stations and a shape file of the study area. #
|
5
|
#Note that the projection for both GHCND and study area is lonlat WGS84. #
|
6
|
#AUTHOR: Brian McGill #
|
7
|
#DATE: 06/09/2012                                                                                 #
|
8
|
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363-- #
|
9
|
###################################################################################################
|
10
|
|
11
|
### Loading R libraries and packages
library(gtools)  # Loading some useful tools
library(mgcv)    # GAM package by Simon Wood
library(sp)      # Spatial package with class definitions by Bivand et al.
library(spdep)   # Spatial package with methods and spatial stats by Bivand et al.
library(rgdal)   # GDAL wrapper for R, spatial utilities
library(gstat)   # Kriging and co-kriging by Pebesma et al.
library(fields)  # Spatial interpolation methods such as kriging and splines
library(raster)  # Raster data handling

### Parameters and arguments

infile1 <- "ghcn_or_tmax_b_04142012_OR83M.shp"  # GHCN shapefile containing variables for modeling 2010
infile2 <- "list_10_dates_04212012.txt"         # List of 10 dates for the regression
#infile2 <- "list_365_dates_04212012.txt"
infile3 <- "LST_dates_var_names.txt"            # LST dates names
infile4 <- "models_interpolation_05142012.txt"  # Interpolation model names
infile5 <- "mean_day244_rescaled.rst"           # Raster/grid giving the locations of predictions

#path <- "/home/parmentier/Data/IPLANT_project/data_Oregon_stations"
path <- "M:/Data/IPLANT_project/data_Oregon_stations"  # Location on Atlas

# Station locations of the study area
stat_loc <- read.table(file.path(path, "location_study_area_OR_0602012.txt"),
                       sep = ",", header = TRUE)
# GHCN database for 1980-2010 for the study area (OR)
data3 <- read.table(file.path(path, "ghcn_data_TMAXy1980_2010_OR_0602012.txt"),
                    sep = ",", header = TRUE)

prop <- 0.3                            # Proportion of data withheld for testing/validation
seed_number <- 100                     # Seed for random sampling, so runs are reproducible/comparable
out_prefix <- "_06142012_10d_fusion2"  # User-defined output prefix

# NOTE(review): setwd() in scripts is usually discouraged, but readOGR(".", ...)
# below depends on the working directory, so it is kept.
setwd(path)
|
41
|
############ START OF THE SCRIPT ##################

#
### Step 0/Step 6 in Brian's code... preparing year 2010 data for modeling
#

### Reading the station data and setting up for the models' comparison
filename <- sub(".shp", "", infile1)  # Remove the extension from the file name
ghcn <- readOGR(".", filename)        # Read the GHCN station shapefile

CRS <- proj4string(ghcn)  # Store projection information (ellipsoid, datum, etc.)

mean_LST <- readGDAL(infile5)  # Read the whole raster in memory; provides a grid for kriging
proj4string(mean_LST) <- CRS   # Assign coordinate information to the prediction grid

# Derive terrain covariates from aspect and slope
ghcn <- transform(ghcn, Northness = cos(ASPECT))                 # North-south exposure
ghcn <- transform(ghcn, Eastness = sin(ASPECT))                  # East-west exposure
ghcn <- transform(ghcn, Northness_w = sin(slope) * cos(ASPECT))  # Slope-weighted northness
ghcn <- transform(ghcn, Eastness_w = sin(slope) * sin(ASPECT))   # Slope-weighted eastness

set.seed(seed_number)  # Fixed seed so results based on random sampling can be compared

dates <- readLines(file.path(path, infile2))      # Interpolation dates (YYYYMMDD strings)
LST_dates <- readLines(file.path(path, infile3))  # LST date/variable names
models <- readLines(file.path(path, infile4))     # Interpolation model names

# Model assessment: specific diagnostics/metrics for GAM (currently disabled)
# results_AIC <- matrix(1, length(dates), length(models) + 3)
# results_GCV <- matrix(1, length(dates), length(models) + 3)
# results_DEV <- matrix(1, length(dates), length(models) + 3)
# results_RMSE_f <- matrix(1, length(dates), length(models) + 3)

# Model assessment: general diagnostics/metrics.
# Columns: date, ns, metric label, then one column per model.
results_RMSE <- matrix(1, length(dates), length(models) + 3)
results_MAE <- matrix(1, length(dates), length(models) + 3)
results_ME <- matrix(1, length(dates), length(models) + 3)
results_R2 <- matrix(1, length(dates), length(models) + 3)      # Coef. of determination for the validation dataset
results_RMSE_f <- matrix(1, length(dates), length(models) + 3)  # RMSE fit, RMSE for the training dataset
results_RMSE_f_kr <- matrix(1, length(dates), length(models) + 3)

# # Tracking relationship between LST and LC (currently disabled)
# cor_LST_LC1 <- matrix(1, 10, 1)  # correlation LST-LC1
# cor_LST_LC3 <- matrix(1, 10, 1)  # correlation LST-LC3
# cor_LST_tmax <- matrix(1, 10, 1) # correlation LST-tmax

# Screening for bad values: tmax is stored in 1/10 deg C, so keep (-15C, 40C);
# also drop stations with non-positive SRTM elevation.
ghcn_all <- ghcn
ghcn_test <- subset(ghcn, ghcn$tmax > -150 & ghcn$tmax < 400)
ghcn_test2 <- subset(ghcn_test, ghcn_test$ELEV_SRTM > 0)
ghcn <- ghcn_test2
#coords <- ghcn[, c('x_OR83M', 'y_OR83M')]

month_var <- c("mm_01", "mm_02", "mm_03", "mm_04", "mm_05", "mm_06",
               "mm_07", "mm_08", "mm_09", "mm_10", "mm_11", "mm_12")
# One subset per interpolation date (10 or 365 subsets depending on infile2)
ghcn.subsets <- lapply(dates, function(d) subset(ghcn, date == d))

## Looping through the dates... this is the main part of the code
#i=1 #for debugging
#j=1 #for debugging
|
103
|
for (i in 1:length(dates)) {  # Start of the loop over interpolation dates (loop #1)

date <- strptime(dates[i], "%Y%m%d")        # Interpolation date being processed
month <- strftime(date, "%m")               # Month of the date being processed
LST_month <- paste("mm_", month, sep = "")  # Name of the monthly LST column to match

### Regression part 1: create training and testing (validation) datasets

mod_LST <- ghcn.subsets[[i]][, match(LST_month, names(ghcn.subsets[[i]]))]  # Monthly LST average matching the date
ghcn.subsets[[i]] <- transform(ghcn.subsets[[i]], LST = mod_LST)  # Add the LST variable to the subset
n <- nrow(ghcn.subsets[[i]])
ns <- n - round(n * prop)  # Number of training rows ((1 - prop) of the data)
nv <- n - ns               # Number of validation rows (prop of the data)
ind.training <- sample(nrow(ghcn.subsets[[i]]), size = ns, replace = FALSE)  # Random row indices for training
ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
data_s <- ghcn.subsets[[i]][ind.training, ]  # Training dataset currently used in the modeling
data_v <- ghcn.subsets[[i]][ind.testing, ]   # Testing/validation dataset

# Month/day/year components of the date being processed
# (date_proc is an alias of the date parsed above; it was previously re-parsed)
date_proc <- date
mo <- as.integer(strftime(date_proc, "%m"))
day <- as.integer(strftime(date_proc, "%d"))
year <- as.integer(strftime(date_proc, "%Y"))

datelabel <- format(ISOdate(year, mo, day), "%b %d, %Y")  # Human-readable label for plots

#
# Step 1 - 10 year monthly averages
#

l <- list.files(pattern = "mean_month.*rescaled.rst")  # Monthly mean LST rasters
molst <- stack(l)
# Kelvin -> Celsius. BUGFIX: the offset is 273.15; the previous 273.16 is the
# triple point of water, not the K->C conversion constant.
molst <- molst - 273.15
idx <- seq(as.Date('2010-01-15'), as.Date('2010-12-15'), 'month')
molst <- setZ(molst, idx)
layerNames(molst) <- month.abb
themolst <- raster(molst, mo)  # Extract the layer for the current month
plot(themolst)

#
# Step 2 - Weather station means across same days
#
# ??? which years & what quality flags ???
# The station monthly means were originally produced by a SQL query against the
# GHCN SQLite db (avg(value/10) per station/month, state=='OR', year>=2000,
# element=='TMAX', value<>-9999); they are recomputed below from data3 instead.

## Added by Benoit ######
# NOTE(review): this monthly-mean computation does not depend on i and could be
# hoisted out of the loop for speed.
date1 <- ISOdate(data3$year, data3$month, data3$day)  # Date object from 3 separate columns
date2 <- as.POSIXlt(as.Date(date1))
data3$date <- date2
d <- subset(data3, year >= 2000 & mflag == "0")           # Recent years with clean measurement flag only
d1 <- aggregate(value ~ station + month, data = d, mean)  # Monthly mean for every station in OR
id <- as.data.frame(unique(d1$station))

dst <- merge(d1, stat_loc, by.x = "station", by.y = "STAT_ID")
names(dst)[3] <- c("TMax")  # Rename only the aggregated value column
dst$TMax <- dst$TMax / 10   # Stored in 1/10 deg C; convert to deg C
#dstjan=dst[dst$month==9,] #dst contains the monthly averages for tmax for every station over 2000-2010
##############

modst <- dst[dst$month == mo, ]  # Monthly station means for the current month

#
# Step 3 - get LST at stations
#
sta_lola <- modst[, c("lon", "lat")]
# Oregon Lambert conformal conic projection used by the rasters
proj_str <- "+proj=lcc +lat_1=43 +lat_2=45.5 +lat_0=41.75 +lon_0=-120.5 +x_0=400000 +y_0=0 +ellps=GRS80 +units=m +no_defs"
|
183
|
# Sample raster r at the given lat/lon points.
# The points are projected from lon/lat into the raster's coordinate system
# (proj_str, taken from the enclosing environment) before cell extraction.
lookup <- function(r, lat, lon) {
  pts_xy <- project(cbind(lon, lat), proj_str)  # lon/lat -> projected coordinates
  cell_idx <- cellFromXY(r, pts_xy)             # raster cell index of each point
  r[cell_idx]                                   # raster values at those cells
}
|
188
|
sta_tmax_from_lst <- lookup(themolst, sta_lola$lat, sta_lola$lon)  # Monthly LST at station locations

#
# Step 4 - bias at stations: LST monthly mean minus station monthly mean
#
sta_bias <- sta_tmax_from_lst - modst$TMax
bias_xy <- project(as.matrix(sta_lola), proj_str)
# windows()
plot(modst$TMax, sta_tmax_from_lst, xlab = "Station mo Tmax", ylab = "LST mo Tmax")
abline(0, 1)

#
# Step 5 - interpolate bias
#
# ?? include covariates like elev, distance to coast, cloud frequency, tree height
#windows()
quilt.plot(sta_lola, sta_bias, main = "Bias at stations", asp = 1)
US(add = T, col = "magenta", lwd = 2)
#fitbias<-Tps(bias_xy,sta_bias) #use TPS or krige
fitbias <- Krig(bias_xy, sta_bias, theta = 1e5)  # use TPS or krige
# windows()
surface(fitbias, col = rev(terrain.colors(100)), asp = 1, main = "Interpolated bias")
#US(add=T,col="magenta",lwd=2)

#
# Step 6 - return to daily station data & calculate delta = daily T - monthly T from stations
#
# (The RSQLite query against c:/data/ghcn_tmintmax.db that originally fetched the
# daily TMAX values was commented out by Benoit 06/14; the daily values now come
# from the training subset data_s built above.)

# added by Benoit
d <- data_s
names(d)[8] <- c("dailyTmax")      # Column 8 of the shapefile attributes is the daily tmax
d$dailyTmax <- d$dailyTmax / 10    # Stored as 1/10 deg C to allow integer storage
names(d)[1] <- c("id")
names(modst)[1] <- c("id")
dmoday <- merge(modst, d, by = "id")  # NOTE: LOSING DATA HERE!!! from 162 to 146 (stations lacking monthly means drop out)
names(dmoday)[4] <- c("lat")
names(dmoday)[5] <- c("lon")
###

# windows()
plot(dailyTmax ~ TMax, data = dmoday, xlab = "Mo Tmax", ylab = paste("Daily for", datelabel), main = "across stations in OR")

#
# Step 7 - interpolate delta across space
#
daily_sta_lola <- dmoday[, c("lon", "lat")]  # Could be same as before, but don't assume merge preserved order
daily_sta_xy <- project(as.matrix(daily_sta_lola), proj_str)
daily_delta <- dmoday$dailyTmax - dmoday$TMax
#windows()
# BUGFIX: the title was hard-coded to "Jan 15" although the loop runs over many dates
quilt.plot(daily_sta_lola, daily_delta, asp = 1, main = paste("Station delta for", datelabel))
US(add = T, col = "magenta", lwd = 2)
#fitdelta<-Tps(daily_sta_xy,daily_delta) #use TPS or krige
fitdelta <- Krig(daily_sta_xy, daily_delta, theta = 1e5)  # use TPS or krige
# windows()
surface(fitdelta, col = rev(terrain.colors(100)), asp = 1, main = "Interpolated delta")
#US(add=T,col="magenta",lwd=2)

#
# Step 8 - assemble final answer: T = LST + delta(interpolated) - bias(interpolated)
#
bias_rast <- interpolate(themolst, fitbias)
plot(bias_rast, main = "Raster bias")
daily_delta_rast <- interpolate(themolst, fitdelta)
plot(daily_delta_rast, main = "Raster Daily Delta")
tmax_predicted <- themolst + daily_delta_rast - bias_rast  # Final predicted surface
plot(tmax_predicted, main = "Predicted daily")

#
# check: predicted values at the validation (held-out) stations
#
sta_pred <- lookup(tmax_predicted, data_v$lat, data_v$lon)
#sta_pred=lookup(tmax_predicted,daily_sta_lola$lat,daily_sta_lola$lon)
|
270
|
# Root mean squared error between two equal-length numeric vectors.
# na.rm (default FALSE, matching the original behavior) optionally drops
# NA pairs before averaging — useful because station data may contain NAs.
RMSE <- function(x, y, na.rm = FALSE) {
  sqrt(mean((x - y)^2, na.rm = na.rm))
}
|
271
|
# Validation: compare predictions at the held-out stations against the observed
# daily tmax of those SAME stations.
tmax <- data_v$tmax / 10  # Observed daily tmax at validation stations (1/10 deg C -> deg C)
# BUGFIX: rmse was computed against dmoday$dailyTmax (the TRAINING stations),
# a vector of different length than sta_pred, which is evaluated at the
# validation stations (data_v). The plot below already used data_v; the metric
# now does too.
rmse <- RMSE(sta_pred, tmax)
plot(sta_pred ~ tmax, xlab = paste("Actual daily for", datelabel), ylab = "Pred daily", main = paste("RMSE=", rmse))
abline(0, 1)
# BUGFIX: residuals likewise computed and mapped at the validation stations
# (previously subtracted training values and plotted at training coordinates).
resid <- sta_pred - tmax
quilt.plot(cbind(data_v$lon, data_v$lat), resid)

### END OF BRIAN's code

### Added by Benoit
j <- 1  # Model index (only one fusion model stored here)
results_RMSE[i, 1] <- dates[i]  # Interpolation date (coerces the numeric matrix to character)
results_RMSE[i, 2] <- ns        # Number of stations used in the training stage
results_RMSE[i, 3] <- "RMSE"    # Metric label
results_RMSE[i, j + 3] <- rmse  # RMSE for model j

}  # End of the for loop #1 over dates
|
291
|
|
292
|
|
293
|
## Plotting and saving diagnostic measures

# Specific diagnostic measures related to the testing datasets
results_table_RMSE <- as.data.frame(results_RMSE)
results_table_MAE <- as.data.frame(results_MAE)
results_table_ME <- as.data.frame(results_ME)
results_table_R2 <- as.data.frame(results_R2)

# Column names: date, training size, metric label, then one column per model.
# NOTE(review): assumes length(models) == 8 so that ncol matches — confirm
# against the models file.
cname <- c("dates", "ns", "metric", "mod1", "mod2", "mod3", "mod4", "mod5", "mod6", "mod7", "mod8")
colnames(results_table_RMSE) <- cname

#tb_diagnostic1 <- rbind(results_table_RMSE, results_table_MAE, results_table_ME, results_table_R2)
tb_diagnostic1 <- results_table_RMSE  # Only RMSE is filled in by the loop above

write.table(tb_diagnostic1,
            file = file.path(path, paste0("results_fusion_Assessment_measure1", out_prefix, ".txt")),
            sep = ",")

#### END OF SCRIPT
|