1
|
################## Data preparation for interpolation #######################################
|
2
|
############################ Extraction of station data ##########################################
|
3
|
|
4
|
|
5
|
database_covariates_preparation<-function(list_param_prep){
|
6
|
#This function performs queries on the Postgres ghcnd database for stations matching the
|
7
|
#interpolation area. It requires 11 inputs:
|
8
|
# 1) db.name : Postgres database name containing the meteorological stations
|
9
|
# 2) var: the variable of interest - "TMAX","TMIN" or "PRCP"
|
10
|
# 3) range_years: range of records used in the daily interpolation, note that upper bound year is not included
|
11
|
# 4) range_years_clim: range of records used in the monthly climatology interpolation, note that upper bound is not included
|
12
|
# 5) infile_reg_outline: region outline as a shape file - used in the interpolation stage too
|
13
|
# 6) infile_ghncd_data: ghcnd stations locations as a textfile name with lat-long fields
|
14
|
# 7) infile_covarariates: tif file of raser covariates for the interpolation area: it should have a local projection
|
15
|
# 8) CRS_locs_WGS84: longlat EPSG 4326 used as coordinates reference system (proj4)for stations locations
|
16
|
# 9) in_path: input path for covariates data and other files, this is also the output?
|
17
|
# 10) out_path: output path created in master script
|
18
|
# 11) covar_names: names of covariates used for the interpolation --may be removed later? (should be stored in the brick)
|
19
|
# 12) qc_flags_stations: flag used to screen out at this stage only two values...
|
20
|
# 13) out_prefix: output suffix added to output names--it is the same in the interpolation script
|
21
|
#
|
22
|
#The output is a list of four shapefile names produced by the function:
|
23
|
#1) loc_stations: locations of stations as shapefile in EPSG 4326
|
24
|
#2) loc_stations_ghcn: ghcn daily data for the year range of interpolation (locally projected)
|
25
|
#3) daily_query_ghcn_data: ghcn daily data from daily query before application of quality flag
|
26
|
#4) daily_covar_ghcn_data: ghcn daily data with covariates for the year range of interpolation (locally projected)
|
27
|
#5) monthly_query_ghcn_data: ghcn daily data from monthly query before application of quality flag
|
28
|
#6) monthly_covar_ghcn_data: ghcn monthly averaged data with covariates for the year range of interpolation (locally projected)
|
29
|
|
30
|
#AUTHOR: Benoit Parmentier
|
31
|
#DATE: 05/21/2013
|
32
|
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363--
|
33
|
#Comments and TODO
|
34
|
#-Add buffer option...
|
35
|
#-Add screening for value predicted: var
|
36
|
##################################################################################################
|
37
|
|
38
|
###Loading R library and packages: should it be read in before???
|
39
|
|
40
|
library(RPostgreSQL)
|
41
|
library(sp) # Spatial pacakge with class definition by Bivand et al.
|
42
|
library(spdep) # Spatial pacakge with methods and spatial stat. by Bivand et al.
|
43
|
library(rgdal) # GDAL wrapper for R, spatial utilities
|
44
|
library(rgeos)
|
45
|
library(rgdal)
|
46
|
library(raster)
|
47
|
library(rasterVis)
|
48
|
library(maps)
|
49
|
library(maptools)
|
50
|
|
51
|
### Functions used in the script
|
52
|
|
53
|
format_s <-function(s_ID){
|
54
|
#Format station ID in a vector format/tuple that is used in a psql query.
|
55
|
# Argument 1: vector of station ID
|
56
|
# Return: character of station ID
|
57
|
tx2<-s_ID
|
58
|
tx2<-as.character(tx2)
|
59
|
stat_list<-tx2
|
60
|
temp<-shQuote(stat_list)
|
61
|
t<-paste(temp, collapse= " ")
|
62
|
t1<-gsub(" ", ",",t)
|
63
|
sf_ID<-paste("(",t1,")",sep="") #vector containing the station ID to query
|
64
|
return(sf_ID)
|
65
|
}
|
66
|
|
67
|
#parsing input arguments
|
68
|
|
69
|
db.name <- list_param_prep$db.name #name of the Postgres database
|
70
|
var <- list_param_prep$var #name of the variables to keep: TMIN, TMAX or PRCP
|
71
|
year_start <-list_param_prep$range_years[1] #"2010" #starting year for the query (included)
|
72
|
year_end <-list_param_prep$range_years[2] #"2011" #end year for the query (excluded)
|
73
|
year_start_clim <-list_param_prep$range_years_clim[1] #right bound not included in the range!! starting year for monthly query to calculate clime
|
74
|
year_end_clim <-list_param_prep$range_years_clim[2] #right bound not included in the range!! starting year for monthly query to calculate clime
|
75
|
infile_reg_outline<- list_param_prep$infile_reg_outline #This is the shape file of outline of the study area #It is an input/output of the covariate script
|
76
|
infile_ghncd_data<-list_param_prep$infile_ghncd_data #"/home/layers/data/climate/ghcn/v2.92-upd-2012052822/ghcnd-stations.txt" #This is the textfile of station locations from GHCND
|
77
|
infile_covariates<-list_param_prep$infile_covariates #"covariates__venezuela_region__VE_01292013.tif" #this is an output from covariate script
|
78
|
CRS_locs_WGS84<-list_param_prep$CRS_locs_WGS84 #Station coords WGS84: same as earlier
|
79
|
in_path <- list_param_prep$in_path #CRS_locs_WGS84"/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data/"
|
80
|
out_path <- list_param_prep$out_path #CRS_locs_WGS84"/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data/"
|
81
|
covar_names<-list_param_prep$covar_names # names should be written in the tif file!!!
|
82
|
qc_flags_stations <- list_param_prep$qc_flags_stations #flags allowed for the query from the GHCND??
|
83
|
out_prefix<-list_param_prep$out_prefix #"_365d_GAM_fus5_all_lstd_03012013" #User defined output prefix
|
84
|
|
85
|
## working directory is the same for input and output for this function
|
86
|
#setwd(in_path)
|
87
|
setwd(out_path)
|
88
|
##### STEP 1: Select station in the study area
|
89
|
|
90
|
filename<-sub(".shp","",infile_reg_outline) #Removing the extension from file.
|
91
|
interp_area <- readOGR(dsn=dirname(filename),basename(filename))
|
92
|
CRS_interp<-proj4string(interp_area) #Storing the coordinate information: geographic coordinates longlat WGS84
|
93
|
|
94
|
#Read in GHCND database station locations
|
95
|
dat_stat <- read.fwf(infile_ghncd_data,
|
96
|
widths = c(11,9,10,7,3,31,4,4,6),fill=TRUE)
|
97
|
colnames(dat_stat)<-c("STAT_ID","latitude","longitude","elev","state","name","GSNF","HCNF","WMOID")
|
98
|
coords<- dat_stat[,c('longitude','latitude')]
|
99
|
coordinates(dat_stat)<-coords
|
100
|
proj4string(dat_stat)<-CRS_locs_WGS84 #this is the WGS84 projection
|
101
|
#proj4string(dat_stat)<-CRS_interp
|
102
|
interp_area_WGS84 <-spTransform(interp_area,CRS_locs_WGS84) # Project from WGS84 to new coord. system
|
103
|
|
104
|
# Spatial query to find relevant stations
|
105
|
|
106
|
inside <- !is.na(over(dat_stat, as(interp_area_WGS84, "SpatialPolygons"))) #Finding stations contained in the current interpolation area
|
107
|
stat_reg<-dat_stat[inside,] #Selecting stations contained in the current interpolation area
|
108
|
|
109
|
####
|
110
|
##TODO: Add buffer option?
|
111
|
####
|
112
|
|
113
|
#### STEP 2: Connecting to the database and query for relevant data
|
114
|
|
115
|
drv <- dbDriver("PostgreSQL")
|
116
|
db <- dbConnect(drv, dbname=db.name)
|
117
|
|
118
|
time1<-proc.time() #Start stop watch
|
119
|
list_s<-format_s(stat_reg$STAT_ID)
|
120
|
data2<-dbGetQuery(db, paste("SELECT *
|
121
|
FROM ghcn
|
122
|
WHERE element=",shQuote(var),
|
123
|
"AND year>=",year_start,
|
124
|
"AND year<",year_end,
|
125
|
"AND station IN ",list_s,";",sep="")) #Selecting station using a SQL query
|
126
|
time_duration<-proc.time()-time1 #Time for the query may be long given the size of the database
|
127
|
time_minutes<-time_duration[3]/60
|
128
|
dbDisconnect(db)
|
129
|
|
130
|
data_table<-merge(data2,as.data.frame(stat_reg), by.x = "station", by.y = "STAT_ID")
|
131
|
|
132
|
#Transform the subset data frame in a spatial data frame and reproject
|
133
|
data_reg<-data_table #Make a copy of the data frame
|
134
|
coords<- data_reg[c('longitude','latitude')] #Define coordinates in a data frame: clean up here!!
|
135
|
coordinates(data_reg)<-coords #Assign coordinates to the data frame
|
136
|
proj4string(data_reg)<-CRS_locs_WGS84 #Assign coordinates reference system in PROJ4 format
|
137
|
data_reg<-spTransform(data_reg,CRS(CRS_interp)) #Project from WGS84 to new coord. system
|
138
|
|
139
|
data_d <-data_reg #data_d: daily data containing the query without screening
|
140
|
#data_reg <-subset(data_d,mflag=="0" | mflag=="S") #should be input arguments!!
|
141
|
#Transform the query to be depending on the number of flags
|
142
|
|
143
|
data_reg <-subset(data_d, mflag %in% qc_flags_stations) #screening using flags
|
144
|
#data_reg2 <-subset(data_d,mflag==qc_flags_stations[1] | mflag==qc_flags_stations[2]) #screening using flags
|
145
|
|
146
|
##################################################################
|
147
|
### STEP 3: Save results and outuput in textfile and a shape file
|
148
|
#browser()
|
149
|
#Save shape files of the locations of meteorological stations in the study area
|
150
|
#outfile1<-file.path(in_path,paste("stations","_",out_prefix,".shp",sep=""))
|
151
|
outfile1<-file.path(out_path,paste("stations","_",out_prefix,".shp",sep=""))
|
152
|
writeOGR(stat_reg,dsn= dirname(outfile1),layer= sub(".shp","",basename(outfile1)), driver="ESRI Shapefile",overwrite_layer=TRUE)
|
153
|
#writeOGR(dst,dsn= ".",layer= sub(".shp","",outfile4), driver="ESRI Shapefile",overwrite_layer=TRUE)
|
154
|
|
155
|
outfile2<-file.path(out_path,paste("ghcn_data_query_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep="")) #Name of the file
|
156
|
#writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
|
157
|
writeOGR(data_d,dsn= dirname(outfile2),layer= sub(".shp","",basename(outfile2)), driver="ESRI Shapefile",overwrite_layer=TRUE)
|
158
|
|
159
|
outfile3<-file.path(out_path,paste("ghcn_data_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep="")) #Name of the file
|
160
|
#writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
|
161
|
writeOGR(data_reg,dsn= dirname(outfile3),layer= sub(".shp","",basename(outfile3)), driver="ESRI Shapefile",overwrite_layer=TRUE)
|
162
|
|
163
|
###################################################################
|
164
|
### STEP 4: Extract values at stations from covariates stack of raster images
|
165
|
#Eventually this step may be skipped if the covariates information is stored in the database...
|
166
|
|
167
|
#s_raster<-stack(file.path(in_path,infile_covariates)) #read in the data stack
|
168
|
s_raster<-brick(infile_covariates) #read in the data stack
|
169
|
names(s_raster)<-covar_names #Assigning names to the raster layers: making sure it is included in the extraction
|
170
|
stat_val<- extract(s_raster, data_reg) #Extracting values from the raster stack for every point location in coords data frame.
|
171
|
#stat_val_test<- extract(s_raster, data_reg,def=TRUE)
|
172
|
|
173
|
#create a shape file and data_frame with names
|
174
|
|
175
|
data_RST<-as.data.frame(stat_val) #This creates a data frame with the values extracted
|
176
|
data_RST_SDF<-cbind(data_reg,data_RST)
|
177
|
coordinates(data_RST_SDF)<-coordinates(data_reg) #Transforming data_RST_SDF into a spatial point dataframe
|
178
|
CRS_reg<-proj4string(data_reg)
|
179
|
proj4string(data_RST_SDF)<-CRS_reg #Need to assign coordinates...
|
180
|
|
181
|
#Creating a date column
|
182
|
date1<-ISOdate(data_RST_SDF$year,data_RST_SDF$month,data_RST_SDF$day) #Creating a date object from 3 separate column
|
183
|
date2<-gsub("-","",as.character(as.Date(date1)))
|
184
|
data_RST_SDF$date<-date2 #Date format (year,month,day) is the following: "20100627"
|
185
|
|
186
|
#This allows to change only one name of the data.frame
|
187
|
pos<-match("value",names(data_RST_SDF)) #Find column with name "value"
|
188
|
if (var=="TMAX"){
|
189
|
#names(data_RST_SDF)[pos]<-c("TMax")
|
190
|
data_RST_SDF$value<-data_RST_SDF$value/10 #TMax is the average max temp for monthy data
|
191
|
}
|
192
|
|
193
|
if (var=="TMIN"){
|
194
|
#names(data_RST_SDF)[pos]<-c("TMin")
|
195
|
data_RST_SDF$value<-data_RST_SDF$value/10 #TMax is the average max temp for monthy data
|
196
|
}
|
197
|
|
198
|
#write out a new shapefile (including .prj component)
|
199
|
outfile4<-file.path(out_path,paste("daily_covariates_ghcn_data_",var,"_",range_years[1],"_",range_years[2],out_prefix,".shp",sep="")) #Name of the file
|
200
|
writeOGR(data_RST_SDF,dsn= dirname(outfile4),layer= sub(".shp","",basename(outfile4)), driver="ESRI Shapefile",overwrite_layer=TRUE)
|
201
|
|
202
|
###############################################################
|
203
|
######## STEP 5: Preparing monthly averages from the ProstGres database
|
204
|
|
205
|
drv <- dbDriver("PostgreSQL")
|
206
|
db <- dbConnect(drv, dbname=db.name)
|
207
|
|
208
|
#year_start_clim: set at the start of the script
|
209
|
time1<-proc.time() #Start stop watch
|
210
|
list_s<-format_s(stat_reg$STAT_ID)
|
211
|
data_m<-dbGetQuery(db, paste("SELECT *
|
212
|
FROM ghcn
|
213
|
WHERE element=",shQuote(var),
|
214
|
"AND year>=",year_start_clim,
|
215
|
"AND year<",year_end_clim,
|
216
|
"AND station IN ",list_s,";",sep="")) #Selecting station using a SQL query
|
217
|
time_duration<-proc.time()-time1 #Time for the query may be long given the size of the database
|
218
|
time_minutes<-time_duration[3]/60
|
219
|
dbDisconnect(db)
|
220
|
#Clean out this section!!
|
221
|
date1<-ISOdate(data_m$year,data_m$month,data_m$day) #Creating a date object from 3 separate column
|
222
|
date2<-as.POSIXlt(as.Date(date1))
|
223
|
data_m$date<-date2
|
224
|
#Save the query data here...
|
225
|
data_m<-merge(data_m, stat_reg, by.x="station", by.y="STAT_ID") #Inner join all columns are retained
|
226
|
#Extracting covariates from stack for the monthly dataset...
|
227
|
coords<- data_m[c('longitude','latitude')] #Define coordinates in a data frame
|
228
|
coordinates(data_m)<-coords #Assign coordinates to the data frame
|
229
|
proj4string(data_m)<-CRS_locs_WGS84 #Assign coordinates reference system in PROJ4 format
|
230
|
data_m<-spTransform(data_m,CRS(CRS_interp)) #Project from WGS84 to new coord. system
|
231
|
outfile5<-file.path(out_path,paste("monthly_query_ghcn_data_",var,"_",year_start_clim,"_",year_end_clim,out_prefix,".shp",sep="")) #Name of the file
|
232
|
writeOGR(data_m,dsn= dirname(outfile5),layer= sub(".shp","",basename(outfile5)), driver="ESRI Shapefile",overwrite_layer=TRUE)
|
233
|
|
234
|
#In Venezuela and other regions where there are not many stations...mflag==S should be added..see Durenne etal.2010.
|
235
|
|
236
|
#d<-subset(data_m,mflag==qc_flags_stations[1] | mflag==qc_flags_stations[2])
|
237
|
d<-subset(data_m,mflag %in% qc_flags_stations)
|
238
|
|
239
|
#Add screening here ...May need some screeing??? i.e. range of temp and elevation...
|
240
|
|
241
|
d1<-aggregate(value~station+month, data=d, mean) #Calculate monthly mean for every station in OR
|
242
|
#d2<-aggregate(value~station+month, data=d, length) #Calculate monthly mean for every station in OR
|
243
|
is_not_na_fun<-function(x) sum(!is.na(x)) #count the number of available observation
|
244
|
d2<-aggregate(value~station+month, data=d, is_not_na_fun) #Calculate monthly mean for every station in OR
|
245
|
#names(d2)[names(d2)=="value"] <-"nobs_station"
|
246
|
d1$nobs_station<-d2$value
|
247
|
dst<-merge(d1, stat_reg, by.x="station", by.y="STAT_ID") #Inner join all columns are retained
|
248
|
|
249
|
#This allows to change only one name of the data.frame
|
250
|
pos<-match("value",names(dst)) #Find column with name "value"
|
251
|
if (var=="TMAX"){
|
252
|
names(dst)[pos]<-c("TMax") #renaming the column "value" extracted from the Postgres database
|
253
|
dst$TMax<-dst$TMax/10 #TMax is the average max temp for monthy data
|
254
|
}
|
255
|
|
256
|
if (var=="TMIN"){
|
257
|
names(dst)[pos]<-c("TMin")
|
258
|
dst$TMin<-dst$TMin/10 #TMin is the average min temp for monthy data
|
259
|
}
|
260
|
|
261
|
#Extracting covariates from stack for the monthly dataset...
|
262
|
#names(dst)[5:6] <-c('latitude','longitude')
|
263
|
coords<- dst[c('longitude','latitude')] #Define coordinates in a data frame
|
264
|
|
265
|
coordinates(dst)<-coords #Assign coordinates to the data frame
|
266
|
proj4string(dst)<-CRS_locs_WGS84 #Assign coordinates reference system in PROJ4 format
|
267
|
dst_month<-spTransform(dst,CRS(CRS_interp)) #Project from WGS84 to new coord. system
|
268
|
|
269
|
stations_val<-extract(s_raster,dst_month,df=TRUE) #extraction of the information at station location in a data frame
|
270
|
#dst_extract<-spCbind(dst_month,stations_val) #this is in sinusoidal from the raster stack
|
271
|
dst_extract<-cbind(dst_month,stations_val) #this is in sinusoidal from the raster stack
|
272
|
dst<-dst_extract #problem!!! two column named elev!!! use elev_s??
|
273
|
|
274
|
#browser()
|
275
|
coords<- dst[c('x','y')] #Define coordinates in a data frame, this is the local x,y
|
276
|
index<-!is.na(coords$x) #remove if NA, may need to revisit at a later stage
|
277
|
dst<-dst[index,]
|
278
|
coords<- dst[c('x','y')] #Define coordinates in a data frame, this is the local x,y
|
279
|
coordinates(dst)<-coords #Assign coordinates to the data frame
|
280
|
proj4string(dst)<-CRS_interp #Assign coordinates reference system in PROJ4 format
|
281
|
|
282
|
### ADD SCREENING HERE BEFORE WRITING OUT DATA
|
283
|
#Covariates ok since screening done in covariate script
|
284
|
#screening on var i.e. value, TMIN, TMAX...
|
285
|
|
286
|
####
|
287
|
|
288
|
####
|
289
|
#write out a new shapefile (including .prj component)
|
290
|
dst$OID<-1:nrow(dst) #need a unique ID?
|
291
|
outfile6<-file.path(out_path,paste("monthly_covariates_ghcn_data_",var,"_",year_start_clim,"_",year_end_clim,out_prefix,".shp",sep="")) #Name of the file
|
292
|
writeOGR(dst,dsn= dirname(outfile6),layer= sub(".shp","",basename(outfile6)), driver="ESRI Shapefile",overwrite_layer=TRUE)
|
293
|
|
294
|
### list of outputs return
|
295
|
|
296
|
outfiles_obj<-list(outfile1,outfile2,outfile3,outfile4,outfile5,outfile6)
|
297
|
names(outfiles_obj)<- c("loc_stations","loc_stations_ghcn","daily_query_ghcn_data","daily_covar_ghcn_data","monthly_query_ghcn_data","monthly_covar_ghcn_data")
|
298
|
save(outfiles_obj,file= file.path(out_path,paste("met_stations_outfiles_obj_",interpolation_method,"_", out_prefix,".RData",sep="")))
|
299
|
|
300
|
return(outfiles_obj)
|
301
|
|
302
|
#END OF FUNCTION #
|
303
|
}
|
304
|
|
305
|
########## END OF SCRIPT ##########
|