Project

General

Profile

Download (18.4 KB) Statistics
| Branch: | Revision:
1
##################    Data preparation for interpolation   #######################################
2
############################ Extraction of station data ##########################################
3

    
4

    
5
database_covariates_preparation<-function(list_param_prep){
6
  #This function performs queries on the Postgres ghcnd database for stations matching the             
7
  #interpolation area. It requires 11 inputs:                                           
8
  # 1)  db.name :  Postgres database name containing the meteorological stations
9
  # 2)  var: the variable of interest - "TMAX","TMIN" or "PRCP" 
10
  # 3)  range_years: range of records used in the daily interpolation, note that upper bound year is not included               
11
  # 4)  range_years_clim: range of records used in the monthly climatology interpolation, note that upper bound is not included
12
  # 5)  infile_reg_outline: region outline as a shape file - used in the interpolation  stage too                              
13
  # 6)  infile_ghncd_data: ghcnd stations locations as a textfile name with lat-long fields                                                                                   
14
  # 7)  infile_covarariates: tif file of raser covariates for the interpolation area: it should have a local projection                                                                                           
15
  # 8)  CRS_locs_WGS84: longlat EPSG 4326 used as coordinates reference system (proj4)for stations locations
16
  # 9)  in_path: input path for covariates data and other files, this is also the output?
17
  # 10) out_path: output path created in master script
18
  # 11) covar_names: names of covariates used for the interpolation --may be removed later? (should be stored in the brick)
19
  # 12) qc_flags_stations: flag used to screen out at this stage only two values...
20
  # 13) out_prefix: output suffix added to output names--it is the same in the interpolation script
21
  #
22
  #The output is a list of four shapefile names produced by the function:
23
  #1) loc_stations: locations of stations as shapefile in EPSG 4326
24
  #2) loc_stations_ghcn: ghcn daily data for the year range of interpolation (locally projected)
25
  #3) daily_query_ghcn_data: ghcn daily data from daily query before application of quality flag
26
  #4) daily_covar_ghcn_data: ghcn daily data with covariates for the year range of interpolation (locally projected)
27
  #5) monthly_query_ghcn_data: ghcn daily data from monthly query before application of quality flag
28
  #6) monthly_covar_ghcn_data: ghcn monthly averaged data with covariates for the year range of interpolation (locally projected)
29
  
30
  #AUTHOR: Benoit Parmentier                                                                       
31
  #DATE: 05/21/2013                                                                                 
32
  #PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363--     
33
  #Comments and TODO
34
  #-Add buffer option...
35
  #-Add screening for value predicted: var
36
  ##################################################################################################
37
  
38
  ###Loading R library and packages: should it be read in before???   
39
  
40
  library(RPostgreSQL)
41
  library(sp)                                           # Spatial pacakge with class definition by Bivand et al.
42
  library(spdep)                                          # Spatial pacakge with methods and spatial stat. by Bivand et al.
43
  library(rgdal)                                          # GDAL wrapper for R, spatial utilities
44
  library(rgeos)
45
  library(rgdal)
46
  library(raster)
47
  library(rasterVis)
48
  library(maps)
49
  library(maptools)
50
  
51
  ### Functions used in the script
52
  
53
  format_s <-function(s_ID){
54
    #Format station ID in a vector format/tuple that is used in a psql query.
55
    # Argument 1: vector of station ID
56
    # Return: character of station ID
57
    tx2<-s_ID
58
    tx2<-as.character(tx2)
59
    stat_list<-tx2
60
    temp<-shQuote(stat_list)
61
    t<-paste(temp, collapse= " ")
62
    t1<-gsub(" ", ",",t)
63
    sf_ID<-paste("(",t1,")",sep="") #vector containing the station ID to query
64
    return(sf_ID)
65
  }
66
  
67
  #parsing input arguments
68
  
69
  db.name <- list_param_prep$db.name             #name of the Postgres database
70
  var <- list_param_prep$var                     #name of the variables to keep: TMIN, TMAX or PRCP
71
  year_start <-list_param_prep$range_years[1] #"2010"               #starting year for the query (included)
72
  year_end <-list_param_prep$range_years[2] #"2011"                 #end year for the query (excluded)
73
  year_start_clim <-list_param_prep$range_years_clim[1] #right bound not included in the range!! starting year for monthly query to calculate clime
74
  year_end_clim <-list_param_prep$range_years_clim[2] #right bound not included in the range!! starting year for monthly query to calculate clime
75
  infile_reg_outline<- list_param_prep$infile_reg_outline  #This is the shape file of outline of the study area                                                      #It is an input/output of the covariate script
76
  infile_ghncd_data<-list_param_prep$infile_ghncd_data      #"/home/layers/data/climate/ghcn/v2.92-upd-2012052822/ghcnd-stations.txt"                              #This is the textfile of station locations from GHCND
77
  infile_covariates<-list_param_prep$infile_covariates #"covariates__venezuela_region__VE_01292013.tif" #this is an output from covariate script
78
  CRS_locs_WGS84<-list_param_prep$CRS_locs_WGS84 #Station coords WGS84: same as earlier
79
  in_path <- list_param_prep$in_path #CRS_locs_WGS84"/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data/"
80
  out_path <- list_param_prep$out_path #CRS_locs_WGS84"/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data/"
81
  covar_names<-list_param_prep$covar_names # names should be written in the tif file!!!
82
  qc_flags_stations <- list_param_prep$qc_flags_stations #flags allowed for the query from the GHCND??
83
  out_prefix<-list_param_prep$out_prefix #"_365d_GAM_fus5_all_lstd_03012013"                #User defined output prefix
84
  
85
  ## working directory is the same for input and output for this function  
86
  #setwd(in_path) 
87
  setwd(out_path)
88
  ##### STEP 1: Select station in the study area
89
  
90
  filename<-sub(".shp","",infile_reg_outline)             #Removing the extension from file.
91
  interp_area <- readOGR(dsn=dirname(filename),basename(filename))
92
  CRS_interp<-proj4string(interp_area)         #Storing the coordinate information: geographic coordinates longlat WGS84
93
  
94
  #Read in GHCND database station locations
95
  dat_stat <- read.fwf(infile_ghncd_data, 
96
                       widths = c(11,9,10,7,3,31,4,4,6),fill=TRUE)
97
  colnames(dat_stat)<-c("STAT_ID","latitude","longitude","elev","state","name","GSNF","HCNF","WMOID")
98
  coords<- dat_stat[,c('longitude','latitude')]
99
  coordinates(dat_stat)<-coords
100
  proj4string(dat_stat)<-CRS_locs_WGS84 #this is the WGS84 projection
101
  #proj4string(dat_stat)<-CRS_interp
102
  interp_area_WGS84 <-spTransform(interp_area,CRS_locs_WGS84)         # Project from WGS84 to new coord. system
103
  
104
  # Spatial query to find relevant stations
105
  
106
  inside <- !is.na(over(dat_stat, as(interp_area_WGS84, "SpatialPolygons")))  #Finding stations contained in the current interpolation area
107
  stat_reg<-dat_stat[inside,]              #Selecting stations contained in the current interpolation area
108
  
109
  ####
110
  ##TODO: Add buffer option? 
111
  ####
112
  
113
  #### STEP 2: Connecting to the database and query for relevant data 
114
  
115
  drv <- dbDriver("PostgreSQL")
116
  db <- dbConnect(drv, dbname=db.name)
117
  
118
  time1<-proc.time()    #Start stop watch
119
  list_s<-format_s(stat_reg$STAT_ID)
120
  data2<-dbGetQuery(db, paste("SELECT *
121
                              FROM ghcn
122
                              WHERE element=",shQuote(var),
123
                              "AND year>=",year_start,
124
                              "AND year<",year_end,
125
                              "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
126
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
127
  time_minutes<-time_duration[3]/60
128
  dbDisconnect(db)
129

    
130
  data_table<-merge(data2,as.data.frame(stat_reg), by.x = "station", by.y = "STAT_ID")
131
  
132
  #Transform the subset data frame in a spatial data frame and reproject
133
  data_reg<-data_table                               #Make a copy of the data frame
134
  coords<- data_reg[c('longitude','latitude')]              #Define coordinates in a data frame: clean up here!!
135
  coordinates(data_reg)<-coords                      #Assign coordinates to the data frame
136
  proj4string(data_reg)<-CRS_locs_WGS84                #Assign coordinates reference system in PROJ4 format
137
  data_reg<-spTransform(data_reg,CRS(CRS_interp))     #Project from WGS84 to new coord. system
138
  
139
  data_d <-data_reg  #data_d: daily data containing the query without screening
140
  #data_reg <-subset(data_d,mflag=="0" | mflag=="S") #should be input arguments!!
141
  #Transform the query to be depending on the number of flags
142
  
143
  data_reg <-subset(data_d, mflag %in% qc_flags_stations) #screening using flags
144
  #data_reg2 <-subset(data_d,mflag==qc_flags_stations[1] | mflag==qc_flags_stations[2]) #screening using flags
145
  
146
  ##################################################################
147
  ### STEP 3: Save results and outuput in textfile and a shape file
148
  #browser()
149
  #Save shape files of the locations of meteorological stations in the study area
150
  #outfile1<-file.path(in_path,paste("stations","_",out_prefix,".shp",sep=""))
151
  outfile1<-file.path(out_path,paste("stations","_",out_prefix,".shp",sep=""))
152
  writeOGR(stat_reg,dsn= dirname(outfile1),layer= sub(".shp","",basename(outfile1)), driver="ESRI Shapefile",overwrite_layer=TRUE)
153
  #writeOGR(dst,dsn= ".",layer= sub(".shp","",outfile4), driver="ESRI Shapefile",overwrite_layer=TRUE)
154
  
155
  outfile2<-file.path(out_path,paste("ghcn_data_query_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file
156
  #writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
157
  writeOGR(data_d,dsn= dirname(outfile2),layer= sub(".shp","",basename(outfile2)), driver="ESRI Shapefile",overwrite_layer=TRUE)
158
  
159
  outfile3<-file.path(out_path,paste("ghcn_data_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file
160
  #writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
161
  writeOGR(data_reg,dsn= dirname(outfile3),layer= sub(".shp","",basename(outfile3)), driver="ESRI Shapefile",overwrite_layer=TRUE)
162
  
163
  ###################################################################
164
  ### STEP 4: Extract values at stations from covariates stack of raster images
165
  #Eventually this step may be skipped if the covariates information is stored in the database...
166
  
167
  #s_raster<-stack(file.path(in_path,infile_covariates))                   #read in the data stack
168
  s_raster<-brick(infile_covariates)                   #read in the data stack
169
  names(s_raster)<-covar_names               #Assigning names to the raster layers: making sure it is included in the extraction
170
  stat_val<- extract(s_raster, data_reg)        #Extracting values from the raster stack for every point location in coords data frame.
171
  #stat_val_test<- extract(s_raster, data_reg,def=TRUE)
172
  
173
  #create a shape file and data_frame with names
174
  
175
  data_RST<-as.data.frame(stat_val)                                            #This creates a data frame with the values extracted
176
  data_RST_SDF<-cbind(data_reg,data_RST)
177
  coordinates(data_RST_SDF)<-coordinates(data_reg) #Transforming data_RST_SDF into a spatial point dataframe
178
  CRS_reg<-proj4string(data_reg)
179
  proj4string(data_RST_SDF)<-CRS_reg  #Need to assign coordinates...
180
  
181
  #Creating a date column
182
  date1<-ISOdate(data_RST_SDF$year,data_RST_SDF$month,data_RST_SDF$day) #Creating a date object from 3 separate column
183
  date2<-gsub("-","",as.character(as.Date(date1)))
184
  data_RST_SDF$date<-date2                                              #Date format (year,month,day) is the following: "20100627"
185
  
186
  #This allows to change only one name of the data.frame
187
  pos<-match("value",names(data_RST_SDF)) #Find column with name "value"
188
  if (var=="TMAX"){
189
    #names(data_RST_SDF)[pos]<-c("TMax")
190
    data_RST_SDF$value<-data_RST_SDF$value/10                #TMax is the average max temp for monthy data
191
  }
192
  
193
  if (var=="TMIN"){
194
    #names(data_RST_SDF)[pos]<-c("TMin")
195
    data_RST_SDF$value<-data_RST_SDF$value/10                #TMax is the average max temp for monthy data
196
  }
197
  
198
  #write out a new shapefile (including .prj component)
199
  outfile4<-file.path(out_path,paste("daily_covariates_ghcn_data_",var,"_",range_years[1],"_",range_years[2],out_prefix,".shp",sep=""))         #Name of the file
200
  writeOGR(data_RST_SDF,dsn= dirname(outfile4),layer= sub(".shp","",basename(outfile4)), driver="ESRI Shapefile",overwrite_layer=TRUE)
201
  
202
  ###############################################################
203
  ######## STEP 5: Preparing monthly averages from the ProstGres database
204
  
205
  drv <- dbDriver("PostgreSQL")
206
  db <- dbConnect(drv, dbname=db.name)
207
  
208
  #year_start_clim: set at the start of the script
209
  time1<-proc.time()    #Start stop watch
210
  list_s<-format_s(stat_reg$STAT_ID)
211
  data_m<-dbGetQuery(db, paste("SELECT *
212
                               FROM ghcn
213
                               WHERE element=",shQuote(var),
214
                               "AND year>=",year_start_clim,
215
                               "AND year<",year_end_clim,
216
                               "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
217
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
218
  time_minutes<-time_duration[3]/60
219
  dbDisconnect(db)
220
  #Clean out this section!!
221
  date1<-ISOdate(data_m$year,data_m$month,data_m$day) #Creating a date object from 3 separate column
222
  date2<-as.POSIXlt(as.Date(date1))
223
  data_m$date<-date2
224
  #Save the query data here...
225
  data_m<-merge(data_m, stat_reg, by.x="station", by.y="STAT_ID")   #Inner join all columns are retained
226
  #Extracting covariates from stack for the monthly dataset...
227
  coords<- data_m[c('longitude','latitude')]              #Define coordinates in a data frame
228
  coordinates(data_m)<-coords                      #Assign coordinates to the data frame
229
  proj4string(data_m)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
230
  data_m<-spTransform(data_m,CRS(CRS_interp))     #Project from WGS84 to new coord. system
231
  outfile5<-file.path(out_path,paste("monthly_query_ghcn_data_",var,"_",year_start_clim,"_",year_end_clim,out_prefix,".shp",sep=""))  #Name of the file
232
  writeOGR(data_m,dsn= dirname(outfile5),layer= sub(".shp","",basename(outfile5)), driver="ESRI Shapefile",overwrite_layer=TRUE)
233
  
234
  #In Venezuela and other regions where there are not many stations...mflag==S should be added..see Durenne etal.2010.
235

    
236
  #d<-subset(data_m,mflag==qc_flags_stations[1] | mflag==qc_flags_stations[2])
237
  d<-subset(data_m,mflag %in% qc_flags_stations)
238
  
239
  #Add screening here ...May need some screeing??? i.e. range of temp and elevation...
240
  
241
  d1<-aggregate(value~station+month, data=d, mean)  #Calculate monthly mean for every station in OR
242
  #d2<-aggregate(value~station+month, data=d, length)  #Calculate monthly mean for every station in OR
243
  is_not_na_fun<-function(x) sum(!is.na(x)) #count the number of available observation
244
  d2<-aggregate(value~station+month, data=d, is_not_na_fun)  #Calculate monthly mean for every station in OR
245
  #names(d2)[names(d2)=="value"] <-"nobs_station"
246
  d1$nobs_station<-d2$value
247
  dst<-merge(d1, stat_reg, by.x="station", by.y="STAT_ID")   #Inner join all columns are retained
248
  
249
  #This allows to change only one name of the data.frame
250
  pos<-match("value",names(dst)) #Find column with name "value"
251
  if (var=="TMAX"){
252
    names(dst)[pos]<-c("TMax")           #renaming the column "value" extracted from the Postgres database
253
    dst$TMax<-dst$TMax/10                #TMax is the average max temp for monthy data
254
  }
255
  
256
  if (var=="TMIN"){
257
    names(dst)[pos]<-c("TMin")
258
    dst$TMin<-dst$TMin/10                #TMin is the average min temp for monthy data
259
  }
260
  
261
  #Extracting covariates from stack for the monthly dataset...
262
  #names(dst)[5:6] <-c('latitude','longitude')
263
  coords<- dst[c('longitude','latitude')]              #Define coordinates in a data frame
264
  
265
  coordinates(dst)<-coords                      #Assign coordinates to the data frame
266
  proj4string(dst)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
267
  dst_month<-spTransform(dst,CRS(CRS_interp))     #Project from WGS84 to new coord. system
268
  
269
  stations_val<-extract(s_raster,dst_month,df=TRUE)  #extraction of the information at station location in a data frame
270
  #dst_extract<-spCbind(dst_month,stations_val) #this is in sinusoidal from the raster stack
271
  dst_extract<-cbind(dst_month,stations_val) #this is in sinusoidal from the raster stack
272
  dst<-dst_extract #problem!!! two column named elev!!! use elev_s??
273
  
274
  #browser()
275
  coords<- dst[c('x','y')]              #Define coordinates in a data frame, this is the local x,y
276
  index<-!is.na(coords$x)               #remove if NA, may need to revisit at a later stage
277
  dst<-dst[index,]
278
  coords<- dst[c('x','y')]              #Define coordinates in a data frame, this is the local x,y
279
  coordinates(dst)<-coords                    #Assign coordinates to the data frame
280
  proj4string(dst)<-CRS_interp        #Assign coordinates reference system in PROJ4 format
281
  
282
  ### ADD SCREENING HERE BEFORE WRITING OUT DATA
283
  #Covariates ok since screening done in covariate script
284
  #screening on var i.e. value, TMIN, TMAX...
285
  
286
  ####
287
  
288
  ####
289
  #write out a new shapefile (including .prj component)
290
  dst$OID<-1:nrow(dst) #need a unique ID?
291
  outfile6<-file.path(out_path,paste("monthly_covariates_ghcn_data_",var,"_",year_start_clim,"_",year_end_clim,out_prefix,".shp",sep=""))  #Name of the file
292
  writeOGR(dst,dsn= dirname(outfile6),layer= sub(".shp","",basename(outfile6)), driver="ESRI Shapefile",overwrite_layer=TRUE)
293
  
294
  ### list of outputs return
295
  
296
  outfiles_obj<-list(outfile1,outfile2,outfile3,outfile4,outfile5,outfile6)
297
  names(outfiles_obj)<- c("loc_stations","loc_stations_ghcn","daily_query_ghcn_data","daily_covar_ghcn_data","monthly_query_ghcn_data","monthly_covar_ghcn_data")
298
  save(outfiles_obj,file= file.path(out_path,paste("met_stations_outfiles_obj_",interpolation_method,"_", out_prefix,".RData",sep="")))
299
  
300
  return(outfiles_obj)
301
  
302
  #END OF FUNCTION # 
303
}
304

    
305
########## END OF SCRIPT ##########
(2-2/52)