Project

General

Profile

Download (17.6 KB) Statistics
| Branch: | Revision:
1
##################    Data preparation for interpolation   #######################################
2
############################ Extraction of station data ##########################################
3

    
4

    
5
database_covariates_preparation<-function(list_param_prep){
6
  #This function performs queries on the Postgres ghcnd database for stations matching the             
7
  #interpolation area. It requires 11 inputs:                                           
8
  # 1)  db.name :  Postgres database name containing the meteorological stations
9
  # 2)  var: the variable of interest - "TMAX","TMIN" or "PRCP" 
10
  # 3)  range_years: range of records used in the daily interpolation, note that upper bound year is not included               
11
  # 4)  range_years_clim: range of records used in the monthly climatology interpolation, note that upper bound is not included
12
  # 5)  infile1: region outline as a shape file - used in the interpolation  stage too                              
13
  # 6)  infile2: ghcnd stations locations as a textfile name with lat-long fields                                                                                   
14
  # 7)  infile_covarariates: tif file of raser covariates for the interpolation area: it should have a local projection                                                                                           
15
  # 8)  CRS_locs_WGS84: longlat EPSG 4326 used as coordinates reference system (proj4)for stations locations
16
  # 9)  in_path: input path for covariates data and other files, this is also the output?
17
  # 10) covar_names: names of covariates used for the interpolation --may be removed later? (should be stored in the brick)
18
  # 11) qc_flags_stations: flag used to screen out at this stage only two values...
19
  # 12) out_prefix: output suffix added to output names--it is the same in the interpolation script
20
  #
21
  #The output is a list of four shapefile names produced by the function:
22
  #1) loc_stations: locations of stations as shapefile in EPSG 4326
23
  #2) loc_stations_ghcn: ghcn daily data for the year range of interpolation (locally projected)
24
  #3) daily_query_ghcn_data: ghcn daily data from daily query before application of quality flag
25
  #4) daily_covar_ghcn_data: ghcn daily data with covariates for the year range of interpolation (locally projected)
26
  #5) monthly_query_ghcn_data: ghcn daily data from monthly query before application of quality flag
27
  #6) monthly_covar_ghcn_data: ghcn monthly averaged data with covariates for the year range of interpolation (locally projected)
28
  
29
  #AUTHOR: Benoit Parmentier                                                                       
30
  #DATE: 04/05/2013                                                                                 
31
  #PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363--     
32
  #Comments and TODO
33
  #-Add buffer option...
34
  #-Add output path argument option
35
  #-Add screening for value predicted: var
36
  ##################################################################################################
37
  
38
  ###Loading R library and packages: should it be read in before???   
39
  
40
  library(RPostgreSQL)
41
  library(sp)                                           # Spatial pacakge with class definition by Bivand et al.
42
  library(spdep)                                          # Spatial pacakge with methods and spatial stat. by Bivand et al.
43
  library(rgdal)                                          # GDAL wrapper for R, spatial utilities
44
  library(rgeos)
45
  library(rgdal)
46
  library(raster)
47
  library(rasterVis)
48
  library(maps)
49
  library(maptools)
50
  
51
  ### Functions used in the script
52
  
53
  format_s <-function(s_ID){
54
    #Format station ID in a vector format/tuple that is used in a psql query.
55
    # Argument 1: vector of station ID
56
    # Return: character of station ID
57
    tx2<-s_ID
58
    tx2<-as.character(tx2)
59
    stat_list<-tx2
60
    temp<-shQuote(stat_list)
61
    t<-paste(temp, collapse= " ")
62
    t1<-gsub(" ", ",",t)
63
    sf_ID<-paste("(",t1,")",sep="") #vector containing the station ID to query
64
    return(sf_ID)
65
  }
66
  
67
  #parsing input arguments
68
  
69
  db.name <- list_param_prep$db.name             #name of the Postgres database
70
  var <- list_param_prep$var                     #name of the variables to keep: TMIN, TMAX or PRCP
71
  year_start <-list_param_prep$range_years[1] #"2010"               #starting year for the query (included)
72
  year_end <-list_param_prep$range_years[2] #"2011"                 #end year for the query (excluded)
73
  year_start_clim <-list_param_prep$range_years_clim[1] #right bound not included in the range!! starting year for monthly query to calculate clime
74
  year_end_clim <-list_param_prep$range_years_clim[2] #right bound not included in the range!! starting year for monthly query to calculate clime
75
  infile1<- list_param_prep$infile1  #This is the shape file of outline of the study area                                                      #It is an input/output of the covariate script
76
  infile2<-list_param_prep$infile2      #"/home/layers/data/climate/ghcn/v2.92-upd-2012052822/ghcnd-stations.txt"                              #This is the textfile of station locations from GHCND
77
  infile3<-list_param_prep$infile_covariates #"covariates__venezuela_region__VE_01292013.tif" #this is an output from covariate script
78
  CRS_locs_WGS84<-list_param_prep$CRS_locs_WGS84 #Station coords WGS84: same as earlier
79
  in_path <- list_param_prep$in_path #CRS_locs_WGS84"/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data/"
80
  covar_names<-list_param_prep$covar_names # names should be written in the tif file!!!
81
  qc_flags_stations <- list_param_prep$qc_flags_stations #flags allowed for the query from the GHCND??
82
  out_prefix<-list_param_prep$out_prefix #"_365d_GAM_fus5_all_lstd_03012013"                #User defined output prefix
83
  
84
  ## working directory is the same for input and output for this function  
85
  setwd(in_path) 
86
  
87
  ##### STEP 1: Select station in the study area
88
  
89
  filename<-sub(".shp","",infile1)             #Removing the extension from file.
90
  interp_area <- readOGR(".",filename)
91
  CRS_interp<-proj4string(interp_area)         #Storing the coordinate information: geographic coordinates longlat WGS84
92
  
93
  #Read in GHCND database station locations
94
  dat_stat <- read.fwf(infile2, 
95
                       widths = c(11,9,10,7,3,31,4,4,6),fill=TRUE)
96
  colnames(dat_stat)<-c("STAT_ID","lat","lon","elev","state","name","GSNF","HCNF","WMOID")
97
  coords<- dat_stat[,c('lon','lat')]
98
  coordinates(dat_stat)<-coords
99
  proj4string(dat_stat)<-CRS_locs_WGS84 #this is the WGS84 projection
100
  #proj4string(dat_stat)<-CRS_interp
101
  interp_area_WGS84 <-spTransform(interp_area,CRS_locs_WGS84)         # Project from WGS84 to new coord. system
102
  
103
  # Spatial query to find relevant stations
104
  
105
  inside <- !is.na(over(dat_stat, as(interp_area_WGS84, "SpatialPolygons")))  #Finding stations contained in the current interpolation area
106
  stat_reg<-dat_stat[inside,]              #Selecting stations contained in the current interpolation area
107
  
108
  ####
109
  ##TODO: Add buffer option? 
110
  ####
111
  
112
  #### STEP 2: Connecting to the database and query for relevant data 
113
  
114
  drv <- dbDriver("PostgreSQL")
115
  db <- dbConnect(drv, dbname=db.name)
116
  
117
  time1<-proc.time()    #Start stop watch
118
  list_s<-format_s(stat_reg$STAT_ID)
119
  data2<-dbGetQuery(db, paste("SELECT *
120
                              FROM ghcn
121
                              WHERE element=",shQuote(var),
122
                              "AND year>=",year_start,
123
                              "AND year<",year_end,
124
                              "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
125
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
126
  time_minutes<-time_duration[3]/60
127
  dbDisconnect(db)
128

    
129
  data_table<-merge(data2,as.data.frame(stat_reg), by.x = "station", by.y = "STAT_ID")
130
  
131
  #Transform the subset data frame in a spatial data frame and reproject
132
  data_reg<-data_table                               #Make a copy of the data frame
133
  coords<- data_reg[c('lon','lat')]              #Define coordinates in a data frame: clean up here!!
134
  coordinates(data_reg)<-coords                      #Assign coordinates to the data frame
135
  proj4string(data_reg)<-CRS_locs_WGS84                #Assign coordinates reference system in PROJ4 format
136
  data_reg<-spTransform(data_reg,CRS(CRS_interp))     #Project from WGS84 to new coord. system
137
  
138
  data_d <-data_reg  #data_d: daily data containing the query without screening
139
  #data_reg <-subset(data_d,mflag=="0" | mflag=="S") #should be input arguments!!
140
  #Transform the query to be depending on the number of flags
141
  
142
  data_reg <-subset(data_d, mflag %in% qc_flags_stations) #screening using flags
143
  #data_reg2 <-subset(data_d,mflag==qc_flags_stations[1] | mflag==qc_flags_stations[2]) #screening using flags
144
  
145
  ##################################################################
146
  ### STEP 3: Save results and outuput in textfile and a shape file
147
  #browser()
148
  #Save shape files of the locations of meteorological stations in the study area
149
  outfile1<-file.path(in_path,paste("stations","_",out_prefix,".shp",sep=""))
150
  writeOGR(stat_reg,dsn= dirname(outfile1),layer= sub(".shp","",basename(outfile1)), driver="ESRI Shapefile",overwrite_layer=TRUE)
151
  #writeOGR(dst,dsn= ".",layer= sub(".shp","",outfile4), driver="ESRI Shapefile",overwrite_layer=TRUE)
152
  
153
  outfile2<-file.path(in_path,paste("ghcn_data_query_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file
154
  #writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
155
  writeOGR(data_d,dsn= dirname(outfile2),layer= sub(".shp","",basename(outfile2)), driver="ESRI Shapefile",overwrite_layer=TRUE)
156
  
157
  outfile3<-file.path(in_path,paste("ghcn_data_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file
158
  #writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
159
  writeOGR(data_reg,dsn= dirname(outfile3),layer= sub(".shp","",basename(outfile3)), driver="ESRI Shapefile",overwrite_layer=TRUE)
160
  
161
  ###################################################################
162
  ### STEP 4: Extract values at stations from covariates stack of raster images
163
  #Eventually this step may be skipped if the covariates information is stored in the database...
164
  
165
  s_raster<-stack(infile3)                   #read in the data stack
166
  names(s_raster)<-covar_names               #Assigning names to the raster layers: making sure it is included in the extraction
167
  stat_val<- extract(s_raster, data_reg)        #Extracting values from the raster stack for every point location in coords data frame.
168
  
169
  #create a shape file and data_frame with names
170
  
171
  data_RST<-as.data.frame(stat_val)                                            #This creates a data frame with the values extracted
172
  data_RST_SDF<-cbind(data_reg,data_RST)
173
  coordinates(data_RST_SDF)<-coordinates(data_reg) #Transforming data_RST_SDF into a spatial point dataframe
174
  CRS_reg<-proj4string(data_reg)
175
  proj4string(data_RST_SDF)<-CRS_reg  #Need to assign coordinates...
176
  
177
  #Creating a date column
178
  date1<-ISOdate(data_RST_SDF$year,data_RST_SDF$month,data_RST_SDF$day) #Creating a date object from 3 separate column
179
  date2<-gsub("-","",as.character(as.Date(date1)))
180
  data_RST_SDF$date<-date2                                              #Date format (year,month,day) is the following: "20100627"
181
  
182
  #This allows to change only one name of the data.frame
183
  pos<-match("value",names(data_RST_SDF)) #Find column with name "value"
184
  if (var=="TMAX"){
185
    #names(data_RST_SDF)[pos]<-c("TMax")
186
    data_RST_SDF$value<-data_RST_SDF$value/10                #TMax is the average max temp for monthy data
187
  }
188
  
189
  if (var=="TMIN"){
190
    #names(data_RST_SDF)[pos]<-c("TMin")
191
    data_RST_SDF$value<-data_RST_SDF$value/10                #TMax is the average max temp for monthy data
192
  }
193
  
194
  #write out a new shapefile (including .prj component)
195
  outfile4<-file.path(in_path,paste("daily_covariates_ghcn_data_",var,"_",range_years[1],"_",range_years[2],out_prefix,".shp",sep=""))         #Name of the file
196
  writeOGR(data_RST_SDF,dsn= dirname(outfile4),layer= sub(".shp","",basename(outfile4)), driver="ESRI Shapefile",overwrite_layer=TRUE)
197
  
198
  ###############################################################
199
  ######## STEP 5: Preparing monthly averages from the ProstGres database
200
  
201
  drv <- dbDriver("PostgreSQL")
202
  db <- dbConnect(drv, dbname=db.name)
203
  
204
  #year_start_clim: set at the start of the script
205
  time1<-proc.time()    #Start stop watch
206
  list_s<-format_s(stat_reg$STAT_ID)
207
  data_m<-dbGetQuery(db, paste("SELECT *
208
                               FROM ghcn
209
                               WHERE element=",shQuote(var),
210
                               "AND year>=",year_start_clim,
211
                               "AND year<",year_end_clim,
212
                               "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
213
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
214
  time_minutes<-time_duration[3]/60
215
  dbDisconnect(db)
216
  #Clean out this section!!
217
  date1<-ISOdate(data_m$year,data_m$month,data_m$day) #Creating a date object from 3 separate column
218
  date2<-as.POSIXlt(as.Date(date1))
219
  data_m$date<-date2
220
  #Save the query data here...
221
  data_m<-merge(data_m, stat_reg, by.x="station", by.y="STAT_ID")   #Inner join all columns are retained
222
  #Extracting covariates from stack for the monthly dataset...
223
  coords<- data_m[c('lon','lat')]              #Define coordinates in a data frame
224
  coordinates(data_m)<-coords                      #Assign coordinates to the data frame
225
  proj4string(data_m)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
226
  data_m<-spTransform(data_m,CRS(CRS_interp))     #Project from WGS84 to new coord. system
227
  outfile5<-file.path(in_path,paste("monthly_query_ghcn_data_",var,"_",year_start_clim,"_",year_end_clim,out_prefix,".shp",sep=""))  #Name of the file
228
  writeOGR(data_m,dsn= dirname(outfile5),layer= sub(".shp","",basename(outfile5)), driver="ESRI Shapefile",overwrite_layer=TRUE)
229
  
230
  #In Venezuela and other regions where there are not many stations...mflag==S should be added..see Durenne etal.2010.
231

    
232
  #d<-subset(data_m,mflag==qc_flags_stations[1] | mflag==qc_flags_stations[2])
233
  d<-subset(data_m,mflag %in% qc_flags_stations)
234
  
235
  #Add screening here ...May need some screeing??? i.e. range of temp and elevation...
236
  
237
  d1<-aggregate(value~station+month, data=d, mean)  #Calculate monthly mean for every station in OR
238
  #d2<-aggregate(value~station+month, data=d, length)  #Calculate monthly mean for every station in OR
239
  is_not_na_fun<-function(x) sum(!is.na(x)) #count the number of available observation
240
  d2<-aggregate(value~station+month, data=d, is_not_na_fun)  #Calculate monthly mean for every station in OR
241
  #names(d2)[names(d2)=="value"] <-"nobs_station"
242
  d1$nobs_station<-d2$value
243
  dst<-merge(d1, stat_reg, by.x="station", by.y="STAT_ID")   #Inner join all columns are retained
244
  
245
  #This allows to change only one name of the data.frame
246
  pos<-match("value",names(dst)) #Find column with name "value"
247
  if (var=="TMAX"){
248
    names(dst)[pos]<-c("TMax")           #renaming the column "value" extracted from the Postgres database
249
    dst$TMax<-dst$TMax/10                #TMax is the average max temp for monthy data
250
  }
251
  
252
  if (var=="TMIN"){
253
    names(dst)[pos]<-c("TMin")
254
    dst$TMin<-dst$TMin/10                #TMin is the average min temp for monthy data
255
  }
256
  
257
  #Extracting covariates from stack for the monthly dataset...
258
  coords<- dst[c('lon','lat')]              #Define coordinates in a data frame
259
  coordinates(dst)<-coords                      #Assign coordinates to the data frame
260
  proj4string(dst)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
261
  dst_month<-spTransform(dst,CRS(CRS_interp))     #Project from WGS84 to new coord. system
262
  
263
  stations_val<-extract(s_raster,dst_month,df=TRUE)  #extraction of the information at station location in a data frame
264
  #dst_extract<-spCbind(dst_month,stations_val) #this is in sinusoidal from the raster stack
265
  dst_extract<-cbind(dst_month,stations_val) #this is in sinusoidal from the raster stack
266
  dst<-dst_extract #problem!!! two column named elev!!! use elev_s??
267
  
268
  #browser()
269
  coords<- dst[c('x','y')]              #Define coordinates in a data frame, this is the local x,y
270
  index<-!is.na(coords$x)               #remove if NA, may need to revisit at a later stage
271
  dst<-dst[index,]
272
  coords<- dst[c('x','y')]              #Define coordinates in a data frame, this is the local x,y
273
  coordinates(dst)<-coords                    #Assign coordinates to the data frame
274
  proj4string(dst)<-CRS_interp        #Assign coordinates reference system in PROJ4 format
275
  
276
  ### ADD SCREENING HERE BEFORE WRITING OUT DATA
277
  #Covariates ok since screening done in covariate script
278
  #screening on var i.e. value, TMIN, TMAX...
279
  
280
  ####
281
  
282
  ####
283
  #write out a new shapefile (including .prj component)
284
  dst$OID<-1:nrow(dst) #need a unique ID?
285
  outfile6<-file.path(in_path,paste("monthly_covariates_ghcn_data_",var,"_",year_start_clim,"_",year_end_clim,out_prefix,".shp",sep=""))  #Name of the file
286
  writeOGR(dst,dsn= dirname(outfile6),layer= sub(".shp","",basename(outfile6)), driver="ESRI Shapefile",overwrite_layer=TRUE)
287
  
288
  ### list of outputs return
289
  
290
  outfiles_obj<-list(outfile1,outfile2,outfile3,outfile4,outfile5,outfile6)
291
  names(outfiles_obj)<- c("loc_stations","loc_stations_ghcn","daily_query_ghcn_data","daily_covar_ghcn_data","monthly_query_ghcn_data","monthly_covar_ghcn_data")
292
  return(outfiles_obj)
293
  
294
  #END OF FUNCTION # 
295
}
296

    
297
########## END OF SCRIPT ##########
(2-2/42)