Project

General

Profile

Download (15.2 KB) Statistics
| Branch: | Revision:
1
##################    Data preparation for interpolation   #######################################
2
############################ Extraction of station data ##########################################
3

    
4

    
5
database_covariates_preparation<-function(list_param_prep){
6
  #This function performs queries on the Postgres ghcnd database for stations matching the             
7
  #interpolation area. It requires 11 inputs:                                           
8
  # 1) db.name :  Postgres database name containing the meteorological stations
9
  # 2) var: the variable of interest - "TMAX","TMIN" or "PRCP" 
10
  # 3) range_years: range of records used in the daily interpolation, note that upper bound year is not included               
11
  # 4) range_years_clim: range of records used in the monthly climatology interpolation, note that upper bound is not included
12
  # 5) infile1: region outline as a shape file - used in the interpolation  stage too                              
13
  # 6) infile2: ghcnd stations locations as a textfile name with lat-long fields                                                                                   
14
  # 7) infile_covarariates: tif file of raser covariates for the interpolation area: it should have a local projection                                                                                           
15
  # 8) CRS_locs_WGS84: longlat EPSG 4326 used as coordinates reference system (proj4)for stations locations
16
  # 9) in_path: input path for covariates data and other files, this is also the output?
17
  # 10) covar_names: names of covariates used for the interpolation --may be removed later? (should be stored in the brick)
18
  # 11) out_prefix: output suffix added to output names--it is the same in the interpolation script
19
  #
20
  #The output is a list of four shapefile names produced by the function:
21
  #1) loc_stations: locations of stations as shapefile in EPSG 4326
22
  #2) loc_stations_ghcn: ghcn daily data for the year range of interpolation (locally projected)
23
  #3) daily_covar_ghcn_data: ghcn daily data with covariates for the year range of interpolation (locally projected)
24
  #4) monthly_covar_ghcn_data: ghcn daily data with covariates for the year range of interpolation (locally projected)
25
  
26
  #AUTHOR: Benoit Parmentier                                                                       
27
  #DATE: 03/13/2013                                                                                 
28
  #PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363--     
29
  #Comments and TODO
30
  #-Add buffer option...
31
  #-Add output path argument option
32
  #-Add qc flag options
33
  ##################################################################################################
34
  
35
  ###Loading R library and packages: should it be read in before???   
36
  
37
  library(RPostgreSQL)
38
  library(sp)                                           # Spatial pacakge with class definition by Bivand et al.
39
  library(spdep)                                          # Spatial pacakge with methods and spatial stat. by Bivand et al.
40
  library(rgdal)                                          # GDAL wrapper for R, spatial utilities
41
  library(rgeos)
42
  library(rgdal)
43
  library(raster)
44
  library(rasterVis)
45
  library(maps)
46
  library(maptools)
47
  
48
  ### Functions used in the script
49
  
50
  format_s <-function(s_ID){
51
    #Format station ID in a vector format/tuple that is used in a psql query.
52
    # Argument 1: vector of station ID
53
    # Return: character of station ID
54
    tx2<-s_ID
55
    tx2<-as.character(tx2)
56
    stat_list<-tx2
57
    temp<-shQuote(stat_list)
58
    t<-paste(temp, collapse= " ")
59
    t1<-gsub(" ", ",",t)
60
    sf_ID<-paste("(",t1,")",sep="") #vector containing the station ID to query
61
    return(sf_ID)
62
  }
63
  
64
  #parsing input arguments
65
  
66
  db.name <- list_param_prep$db.name             #name of the Postgres database
67
  var <- list_param_prep$var                     #name of the variables to keep: TMIN, TMAX or PRCP
68
  year_start <-list_param_prep$range_years[1] #"2010"               #starting year for the query (included)
69
  year_end <-list_param_prep$range_years[2] #"2011"                 #end year for the query (excluded)
70
  year_start_clim <-list_param_prep$range_years_clim[1] #right bound not included in the range!! starting year for monthly query to calculate clime
71
  infile1<- list_param_prep$infile1  #This is the shape file of outline of the study area                                                      #It is an input/output of the covariate script
72
  infile2<-list_param_prep$infile2      #"/home/layers/data/climate/ghcn/v2.92-upd-2012052822/ghcnd-stations.txt"                              #This is the textfile of station locations from GHCND
73
  infile3<-list_param_prep$infile_covariates #"covariates__venezuela_region__VE_01292013.tif" #this is an output from covariate script
74
  CRS_locs_WGS84<-list_param_prep$CRS_locs_WGS84 #Station coords WGS84: same as earlier
75
  in_path <- list_param_prep$in_path #CRS_locs_WGS84"/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data/"
76
  out_prefix<-list_param_prep$out_prefix #"_365d_GAM_fus5_all_lstd_03012013"                #User defined output prefix
77
  #qc_flags<-list_param_prep$qc_flags    flags allowed for the query from the GHCND??
78
  covar_names<-list_param_prep$covar_names # names should be written in the tif file!!!
79
  
80
  ## working directory is the same for input and output for this function  
81
  setwd(in_path) 
82
  
83
  ##### STEP 1: Select station in the study area
84
  
85
  filename<-sub(".shp","",infile1)             #Removing the extension from file.
86
  interp_area <- readOGR(".",filename)
87
  CRS_interp<-proj4string(interp_area)         #Storing the coordinate information: geographic coordinates longlat WGS84
88
  
89
  #Read in GHCND database station locations
90
  dat_stat <- read.fwf(infile2, 
91
                       widths = c(11,9,10,7,3,31,4,4,6),fill=TRUE)
92
  colnames(dat_stat)<-c("STAT_ID","lat","lon","elev","state","name","GSNF","HCNF","WMOID")
93
  coords<- dat_stat[,c('lon','lat')]
94
  coordinates(dat_stat)<-coords
95
  proj4string(dat_stat)<-CRS_locs_WGS84 #this is the WGS84 projection
96
  #proj4string(dat_stat)<-CRS_interp
97
  dat_stat2<-spTransform(dat_stat,CRS(CRS_interp))         # Project from WGS84 to new coord. system
98
  
99
  # Spatial query to find relevant stations
100
  inside <- !is.na(over(dat_stat2, as(interp_area, "SpatialPolygons")))  #Finding stations contained in the current interpolation area
101
  stat_reg<-dat_stat2[inside,]              #Selecting stations contained in the current interpolation area
102
  
103
  ####
104
  ##TODO: Add buffer option? 
105
  ####
106
  
107
  #### STEP 2: Connecting to the database and query for relevant data 
108
  
109
  drv <- dbDriver("PostgreSQL")
110
  db <- dbConnect(drv, dbname=db.name)
111
  
112
  time1<-proc.time()    #Start stop watch
113
  list_s<-format_s(stat_reg$STAT_ID)
114
  data2<-dbGetQuery(db, paste("SELECT *
115
                              FROM ghcn
116
                              WHERE element=",shQuote(var),
117
                              "AND year>=",year_start,
118
                              "AND year<",year_end,
119
                              "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
120
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
121
  time_minutes<-time_duration[3]/60
122
  dbDisconnect(db)
123
  ###
124
  #Add month query and averages here...
125
  ###
126
  
127
  #data2 contains only 46 stations for Venezueal area??
128
  data_table<-merge(data2,as.data.frame(stat_reg), by.x = "station", by.y = "STAT_ID")
129
  
130
  #Transform the subset data frame in a spatial data frame and reproject
131
  data_reg<-data_table                               #Make a copy of the data frame
132
  coords<- data_reg[c('lon','lat')]              #Define coordinates in a data frame: clean up here!!
133
  #Wrong label...it is in fact projected...
134
  coordinates(data_reg)<-coords                      #Assign coordinates to the data frame
135
  #proj4string(data3)<-locs_coord                  #Assign coordinates reference system in PROJ4 format
136
  proj4string(data_reg)<-CRS_locs_WGS84                #Assign coordinates reference system in PROJ4 format
137
  data_reg<-spTransform(data_reg,CRS(CRS_interp))     #Project from WGS84 to new coord. system
138
  
139
  ##################################################################
140
  ### STEP 3: Save results and outuput in textfile and a shape file
141
  #browser()
142
  #Save shape files of the locations of meteorological stations in the study area
143
  outfile1<-file.path(in_path,paste("stations","_",out_prefix,".shp",sep=""))
144
  writeOGR(stat_reg,dsn= dirname(outfile1),layer= sub(".shp","",basename(outfile1)), driver="ESRI Shapefile",overwrite_layer=TRUE)
145
  #writeOGR(dst,dsn= ".",layer= sub(".shp","",outfile4), driver="ESRI Shapefile",overwrite_layer=TRUE)
146
  
147
  outfile2<-file.path(in_path,paste("ghcn_data_",var,"_",year_start_clim,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file
148
  #writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
149
  writeOGR(data_reg,dsn= dirname(outfile2),layer= sub(".shp","",basename(outfile2)), driver="ESRI Shapefile",overwrite_layer=TRUE)
150
  
151
  ###################################################################
152
  ### STEP 4: Extract values at stations from covariates stack of raster images
153
  #Eventually this step may be skipped if the covariates information is stored in the database...
154
  
155
  s_raster<-stack(infile3)                   #read in the data stack
156
  names(s_raster)<-covar_names               #Assigning names to the raster layers: making sure it is included in the extraction
157
  stat_val<- extract(s_raster, data_reg)        #Extracting values from the raster stack for every point location in coords data frame.
158
  
159
  #create a shape file and data_frame with names
160
  
161
  data_RST<-as.data.frame(stat_val)                                            #This creates a data frame with the values extracted
162
  data_RST_SDF<-cbind(data_reg,data_RST)
163
  coordinates(data_RST_SDF)<-coordinates(data_reg) #Transforming data_RST_SDF into a spatial point dataframe
164
  CRS_reg<-proj4string(data_reg)
165
  proj4string(data_RST_SDF)<-CRS_reg  #Need to assign coordinates...
166
  
167
  #Creating a date column
168
  date1<-ISOdate(data_RST_SDF$year,data_RST_SDF$month,data_RST_SDF$day) #Creating a date object from 3 separate column
169
  date2<-gsub("-","",as.character(as.Date(date1)))
170
  data_RST_SDF$date<-date2                                              #Date format (year,month,day) is the following: "20100627"
171
  
172
  #This allows to change only one name of the data.frame
173
  pos<-match("value",names(data_RST_SDF)) #Find column with name "value"
174
  if (var=="TMAX"){
175
    #names(data_RST_SDF)[pos]<-c("TMax")
176
    data_RST_SDF$value<-data_RST_SDF$value/10                #TMax is the average max temp for monthy data
177
  }
178
  
179
  if (var=="TMIN"){
180
    #names(data_RST_SDF)[pos]<-c("TMin")
181
    data_RST_SDF$value<-data_RST_SDF$value/10                #TMax is the average max temp for monthy data
182
  }
183
  
184
  #write out a new shapefile (including .prj component)
185
  outfile3<-file.path(in_path,paste("daily_covariates_ghcn_data_",var,"_",range_years[1],"_",range_years[2],out_prefix,".shp",sep=""))         #Name of the file
186
  writeOGR(data_RST_SDF,dsn= dirname(outfile3),layer= sub(".shp","",basename(outfile3)), driver="ESRI Shapefile",overwrite_layer=TRUE)
187
  
188
  ###############################################################
189
  ######## STEP 5: Preparing monthly averages from the ProstGres database
190
  
191
  drv <- dbDriver("PostgreSQL")
192
  db <- dbConnect(drv, dbname=db.name)
193
  
194
  #year_start_clim: set at the start of the script
195
  #year_end<-2011
196
  time1<-proc.time()    #Start stop watch
197
  list_s<-format_s(stat_reg$STAT_ID)
198
  data_m<-dbGetQuery(db, paste("SELECT *
199
                               FROM ghcn
200
                               WHERE element=",shQuote(var),
201
                               "AND year>=",year_start_clim,
202
                               "AND year<",year_end,
203
                               "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
204
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
205
  time_minutes<-time_duration[3]/60
206
  dbDisconnect(db)
207
  #Clean out this section!!
208
  date1<-ISOdate(data_m$year,data_m$month,data_m$day) #Creating a date object from 3 separate column
209
  date2<-as.POSIXlt(as.Date(date1))
210
  data_m$date<-date2
211
  #In Venezuela and other regions where there are not many stations...mflag==S should be added..see Durenne etal.2010.
212
  #d<-subset(data_m,year>=2000 & mflag=="0" ) #Selecting dataset 2000-2010 with good quality: 193 stations
213
  d<-subset(data_m,mflag=="0" | mflag=="S") #should be input arguments!!
214
  #May need some screeing??? i.e. range of temp and elevation...
215
  d1<-aggregate(value~station+month, data=d, mean)  #Calculate monthly mean for every station in OR
216
  id<-as.data.frame(unique(d1$station))     #Unique station in OR for year 2000-2010: 193 but 7 loss of monthly avg    
217
  
218
  dst<-merge(d1, stat_reg, by.x="station", by.y="STAT_ID")   #Inner join all columns are retained
219
  
220
  #This allows to change only one name of the data.frame
221
  pos<-match("value",names(dst)) #Find column with name "value"
222
  if (var=="TMAX"){
223
    names(dst)[pos]<-c("TMax")           #renaming the column "value" extracted from the Postgres database
224
    dst$TMax<-dst$TMax/10                #TMax is the average max temp for monthy data
225
  }
226
  
227
  if (var=="TMIN"){
228
    names(dst)[pos]<-c("TMin")
229
    dst$TMin<-dst$TMin/10                #TMin is the average min temp for monthy data
230
  }
231
  
232
  #Extracting covariates from stack for the monthly dataset...
233
  coords<- dst[c('lon','lat')]              #Define coordinates in a data frame
234
  coordinates(dst)<-coords                      #Assign coordinates to the data frame
235
  proj4string(dst)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
236
  dst_month<-spTransform(dst,CRS(CRS_interp))     #Project from WGS84 to new coord. system
237
  
238
  stations_val<-extract(s_raster,dst_month,df=TRUE)  #extraction of the information at station location in a data frame
239
  #dst_extract<-spCbind(dst_month,stations_val) #this is in sinusoidal from the raster stack
240
  dst_extract<-cbind(dst_month,stations_val) #this is in sinusoidal from the raster stack
241
  dst<-dst_extract
242
  
243
  #browser()
244
  coords<- dst[c('x','y')]              #Define coordinates in a data frame, this is the local x,y
245
  index<-!is.na(coords$x)               #remove if NA, may need to revisit at a later stage
246
  dst<-dst[index,]
247
  coords<- dst[c('x','y')]              #Define coordinates in a data frame, this is the local x,y
248
  coordinates(dst)<-coords                    #Assign coordinates to the data frame
249
  proj4string(dst)<-projection(s_raster)        #Assign coordinates reference system in PROJ4 format
250
  
251
  ####
252
  #write out a new shapefile (including .prj component)
253
  dst$OID<-1:nrow(dst) #need a unique ID?
254
  outfile4<-file.path(in_path,paste("monthly_covariates_ghcn_data_",var,"_",year_start_clim,"_",year_end,out_prefix,".shp",sep=""))  #Name of the file
255
  writeOGR(dst,dsn= dirname(outfile4),layer= sub(".shp","",basename(outfile4)), driver="ESRI Shapefile",overwrite_layer=TRUE)
256
  
257
  ### list of outputs return
258
  
259
  outfiles_obj<-list(outfile1,outfile2,outfile3,outfile4)
260
  names(outfiles_obj)<- c("loc_stations","loc_stations_ghcn","daily_covar_ghcn_data","monthly_covar_ghcn_data")
261
  return(outfiles_obj)
262
  
263
  #END OF FUNCTION # 
264
}
265

    
266
##### END OF SCRIPT ##########
(1-1/40)