Project

General

Profile

Download (14.5 KB) Statistics
| Branch: | Revision:
1
##################    Data preparation for interpolation   #######################################
2
############################ Extraction of station data ##########################################
3

    
4

    
5
database_covaratiates_preparation<-function(list_param_prep){
6
  #This function performs queries on the Postgres ghcnd database for stations matching the             
7
  #interpolation area. It requires 11 inputs:                                           
8
  # 1) db.name :  Postgres database name containing the meteorological stations
9
  # 2) var: the variable of interest - "TMAX","TMIN" or "PRCP" 
10
  # 3) range_years: range of records used in the daily interpolation, note that upper bound year is not included               
11
  # 4) range_years_clim: range of records used in the monthly climatology interpolation, note that upper bound is not included
12
  # 5) infile1: region outline as a shape file - used in the interpolation  stage too                              
13
  # 6) infile2: ghcnd stations locations as a textfile name with lat-long fields                                                                                   
14
  # 7) infile_covarariates: tif file of raser covariates for the interpolation area: it should have a local projection                                                                                           
15
  # 8) CRS_locs_WGS84: longlat EPSG 4326 used as coordinates reference system (proj4)for stations locations
16
  # 9) in_path: input path for covariates data and other files, this is also the output?
17
  # 10) covar_names: names of covariates used for the interpolation --may be removed later? (should be stored in the brick)
18
  # 11) out_prefix: output suffix added to output names--it is the same in the interpolation script
19
  #
20
  #The output is a list of four shapefile names produced by the function:
21
  #1) loc_stations: locations of stations as shapefile in EPSG 4326
22
  #2) loc_stations_ghcn: ghcn daily data for the year range of interpolation (locally projected)
23
  #3) daily_covar_ghcn_data: ghcn daily data with covariates for the year range of interpolation (locally projected)
24
  #4) monthly_covar_ghcn_data: ghcn daily data with covariates for the year range of interpolation (locally projected)
25
  
26
  #AUTHOR: Benoit Parmentier                                                                       
27
  #DATE: 03/01/2013                                                                                 
28
  #PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363--     
29
  #Comments and TODO
30
  #-Add buffer option...
31
  #-Add output path argument option
32
  #-Add qc flag options
33
  ##################################################################################################
34
  
35
  ###Loading R library and packages: should it be read in before???   
36
  
37
  library(RPostgreSQL)
38
  library(sp)                                           # Spatial pacakge with class definition by Bivand et al.
39
  library(spdep)                                          # Spatial pacakge with methods and spatial stat. by Bivand et al.
40
  library(rgdal)                                          # GDAL wrapper for R, spatial utilities
41
  library(rgeos)
42
  library(rgdal)
43
  library(raster)
44
  library(rasterVis)
45
  
46
  ### Functions used in the script
47
  
48
  format_s <-function(s_ID){
49
    #Format station ID in a vector format/tuple that is used in a psql query.
50
    # Argument 1: vector of station ID
51
    # Return: character of station ID
52
    tx2<-s_ID
53
    tx2<-as.character(tx2)
54
    stat_list<-tx2
55
    temp<-shQuote(stat_list)
56
    t<-paste(temp, collapse= " ")
57
    t1<-gsub(" ", ",",t)
58
    sf_ID<-paste("(",t1,")",sep="") #vector containing the station ID to query
59
    return(sf_ID)
60
  }
61
  
62
  #parsing input arguments
63
  
64
  db.name <- list_param_prep$db.name             #name of the Postgres database
65
  var <- list_param_prep$var                     #name of the variables to keep: TMIN, TMAX or PRCP
66
  year_start <-list_param_prep$range_years[1] #"2010"               #starting year for the query (included)
67
  year_end <-list_param_prep$range_years[2] #"2011"                 #end year for the query (excluded)
68
  year_start_clim <-list_param_prep$range_years_clim[1] #right bound not included in the range!! starting year for monthly query to calculate clime
69
  infile1<- list_param_prep$infile1  #This is the shape file of outline of the study area                                                      #It is an input/output of the covariate script
70
  infile2<-list_param_prep$infile2      #"/home/layers/data/climate/ghcn/v2.92-upd-2012052822/ghcnd-stations.txt"                              #This is the textfile of station locations from GHCND
71
  infile3<-list_param_prep$infile_covariates #"covariates__venezuela_region__VE_01292013.tif" #this is an output from covariate script
72
  CRS_locs_WGS84<-list_param_prep$CRS_locs_WGS84 #Station coords WGS84: same as earlier
73
  in_path <- list_param_prep$in_path #CRS_locs_WGS84"/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data/"
74
  out_prefix<-list_param_prep$out_prefix #"_365d_GAM_fus5_all_lstd_03012013"                #User defined output prefix
75
  #qc_flags<-list_param_prep$qc_flags    flags allowed for the query from the GHCND??
76
  covar_names<-list_param_prep$covar_names # names should be written in the tif file!!!
77
  
78
  ## working directory is the same for input and output for this function  
79
  setwd(in_path) 
80
  
81
  ##### STEP 1: Select station in the study area
82
  
83
  filename<-sub(".shp","",infile1)             #Removing the extension from file.
84
  interp_area <- readOGR(".",filename)
85
  CRS_interp<-proj4string(interp_area)         #Storing the coordinate information: geographic coordinates longlat WGS84
86
  
87
  #Read in GHCND database station locations
88
  dat_stat <- read.fwf(infile2, 
89
                       widths = c(11,9,10,7,3,31,4,4,6),fill=TRUE)
90
  colnames(dat_stat)<-c("STAT_ID","lat","lon","elev","state","name","GSNF","HCNF","WMOID")
91
  coords<- dat_stat[,c('lon','lat')]
92
  coordinates(dat_stat)<-coords
93
  proj4string(dat_stat)<-CRS_locs_WGS84 #this is the WGS84 projection
94
  #proj4string(dat_stat)<-CRS_interp
95
  dat_stat2<-spTransform(dat_stat,CRS(CRS_interp))         # Project from WGS84 to new coord. system
96
  
97
  # Spatial query to find relevant stations
98
  inside <- !is.na(over(dat_stat2, as(interp_area, "SpatialPolygons")))  #Finding stations contained in the current interpolation area
99
  stat_reg<-dat_stat2[inside,]              #Selecting stations contained in the current interpolation area
100
  
101
  ####
102
  ##TODO: Add buffer option? 
103
  ####
104
  
105
  #### STEP 2: Connecting to the database and query for relevant data 
106
  
107
  drv <- dbDriver("PostgreSQL")
108
  db <- dbConnect(drv, dbname=db.name)
109
  
110
  time1<-proc.time()    #Start stop watch
111
  list_s<-format_s(stat_reg$STAT_ID)
112
  data2<-dbGetQuery(db, paste("SELECT *
113
                              FROM ghcn
114
                              WHERE element=",shQuote(var),
115
                              "AND year>=",year_start,
116
                              "AND year<",year_end,
117
                              "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
118
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
119
  time_minutes<-time_duration[3]/60
120
  dbDisconnect(db)
121
  ###
122
  #Add month query and averages here...
123
  ###
124
  
125
  #data2 contains only 46 stations for Venezueal area??
126
  data_table<-merge(data2,as.data.frame(stat_reg), by.x = "station", by.y = "STAT_ID")
127
  
128
  #Transform the subset data frame in a spatial data frame and reproject
129
  data_reg<-data_table                               #Make a copy of the data frame
130
  coords<- data_reg[c('lon','lat')]              #Define coordinates in a data frame: clean up here!!
131
  #Wrong label...it is in fact projected...
132
  coordinates(data_reg)<-coords                      #Assign coordinates to the data frame
133
  #proj4string(data3)<-locs_coord                  #Assign coordinates reference system in PROJ4 format
134
  proj4string(data_reg)<-CRS_locs_WGS84                #Assign coordinates reference system in PROJ4 format
135
  data_reg<-spTransform(data_reg,CRS(CRS_interp))     #Project from WGS84 to new coord. system
136
  
137
  ##################################################################
138
  ### STEP 3: Save results and outuput in textfile and a shape file
139
  #browser()
140
  #Save shape files of the locations of meteorological stations in the study area
141
  outfile1<-file.path(in_path,paste("stations","_",out_prefix,".shp",sep=""))
142
  writeOGR(stat_reg,dsn= dirname(outfile1),layer= sub(".shp","",basename(outfile1)), driver="ESRI Shapefile",overwrite_layer=TRUE)
143
  #writeOGR(dst,dsn= ".",layer= sub(".shp","",outfile4), driver="ESRI Shapefile",overwrite_layer=TRUE)
144
  
145
  outfile2<-file.path(in_path,paste("ghcn_data_",var,"_",year_start_clim,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file
146
  #writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
147
  writeOGR(data_reg,dsn= dirname(outfile2),layer= sub(".shp","",basename(outfile2)), driver="ESRI Shapefile",overwrite_layer=TRUE)
148
  
149
  ###################################################################
150
  ### STEP 4: Extract values at stations from covariates stack of raster images
151
  #Eventually this step may be skipped if the covariates information is stored in the database...
152
  
153
  s_raster<-stack(infile3)                   #read in the data stack
154
  names(s_raster)<-covar_names               #Assigning names to the raster layers: making sure it is included in the extraction
155
  stat_val<- extract(s_raster, data_reg)        #Extracting values from the raster stack for every point location in coords data frame.
156
  
157
  #create a shape file and data_frame with names
158
  
159
  data_RST<-as.data.frame(stat_val)                                            #This creates a data frame with the values extracted
160
  data_RST_SDF<-cbind(data_reg,data_RST)
161
  coordinates(data_RST_SDF)<-coordinates(data_reg) #Transforming data_RST_SDF into a spatial point dataframe
162
  CRS_reg<-proj4string(data_reg)
163
  proj4string(data_RST_SDF)<-CRS_reg  #Need to assign coordinates...
164
  
165
  #Creating a date column
166
  date1<-ISOdate(data_RST_SDF$year,data_RST_SDF$month,data_RST_SDF$day) #Creating a date object from 3 separate column
167
  date2<-gsub("-","",as.character(as.Date(date1)))
168
  data_RST_SDF$date<-date2                                              #Date format (year,month,day) is the following: "20100627"
169
  
170
  #This allows to change only one name of the data.frame
171
  pos<-match("value",names(data_RST_SDF)) #Find column with name "value"
172
  if (var=="TMAX"){
173
    #names(data_RST_SDF)[pos]<-c("TMax")
174
    data_RST_SDF$value<-data_RST_SDF$value/10                #TMax is the average max temp for monthy data
175
  }
176
  
177
  #write out a new shapefile (including .prj component)
178
  outfile3<-file.path(in_path,paste("daily_covariates_ghcn_data_",var,"_",range_years[1],"_",range_years[2],out_prefix,".shp",sep=""))         #Name of the file
179
  writeOGR(data_RST_SDF,dsn= dirname(outfile3),layer= sub(".shp","",basename(outfile3)), driver="ESRI Shapefile",overwrite_layer=TRUE)
180
  
181
  ###############################################################
182
  ######## STEP 5: Preparing monthly averages from the ProstGres database
183
  
184
  drv <- dbDriver("PostgreSQL")
185
  db <- dbConnect(drv, dbname=db.name)
186
  
187
  #year_start_clim: set at the start of the script
188
  year_end<-2011
189
  time1<-proc.time()    #Start stop watch
190
  list_s<-format_s(stat_reg$STAT_ID)
191
  data_m<-dbGetQuery(db, paste("SELECT *
192
                               FROM ghcn
193
                               WHERE element=",shQuote(var),
194
                               "AND year>=",year_start_clim,
195
                               "AND year<",year_end,
196
                               "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
197
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
198
  time_minutes<-time_duration[3]/60
199
  dbDisconnect(db)
200
  #Clean out this section!!
201
  date1<-ISOdate(data_m$year,data_m$month,data_m$day) #Creating a date object from 3 separate column
202
  date2<-as.POSIXlt(as.Date(date1))
203
  data_m$date<-date2
204
  #In Venezuela and other regions where there are not many stations...mflag==S should be added..see Durenne etal.2010.
205
  #d<-subset(data_m,year>=2000 & mflag=="0" ) #Selecting dataset 2000-2010 with good quality: 193 stations
206
  d<-subset(data_m,mflag=="0" | mflag=="S") #should be input arguments!!
207
  #May need some screeing??? i.e. range of temp and elevation...
208
  d1<-aggregate(value~station+month, data=d, mean)  #Calculate monthly mean for every station in OR
209
  id<-as.data.frame(unique(d1$station))     #Unique station in OR for year 2000-2010: 193 but 7 loss of monthly avg    
210
  
211
  dst<-merge(d1, stat_reg, by.x="station", by.y="STAT_ID")   #Inner join all columns are retained
212
  
213
  #This allows to change only one name of the data.frame
214
  pos<-match("value",names(dst)) #Find column with name "value"
215
  if (var=="TMAX"){
216
    names(dst)[pos]<-c("TMax")
217
    dst$TMax<-dst$TMax/10                #TMax is the average max temp for monthy data
218
  }
219
  
220
  #Extracting covariates from stack for the monthly dataset...
221
  coords<- dst[c('lon','lat')]              #Define coordinates in a data frame
222
  coordinates(dst)<-coords                      #Assign coordinates to the data frame
223
  proj4string(dst)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
224
  dst_month<-spTransform(dst,CRS(CRS_interp))     #Project from WGS84 to new coord. system
225
  
226
  stations_val<-extract(s_raster,dst_month)  #extraction of the infomration at station location
227
  stations_val<-as.data.frame(stations_val)
228
  dst_extract<-cbind(dst_month,stations_val) #this is in sinusoidal from the raster stack
229
  dst<-dst_extract
230
  
231
  coords<- dst[c('x','y')]              #Define coordinates in a data frame, this is the local x,y
232
  coordinates(dst)<-coords                    #Assign coordinates to the data frame
233
  proj4string(dst)<-projection(s_raster)        #Assign coordinates reference system in PROJ4 format
234
  
235
  ####
236
  #write out a new shapefile (including .prj component)
237
  dst$OID<-1:nrow(dst) #need a unique ID?
238
  outfile4<-file.path(in_path,paste("monthly_covariates_ghcn_data_",var,"_",year_start_clim,"_",year_end,out_prefix,".shp",sep=""))  #Name of the file
239
  writeOGR(dst,dsn= dirname(outfile4),layer= sub(".shp","",basename(outfile4)), driver="ESRI Shapefile",overwrite_layer=TRUE)
240
  
241
  ### list of outputs return
242
  
243
  outfiles_obj<-list(outfile1,outfile2,outfile3,outfile4)
244
  names(outfiles_obj)<- c("loc_stations","loc_stations_ghcn","daily_covar_ghcn_data","monthly_covar_ghcn_data")
245
  return(outfiles_obj)
246
  
247
  #END OF FUNCTION # 
248
}
249

    
250
##### END OF SCRIPT ##########
(1-1/37)