1 |
6a5b56da
|
Benoit Parmentier
|
################## Data preparation for interpolation #######################################
|
2 |
|
|
############################ Extraction of station data ##########################################
|
3 |
|
|
#This script perform queries on the Postgres database ghcn for stations matching the
|
4 |
|
|
#interpolation area. It requires the following inputs:
|
5 |
|
|
# 1)the text file ofGHCND stations from NCDC matching the database version release
|
6 |
|
|
# 2)a shape file of the study area with geographic coordinates: lonlat WGS84 #
|
7 |
|
|
# 3)a new coordinate system can be provided as an argument
|
8 |
|
|
# 4)the variable of interest: "TMAX","TMIN" or "PRCP"
|
9 |
|
|
# 5)the location of raser covariate stack.
|
10 |
|
|
#The outputs are text files and a shape file of a time subset of the database
|
11 |
|
|
#AUTHOR: Benoit Parmentier
|
12 |
|
|
#DATE: 02/08/2013
|
13 |
|
|
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363--
|
14 |
|
|
#Comments and TODO
|
15 |
|
|
#-Add buffer option...
|
16 |
|
|
#-Add calculation of monthly mean...
|
17 |
|
|
##################################################################################################
|
18 |
|
|
|
19 |
|
|
###Loading R library and packages
|
20 |
|
|
|
21 |
|
|
library(RPostgreSQL)
|
22 |
|
|
library(sp) # Spatial pacakge with class definition by Bivand et al.
|
23 |
|
|
library(spdep) # Spatial pacakge with methods and spatial stat. by Bivand et al.
|
24 |
|
|
library(rgdal) # GDAL wrapper for R, spatial utilities
|
25 |
|
|
library(rgeos)
|
26 |
|
|
library(rgdal)
|
27 |
|
|
library(raster)
|
28 |
|
|
library(rasterVis)
|
29 |
|
|
|
30 |
|
|
### Parameters and arguments
|
31 |
|
|
|
32 |
|
|
db.name <- "ghcn" #name of the Postgres database
|
33 |
|
|
var <- "TMAX" #name of the variables to keep: TMIN, TMAX or PRCP
|
34 |
|
|
year_start<-"2010" #starting year for the query (included)
|
35 |
|
|
year_end<-"2011" #end year for the query (excluded)
|
36 |
|
|
infile1<- "outline_venezuela_region__VE_01292013.shp" #This is the shape file of outline of the study area. #It is projected alreaday
|
37 |
|
|
infile2<-"ghcnd-stations.txt" #This is the textfile of station locations from GHCND
|
38 |
|
|
infile3<-"covariates__venezuela_region__VE_01292013.tif" #this is an output from covariate script
|
39 |
|
|
|
40 |
|
|
new_proj<-"+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs"
|
41 |
|
|
locs_coord<-CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +towgs84=0,0,0")
|
42 |
|
|
CRS_locs_WGS84<-"+proj=longlat +ellps=WGS84 +datum=WGS84 +towgs84=0,0,0"
|
43 |
|
|
##Paths to inputs and output
|
44 |
|
|
in_path <- "/home/parmentier/Data/benoit_test"
|
45 |
|
|
in_path <- "/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data/"
|
46 |
|
|
out_path<- "/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/output_data/"
|
47 |
|
|
ghcnd_path<- "/home/layers/data/climate/ghcn/v2.92-upd-2012052822"
|
48 |
|
|
setwd(in_path)
|
49 |
|
|
out_suffix<-"y2010_2010_VE_02082013" #User defined output prefix
|
50 |
|
|
out_region_name<-"_venezuela_region"
|
51 |
|
|
#out_suffix<-"_VE_01292013"
|
52 |
|
|
|
53 |
|
|
### Functions used in the script
|
54 |
|
|
|
55 |
|
|
format_s <-function(s_ID){
|
56 |
|
|
#Format station ID in a vector format/tuple that is used in a psql query.
|
57 |
|
|
# Argument 1: vector of station ID
|
58 |
|
|
# Return: character of station ID
|
59 |
|
|
tx2<-s_ID
|
60 |
|
|
tx2<-as.character(tx2)
|
61 |
|
|
stat_list<-tx2
|
62 |
|
|
temp<-shQuote(stat_list)
|
63 |
|
|
t<-paste(temp, collapse= " ")
|
64 |
|
|
t1<-gsub(" ", ",",t)
|
65 |
|
|
sf_ID<-paste("(",t1,")",sep="") #vector containing the station ID to query
|
66 |
|
|
return(sf_ID)
|
67 |
|
|
}
|
68 |
|
|
|
69 |
|
|
############ BEGIN: START OF THE SCRIPT #################
|
70 |
|
|
|
71 |
|
|
##### STEP 1: Select station in the study area
|
72 |
|
|
|
73 |
|
|
filename<-sub(".shp","",infile1) #Removing the extension from file.
|
74 |
|
|
interp_area <- readOGR(".",filename)
|
75 |
|
|
CRS_interp<-proj4string(interp_area) #Storing the coordinate information: geographic coordinates longlat WGS84
|
76 |
|
|
|
77 |
|
|
dat_stat <- read.fwf(file.path(ghcnd_path,"ghcnd-stations.txt"), widths = c(11,9,10,7,3,31,4,4,6),fill=TRUE)
|
78 |
|
|
colnames(dat_stat)<-c("STAT_ID","lat","lon","elev","state","name","GSNF","HCNF","WMOID")
|
79 |
|
|
coords<- dat_stat[,c('lon','lat')]
|
80 |
|
|
coordinates(dat_stat)<-coords
|
81 |
|
|
proj4string(dat_stat)<-locs_coord #this is the WGS84 projection
|
82 |
|
|
#proj4string(dat_stat)<-CRS_interp
|
83 |
|
|
dat_stat2<-spTransform(dat_stat,CRS(new_proj)) # Project from WGS84 to new coord. system
|
84 |
|
|
|
85 |
|
|
# Spatial query to find relevant stations
|
86 |
|
|
inside <- !is.na(over(dat_stat2, as(interp_area, "SpatialPolygons"))) #Finding stations contained in the current interpolation area
|
87 |
|
|
stat_reg<-dat_stat2[inside,] #Selecting stations contained in the current interpolation area
|
88 |
|
|
|
89 |
|
|
#Quick visualization of station locations
|
90 |
|
|
plot(interp_area, axes =TRUE)
|
91 |
|
|
plot(stat_reg, pch=1, col="red", cex= 0.7, add=TRUE)
|
92 |
|
|
#plot(data3,pch=1,col="blue",cex=3,add=TRUE)
|
93 |
|
|
#legend("topleft", pch=1,col="red",bty="n",title= "Stations",cex=1.6)
|
94 |
|
|
#only 357 station for Venezuela??
|
95 |
|
|
|
96 |
|
|
####
|
97 |
|
|
##Add buffer option?
|
98 |
|
|
####
|
99 |
|
|
|
100 |
|
|
#### STEP 2: Connecting to the database and query for relevant data
|
101 |
|
|
|
102 |
|
|
drv <- dbDriver("PostgreSQL")
|
103 |
|
|
db <- dbConnect(drv, dbname=db.name)
|
104 |
|
|
|
105 |
|
|
time1<-proc.time() #Start stop watch
|
106 |
|
|
list_s<-format_s(stat_reg$STAT_ID)
|
107 |
|
|
data2<-dbGetQuery(db, paste("SELECT *
|
108 |
|
|
FROM ghcn
|
109 |
|
|
WHERE element=",shQuote(var),
|
110 |
|
|
"AND year>=",year_start,
|
111 |
|
|
"AND year<",year_end,
|
112 |
|
|
"AND station IN ",list_s,";",sep="")) #Selecting station using a SQL query
|
113 |
|
|
time_duration<-proc.time()-time1 #Time for the query may be long given the size of the database
|
114 |
|
|
time_minutes<-time_duration[3]/60
|
115 |
|
|
|
116 |
|
|
###
|
117 |
|
|
#Add month query and averages here...
|
118 |
|
|
###
|
119 |
|
|
|
120 |
|
|
#data2 contains only 46 stations for Venezueal area??
|
121 |
|
|
data_table<-merge(data2,as.data.frame(stat_reg), by.x = "station", by.y = "STAT_ID")
|
122 |
|
|
|
123 |
|
|
#Transform the subset data frame in a spatial data frame and reproject
|
124 |
|
|
data_reg<-data_table #Make a copy of the data frame
|
125 |
|
|
coords<- data_reg[c('lon','lat')] #Define coordinates in a data frame: clean up here!!
|
126 |
|
|
#Wrong label...it is in fact projected...
|
127 |
|
|
coordinates(data_reg)<-coords #Assign coordinates to the data frame
|
128 |
|
|
#proj4string(data3)<-locs_coord #Assign coordinates reference system in PROJ4 format
|
129 |
|
|
proj4string(data_reg)<-locs_coord #Assign coordinates reference system in PROJ4 format
|
130 |
|
|
data_reg<-spTransform(data_reg,CRS(new_proj)) #Project from WGS84 to new coord. system
|
131 |
|
|
|
132 |
|
|
plot(interp_area, axes =TRUE)
|
133 |
|
|
plot(stat_reg, pch=1, col="red", cex= 0.7, add=TRUE)
|
134 |
|
|
plot(data_reg,pch=2,col="blue",cex=2,add=TRUE)
|
135 |
|
|
|
136 |
|
|
##################################################################
|
137 |
|
|
### STEP 3: Save results and outuput in textfile and a shape file
|
138 |
|
|
|
139 |
|
|
#Save a textfile of the locations of meteorological stations in the study area
|
140 |
|
|
write.table(as.data.frame(stat_reg), file=file.path(in_path,paste("stations",out_region_name,"_",
|
141 |
|
|
out_suffix,".txt",sep="")),sep=",")
|
142 |
|
|
outfile<-paste("stations",out_region_name,"_",
|
143 |
|
|
out_suffix,sep="")
|
144 |
|
|
writeOGR(stat_reg,dsn= ".",layer= outfile, driver="ESRI Shapefile",overwrite_layer=TRUE)
|
145 |
|
|
|
146 |
|
|
outfile<-paste("ghcn_data_",var,out_suffix,sep="") #Name of the file
|
147 |
|
|
#writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
|
148 |
|
|
writeOGR(data_reg,dsn= ".",layer= outfile, driver="ESRI Shapefile",overwrite_layer=TRUE)
|
149 |
|
|
|
150 |
|
|
###################################################################
|
151 |
|
|
### STEP 4: Extract values at stations from covariates stack of raster images
|
152 |
|
|
#Eventually this step may be skipped if the covariates information is stored in the database...
|
153 |
|
|
|
154 |
|
|
#The names of covariates can be changed...
|
155 |
|
|
rnames<-c("x","y","lon","lat","N","E","N_w","E_w","elev","slope","aspect","CANHEIGHT","DISTOC")
|
156 |
|
|
lc_names<-c("LC1","LC2","LC3","LC4","LC5","LC6","LC7","LC8","LC9","LC10","LC11","LC12")
|
157 |
|
|
lst_names<-c("mm_01","mm_02","mm_03","mm_04","mm_05","mm_06","mm_07","mm_08","mm_09","mm_10","mm_11","mm_12",
|
158 |
|
|
"nobs_01","nobs_02","nobs_03","nobs_04","nobs_05","nobs_06","nobs_07","nobs_08",
|
159 |
|
|
"nobs_09","nobs_10","nobs_11","nobs_12")
|
160 |
|
|
|
161 |
|
|
covar_names<-c(rnames,lc_names,lst_names)
|
162 |
|
|
|
163 |
|
|
s_raster<-stack(infile3) #read in the data stack
|
164 |
|
|
names(s_raster)<-covar_names #Assigning names to the raster layers: making sure it is included in the extraction
|
165 |
|
|
stat_val<- extract(s_raster, data_reg) #Extracting values from the raster stack for every point location in coords data frame.
|
166 |
|
|
|
167 |
|
|
#create a shape file and data_frame with names
|
168 |
|
|
|
169 |
|
|
data_RST<-as.data.frame(stat_val) #This creates a data frame with the values extracted
|
170 |
|
|
data_RST_SDF<-cbind(data_reg,data_RST)
|
171 |
|
|
coordinates(data_RST_SDF)<-coordinates(data_reg) #Transforming data_RST_SDF into a spatial point dataframe
|
172 |
|
|
CRS_reg<-proj4string(data_reg)
|
173 |
|
|
proj4string(data_RST_SDF)<-CRS_reg #Need to assign coordinates...
|
174 |
|
|
|
175 |
|
|
#Creating a date column
|
176 |
|
|
date1<-ISOdate(data_RST_SDF$year,data_RST_SDF$month,data_RST_SDF$day) #Creating a date object from 3 separate column
|
177 |
|
|
date2<-gsub("-","",as.character(as.Date(date1)))
|
178 |
|
|
data_RST_SDF$date<-date2 #Date format (year,month,day) is the following: "20100627"
|
179 |
|
|
|
180 |
|
|
#This allows to change only one name of the data.frame
|
181 |
|
|
pos<-match("value",names(data_RST_SDF)) #Find column with name "value"
|
182 |
|
|
if (var=="TMAX"){
|
183 |
|
|
#names(data_RST_SDF)[pos]<-c("TMax")
|
184 |
|
|
data_RST_SDF$value<-data_RST_SDF$value/10 #TMax is the average max temp for monthy data
|
185 |
|
|
}
|
186 |
|
|
|
187 |
|
|
#write out a new shapefile (including .prj component)
|
188 |
|
|
outfile<-paste("daily_covariates_ghcn_data_",var,out_suffix,sep="") #Name of the file
|
189 |
|
|
writeOGR(data_RST_SDF,dsn= ".",layer= outfile, driver="ESRI Shapefile",overwrite_layer=TRUE)
|
190 |
|
|
|
191 |
|
|
###############################################################
|
192 |
|
|
######## STEP 5: Preparing monthly averages from the ProstGres database
|
193 |
|
|
|
194 |
|
|
drv <- dbDriver("PostgreSQL")
|
195 |
|
|
db <- dbConnect(drv, dbname=db.name)
|
196 |
|
|
|
197 |
|
|
year_start<-2000
|
198 |
|
|
year_end<-2011
|
199 |
|
|
time1<-proc.time() #Start stop watch
|
200 |
|
|
list_s<-format_s(stat_reg$STAT_ID)
|
201 |
|
|
data_m<-dbGetQuery(db, paste("SELECT *
|
202 |
|
|
FROM ghcn
|
203 |
|
|
WHERE element=",shQuote(var),
|
204 |
|
|
"AND year>=",year_start,
|
205 |
|
|
"AND year<",year_end,
|
206 |
|
|
"AND station IN ",list_s,";",sep="")) #Selecting station using a SQL query
|
207 |
|
|
time_duration<-proc.time()-time1 #Time for the query may be long given the size of the database
|
208 |
|
|
time_minutes<-time_duration[3]/60
|
209 |
|
|
|
210 |
|
|
# do this work outside of (before) this function
|
211 |
|
|
# to avoid making a copy of the data frame inside the function call
|
212 |
|
|
date1<-ISOdate(data_m$year,data_m$month,data_m$day) #Creating a date object from 3 separate column
|
213 |
|
|
date2<-as.POSIXlt(as.Date(date1))
|
214 |
|
|
data_m$date<-date2
|
215 |
|
|
#In Venezuela and other regions where there are not many stations...mflag==S should be added..see Durenne etal.2010.
|
216 |
|
|
#d<-subset(data_m,year>=2000 & mflag=="0" ) #Selecting dataset 2000-2010 with good quality: 193 stations
|
217 |
|
|
d<-subset(data_m,mflag=="0" | mflag=="S")
|
218 |
|
|
#May need some screeing??? i.e. range of temp and elevation...
|
219 |
|
|
d1<-aggregate(value~station+month, data=d, mean) #Calculate monthly mean for every station in OR
|
220 |
|
|
id<-as.data.frame(unique(d1$station)) #Unique station in OR for year 2000-2010: 193 but 7 loss of monthly avg
|
221 |
|
|
|
222 |
|
|
dst<-merge(d1, stat_reg, by.x="station", by.y="STAT_ID") #Inner join all columns are retained
|
223 |
|
|
|
224 |
|
|
#This allows to change only one name of the data.frame
|
225 |
|
|
pos<-match("value",names(dst)) #Find column with name "value"
|
226 |
|
|
if (var=="TMAX"){
|
227 |
|
|
names(dst)[pos]<-c("TMax")
|
228 |
|
|
dst$TMax<-dst$TMax/10 #TMax is the average max temp for monthy data
|
229 |
|
|
}
|
230 |
|
|
#dstjan=dst[dst$month==9,] #dst contains the monthly averages for tmax for every station over 2000-2010
|
231 |
|
|
|
232 |
|
|
#Extracting covariates from stack for the monthly dataset...
|
233 |
|
|
coords<- dst[c('lon','lat')] #Define coordinates in a data frame
|
234 |
|
|
coordinates(dst)<-coords #Assign coordinates to the data frame
|
235 |
|
|
proj4string(dst)<-CRS_locs_WGS84 #Assign coordinates reference system in PROJ4 format
|
236 |
|
|
dst_month<-spTransform(dst,CRS(CRS_interp)) #Project from WGS84 to new coord. system
|
237 |
|
|
|
238 |
|
|
stations_val<-extract(s_raster,dst_month) #extraction of the infomration at station location
|
239 |
|
|
stations_val<-as.data.frame(stations_val)
|
240 |
|
|
dst_extract<-cbind(dst_month,stations_val) #this is in sinusoidal from the raster stack
|
241 |
|
|
dst<-dst_extract
|
242 |
|
|
|
243 |
|
|
coords<- dst[c('x','y')] #Define coordinates in a data frame, this is the local x,y
|
244 |
|
|
coordinates(dst)<-coords #Assign coordinates to the data frame
|
245 |
|
|
proj4string(dst)<-projection(s_raster) #Assign coordinates reference system in PROJ4 format
|
246 |
|
|
|
247 |
|
|
####
|
248 |
|
|
#write out a new shapefile (including .prj component)
|
249 |
|
|
outfile<-paste("monthly_covariates_ghcn_data_",var,out_suffix,sep="") #Name of the file
|
250 |
|
|
dst$OID<-1:nrow(dst) #need a unique ID?
|
251 |
|
|
writeOGR(dst,dsn= ".",layer= outfile, driver="ESRI Shapefile",overwrite_layer=TRUE)
|
252 |
|
|
|
253 |
|
|
##### END OF SCRIPT ##########
|