Revision 4f0db866
Added by Benoit Parmentier about 10 years ago
climate/research/oregon/interpolation/Database_stations_covariates_processing_function.R | ||
---|---|---|
19 | 19 |
# 12) qc_flags_stations: flag used to screen out at this stage only two values... |
20 | 20 |
# 13) out_prefix: output suffix added to output names--it is the same in the interpolation script |
21 | 21 |
# |
22 |
# |
|
23 |
# 14) |
|
24 |
# 15).. |
|
22 | 25 |
#The output is a list of four shapefile names produced by the function: |
23 | 26 |
#1) loc_stations: locations of stations as shapefile in EPSG 4326 |
24 | 27 |
#2) loc_stations_ghcn: ghcn daily data for the year range of interpolation (locally projected) |
... | ... | |
82 | 85 |
qc_flags_stations <- list_param_prep$qc_flags_stations #flags allowed for the query from the GHCND?? |
83 | 86 |
out_prefix<-list_param_prep$out_prefix #"_365d_GAM_fus5_all_lstd_03012013" #User defined output prefix |
84 | 87 |
|
88 |
## New parameters added for sub samplineg in areas with important density of meteo stations |
|
89 |
sub_sampling <- list_param$sub_sampling #if TRUE then monthly stations data are resampled |
|
90 |
sub_sample_rnd <- list_param$sub_sample_rnd #if TRUE use random sampling in addition to spatial sub-sampling |
|
91 |
target_range_nb <- list_param$target_range_nb # number of stations desired as min and max, convergence to min for now |
|
92 |
dist_range <- list_param$dist_range #distance range for pruning, usually (0,5) in km or 0,0.009*5 for degreee |
|
93 |
step_dist <- list_param$step_dist #stepping distance used in pruning spatially, use 1km or 0.009 for degree data |
|
94 |
|
|
85 | 95 |
## working directory is the same for input and output for this function |
86 | 96 |
#setwd(in_path) |
87 | 97 |
setwd(out_path) |
88 | 98 |
##### STEP 1: Select station in the study area |
89 |
|
|
90 |
filename<-sub(".shp","",infile_reg_outline) #Removing the extension from file. |
|
99 |
|
|
100 |
filename<-sub(".shp","",fixed=TRUE,infile_reg_outline) #Removing the extension from file.
|
|
91 | 101 |
interp_area <- readOGR(dsn=dirname(filename),basename(filename)) |
92 | 102 |
CRS_interp<-proj4string(interp_area) #Storing the coordinate information: geographic coordinates longlat WGS84 |
93 | 103 |
|
94 | 104 |
#Read in GHCND database station locations |
95 | 105 |
dat_stat <- read.fwf(infile_ghncd_data, |
96 | 106 |
widths = c(11,9,10,7,3,31,4,4,6),fill=TRUE) |
107 |
|
|
97 | 108 |
colnames(dat_stat)<-c("STAT_ID","latitude","longitude","elev","state","name","GSNF","HCNF","WMOID") |
98 | 109 |
coords<- dat_stat[,c('longitude','latitude')] |
99 | 110 |
coordinates(dat_stat)<-coords |
100 | 111 |
proj4string(dat_stat)<-CRS_locs_WGS84 #this is the WGS84 projection |
101 | 112 |
#proj4string(dat_stat)<-CRS_interp |
102 | 113 |
interp_area_WGS84 <-spTransform(interp_area,CRS_locs_WGS84) # Project from WGS84 to new coord. system |
103 |
|
|
104 | 114 |
# Spatial query to find relevant stations |
105 |
|
|
115 |
|
|
106 | 116 |
inside <- !is.na(over(dat_stat, as(interp_area_WGS84, "SpatialPolygons"))) #Finding stations contained in the current interpolation area |
107 | 117 |
stat_reg<-dat_stat[inside,] #Selecting stations contained in the current interpolation area |
108 | 118 |
|
... | ... | |
111 | 121 |
#### |
112 | 122 |
|
113 | 123 |
#### STEP 2: Connecting to the database and query for relevant data |
114 |
|
|
124 |
|
|
115 | 125 |
drv <- dbDriver("PostgreSQL") |
116 | 126 |
db <- dbConnect(drv, dbname=db.name) |
117 |
|
|
127 |
|
|
118 | 128 |
time1<-proc.time() #Start stop watch |
119 | 129 |
list_s<-format_s(stat_reg$STAT_ID) |
120 | 130 |
data2<-dbGetQuery(db, paste("SELECT * |
... | ... | |
125 | 135 |
"AND station IN ",list_s,";",sep="")) #Selecting station using a SQL query |
126 | 136 |
time_duration<-proc.time()-time1 #Time for the query may be long given the size of the database |
127 | 137 |
time_minutes<-time_duration[3]/60 |
138 |
print(time_minutes) |
|
128 | 139 |
dbDisconnect(db) |
129 |
|
|
140 |
|
|
130 | 141 |
data_table<-merge(data2,as.data.frame(stat_reg), by.x = "station", by.y = "STAT_ID") |
131 |
|
|
142 |
|
|
132 | 143 |
#Transform the subset data frame in a spatial data frame and reproject |
133 | 144 |
data_reg<-data_table #Make a copy of the data frame |
134 | 145 |
coords<- data_reg[c('longitude','latitude')] #Define coordinates in a data frame: clean up here!! |
135 | 146 |
coordinates(data_reg)<-coords #Assign coordinates to the data frame |
136 | 147 |
proj4string(data_reg)<-CRS_locs_WGS84 #Assign coordinates reference system in PROJ4 format |
137 |
data_reg<-spTransform(data_reg,CRS(CRS_interp)) #Project from WGS84 to new coord. system |
|
148 |
#data_reg<-spTransform(data_reg,CRS(CRS_interp)) #Project from WGS84 to new coord. system
|
|
138 | 149 |
|
139 | 150 |
data_d <-data_reg #data_d: daily data containing the query without screening |
140 | 151 |
#data_reg <-subset(data_d,mflag=="0" | mflag=="S") #should be input arguments!! |
... | ... | |
142 | 153 |
|
143 | 154 |
data_reg <-subset(data_d, mflag %in% qc_flags_stations) #screening using flags |
144 | 155 |
#data_reg2 <-subset(data_d,mflag==qc_flags_stations[1] | mflag==qc_flags_stations[2]) #screening using flags |
145 |
|
|
156 |
|
|
146 | 157 |
################################################################## |
147 | 158 |
### STEP 3: Save results and outuput in textfile and a shape file |
148 | 159 |
#browser() |
... | ... | |
151 | 162 |
outfile1<-file.path(out_path,paste("stations","_",out_prefix,".shp",sep="")) |
152 | 163 |
writeOGR(stat_reg,dsn= dirname(outfile1),layer= sub(".shp","",basename(outfile1)), driver="ESRI Shapefile",overwrite_layer=TRUE) |
153 | 164 |
#writeOGR(dst,dsn= ".",layer= sub(".shp","",outfile4), driver="ESRI Shapefile",overwrite_layer=TRUE) |
154 |
|
|
165 |
#Also save as rds |
|
166 |
outfile1_rds<-sub(".shp",".rds",outfile1) |
|
167 |
saveRDS(stat_reg,outfile1) |
|
168 |
|
|
155 | 169 |
outfile2<-file.path(out_path,paste("ghcn_data_query_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep="")) #Name of the file |
156 | 170 |
#writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension |
157 | 171 |
writeOGR(data_d,dsn= dirname(outfile2),layer= sub(".shp","",basename(outfile2)), driver="ESRI Shapefile",overwrite_layer=TRUE) |
158 |
|
|
172 |
outfile2_rds<-sub(".shp",".rds",outfile2) |
|
173 |
saveRDS(data_d,outfile2_rds) |
|
174 |
|
|
159 | 175 |
outfile3<-file.path(out_path,paste("ghcn_data_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep="")) #Name of the file |
160 | 176 |
#writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension |
161 | 177 |
writeOGR(data_reg,dsn= dirname(outfile3),layer= sub(".shp","",basename(outfile3)), driver="ESRI Shapefile",overwrite_layer=TRUE) |
162 |
|
|
178 |
outfile3_rds<-sub(".shp",".rds",outfile3) |
|
179 |
saveRDS(data_reg,outfile3_rds) |
|
180 |
|
|
163 | 181 |
################################################################### |
164 | 182 |
### STEP 4: Extract values at stations from covariates stack of raster images |
165 | 183 |
#Eventually this step may be skipped if the covariates information is stored in the database... |
166 |
|
|
167 | 184 |
#s_raster<-stack(file.path(in_path,infile_covariates)) #read in the data stack |
168 | 185 |
s_raster<-brick(infile_covariates) #read in the data stack |
169 | 186 |
names(s_raster)<-covar_names #Assigning names to the raster layers: making sure it is included in the extraction |
... | ... | |
171 | 188 |
#stat_val_test<- extract(s_raster, data_reg,def=TRUE) |
172 | 189 |
|
173 | 190 |
#create a shape file and data_frame with names |
174 |
|
|
175 | 191 |
data_RST<-as.data.frame(stat_val) #This creates a data frame with the values extracted |
176 | 192 |
data_RST_SDF<-cbind(data_reg,data_RST) |
193 |
|
|
177 | 194 |
coordinates(data_RST_SDF)<-coordinates(data_reg) #Transforming data_RST_SDF into a spatial point dataframe |
178 | 195 |
CRS_reg<-proj4string(data_reg) |
179 | 196 |
proj4string(data_RST_SDF)<-CRS_reg #Need to assign coordinates... |
180 |
|
|
197 |
|
|
181 | 198 |
#Creating a date column |
182 | 199 |
date1<-ISOdate(data_RST_SDF$year,data_RST_SDF$month,data_RST_SDF$day) #Creating a date object from 3 separate column |
183 | 200 |
date2<-gsub("-","",as.character(as.Date(date1))) |
184 | 201 |
data_RST_SDF$date<-date2 #Date format (year,month,day) is the following: "20100627" |
185 |
|
|
202 |
|
|
186 | 203 |
#This allows to change only one name of the data.frame |
187 | 204 |
pos<-match("value",names(data_RST_SDF)) #Find column with name "value" |
188 | 205 |
if (var=="TMAX"){ |
... | ... | |
198 | 215 |
#write out a new shapefile (including .prj component) |
199 | 216 |
outfile4<-file.path(out_path,paste("daily_covariates_ghcn_data_",var,"_",range_years[1],"_",range_years[2],out_prefix,".shp",sep="")) #Name of the file |
200 | 217 |
writeOGR(data_RST_SDF,dsn= dirname(outfile4),layer= sub(".shp","",basename(outfile4)), driver="ESRI Shapefile",overwrite_layer=TRUE) |
201 |
|
|
218 |
outfile4_rds<-sub(".shp",".rds",outfile4) |
|
219 |
saveRDS(data_RST_SDF,outfile4_rds) |
|
220 |
|
|
202 | 221 |
############################################################### |
203 | 222 |
######## STEP 5: Preparing monthly averages from the ProstGres database |
204 |
|
|
205 | 223 |
drv <- dbDriver("PostgreSQL") |
206 | 224 |
db <- dbConnect(drv, dbname=db.name) |
207 | 225 |
|
208 | 226 |
#year_start_clim: set at the start of the script |
209 | 227 |
time1<-proc.time() #Start stop watch |
210 | 228 |
list_s<-format_s(stat_reg$STAT_ID) |
229 |
|
|
211 | 230 |
data_m<-dbGetQuery(db, paste("SELECT * |
212 | 231 |
FROM ghcn |
213 | 232 |
WHERE element=",shQuote(var), |
214 | 233 |
"AND year>=",year_start_clim, |
215 |
"AND year<",year_end_clim, |
|
216 | 234 |
"AND station IN ",list_s,";",sep="")) #Selecting station using a SQL query |
217 | 235 |
time_duration<-proc.time()-time1 #Time for the query may be long given the size of the database |
218 | 236 |
time_minutes<-time_duration[3]/60 |
237 |
print(time_minutes) |
|
219 | 238 |
dbDisconnect(db) |
239 |
|
|
220 | 240 |
#Clean out this section!! |
221 | 241 |
date1<-ISOdate(data_m$year,data_m$month,data_m$day) #Creating a date object from 3 separate column |
222 | 242 |
date2<-as.POSIXlt(as.Date(date1)) |
... | ... | |
231 | 251 |
outfile5<-file.path(out_path,paste("monthly_query_ghcn_data_",var,"_",year_start_clim,"_",year_end_clim,out_prefix,".shp",sep="")) #Name of the file |
232 | 252 |
writeOGR(data_m,dsn= dirname(outfile5),layer= sub(".shp","",basename(outfile5)), driver="ESRI Shapefile",overwrite_layer=TRUE) |
233 | 253 |
|
254 |
outfile5_rds<-sub(".shp",".rds",outfile5) |
|
255 |
saveRDS(data_m,outfile5_rds) |
|
256 |
|
|
234 | 257 |
#In Venezuela and other regions where there are not many stations...mflag==S should be added..see Durenne etal.2010. |
235 | 258 |
|
236 | 259 |
#d<-subset(data_m,mflag==qc_flags_stations[1] | mflag==qc_flags_stations[2]) |
237 |
d<-subset(data_m,mflag %in% qc_flags_stations) |
|
260 |
d<-subset(data_m,mflag %in% qc_flags_stations)
|
|
238 | 261 |
|
239 | 262 |
#Add screening here ...May need some screeing??? i.e. range of temp and elevation... |
240 | 263 |
|
... | ... | |
261 | 284 |
#Extracting covariates from stack for the monthly dataset... |
262 | 285 |
#names(dst)[5:6] <-c('latitude','longitude') |
263 | 286 |
coords<- dst[c('longitude','latitude')] #Define coordinates in a data frame |
264 |
|
|
287 |
|
|
265 | 288 |
coordinates(dst)<-coords #Assign coordinates to the data frame |
266 | 289 |
proj4string(dst)<-CRS_locs_WGS84 #Assign coordinates reference system in PROJ4 format |
267 | 290 |
dst_month<-spTransform(dst,CRS(CRS_interp)) #Project from WGS84 to new coord. system |
... | ... | |
283 | 306 |
#Covariates ok since screening done in covariate script |
284 | 307 |
#screening on var i.e. value, TMIN, TMAX... |
285 | 308 |
|
286 |
#### |
|
309 |
#### Adding subsampling for regions that have too many stations...
|
|
287 | 310 |
|
311 |
#This must be set up in master script |
|
312 |
#target_max_nb <- 100,000 #this is not actually used yet in the current implementation,can be set to very high value... |
|
313 |
#target_min_nb <- 8,000 #this is the target number of stations we would like: to be set by Alberto... |
|
314 |
##max_dist <- 1000 # the maximum distance used for pruning ie removes stations that are closer than 1000m, this in degree...? |
|
315 |
#max_dist <- 0.009*5 #5km in degree |
|
316 |
#min_dist <- 0 #minimum distance to start with |
|
317 |
#step_dist <- 0.009 #iteration step to remove the stations |
|
318 |
|
|
319 |
#test5 <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_nb,dist_range=dist_range,step_dist=step_dist,data_in=data_month,sampling=T,combined=F) |
|
320 |
if(sub_sampling==TRUE){ |
|
321 |
sub_sampling_obj <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_nb,dist_range=dist_range,step_dist=step_dist,data_in=data_month,sampling=T,combined=F) |
|
322 |
dst <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations |
|
323 |
#save the information for later use (validation at monthly step!!) |
|
324 |
save(sub_sampling_obj,file= file.path(out_path,paste("sub_sampling_obj_",interpolation_method,"_", out_prefix,".RData",sep=""))) |
|
325 |
} |
|
326 |
|
|
288 | 327 |
#### |
289 | 328 |
#write out a new shapefile (including .prj component) |
290 | 329 |
dst$OID<-1:nrow(dst) #need a unique ID? |
291 | 330 |
outfile6<-file.path(out_path,paste("monthly_covariates_ghcn_data_",var,"_",year_start_clim,"_",year_end_clim,out_prefix,".shp",sep="")) #Name of the file |
292 | 331 |
writeOGR(dst,dsn= dirname(outfile6),layer= sub(".shp","",basename(outfile6)), driver="ESRI Shapefile",overwrite_layer=TRUE) |
293 | 332 |
|
294 |
### list of outputs return |
|
295 |
|
|
333 |
outfile6_rds<-sub(".shp",".rds",outfile6) |
|
334 |
saveRDS(dst,outfile6_rds) |
|
335 |
|
|
296 | 336 |
outfiles_obj<-list(outfile1,outfile2,outfile3,outfile4,outfile5,outfile6) |
297 | 337 |
names(outfiles_obj)<- c("loc_stations","loc_stations_ghcn","daily_query_ghcn_data","daily_covar_ghcn_data","monthly_query_ghcn_data","monthly_covar_ghcn_data") |
298 | 338 |
save(outfiles_obj,file= file.path(out_path,paste("met_stations_outfiles_obj_",interpolation_method,"_", out_prefix,".RData",sep=""))) |
Also available in: Unified diff
subampling added to Database preparation script for NEX run