Project

General

Profile

« Previous | Next » 

Revision 4f0db866

Added by Benoit Parmentier about 10 years ago

subampling added to Database preparation script for NEX run

View differences:

climate/research/oregon/interpolation/Database_stations_covariates_processing_function.R
19 19
  # 12) qc_flags_stations: flag used to screen out at this stage only two values...
20 20
  # 13) out_prefix: output suffix added to output names--it is the same in the interpolation script
21 21
  #
22
  # 
23
  # 14)
24
  # 15)..
22 25
  #The output is a list of four shapefile names produced by the function:
23 26
  #1) loc_stations: locations of stations as shapefile in EPSG 4326
24 27
  #2) loc_stations_ghcn: ghcn daily data for the year range of interpolation (locally projected)
......
82 85
  qc_flags_stations <- list_param_prep$qc_flags_stations #flags allowed for the query from the GHCND??
83 86
  out_prefix<-list_param_prep$out_prefix #"_365d_GAM_fus5_all_lstd_03012013"                #User defined output prefix
84 87
  
88
  ## New parameters added for  sub samplineg in areas with important density of meteo stations
89
  sub_sampling <- list_param$sub_sampling  #if TRUE then monthly stations data are resampled
90
  sub_sample_rnd <- list_param$sub_sample_rnd #if  TRUE use random sampling  in addition to spatial  sub-sampling
91
  target_range_nb <- list_param$target_range_nb # number of stations desired as min and max, convergence to  min  for  now
92
  dist_range <- list_param$dist_range #distance range  for pruning,  usually (0,5) in km or 0,0.009*5 for  degreee
93
  step_dist <- list_param$step_dist #stepping distance used in pruning  spatially, use 1km or 0.009 for degree data
94

  
85 95
  ## working directory is the same for input and output for this function  
86 96
  #setwd(in_path) 
87 97
  setwd(out_path)
88 98
  ##### STEP 1: Select station in the study area
89
  
90
  filename<-sub(".shp","",infile_reg_outline)             #Removing the extension from file.
99
    
100
  filename<-sub(".shp","",fixed=TRUE,infile_reg_outline)             #Removing the extension from file.
91 101
  interp_area <- readOGR(dsn=dirname(filename),basename(filename))
92 102
  CRS_interp<-proj4string(interp_area)         #Storing the coordinate information: geographic coordinates longlat WGS84
93 103
  
94 104
  #Read in GHCND database station locations
95 105
  dat_stat <- read.fwf(infile_ghncd_data, 
96 106
                       widths = c(11,9,10,7,3,31,4,4,6),fill=TRUE)
107

  
97 108
  colnames(dat_stat)<-c("STAT_ID","latitude","longitude","elev","state","name","GSNF","HCNF","WMOID")
98 109
  coords<- dat_stat[,c('longitude','latitude')]
99 110
  coordinates(dat_stat)<-coords
100 111
  proj4string(dat_stat)<-CRS_locs_WGS84 #this is the WGS84 projection
101 112
  #proj4string(dat_stat)<-CRS_interp
102 113
  interp_area_WGS84 <-spTransform(interp_area,CRS_locs_WGS84)         # Project from WGS84 to new coord. system
103
  
104 114
  # Spatial query to find relevant stations
105
  
115

  
106 116
  inside <- !is.na(over(dat_stat, as(interp_area_WGS84, "SpatialPolygons")))  #Finding stations contained in the current interpolation area
107 117
  stat_reg<-dat_stat[inside,]              #Selecting stations contained in the current interpolation area
108 118
  
......
111 121
  ####
112 122
  
113 123
  #### STEP 2: Connecting to the database and query for relevant data 
114
  
124

  
115 125
  drv <- dbDriver("PostgreSQL")
116 126
  db <- dbConnect(drv, dbname=db.name)
117
  
127

  
118 128
  time1<-proc.time()    #Start stop watch
119 129
  list_s<-format_s(stat_reg$STAT_ID)
120 130
  data2<-dbGetQuery(db, paste("SELECT *
......
125 135
                              "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
126 136
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
127 137
  time_minutes<-time_duration[3]/60
138
  print(time_minutes)
128 139
  dbDisconnect(db)
129

  
140
 
130 141
  data_table<-merge(data2,as.data.frame(stat_reg), by.x = "station", by.y = "STAT_ID")
131
  
142

  
132 143
  #Transform the subset data frame in a spatial data frame and reproject
133 144
  data_reg<-data_table                               #Make a copy of the data frame
134 145
  coords<- data_reg[c('longitude','latitude')]              #Define coordinates in a data frame: clean up here!!
135 146
  coordinates(data_reg)<-coords                      #Assign coordinates to the data frame
136 147
  proj4string(data_reg)<-CRS_locs_WGS84                #Assign coordinates reference system in PROJ4 format
137
  data_reg<-spTransform(data_reg,CRS(CRS_interp))     #Project from WGS84 to new coord. system
148
  #data_reg<-spTransform(data_reg,CRS(CRS_interp))     #Project from WGS84 to new coord. system
138 149
  
139 150
  data_d <-data_reg  #data_d: daily data containing the query without screening
140 151
  #data_reg <-subset(data_d,mflag=="0" | mflag=="S") #should be input arguments!!
......
142 153
  
143 154
  data_reg <-subset(data_d, mflag %in% qc_flags_stations) #screening using flags
144 155
  #data_reg2 <-subset(data_d,mflag==qc_flags_stations[1] | mflag==qc_flags_stations[2]) #screening using flags
145
  
156

  
146 157
  ##################################################################
147 158
  ### STEP 3: Save results and outuput in textfile and a shape file
148 159
  #browser()
......
151 162
  outfile1<-file.path(out_path,paste("stations","_",out_prefix,".shp",sep=""))
152 163
  writeOGR(stat_reg,dsn= dirname(outfile1),layer= sub(".shp","",basename(outfile1)), driver="ESRI Shapefile",overwrite_layer=TRUE)
153 164
  #writeOGR(dst,dsn= ".",layer= sub(".shp","",outfile4), driver="ESRI Shapefile",overwrite_layer=TRUE)
154
  
165
  #Also save as rds
166
  outfile1_rds<-sub(".shp",".rds",outfile1)
167
  saveRDS(stat_reg,outfile1)
168

  
155 169
  outfile2<-file.path(out_path,paste("ghcn_data_query_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file
156 170
  #writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
157 171
  writeOGR(data_d,dsn= dirname(outfile2),layer= sub(".shp","",basename(outfile2)), driver="ESRI Shapefile",overwrite_layer=TRUE)
158
  
172
  outfile2_rds<-sub(".shp",".rds",outfile2)
173
  saveRDS(data_d,outfile2_rds) 
174

  
159 175
  outfile3<-file.path(out_path,paste("ghcn_data_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file
160 176
  #writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
161 177
  writeOGR(data_reg,dsn= dirname(outfile3),layer= sub(".shp","",basename(outfile3)), driver="ESRI Shapefile",overwrite_layer=TRUE)
162
  
178
  outfile3_rds<-sub(".shp",".rds",outfile3)
179
  saveRDS(data_reg,outfile3_rds)
180

  
163 181
  ###################################################################
164 182
  ### STEP 4: Extract values at stations from covariates stack of raster images
165 183
  #Eventually this step may be skipped if the covariates information is stored in the database...
166
  
167 184
  #s_raster<-stack(file.path(in_path,infile_covariates))                   #read in the data stack
168 185
  s_raster<-brick(infile_covariates)                   #read in the data stack
169 186
  names(s_raster)<-covar_names               #Assigning names to the raster layers: making sure it is included in the extraction
......
171 188
  #stat_val_test<- extract(s_raster, data_reg,def=TRUE)
172 189
  
173 190
  #create a shape file and data_frame with names
174
  
175 191
  data_RST<-as.data.frame(stat_val)                                            #This creates a data frame with the values extracted
176 192
  data_RST_SDF<-cbind(data_reg,data_RST)
193

  
177 194
  coordinates(data_RST_SDF)<-coordinates(data_reg) #Transforming data_RST_SDF into a spatial point dataframe
178 195
  CRS_reg<-proj4string(data_reg)
179 196
  proj4string(data_RST_SDF)<-CRS_reg  #Need to assign coordinates...
180
  
197

  
181 198
  #Creating a date column
182 199
  date1<-ISOdate(data_RST_SDF$year,data_RST_SDF$month,data_RST_SDF$day) #Creating a date object from 3 separate column
183 200
  date2<-gsub("-","",as.character(as.Date(date1)))
184 201
  data_RST_SDF$date<-date2                                              #Date format (year,month,day) is the following: "20100627"
185
  
202
 
186 203
  #This allows to change only one name of the data.frame
187 204
  pos<-match("value",names(data_RST_SDF)) #Find column with name "value"
188 205
  if (var=="TMAX"){
......
198 215
  #write out a new shapefile (including .prj component)
199 216
  outfile4<-file.path(out_path,paste("daily_covariates_ghcn_data_",var,"_",range_years[1],"_",range_years[2],out_prefix,".shp",sep=""))         #Name of the file
200 217
  writeOGR(data_RST_SDF,dsn= dirname(outfile4),layer= sub(".shp","",basename(outfile4)), driver="ESRI Shapefile",overwrite_layer=TRUE)
201
  
218
  outfile4_rds<-sub(".shp",".rds",outfile4)
219
  saveRDS(data_RST_SDF,outfile4_rds)  
220

  
202 221
  ###############################################################
203 222
  ######## STEP 5: Preparing monthly averages from the ProstGres database
204
  
205 223
  drv <- dbDriver("PostgreSQL")
206 224
  db <- dbConnect(drv, dbname=db.name)
207 225
  
208 226
  #year_start_clim: set at the start of the script
209 227
  time1<-proc.time()    #Start stop watch
210 228
  list_s<-format_s(stat_reg$STAT_ID)
229
 
211 230
  data_m<-dbGetQuery(db, paste("SELECT *
212 231
                               FROM ghcn
213 232
                               WHERE element=",shQuote(var),
214 233
                               "AND year>=",year_start_clim,
215
                               "AND year<",year_end_clim,
216 234
                               "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
217 235
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
218 236
  time_minutes<-time_duration[3]/60
237
  print(time_minutes)
219 238
  dbDisconnect(db)
239
  
220 240
  #Clean out this section!!
221 241
  date1<-ISOdate(data_m$year,data_m$month,data_m$day) #Creating a date object from 3 separate column
222 242
  date2<-as.POSIXlt(as.Date(date1))
......
231 251
  outfile5<-file.path(out_path,paste("monthly_query_ghcn_data_",var,"_",year_start_clim,"_",year_end_clim,out_prefix,".shp",sep=""))  #Name of the file
232 252
  writeOGR(data_m,dsn= dirname(outfile5),layer= sub(".shp","",basename(outfile5)), driver="ESRI Shapefile",overwrite_layer=TRUE)
233 253
  
254
  outfile5_rds<-sub(".shp",".rds",outfile5)
255
  saveRDS(data_m,outfile5_rds) 
256

  
234 257
  #In Venezuela and other regions where there are not many stations...mflag==S should be added..see Durenne etal.2010.
235 258

  
236 259
  #d<-subset(data_m,mflag==qc_flags_stations[1] | mflag==qc_flags_stations[2])
237
  d<-subset(data_m,mflag %in% qc_flags_stations)
260
  d<-subset(data_m,mflag %in% qc_flags_stations) 
238 261
  
239 262
  #Add screening here ...May need some screeing??? i.e. range of temp and elevation...
240 263
  
......
261 284
  #Extracting covariates from stack for the monthly dataset...
262 285
  #names(dst)[5:6] <-c('latitude','longitude')
263 286
  coords<- dst[c('longitude','latitude')]              #Define coordinates in a data frame
264
  
287

  
265 288
  coordinates(dst)<-coords                      #Assign coordinates to the data frame
266 289
  proj4string(dst)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
267 290
  dst_month<-spTransform(dst,CRS(CRS_interp))     #Project from WGS84 to new coord. system
......
283 306
  #Covariates ok since screening done in covariate script
284 307
  #screening on var i.e. value, TMIN, TMAX...
285 308
  
286
  ####
309
  #### Adding  subsampling for regions  that  have  too  many stations...
287 310
  
311
  #This must be set up in master script
312
  #target_max_nb <- 100,000 #this is not actually used yet in the current implementation,can be set to very high value...
313
  #target_min_nb <- 8,000 #this is the target number of stations we would like: to be set by Alberto...
314
  ##max_dist <- 1000 # the maximum distance used for pruning ie removes stations that are closer than 1000m, this in degree...? 
315
  #max_dist <- 0.009*5 #5km in degree
316
  #min_dist <- 0    #minimum distance to start with
317
  #step_dist <- 0.009 #iteration step to remove the stations
318

  
319
  #test5 <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_nb,dist_range=dist_range,step_dist=step_dist,data_in=data_month,sampling=T,combined=F)
320
  if(sub_sampling==TRUE){
321
    sub_sampling_obj <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_nb,dist_range=dist_range,step_dist=step_dist,data_in=data_month,sampling=T,combined=F)
322
    dst <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations
323
    #save the information for later use (validation at monthly step!!)
324
    save(sub_sampling_obj,file= file.path(out_path,paste("sub_sampling_obj_",interpolation_method,"_", out_prefix,".RData",sep="")))
325
  }
326
 
288 327
  ####
289 328
  #write out a new shapefile (including .prj component)
290 329
  dst$OID<-1:nrow(dst) #need a unique ID?
291 330
  outfile6<-file.path(out_path,paste("monthly_covariates_ghcn_data_",var,"_",year_start_clim,"_",year_end_clim,out_prefix,".shp",sep=""))  #Name of the file
292 331
  writeOGR(dst,dsn= dirname(outfile6),layer= sub(".shp","",basename(outfile6)), driver="ESRI Shapefile",overwrite_layer=TRUE)
293 332
  
294
  ### list of outputs return
295
  
333
  outfile6_rds<-sub(".shp",".rds",outfile6)
334
  saveRDS(dst,outfile6_rds)
335

  
296 336
  outfiles_obj<-list(outfile1,outfile2,outfile3,outfile4,outfile5,outfile6)
297 337
  names(outfiles_obj)<- c("loc_stations","loc_stations_ghcn","daily_query_ghcn_data","daily_covar_ghcn_data","monthly_query_ghcn_data","monthly_covar_ghcn_data")
298 338
  save(outfiles_obj,file= file.path(out_path,paste("met_stations_outfiles_obj_",interpolation_method,"_", out_prefix,".RData",sep="")))

Also available in: Unified diff