Project

General

Profile

« Previous | Next » 

Revision 907d8e86

Added by Benoit Parmentier about 10 years ago

subsampling script, major modifications to solve issues related to North America runs

View differences:

climate/research/oregon/interpolation/subsampling_data.R
5 5
#
6 6
#AUTHOR: Benoit Parmentier                                                                      
7 7
#CREATED ON: 10/16/2014            
8
#MODIFIED ON: 10/27/2014            
8
#MODIFIED ON: 01/06/2015            
9 9
#Version: 1
10 10
#
11 11
#PROJECT: Environmental Layers project  NCEAS-NASA
......
47 47
function_analyses_paper1 <- "contribution_of_covariates_paper_interpolation_functions_07182014.R" #first interp paper
48 48
function_analyses_paper2 <- "multi_timescales_paper_interpolation_functions_10062014.R"
49 49

  
50
sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist_val=0.0,max_dist=NULL,step,data_in){
50
sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist_val=0.0,max_dist=NULL,step_val,data_in){
51 51
  #Function to select stations data that are outside a specific spatial range from each other
52 52
  #Parameters:
53 53
  #max_dist: maximum spatial distance at which to stop the pruning
54 54
  #min_dist: minimum distance to start pruning the data
55
  #step: spatial distance increment
56
  #Note that we are assuming that the first columns contains ID with name col of "id"
57

  
55
  #step_val: spatial distance increment
56
  #Note that we are assuming that the first columns contains ID with name col of "id".
57
  #Note that the selection is based on unique id of original SPDF so that replicates screened.
58
  
59
  data_in$id <- as.character(data_in$id)
58 60
  data <- data_in
61
  
62
  #Now only take unique id in the shapefile!!!
63
  #This step is necessary to avoid the large calculation of matrix distance with replicates
64
  #unique(data$id)
65
  data <- aggregate(id ~ x + y , data=data,min)
66
  coordinates(data) <- cbind(data$x,data$y)
67
  proj4string(data) <- proj4string(data_in)
68
  
59 69
  target_min_nb <- target_range_nb[1]
60
  station_nb <- nrow(data_in)
70
  #target_min_nb <- target_range_day_nb[1]
71
  
72
  #station_nb <- nrow(data_in)
73
  station_nb <- nrow(data)
61 74
  if(is.null(max_dist)){
62 75
    while(station_nb > target_min_nb){
63 76
      data <- remove.duplicates(data, zero = dist_val) #spatially sub sample...
64
      dist_val <- dist_val + step
77
      dist_val <- dist_val + step_val
65 78
      station_nb <- nrow(data)
66 79
    }
67 80
    #setdiff(as.character(data$id),as.character(data_in$id))
68
    ind.selected <-match(as.character(data$id),as.character(data_in$id)) #index of stations row selected
69
    ind.removed  <- setdiff(1:nrow(data_in), ind.selected) #index of stations rows removed 
81
    #ind.selected <-match(as.character(data$id),as.character(data_in$id)) #index of stations row selected
82
    #ind.removed  <- setdiff(1:nrow(data_in), ind.selected) #index of stations rows removed 
83
    id_selected <- as.character(data$id)
84
    id_removed <- setdiff(unique(as.character(data_in$id)),as.character(data$id))
85

  
70 86
  }
71 87
  if(!is.null(max_dist)){
72 88
    
......
74 90
      data <- remove.duplicates(data, zero = dist_val) #spatially sub sample...
75 91
      #id_rm <- zerodist(data, zero = dist_val, unique.ID = FALSE)
76 92
      #data_rm <- data[id_rm,]
77
      dist_val <- dist_val + step
93
      dist_val <- dist_val + step_val
78 94
      station_nb <- nrow(data)
79 95
    }
80
    ind.selected <-match(as.character(data$id),as.character(data_in$id))
81
    ind.removed  <- setdiff(1:nrow(data_in), ind.selected)
96
    #ind.selected <- match(as.character(data$id),as.character(data_in$id))
97
    id_selected <- as.character(data$id)
98
    id_removed <- setdiff(unique(as.character(data_in$id)),as.character(data$id))
99
  #  ind.removed  <- setdiff(1:nrow(data_in), ind.selected)
82 100
  }
101
    
102
  #data_rm <- data_in[ind.removed,]
103
  data_rm <- subset(data_in, id %in% id_removed)
104
  data_tmp <- data #store the reduced dataset with only id, for debugging purpose
105
  
106
  #data <- subset(data_in, id %in% data$id) #select based on id
107
  data <-subset(data_in, id %in% id_selected) #select based on id
83 108
  
84
  data_rm <- data_in[ind.removed,]
109
  #data <- data_in[ind.selected,]
85 110
  obj_sub_sampling <- list(data,dist_val,data_rm) #data.frame selected, minimum distance, data.frame stations removed
86 111
  names(obj_sub_sampling) <- c("data","dist","data_rm")
87 112
  return(obj_sub_sampling)
......
96 121
  #target_range_nb : number of stations desired as min and max, convergence to  min  for  now
97 122
  #dist_range : spatial distance range  for pruning,  usually (0,5) in km or 0,0.009*5 for  degreee
98 123
  #step_dist : stepping distance used in pruning  spatially, use 1km or 0.009 for degree data
99
  #data_in : input data to be resampled (data.frame or spatial point df.)
124
  #data_in : input data to be resampled (spatial point df. which contains)
100 125
  #combined: if FALSE, combined, add variable to  show wich  data rows  were removed (not currently in use)
101 126
  #
102 127
  #Output parameters:
103
  #data: subsampled data
128
  #data_out: subsampled data
104 129
  #dist: distance at which spatial sub-sampling  ended
105 130
  #data_removed: data that was removed from the input data frame
106 131
  #data_dist: data item/stations after using spatial pruning, only appears if sampling = T
107 132

  
108 133
  #### START PROGRAM BODY #####
109 134
  
110
  data <- data_in
135
  data_all <- data_in
111 136
  min_dist <- dist_range[1]
112 137
  max_dist <- dist_range[2]
113 138
  
114 139
  #if sampling is chosen...first run spatial selection then sampling...
115 140
  if(sampling==T){
116 141
    #debug(sub_sampling_by_dist)
117
    dat <- sub_sampling_by_dist(target_range_nb,dist_val=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month)
118
    station_nb <- nrow(dat$data)
119
    if (station_nb > target_min_nb){
120
      ind_s1  <- sample(nrow(dat$data), size=target_range_nb[1], replace = FALSE, prob = NULL) #furhter sample
121
      #ind_s2 <- setdiff(1:nrow(dat$data), ind_s1)
122
      data_out <- dat$data[ind_s1,] #selected the randomly sampled stations
142
    dat <- sub_sampling_by_dist(target_range_nb,dist_val=min_dist,max_dist=max_dist,step_val=step_dist,data_in=data_in)
143
    data_out1 <- dat$data #after subsampling using spatial proximity
144
    
145
    data <- aggregate(id ~ x + y , data=data_out1,min)
146
    coordinates(data) <- cbind(data$x,data$y)
147
    proj4string(data) <- proj4string(data_in)
148

  
149
    #once more we need to use only stations with id not replicates!!
123 150
    
124
      ind.selected <-match(as.character(data_out$id),as.character(data_in$id))
125
      ind.removed  <- setdiff(1:nrow(data_in), ind.selected)
126
      data_removed <- data_in[ind.removed,]
151
    station_nb <- nrow(data)
152
    
153
    if (station_nb > target_min_nb){
154
      
155
      ind_s1  <- sample(nrow(data), size=target_range_nb[1], replace = FALSE, prob = NULL) #furhter sample
156
      ind_s2 <- setdiff(1:nrow(data), ind_s1)
157

  
158
      data_out_tmp <- data[ind_s1,] #selected the randomly sampled stations, only station location used here!!
159

  
160
      id_selected <- as.character(data_out_tmp$id)
161
      id_removed <- setdiff(unique(as.character(data$id)),as.character(data_out_tmp$id))
162
      
163
      data_removed <- subset(data_out1, id %in% id_removed)
164
      #data_tmp <- data #store the reduced dataset with only id, for debugging purpose
165
  
166
      #data <- subset(data_in, id %in% data$id) #select based on id
167
      data_out <-subset(data_out1, id %in% id_selected) #select based on id
168

  
169
      #data_out_tmp <- data[ind_s1,] #selected the randomly sampled stations, only station location used here!!
170
      #ind.selected <- match(as.character(data_out$id),as.character(data_in$id))
171
      #ind.removed  <- setdiff(1:nrow(data_in), ind.selected)
172
      #data_removed <- data[ind.removed,]
127 173
    
128 174
      #Find the corresponding 
129 175
      #data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
......
137 183
    names(data_obj) <- c("data","dist","data_removed","data_dist")
138 184
  }
139 185
  if(sampling!=T){
140
    dat <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step=step_dist,data_in=data_month)
186
    dat <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step_val=step_dist,data_in=data_in)
141 187
    #
142 188
    data_obj <- list(dat$data,dat$dist,dat$data_rm)
143 189
    names(data_obj) <- c("data","dist","data_removed")
......
231 277

  
232 278
test5 <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_nb,dist_range=dist_range,step_dist=step_dist,data_in=data_month,sampling=T,combined=F)
233 279

  
280
#### Now testing on NEX data for North America
281
#daily sampling...
282

  
283
path_tmp <- "/nobackupp6/aguzman4/climateLayers/output1000x3000_km/reg1/33.8_-93.3"
284
setwd(path_tmp)
285
#daily_covariates_ghcn_data_TMAX_2010_201133.8_-93.3.shp
286

  
287
data_RST_SDF <- readOGR(".","daily_covariates_ghcn_data_TMAX_2010_201133.8_-93.3")
288
dim(data_RST_SDF)
289

  
290

  
291
target_max_nb <- 100000 #this is not actually used yet in the current implementation,can be set to very high value...
292
target_min_nb <- 600 #this is the target number of stations we would like: to be set by Alberto...
293
#max_dist <- 1000 # the maximum distance used for pruning ie removes stations that are closer than 1000m, this in degree...? 
294
max_idst <- 0.009*5 #5km in degree
295
min_dist <- 0    #minimum distance to start with
296
step_dist <- 0.009 #iteration step to remove the stations
297
target_range_day_nb <- c(target_min_nb,target_max_nb) #set in master script and read in database_preparation script..
298

  
299
if(sub_sampling_day==TRUE){
300
    
301
  sub_sampling_obj <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_day_nb,
302
                                                   dist_range=dist_range,step_dist=step_dist,data_in=data_RST_SDF,
303
                                                   sampling=T,combined=F)
304
  #data_RST_SDF <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations
305
  data_test <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations
234 306
  
307
  #save the information for later use (validation at monthly step!!)
308
  save(sub_sampling_obj,file= file.path(out_path,paste("sub_sampling_obj_","daily_",interpolation_method,"_", out_prefix,".RData",sep="")))
309
}
310

  
311
dim(data_test)
312
#> dim(data_test) #some replications
313
#[1] 199755     72
314
unique(data_test$id) #this is 600 stations as requested!
315

  
316
#### now deal with monthly data
317

  
318
#monthly_covariates_ghcn_data_TMAX_2000_201133.8_-93.3.shp
319
dst <- readOGR(".","monthly_covariates_ghcn_data_TMAX_2000_201133.8_-93.3")
320
dst$id <- dst$station #must have an id column, this was added in database prepration script
321

  
322
#This must be set up in master script
323
target_max_nb <- 100000 #this is not actually used yet in the current implementation,can be set to very high value...
324
target_min_nb <- 2500 #this is the target number of stations we would like: to be set by Alberto...#THIS IS DIFFERENT THAN DAILY
325
#max_dist <- 1000 # the maximum distance used for pruning ie removes stations that are closer than 1000m, this in degree...? 
326
max_dist <- 0.009*5 #5km in degree
327
min_dist <- 0    #minimum distance to start with
328
step_dist <- 0.009 #iteration step to remove the stations
329
target_range_nb <- c(target_min_nb,target_max_nb)
330
#note that  this is for monthly stations.
331
  
332
if(sub_sampling==TRUE){ #sub_sampling is an option for the monthly station
333
  sub_sampling_obj <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_nb,dist_range=dist_range,step_dist=step_dist,data_in=dst,sampling=T,combined=F)
334
  data_test_month <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations
335
  #save the information for later use (validation at monthly step!!)
336
  save(sub_sampling_obj,file= file.path(out_path,paste("sub_sampling_obj_",interpolation_method,"_", out_prefix,".RData",sep="")))
337
}
338
 
339
#> dim(data_test_month)
340
#[1] 29418    68
341
#> length(unique(data_test_month$id))
342
#[1] 2500
343

  
344
#Ok working for monthly as well...
345

  
346
############################
347
### Additional tile to test: Tile 2 on NEX with about 1.1 millon rows
348

  
349
#37.6_-89.5
350

  
351
path_tmp <- "/nobackupp6/aguzman4/climateLayers/output1000x3000_km/reg1/37.6_-89.5"
352
setwd(path_tmp)
353
#daily_covariates_ghcn_data_TMAX_2010_201133.8_-93.3.shp
354

  
355
data_RST_SDF <- readOGR(".","daily_covariates_ghcn_data_TMAX_2010_201137.6_-89.5")
356
dim(data_RST_SDF)
357
#> length(unique(data_RST_SDF$id))
358
#[1] 3298
359
#> dim(data_RST_SDF)
360
#[1] 1117029      72
361

  
362
target_max_nb <- 100000 #this is not actually used yet in the current implementation,can be set to very high value...
363
target_min_nb <- 600 #this is the target number of stations we would like: to be set by Alberto...
364
#max_dist <- 1000 # the maximum distance used for pruning ie removes stations that are closer than 1000m, this in degree...? 
365
max_idst <- 0.009*5 #5km in degree
366
min_dist <- 0    #minimum distance to start with
367
step_dist <- 0.009 #iteration step to remove the stations
368
target_range_day_nb <- c(target_min_nb,target_max_nb) #set in master script and read in database_preparation script..
369

  
370
if(sub_sampling_day==TRUE){
371
    
372
  sub_sampling_obj <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_day_nb,
373
                                                   dist_range=dist_range,step_dist=step_dist,data_in=data_RST_SDF,
374
                                                   sampling=T,combined=F)
375
  #data_RST_SDF <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations
376
  data_test <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations
377
  
378
  #save the information for later use (validation at monthly step!!)
379
  save(sub_sampling_obj,file= file.path(out_path,paste("sub_sampling_obj_","daily_",interpolation_method,"_", out_prefix,".RData",sep="")))
380
}
381

  
382
#Checking if it worked...
383
dim(data_test)
384
#> length(unique(data_test$id))
385
#[1] 600
386
#> dim(data_test)
387
#[1] 204444     72
388
#unique(data_test$id) #this is 600 stations as requested!
389

  
390
####
391
#### now deal with monthly data
392

  
393
#monthly_covariates_ghcn_data_TMAX_2000_201137.6_-89.5.shp
394
dst <- readOGR(".","monthly_covariates_ghcn_data_TMAX_2000_201137.6_-89.5")
395
dst$id <- dst$station #must have an id column, this was added in database prepration script
396

  
397
#This must be set up in master script
398
target_max_nb <- 100000 #this is not actually used yet in the current implementation,can be set to very high value...
399
target_min_nb <- 2500 #this is the target number of stations we would like: to be set by Alberto...#THIS IS DIFFERENT THAN DAILY
400
#max_dist <- 1000 # the maximum distance used for pruning ie removes stations that are closer than 1000m, this in degree...? 
401
max_dist <- 0.009*5 #5km in degree
402
min_dist <- 0    #minimum distance to start with
403
step_dist <- 0.009 #iteration step to remove the stations
404
target_range_nb <- c(target_min_nb,target_max_nb)
405
#note that  this is for monthly stations.
406
  
407
if(sub_sampling==TRUE){ #sub_sampling is an option for the monthly station
408
  sub_sampling_obj <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_nb,dist_range=dist_range,step_dist=step_dist,data_in=dst,sampling=T,combined=F)
409
  data_test_month <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations
410
  #save the information for later use (validation at monthly step!!)
411
  save(sub_sampling_obj,file= file.path(out_path,paste("sub_sampling_obj_",interpolation_method,"_", out_prefix,".RData",sep="")))
412
}
413

  
414
#Ok worked too and very fast
415
#> dim(data_test_month)
416
#[1] 29450    68
417
#> length(unique(data_test_month$id))
418
#[1] 2500
419

  
235 420
############ END OF SCRIPT #########

Also available in: Unified diff