Project

General

Profile

« Previous | Next » 

Revision 6b4a1789

Added by Benoit Parmentier about 10 years ago

keeping tracked of stations removed and debugging

View differences:

climate/research/oregon/interpolation/subsampling_data.R
5 5
#
6 6
#AUTHOR: Benoit Parmentier                                                                      
7 7
#CREATED ON: 10/16/2014            
8
#MODIFIED ON: 10/23/2014            
8
#MODIFIED ON: 10/27/2014            
9 9
#Version: 1
10 10
#
11 11
#PROJECT: Environmental Layers project  NCEAS-NASA
......
47 47
function_analyses_paper1 <- "contribution_of_covariates_paper_interpolation_functions_07182014.R" #first interp paper
48 48
function_analyses_paper2 <- "multi_timescales_paper_interpolation_functions_10062014.R"
49 49

  
50
sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist=0.0,max_dist=NULL,step,data_in){
50
sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist_val=0.0,max_dist=NULL,step,data_in){
51 51
  #Function to select stations data that are outside a specific spatial range from each other
52 52
  #Parameters:
53 53
  #max_dist: maximum spatial distance at which to stop the pruning
54 54
  #min_dist: minimum distance to start pruning the data
55 55
  #step: spatial distance increment
56
  #Note that we are assuming that the first columns contains ID with name col of "id"
56 57

  
57 58
  data <- data_in
58 59
  target_min_nb <- target_range_nb[1]
59 60
  station_nb <- nrow(data_in)
60 61
  if(is.null(max_dist)){
61 62
    while(station_nb > target_min_nb){
62
      data <- remove.duplicates(data, zero = dist) #spatially sub sample...
63
      dist <- dist + step
63
      data <- remove.duplicates(data, zero = dist_val) #spatially sub sample...
64
      dist_val <- dist_val + step
64 65
      station_nb <- nrow(data)
65 66
    }
67
    #setdiff(as.character(data$id),as.character(data_in$id))
68
    ind.selected <-match(as.character(data$id),as.character(data_in$id)) #index of stations row selected
69
    ind.removed  <- setdiff(1:nrow(data_in), ind.selected) #index of stations rows removed 
66 70
  }
67 71
  if(!is.null(max_dist)){
68 72
    
69
    while(station_nb > target_min_nb & dist < max_dist){ 
70
      data <- remove.duplicates(data, zero = dist) #spatially sub sample...
71
      id_rm <- zerodist(data, zero = dist, unique.ID = FALSE)
72
      data_rm <- data[id_rm,]
73
      dist <- dist + step
73
    while(station_nb > target_min_nb & dist_val < max_dist){ 
74
      data <- remove.duplicates(data, zero = dist_val) #spatially sub sample...
75
      #id_rm <- zerodist(data, zero = dist_val, unique.ID = FALSE)
76
      #data_rm <- data[id_rm,]
77
      dist_val <- dist_val + step
74 78
      station_nb <- nrow(data)
75 79
    }
80
    ind.selected <-match(as.character(data$id),as.character(data_in$id))
81
    ind.removed  <- setdiff(1:nrow(data_in), ind.selected)
76 82
  }
77 83
  
78
  obj_sub_sampling <- list(data,dist)
79
  names(obj_sub_sampling) <- c("data","dist")
84
  data_rm <- data_in[ind.removed,]
85
  obj_sub_sampling <- list(data,dist_val,data_rm) #data.frame selected, minimum distance, data.frame stations removed
86
  names(obj_sub_sampling) <- c("data","dist","data_rm")
80 87
  return(obj_sub_sampling)
81 88
}
82 89

  
......
86 93
  min_dist <- dist_range[1]
87 94
  max_dist <- dist_range[2]
88 95
  
96
  #if sampling is chosen...first run spatial selection then sampling...
89 97
  if(sampling==T){
90
    dat <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month)
91
    ind_s1  <- sample(nrow(dat$data), size=target_range_nb[1], replace = FALSE, prob = NULL)
92
    ind_s2 <- setdiff(1:nrow(dat$data), ind_s1)
93
    data_out <- dat$data[ind_s1,] #selected the randomly sampled stations
94
    data_removed <- dat[ind_s2,]
98
    #debug(sub_sampling_by_dist)
99
    dat <- sub_sampling_by_dist(target_range_nb,dist_val=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month)
100
    station_nb <- nrow(dat$data)
101
    if (station_nb > target_min_nb){
102
      ind_s1  <- sample(nrow(dat$data), size=target_range_nb[1], replace = FALSE, prob = NULL) #furhter sample
103
      #ind_s2 <- setdiff(1:nrow(dat$data), ind_s1)
104
      data_out <- dat$data[ind_s1,] #selected the randomly sampled stations
95 105
    
96
    #Find the corresponding 
97
    #data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
106
      ind.selected <-match(as.character(data_out$id),as.character(data_in$id))
107
      ind.removed  <- setdiff(1:nrow(data_in), ind.selected)
108
      data_removed <- data_in[ind.removed,]
109
    
110
      #Find the corresponding 
111
      #data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
112
    }
113
    if (station_nb <= target_min_nb){
114
      data_out <- dat$data
115
      data_removed <- dat$data_rm
116
    }
98 117

  
99
    data_out <- list(data_out,dat$dist,data_removed,dat$data)
100
    data_out <- c("data","dist","data_removed","data_dist")
118
    data_obj <- list(data_out,dat$dist,data_removed,dat$data)
119
    names(data_obj) <- c("data","dist","data_removed","data_dist")
101 120
  }
102 121
  if(sampling!=T){
103 122
    dat <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step=step_dist,data_in=data_month)
104 123
    #
105
    data_out <- list(dat$data,dat$dist,data_removed)
106
    data_out <- c("data","dist","data_removed")
124
    data_obj <- list(dat$data,dat$dist,dat$data_rm)
125
    names(data_obj) <- c("data","dist","data_removed")
107 126
    
108 127
  }
109
  return(data_out)
128
  return(data_obj)
110 129
}
111 130

  
112
debug(sub_sampling_by_dist_nb_stat)
113
test3 <- sub_sampling_by_dist_nb_stat(target_range_nb=c(100,200),dist_range=c(0,1000),data_in=data_month,sampling=T,combined=F)
114

  
115

  
116
#n<-nrow(ghcn.subsets[[i]])
117
#prop<-(sampling_dat$prop[i])/100
118
#ns<-n-round(n*prop)   #Create a sample from the data frame with 70% of the rows
119
#nv<-n-ns              #create a sample for validation with prop of the rows
120
#ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
121
#ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
122
#Find the corresponding 
123
#data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
124
#station_id.training<-data_sampled$station     #selected id for the randomly sampled stations (115)
125
#Save the information
126
#sampling[[i]]<-ind.training #index of training sample from data.frame
127
#sampling_station_id[[i]]<- station_id.training #station ID for traning samples
128

  
129 131
##############################
130 132
#### Parameters and constants  
131 133

  
......
151 153

  
152 154
plot(data_month)
153 155

  
156

  
157
### Part 1, use selection based on spatial distance only!!
158

  
154 159
#set up input parameters
155 160

  
156 161
target_max_nb <- 200 #this is not actually used yet in the current implementation
......
173 178
dim(test2$data) #97 stations selected 
174 179
test2$dist # for distance of 31,000 m (no max_dist is set)
175 180

  
176
dist_range <- c(0,5000) 
181
dist_range <- c(0,10000) 
182
max_dist <- 10000# the maximum distance used for pruning ie removes stations that are closer than 1000m 
183

  
184
test3 <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month)
185
dim(test3$data) #178 stations selected 
177 186

  
187
################
188
### Part 2 use selection based on both spatial and sampling distance
189

  
190
#if for a given max distance there is still too many stations then use sampling (use sampling==T)
178 191
#Now use the other function to sample the station data points:
179 192

  
180
sub_sampling_by_dist_nb_stat(target_range_nb=c(10000,10000),dist_range,data_in,sampling=T)
181

  
182

  
183
#n<-nrow(ghcn.subsets[[i]])
184
#prop<-(sampling_dat$prop[i])/100
185
#ns<-n-round(n*prop)   #Create a sample from the data frame with 70% of the rows
186
#nv<-n-ns              #create a sample for validation with prop of the rows
187
#ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
188
#ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
189
#Find the corresponding 
190
#data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
191
#station_id.training<-data_sampled$station     #selected id for the randomly sampled stations (115)
192
#Save the information
193
#sampling[[i]]<-ind.training #index of training sample from data.frame
194
#sampling_station_id[[i]]<- station_id.training #station ID for traning samples
193
#### 
194
dist_range <- c(0,10000) 
195
max_dist <- 10000# the maximum distance used for pruning ie removes stations that are closer than 1000m 
196

  
197
#debug(sub_sampling_by_dist_nb_stat)
198
test4 <- sub_sampling_by_dist_nb_stat(target_range_nb=c(100,200),dist_range=c(0,10000),data_in=data_month,sampling=T,combined=F)
199
dim(test4$data) #we get exactly 100 stations as asked...first the 178 stations were selected using the spatial criteria
200
                #then 100 stations were selected using the sampling function
195 201
  
196 202
############ END OF SCRIPT #########

Also available in: Unified diff