Revision 6b4a1789
Added by Benoit Parmentier about 10 years ago
climate/research/oregon/interpolation/subsampling_data.R | ||
---|---|---|
5 | 5 |
# |
6 | 6 |
#AUTHOR: Benoit Parmentier |
7 | 7 |
#CREATED ON: 10/16/2014 |
8 |
#MODIFIED ON: 10/23/2014
|
|
8 |
#MODIFIED ON: 10/27/2014
|
|
9 | 9 |
#Version: 1 |
10 | 10 |
# |
11 | 11 |
#PROJECT: Environmental Layers project NCEAS-NASA |
... | ... | |
47 | 47 |
function_analyses_paper1 <- "contribution_of_covariates_paper_interpolation_functions_07182014.R" #first interp paper |
48 | 48 |
function_analyses_paper2 <- "multi_timescales_paper_interpolation_functions_10062014.R" |
49 | 49 |
|
50 |
sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist=0.0,max_dist=NULL,step,data_in){ |
|
50 |
sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist_val=0.0,max_dist=NULL,step,data_in){
|
|
51 | 51 |
#Function to select stations data that are outside a specific spatial range from each other |
52 | 52 |
#Parameters: |
53 | 53 |
#max_dist: maximum spatial distance at which to stop the pruning |
54 | 54 |
#min_dist: minimum distance to start pruning the data |
55 | 55 |
#step: spatial distance increment |
56 |
#Note that we are assuming that the first columns contains ID with name col of "id" |
|
56 | 57 |
|
57 | 58 |
data <- data_in |
58 | 59 |
target_min_nb <- target_range_nb[1] |
59 | 60 |
station_nb <- nrow(data_in) |
60 | 61 |
if(is.null(max_dist)){ |
61 | 62 |
while(station_nb > target_min_nb){ |
62 |
data <- remove.duplicates(data, zero = dist) #spatially sub sample... |
|
63 |
dist <- dist + step
|
|
63 |
data <- remove.duplicates(data, zero = dist_val) #spatially sub sample...
|
|
64 |
dist_val <- dist_val + step
|
|
64 | 65 |
station_nb <- nrow(data) |
65 | 66 |
} |
67 |
#setdiff(as.character(data$id),as.character(data_in$id)) |
|
68 |
ind.selected <-match(as.character(data$id),as.character(data_in$id)) #index of stations row selected |
|
69 |
ind.removed <- setdiff(1:nrow(data_in), ind.selected) #index of stations rows removed |
|
66 | 70 |
} |
67 | 71 |
if(!is.null(max_dist)){ |
68 | 72 |
|
69 |
while(station_nb > target_min_nb & dist < max_dist){ |
|
70 |
data <- remove.duplicates(data, zero = dist) #spatially sub sample... |
|
71 |
id_rm <- zerodist(data, zero = dist, unique.ID = FALSE)
|
|
72 |
data_rm <- data[id_rm,] |
|
73 |
dist <- dist + step
|
|
73 |
while(station_nb > target_min_nb & dist_val < max_dist){
|
|
74 |
data <- remove.duplicates(data, zero = dist_val) #spatially sub sample...
|
|
75 |
#id_rm <- zerodist(data, zero = dist_val, unique.ID = FALSE)
|
|
76 |
#data_rm <- data[id_rm,]
|
|
77 |
dist_val <- dist_val + step
|
|
74 | 78 |
station_nb <- nrow(data) |
75 | 79 |
} |
80 |
ind.selected <-match(as.character(data$id),as.character(data_in$id)) |
|
81 |
ind.removed <- setdiff(1:nrow(data_in), ind.selected) |
|
76 | 82 |
} |
77 | 83 |
|
78 |
obj_sub_sampling <- list(data,dist) |
|
79 |
names(obj_sub_sampling) <- c("data","dist") |
|
84 |
data_rm <- data_in[ind.removed,] |
|
85 |
obj_sub_sampling <- list(data,dist_val,data_rm) #data.frame selected, minimum distance, data.frame stations removed |
|
86 |
names(obj_sub_sampling) <- c("data","dist","data_rm") |
|
80 | 87 |
return(obj_sub_sampling) |
81 | 88 |
} |
82 | 89 |
|
... | ... | |
86 | 93 |
min_dist <- dist_range[1] |
87 | 94 |
max_dist <- dist_range[2] |
88 | 95 |
|
96 |
#if sampling is chosen...first run spatial selection then sampling... |
|
89 | 97 |
if(sampling==T){ |
90 |
dat <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month) |
|
91 |
ind_s1 <- sample(nrow(dat$data), size=target_range_nb[1], replace = FALSE, prob = NULL) |
|
92 |
ind_s2 <- setdiff(1:nrow(dat$data), ind_s1) |
|
93 |
data_out <- dat$data[ind_s1,] #selected the randomly sampled stations |
|
94 |
data_removed <- dat[ind_s2,] |
|
98 |
#debug(sub_sampling_by_dist) |
|
99 |
dat <- sub_sampling_by_dist(target_range_nb,dist_val=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month) |
|
100 |
station_nb <- nrow(dat$data) |
|
101 |
if (station_nb > target_min_nb){ |
|
102 |
ind_s1 <- sample(nrow(dat$data), size=target_range_nb[1], replace = FALSE, prob = NULL) #furhter sample |
|
103 |
#ind_s2 <- setdiff(1:nrow(dat$data), ind_s1) |
|
104 |
data_out <- dat$data[ind_s1,] #selected the randomly sampled stations |
|
95 | 105 |
|
96 |
#Find the corresponding |
|
97 |
#data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations |
|
106 |
ind.selected <-match(as.character(data_out$id),as.character(data_in$id)) |
|
107 |
ind.removed <- setdiff(1:nrow(data_in), ind.selected) |
|
108 |
data_removed <- data_in[ind.removed,] |
|
109 |
|
|
110 |
#Find the corresponding |
|
111 |
#data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations |
|
112 |
} |
|
113 |
if (station_nb <= target_min_nb){ |
|
114 |
data_out <- dat$data |
|
115 |
data_removed <- dat$data_rm |
|
116 |
} |
|
98 | 117 |
|
99 |
data_out <- list(data_out,dat$dist,data_removed,dat$data)
|
|
100 |
data_out <- c("data","dist","data_removed","data_dist")
|
|
118 |
data_obj <- list(data_out,dat$dist,data_removed,dat$data)
|
|
119 |
names(data_obj) <- c("data","dist","data_removed","data_dist")
|
|
101 | 120 |
} |
102 | 121 |
if(sampling!=T){ |
103 | 122 |
dat <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step=step_dist,data_in=data_month) |
104 | 123 |
# |
105 |
data_out <- list(dat$data,dat$dist,data_removed)
|
|
106 |
data_out <- c("data","dist","data_removed")
|
|
124 |
data_obj <- list(dat$data,dat$dist,dat$data_rm)
|
|
125 |
names(data_obj) <- c("data","dist","data_removed")
|
|
107 | 126 |
|
108 | 127 |
} |
109 |
return(data_out)
|
|
128 |
return(data_obj)
|
|
110 | 129 |
} |
111 | 130 |
|
112 |
debug(sub_sampling_by_dist_nb_stat) |
|
113 |
test3 <- sub_sampling_by_dist_nb_stat(target_range_nb=c(100,200),dist_range=c(0,1000),data_in=data_month,sampling=T,combined=F) |
|
114 |
|
|
115 |
|
|
116 |
#n<-nrow(ghcn.subsets[[i]]) |
|
117 |
#prop<-(sampling_dat$prop[i])/100 |
|
118 |
#ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
119 |
#nv<-n-ns #create a sample for validation with prop of the rows |
|
120 |
#ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly |
|
121 |
#ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training) |
|
122 |
#Find the corresponding |
|
123 |
#data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations |
|
124 |
#station_id.training<-data_sampled$station #selected id for the randomly sampled stations (115) |
|
125 |
#Save the information |
|
126 |
#sampling[[i]]<-ind.training #index of training sample from data.frame |
|
127 |
#sampling_station_id[[i]]<- station_id.training #station ID for traning samples |
|
128 |
|
|
129 | 131 |
############################## |
130 | 132 |
#### Parameters and constants |
131 | 133 |
|
... | ... | |
151 | 153 |
|
152 | 154 |
plot(data_month) |
153 | 155 |
|
156 |
|
|
157 |
### Part 1, use selection based on spatial distance only!! |
|
158 |
|
|
154 | 159 |
#set up input parameters |
155 | 160 |
|
156 | 161 |
target_max_nb <- 200 #this is not actually used yet in the current implementation |
... | ... | |
173 | 178 |
dim(test2$data) #97 stations selected |
174 | 179 |
test2$dist # for distance of 31,000 m (no max_dist is set) |
175 | 180 |
|
176 |
dist_range <- c(0,5000) |
|
181 |
dist_range <- c(0,10000) |
|
182 |
max_dist <- 10000# the maximum distance used for pruning ie removes stations that are closer than 1000m |
|
183 |
|
|
184 |
test3 <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month) |
|
185 |
dim(test3$data) #178 stations selected |
|
177 | 186 |
|
187 |
################ |
|
188 |
### Part 2 use selection based on both spatial and sampling distance |
|
189 |
|
|
190 |
#if for a given max distance there is still too many stations then use sampling (use sampling==T) |
|
178 | 191 |
#Now use the other function to sample the station data points: |
179 | 192 |
|
180 |
sub_sampling_by_dist_nb_stat(target_range_nb=c(10000,10000),dist_range,data_in,sampling=T) |
|
181 |
|
|
182 |
|
|
183 |
#n<-nrow(ghcn.subsets[[i]]) |
|
184 |
#prop<-(sampling_dat$prop[i])/100 |
|
185 |
#ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
186 |
#nv<-n-ns #create a sample for validation with prop of the rows |
|
187 |
#ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly |
|
188 |
#ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training) |
|
189 |
#Find the corresponding |
|
190 |
#data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations |
|
191 |
#station_id.training<-data_sampled$station #selected id for the randomly sampled stations (115) |
|
192 |
#Save the information |
|
193 |
#sampling[[i]]<-ind.training #index of training sample from data.frame |
|
194 |
#sampling_station_id[[i]]<- station_id.training #station ID for traning samples |
|
193 |
#### |
|
194 |
dist_range <- c(0,10000) |
|
195 |
max_dist <- 10000# the maximum distance used for pruning ie removes stations that are closer than 1000m |
|
196 |
|
|
197 |
#debug(sub_sampling_by_dist_nb_stat) |
|
198 |
test4 <- sub_sampling_by_dist_nb_stat(target_range_nb=c(100,200),dist_range=c(0,10000),data_in=data_month,sampling=T,combined=F) |
|
199 |
dim(test4$data) #we get exactly 100 stations as asked...first the 178 stations were selected using the spatial criteria |
|
200 |
#then 100 stations were selected using the sampling function |
|
195 | 201 |
|
196 | 202 |
############ END OF SCRIPT ######### |
Also available in: Unified diff
keeping tracked of stations removed and debugging