Revision 41528b05
Added by Benoit Parmentier over 8 years ago
climate/research/oregon/interpolation/global_product_assessment_part1_functions.R | ||
---|---|---|
453 | 453 |
return(data_subset) |
454 | 454 |
} |
455 | 455 |
|
456 |
sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist_val=0.0,max_dist=NULL,step_val,data_in){ |
|
457 |
#Function to select stations data that are outside a specific spatial range from each other |
|
458 |
#Parameters: |
|
459 |
#max_dist: maximum spatial distance at which to stop the pruning |
|
460 |
#min_dist: minimum distance to start pruning the data |
|
461 |
#step_val: spatial distance increment |
|
462 |
#Note that we are assuming that the first columns contains ID with name col of "id". |
|
463 |
#Note that the selection is based on unique id of original SPDF so that replicates screened. |
|
464 |
|
|
465 |
data_in$id <- as.character(data_in$id) |
|
466 |
data <- data_in |
|
467 |
|
|
468 |
#Now only take unique id in the shapefile!!! |
|
469 |
#This step is necessary to avoid the large calculation of matrix distance with replicates |
|
470 |
#unique(data$id) |
|
471 |
data <- aggregate(id ~ x + y , data=data,min) |
|
472 |
coordinates(data) <- cbind(data$x,data$y) |
|
473 |
proj4string(data) <- proj4string(data_in) |
|
474 |
|
|
475 |
target_min_nb <- target_range_nb[1] |
|
476 |
#target_min_nb <- target_range_day_nb[1] |
|
477 |
|
|
478 |
#station_nb <- nrow(data_in) |
|
479 |
station_nb <- nrow(data) |
|
480 |
if(is.null(max_dist)){ |
|
481 |
while(station_nb > target_min_nb){ |
|
482 |
data <- remove.duplicates(data, zero = dist_val) #spatially sub sample... |
|
483 |
dist_val <- dist_val + step_val |
|
484 |
station_nb <- nrow(data) |
|
485 |
} |
|
486 |
#setdiff(as.character(data$id),as.character(data_in$id)) |
|
487 |
#ind.selected <-match(as.character(data$id),as.character(data_in$id)) #index of stations row selected |
|
488 |
#ind.removed <- setdiff(1:nrow(data_in), ind.selected) #index of stations rows removed |
|
489 |
id_selected <- as.character(data$id) |
|
490 |
id_removed <- setdiff(unique(as.character(data_in$id)),as.character(data$id)) |
|
491 |
|
|
492 |
} |
|
493 |
if(!is.null(max_dist)){ |
|
494 |
|
|
495 |
while(station_nb > target_min_nb & dist_val < max_dist){ |
|
496 |
data <- remove.duplicates(data, zero = dist_val) #spatially sub sample... |
|
497 |
#id_rm <- zerodist(data, zero = dist_val, unique.ID = FALSE) |
|
498 |
#data_rm <- data[id_rm,] |
|
499 |
dist_val <- dist_val + step_val |
|
500 |
station_nb <- nrow(data) |
|
501 |
} |
|
502 |
#ind.selected <- match(as.character(data$id),as.character(data_in$id)) |
|
503 |
id_selected <- as.character(data$id) |
|
504 |
id_removed <- setdiff(unique(as.character(data_in$id)),as.character(data$id)) |
|
505 |
# ind.removed <- setdiff(1:nrow(data_in), ind.selected) |
|
506 |
} |
|
507 |
|
|
508 |
#data_rm <- data_in[ind.removed,] |
|
509 |
data_rm <- subset(data_in, id %in% id_removed) |
|
510 |
data_tmp <- data #store the reduced dataset with only id, for debugging purpose |
|
511 |
|
|
512 |
#data <- subset(data_in, id %in% data$id) #select based on id |
|
513 |
data <-subset(data_in, id %in% id_selected) #select based on id |
|
514 |
|
|
515 |
#data <- data_in[ind.selected,] |
|
516 |
obj_sub_sampling <- list(data,dist_val,data_rm) #data.frame selected, minimum distance, data.frame stations removed |
|
517 |
names(obj_sub_sampling) <- c("data","dist","data_rm") |
|
518 |
return(obj_sub_sampling) |
|
519 |
} |
|
520 |
|
|
521 |
|
|
456 | 522 |
############################ END OF SCRIPT ################################## |
Also available in: Unified diff
testing subsampling by distance function to select specific given stations from points