Revision 907d8e86
Added by Benoit Parmentier almost 10 years ago
climate/research/oregon/interpolation/subsampling_data.R | ||
---|---|---|
5 | 5 |
# |
6 | 6 |
#AUTHOR: Benoit Parmentier |
7 | 7 |
#CREATED ON: 10/16/2014 |
8 |
#MODIFIED ON: 10/27/2014
|
|
8 |
#MODIFIED ON: 01/06/2015
|
|
9 | 9 |
#Version: 1 |
10 | 10 |
# |
11 | 11 |
#PROJECT: Environmental Layers project NCEAS-NASA |
... | ... | |
47 | 47 |
function_analyses_paper1 <- "contribution_of_covariates_paper_interpolation_functions_07182014.R" #first interp paper |
48 | 48 |
function_analyses_paper2 <- "multi_timescales_paper_interpolation_functions_10062014.R" |
49 | 49 |
|
50 |
sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist_val=0.0,max_dist=NULL,step,data_in){ |
|
50 |
sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist_val=0.0,max_dist=NULL,step_val,data_in){
|
|
51 | 51 |
#Function to select stations data that are outside a specific spatial range from each other |
52 | 52 |
#Parameters: |
53 | 53 |
#max_dist: maximum spatial distance at which to stop the pruning |
54 | 54 |
#min_dist: minimum distance to start pruning the data |
55 |
#step: spatial distance increment |
|
56 |
#Note that we are assuming that the first columns contains ID with name col of "id" |
|
57 |
|
|
55 |
#step_val: spatial distance increment |
|
56 |
#Note that we are assuming that the first columns contains ID with name col of "id". |
|
57 |
#Note that the selection is based on unique id of original SPDF so that replicates screened. |
|
58 |
|
|
59 |
data_in$id <- as.character(data_in$id) |
|
58 | 60 |
data <- data_in |
61 |
|
|
62 |
#Now only take unique id in the shapefile!!! |
|
63 |
#This step is necessary to avoid the large calculation of matrix distance with replicates |
|
64 |
#unique(data$id) |
|
65 |
data <- aggregate(id ~ x + y , data=data,min) |
|
66 |
coordinates(data) <- cbind(data$x,data$y) |
|
67 |
proj4string(data) <- proj4string(data_in) |
|
68 |
|
|
59 | 69 |
target_min_nb <- target_range_nb[1] |
60 |
station_nb <- nrow(data_in) |
|
70 |
#target_min_nb <- target_range_day_nb[1] |
|
71 |
|
|
72 |
#station_nb <- nrow(data_in) |
|
73 |
station_nb <- nrow(data) |
|
61 | 74 |
if(is.null(max_dist)){ |
62 | 75 |
while(station_nb > target_min_nb){ |
63 | 76 |
data <- remove.duplicates(data, zero = dist_val) #spatially sub sample... |
64 |
dist_val <- dist_val + step |
|
77 |
dist_val <- dist_val + step_val
|
|
65 | 78 |
station_nb <- nrow(data) |
66 | 79 |
} |
67 | 80 |
#setdiff(as.character(data$id),as.character(data_in$id)) |
68 |
ind.selected <-match(as.character(data$id),as.character(data_in$id)) #index of stations row selected |
|
69 |
ind.removed <- setdiff(1:nrow(data_in), ind.selected) #index of stations rows removed |
|
81 |
#ind.selected <-match(as.character(data$id),as.character(data_in$id)) #index of stations row selected |
|
82 |
#ind.removed <- setdiff(1:nrow(data_in), ind.selected) #index of stations rows removed |
|
83 |
id_selected <- as.character(data$id) |
|
84 |
id_removed <- setdiff(unique(as.character(data_in$id)),as.character(data$id)) |
|
85 |
|
|
70 | 86 |
} |
71 | 87 |
if(!is.null(max_dist)){ |
72 | 88 |
|
... | ... | |
74 | 90 |
data <- remove.duplicates(data, zero = dist_val) #spatially sub sample... |
75 | 91 |
#id_rm <- zerodist(data, zero = dist_val, unique.ID = FALSE) |
76 | 92 |
#data_rm <- data[id_rm,] |
77 |
dist_val <- dist_val + step |
|
93 |
dist_val <- dist_val + step_val
|
|
78 | 94 |
station_nb <- nrow(data) |
79 | 95 |
} |
80 |
ind.selected <-match(as.character(data$id),as.character(data_in$id)) |
|
81 |
ind.removed <- setdiff(1:nrow(data_in), ind.selected) |
|
96 |
#ind.selected <- match(as.character(data$id),as.character(data_in$id)) |
|
97 |
id_selected <- as.character(data$id) |
|
98 |
id_removed <- setdiff(unique(as.character(data_in$id)),as.character(data$id)) |
|
99 |
# ind.removed <- setdiff(1:nrow(data_in), ind.selected) |
|
82 | 100 |
} |
101 |
|
|
102 |
#data_rm <- data_in[ind.removed,] |
|
103 |
data_rm <- subset(data_in, id %in% id_removed) |
|
104 |
data_tmp <- data #store the reduced dataset with only id, for debugging purpose |
|
105 |
|
|
106 |
#data <- subset(data_in, id %in% data$id) #select based on id |
|
107 |
data <-subset(data_in, id %in% id_selected) #select based on id |
|
83 | 108 |
|
84 |
data_rm <- data_in[ind.removed,]
|
|
109 |
#data <- data_in[ind.selected,]
|
|
85 | 110 |
obj_sub_sampling <- list(data,dist_val,data_rm) #data.frame selected, minimum distance, data.frame stations removed |
86 | 111 |
names(obj_sub_sampling) <- c("data","dist","data_rm") |
87 | 112 |
return(obj_sub_sampling) |
... | ... | |
96 | 121 |
#target_range_nb : number of stations desired as min and max, convergence to min for now |
97 | 122 |
#dist_range : spatial distance range for pruning, usually (0,5) in km or 0,0.009*5 for degreee |
98 | 123 |
#step_dist : stepping distance used in pruning spatially, use 1km or 0.009 for degree data |
99 |
#data_in : input data to be resampled (data.frame or spatial point df.)
|
|
124 |
#data_in : input data to be resampled (spatial point df. which contains)
|
|
100 | 125 |
#combined: if FALSE, combined, add variable to show wich data rows were removed (not currently in use) |
101 | 126 |
# |
102 | 127 |
#Output parameters: |
103 |
#data: subsampled data |
|
128 |
#data_out: subsampled data
|
|
104 | 129 |
#dist: distance at which spatial sub-sampling ended |
105 | 130 |
#data_removed: data that was removed from the input data frame |
106 | 131 |
#data_dist: data item/stations after using spatial pruning, only appears if sampling = T |
107 | 132 |
|
108 | 133 |
#### START PROGRAM BODY ##### |
109 | 134 |
|
110 |
data <- data_in |
|
135 |
data_all <- data_in
|
|
111 | 136 |
min_dist <- dist_range[1] |
112 | 137 |
max_dist <- dist_range[2] |
113 | 138 |
|
114 | 139 |
#if sampling is chosen...first run spatial selection then sampling... |
115 | 140 |
if(sampling==T){ |
116 | 141 |
#debug(sub_sampling_by_dist) |
117 |
dat <- sub_sampling_by_dist(target_range_nb,dist_val=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month) |
|
118 |
station_nb <- nrow(dat$data) |
|
119 |
if (station_nb > target_min_nb){ |
|
120 |
ind_s1 <- sample(nrow(dat$data), size=target_range_nb[1], replace = FALSE, prob = NULL) #furhter sample |
|
121 |
#ind_s2 <- setdiff(1:nrow(dat$data), ind_s1) |
|
122 |
data_out <- dat$data[ind_s1,] #selected the randomly sampled stations |
|
142 |
dat <- sub_sampling_by_dist(target_range_nb,dist_val=min_dist,max_dist=max_dist,step_val=step_dist,data_in=data_in) |
|
143 |
data_out1 <- dat$data #after subsampling using spatial proximity |
|
144 |
|
|
145 |
data <- aggregate(id ~ x + y , data=data_out1,min) |
|
146 |
coordinates(data) <- cbind(data$x,data$y) |
|
147 |
proj4string(data) <- proj4string(data_in) |
|
148 |
|
|
149 |
#once more we need to use only stations with id not replicates!! |
|
123 | 150 |
|
124 |
ind.selected <-match(as.character(data_out$id),as.character(data_in$id)) |
|
125 |
ind.removed <- setdiff(1:nrow(data_in), ind.selected) |
|
126 |
data_removed <- data_in[ind.removed,] |
|
151 |
station_nb <- nrow(data) |
|
152 |
|
|
153 |
if (station_nb > target_min_nb){ |
|
154 |
|
|
155 |
ind_s1 <- sample(nrow(data), size=target_range_nb[1], replace = FALSE, prob = NULL) #furhter sample |
|
156 |
ind_s2 <- setdiff(1:nrow(data), ind_s1) |
|
157 |
|
|
158 |
data_out_tmp <- data[ind_s1,] #selected the randomly sampled stations, only station location used here!! |
|
159 |
|
|
160 |
id_selected <- as.character(data_out_tmp$id) |
|
161 |
id_removed <- setdiff(unique(as.character(data$id)),as.character(data_out_tmp$id)) |
|
162 |
|
|
163 |
data_removed <- subset(data_out1, id %in% id_removed) |
|
164 |
#data_tmp <- data #store the reduced dataset with only id, for debugging purpose |
|
165 |
|
|
166 |
#data <- subset(data_in, id %in% data$id) #select based on id |
|
167 |
data_out <-subset(data_out1, id %in% id_selected) #select based on id |
|
168 |
|
|
169 |
#data_out_tmp <- data[ind_s1,] #selected the randomly sampled stations, only station location used here!! |
|
170 |
#ind.selected <- match(as.character(data_out$id),as.character(data_in$id)) |
|
171 |
#ind.removed <- setdiff(1:nrow(data_in), ind.selected) |
|
172 |
#data_removed <- data[ind.removed,] |
|
127 | 173 |
|
128 | 174 |
#Find the corresponding |
129 | 175 |
#data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations |
... | ... | |
137 | 183 |
names(data_obj) <- c("data","dist","data_removed","data_dist") |
138 | 184 |
} |
139 | 185 |
if(sampling!=T){ |
140 |
dat <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step=step_dist,data_in=data_month)
|
|
186 |
dat <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step_val=step_dist,data_in=data_in)
|
|
141 | 187 |
# |
142 | 188 |
data_obj <- list(dat$data,dat$dist,dat$data_rm) |
143 | 189 |
names(data_obj) <- c("data","dist","data_removed") |
... | ... | |
231 | 277 |
|
232 | 278 |
test5 <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_nb,dist_range=dist_range,step_dist=step_dist,data_in=data_month,sampling=T,combined=F) |
233 | 279 |
|
280 |
#### Now testing on NEX data for North America |
|
281 |
#daily sampling... |
|
282 |
|
|
283 |
path_tmp <- "/nobackupp6/aguzman4/climateLayers/output1000x3000_km/reg1/33.8_-93.3" |
|
284 |
setwd(path_tmp) |
|
285 |
#daily_covariates_ghcn_data_TMAX_2010_201133.8_-93.3.shp |
|
286 |
|
|
287 |
data_RST_SDF <- readOGR(".","daily_covariates_ghcn_data_TMAX_2010_201133.8_-93.3") |
|
288 |
dim(data_RST_SDF) |
|
289 |
|
|
290 |
|
|
291 |
target_max_nb <- 100000 #this is not actually used yet in the current implementation,can be set to very high value... |
|
292 |
target_min_nb <- 600 #this is the target number of stations we would like: to be set by Alberto... |
|
293 |
#max_dist <- 1000 # the maximum distance used for pruning ie removes stations that are closer than 1000m, this in degree...? |
|
294 |
max_idst <- 0.009*5 #5km in degree |
|
295 |
min_dist <- 0 #minimum distance to start with |
|
296 |
step_dist <- 0.009 #iteration step to remove the stations |
|
297 |
target_range_day_nb <- c(target_min_nb,target_max_nb) #set in master script and read in database_preparation script.. |
|
298 |
|
|
299 |
if(sub_sampling_day==TRUE){ |
|
300 |
|
|
301 |
sub_sampling_obj <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_day_nb, |
|
302 |
dist_range=dist_range,step_dist=step_dist,data_in=data_RST_SDF, |
|
303 |
sampling=T,combined=F) |
|
304 |
#data_RST_SDF <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations |
|
305 |
data_test <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations |
|
234 | 306 |
|
307 |
#save the information for later use (validation at monthly step!!) |
|
308 |
save(sub_sampling_obj,file= file.path(out_path,paste("sub_sampling_obj_","daily_",interpolation_method,"_", out_prefix,".RData",sep=""))) |
|
309 |
} |
|
310 |
|
|
311 |
dim(data_test) |
|
312 |
#> dim(data_test) #some replications |
|
313 |
#[1] 199755 72 |
|
314 |
unique(data_test$id) #this is 600 stations as requested! |
|
315 |
|
|
316 |
#### now deal with monthly data |
|
317 |
|
|
318 |
#monthly_covariates_ghcn_data_TMAX_2000_201133.8_-93.3.shp |
|
319 |
dst <- readOGR(".","monthly_covariates_ghcn_data_TMAX_2000_201133.8_-93.3") |
|
320 |
dst$id <- dst$station #must have an id column, this was added in database prepration script |
|
321 |
|
|
322 |
#This must be set up in master script |
|
323 |
target_max_nb <- 100000 #this is not actually used yet in the current implementation,can be set to very high value... |
|
324 |
target_min_nb <- 2500 #this is the target number of stations we would like: to be set by Alberto...#THIS IS DIFFERENT THAN DAILY |
|
325 |
#max_dist <- 1000 # the maximum distance used for pruning ie removes stations that are closer than 1000m, this in degree...? |
|
326 |
max_dist <- 0.009*5 #5km in degree |
|
327 |
min_dist <- 0 #minimum distance to start with |
|
328 |
step_dist <- 0.009 #iteration step to remove the stations |
|
329 |
target_range_nb <- c(target_min_nb,target_max_nb) |
|
330 |
#note that this is for monthly stations. |
|
331 |
|
|
332 |
if(sub_sampling==TRUE){ #sub_sampling is an option for the monthly station |
|
333 |
sub_sampling_obj <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_nb,dist_range=dist_range,step_dist=step_dist,data_in=dst,sampling=T,combined=F) |
|
334 |
data_test_month <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations |
|
335 |
#save the information for later use (validation at monthly step!!) |
|
336 |
save(sub_sampling_obj,file= file.path(out_path,paste("sub_sampling_obj_",interpolation_method,"_", out_prefix,".RData",sep=""))) |
|
337 |
} |
|
338 |
|
|
339 |
#> dim(data_test_month) |
|
340 |
#[1] 29418 68 |
|
341 |
#> length(unique(data_test_month$id)) |
|
342 |
#[1] 2500 |
|
343 |
|
|
344 |
#Ok working for monthly as well... |
|
345 |
|
|
346 |
############################ |
|
347 |
### Additional tile to test: Tile 2 on NEX with about 1.1 millon rows |
|
348 |
|
|
349 |
#37.6_-89.5 |
|
350 |
|
|
351 |
path_tmp <- "/nobackupp6/aguzman4/climateLayers/output1000x3000_km/reg1/37.6_-89.5" |
|
352 |
setwd(path_tmp) |
|
353 |
#daily_covariates_ghcn_data_TMAX_2010_201133.8_-93.3.shp |
|
354 |
|
|
355 |
data_RST_SDF <- readOGR(".","daily_covariates_ghcn_data_TMAX_2010_201137.6_-89.5") |
|
356 |
dim(data_RST_SDF) |
|
357 |
#> length(unique(data_RST_SDF$id)) |
|
358 |
#[1] 3298 |
|
359 |
#> dim(data_RST_SDF) |
|
360 |
#[1] 1117029 72 |
|
361 |
|
|
362 |
target_max_nb <- 100000 #this is not actually used yet in the current implementation,can be set to very high value... |
|
363 |
target_min_nb <- 600 #this is the target number of stations we would like: to be set by Alberto... |
|
364 |
#max_dist <- 1000 # the maximum distance used for pruning ie removes stations that are closer than 1000m, this in degree...? |
|
365 |
max_idst <- 0.009*5 #5km in degree |
|
366 |
min_dist <- 0 #minimum distance to start with |
|
367 |
step_dist <- 0.009 #iteration step to remove the stations |
|
368 |
target_range_day_nb <- c(target_min_nb,target_max_nb) #set in master script and read in database_preparation script.. |
|
369 |
|
|
370 |
if(sub_sampling_day==TRUE){ |
|
371 |
|
|
372 |
sub_sampling_obj <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_day_nb, |
|
373 |
dist_range=dist_range,step_dist=step_dist,data_in=data_RST_SDF, |
|
374 |
sampling=T,combined=F) |
|
375 |
#data_RST_SDF <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations |
|
376 |
data_test <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations |
|
377 |
|
|
378 |
#save the information for later use (validation at monthly step!!) |
|
379 |
save(sub_sampling_obj,file= file.path(out_path,paste("sub_sampling_obj_","daily_",interpolation_method,"_", out_prefix,".RData",sep=""))) |
|
380 |
} |
|
381 |
|
|
382 |
#Checking if it worked... |
|
383 |
dim(data_test) |
|
384 |
#> length(unique(data_test$id)) |
|
385 |
#[1] 600 |
|
386 |
#> dim(data_test) |
|
387 |
#[1] 204444 72 |
|
388 |
#unique(data_test$id) #this is 600 stations as requested! |
|
389 |
|
|
390 |
#### |
|
391 |
#### now deal with monthly data |
|
392 |
|
|
393 |
#monthly_covariates_ghcn_data_TMAX_2000_201137.6_-89.5.shp |
|
394 |
dst <- readOGR(".","monthly_covariates_ghcn_data_TMAX_2000_201137.6_-89.5") |
|
395 |
dst$id <- dst$station #must have an id column, this was added in database prepration script |
|
396 |
|
|
397 |
#This must be set up in master script |
|
398 |
target_max_nb <- 100000 #this is not actually used yet in the current implementation,can be set to very high value... |
|
399 |
target_min_nb <- 2500 #this is the target number of stations we would like: to be set by Alberto...#THIS IS DIFFERENT THAN DAILY |
|
400 |
#max_dist <- 1000 # the maximum distance used for pruning ie removes stations that are closer than 1000m, this in degree...? |
|
401 |
max_dist <- 0.009*5 #5km in degree |
|
402 |
min_dist <- 0 #minimum distance to start with |
|
403 |
step_dist <- 0.009 #iteration step to remove the stations |
|
404 |
target_range_nb <- c(target_min_nb,target_max_nb) |
|
405 |
#note that this is for monthly stations. |
|
406 |
|
|
407 |
if(sub_sampling==TRUE){ #sub_sampling is an option for the monthly station |
|
408 |
sub_sampling_obj <- sub_sampling_by_dist_nb_stat(target_range_nb=target_range_nb,dist_range=dist_range,step_dist=step_dist,data_in=dst,sampling=T,combined=F) |
|
409 |
data_test_month <- sub_sampling_obj$data #get sub-sampled data...for monhtly stations |
|
410 |
#save the information for later use (validation at monthly step!!) |
|
411 |
save(sub_sampling_obj,file= file.path(out_path,paste("sub_sampling_obj_",interpolation_method,"_", out_prefix,".RData",sep=""))) |
|
412 |
} |
|
413 |
|
|
414 |
#Ok worked too and very fast |
|
415 |
#> dim(data_test_month) |
|
416 |
#[1] 29450 68 |
|
417 |
#> length(unique(data_test_month$id)) |
|
418 |
#[1] 2500 |
|
419 |
|
|
235 | 420 |
############ END OF SCRIPT ######### |
Also available in: Unified diff
subsampling script, major modifications to solve issues related to North America runs