Revision 764c7d94
Added by Benoit Parmentier about 10 years ago
climate/research/oregon/interpolation/subsampling_data.R | ||
---|---|---|
1 | 1 |
#################################### INTERPOLATION OF TEMPERATURES ####################################### |
2 |
############################ Script for manuscript analyses,tables and figures #######################################
|
|
3 |
#This script uses the workflow code applied to the Oregon case study. Daily methods (GAM,GWR, Kriging) are tested with
|
|
4 |
#different covariates using two baselines. Accuracy methods are added in the function script to evaluate results.
|
|
5 |
#Figures, tables and data for the contribution of covariate paper are also produced in the script.
|
|
2 |
############################ Script to prune and sample stations data #######################################
|
|
3 |
#This script uses the workflow code applied to the globe. Results currently reside on NEX/PLEIADES NASA and NCEAS UCSB.
|
|
4 |
#The purpose is to sub-sample station data in areas where there is a very high number of stations.
|
|
5 |
# |
|
6 | 6 |
#AUTHOR: Benoit Parmentier |
7 |
#MODIFIED ON: 09/11/2014 |
|
8 |
#Version: 5 |
|
9 |
#PROJECT: Environmental Layers project |
|
7 |
#CREATED ON: 10/16/2014 |
|
8 |
#MODIFIED ON: 10/23/2014 |
|
9 |
#Version: 1 |
|
10 |
# |
|
11 |
#PROJECT: Environmental Layers project NCEAS-NASA |
|
12 |
#TO DO: |
|
13 |
# |
|
14 |
|
|
10 | 15 |
################################################################################################# |
11 | 16 |
|
12 | 17 |
### Loading R library and packages |
... | ... | |
39 | 44 |
|
40 | 45 |
#### FUNCTION USED IN SCRIPT |
41 | 46 |
|
42 |
function_analyses_paper1 <-"contribution_of_covariates_paper_interpolation_functions_07182014.R" #first interp paper |
|
43 |
function_analyses_paper2 <-"multi_timescales_paper_interpolation_functions_10062014.R" |
|
44 |
|
|
45 |
############################## |
|
46 |
#### Parameters and constants |
|
47 |
|
|
48 |
script_path<-"/home/parmentier/Data/IPLANT_project/env_layers_scripts/" #path to script |
|
49 |
source(file.path(script_path,function_analyses_paper1)) #source all functions used in this script 1. |
|
50 |
source(file.path(script_path,function_analyses_paper2)) #source all functions used in this script 2. |
|
51 |
|
|
52 |
#Multi time scale - CAI: gam, kriging, gwr |
|
53 |
in_dir4 <-"/data/project/layers/commons/Oregon_interpolation/output_data_365d_gam_cai_lst_comb5_11032013" |
|
54 |
raster_obj_file_4 <- "raster_prediction_obj_gam_CAI_dailyTmax_365d_gam_cai_lst_comb5_11032013.RData" |
|
47 |
function_analyses_paper1 <- "contribution_of_covariates_paper_interpolation_functions_07182014.R" #first interp paper |
|
48 |
function_analyses_paper2 <- "multi_timescales_paper_interpolation_functions_10062014.R" |
|
55 | 49 |
|
56 |
raster_prediction_obj_4 <- load_obj(file.path(in_dir4,raster_obj_file_4)) |
|
57 |
|
|
58 |
names(raster_prediction_obj_4) |
|
59 |
names(raster_prediction_obj_4$method_mod_obj[[1]]) |
|
60 |
|
|
61 |
(raster_prediction_obj_4$method_mod_obj[[1]]$data_month_v) #no holdout... |
|
62 |
data_month <- raster_prediction_obj_4$method_mod_obj[[1]]$data_month_s #no holdout... |
|
63 |
dim(data_month) |
|
64 |
test <- zerodist(data_month, zero = 0.0, unique.ID = FALSE) |
|
65 |
|
|
66 |
target_max_nb <- 200 |
|
67 |
target_min_nb <- 100 |
|
68 |
max_dist <- 10 |
|
69 |
min_dist <- 0 |
|
70 |
step_dist <- 1000 |
|
71 |
target_range_nb <- c(target_min_nb,target_max_nb) |
|
72 |
#debug(sub_sampling_by_dist) |
|
73 |
#First increase distance till 5km |
|
74 |
#then use random sampling...to get the exact target?
|
75 |
test <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month) |
|
76 |
test <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step=step_dist,data_in=data_month) |
|
77 |
|
|
78 |
dist_range <- c(0,5000) |
|
79 |
|
|
80 |
sub_sampling_by_dist_nb_stat(target_range_nb=c(10000,10000),dist_range,data_in,sampling=T){ |
|
81 |
|
|
82 |
|
|
83 | 50 |
sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist=0.0,max_dist=NULL,step,data_in){ |
51 |
#Function to select stations data that are outside a specific spatial range from each other |
|
52 |
#Parameters: |
|
53 |
#max_dist: maximum spatial distance at which to stop the pruning |
|
54 |
#min_dist: minimum distance to start pruning the data |
|
55 |
#step: spatial distance increment |
|
56 |
|
|
84 | 57 |
data <- data_in |
85 | 58 |
target_min_nb <- target_range_nb[1] |
86 | 59 |
station_nb <- nrow(data_in) |
... | ... | |
92 | 65 |
} |
93 | 66 |
} |
94 | 67 |
if(!is.null(max_dist)){ |
95 |
while(station_nb > target_min_nb || dist < max_dist){ |
|
96 |
d#} #|| nrow > 0){ |
|
97 |
#test <- zerodist(data, zero = 0.0, unique.ID = FALSE) |
|
98 |
#test <- remove.duplicates(data_month, zero = 5000) |
|
68 |
|
|
69 |
while(station_nb > target_min_nb & dist < max_dist){ |
|
99 | 70 |
data <- remove.duplicates(data, zero = dist) #spatially sub sample... |
71 |
id_rm <- zerodist(data, zero = dist, unique.ID = FALSE) |
|
72 |
data_rm <- data[id_rm,] |
|
100 | 73 |
dist <- dist + step |
101 | 74 |
station_nb <- nrow(data) |
102 | 75 |
} |
... | ... | |
107 | 80 |
return(obj_sub_sampling) |
108 | 81 |
} |
109 | 82 |
|
110 |
sub_sampling_by_dist_nb_stat <- function(target_range_nb=c(10000,10000),dist_range,data_in,sampling=T){
|
|
83 |
sub_sampling_by_dist_nb_stat <- function(target_range_nb,dist_range,data_in,sampling=T,combined=F){
|
|
111 | 84 |
|
112 | 85 |
data <- data_in |
113 | 86 |
min_dist <- dist_range[1] |
... | ... | |
115 | 88 |
|
116 | 89 |
if(sampling==T){ |
117 | 90 |
dat <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month) |
118 |
data_out <-sample(dat$data, target_range_nb[2], replace = FALSE, prob = NULL) |
|
119 |
data_out <- list(data_out,dat$dist,dat$data) |
|
120 |
data_out <- c("data","dist","data_dist") |
|
91 |
ind_s1 <- sample(nrow(dat$data), size=target_range_nb[1], replace = FALSE, prob = NULL) |
|
92 |
ind_s2 <- setdiff(1:nrow(dat$data), ind_s1) |
|
93 |
data_out <- dat$data[ind_s1,] #selected the randomly sampled stations |
|
94 |
data_removed <- dat[ind_s2,] |
|
95 |
|
|
96 |
#Find the corresponding |
|
97 |
#data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations |
|
98 |
|
|
99 |
data_out <- list(data_out,dat$dist,data_removed,dat$data) |
|
100 |
data_out <- c("data","dist","data_removed","data_dist") |
|
121 | 101 |
} |
122 | 102 |
if(sampling!=T){ |
123 |
data_out <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step=step_dist,data_in=data_month) |
|
103 |
dat <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step=step_dist,data_in=data_month) |
|
104 |
# |
|
105 |
data_out <- list(dat$data,dat$dist,data_removed) |
|
106 |
data_out <- c("data","dist","data_removed") |
|
107 |
|
|
124 | 108 |
} |
125 | 109 |
return(data_out) |
126 | 110 |
} |
127 | 111 |
|
128 |
#sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist=0.0,step,data_in){ |
|
129 |
# data <- data_in |
|
130 |
# target_min_nb <- target_range_nb[1] |
|
131 |
# station_nb <- nrow(data_in) |
|
132 |
# while(station_nb > target_min_nb){ #} #|| nrow > 0){ |
|
133 |
# #test <- zerodist(data, zero = 0.0, unique.ID = FALSE) |
|
134 |
# #test <- remove.duplicates(data_month, zero = 5000) |
|
135 |
# data <- remove.duplicates(data, zero = dist) #spatially sub sample... |
|
136 |
# dist <- dist + step |
|
137 |
# station_nb <- nrow(data) |
|
138 |
# } |
|
139 |
# obj_sub_sampling <- list(data,dist) |
|
140 |
# names(obj_sub_sampling) <- c("data","dist") |
|
141 |
# return(obj_sub_sampling) |
|
142 |
#} |
|
112 |
debug(sub_sampling_by_dist_nb_stat) |
|
113 |
test3 <- sub_sampling_by_dist_nb_stat(target_range_nb=c(100,200),dist_range=c(0,1000),data_in=data_month,sampling=T,combined=F) |
|
114 |
|
|
115 |
|
|
116 |
#n<-nrow(ghcn.subsets[[i]]) |
|
117 |
#prop<-(sampling_dat$prop[i])/100 |
|
118 |
#ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
119 |
#nv<-n-ns #create a sample for validation with prop of the rows |
|
120 |
#ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly |
|
121 |
#ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training) |
|
122 |
#Find the corresponding |
|
123 |
#data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations |
|
124 |
#station_id.training<-data_sampled$station #selected id for the randomly sampled stations (115) |
|
125 |
#Save the information |
|
126 |
#sampling[[i]]<-ind.training #index of training sample from data.frame |
|
127 |
#sampling_station_id[[i]]<- station_id.training #station ID for training samples
|
128 |
|
|
129 |
############################## |
|
130 |
#### Parameters and constants |
|
131 |
|
|
132 |
script_path<-"/home/parmentier/Data/IPLANT_project/env_layers_scripts/" #path to script |
|
133 |
source(file.path(script_path,function_analyses_paper1)) #source all functions used in this script 1. |
|
134 |
source(file.path(script_path,function_analyses_paper2)) #source all functions used in this script 2. |
|
135 |
|
|
136 |
#Multi time scale - CAI: gam, kriging, gwr |
|
137 |
in_dir4 <-"/data/project/layers/commons/Oregon_interpolation/output_data_365d_gam_cai_lst_comb5_11032013" #data used in multi-timescale paper |
|
138 |
raster_obj_file_4 <- "raster_prediction_obj_gam_CAI_dailyTmax_365d_gam_cai_lst_comb5_11032013.RData" |
|
139 |
|
|
140 |
################################################################### |
|
141 |
############### BEGIN SCRIPT ################################## |
|
142 |
|
|
143 |
raster_prediction_obj_4 <- load_obj(file.path(in_dir4,raster_obj_file_4)) #load raster object to get monthly data |
|
144 |
|
|
145 |
names(raster_prediction_obj_4) |
|
146 |
names(raster_prediction_obj_4$method_mod_obj[[1]]) |
|
143 | 147 |
|
148 |
raster_prediction_obj_4$method_mod_obj[[1]]$data_month_v #no holdout... |
|
149 |
data_month <- raster_prediction_obj_4$method_mod_obj[[1]]$data_month_s #no holdout... |
|
150 |
dim(data_month) #193x70, there are 70 stations in this particular case |
|
151 |
|
|
152 |
plot(data_month) |
|
153 |
|
|
154 |
#set up input parameters |
|
155 |
|
|
156 |
target_max_nb <- 200 #this is not actually used yet in the current implementation |
|
157 |
target_min_nb <- 100 #this is the target number of stations we would like |
|
158 |
max_dist <- 1000 # the maximum distance used for pruning ie removes stations that are closer than 1000m |
|
159 |
min_dist <- 0 #minimum distance to start with |
|
160 |
step_dist <- 1000 #iteration step to remove the stations |
|
161 |
target_range_nb <- c(target_min_nb,target_max_nb) #target range of number of stations |
|
162 |
#First increase distance till 5km |
|
163 |
#then use random sampling...to get the exact target?
|
164 |
#First test with maximum distance of 1000 m (max_dist set above)
|
165 |
test1 <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month) |
|
166 |
#Second test with no maximum distance: the process of spatial pruning only stops when the number of stations is below |
|
167 |
#the minimum target number |
|
168 |
test2 <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step=step_dist,data_in=data_month) |
|
169 |
#set max distance to NULL |
|
170 |
|
|
171 |
dim(test1$data) #192 stations selected |
|
172 |
test1$dist # for distance of 1000 m (max_dist) |
|
173 |
dim(test2$data) #97 stations selected |
|
174 |
test2$dist # for distance of 31,000 m (no max_dist is set) |
|
175 |
|
|
176 |
dist_range <- c(0,5000) |
|
177 |
|
|
178 |
#Now use the other function to sample the station data points: |
|
179 |
|
|
180 |
sub_sampling_by_dist_nb_stat(target_range_nb=c(10000,10000),dist_range,data_in,sampling=T) |
|
181 |
|
|
182 |
|
|
183 |
#n<-nrow(ghcn.subsets[[i]]) |
|
184 |
#prop<-(sampling_dat$prop[i])/100 |
|
185 |
#ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
186 |
#nv<-n-ns #create a sample for validation with prop of the rows |
|
187 |
#ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly |
|
188 |
#ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training) |
|
189 |
#Find the corresponding |
|
190 |
#data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations |
|
191 |
#station_id.training<-data_sampled$station #selected id for the randomly sampled stations (115) |
|
192 |
#Save the information |
|
193 |
#sampling[[i]]<-ind.training #index of training sample from data.frame |
|
194 |
#sampling_station_id[[i]]<- station_id.training #station ID for training samples
|
195 |
|
|
144 | 196 |
############ END OF SCRIPT ######### |
Also available in: Unified diff
cleaning up and testing