Project

General

Profile

« Previous | Next » 

Revision 764c7d94

Added by Benoit Parmentier about 10 years ago

cleaning up and testing

View differences:

climate/research/oregon/interpolation/subsampling_data.R
1 1
####################################  INTERPOLATION OF TEMPERATURES  #######################################
2
############################  Script for manuscript analyses,tables and figures #######################################
3
#This script uses the worklfow code applied to the Oregon case study. Daily methods (GAM,GWR, Kriging) are tested with
4
#different covariates using two baselines. Accuracy methods are added in the the function script to evaluate results.
5
#Figures, tables and data for the contribution of covariate paper are also produced in the script.
2
############################  Script to prune and sample stations data #######################################
3
#This script uses the worklfow code applied to the globe. Results currently reside on NEX/PLEIADES NASA and NCEAS UCSB.
4
#The purpose is sub sample stations data in areas where there are a very high number of stations.
5
#
6 6
#AUTHOR: Benoit Parmentier                                                                      
7
#MODIFIED ON: 09/11/2014            
8
#Version: 5
9
#PROJECT: Environmental Layers project                                     
7
#CREATED ON: 10/16/2014            
8
#MODIFIED ON: 10/23/2014            
9
#Version: 1
10
#
11
#PROJECT: Environmental Layers project  NCEAS-NASA
12
#TO DO:
13
#
14

  
10 15
#################################################################################################
11 16

  
12 17
### Loading R library and packages        
......
39 44

  
40 45
#### FUNCTION USED IN SCRIPT
41 46

  
42
function_analyses_paper1 <-"contribution_of_covariates_paper_interpolation_functions_07182014.R" #first interp paper
43
function_analyses_paper2 <-"multi_timescales_paper_interpolation_functions_10062014.R"
44

  
45
##############################
46
#### Parameters and constants  
47

  
48
script_path<-"/home/parmentier/Data/IPLANT_project/env_layers_scripts/" #path to script
49
source(file.path(script_path,function_analyses_paper1)) #source all functions used in this script 1.
50
source(file.path(script_path,function_analyses_paper2)) #source all functions used in this script 2.
51

  
52
#Multi time scale - CAI: gam, kriging, gwr
53
in_dir4 <-"/data/project/layers/commons/Oregon_interpolation/output_data_365d_gam_cai_lst_comb5_11032013"
54
raster_obj_file_4 <- "raster_prediction_obj_gam_CAI_dailyTmax_365d_gam_cai_lst_comb5_11032013.RData"
47
function_analyses_paper1 <- "contribution_of_covariates_paper_interpolation_functions_07182014.R" #first interp paper
48
function_analyses_paper2 <- "multi_timescales_paper_interpolation_functions_10062014.R"
55 49

  
56
raster_prediction_obj_4 <- load_obj(file.path(in_dir4,raster_obj_file_4))
57

  
58
names(raster_prediction_obj_4)
59
names(raster_prediction_obj_4$method_mod_obj[[1]])
60

  
61
(raster_prediction_obj_4$method_mod_obj[[1]]$data_month_v) #no holdout...
62
data_month <- raster_prediction_obj_4$method_mod_obj[[1]]$data_month_s #no holdout...
63
dim(data_month)
64
test <- zerodist(data_month, zero = 0.0, unique.ID = FALSE)
65

  
66
target_max_nb <- 200
67
target_min_nb <- 100
68
max_dist <- 10
69
min_dist <- 0
70
step_dist <- 1000
71
target_range_nb <- c(target_min_nb,target_max_nb)
72
#debug(sub_sampling_by_dist)
73
#First increase distance till 5km
74
#then use random sampling...to get the extact target?
75
test <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month)
76
test <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step=step_dist,data_in=data_month)
77

  
78
dist_range <- c(0,5000) 
79

  
80
sub_sampling_by_dist_nb_stat(target_range_nb=c(10000,10000),dist_range,data_in,sampling=T){
81

  
82
  
83 50
sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist=0.0,max_dist=NULL,step,data_in){
51
  #Function to select stations data that are outside a specific spatial range from each other
52
  #Parameters:
53
  #max_dist: maximum spatial distance at which to stop the pruning
54
  #min_dist: minimum distance to start pruning the data
55
  #step: spatial distance increment
56

  
84 57
  data <- data_in
85 58
  target_min_nb <- target_range_nb[1]
86 59
  station_nb <- nrow(data_in)
......
92 65
    }
93 66
  }
94 67
  if(!is.null(max_dist)){
95
    while(station_nb > target_min_nb || dist < max_dist){ 
96
      d#} #|| nrow > 0){
97
      #test <- zerodist(data, zero = 0.0, unique.ID = FALSE)
98
      #test <- remove.duplicates(data_month, zero = 5000)
68
    
69
    while(station_nb > target_min_nb & dist < max_dist){ 
99 70
      data <- remove.duplicates(data, zero = dist) #spatially sub sample...
71
      id_rm <- zerodist(data, zero = dist, unique.ID = FALSE)
72
      data_rm <- data[id_rm,]
100 73
      dist <- dist + step
101 74
      station_nb <- nrow(data)
102 75
    }
......
107 80
  return(obj_sub_sampling)
108 81
}
109 82

  
110
sub_sampling_by_dist_nb_stat <- function(target_range_nb=c(10000,10000),dist_range,data_in,sampling=T){
83
sub_sampling_by_dist_nb_stat <- function(target_range_nb,dist_range,data_in,sampling=T,combined=F){
111 84
  
112 85
  data <- data_in
113 86
  min_dist <- dist_range[1]
......
115 88
  
116 89
  if(sampling==T){
117 90
    dat <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month)
118
    data_out <-sample(dat$data, target_range_nb[2], replace = FALSE, prob = NULL)
119
    data_out <- list(data_out,dat$dist,dat$data)
120
    data_out <- c("data","dist","data_dist")
91
    ind_s1  <- sample(nrow(dat$data), size=target_range_nb[1], replace = FALSE, prob = NULL)
92
    ind_s2 <- setdiff(1:nrow(dat$data), ind_s1)
93
    data_out <- dat$data[ind_s1,] #selected the randomly sampled stations
94
    data_removed <- dat[ind_s2,]
95
    
96
    #Find the corresponding 
97
    #data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
98

  
99
    data_out <- list(data_out,dat$dist,data_removed,dat$data)
100
    data_out <- c("data","dist","data_removed","data_dist")
121 101
  }
122 102
  if(sampling!=T){
123
    data_out <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step=step_dist,data_in=data_month)
103
    dat <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step=step_dist,data_in=data_month)
104
    #
105
    data_out <- list(dat$data,dat$dist,data_removed)
106
    data_out <- c("data","dist","data_removed")
107
    
124 108
  }
125 109
  return(data_out)
126 110
}
127 111

  
128
#sub_sampling_by_dist <- function(target_range_nb=c(10000,10000),dist=0.0,step,data_in){
129
#  data <- data_in
130
#  target_min_nb <- target_range_nb[1]
131
#  station_nb <- nrow(data_in)
132
#  while(station_nb > target_min_nb){  #} #|| nrow > 0){
133
#      #test <- zerodist(data, zero = 0.0, unique.ID = FALSE)
134
#      #test <- remove.duplicates(data_month, zero = 5000)
135
#      data <- remove.duplicates(data, zero = dist) #spatially sub sample...
136
#      dist <- dist + step
137
#      station_nb <- nrow(data)
138
#  }
139
#  obj_sub_sampling <- list(data,dist)
140
#  names(obj_sub_sampling) <- c("data","dist")
141
#  return(obj_sub_sampling)
142
#}
112
debug(sub_sampling_by_dist_nb_stat)
113
test3 <- sub_sampling_by_dist_nb_stat(target_range_nb=c(100,200),dist_range=c(0,1000),data_in=data_month,sampling=T,combined=F)
114

  
115

  
116
#n<-nrow(ghcn.subsets[[i]])
117
#prop<-(sampling_dat$prop[i])/100
118
#ns<-n-round(n*prop)   #Create a sample from the data frame with 70% of the rows
119
#nv<-n-ns              #create a sample for validation with prop of the rows
120
#ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
121
#ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
122
#Find the corresponding 
123
#data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
124
#station_id.training<-data_sampled$station     #selected id for the randomly sampled stations (115)
125
#Save the information
126
#sampling[[i]]<-ind.training #index of training sample from data.frame
127
#sampling_station_id[[i]]<- station_id.training #station ID for traning samples
128

  
129
##############################
130
#### Parameters and constants  
131

  
132
script_path<-"/home/parmentier/Data/IPLANT_project/env_layers_scripts/" #path to script
133
source(file.path(script_path,function_analyses_paper1)) #source all functions used in this script 1.
134
source(file.path(script_path,function_analyses_paper2)) #source all functions used in this script 2.
135

  
136
#Multi time scale - CAI: gam, kriging, gwr
137
in_dir4 <-"/data/project/layers/commons/Oregon_interpolation/output_data_365d_gam_cai_lst_comb5_11032013" #data used in multi-timescale paper
138
raster_obj_file_4 <- "raster_prediction_obj_gam_CAI_dailyTmax_365d_gam_cai_lst_comb5_11032013.RData"
139

  
140
###################################################################
141
############### BEGIN SCRIPT ##################################
142

  
143
raster_prediction_obj_4 <- load_obj(file.path(in_dir4,raster_obj_file_4)) #load raster object to get monthly data
144

  
145
names(raster_prediction_obj_4)
146
names(raster_prediction_obj_4$method_mod_obj[[1]])
143 147

  
148
raster_prediction_obj_4$method_mod_obj[[1]]$data_month_v #no holdout...
149
data_month <- raster_prediction_obj_4$method_mod_obj[[1]]$data_month_s #no holdout...
150
dim(data_month) #193x70, there are 70 stations in this particular case
151

  
152
plot(data_month)
153

  
154
#set up input parameters
155

  
156
target_max_nb <- 200 #this is not actually used yet in the current implementation
157
target_min_nb <- 100 #this is the target number of stations we would like
158
max_dist <- 1000 # the maximum distance used for pruning ie removes stations that are closer than 1000m 
159
min_dist <- 0    #minimum distance to start with
160
step_dist <- 1000 #iteration step to remove the stations
161
target_range_nb <- c(target_min_nb,target_max_nb) #target range of number of stations
162
#First increase distance till 5km
163
#then use random sampling...to get the extact target?
164
#First test with maximum distance of 100m
165
test1 <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=max_dist,step=step_dist,data_in=data_month)
166
#Second test with no maximum distance: the process of spatial pruning only stops when the number of stations is below
167
#the minimum target number
168
test2 <- sub_sampling_by_dist(target_range_nb,dist=min_dist,max_dist=NULL,step=step_dist,data_in=data_month)
169
#set max distance to NULL
170

  
171
dim(test1$data) #192 stations selected
172
test1$dist # for distance of 1000 m (max_dist)
173
dim(test2$data) #97 stations selected 
174
test2$dist # for distance of 31,000 m (no max_dist is set)
175

  
176
dist_range <- c(0,5000) 
177

  
178
#Now use the other function to sample the station data points:
179

  
180
sub_sampling_by_dist_nb_stat(target_range_nb=c(10000,10000),dist_range,data_in,sampling=T)
181

  
182

  
183
#n<-nrow(ghcn.subsets[[i]])
184
#prop<-(sampling_dat$prop[i])/100
185
#ns<-n-round(n*prop)   #Create a sample from the data frame with 70% of the rows
186
#nv<-n-ns              #create a sample for validation with prop of the rows
187
#ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
188
#ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
189
#Find the corresponding 
190
#data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
191
#station_id.training<-data_sampled$station     #selected id for the randomly sampled stations (115)
192
#Save the information
193
#sampling[[i]]<-ind.training #index of training sample from data.frame
194
#sampling_station_id[[i]]<- station_id.training #station ID for traning samples
195
  
144 196
############ END OF SCRIPT #########

Also available in: Unified diff