1 |
e38d3d87
|
Benoit Parmentier
|
sampling_training_testing<-function(list_param_sampling){
|
2 |
|
|
|
3 |
33517b7d
|
Benoit Parmentier
|
#This function creates testing and training list for input sation data based on a list of dates.
|
4 |
|
|
#This function works for montly time scale if dates are provided as mid-months or other forms of for monthly records.
|
5 |
e38d3d87
|
Benoit Parmentier
|
#It requires 6 inputs:
|
6 |
|
|
# 1) seed_number: allow comparison across runs, if seed zero then no seed number is used
|
7 |
|
|
# 2) nb_sample: number of time random sampling must be repeated for every hold out proportion
|
8 |
|
|
# 3) step : step for proportion range
|
9 |
|
|
# 4) constant: if value 1 then use the same samples as date one for the all set of dates
|
10 |
|
|
# 5) prop_minmax: if prop_min=prop_max and step=0 then predicitons are done for the number of dates...
|
11 |
|
|
# 6) dates: list of dates for prediction and subsetting
|
12 |
|
|
# 7) ghcn: station data as data.frame -daily input
|
13 |
|
|
# 11) out_prefix: output suffix added to output names--it is the same in the interpolation script
|
14 |
|
|
#
|
15 |
|
|
#The output is a list of four shapefile names produced by the function:
|
16 |
|
|
# 1) sampling_dat: sampling information for every run by date and sampling combintation
|
17 |
|
|
# 2) sampling_index: list of indexes for training and testing for every dates
|
18 |
|
|
# 3) sampling_stat_id: list of station ID for training and testing for every dates
|
19 |
33517b7d
|
Benoit Parmentier
|
# 4) ghcn_data: ghcn subsets by date, can be monthly or daily with mulitple sampling
|
20 |
e38d3d87
|
Benoit Parmentier
|
|
21 |
|
|
#AUTHOR: Benoit Parmentier
|
22 |
33517b7d
|
Benoit Parmentier
|
#DATE: 08/25/2013
|
23 |
e38d3d87
|
Benoit Parmentier
|
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363, TASK#558--
|
24 |
|
|
#Comments and TODO
|
25 |
|
|
#
|
26 |
|
|
##################################################################################################
|
27 |
|
|
|
28 |
|
|
#Parsing input arguments
|
29 |
|
|
|
30 |
|
|
seed_number <-list_param_sampling$seed_number
|
31 |
|
|
nb_sample <- list_param_sampling$nb_sample
|
32 |
|
|
step<-list_param_sampling$step #if seed zero then no seed?
|
33 |
|
|
constant <- list_param_sampling$constant
|
34 |
|
|
prop_minmax<-list_param_sampling$prop_minmax
|
35 |
|
|
dates<-list_param_sampling$dates
|
36 |
|
|
#ghcn_name<-list_param_sampling$ghcn_name
|
37 |
33517b7d
|
Benoit Parmentier
|
ghcn<-list_param_sampling$ghcn #can be daily or monthly!!
|
38 |
e38d3d87
|
Benoit Parmentier
|
#ghcn<-get(ghcn_name)
|
39 |
|
|
|
40 |
|
|
### BEGIN FUNCTION ####
|
41 |
|
|
|
42 |
|
|
if (seed_number>0) {
|
43 |
|
|
set.seed(seed_number) #Using a seed number allow results based on random number to be compared...
|
44 |
|
|
}
|
45 |
|
|
|
46 |
|
|
nel<-length(dates)
|
47 |
|
|
dates_list<-vector("list",nel) #list of one row data.frame
|
48 |
|
|
prop_min<-prop_minmax[1]
|
49 |
|
|
prop_max<-prop_minmax[2]
|
50 |
33517b7d
|
Benoit Parmentier
|
|
51 |
e38d3d87
|
Benoit Parmentier
|
prop_range<-(seq(from=prop_min,to=prop_max,by=step))*100 #range of proportion to run
|
52 |
|
|
sn<-length(dates)*nb_sample*length(prop_range) #Number of samples to run
|
53 |
|
|
|
54 |
|
|
for(i in 1:length(dates)){
|
55 |
|
|
d_tmp<-rep(dates[i],nb_sample*length(prop_range)) #repeating same date
|
56 |
|
|
s_nb<-rep(1:nb_sample,length(prop_range)) #number of random sample per proportion
|
57 |
|
|
prop_tmp<-sort(rep(prop_range, nb_sample))
|
58 |
|
|
tab_run_tmp<-cbind(d_tmp,s_nb,prop_tmp)
|
59 |
|
|
dates_list[[i]]<-tab_run_tmp
|
60 |
|
|
}
|
61 |
|
|
|
62 |
|
|
sampling_dat<-as.data.frame(do.call(rbind,dates_list))
|
63 |
|
|
names(sampling_dat)<-c("date","run_samp","prop")
|
64 |
|
|
|
65 |
|
|
for(i in 2:3){ # start of the for loop #1
|
66 |
|
|
sampling_dat[,i]<-as.numeric(as.character(sampling_dat[,i]))
|
67 |
|
|
}
|
68 |
5eb62e61
|
Benoit Parmentier
|
|
69 |
e38d3d87
|
Benoit Parmentier
|
sampling_dat$date<- as.character(sampling_dat[,1])
|
70 |
|
|
#ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
|
71 |
|
|
ghcn.subsets <-lapply(as.character(sampling_dat$date), function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
|
72 |
|
|
|
73 |
|
|
#Make this a function??
|
74 |
|
|
## adding choice of constant sample
|
75 |
|
|
|
76 |
|
|
if (seed_number>0) {
|
77 |
|
|
set.seed(seed_number) #Using a seed number allow results based on random number to be compared...
|
78 |
|
|
}
|
79 |
|
|
|
80 |
|
|
sampling<-vector("list",length(ghcn.subsets))
|
81 |
|
|
sampling_station_id<-vector("list",length(ghcn.subsets))
|
82 |
|
|
for(i in 1:length(ghcn.subsets)){
|
83 |
|
|
n<-nrow(ghcn.subsets[[i]])
|
84 |
|
|
prop<-(sampling_dat$prop[i])/100
|
85 |
|
|
ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows
|
86 |
|
|
nv<-n-ns #create a sample for validation with prop of the rows
|
87 |
|
|
ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
|
88 |
|
|
ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
|
89 |
|
|
#Find the corresponding
|
90 |
|
|
data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
|
91 |
|
|
station_id.training<-data_sampled$station #selected id for the randomly sampled stations (115)
|
92 |
|
|
#Save the information
|
93 |
|
|
sampling[[i]]<-ind.training #index of training sample from data.frame
|
94 |
|
|
sampling_station_id[[i]]<- station_id.training #station ID for traning samples
|
95 |
|
|
}
|
96 |
|
|
## Use same samples across the year...
|
97 |
|
|
if (constant==1){
|
98 |
|
|
sampled<-sampling[[1]]
|
99 |
|
|
data_sampled<-ghcn.subsets[[1]][sampled,] #selected the randomly sampled stations
|
100 |
|
|
station_sampled<-data_sampled$station #selected id for the randomly sampled stations (115)
|
101 |
|
|
list_const_sampling<-vector("list",sn)
|
102 |
|
|
list_const_sampling_station_id<-vector("list",sn)
|
103 |
|
|
for(i in 1:sn){
|
104 |
|
|
station_id.training<-intersect(station_sampled,ghcn.subsets[[i]]$station)
|
105 |
|
|
ind.training<-match(station_id.training,ghcn.subsets[[i]]$station)
|
106 |
|
|
list_const_sampling[[i]]<-ind.training
|
107 |
|
|
list_const_sampling_station_id[[i]]<-station_id.training
|
108 |
|
|
}
|
109 |
|
|
sampling<-list_const_sampling
|
110 |
|
|
sampling_station_id<-list_const_sampling_station_id
|
111 |
|
|
}
|
112 |
|
|
|
113 |
|
|
sampling_obj<-list(sampling_dat,sampling,sampling_station_id,ghcn.subsets)
|
114 |
33517b7d
|
Benoit Parmentier
|
names(sampling_obj)<- c("sampling_dat","sampling_index","sampling_stat_id","ghcn_data")
|
115 |
e38d3d87
|
Benoit Parmentier
|
|
116 |
|
|
return(sampling_obj)
|
117 |
|
|
|
118 |
|
|
}
|