1
|
sampling_training_testing<-function(list_param_sampling){
|
2
|
|
3
|
#This function creates testing and training list for input sation data based on a list of dates.
|
4
|
#This function works for montly time scale if dates are provided as mid-months or other forms of for monthly records.
|
5
|
#It requires 6 inputs:
|
6
|
# 1) seed_number: allow comparison across runs, if seed zero then no seed number is used
|
7
|
# 2) nb_sample: number of time random sampling must be repeated for every hold out proportion
|
8
|
# 3) step : step for proportion range
|
9
|
# 4) constant: if value 1 then use the same samples as date one for the all set of dates
|
10
|
# 5) prop_minmax: if prop_min=prop_max and step=0 then predicitons are done for the number of dates...
|
11
|
# 6) dates: list of dates for prediction and subsetting
|
12
|
# 7) ghcn: station data as data.frame -daily input
|
13
|
# 11) out_prefix: output suffix added to output names--it is the same in the interpolation script
|
14
|
#
|
15
|
#The output is a list of four shapefile names produced by the function:
|
16
|
# 1) sampling_dat: sampling information for every run by date and sampling combintation
|
17
|
# 2) sampling_index: list of indexes for training and testing for every dates
|
18
|
# 3) sampling_stat_id: list of station ID for training and testing for every dates
|
19
|
# 4) ghcn_data: ghcn subsets by date, can be monthly or daily with mulitple sampling
|
20
|
|
21
|
#AUTHOR: Benoit Parmentier
|
22
|
#DATE: 08/25/2013
|
23
|
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363, TASK#558--
|
24
|
#Comments and TODO
|
25
|
#
|
26
|
##################################################################################################
|
27
|
|
28
|
#Parsing input arguments
|
29
|
|
30
|
seed_number <-list_param_sampling$seed_number
|
31
|
nb_sample <- list_param_sampling$nb_sample
|
32
|
step<-list_param_sampling$step #if seed zero then no seed?
|
33
|
constant <- list_param_sampling$constant
|
34
|
prop_minmax<-list_param_sampling$prop_minmax
|
35
|
dates<-list_param_sampling$dates
|
36
|
#ghcn_name<-list_param_sampling$ghcn_name
|
37
|
ghcn<-list_param_sampling$ghcn #can be daily or monthly!!
|
38
|
#ghcn<-get(ghcn_name)
|
39
|
|
40
|
### BEGIN FUNCTION ####
|
41
|
|
42
|
if (seed_number>0) {
|
43
|
set.seed(seed_number) #Using a seed number allow results based on random number to be compared...
|
44
|
}
|
45
|
|
46
|
nel<-length(dates)
|
47
|
dates_list<-vector("list",nel) #list of one row data.frame
|
48
|
prop_min<-prop_minmax[1]
|
49
|
prop_max<-prop_minmax[2]
|
50
|
|
51
|
prop_range<-(seq(from=prop_min,to=prop_max,by=step))*100 #range of proportion to run
|
52
|
sn<-length(dates)*nb_sample*length(prop_range) #Number of samples to run
|
53
|
|
54
|
for(i in 1:length(dates)){
|
55
|
d_tmp<-rep(dates[i],nb_sample*length(prop_range)) #repeating same date
|
56
|
s_nb<-rep(1:nb_sample,length(prop_range)) #number of random sample per proportion
|
57
|
prop_tmp<-sort(rep(prop_range, nb_sample))
|
58
|
tab_run_tmp<-cbind(d_tmp,s_nb,prop_tmp)
|
59
|
dates_list[[i]]<-tab_run_tmp
|
60
|
}
|
61
|
|
62
|
sampling_dat<-as.data.frame(do.call(rbind,dates_list))
|
63
|
names(sampling_dat)<-c("date","run_samp","prop")
|
64
|
|
65
|
for(i in 2:3){ # start of the for loop #1
|
66
|
sampling_dat[,i]<-as.numeric(as.character(sampling_dat[,i]))
|
67
|
}
|
68
|
|
69
|
sampling_dat$date<- as.character(sampling_dat[,1])
|
70
|
#ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
|
71
|
ghcn.subsets <-lapply(as.character(sampling_dat$date), function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
|
72
|
|
73
|
#Make this a function??
|
74
|
## adding choice of constant sample
|
75
|
|
76
|
if (seed_number>0) {
|
77
|
set.seed(seed_number) #Using a seed number allow results based on random number to be compared...
|
78
|
}
|
79
|
|
80
|
sampling<-vector("list",length(ghcn.subsets))
|
81
|
sampling_station_id<-vector("list",length(ghcn.subsets))
|
82
|
for(i in 1:length(ghcn.subsets)){
|
83
|
n<-nrow(ghcn.subsets[[i]])
|
84
|
prop<-(sampling_dat$prop[i])/100
|
85
|
ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows
|
86
|
nv<-n-ns #create a sample for validation with prop of the rows
|
87
|
ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
|
88
|
ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
|
89
|
#Find the corresponding
|
90
|
data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
|
91
|
station_id.training<-data_sampled$station #selected id for the randomly sampled stations (115)
|
92
|
#Save the information
|
93
|
sampling[[i]]<-ind.training #index of training sample from data.frame
|
94
|
sampling_station_id[[i]]<- station_id.training #station ID for traning samples
|
95
|
}
|
96
|
## Use same samples across the year...
|
97
|
if (constant==1){
|
98
|
sampled<-sampling[[1]]
|
99
|
data_sampled<-ghcn.subsets[[1]][sampled,] #selected the randomly sampled stations
|
100
|
station_sampled<-data_sampled$station #selected id for the randomly sampled stations (115)
|
101
|
list_const_sampling<-vector("list",sn)
|
102
|
list_const_sampling_station_id<-vector("list",sn)
|
103
|
for(i in 1:sn){
|
104
|
station_id.training<-intersect(station_sampled,ghcn.subsets[[i]]$station)
|
105
|
ind.training<-match(station_id.training,ghcn.subsets[[i]]$station)
|
106
|
list_const_sampling[[i]]<-ind.training
|
107
|
list_const_sampling_station_id[[i]]<-station_id.training
|
108
|
}
|
109
|
sampling<-list_const_sampling
|
110
|
sampling_station_id<-list_const_sampling_station_id
|
111
|
}
|
112
|
|
113
|
sampling_obj<-list(sampling_dat,sampling,sampling_station_id,ghcn.subsets)
|
114
|
names(sampling_obj)<- c("sampling_dat","sampling_index","sampling_stat_id","ghcn_data")
|
115
|
|
116
|
return(sampling_obj)
|
117
|
|
118
|
}
|