Project

General

Profile

Download (5.83 KB) Statistics
| Branch: | Revision:
1 e38d3d87 Benoit Parmentier
sampling_training_testing<-function(list_param_sampling){
2
  
3 33517b7d Benoit Parmentier
  #This function creates testing and training list for input sation data based on a list of dates. 
4
  #This function works for montly time scale if dates are provided as mid-months or other forms of for monthly records.
5 e38d3d87 Benoit Parmentier
  #It requires 6 inputs:                                           
6
  # 1) seed_number: allow comparison across runs, if seed zero then no seed number is used
7
  # 2) nb_sample: number of time random sampling must be repeated for every hold out proportion 
8
  # 3) step : step for proportion range 
9
  # 4) constant: if value 1 then use the same samples as date one for the all set of dates
10
  # 5) prop_minmax: if prop_min=prop_max and step=0 then predicitons are done for the number of dates...
11
  # 6) dates: list of dates for prediction and subsetting                              
12
  # 7) ghcn: station data as data.frame -daily input                                                                                
13
  # 11) out_prefix: output suffix added to output names--it is the same in the interpolation script
14
  #
15
  #The output is a list of four shapefile names produced by the function:
16
  # 1) sampling_dat: sampling information for every run by date and sampling combintation
17
  # 2) sampling_index: list of indexes for training and testing for every dates
18
  # 3) sampling_stat_id: list of station ID for training and testing for every dates
19 33517b7d Benoit Parmentier
  # 4) ghcn_data: ghcn subsets by date, can be monthly or daily with mulitple sampling
20 e38d3d87 Benoit Parmentier
  
21
  #AUTHOR: Benoit Parmentier                                                                       
22 33517b7d Benoit Parmentier
  #DATE: 08/25/2013                                                                                 
23 e38d3d87 Benoit Parmentier
  #PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363, TASK#558--     
24
  #Comments and TODO
25
  #
26
  ##################################################################################################
27
  
28
  #Parsing input arguments
29
  
30
  seed_number <-list_param_sampling$seed_number
31
  nb_sample <- list_param_sampling$nb_sample
32
  step<-list_param_sampling$step #if seed zero then no seed?     
33
  constant <- list_param_sampling$constant
34
  prop_minmax<-list_param_sampling$prop_minmax
35
  dates<-list_param_sampling$dates
36
  #ghcn_name<-list_param_sampling$ghcn_name
37 33517b7d Benoit Parmentier
  ghcn<-list_param_sampling$ghcn #can be daily or monthly!!
38 e38d3d87 Benoit Parmentier
  #ghcn<-get(ghcn_name) 
39
  
40
  ### BEGIN FUNCTION ####
41
  
42
  if (seed_number>0) {
43
    set.seed(seed_number)                        #Using a seed number allow results based on random number to be compared...
44
  }
45
  
46
  nel<-length(dates)
47
  dates_list<-vector("list",nel) #list of one row data.frame
48
  prop_min<-prop_minmax[1]
49
  prop_max<-prop_minmax[2]
50 33517b7d Benoit Parmentier
      
51 e38d3d87 Benoit Parmentier
  prop_range<-(seq(from=prop_min,to=prop_max,by=step))*100     #range of proportion to run
52
  sn<-length(dates)*nb_sample*length(prop_range)               #Number of samples to run
53
  
54
  for(i in 1:length(dates)){
55
    d_tmp<-rep(dates[i],nb_sample*length(prop_range)) #repeating same date
56
    s_nb<-rep(1:nb_sample,length(prop_range))         #number of random sample per proportion
57
    prop_tmp<-sort(rep(prop_range, nb_sample))
58
    tab_run_tmp<-cbind(d_tmp,s_nb,prop_tmp)
59
    dates_list[[i]]<-tab_run_tmp
60
  }
61
  
62
  sampling_dat<-as.data.frame(do.call(rbind,dates_list))
63
  names(sampling_dat)<-c("date","run_samp","prop")
64
  
65
  for(i in 2:3){            # start of the for loop #1
66
    sampling_dat[,i]<-as.numeric(as.character(sampling_dat[,i]))  
67
  }
68 5eb62e61 Benoit Parmentier
  
69 e38d3d87 Benoit Parmentier
  sampling_dat$date<- as.character(sampling_dat[,1])
70
  #ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
71
  ghcn.subsets <-lapply(as.character(sampling_dat$date), function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
72
  
73
  #Make this a function??
74
  ## adding choice of constant sample 
75
  
76
  if (seed_number>0) {
77
    set.seed(seed_number)                        #Using a seed number allow results based on random number to be compared...
78
  }
79
  
80
  sampling<-vector("list",length(ghcn.subsets))
81
  sampling_station_id<-vector("list",length(ghcn.subsets))
82
  for(i in 1:length(ghcn.subsets)){
83
    n<-nrow(ghcn.subsets[[i]])
84
    prop<-(sampling_dat$prop[i])/100
85
    ns<-n-round(n*prop)   #Create a sample from the data frame with 70% of the rows
86
    nv<-n-ns              #create a sample for validation with prop of the rows
87
    ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
88
    ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
89
    #Find the corresponding 
90
    data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
91
    station_id.training<-data_sampled$station     #selected id for the randomly sampled stations (115)
92
    #Save the information
93
    sampling[[i]]<-ind.training #index of training sample from data.frame
94
    sampling_station_id[[i]]<- station_id.training #station ID for traning samples
95
  }
96
  ## Use same samples across the year...
97
  if (constant==1){
98
    sampled<-sampling[[1]]
99
    data_sampled<-ghcn.subsets[[1]][sampled,] #selected the randomly sampled stations
100
    station_sampled<-data_sampled$station     #selected id for the randomly sampled stations (115)
101
    list_const_sampling<-vector("list",sn)
102
    list_const_sampling_station_id<-vector("list",sn)
103
    for(i in 1:sn){
104
      station_id.training<-intersect(station_sampled,ghcn.subsets[[i]]$station)
105
      ind.training<-match(station_id.training,ghcn.subsets[[i]]$station)
106
      list_const_sampling[[i]]<-ind.training
107
      list_const_sampling_station_id[[i]]<-station_id.training
108
    }
109
    sampling<-list_const_sampling 
110
    sampling_station_id<-list_const_sampling_station_id
111
  }
112
  
113
  sampling_obj<-list(sampling_dat,sampling,sampling_station_id,ghcn.subsets)
114 33517b7d Benoit Parmentier
  names(sampling_obj)<- c("sampling_dat","sampling_index","sampling_stat_id","ghcn_data")
115 e38d3d87 Benoit Parmentier
  
116
  return(sampling_obj)
117
  
118
}