Project

General

Profile

Download (5.66 KB) Statistics
| Branch: | Revision:
1
sampling_training_testing<-function(list_param_sampling){
2
  
3
  #This function creates testing and training list for input sation data based on alist of dates.            
4
  #It requires 6 inputs:                                           
5
  # 1) seed_number: allow comparison across runs, if seed zero then no seed number is used
6
  # 2) nb_sample: number of time random sampling must be repeated for every hold out proportion 
7
  # 3) step : step for proportion range 
8
  # 4) constant: if value 1 then use the same samples as date one for the all set of dates
9
  # 5) prop_minmax: if prop_min=prop_max and step=0 then predicitons are done for the number of dates...
10
  # 6) dates: list of dates for prediction and subsetting                              
11
  # 7) ghcn: station data as data.frame -daily input                                                                                
12
  # 11) out_prefix: output suffix added to output names--it is the same in the interpolation script
13
  #
14
  #The output is a list of four shapefile names produced by the function:
15
  # 1) sampling_dat: sampling information for every run by date and sampling combintation
16
  # 2) sampling_index: list of indexes for training and testing for every dates
17
  # 3) sampling_stat_id: list of station ID for training and testing for every dates
18
  # 4) ghcn_data_day: ghcn subsets by date
19
  
20
  #AUTHOR: Benoit Parmentier                                                                       
21
  #DATE: 03/05/2013                                                                                 
22
  #PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363, TASK#558--     
23
  #Comments and TODO
24
  #
25
  ##################################################################################################
26
  
27
  #Parsing input arguments
28
  
29
  seed_number <-list_param_sampling$seed_number
30
  nb_sample <- list_param_sampling$nb_sample
31
  step<-list_param_sampling$step #if seed zero then no seed?     
32
  constant <- list_param_sampling$constant
33
  prop_minmax<-list_param_sampling$prop_minmax
34
  dates<-list_param_sampling$dates
35
  #ghcn_name<-list_param_sampling$ghcn_name
36
  ghcn<-list_param_sampling$ghcn
37
  #ghcn<-get(ghcn_name) 
38
  
39
  ### BEGIN FUNCTION ####
40
  
41
  if (seed_number>0) {
42
    set.seed(seed_number)                        #Using a seed number allow results based on random number to be compared...
43
  }
44
  
45
  nel<-length(dates)
46
  dates_list<-vector("list",nel) #list of one row data.frame
47
  prop_min<-prop_minmax[1]
48
  prop_max<-prop_minmax[2]
49
  
50
  prop_range<-(seq(from=prop_min,to=prop_max,by=step))*100     #range of proportion to run
51
  sn<-length(dates)*nb_sample*length(prop_range)               #Number of samples to run
52
  
53
  for(i in 1:length(dates)){
54
    d_tmp<-rep(dates[i],nb_sample*length(prop_range)) #repeating same date
55
    s_nb<-rep(1:nb_sample,length(prop_range))         #number of random sample per proportion
56
    prop_tmp<-sort(rep(prop_range, nb_sample))
57
    tab_run_tmp<-cbind(d_tmp,s_nb,prop_tmp)
58
    dates_list[[i]]<-tab_run_tmp
59
  }
60
  
61
  sampling_dat<-as.data.frame(do.call(rbind,dates_list))
62
  names(sampling_dat)<-c("date","run_samp","prop")
63
  
64
  for(i in 2:3){            # start of the for loop #1
65
    sampling_dat[,i]<-as.numeric(as.character(sampling_dat[,i]))  
66
  }
67
  browser()
68
  sampling_dat$date<- as.character(sampling_dat[,1])
69
  #ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
70
  ghcn.subsets <-lapply(as.character(sampling_dat$date), function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates
71
  
72
  #Make this a function??
73
  ## adding choice of constant sample 
74
  
75
  if (seed_number>0) {
76
    set.seed(seed_number)                        #Using a seed number allow results based on random number to be compared...
77
  }
78
  
79
  sampling<-vector("list",length(ghcn.subsets))
80
  sampling_station_id<-vector("list",length(ghcn.subsets))
81
  for(i in 1:length(ghcn.subsets)){
82
    n<-nrow(ghcn.subsets[[i]])
83
    prop<-(sampling_dat$prop[i])/100
84
    ns<-n-round(n*prop)   #Create a sample from the data frame with 70% of the rows
85
    nv<-n-ns              #create a sample for validation with prop of the rows
86
    ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
87
    ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
88
    #Find the corresponding 
89
    data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations
90
    station_id.training<-data_sampled$station     #selected id for the randomly sampled stations (115)
91
    #Save the information
92
    sampling[[i]]<-ind.training #index of training sample from data.frame
93
    sampling_station_id[[i]]<- station_id.training #station ID for traning samples
94
  }
95
  ## Use same samples across the year...
96
  if (constant==1){
97
    sampled<-sampling[[1]]
98
    data_sampled<-ghcn.subsets[[1]][sampled,] #selected the randomly sampled stations
99
    station_sampled<-data_sampled$station     #selected id for the randomly sampled stations (115)
100
    list_const_sampling<-vector("list",sn)
101
    list_const_sampling_station_id<-vector("list",sn)
102
    for(i in 1:sn){
103
      station_id.training<-intersect(station_sampled,ghcn.subsets[[i]]$station)
104
      ind.training<-match(station_id.training,ghcn.subsets[[i]]$station)
105
      list_const_sampling[[i]]<-ind.training
106
      list_const_sampling_station_id[[i]]<-station_id.training
107
    }
108
    sampling<-list_const_sampling 
109
    sampling_station_id<-list_const_sampling_station_id
110
  }
111
  
112
  sampling_obj<-list(sampling_dat,sampling,sampling_station_id,ghcn.subsets)
113
  names(sampling_obj)<- c("sampling_dat","sampling_index","sampling_stat_id","ghcn_data_day")
114
  
115
  return(sampling_obj)
116
  
117
}
(39-39/39)