Revision e38d3d87
Added by Benoit Parmentier almost 12 years ago
climate/research/oregon/interpolation/sampling_script_functions.R | ||
---|---|---|
1 |
sampling_training_testing<-function(list_param_sampling){ |
|
2 |
|
|
3 |
#This function creates testing and training list for input sation data based on alist of dates. |
|
4 |
#It requires 6 inputs: |
|
5 |
# 1) seed_number: allow comparison across runs, if seed zero then no seed number is used |
|
6 |
# 2) nb_sample: number of time random sampling must be repeated for every hold out proportion |
|
7 |
# 3) step : step for proportion range |
|
8 |
# 4) constant: if value 1 then use the same samples as date one for the all set of dates |
|
9 |
# 5) prop_minmax: if prop_min=prop_max and step=0 then predicitons are done for the number of dates... |
|
10 |
# 6) dates: list of dates for prediction and subsetting |
|
11 |
# 7) ghcn: station data as data.frame -daily input |
|
12 |
# 11) out_prefix: output suffix added to output names--it is the same in the interpolation script |
|
13 |
# |
|
14 |
#The output is a list of four shapefile names produced by the function: |
|
15 |
# 1) sampling_dat: sampling information for every run by date and sampling combintation |
|
16 |
# 2) sampling_index: list of indexes for training and testing for every dates |
|
17 |
# 3) sampling_stat_id: list of station ID for training and testing for every dates |
|
18 |
# 4) ghcn_data_day: ghcn subsets by date |
|
19 |
|
|
20 |
#AUTHOR: Benoit Parmentier |
|
21 |
#DATE: 03/05/2013 |
|
22 |
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363, TASK#558-- |
|
23 |
#Comments and TODO |
|
24 |
# |
|
25 |
################################################################################################## |
|
26 |
|
|
27 |
#Parsing input arguments |
|
28 |
|
|
29 |
seed_number <-list_param_sampling$seed_number |
|
30 |
nb_sample <- list_param_sampling$nb_sample |
|
31 |
step<-list_param_sampling$step #if seed zero then no seed? |
|
32 |
constant <- list_param_sampling$constant |
|
33 |
prop_minmax<-list_param_sampling$prop_minmax |
|
34 |
dates<-list_param_sampling$dates |
|
35 |
#ghcn_name<-list_param_sampling$ghcn_name |
|
36 |
ghcn<-list_param_sampling$ghcn |
|
37 |
#ghcn<-get(ghcn_name) |
|
38 |
|
|
39 |
### BEGIN FUNCTION #### |
|
40 |
|
|
41 |
if (seed_number>0) { |
|
42 |
set.seed(seed_number) #Using a seed number allow results based on random number to be compared... |
|
43 |
} |
|
44 |
|
|
45 |
nel<-length(dates) |
|
46 |
dates_list<-vector("list",nel) #list of one row data.frame |
|
47 |
prop_min<-prop_minmax[1] |
|
48 |
prop_max<-prop_minmax[2] |
|
49 |
|
|
50 |
prop_range<-(seq(from=prop_min,to=prop_max,by=step))*100 #range of proportion to run |
|
51 |
sn<-length(dates)*nb_sample*length(prop_range) #Number of samples to run |
|
52 |
|
|
53 |
for(i in 1:length(dates)){ |
|
54 |
d_tmp<-rep(dates[i],nb_sample*length(prop_range)) #repeating same date |
|
55 |
s_nb<-rep(1:nb_sample,length(prop_range)) #number of random sample per proportion |
|
56 |
prop_tmp<-sort(rep(prop_range, nb_sample)) |
|
57 |
tab_run_tmp<-cbind(d_tmp,s_nb,prop_tmp) |
|
58 |
dates_list[[i]]<-tab_run_tmp |
|
59 |
} |
|
60 |
|
|
61 |
sampling_dat<-as.data.frame(do.call(rbind,dates_list)) |
|
62 |
names(sampling_dat)<-c("date","run_samp","prop") |
|
63 |
|
|
64 |
for(i in 2:3){ # start of the for loop #1 |
|
65 |
sampling_dat[,i]<-as.numeric(as.character(sampling_dat[,i])) |
|
66 |
} |
|
67 |
browser() |
|
68 |
sampling_dat$date<- as.character(sampling_dat[,1]) |
|
69 |
#ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates |
|
70 |
ghcn.subsets <-lapply(as.character(sampling_dat$date), function(d) subset(ghcn, date==d)) #this creates a list of 10 or 365 subsets dataset based on dates |
|
71 |
|
|
72 |
#Make this a function?? |
|
73 |
## adding choice of constant sample |
|
74 |
|
|
75 |
if (seed_number>0) { |
|
76 |
set.seed(seed_number) #Using a seed number allow results based on random number to be compared... |
|
77 |
} |
|
78 |
|
|
79 |
sampling<-vector("list",length(ghcn.subsets)) |
|
80 |
sampling_station_id<-vector("list",length(ghcn.subsets)) |
|
81 |
for(i in 1:length(ghcn.subsets)){ |
|
82 |
n<-nrow(ghcn.subsets[[i]]) |
|
83 |
prop<-(sampling_dat$prop[i])/100 |
|
84 |
ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
85 |
nv<-n-ns #create a sample for validation with prop of the rows |
|
86 |
ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly |
|
87 |
ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training) |
|
88 |
#Find the corresponding |
|
89 |
data_sampled<-ghcn.subsets[[i]][ind.training,] #selected the randomly sampled stations |
|
90 |
station_id.training<-data_sampled$station #selected id for the randomly sampled stations (115) |
|
91 |
#Save the information |
|
92 |
sampling[[i]]<-ind.training #index of training sample from data.frame |
|
93 |
sampling_station_id[[i]]<- station_id.training #station ID for traning samples |
|
94 |
} |
|
95 |
## Use same samples across the year... |
|
96 |
if (constant==1){ |
|
97 |
sampled<-sampling[[1]] |
|
98 |
data_sampled<-ghcn.subsets[[1]][sampled,] #selected the randomly sampled stations |
|
99 |
station_sampled<-data_sampled$station #selected id for the randomly sampled stations (115) |
|
100 |
list_const_sampling<-vector("list",sn) |
|
101 |
list_const_sampling_station_id<-vector("list",sn) |
|
102 |
for(i in 1:sn){ |
|
103 |
station_id.training<-intersect(station_sampled,ghcn.subsets[[i]]$station) |
|
104 |
ind.training<-match(station_id.training,ghcn.subsets[[i]]$station) |
|
105 |
list_const_sampling[[i]]<-ind.training |
|
106 |
list_const_sampling_station_id[[i]]<-station_id.training |
|
107 |
} |
|
108 |
sampling<-list_const_sampling |
|
109 |
sampling_station_id<-list_const_sampling_station_id |
|
110 |
} |
|
111 |
|
|
112 |
sampling_obj<-list(sampling_dat,sampling,sampling_station_id,ghcn.subsets) |
|
113 |
names(sampling_obj)<- c("sampling_dat","sampling_index","sampling_stat_id","ghcn_data_day") |
|
114 |
|
|
115 |
return(sampling_obj) |
|
116 |
|
|
117 |
} |
Also available in: Unified diff
Sampling function to select training and testing stations, extracted and modified from earlier scripts