Revision aac7c36c
Added by Benoit Parmentier over 12 years ago
- ID aac7c36c047121785d502578086e556a3a353ea5
- Child 23ed3053
climate/research/oregon/interpolation/Linear_reg.R | ||
---|---|---|
19 | 19 |
infile2<-"dates_interpolation_03052012.txt" |
20 | 20 |
prop<-0.3 #Proportion of testing retained for validation |
21 | 21 |
out_prefix<-"_03042012_r1" |
22 |
infile3<-"models_interpolation_03052012.txt" |
|
23 |
|
|
22 | 24 |
|
23 | 25 |
#######START OF THE SCRIPT ############# |
24 | 26 |
|
... | ... | |
31 | 33 |
ghcn = transform(ghcn,Eastness_w = sin(slope)*sin(ASPECT)) #adding variable to the dataframe. |
32 | 34 |
set.seed(100) |
33 | 35 |
dates <-readLines(paste(path,"/",infile2, sep="")) |
36 |
models <-readLines(paste(path,"/",infile3, sep="")) |
|
34 | 37 |
|
35 | 38 |
results <- matrix(1,length(dates),14) #This is a matrix containing the diagnostic measures from the GAM models. |
36 | 39 |
|
40 |
results_AIC<- matrix(1,length(dates),length(models)+3) |
|
41 |
results_GCV<- matrix(1,length(dates),length(models)+3) |
|
42 |
results_RMSE<- matrix(1,length(dates),length(models)+3) |
|
43 |
|
|
37 | 44 |
ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subsets data |
38 | 45 |
#note that compare to the previous version date_ column was changed to date |
39 | 46 |
|
40 | 47 |
## looping through the dates... |
48 |
#Change this into a nested loop, looping through the number of models |
|
41 | 49 |
|
42 | 50 |
for(i in 1:length(dates)){ # start of the for loop #1 |
43 | 51 |
|
... | ... | |
54 | 62 |
|
55 | 63 |
####Regression part 2: GAM models |
56 | 64 |
|
57 |
GAM_ANUSPLIN1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s) |
|
58 |
GAM_PRISM1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (Northness)+ s (Eastness) + s(DISTOC), data=data_s) |
|
59 |
GAM_PRISM2<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (Northness_w)+ s (Eastness_w) + s(DISTOC), data=data_s) |
|
65 |
mod1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s) |
|
66 |
mod2<- gam(tmax~ s(lat,lon,ELEV_SRTM), data=data_s) |
|
67 |
mod3<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (Northness)+ s (Eastness) + s(DISTOC), data=data_s) |
|
68 |
mod4<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (Northness_w)+ s (Eastness_w) + s(DISTOC), data=data_s) |
|
69 |
mod5<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM, Northness) + s (Eastness) + s(DISTOC), data=data_s) |
|
70 |
mod6<- gam(tmax~ s(lat,lon) + s (ELEV_SRTM, Northness) + s (Eastness) + s(DISTOC), data=data_s) |
|
60 | 71 |
|
61 | 72 |
|
62 | 73 |
####Regression part 3: Calculating and storing diagnostic measures |
63 | 74 |
|
64 |
results[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
65 |
results[i,2]<- ns #number of stations used in the training stage |
|
75 |
results_AIC[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
76 |
results_AIC[i,2]<- ns #number of stations used in the training stage |
|
77 |
results_AIC[i,3]<- AIC (mod1) |
|
78 |
results_AIC[i,4]<- AIC (mod2) |
|
79 |
results_AIC[i,5]<- AIC (mod3) |
|
80 |
results_AIC[i,6]<- AIC (mod4) |
|
81 |
results_AIC[i,7]<- AIC (mod5) |
|
82 |
results_AIC[i,8]<- AIC (mod6) |
|
66 | 83 |
|
67 |
results[i,6]<- AIC (GAM_ANUSPLIN1) |
|
68 |
results[i,7]<- AIC (GAM_PRISM1) |
|
69 |
results[i,8]<- AIC (GAM_PRISM2) |
|
84 |
results_GCV[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
85 |
results_GCV[i,2]<- ns #number of stations used in the training stage |
|
86 |
results_GCV[i,3]<- mod1$gcv.ubre |
|
87 |
results_GCV[i,4]<- mod2$gcv.ubre |
|
88 |
results_GCV[i,5]<- mod3$gcv.ubre |
|
89 |
results_GCV[i,6]<- mod4$gcv.ubre |
|
90 |
results_GCV[i,7]<- mod5$gcv.ubre |
|
91 |
results_GCV[i,8]<- mod6$gcv.ubre |
|
70 | 92 |
|
71 |
results[i,9]<- GAM_ANUSPLIN1$gcv.ubre |
|
72 |
results[i,10]<- GAM_PRISM1$gcv.ubre |
|
73 |
results[i,11]<- GAM_PRISM2$gcv.ubre |
|
74 |
|
|
75 |
results[i,12]<-GAM_ANUSPLIN1$deviance |
|
76 |
results[i,13]<-GAM_PRISM1$deviance |
|
77 |
results[i,14]<-GAM_PRISM2$deviance |
|
93 |
results_DEV[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
94 |
results_DEV[i,2]<- ns #number of stations used in the training stage |
|
95 |
results_DEV[i,3]<- mod1$deviance |
|
96 |
results_DEV[i,4]<- mod2$deviance |
|
97 |
results_DEV[i,5]<- mod3$deviance |
|
98 |
results_DEV[i,6]<- mod4$deviance |
|
99 |
results_DEV[i,7]<- mod5$deviance |
|
100 |
results_DEV[i,8]<- mod6$deviance |
|
78 | 101 |
|
79 | 102 |
#####VALIDATION: Prediction checking the results using the testing data######## |
80 | 103 |
|
81 |
y_pgANUSPLIN1<- predict(GAM_ANUSPLIN1, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values. |
|
82 |
y_pgPRISM1<- predict(GAM_PRISM1, newdata=data_v, se.fit = TRUE) |
|
83 |
y_pgPRISM2<- predict(GAM_PRISM2, newdata=data_v, se.fit = TRUE) |
|
104 |
y_mod1<- predict(mod1, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values. |
|
105 |
y_mod2<- predict(mod2, newdata=data_v, se.fit = TRUE) |
|
106 |
y_mod3<- predict(mod3, newdata=data_v, se.fit = TRUE) |
|
107 |
y_mod4<- predict(mod4, newdata=data_v, se.fit = TRUE) |
|
108 |
y_mod5<- predict(mod5, newdata=data_v, se.fit = TRUE) |
|
109 |
y_mod5<- predict(mod6, newdata=data_v, se.fit = TRUE) |
|
84 | 110 |
|
85 |
res_ypgA1<- data_v$tmax - y_pgANUSPLIN1$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation |
|
86 |
res_ypgP1<- data_v$tmax - y_pgPRISM1$fit #Residuals for GAM model that resembles the PRISM interpolation |
|
87 |
res_ypgP2<- data_v$tmax - y_pgPRISM2$fit |
|
111 |
res_mod1<- data_v$tmax - y_mod1$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation |
|
112 |
res_mod2<- data_v$tmax - y_mod2$fit #Residuals for GAM model that resembles the PRISM interpolation |
|
113 |
res_mod3<- data_v$tmax - y_mod3$fit |
|
114 |
res_mod4<- data_v$tmax - y_mod4$fit |
|
115 |
res_mod5<- data_v$tmax - y_mod5$fit |
|
116 |
res_mod6<- data_v$tmax - y_mod6$fit |
|
88 | 117 |
|
89 |
RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv) |
|
90 |
RMSE_ypgP1 <- sqrt(sum(res_ypgP1^2)/nv) |
|
91 |
RMSE_ypgP2 <- sqrt(sum(res_ypgP2^2)/nv) |
|
118 |
RMSE_mod1 <- sqrt(sum(res_mod1^2)/nv) |
|
119 |
RMSE_mod2 <- sqrt(sum(res_mod2^2)/nv) |
|
120 |
RMSE_mod3 <- sqrt(sum(res_mod3^2)/nv) |
|
121 |
RMSE_mod4 <- sqrt(sum(res_mod4^2)/nv) |
|
122 |
RMSE_mod5 <- sqrt(sum(res_mod5^2)/nv) |
|
123 |
RMSE_mod6 <- sqrt(sum(res_mod6^2)/nv) |
|
92 | 124 |
|
93 |
results[i,3]<-RMSE_ypgA1 |
|
94 |
results[i,4]<-RMSE_ypgP1 |
|
95 |
results[i,5]<-RMSE_ypgP2 |
|
125 |
|
|
126 |
results_RMSE[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
127 |
results_RMSE[i,2]<- ns #number of stations used in the training stage |
|
128 |
results_RMSE[i,3]<- RMSE_mod1 |
|
129 |
results_RMSE[i,4]<- RMSE_mod2 |
|
130 |
results_RMSE[i,5]<- RMSE_mod3 |
|
131 |
results_RMSE[i,6]<- RMSE_mod4 |
|
132 |
results_RMSE[i,7]<- RMSE_mod5 |
|
133 |
results_RMSE[i,8]<- RMSE_mod6 |
|
96 | 134 |
|
97 | 135 |
data_name<-paste("ghcn_v_",dates[[i]],sep="") |
98 | 136 |
assign(data_name,data_v) |
... | ... | |
103 | 141 |
|
104 | 142 |
## Plotting and saving diagnostic measures |
105 | 143 |
|
106 |
results_num <-results
|
|
107 |
mode(results_num)<- "numeric" |
|
144 |
results_RMSEnum <-results_RMSE
|
|
145 |
mode(results_RMSEnum)<- "numeric"
|
|
108 | 146 |
# Make it numeric first |
109 | 147 |
# Now turn it into a data.frame... |
110 | 148 |
|
111 |
results_table<-as.data.frame(results_num)
|
|
112 |
colnames(results_table)<-c("dates","ns","RMSE_A1", "RMSE_P1","RMSE_P2", "AIC_A1", "AIC_P1", "AIC_P2", "GCV_A1", "GCV_P1", "GCV_P2", "Deviance_A1", "Deviance_P1", "Deviance_P2")
|
|
149 |
results_table_RMSE<-as.data.frame(results_RMSEnum)
|
|
150 |
colnames(results_table_RMSE)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6")
|
|
113 | 151 |
|
114 | 152 |
win.graph() |
115 | 153 |
barplot(results_table$RMSE_A1/10,main="RMSE for the A1 models",names.arg=results_table$dates,ylab="Temp (deg. C)",xlab="interpolated date") |
Also available in: Unified diff
Added 3 new models and nesting to GAM, task #361