Revision 2a89fa5f
Added by Benoit Parmentier over 12 years ago
- ID 2a89fa5f5da86b42ecf7339408e46e75de9a4134
climate/research/oregon/interpolation/Linear_reg.R | ||
---|---|---|
4 | 4 |
#1)assumes that the csv file is in the current working |
5 | 5 |
#2)extract relevant variables from raster images before performing the regressions. |
6 | 6 |
#The user must provide the list of raster images in a textile. |
7 |
#Script created by Benoit Parmentier on March 1, 2012.
|
|
7 |
#Script created by Benoit Parmentier on March 3, 2012.
|
|
8 | 8 |
|
9 | 9 |
###Loading r library and packages # loading the raster package |
10 | 10 |
library(gtools) # loading ... |
... | ... | |
14 | 14 |
infile1<-"ghcn_or_b_02122012_OR83M.csv" |
15 | 15 |
path<-"C:/Data/Benoit/NCEAS/window_Oregon_data" |
16 | 16 |
setwd(path) |
17 |
infile2<-"dates_interpolation_03012012.txt" # list of 10 dates |
|
18 |
#dates<-"20101507" # list of 10 dates in a textfile #Date selected for the regression |
|
19 |
#prop<-0.3 #Proportion of testing retained for validation |
|
17 |
#infile2<-"dates_interpolation_03012012.txt" # list of 10 dates for the regression |
|
18 |
infile2<-"dates_interpolation_03032012.txt" |
|
19 |
prop<-0.3 #Proportion of testing retained for validation |
|
20 |
out_prefix<-"_03022012_r3" |
|
20 | 21 |
|
21 | 22 |
#######START OF THE SCRIPT ############# |
22 | 23 |
|
23 |
###Reading the station data |
|
24 |
###Reading the station data and setting up for models' comparison
|
|
24 | 25 |
ghcn<-read.csv(paste(path,"/",infile1, sep=""), header=TRUE) #The "paste" function concatenates the path and file name in common string. |
25 | 26 |
dates <-readLines(paste(path,"/",infile2, sep="")) |
26 |
|
|
27 |
###Creating a validation dataset by creating training and testing datasets (%30) |
|
28 | 27 |
|
28 |
results <- matrix(1,length(dates),10) #This is a matrix containing the diagnostic measures from the GAM models. |
|
29 |
ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date_==d)) #this creates a list of 10 subsets data |
|
29 | 30 |
|
30 |
ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date_==d))
|
|
31 |
## looping through the dates...
|
|
31 | 32 |
|
32 |
for(i in 1:length(dates)){ |
|
33 |
#data_name<-cat("ghcn_",dates[[i]],sep="") |
|
34 |
data_name<-paste("ghcn_",dates[[i]],sep="") |
|
35 |
data<-subset(ghcn,ghcn$date_==dates[[i]]) |
|
36 |
assign(data_name,data) |
|
37 |
} #This loops creates 2 subsets of ghcn based on dates and reassign the names using the date |
|
33 |
for(i in 1:length(dates)){ # start of the for loop #1 |
|
34 |
|
|
35 |
###Regression part 1: Creating a validation dataset by creating training and testing datasets |
|
36 |
|
|
37 |
n<-nrow(ghcn.subsets[[i]]) |
|
38 |
ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
39 |
nv<-n-ns #create a sample for validation with prop of the rows |
|
40 |
#ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
41 |
ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly |
|
42 |
ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training) |
|
43 |
data_s <- ghcn.subsets[[i]][ind.training, ] |
|
44 |
data_v <- ghcn.subsets[[i]][ind.testing, ] |
|
45 |
|
|
46 |
####Regression part 2: GAM models |
|
47 |
|
|
48 |
GAM_ANUSPLIN1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s) |
|
49 |
GAM_PRISM1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (ASPECT)+ s(DISTOC), data=data_s) |
|
50 |
|
|
51 |
####Regression part 3: Calculating and storing diagnostic measures |
|
52 |
|
|
53 |
results[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
54 |
results[i,2]<- ns #number of stations used in the training stage |
|
55 |
|
|
56 |
results[i,5]<- AIC (GAM_ANUSPLIN1) |
|
57 |
results[i,6]<- AIC (GAM_PRISM1) |
|
58 |
GCVA1<-GAM_ANUSPLIN1$gcv.ubre |
|
59 |
results[i,7]<- GCVA1 |
|
60 |
GCVP1<-GAM_PRISM1$gcv.ubre |
|
61 |
results[i,8]<- GCVP1 |
|
62 |
|
|
63 |
results[i,9]<-GAM_ANUSPLIN1$deviance |
|
64 |
results[i,10]<-GAM_PRISM1$deviance |
|
65 |
|
|
66 |
#####VALIDATION: Prediction checking the results using the testing data######## |
|
67 |
|
|
68 |
y_pgANUSPLIN1<- predict(GAM_ANUSPLIN1, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values. |
|
69 |
y_pgPRISM1<- predict(GAM_PRISM1, newdata=data_v, se.fit = TRUE) |
|
70 |
|
|
71 |
res_ypgA1<- data_v$tmax - y_pgANUSPLIN1$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation |
|
72 |
res_ypgP1<- data_v$tmax - y_pgPRISM1$fit #Residuals for GAM model that resembles the PRISM interpolation |
|
73 |
|
|
74 |
RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv) |
|
75 |
RMSE_ypgP1 <- sqrt(sum(res_ypgP1^2)/nv) |
|
76 |
|
|
77 |
results[i,3]<-RMSE_ypgA1 |
|
78 |
results[i,4]<-RMSE_ypgP1 |
|
79 |
|
|
80 |
# end of the for loop #1 |
|
81 |
} |
|
82 |
|
|
83 |
## Plotting and saving diagnostic measures |
|
84 |
|
|
85 |
results_num <-results |
|
86 |
mode(results_num)<- "numeric" |
|
87 |
# Make it numeric first |
|
88 |
# Now turn it into a data.frame... |
|
89 |
|
|
90 |
results_table<-as.data.frame(results_num) |
|
91 |
colnames(results_table)<-c("dates","ns","RMSE_A1", "RMSE_P1", "AIC_A1", "AIC_P1", "GCV_A1", "GCV_P1", "Deviance_A1", "Deviance_P1") |
|
92 |
|
|
93 |
win.graph() |
|
94 |
barplot(results_table$RMSE_A1,main="RMSE for the A1 models",names.arg=results_table$dates,ylab="Temp (0.1 X deg. C)",xlab="interolated date") |
|
95 |
savePlot(paste("GAM_ANUSPLIN1_RMSE",out_prefix,".emf", sep=""), type="emf") |
|
96 |
win.graph() |
|
97 |
barplot(results_table$RMSE_P1,main="RMSE for the P1 models",names.arg=results_table$dates,ylab="Temp (0.1 X deg. C)",xlab="interolated date") |
|
98 |
savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf") |
|
99 |
win.graph() |
|
100 |
barplot(results_table$AIC_P1,main="AIC for the P1 models",names.arg=results_table$dates,ylab="Temp (0.1 X deg. C)",xlab="interolated date") |
|
101 |
savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf") |
|
102 |
win.graph() |
|
103 |
barplot(results_table$AIC_A1,main="AIC for the A1 models",names.arg=results_table$dates,ylab="Temp (0.1 X deg. C)",xlab="interolated date") |
|
104 |
savePlot(paste("GAM_PRISM1_RMSE",out_prefix,".emf", sep=""), type="emf") |
|
105 |
|
|
106 |
write.csv(results_table, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep="")) |
|
107 |
|
|
108 |
# End of script########## |
|
109 |
|
|
110 |
# ############################### |
|
38 | 111 |
|
39 |
# ############ REGRESSION ############### |
|
40 |
# |
|
41 |
# ###Regression part 1: linear models |
|
42 |
# |
|
43 |
# lm_ANUSPLIN1<-lm(tmax~lat+lon+ELEV_SRTM, data=ghcn1507_s) |
|
44 |
# lm_PRISM1<-lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn1507_s) #Note that a variable on inversion is missing |
|
45 |
# summary(lm_ANUSPLIN1) |
|
46 |
# summary(lm_PRISM1) |
|
47 |
# |
|
48 |
# ###Regression part 2: GAM models |
|
49 |
# GAM_ANUSPLIN1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=ghcn1507_s) |
|
50 |
# GAM_PRISM1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (ASPECT)+ s(DISTOC), data=ghcn1507_s) |
|
51 |
# #use the s() for smoothing function |
|
52 |
# |
|
53 |
# ###Compare the models |
|
54 |
# #Show AIC, Cook distance, p values and residuals plot |
|
55 |
# ###Access the R2 and significance to give a report |
|
56 |
# |
|
57 |
# AIC (lm_ANUSPLIN1,GAM_ANUSPLIN1) #list the AIC and for the results |
|
58 |
# AIC (lm_PRISM1,GAM_PRISM1) #list the AIC and for the results |
|
59 |
# |
|
60 |
# GCVA1<-GAM_ANUSPLIN1$gcv.ubre |
|
61 |
# GCVP1<-GAM_PRISM1$gcv.ubre |
|
62 |
# |
|
63 |
# DevianceA1<-GAM_ANUSPLIN1$deviance |
|
64 |
# DevianceP1<-GAM_PRISM1$deviance |
|
65 |
# |
|
66 |
# anova(lm_ANUSPLIN1, GAM_ANUSPLIN1,test="F") #compare the different models in terms of F; a reference model should be set for comparison. |
|
67 |
# anova(lm_PRISM1, GAM_PRISM1,test="F") |
|
68 |
# |
|
69 |
# summary(lm_ANUSPLIN1)$r.squared |
|
70 |
# summary(GAM_ANUSPLIN1)$r.squared |
|
71 |
# summary(lm_PRISM1)$r.squared |
|
72 |
# summary(GAM_PRISM1)$r.squared |
|
73 | 112 |
# |
74 | 113 |
# ############Diagnostic GAM plots############# |
75 | 114 |
# win.graph() |
... | ... | |
89 | 128 |
# vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM")) |
90 | 129 |
# #vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM", theta=100,phi=200)) |
91 | 130 |
# savePlot("GAM_ANUSPLIN1_prediction2.emf", type="emf") |
92 |
# |
|
93 |
# ####VALIDATION: Prediction checking the results using the testing data######## |
|
94 |
# y_plANUSPLIN1<- predict(lm_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) |
|
95 |
# y_pgANUSPLIN1<- predict(GAM_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) #Using the coeff to predict new values. |
|
96 |
# y_plPRISM1<- predict(lm_PRISM1, newdata=ghcn1507_v, se.fit = TRUE) |
|
97 |
# y_pgPRISM1<- predict(GAM_PRISM1, newdata=ghcn1507_v, se.fit = TRUE) |
|
98 |
# |
|
99 |
# res_yplA1<- ghcn1507_v$tmax - y_plANUSPLIN1$fit |
|
100 |
# res_ypgA1<- ghcn1507_v$tmax - y_pgANUSPLIN1$fit |
|
101 |
# res_yplP1<- ghcn1507_v$tmax - y_plPRISM1$fit #Residuals for lm model that resembles the PRISM interpolation |
|
102 |
# res_ypgP1<- ghcn1507_v$tmax - y_pgPRISM1$fit #Residuals for GAM model that resembles the PRISM interpolation |
|
103 |
# |
|
131 |
# |
|
104 | 132 |
# #results_val <-c(res_yplA1,res_ypgA1,res_yplP1,res_ypgP1) |
105 | 133 |
# results_val<-data.frame(res_yplA1=res_yplA1,res_ypgA1=res_ypgA1,res_yplP1=res_yplP1,res_ypgP1=res_ypgP1) |
106 | 134 |
# nv<- nrow(ghcn1507_v) |
107 |
# |
|
108 |
# RMSE_yplA1 <- sqrt(sum(res_yplA1^2)*1/nv) |
|
109 |
# #RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv) |
|
110 |
# RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv) |
|
111 |
# RMSE_yplP1 <- sqrt(sum(res_yplP1^2)/nv) |
|
112 |
# RMSE_ypgP1 <- sqrt(sum(res_ypgP1^2)/nv) |
|
113 |
# |
|
114 |
# #Printing the RMSE values for the different models |
|
115 |
# RMSE_yplA1 |
|
116 |
# RMSE_ypgA1 |
|
117 |
# RMSE_yplP1 |
|
118 |
# RMSE_ypgP1 |
|
119 |
# |
|
135 |
# |
|
120 | 136 |
# mod_name<-c("yplA1","ypgA1","yplP1","ypgP1") |
121 | 137 |
# mod_type<-c("lm_ANUSPLIN1","GAM_ANUSPLIN1","lm_PRISM1","GAM_PRISM1") |
122 | 138 |
# |
... | ... | |
136 | 152 |
# #dump("results", file= paste(path,"/","results_reg1507.txt",sep="")) |
137 | 153 |
# write.csv(results, file= paste(path,"/","results_reg1507_Assessment.txt",sep="")) |
138 | 154 |
# #write.csv(results_val, file= paste(path,"/","results_reg1507_val.txt",sep="")) |
139 |
# |
|
140 | 155 |
# |
141 |
# library(lattice) |
|
142 |
# g <- expand.grid(x = ghcn1507_v$lon, y = ghcn1507_v$lat) |
|
143 |
# g$z <- y_plANUSPLIN1$fit |
|
144 |
# wireframe(z ~ x * y, data = g) |
|
145 |
# |
|
146 |
# #####End of script########## |
|
147 |
# ############################### |
|
156 |
|
|
148 | 157 |
|
149 | 158 |
|
150 | 159 |
|
Also available in: Unified diff
GAM prediction added to the loop with RMSE calculation