Revision bc59c6bb
Added by Benoit Parmentier over 12 years ago
- ID bc59c6bbd35da1cae8eab57cc56f77f4f619aba8
climate/research/oregon/interpolation/Linear_reg.R | ||
---|---|---|
1 |
####################Interpolation of Tmax for on3 day.#####################
|
|
1 |
####################Interpolation of Tmax for 10 dates.#####################
|
|
2 | 2 |
#This script interpolates station values for the Oregon case study. This program loads the station data from a csv file |
3 | 3 |
#and perform two types of regression: multiple linear model and general additive model (GAM). Note that this program: |
4 | 4 |
#1)assumes that the csv file is in the current working |
... | ... | |
14 | 14 |
infile1<-"ghcn_or_b_02122012_OR83M.csv" |
15 | 15 |
path<-"C:/Data/Benoit/NCEAS/window_Oregon_data" |
16 | 16 |
setwd(path) |
17 |
#date<-"20101507" #Date selected for the regression |
|
18 |
#prop<-0.3 #Proportion of testing retained for validation |
|
17 |
infile2<-"dates_interpolation_03012012.txt" # list of 10 dates |
|
18 |
#dates<-"20101507" # list of 10 dates in a textfile #Date selected for the regression |
|
19 |
#prop<-0.3 #Proportion of testing retained for validation |
|
19 | 20 |
|
20 | 21 |
#######START OF THE SCRIPT ############# |
21 | 22 |
|
22 | 23 |
###Reading the station data |
23 | 24 |
ghcn<-read.csv(paste(path,"/",infile1, sep=""), header=TRUE) #The "paste" function concatenates the path and file name in common string. |
24 |
#ghcn<-read.csv(infile1) #Use read.table if the input file is space delimited or read.table(infile1, headername=TRUE, sep=',')
|
|
25 |
names(ghcn) #Checking that the columns are correctly labelled.
|
|
26 |
|
|
25 |
#ghcn<-read.csv(infile1) #Use read.table if the input file is space delimited or read.table(infile1, headername=TRUE, sep=',') #Checking that the columns are correctly labelled.
|
|
26 |
dates <-readLines(paste(path,"/",infile2, sep=""))
|
|
27 |
|
|
27 | 28 |
###Creating a validation dataset by creating training and testing datasets (%30) |
28 | 29 |
#ghcn1507 <-subset(ghcn,ghcn$date_== date) |
29 |
ghcn1507 <-subset(ghcn,ghcn$date_=="20100715") |
|
30 |
n<-nrow(ghcn1507) |
|
31 |
ns<-n-round(n*0.3) #Create a sample from the data frame with 70% of the rows |
|
32 |
#ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
33 |
ind.training <- sample(nrow(ghcn1507), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly |
|
34 |
ind.testing <- setdiff(1:nrow(ghcn1507), ind.training) |
|
35 |
ghcn1507_s <- ghcn1507[ind.training, ] |
|
36 |
ghcn1507_v <- ghcn1507[ind.testing, ] |
|
37 |
|
|
38 |
############ REGRESSION ############### |
|
39 |
|
|
40 |
###Regression part 1: linear models |
|
41 |
|
|
42 |
lm_ANUSPLIN1<-lm(tmax~lat+lon+ELEV_SRTM, data=ghcn1507_s) |
|
43 |
lm_PRISM1<-lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn1507_s) #Note that a variable on inversion is missing |
|
44 |
summary(lm_ANUSPLIN1) |
|
45 |
summary(lm_PRISM1) |
|
46 |
|
|
47 |
###Regression part 2: GAM models |
|
48 |
GAM_ANUSPLIN1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=ghcn1507_s) |
|
49 |
GAM_PRISM1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (ASPECT)+ s(DISTOC), data=ghcn1507_s) |
|
50 |
#use the s() for smoothing function |
|
51 |
|
|
52 |
###Compare the models |
|
53 |
#Show AIC, Cook distance, p values and residuals plot |
|
54 |
###Access the R2 and significance to give a report |
|
55 |
|
|
56 |
AIC (lm_ANUSPLIN1,GAM_ANUSPLIN1) #list the AIC and for the results |
|
57 |
AIC (lm_PRISM1,GAM_PRISM1) #list the AIC and for the results |
|
58 |
|
|
59 |
GCVA1<-GAM_ANUSPLIN1$gcv.ubre |
|
60 |
GCVP1<-GAM_PRISM1$gcv.ubre |
|
61 |
|
|
62 |
DevianceA1<-GAM_ANUSPLIN1$deviance |
|
63 |
DevianceP1<-GAM_PRISM1$deviance |
|
64 |
|
|
65 |
anova(lm_ANUSPLIN1, GAM_ANUSPLIN1,test="F") #compare the different models in terms of F; a reference model should be set for comparison. |
|
66 |
anova(lm_PRISM1, GAM_PRISM1,test="F") |
|
67 |
|
|
68 |
summary(lm_ANUSPLIN1)$r.squared |
|
69 |
summary(GAM_ANUSPLIN1)$r.squared |
|
70 |
summary(lm_PRISM1)$r.squared |
|
71 |
summary(GAM_PRISM1)$r.squared |
|
72 |
|
|
73 |
############Diagnostic GAM plots############# |
|
74 |
win.graph() |
|
75 |
gam.check(GAM_ANUSPLIN1) |
|
76 |
savePlot("GAM_ANUSPLIN1_diagnostic1.emf", type="emf") |
|
77 |
win.graph() |
|
78 |
gam.check(GAM_PRISM1) #This will produce basic plots of residuals |
|
79 |
savePlot("GAM_PRISM_diagnostic1.emf", type="emf") |
|
80 |
gam.check(GAM_ANUSPLIN1) |
|
81 |
win.graph() |
|
82 |
vis.gam(GAM_ANUSPLIN1) |
|
83 |
savePlot("GAM_ANUSPLIN1_prediction.emf", type="emf") |
|
84 |
win.graph() |
|
85 |
vis.gam(GAM_PRISM1) |
|
86 |
savePlot("GAM_PRISM1_prediction.emf", type="emf") |
|
87 |
win.graph() |
|
88 |
vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM")) |
|
89 |
#vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM", theta=100,phi=200)) |
|
90 |
savePlot("GAM_ANUSPLIN1_prediction2.emf", type="emf") |
|
91 |
|
|
92 |
####VALIDATION: Prediction checking the results using the testing data######## |
|
93 |
y_plANUSPLIN1<- predict(lm_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) |
|
94 |
y_pgANUSPLIN1<- predict(GAM_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) #Using the coeff to predict new values. |
|
95 |
y_plPRISM1<- predict(lm_PRISM1, newdata=ghcn1507_v, se.fit = TRUE) |
|
96 |
y_pgPRISM1<- predict(GAM_PRISM1, newdata=ghcn1507_v, se.fit = TRUE) |
|
97 |
|
|
98 |
res_yplA1<- ghcn1507_v$tmax - y_plANUSPLIN1$fit |
|
99 |
res_ypgA1<- ghcn1507_v$tmax - y_pgANUSPLIN1$fit |
|
100 |
res_yplP1<- ghcn1507_v$tmax - y_plPRISM1$fit #Residuals for lm model that resembles the PRISM interpolation |
|
101 |
res_ypgP1<- ghcn1507_v$tmax - y_pgPRISM1$fit #Residuals for GAM model that resembles the PRISM interpolation |
|
102 |
|
|
103 |
#results_val <-c(res_yplA1,res_ypgA1,res_yplP1,res_ypgP1) |
|
104 |
results_val<-data.frame(res_yplA1=res_yplA1,res_ypgA1=res_ypgA1,res_yplP1=res_yplP1,res_ypgP1=res_ypgP1) |
|
105 |
nv<- nrow(ghcn1507_v) |
|
106 |
|
|
107 |
RMSE_yplA1 <- sqrt(sum(res_yplA1^2)*1/nv) |
|
108 |
#RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv) |
|
109 |
RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv) |
|
110 |
RMSE_yplP1 <- sqrt(sum(res_yplP1^2)/nv) |
|
111 |
RMSE_ypgP1 <- sqrt(sum(res_ypgP1^2)/nv) |
|
112 |
|
|
113 |
#Printing the RMSE values for the different models |
|
114 |
RMSE_yplA1 |
|
115 |
RMSE_ypgA1 |
|
116 |
RMSE_yplP1 |
|
117 |
RMSE_ypgP1 |
|
118 |
|
|
119 |
mod_name<-c("yplA1","ypgA1","yplP1","ypgP1") |
|
120 |
mod_type<-c("lm_ANUSPLIN1","GAM_ANUSPLIN1","lm_PRISM1","GAM_PRISM1") |
|
121 |
|
|
122 |
RMSE_all<-c(RMSE_yplA1,RMSE_ypgA1,RMSE_yplP1,RMSE_ypgP1) |
|
123 |
AIC_all<-AIC(lm_ANUSPLIN1,GAM_ANUSPLIN1,lm_PRISM1,GAM_PRISM1) |
|
124 |
GCV_all<-c(0,GCVA1,0,GCVP1) #This places the GCV values for each model in a vector |
|
125 |
Deviance_all<-c(0,DevianceA1,0,DevianceP1) |
|
126 |
|
|
127 |
#results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC) |
|
128 |
#results_val<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC) |
|
129 |
results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC, GCV=GCV_all,Deviance=Deviance_all) |
|
130 |
|
|
131 |
#Add deviance |
|
132 |
|
|
133 |
barplot(results$RMSE,main="RMSE for the models",names.arg=c("yplA1","ypgA1","yplP1","ypgP1"),ylab="Temp (deg. C)") |
|
134 |
barplot(results$AIC,main="AIC for the models",names.arg=c("yplA1","ypgA1","yplP1","ypgP1"),ylab="AIC") |
|
135 |
#dump("results", file= paste(path,"/","results_reg1507.txt",sep="")) |
|
136 |
write.csv(results, file= paste(path,"/","results_reg1507_Assessment.txt",sep="")) |
|
137 |
#write.csv(results_val, file= paste(path,"/","results_reg1507_val.txt",sep="")) |
|
138 |
|
|
139 |
|
|
140 |
library(lattice) |
|
141 |
g <- expand.grid(x = ghcn1507_v$lon, y = ghcn1507_v$lat) |
|
142 |
g$z <- y_plANUSPLIN1$fit |
|
143 |
wireframe(z ~ x * y, data = g) |
|
144 |
|
|
145 |
#####End of script########## |
|
146 |
############################### |
|
30 |
#for loops |
|
31 |
ddat <- as.list(rep("", length(dates))) |
|
32 |
|
|
33 |
for(i in 1:length(dates)){ |
|
34 |
data<-cat("ghcn_",dates[[1]],sep="") |
|
35 |
data<-as.data.frame(data) |
|
36 |
ddat[[i]] <- data.frame(subset(ghcn,ghcn$date_==dates[[i]])) |
|
37 |
#data<-subset(ghcn,ghcn$date_==dates[[i]]) |
|
38 |
n<-nrow(data) |
|
39 |
ns<-n-round(n*0.3) #Create a sample from the data frame with 70% of the rows |
|
40 |
#ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
41 |
ind.training <- sample(nrow(data), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly |
|
42 |
ind.testing <- setdiff(1:nrow(data), ind.training) |
|
43 |
data_s <- data[ind.training, ] |
|
44 |
data_v <- data[ind.testing, ] |
|
45 |
} |
|
46 |
|
|
47 |
# ############ REGRESSION ############### |
|
48 |
# |
|
49 |
# ###Regression part 1: linear models |
|
50 |
# |
|
51 |
# lm_ANUSPLIN1<-lm(tmax~lat+lon+ELEV_SRTM, data=ghcn1507_s) |
|
52 |
# lm_PRISM1<-lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn1507_s) #Note that a variable on inversion is missing |
|
53 |
# summary(lm_ANUSPLIN1) |
|
54 |
# summary(lm_PRISM1) |
|
55 |
# |
|
56 |
# ###Regression part 2: GAM models |
|
57 |
# GAM_ANUSPLIN1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=ghcn1507_s) |
|
58 |
# GAM_PRISM1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (ASPECT)+ s(DISTOC), data=ghcn1507_s) |
|
59 |
# #use the s() for smoothing function |
|
60 |
# |
|
61 |
# ###Compare the models |
|
62 |
# #Show AIC, Cook distance, p values and residuals plot |
|
63 |
# ###Access the R2 and significance to give a report |
|
64 |
# |
|
65 |
# AIC (lm_ANUSPLIN1,GAM_ANUSPLIN1) #list the AIC and for the results |
|
66 |
# AIC (lm_PRISM1,GAM_PRISM1) #list the AIC and for the results |
|
67 |
# |
|
68 |
# GCVA1<-GAM_ANUSPLIN1$gcv.ubre |
|
69 |
# GCVP1<-GAM_PRISM1$gcv.ubre |
|
70 |
# |
|
71 |
# DevianceA1<-GAM_ANUSPLIN1$deviance |
|
72 |
# DevianceP1<-GAM_PRISM1$deviance |
|
73 |
# |
|
74 |
# anova(lm_ANUSPLIN1, GAM_ANUSPLIN1,test="F") #compare the different models in terms of F; a reference model should be set for comparison. |
|
75 |
# anova(lm_PRISM1, GAM_PRISM1,test="F") |
|
76 |
# |
|
77 |
# summary(lm_ANUSPLIN1)$r.squared |
|
78 |
# summary(GAM_ANUSPLIN1)$r.squared |
|
79 |
# summary(lm_PRISM1)$r.squared |
|
80 |
# summary(GAM_PRISM1)$r.squared |
|
81 |
# |
|
82 |
# ############Diagnostic GAM plots############# |
|
83 |
# win.graph() |
|
84 |
# gam.check(GAM_ANUSPLIN1) |
|
85 |
# savePlot("GAM_ANUSPLIN1_diagnostic1.emf", type="emf") |
|
86 |
# win.graph() |
|
87 |
# gam.check(GAM_PRISM1) #This will produce basic plots of residuals |
|
88 |
# savePlot("GAM_PRISM_diagnostic1.emf", type="emf") |
|
89 |
# gam.check(GAM_ANUSPLIN1) |
|
90 |
# win.graph() |
|
91 |
# vis.gam(GAM_ANUSPLIN1) |
|
92 |
# savePlot("GAM_ANUSPLIN1_prediction.emf", type="emf") |
|
93 |
# win.graph() |
|
94 |
# vis.gam(GAM_PRISM1) |
|
95 |
# savePlot("GAM_PRISM1_prediction.emf", type="emf") |
|
96 |
# win.graph() |
|
97 |
# vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM")) |
|
98 |
# #vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM", theta=100,phi=200)) |
|
99 |
# savePlot("GAM_ANUSPLIN1_prediction2.emf", type="emf") |
|
100 |
# |
|
101 |
# ####VALIDATION: Prediction checking the results using the testing data######## |
|
102 |
# y_plANUSPLIN1<- predict(lm_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) |
|
103 |
# y_pgANUSPLIN1<- predict(GAM_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) #Using the coeff to predict new values. |
|
104 |
# y_plPRISM1<- predict(lm_PRISM1, newdata=ghcn1507_v, se.fit = TRUE) |
|
105 |
# y_pgPRISM1<- predict(GAM_PRISM1, newdata=ghcn1507_v, se.fit = TRUE) |
|
106 |
# |
|
107 |
# res_yplA1<- ghcn1507_v$tmax - y_plANUSPLIN1$fit |
|
108 |
# res_ypgA1<- ghcn1507_v$tmax - y_pgANUSPLIN1$fit |
|
109 |
# res_yplP1<- ghcn1507_v$tmax - y_plPRISM1$fit #Residuals for lm model that resembles the PRISM interpolation |
|
110 |
# res_ypgP1<- ghcn1507_v$tmax - y_pgPRISM1$fit #Residuals for GAM model that resembles the PRISM interpolation |
|
111 |
# |
|
112 |
# #results_val <-c(res_yplA1,res_ypgA1,res_yplP1,res_ypgP1) |
|
113 |
# results_val<-data.frame(res_yplA1=res_yplA1,res_ypgA1=res_ypgA1,res_yplP1=res_yplP1,res_ypgP1=res_ypgP1) |
|
114 |
# nv<- nrow(ghcn1507_v) |
|
115 |
# |
|
116 |
# RMSE_yplA1 <- sqrt(sum(res_yplA1^2)*1/nv) |
|
117 |
# #RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv) |
|
118 |
# RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv) |
|
119 |
# RMSE_yplP1 <- sqrt(sum(res_yplP1^2)/nv) |
|
120 |
# RMSE_ypgP1 <- sqrt(sum(res_ypgP1^2)/nv) |
|
121 |
# |
|
122 |
# #Printing the RMSE values for the different models |
|
123 |
# RMSE_yplA1 |
|
124 |
# RMSE_ypgA1 |
|
125 |
# RMSE_yplP1 |
|
126 |
# RMSE_ypgP1 |
|
127 |
# |
|
128 |
# mod_name<-c("yplA1","ypgA1","yplP1","ypgP1") |
|
129 |
# mod_type<-c("lm_ANUSPLIN1","GAM_ANUSPLIN1","lm_PRISM1","GAM_PRISM1") |
|
130 |
# |
|
131 |
# RMSE_all<-c(RMSE_yplA1,RMSE_ypgA1,RMSE_yplP1,RMSE_ypgP1) |
|
132 |
# AIC_all<-AIC(lm_ANUSPLIN1,GAM_ANUSPLIN1,lm_PRISM1,GAM_PRISM1) |
|
133 |
# GCV_all<-c(0,GCVA1,0,GCVP1) #This places the GCV values for each model in a vector |
|
134 |
# Deviance_all<-c(0,DevianceA1,0,DevianceP1) |
|
135 |
# |
|
136 |
# #results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC) |
|
137 |
# #results_val<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC) |
|
138 |
# results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC, GCV=GCV_all,Deviance=Deviance_all) |
|
139 |
# |
|
140 |
# #Add deviance |
|
141 |
# |
|
142 |
# barplot(results$RMSE,main="RMSE for the models",names.arg=c("yplA1","ypgA1","yplP1","ypgP1"),ylab="Temp (deg. C)") |
|
143 |
# barplot(results$AIC,main="AIC for the models",names.arg=c("yplA1","ypgA1","yplP1","ypgP1"),ylab="AIC") |
|
144 |
# #dump("results", file= paste(path,"/","results_reg1507.txt",sep="")) |
|
145 |
# write.csv(results, file= paste(path,"/","results_reg1507_Assessment.txt",sep="")) |
|
146 |
# #write.csv(results_val, file= paste(path,"/","results_reg1507_val.txt",sep="")) |
|
147 |
# |
|
148 |
# |
|
149 |
# library(lattice) |
|
150 |
# g <- expand.grid(x = ghcn1507_v$lon, y = ghcn1507_v$lat) |
|
151 |
# g$z <- y_plANUSPLIN1$fit |
|
152 |
# wireframe(z ~ x * y, data = g) |
|
153 |
# |
|
154 |
# #####End of script########## |
|
155 |
# ############################### |
|
147 | 156 |
|
148 | 157 |
|
149 | 158 |
|
Also available in: Unified diff
Start of Task #361, modified code to subset for 10 dates