Revision e2d673e4
Added by Benoit Parmentier over 12 years ago
- ID e2d673e42b81d3fc56e248b6009ad94a9b5306b4
climate/research/oregon/interpolation/Linear_reg.R | ||
---|---|---|
1 |
#This script interpolates station value for the Oregon case study. This program loads the station data from a csv file |
|
1 |
#This script interpolates station value for the Oregon case study. This program loads the station data from a csv file
|
|
2 | 2 |
#and perform two types of regression: multiple linear model and general additive model (GAM). Note that this program: |
3 |
#1)assumes that the csv file is in the current working |
|
4 |
#2)extract relevant variables from raster images before performing the regressions. |
|
3 |
#1)assumes that the csv file is in the current working
|
|
4 |
#2)extract relevant variables from raster images before performing the regressions.
|
|
5 | 5 |
#The user must provide the list of raster images in a textile. |
6 |
#Script created by Benoit Parmentier on February 15, 2012.
|
|
6 |
#Script created by Benoit Parmentier on February 28, 2012.
|
|
7 | 7 |
|
8 |
###Loading r library and packages |
|
9 |
library(raster) # loading the raster package |
|
8 |
###Loading r library and packages # loading the raster package |
|
10 | 9 |
library(gtools) # loading ... |
11 |
library(sp) |
|
12 | 10 |
library(mgcv) |
13 | 11 |
|
14 | 12 |
###Parameters and arguments |
15 | 13 |
infile1<-"ghcn_or_b_02122012_OR83M.csv" |
16 |
inlistf<-"list_files.txt" |
|
17 | 14 |
path<-"C:/Data/Benoit/NCEAS/window_Oregon_data" |
18 |
#path<-getwd() |
|
19 |
#date<-"20101507" #Date selected for the regression |
|
20 |
#prop<-0.3 #Proportion of testing retained for validation |
|
21 |
#list_var<-list.files() # displaying the files in the current folder which should contain only the relevant raster images. |
|
22 |
#dump("list_var", file="list_var.txt") # Saving the values of list_var in a ascii text file. |
|
23 |
#inlistvar<-"ghcn_var_02122012.txt" |
|
15 |
setwd(path) |
|
16 |
#date<-"20101507" #Date selected for the regression |
|
17 |
#prop<-0.3 #Proportion of testing retained for validation |
|
24 | 18 |
|
25 | 19 |
#######START OF THE SCRIPT ############# |
26 | 20 |
|
27 | 21 |
###Reading the station data |
28 |
ghcn<-read.csv(paste(path,"/",infile1, sep=""), header=TRUE) #The "paste" function concatenates the path and file name in common string. |
|
22 |
ghcn<-read.csv(paste(path,"/",infile1, sep=""), header=TRUE) #The "paste" function concatenates the path and file name in common string.
|
|
29 | 23 |
#ghcn<-read.csv(infile1) #Use read.table if the input file is space delimited or read.table(infile1, headername=TRUE, sep=',') |
30 | 24 |
names(ghcn) #Checking that the columns are correctly labelled. |
31 | 25 |
|
32 |
###Extracting the variables values from the raster files |
|
33 |
coords<- ghcn[,c('x_OR83M','y_OR83M')] #Selecting all rows for the x and y columns and packaging the output in a data frame. |
|
34 |
lines <-readLines(paste(path,"/",inlistf, sep="") |
|
35 |
inlistvar<-paste(path,"/",lines,sep="") |
|
36 |
|
|
37 |
#s_raster<- stack(inlistvar) #Creating a stack of raster images from the list of variables. |
|
38 |
#stat_val<- extract(s_raster, coords) #Extracting values from the raster stack for every point location in coords data frame. |
|
39 |
#create a shape file and data_frame with names?? |
|
40 |
|
|
41 | 26 |
###Creating a validation dataset by creating training and testing datasets (%30) |
42 | 27 |
#ghcn1507 <-subset(ghcn,ghcn$date_== date) |
43 | 28 |
ghcn1507 <-subset(ghcn,ghcn$date_=="20100715") |
... | ... | |
69 | 54 |
|
70 | 55 |
AIC (lm_ANUSPLIN1,GAM_ANUSPLIN1) #list the AIC and for the results |
71 | 56 |
AIC (lm_PRISM1,GAM_PRISM1) #list the AIC and for the results |
72 |
anova(lm_ANUSPLIN1, GAM_ANUSPLIN1,test="F") #compare the different models in terms of F; a reference model should be set for comparison. |
|
57 |
|
|
58 |
GCVA1<-GAM_ANUSPLIN1$gcv.ubre |
|
59 |
GCVP1<-GAM_PRISM1$gcv.ubre |
|
60 |
|
|
61 |
DevianceA1<-GAM_ANUSPLIN1$deviance |
|
62 |
DevianceP1<-GAM_PRISM1$deviance |
|
63 |
|
|
64 |
anova(lm_ANUSPLIN1, GAM_ANUSPLIN1,test="F") #compare the different models in terms of F; a reference model should be set for comparison. |
|
73 | 65 |
anova(lm_PRISM1, GAM_PRISM1,test="F") |
74 | 66 |
|
75 |
#GAM plot of partial residuals |
|
67 |
summary(lm_ANUSPLIN1)$r.squared |
|
68 |
summary(GAM_ANUSPLIN1)$r.squared |
|
69 |
summary(lm_PRISM1)$r.squared |
|
70 |
summary(GAM_PRISM1)$r.squared |
|
71 |
|
|
72 |
############Diagnostic GAM plots############# |
|
73 |
win.graph() |
|
74 |
gam.check(GAM_ANUSPLIN1) |
|
75 |
savePlot("GAM_ANUSPLIN1_diagnostic1.emf", type="emf") |
|
76 |
win.graph() |
|
76 | 77 |
gam.check(GAM_PRISM1) #This will produce basic plots of residuals |
78 |
savePlot("GAM_PRISM_diagnostic1.emf", type="emf") |
|
77 | 79 |
gam.check(GAM_ANUSPLIN1) |
78 |
|
|
79 |
#Prediction checking the results using the testing data |
|
80 |
win.graph() |
|
81 |
vis.gam(GAM_ANUSPLIN1) |
|
82 |
savePlot("GAM_ANUSPLIN1_prediction.emf", type="emf") |
|
83 |
win.graph() |
|
84 |
vis.gam(GAM_PRISM1) |
|
85 |
savePlot("GAM_PRISM1_prediction.emf", type="emf") |
|
86 |
win.graph() |
|
87 |
vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM")) |
|
88 |
#vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM", theta=100,phi=200)) |
|
89 |
savePlot("GAM_ANUSPLIN1_prediction2.emf", type="emf") |
|
90 |
|
|
91 |
####VALIDATION: Prediction checking the results using the testing data######## |
|
80 | 92 |
y_plANUSPLIN1<- predict(lm_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) |
81 |
y_pgANUSPLIN1<- predict(GAM_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) |
|
93 |
y_pgANUSPLIN1<- predict(GAM_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) #Using the coeff to predict new values.
|
|
82 | 94 |
y_plPRISM1<- predict(lm_PRISM1, newdata=ghcn1507_v, se.fit = TRUE) |
83 |
y_pgPRISM1<- predict(GAM_PRISM1, newdata=ghcn1507_v, se.fit = TRUE) |
|
84 |
|
|
95 |
y_pgPRISM1<- predict(GAM_PRISM1, newdata=ghcn1507_v, se.fit = TRUE)
|
|
96 |
|
|
85 | 97 |
res_yplA1<- ghcn1507_v$tmax - y_plANUSPLIN1$fit |
86 | 98 |
res_ypgA1<- ghcn1507_v$tmax - y_pgANUSPLIN1$fit |
87 | 99 |
res_yplP1<- ghcn1507_v$tmax - y_plPRISM1$fit #Residuals for lm model that resembles the PRISM interpolation |
88 |
res_ypgP1<- ghcn1507_v$tmax - y_pgPRISM1$fit #Residuals for GAM model that resembles the PRISM interpolation |
|
89 |
|
|
90 |
RMSE_yplA1 <- sqrt(sum(res_yplA1^2)) |
|
91 |
#RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)) |
|
92 |
RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)) |
|
93 |
RMSE_yplP1 <- sqrt(sum(res_yplP1^2)) |
|
94 |
RMSE_ypgP1 <- sqrt(sum(res_ypgP1^2)) |
|
95 |
|
|
96 |
#Printing the RMSE values for the different models |
|
97 |
RMSE_yplA1 |
|
98 |
RMSE_ypgA1 |
|
99 |
RMSE_yplP1 |
|
100 |
res_ypgP1<- ghcn1507_v$tmax - y_pgPRISM1$fit #Residuals for GAM model that resembles the PRISM interpolation |
|
101 |
|
|
102 |
#results_val <-c(res_yplA1,res_ypgA1,res_yplP1,res_ypgP1) |
|
103 |
results_val<-data.frame(res_yplA1=res_yplA1,res_ypgA1=res_ypgA1,res_yplP1=res_yplP1,res_ypgP1=res_ypgP1) |
|
104 |
nv<- nrow(ghcn1507_v) |
|
105 |
|
|
106 |
RMSE_yplA1 <- sqrt(sum(res_yplA1^2)*1/nv) |
|
107 |
#RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv) |
|
108 |
RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv) |
|
109 |
RMSE_yplP1 <- sqrt(sum(res_yplP1^2)/nv) |
|
110 |
RMSE_ypgP1 <- sqrt(sum(res_ypgP1^2)/nv) |
|
111 |
|
|
112 |
#Printing the RMSE values for the different models |
|
113 |
RMSE_yplA1 |
|
114 |
RMSE_ypgA1 |
|
115 |
RMSE_yplP1 |
|
100 | 116 |
RMSE_ypgP1 |
101 | 117 |
|
102 |
RMSE_all<-c(RMSE_yplA1,RMSE_ypgA1,RMSE_yplP1,RMSE_ypgP1) |
|
103 | 118 |
mod_name<-c("yplA1","ypgA1","yplP1","ypgP1") |
104 | 119 |
mod_type<-c("lm_ANUSPLIN1","GAM_ANUSPLIN1","lm_PRISM1","GAM_PRISM1") |
120 |
|
|
121 |
RMSE_all<-c(RMSE_yplA1,RMSE_ypgA1,RMSE_yplP1,RMSE_ypgP1) |
|
105 | 122 |
AIC_all<-AIC(lm_ANUSPLIN1,GAM_ANUSPLIN1,lm_PRISM1,GAM_PRISM1) |
106 |
results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC) |
|
123 |
GCV_all<-c(0,GCVA1,0,GCVP1) #This places the GCV values for each model in a vector |
|
124 |
Deviance_all<-c(0,DevianceA1,0,DevianceP1) |
|
125 |
|
|
126 |
#results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC) |
|
127 |
#results_val<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC) |
|
128 |
results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC, GCV=GCV_all,Deviance=Deviance_all) |
|
129 |
|
|
130 |
#Add deviance |
|
107 | 131 |
|
132 |
barplot(results$RMSE,main="RMSE for the models",names.arg=c("yplA1","ypgA1","yplP1","ypgP1"),ylab="Temp (deg. C)") |
|
133 |
barplot(results$AIC,main="AIC for the models",names.arg=c("yplA1","ypgA1","yplP1","ypgP1"),ylab="AIC") |
|
108 | 134 |
#dump("results", file= paste(path,"/","results_reg1507.txt",sep="")) |
109 |
write.csv(results, file= paste(path,"/","results_reg1507.txt",sep="")) |
|
135 |
write.csv(results, file= paste(path,"/","results_reg1507_Assessment.txt",sep="")) |
|
136 |
#write.csv(results_val, file= paste(path,"/","results_reg1507_val.txt",sep="")) |
|
110 | 137 |
|
111 |
##End of script |
|
138 |
|
|
139 |
library(lattice) |
|
140 |
g <- expand.grid(x = ghcn1507_v$lon, y = ghcn1507_v$lat) |
|
141 |
g$z <- y_plANUSPLIN1$fit |
|
142 |
wireframe(z ~ x * y, data = g) |
|
112 | 143 |
|
144 |
#####End of script########## |
|
145 |
############################### |
|
113 | 146 |
|
114 | 147 |
|
115 | 148 |
|
116 |
|
|
149 |
|
Also available in: Unified diff
Corrected errors in RMSE calculation and added plots to view results