Project

General

Profile

« Previous | Next » 

Revision bc59c6bb

Added by Benoit Parmentier over 12 years ago

  • ID bc59c6bbd35da1cae8eab57cc56f77f4f619aba8

Start of Task #361, modified code to subset for 10 dates

View differences:

climate/research/oregon/interpolation/Linear_reg.R
1
####################Interpolation of Tmax for on3 day.#####################
1
####################Interpolation of Tmax for 10 dates.#####################
2 2
#This script interpolates station values for the Oregon case study. This program loads the station data from a csv file 
3 3
#and perform two types  of regression: multiple linear model and general additive model (GAM). Note that this program:
4 4
#1)assumes that the csv file is in the current working 
......
14 14
infile1<-"ghcn_or_b_02122012_OR83M.csv"
15 15
path<-"C:/Data/Benoit/NCEAS/window_Oregon_data"
16 16
setwd(path)
17
#date<-"20101507"                                                                       #Date selected for the regression 
18
#prop<-0.3                                                                              #Proportion of testing retained for validation   
17
infile2<-"dates_interpolation_03012012.txt"  # list of 10 dates
18
#dates<-"20101507"  # list of 10 dates in a textfile                                   #Date selected for the regression 
19
#prop<-0.3                                                                            #Proportion of testing retained for validation   
19 20

  
20 21
#######START OF THE SCRIPT #############
21 22

  
22 23
###Reading the station data
23 24
ghcn<-read.csv(paste(path,"/",infile1, sep=""), header=TRUE)                            #The "paste" function concatenates the path and file name in common string. 
24
#ghcn<-read.csv(infile1)                                                                 #Use read.table if the input file is space delimited or read.table(infile1, headername=TRUE, sep=',')
25
names(ghcn)                                                                             #Checking that the columns are correctly labelled.
26

  
25
#ghcn<-read.csv(infile1)                                                                #Use read.table if the input file is space delimited or read.table(infile1, headername=TRUE, sep=',')                                                                            #Checking that the columns are correctly labelled.
26
dates <-readLines(paste(path,"/",infile2, sep=""))
27
                  
27 28
###Creating a validation dataset by creating training and testing datasets (%30)
28 29
#ghcn1507 <-subset(ghcn,ghcn$date_== date)
29
ghcn1507 <-subset(ghcn,ghcn$date_=="20100715")
30
n<-nrow(ghcn1507)
31
ns<-n-round(n*0.3)  #Create a sample from the data frame with 70% of the rows
32
#ns<-n-round(n*prop)  #Create a sample from the data frame with 70% of the rows
33
ind.training <- sample(nrow(ghcn1507), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
34
ind.testing <- setdiff(1:nrow(ghcn1507), ind.training)
35
ghcn1507_s <- ghcn1507[ind.training, ]
36
ghcn1507_v <- ghcn1507[ind.testing, ]
37

  
38
############ REGRESSION ###############
39

  
40
###Regression part 1: linear models
41

  
42
lm_ANUSPLIN1<-lm(tmax~lat+lon+ELEV_SRTM, data=ghcn1507_s)
43
lm_PRISM1<-lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn1507_s) #Note that a variable on inversion is missing
44
summary(lm_ANUSPLIN1)
45
summary(lm_PRISM1)
46

  
47
###Regression part 2: GAM models
48
GAM_ANUSPLIN1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=ghcn1507_s)
49
GAM_PRISM1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (ASPECT)+ s(DISTOC), data=ghcn1507_s)
50
#use the s() for smoothing function
51

  
52
###Compare the models
53
#Show AIC, Cook distance, p values and residuals plot
54
###Access the R2 and significance to give a report
55

  
56
AIC (lm_ANUSPLIN1,GAM_ANUSPLIN1) #list the AIC and for the results
57
AIC (lm_PRISM1,GAM_PRISM1) #list the AIC and for the results
58

  
59
GCVA1<-GAM_ANUSPLIN1$gcv.ubre
60
GCVP1<-GAM_PRISM1$gcv.ubre
61

  
62
DevianceA1<-GAM_ANUSPLIN1$deviance
63
DevianceP1<-GAM_PRISM1$deviance
64

  
65
anova(lm_ANUSPLIN1, GAM_ANUSPLIN1,test="F") #compare the different models in terms of F; a reference model should be set for comparison. 
66
anova(lm_PRISM1, GAM_PRISM1,test="F")
67

  
68
summary(lm_ANUSPLIN1)$r.squared
69
summary(GAM_ANUSPLIN1)$r.squared
70
summary(lm_PRISM1)$r.squared                
71
summary(GAM_PRISM1)$r.squared
72

  
73
############Diagnostic GAM plots#############
74
win.graph()
75
gam.check(GAM_ANUSPLIN1)
76
savePlot("GAM_ANUSPLIN1_diagnostic1.emf", type="emf")
77
win.graph()
78
gam.check(GAM_PRISM1)   #This will produce basic plots of residuals
79
savePlot("GAM_PRISM_diagnostic1.emf", type="emf")
80
gam.check(GAM_ANUSPLIN1)
81
win.graph()
82
vis.gam(GAM_ANUSPLIN1)
83
savePlot("GAM_ANUSPLIN1_prediction.emf", type="emf")        
84
win.graph()
85
vis.gam(GAM_PRISM1)
86
savePlot("GAM_PRISM1_prediction.emf", type="emf")
87
win.graph()
88
vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM"))
89
#vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM", theta=100,phi=200))
90
savePlot("GAM_ANUSPLIN1_prediction2.emf", type="emf")
91

  
92
####VALIDATION: Prediction checking the results using the testing data########
93
y_plANUSPLIN1<- predict(lm_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE)
94
y_pgANUSPLIN1<- predict(GAM_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) #Using the coeff to predict new values.
95
y_plPRISM1<- predict(lm_PRISM1, newdata=ghcn1507_v, se.fit = TRUE)
96
y_pgPRISM1<- predict(GAM_PRISM1, newdata=ghcn1507_v, se.fit = TRUE)            
97
         
98
res_yplA1<- ghcn1507_v$tmax - y_plANUSPLIN1$fit
99
res_ypgA1<- ghcn1507_v$tmax - y_pgANUSPLIN1$fit
100
res_yplP1<- ghcn1507_v$tmax - y_plPRISM1$fit   #Residuals for lm model that resembles the PRISM interpolation
101
res_ypgP1<- ghcn1507_v$tmax - y_pgPRISM1$fit   #Residuals for GAM model that resembles the PRISM interpolation                               
102

  
103
#results_val <-c(res_yplA1,res_ypgA1,res_yplP1,res_ypgP1)
104
results_val<-data.frame(res_yplA1=res_yplA1,res_ypgA1=res_ypgA1,res_yplP1=res_yplP1,res_ypgP1=res_ypgP1)
105
nv<- nrow(ghcn1507_v)
106
                  
107
RMSE_yplA1 <- sqrt(sum(res_yplA1^2)*1/nv)            
108
#RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv)
109
RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv)
110
RMSE_yplP1 <- sqrt(sum(res_yplP1^2)/nv)            
111
RMSE_ypgP1 <- sqrt(sum(res_ypgP1^2)/nv)
112
      
113
#Printing the RMSE values for the different models                  
114
RMSE_yplA1             
115
RMSE_ypgA1 
116
RMSE_yplP1          
117
RMSE_ypgP1
118

  
119
mod_name<-c("yplA1","ypgA1","yplP1","ypgP1")
120
mod_type<-c("lm_ANUSPLIN1","GAM_ANUSPLIN1","lm_PRISM1","GAM_PRISM1")
121

  
122
RMSE_all<-c(RMSE_yplA1,RMSE_ypgA1,RMSE_yplP1,RMSE_ypgP1)
123
AIC_all<-AIC(lm_ANUSPLIN1,GAM_ANUSPLIN1,lm_PRISM1,GAM_PRISM1)
124
GCV_all<-c(0,GCVA1,0,GCVP1)     #This places the GCV values for each model in a vector
125
Deviance_all<-c(0,DevianceA1,0,DevianceP1)
126

  
127
#results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC)
128
#results_val<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC)  
129
results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC, GCV=GCV_all,Deviance=Deviance_all)
130

  
131
#Add deviance
132

  
133
barplot(results$RMSE,main="RMSE for the models",names.arg=c("yplA1","ypgA1","yplP1","ypgP1"),ylab="Temp (deg. C)")                  
134
barplot(results$AIC,main="AIC for the models",names.arg=c("yplA1","ypgA1","yplP1","ypgP1"),ylab="AIC")
135
#dump("results", file= paste(path,"/","results_reg1507.txt",sep=""))
136
write.csv(results, file= paste(path,"/","results_reg1507_Assessment.txt",sep=""))
137
#write.csv(results_val, file= paste(path,"/","results_reg1507_val.txt",sep=""))
138

  
139
                
140
library(lattice)                 
141
g <- expand.grid(x = ghcn1507_v$lon, y = ghcn1507_v$lat)
142
g$z <- y_plANUSPLIN1$fit
143
wireframe(z ~ x * y, data = g)                   
144

  
145
#####End of script##########
146
###############################
30
#for loops
31
ddat <- as.list(rep("", length(dates)))
32

  
33
for(i in 1:length(dates)){
34
  data<-cat("ghcn_",dates[[1]],sep="")
35
  data<-as.data.frame(data)
36
  ddat[[i]] <- data.frame(subset(ghcn,ghcn$date_==dates[[i]]))
37
  #data<-subset(ghcn,ghcn$date_==dates[[i]])
38
  n<-nrow(data)
39
  ns<-n-round(n*0.3)  #Create a sample from the data frame with 70% of the rows
40
  #ns<-n-round(n*prop)  #Create a sample from the data frame with 70% of the rows
41
  ind.training <- sample(nrow(data), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
42
  ind.testing <- setdiff(1:nrow(data), ind.training)
43
  data_s <- data[ind.training, ]
44
  data_v <- data[ind.testing, ]
45
  }
46

  
47
# ############ REGRESSION ###############
48
# 
49
# ###Regression part 1: linear models
50
# 
51
# lm_ANUSPLIN1<-lm(tmax~lat+lon+ELEV_SRTM, data=ghcn1507_s)
52
# lm_PRISM1<-lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn1507_s) #Note that a variable on inversion is missing
53
# summary(lm_ANUSPLIN1)
54
# summary(lm_PRISM1)
55
# 
56
# ###Regression part 2: GAM models
57
# GAM_ANUSPLIN1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=ghcn1507_s)
58
# GAM_PRISM1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (ASPECT)+ s(DISTOC), data=ghcn1507_s)
59
# #use the s() for smoothing function
60
# 
61
# ###Compare the models
62
# #Show AIC, Cook distance, p values and residuals plot
63
# ###Access the R2 and significance to give a report
64
# 
65
# AIC (lm_ANUSPLIN1,GAM_ANUSPLIN1) #list the AIC and for the results
66
# AIC (lm_PRISM1,GAM_PRISM1) #list the AIC and for the results
67
# 
68
# GCVA1<-GAM_ANUSPLIN1$gcv.ubre
69
# GCVP1<-GAM_PRISM1$gcv.ubre
70
# 
71
# DevianceA1<-GAM_ANUSPLIN1$deviance
72
# DevianceP1<-GAM_PRISM1$deviance
73
# 
74
# anova(lm_ANUSPLIN1, GAM_ANUSPLIN1,test="F") #compare the different models in terms of F; a reference model should be set for comparison. 
75
# anova(lm_PRISM1, GAM_PRISM1,test="F")
76
# 
77
# summary(lm_ANUSPLIN1)$r.squared
78
# summary(GAM_ANUSPLIN1)$r.squared
79
# summary(lm_PRISM1)$r.squared                
80
# summary(GAM_PRISM1)$r.squared
81
# 
82
# ############Diagnostic GAM plots#############
83
# win.graph()
84
# gam.check(GAM_ANUSPLIN1)
85
# savePlot("GAM_ANUSPLIN1_diagnostic1.emf", type="emf")
86
# win.graph()
87
# gam.check(GAM_PRISM1)   #This will produce basic plots of residuals
88
# savePlot("GAM_PRISM_diagnostic1.emf", type="emf")
89
# gam.check(GAM_ANUSPLIN1)
90
# win.graph()
91
# vis.gam(GAM_ANUSPLIN1)
92
# savePlot("GAM_ANUSPLIN1_prediction.emf", type="emf")        
93
# win.graph()
94
# vis.gam(GAM_PRISM1)
95
# savePlot("GAM_PRISM1_prediction.emf", type="emf")
96
# win.graph()
97
# vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM"))
98
# #vis.gam(GAM_ANUSPLIN1, view=c("lat","ELEV_SRTM", theta=100,phi=200))
99
# savePlot("GAM_ANUSPLIN1_prediction2.emf", type="emf")
100
# 
101
# ####VALIDATION: Prediction checking the results using the testing data########
102
# y_plANUSPLIN1<- predict(lm_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE)
103
# y_pgANUSPLIN1<- predict(GAM_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) #Using the coeff to predict new values.
104
# y_plPRISM1<- predict(lm_PRISM1, newdata=ghcn1507_v, se.fit = TRUE)
105
# y_pgPRISM1<- predict(GAM_PRISM1, newdata=ghcn1507_v, se.fit = TRUE)            
106
#          
107
# res_yplA1<- ghcn1507_v$tmax - y_plANUSPLIN1$fit
108
# res_ypgA1<- ghcn1507_v$tmax - y_pgANUSPLIN1$fit
109
# res_yplP1<- ghcn1507_v$tmax - y_plPRISM1$fit   #Residuals for lm model that resembles the PRISM interpolation
110
# res_ypgP1<- ghcn1507_v$tmax - y_pgPRISM1$fit   #Residuals for GAM model that resembles the PRISM interpolation                               
111
# 
112
# #results_val <-c(res_yplA1,res_ypgA1,res_yplP1,res_ypgP1)
113
# results_val<-data.frame(res_yplA1=res_yplA1,res_ypgA1=res_ypgA1,res_yplP1=res_yplP1,res_ypgP1=res_ypgP1)
114
# nv<- nrow(ghcn1507_v)
115
#                   
116
# RMSE_yplA1 <- sqrt(sum(res_yplA1^2)*1/nv)            
117
# #RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv)
118
# RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)/nv)
119
# RMSE_yplP1 <- sqrt(sum(res_yplP1^2)/nv)            
120
# RMSE_ypgP1 <- sqrt(sum(res_ypgP1^2)/nv)
121
#       
122
# #Printing the RMSE values for the different models                  
123
# RMSE_yplA1             
124
# RMSE_ypgA1 
125
# RMSE_yplP1          
126
# RMSE_ypgP1
127
# 
128
# mod_name<-c("yplA1","ypgA1","yplP1","ypgP1")
129
# mod_type<-c("lm_ANUSPLIN1","GAM_ANUSPLIN1","lm_PRISM1","GAM_PRISM1")
130
# 
131
# RMSE_all<-c(RMSE_yplA1,RMSE_ypgA1,RMSE_yplP1,RMSE_ypgP1)
132
# AIC_all<-AIC(lm_ANUSPLIN1,GAM_ANUSPLIN1,lm_PRISM1,GAM_PRISM1)
133
# GCV_all<-c(0,GCVA1,0,GCVP1)     #This places the GCV values for each model in a vector
134
# Deviance_all<-c(0,DevianceA1,0,DevianceP1)
135
# 
136
# #results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC)
137
# #results_val<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC)  
138
# results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC, GCV=GCV_all,Deviance=Deviance_all)
139
# 
140
# #Add deviance
141
# 
142
# barplot(results$RMSE,main="RMSE for the models",names.arg=c("yplA1","ypgA1","yplP1","ypgP1"),ylab="Temp (deg. C)")                  
143
# barplot(results$AIC,main="AIC for the models",names.arg=c("yplA1","ypgA1","yplP1","ypgP1"),ylab="AIC")
144
# #dump("results", file= paste(path,"/","results_reg1507.txt",sep=""))
145
# write.csv(results, file= paste(path,"/","results_reg1507_Assessment.txt",sep=""))
146
# #write.csv(results_val, file= paste(path,"/","results_reg1507_val.txt",sep=""))
147
# 
148
#                 
149
# library(lattice)                 
150
# g <- expand.grid(x = ghcn1507_v$lon, y = ghcn1507_v$lat)
151
# g$z <- y_plANUSPLIN1$fit
152
# wireframe(z ~ x * y, data = g)                   
153
# 
154
# #####End of script##########
155
# ###############################
147 156

  
148 157

  
149 158

  

Also available in: Unified diff