Project

General

Profile

« Previous | Next » 

Revision 3f649df0

Added by Benoit Parmentier over 12 years ago

  • ID 3f649df0623403a7777487b6dba6551a87273b46

changed RMSE and AIC code in GAM script

View differences:

climate/research/oregon/interpolation/Linear_reg.R
3 3
#1)assumes that the csv file is in the current working
4 4
#2)extract relevant variables from raster images before performing the regressions.
5 5
#The user must provide the list of raster images in  a textile.
6
#Created by Benoit Parmentier on February 15, 2012.
6
#Script created by Benoit Parmentier on February 15, 2012.
7 7

  
8 8
###Loading r library and packages
9 9
library(raster)                                                                        # loading the raster package
......
14 14
###Parameters and arguments
15 15
infile1<-"ghcn_or_b_02122012_OR83M.csv"
16 16
inlistf<-"list_files.txt"
17
path<-getwd()
17
path<-"C:/Data/Benoit/NCEAS/window_Oregon_data"
18
#path<-getwd()
19
#date<-"20101507"                                                                       #Date selected for the regression
20
#prop<-0.3                                                                              #Proportion of testing retained for validation
18 21
#list_var<-list.files()                                                                 # displaying the files in the current folder which should contain only the relevant raster images.
19 22
#dump("list_var", file="list_var.txt")                                                  # Saving the values of list_var in a ascii text file.
20 23
#inlistvar<-"ghcn_var_02122012.txt"
......
23 26

  
24 27
###Reading the station data
25 28
ghcn<-read.csv(paste(path,"/",infile1, sep=""), header=TRUE)                            #The "paste" function concatenates the path and file name in common string.
26
ghcn<-read.csv(infile1)                                                                 #Use read.table if the input file is space delimited or read.table(infile1, headername=TRUE, sep=',')
29
#ghcn<-read.csv(infile1)                                                                 #Use read.table if the input file is space delimited or read.table(infile1, headername=TRUE, sep=',')
27 30
names(ghcn)                                                                             #Checking that the columns are correctly labelled.
28 31

  
29 32
###Extracting the variables values from the raster files
30 33
coords<- ghcn[,c('x_OR83M','y_OR83M')]                                                  #Selecting all rows for the x and y columns and packaging the output in a data frame.
31
lines <-readLines(inlistf)
34
lines <-readLines(paste(path,"/",inlistf, sep="")
32 35
inlistvar<-paste(path,"/",lines,sep="")
33 36

  
34 37
#s_raster<- stack(inlistvar)                                                            #Creating a stack of raster images from the list of variables.
......
36 39
#create a shape file and data_frame with names??
37 40

  
38 41
###Creating a validation dataset by creating training and testing datasets (%30)
42
#ghcn1507 <-subset(ghcn,ghcn$date_== date)
43
ghcn1507 <-subset(ghcn,ghcn$date_=="20100715")
39 44
n<-nrow(ghcn1507)
40 45
ns<-n-round(n*0.3)  #Create a sample from the data frame with 70% of the rows
46
#ns<-n-round(n*prop)  #Create a sample from the data frame with 70% of the rows
41 47
ind.training <- sample(nrow(ghcn1507), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
42 48
ind.testing <- setdiff(1:nrow(ghcn1507), ind.training)
43 49
ghcn1507_s <- ghcn1507[ind.training, ]
......
46 52
############ REGRESSION ###############
47 53

  
48 54
###Regression part 1: linear models
49
lm_PRISM1=lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn)
50
lm_ANUSPLIN1=lm(tmax~lat+lon+ELEV_SRTM, data=ghcn)
55

  
56
lm_ANUSPLIN1=lm(tmax~lat+lon+ELEV_SRTM, data=ghcn1507_s)
57
lm_PRISM1=lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn1507_s) #Note that a variable on inversion is missing
51 58
summary(lm_ANUSPLIN1)
52 59
summary(lm_PRISM1)
53 60

  
54
#Regression part2: linear model on a specific date.
55
ghcn1507 <-subset(ghcn,ghcn$date_=="20100715")
56
lm_PRISM2=lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn1507)
57
lm_ANUSPLIN2=lm(tmax~lat+lon+ELEV_SRTM, data=ghcn1507)
58

  
59

  
60
###Regression part 3: GAM models
61
GAM1<-gam(tmax~ s(lat) + s (lon) + s (elev), data=ghcn1507)
62

  
63

  
61
###Regression part 2: GAM models
62
GAM_PRISM1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=ghcn1507_s)
63
GAM_ANUSPLIN1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=ghcn1507_s)
64 64
#use the s() for smoothing function
65 65

  
66 66
###Compare the models
67 67
#Show AIC, Cook distance, p values and residuals plot
68 68
###Access the R2 and significance to give a report
69 69

  
70
AIC (lm_ANUSPLIN1,lm_ANUSPLIN2,GAM1) #list the AIC and for the results
71
anova(lm_ANUSPLIN1, lm_ANUSPLIN2,GAM1,test="F") #compare the different models in terms of F; a reference model should be set for comparison.
72
#GAM plot of partial residuals
73
gam.check(GAM1)   #This will produce basic plots of residuals
70
AIC (lm_ANUSPLIN1,GAM_ANUSPLIN1) #list the AIC and for the results
71
AIC (lm_PRISM1,GAM_PRISM1) #list the AIC and for the results
72
anova(lm_ANUSPLIN1, GAM_ANUSPLIN1,test="F") #compare the different models in terms of F; a reference model should be set for comparison.
73
anova(lm_PRISM1, GAM_PRISM1,test="F")
74 74

  
75
#GAM plot of partial residuals
76
gam.check(GAM_PRISM1)   #This will produce basic plots of residuals
77
gam.check(GAM_ANUSPLIN1)
78

  
79
#Prediction checking the results using the testing data
80
y_plANUSPLIN1<- predict(lm_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE)
81
y_pgANUSPLIN1<- predict(GAM_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE)
82
y_plPRISM1<- predict(lm_PRISM1, newdata=ghcn1507_v, se.fit = TRUE)
83
y_pgPRISM1<- predict(GAM_PRISM1, newdata=ghcn1507_v, se.fit = TRUE)
84

  
85
res_yplA1<- ghcn1507_v$tmax - y_plANUSPLIN1$fit
86
res_ypgA1<- ghcn1507_v$tmax - y_pgANUSPLIN1$fit
87
res_yplP1<- ghcn1507_v$tmax - y_plPRISM1$fit   #Residuals for lm model that resembles the PRISM interpolation
88
res_ypgP1<- ghcn1507_v$tmax - y_pgPRISM1$fit   #Residuals for GAM model that resembles the PRISM interpolation
89

  
90
RMSE_yplA1 <- sqrt(sum(res_yplA1^2))
91
#RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2))
92
RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2))
93
RMSE_yplP1 <- sqrt(sum(res_yplP1^2))
94
RMSE_ypgP1 <- sqrt(sum(res_ypgP1^2))
95

  
96
#Printing the RMSE values for the different models
97
RMSE_yplA1
98
RMSE_ypgA1
99
RMSE_yplP1
100
RMSE_ypgP1
101

  
102
RMSE_all<-c(RMSE_yplA1,RMSE_ypgA1,RMSE_yplP1,RMSE_ypgP1)
103
mod_name<-c("yplA1","ypgA1","yplP1","ypgP1")
104
mod_type<-c("lm_ANUSPLIN1","GAM_ANUSPLIN1","lm_PRISM1","GAM_PRISM1")
105
AIC_all<-AIC(lm_ANUSPLIN1,GAM_ANUSPLIN1,lm_PRISM1,GAM_PRISM1)
106
results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC)
107

  
108

  
109
#dump("results", file= paste(path,"/","results_reg1507.txt",sep=""))
110
write.csv(results, file= paste(path,"/","results_reg1507.txt",sep=""))
75 111

  
76 112
##End of script
77 113

  

Also available in: Unified diff