Revision 3f649df0
Added by Benoit Parmentier over 12 years ago
- ID 3f649df0623403a7777487b6dba6551a87273b46
climate/research/oregon/interpolation/Linear_reg.R | ||
---|---|---|
3 | 3 |
#1)assumes that the csv file is in the current working |
4 | 4 |
#2)extract relevant variables from raster images before performing the regressions. |
5 | 5 |
#The user must provide the list of raster images in a textile. |
6 |
#Created by Benoit Parmentier on February 15, 2012.
|
|
6 |
#Script created by Benoit Parmentier on February 15, 2012.
|
|
7 | 7 |
|
8 | 8 |
###Loading r library and packages |
9 | 9 |
library(raster) # loading the raster package |
... | ... | |
14 | 14 |
###Parameters and arguments |
15 | 15 |
infile1<-"ghcn_or_b_02122012_OR83M.csv" |
16 | 16 |
inlistf<-"list_files.txt" |
17 |
path<-getwd() |
|
17 |
path<-"C:/Data/Benoit/NCEAS/window_Oregon_data" |
|
18 |
#path<-getwd() |
|
19 |
#date<-"20101507" #Date selected for the regression |
|
20 |
#prop<-0.3 #Proportion of testing retained for validation |
|
18 | 21 |
#list_var<-list.files() # displaying the files in the current folder which should contain only the relevant raster images. |
19 | 22 |
#dump("list_var", file="list_var.txt") # Saving the values of list_var in a ascii text file. |
20 | 23 |
#inlistvar<-"ghcn_var_02122012.txt" |
... | ... | |
23 | 26 |
|
24 | 27 |
###Reading the station data |
25 | 28 |
ghcn<-read.csv(paste(path,"/",infile1, sep=""), header=TRUE) #The "paste" function concatenates the path and file name in common string. |
26 |
ghcn<-read.csv(infile1) #Use read.table if the input file is space delimited or read.table(infile1, headername=TRUE, sep=',') |
|
29 |
#ghcn<-read.csv(infile1) #Use read.table if the input file is space delimited or read.table(infile1, headername=TRUE, sep=',')
|
|
27 | 30 |
names(ghcn) #Checking that the columns are correctly labelled. |
28 | 31 |
|
29 | 32 |
###Extracting the variables values from the raster files |
30 | 33 |
coords<- ghcn[,c('x_OR83M','y_OR83M')] #Selecting all rows for the x and y columns and packaging the output in a data frame. |
31 |
lines <-readLines(inlistf)
|
|
34 |
lines <-readLines(paste(path,"/",inlistf, sep="")
|
|
32 | 35 |
inlistvar<-paste(path,"/",lines,sep="") |
33 | 36 |
|
34 | 37 |
#s_raster<- stack(inlistvar) #Creating a stack of raster images from the list of variables. |
... | ... | |
36 | 39 |
#create a shape file and data_frame with names?? |
37 | 40 |
|
38 | 41 |
###Creating a validation dataset by creating training and testing datasets (%30) |
42 |
#ghcn1507 <-subset(ghcn,ghcn$date_== date) |
|
43 |
ghcn1507 <-subset(ghcn,ghcn$date_=="20100715") |
|
39 | 44 |
n<-nrow(ghcn1507) |
40 | 45 |
ns<-n-round(n*0.3) #Create a sample from the data frame with 70% of the rows |
46 |
#ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
41 | 47 |
ind.training <- sample(nrow(ghcn1507), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly |
42 | 48 |
ind.testing <- setdiff(1:nrow(ghcn1507), ind.training) |
43 | 49 |
ghcn1507_s <- ghcn1507[ind.training, ] |
... | ... | |
46 | 52 |
############ REGRESSION ############### |
47 | 53 |
|
48 | 54 |
###Regression part 1: linear models |
49 |
lm_PRISM1=lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn) |
|
50 |
lm_ANUSPLIN1=lm(tmax~lat+lon+ELEV_SRTM, data=ghcn) |
|
55 |
|
|
56 |
lm_ANUSPLIN1=lm(tmax~lat+lon+ELEV_SRTM, data=ghcn1507_s) |
|
57 |
lm_PRISM1=lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn1507_s) #Note that a variable on inversion is missing |
|
51 | 58 |
summary(lm_ANUSPLIN1) |
52 | 59 |
summary(lm_PRISM1) |
53 | 60 |
|
54 |
#Regression part2: linear model on a specific date. |
|
55 |
ghcn1507 <-subset(ghcn,ghcn$date_=="20100715") |
|
56 |
lm_PRISM2=lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn1507) |
|
57 |
lm_ANUSPLIN2=lm(tmax~lat+lon+ELEV_SRTM, data=ghcn1507) |
|
58 |
|
|
59 |
|
|
60 |
###Regression part 3: GAM models |
|
61 |
GAM1<-gam(tmax~ s(lat) + s (lon) + s (elev), data=ghcn1507) |
|
62 |
|
|
63 |
|
|
61 |
###Regression part 2: GAM models |
|
62 |
GAM_PRISM1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=ghcn1507_s) |
|
63 |
GAM_ANUSPLIN1<-gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=ghcn1507_s) |
|
64 | 64 |
#use the s() for smoothing function |
65 | 65 |
|
66 | 66 |
###Compare the models |
67 | 67 |
#Show AIC, Cook distance, p values and residuals plot |
68 | 68 |
###Access the R2 and significance to give a report |
69 | 69 |
|
70 |
AIC (lm_ANUSPLIN1,lm_ANUSPLIN2,GAM1) #list the AIC and for the results
|
|
71 |
anova(lm_ANUSPLIN1, lm_ANUSPLIN2,GAM1,test="F") #compare the different models in terms of F; a reference model should be set for comparison.
|
|
72 |
#GAM plot of partial residuals
|
|
73 |
gam.check(GAM1) #This will produce basic plots of residuals
|
|
70 |
AIC (lm_ANUSPLIN1,GAM_ANUSPLIN1) #list the AIC and for the results
|
|
71 |
AIC (lm_PRISM1,GAM_PRISM1) #list the AIC and for the results
|
|
72 |
anova(lm_ANUSPLIN1, GAM_ANUSPLIN1,test="F") #compare the different models in terms of F; a reference model should be set for comparison.
|
|
73 |
anova(lm_PRISM1, GAM_PRISM1,test="F")
|
|
74 | 74 |
|
75 |
#GAM plot of partial residuals |
|
76 |
gam.check(GAM_PRISM1) #This will produce basic plots of residuals |
|
77 |
gam.check(GAM_ANUSPLIN1) |
|
78 |
|
|
79 |
#Prediction checking the results using the testing data |
|
80 |
y_plANUSPLIN1<- predict(lm_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) |
|
81 |
y_pgANUSPLIN1<- predict(GAM_ANUSPLIN1, newdata=ghcn1507_v, se.fit = TRUE) |
|
82 |
y_plPRISM1<- predict(lm_PRISM1, newdata=ghcn1507_v, se.fit = TRUE) |
|
83 |
y_pgPRISM1<- predict(GAM_PRISM1, newdata=ghcn1507_v, se.fit = TRUE) |
|
84 |
|
|
85 |
res_yplA1<- ghcn1507_v$tmax - y_plANUSPLIN1$fit |
|
86 |
res_ypgA1<- ghcn1507_v$tmax - y_pgANUSPLIN1$fit |
|
87 |
res_yplP1<- ghcn1507_v$tmax - y_plPRISM1$fit #Residuals for lm model that resembles the PRISM interpolation |
|
88 |
res_ypgP1<- ghcn1507_v$tmax - y_pgPRISM1$fit #Residuals for GAM model that resembles the PRISM interpolation |
|
89 |
|
|
90 |
RMSE_yplA1 <- sqrt(sum(res_yplA1^2)) |
|
91 |
#RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)) |
|
92 |
RMSE_ypgA1 <- sqrt(sum(res_ypgA1^2)) |
|
93 |
RMSE_yplP1 <- sqrt(sum(res_yplP1^2)) |
|
94 |
RMSE_ypgP1 <- sqrt(sum(res_ypgP1^2)) |
|
95 |
|
|
96 |
#Printing the RMSE values for the different models |
|
97 |
RMSE_yplA1 |
|
98 |
RMSE_ypgA1 |
|
99 |
RMSE_yplP1 |
|
100 |
RMSE_ypgP1 |
|
101 |
|
|
102 |
RMSE_all<-c(RMSE_yplA1,RMSE_ypgA1,RMSE_yplP1,RMSE_ypgP1) |
|
103 |
mod_name<-c("yplA1","ypgA1","yplP1","ypgP1") |
|
104 |
mod_type<-c("lm_ANUSPLIN1","GAM_ANUSPLIN1","lm_PRISM1","GAM_PRISM1") |
|
105 |
AIC_all<-AIC(lm_ANUSPLIN1,GAM_ANUSPLIN1,lm_PRISM1,GAM_PRISM1) |
|
106 |
results<-data.frame(model=mod_name,RMSE=RMSE_all,df=AIC_all$df,AIC=AIC_all$AIC) |
|
107 |
|
|
108 |
|
|
109 |
#dump("results", file= paste(path,"/","results_reg1507.txt",sep="")) |
|
110 |
write.csv(results, file= paste(path,"/","results_reg1507.txt",sep="")) |
|
75 | 111 |
|
76 | 112 |
##End of script |
77 | 113 |
|
Also available in: Unified diff
changed RMSE and AIC code in GAM script