Project

General

Profile

« Previous | Next » 

Revision 89c6eee2

Added by Benoit Parmentier over 12 years ago

  • ID 89c6eee24f7646cff46371b0ad02b1a463e48897

added initial R script with GAM regression for tmax (one date)

View differences:

climate/research/oregon/interpolation/Linear_reg.R
1
#This script interpolates station value for the Oregon case study. This program loads the station data from a csv file and perform two types  of regression: multiple linear model and general additive model (GAM). Note that this program 1)assumes that the csv file is in the current working 2)extract relevant variables from raster images before performing the regressions. The user must provide the list of raster images
2
#in  a textile.
3
#Created by Benoit Parmentier on February 12, 2012.
4

  
5
###Loading r library and packages
6
library(raster)                                                                              # loading the raster package
7
library(gtools)                                                                              l# loading ...
8
library(sp)
9
library(mgcv)
10

  
11
###Parameters and arguments
12
infile1<-"ghcn_or_b_02122012_OR83M.csv"
13
inlistf<-"list_files.txt"
14
path<-getwd()
15
#list_var<-list.files()                                                                        # displaying the files in the current folder which should contain only the relevant raster images.
16
#dump("list_var", file="list_var.txt")                                                # Saving the values of list_var in a ascii text file.
17

  
18
#inlistvar<-"ghcn_var_02122012.txt"
19

  
20
###Reading the station data
21
ghcn<-read.csv(paste(path,"/",infile1, sep=""), header=TRUE)    #The "paste" function concatenates the path and file name in common string.
22
ghcn<-read.csv(infile1)                                                                 #Use read.table if the input file is space delimited or read.table(infile1, headername=TRUE, sep=',')
23
names(ghcn)                                                                                 #Checking that the columns are correctly labelled.
24

  
25
###Extracting the variables values from the raster files
26
coords<- ghcn[,c('x_OR83M','y_OR83M')]                                    #Selecting all rows for the x and y columns and packaging the output in a data frame.
27
lines <-readLines(inlistf)
28
inlistvar<-paste(path,"/",lines,sep="")
29

  
30
#s_raster<- stack(inlistvar)                                                              #Creating a stack of raster images from the list of variables.
31
#stat_val<- extract(s_raster, coords)                                               #Extracting values from the raster stack for every point location in coords data frame.
32
#create a shape file and data_frame with names??
33

  
34

  
35
###Regression part 1: linear models
36
lm_PRISM1=lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn)
37
lm_ANUSPLIN1=lm(tmax~lat+lon+ELEV_SRTM, data=ghcn)
38
summary(lm_ANUSPLIN1)
39
summary(lm_PRISM1)
40

  
41
#Regression part2: linear model on a specific date.
42
ghcn1507 <-subset(ghcn,ghcn$date_=="20100715")
43
lm_PRISM2=lm(tmax~lat+lon+ELEV_SRTM+ASPECT+DISTOC, data=ghcn1507)
44
lm_ANUSPLIN2=lm(tmax~lat+lon+ELEV_SRTM, data=ghcn1507)
45

  
46

  
47
###Regression part 3: GAM models
48
GAM1<-gam(tmax~ s(lat) + s (lon) + s (elev), data=ghcn1507)
49

  
50

  
51
#use the s() for smoothing function
52

  
53
###Compare the models
54
#Show AIC, Cook distance, p values and residuals plot
55
###Access the R2 and significance to give a report
56

  
57
AIC (lm_ANUSPLIN1,lm_ANUSPLIN2,GAM1) #list the AIC and for the results
58
anova(lm_ANUSPLIN1, lm_ANUSPLIN2,GAM1,test="F") #compare the different models in terms of F; a reference model should be set for comparison.
59
#GAM plot of partial residuals
60
gam.check(GAM1)   #This will produce basic plots of residuals
61

  
62
###Creating a validation dataset by creating training and testing datasets (%30)
63
n<-nrow(ghcn1507)
64
ns<-n-round(n*0.3)  #Create a sample from the data frame with 70% of the rows
65
ind.training <- sample(nrow(ghcn1507), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
66
ind.testing <- setdiff(1:nrow(ghcn1507), ind.training)
67
ghcn1507_s <- ghcn1507[ind.training, ]
68
ghcn1507_v <- ghcn1507[ind.testing, ]
69

  
70

  
71
#ghcn1507_s<- sample(ghcn1507,size=ns, replace=T)
72
ghcn1507_s<- ghcn1507[sample(nrow(ghcn1507), size=ns), ]
73
#ghcn1507_v <- ghcn1507$STAT_ID[ !ghcn1507_s$STAT_ID ] #dataset for validation that can be used to predict values.
74

  
75

  
76

  
77
##End of script
78

  
79

  
80

  
81

  
82
 

Also available in: Unified diff