Project

General

Profile

« Previous | Next » 

Revision 51050314

Added by Benoit Parmentier over 12 years ago

GAM LST, first code to test effect of sampling on GAM, task #409

View differences:

climate/research/oregon/interpolation/GAM_LST_sampling_reg.R
1
####################Interpolation of Tmax for 10 dates.#####################
2
#This script interpolates station tmax values for the Oregon case study.It provides a mean to asssess the effect of random sampling and proportion
3
# of validation hold out on the RMSE.This program loads the station data from a csv file 
4
#and perform one type of regression:  general additive model (GAM) with different variables: 
5
# Lat, long, ELEV_SRTM, Eastness, Northness, DISTOC, mean_LST_monthly, Land Cover proportions.
6
#Note that this program:
7
#1)assumes that the csv file is in the current working 
8
#2)extract relevant variables from raster images before performing the regressions. 
9
#3)does not clear memory workspace at the start or end of script.
10
#This scripts predicts tmax using GAM and LST derived from MOD11A1.
11
#Interactions terms are also included and assessed using the RMSE from validation dataset.
12
#There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile.
13
#Script created by Benoit Parmentier on April 25, 2012. 
14

  
15
###Loading r library and packages                                                      # loading the raster package
16
library(gtools)                                                                        # loading ...
17
library(mgcv)
18
library(sp)
19
library(spdep)
20
library(rgdal)
21

  
22
###Parameters and arguments
23

  
24
infile1<-"ghcn_or_tmax_b_04142012_OR83M.shp"
25
path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations"
26
#path<-"H:/Data/IPLANT_project/data_Oregon_stations"
27
setwd(path) 
28
infile2<-"dates_interpolation_03052012.txt"               #List of 10 dates for the regression
29
prop<-0.3  #Proportion of testing retained for validation   
30
n_runs<- 2 #Number of runs
31
out_prefix<-"_04252012_run30_LST"
32
infile3<-"LST_dates_var_names.txt"
33
infile4<-"models_interpolation_04032012b.txt"
34

  
35
#######START OF THE SCRIPT #############
36

  
37
###Reading the station data and setting up for models' comparison
38
filename<-sub(".shp","",infile1)              #Removing the extension from file.
39
ghcn<-readOGR(".", filename)                  #reading shapefile 
40
                  
41
ghcn = transform(ghcn,Northness = cos(ASPECT)) #Adding a variable to the dataframe
42
ghcn = transform(ghcn,Eastness = sin(ASPECT))  #adding variable to the dataframe.
43
ghcn = transform(ghcn,Northness_w = sin(slope)*cos(ASPECT)) #Adding a variable to the dataframe
44
ghcn = transform(ghcn,Eastness_w = sin(slope)*sin(ASPECT))  #adding variable to the dataframe.
45
                                              #Note that "transform" output is a data.frame not spatial object 
46
#set.seed(100)
47
dates <-readLines(paste(path,"/",infile2, sep=""))
48
LST_dates <-readLines(paste(path,"/",infile3, sep=""))
49
models <-readLines(paste(path,"/",infile4, sep=""))
50

  
51
#results <- matrix(1,length(dates),14)            #This is a matrix containing the diagnostic measures from the GAM models.
52

  
53
results_AIC<- matrix(1,length(dates),length(models)+2)  
54
results_GCV<- matrix(1,length(dates),length(models)+2)
55
results_DEV<- matrix(1,length(dates),length(models)+2)
56
results_RMSE<- matrix(1,length(dates),length(models)+2)
57
RMSE_run<-matrix(1,length(dates),1)
58
cor_LST_LC1<-matrix(1,length(dates),1)      #correlation LST-LC1
59
cor_LST_LC3<-matrix(1,length(dates),1)      #correlation LST-LC3
60
cor_LST_tmax<-matrix(1,length(dates),1)    #correlation LST-tmax
61
#Screening for bad values
62
RMSE_run<-matrix(1,lenth)
63
results_RMSE_all<- matrix(1,length(dates)*n_runs,length(models)+3)
64

  
65
ghcn_all<-ghcn
66
ghcn_test<-subset(ghcn,ghcn$tmax>-150 & ghcn$tmax<400)
67
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0)
68
ghcn<-ghcn_test2
69

  
70
#month_var<-c("mm_01","mm_02","mm_03","mm_04","mm_05","mm_06","mm_07","mm_08","mm_09", "mm_10", "mm_11", "mm_12")
71

  
72
## looping through the dates...
73
#Changed this section into  a nested loop, looping through the number of models
74

  
75
for(j in 1:n_runs){
76
  
77
  ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subset data
78
  
79
  for(i in 1:length(dates)){            # start of the for loop #1
80
    date<-strptime(dates[i], "%Y%m%d")
81
    month<-strftime(date, "%m")
82
    LST_month<-paste("mm_",month,sep="")
83
    ###Regression part 1: Creating a validation dataset by creating training and testing datasets
84
    
85
    mod <-ghcn.subsets[[i]][,match(LST_month, names(ghcn.subsets[[i]]))]
86
    ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod)
87
    #Screening LST values
88
    #ghcn.subsets[[i]]<-subset(ghcn.subsets[[i]],ghcn.subsets[[i]]$LST> 258 & ghcn.subsets[[i]]$LST<313)
89
    n<-nrow(ghcn.subsets[[i]])
90
    ns<-n-round(n*prop)  #Create a sample from the data frame with 70% of the rows
91
    nv<-n-ns             #create a sample for validation with prop of the rows
92
    ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
93
    ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
94
    data_s <- ghcn.subsets[[i]][ind.training, ]
95
    data_v <- ghcn.subsets[[i]][ind.testing, ]
96
    
97
    ####Regression part 2: GAM models
98
    
99
    mod1<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s)
100
    mod2<- gam(tmax~ s(lat,lon,ELEV_SRTM), data=data_s)
101
    mod3<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) +  s (Northness)+ s (Eastness) + s(DISTOC), data=data_s)
102
    mod4<- gam(tmax~ s(lat) + s (lon) + s(ELEV_SRTM) + s(Northness) + s (Eastness) + s(DISTOC) + s(LST), data=data_s)
103
    mod5<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST), data=data_s)
104
    mod6<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC1), data=data_s)
105
    mod7<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC3), data=data_s)
106
    
107
    ####Regression part 3: Calculating and storing diagnostic measures
108
    
109
    results_AIC[i,1]<- j
110
    results_AIC[i,1]<- dates[i]  #storing the interpolation dates in the first column
111
    results_AIC[i,2]<- ns        #number of stations used in the training stage
112
    results_AIC[i,3]<- AIC (mod1)
113
    results_AIC[i,4]<- AIC (mod2)
114
    results_AIC[i,5]<- AIC (mod3)
115
    results_AIC[i,6]<- AIC (mod4)
116
    results_AIC[i,7]<- AIC (mod5)
117
    results_AIC[i,8]<- AIC (mod6)
118
    results_AIC[i,9]<- AIC (mod7)
119
    
120
    results_GCV[i,1]<- dates[i]  #storing the interpolation dates in the first column
121
    results_GCV[i,2]<- ns        #number of stations used in the training stage
122
    results_GCV[i,3]<- mod1$gcv.ubre
123
    results_GCV[i,4]<- mod2$gcv.ubre
124
    results_GCV[i,5]<- mod3$gcv.ubre
125
    results_GCV[i,6]<- mod4$gcv.ubre
126
    results_GCV[i,7]<- mod5$gcv.ubre
127
    results_GCV[i,8]<- mod6$gcv.ubre
128
    results_GCV[i,9]<- mod7$gcv.ubre
129
    
130
    results_DEV[i,1]<- dates[i]  #storing the interpolation dates in the first column
131
    results_DEV[i,2]<- ns        #number of stations used in the training stage
132
    results_DEV[i,3]<- mod1$deviance
133
    results_DEV[i,4]<- mod2$deviance
134
    results_DEV[i,5]<- mod3$deviance
135
    results_DEV[i,6]<- mod4$deviance
136
    results_DEV[i,7]<- mod5$deviance
137
    results_DEV[i,8]<- mod6$deviance
138
    results_DEV[i,9]<- mod7$deviance
139
    
140
    #####VALIDATION: Prediction checking the results using the testing data########
141
    
142
    y_mod1<- predict(mod1, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values.
143
    y_mod2<- predict(mod2, newdata=data_v, se.fit = TRUE)            
144
    y_mod3<- predict(mod3, newdata=data_v, se.fit = TRUE) 
145
    y_mod4<- predict(mod4, newdata=data_v, se.fit = TRUE) 
146
    y_mod5<- predict(mod5, newdata=data_v, se.fit = TRUE) 
147
    y_mod6<- predict(mod6, newdata=data_v, se.fit = TRUE)
148
    y_mod7<- predict(mod7, newdata=data_v, se.fit = TRUE)
149
    
150
    res_mod1<- data_v$tmax - y_mod1$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation
151
    res_mod2<- data_v$tmax - y_mod2$fit   #Residuals for GAM model that resembles the PRISM interpolation                               
152
    res_mod3<- data_v$tmax - y_mod3$fit  
153
    res_mod4<- data_v$tmax - y_mod4$fit
154
    res_mod5<- data_v$tmax - y_mod5$fit
155
    res_mod6<- data_v$tmax - y_mod6$fit
156
    res_mod7<- data_v$tmax - y_mod7$fit
157
    
158
    RMSE_mod1 <- sqrt(sum(res_mod1^2)/nv)          
159
    RMSE_mod2 <- sqrt(sum(res_mod2^2)/nv)
160
    RMSE_mod3 <- sqrt(sum(res_mod3^2)/nv)
161
    RMSE_mod4 <- sqrt(sum(res_mod4^2)/nv)
162
    RMSE_mod5 <- sqrt(sum(res_mod5^2)/nv)
163
    RMSE_mod6 <- sqrt(sum(res_mod6^2)/nv)
164
    RMSE_mod7 <- sqrt(sum(res_mod7^2)/nv)
165
    
166
    results_RMSE[i,1]<- dates[i]  #storing the interpolation dates in the first column
167
    results_RMSE[i,2]<- ns        #number of stations used in the training stage
168
    results_RMSE[i,3]<- RMSE_mod1
169
    results_RMSE[i,4]<- RMSE_mod2
170
    results_RMSE[i,5]<- RMSE_mod3
171
    results_RMSE[i,6]<- RMSE_mod4
172
    results_RMSE[i,7]<- RMSE_mod5
173
    results_RMSE[i,8]<- RMSE_mod6
174
    results_RMSE[i,9]<- RMSE_mod7
175
    #Saving dataset in dataframes
176
    data_name<-paste("ghcn_v_",dates[[i]],sep="")
177
    assign(data_name,data_v)
178
    data_name<-paste("ghcn_s_",dates[[i]],sep="")
179
    assign(data_name,data_s)
180
    #ghcn_v<-ls(pattern="ghcn_v_")
181
    RMSE_run[i]<-j
182
    # end of the for loop #2 (nested in loop #1)
183
  }
184
  results_RMSE_all
185
  results_RMSEnum <-results_RMSE
186
  results_AICnum <-results_AIC
187
  mode(results_RMSEnum)<- "numeric"
188
  mode(results_AICnum)<- "numeric"
189
  # Make it numeric first
190
  # Now turn it into a data.frame...
191
  
192
  results_table_RMSE<-as.data.frame(results_RMSEnum)
193
  results_table_AIC<-as.data.frame(results_AICnum)
194
  colnames(results_table_RMSE)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6", "mod7")
195
  colnames(results_table_AIC)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6", "mod7")
196
  
197
  write.table(results_table_RMSE, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep=""), sep=",", append=TRUE)
198
  #write.table(results_table_AIC, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep=""),sep=",", append=TRUE)
199
  
200
  Print(j) #This is the run umber printed to the console.
201
} #end of loop 1
202

  
203
# End of script##########
204

  
205
#Selecting dates and files based on names
206
#cor_LST_LC<-matrix(1,10,1)
207
# for(i in 1:length(dates)){
208
#   cor_LST_LC1[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC1)
209
# }
210
# for(i in 1:length(dates)){
211
#   cor_LST_LC3[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC3)
212
# }
213

  

Also available in: Unified diff