Revision 51050314
Added by Benoit Parmentier over 12 years ago
climate/research/oregon/interpolation/GAM_LST_sampling_reg.R | ||
---|---|---|
1 |
####################Interpolation of Tmax for 10 dates.##################### |
|
2 |
#This script interpolates station tmax values for the Oregon case study.It provides a mean to asssess the effect of random sampling and proportion |
|
3 |
# of validation hold out on the RMSE.This program loads the station data from a csv file |
|
4 |
#and perform one type of regression: general additive model (GAM) with different variables: |
|
5 |
# Lat, long, ELEV_SRTM, Eastness, Northness, DISTOC, mean_LST_monthly, Land Cover proportions. |
|
6 |
#Note that this program: |
|
7 |
#1)assumes that the csv file is in the current working |
|
8 |
#2)extract relevant variables from raster images before performing the regressions. |
|
9 |
#3)does not clear memory workspace at the start or end of script. |
|
10 |
#This scripts predicts tmax using GAM and LST derived from MOD11A1. |
|
11 |
#Interactions terms are also included and assessed using the RMSE from validation dataset. |
|
12 |
#There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile. |
|
13 |
#Script created by Benoit Parmentier on April 25, 2012. |
|
14 |
|
|
15 |
###Loading r library and packages # loading the raster package |
|
16 |
library(gtools) # loading ... |
|
17 |
library(mgcv) |
|
18 |
library(sp) |
|
19 |
library(spdep) |
|
20 |
library(rgdal) |
|
21 |
|
|
22 |
###Parameters and arguments |
|
23 |
|
|
24 |
infile1<-"ghcn_or_tmax_b_04142012_OR83M.shp" |
|
25 |
path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations" |
|
26 |
#path<-"H:/Data/IPLANT_project/data_Oregon_stations" |
|
27 |
setwd(path) |
|
28 |
infile2<-"dates_interpolation_03052012.txt" #List of 10 dates for the regression |
|
29 |
prop<-0.3 #Proportion of testing retained for validation |
|
30 |
n_runs<- 2 #Number of runs |
|
31 |
out_prefix<-"_04252012_run30_LST" |
|
32 |
infile3<-"LST_dates_var_names.txt" |
|
33 |
infile4<-"models_interpolation_04032012b.txt" |
|
34 |
|
|
35 |
#######START OF THE SCRIPT ############# |
|
36 |
|
|
37 |
###Reading the station data and setting up for models' comparison |
|
38 |
filename<-sub(".shp","",infile1) #Removing the extension from file. |
|
39 |
ghcn<-readOGR(".", filename) #reading shapefile |
|
40 |
|
|
41 |
ghcn = transform(ghcn,Northness = cos(ASPECT)) #Adding a variable to the dataframe |
|
42 |
ghcn = transform(ghcn,Eastness = sin(ASPECT)) #adding variable to the dataframe. |
|
43 |
ghcn = transform(ghcn,Northness_w = sin(slope)*cos(ASPECT)) #Adding a variable to the dataframe |
|
44 |
ghcn = transform(ghcn,Eastness_w = sin(slope)*sin(ASPECT)) #adding variable to the dataframe. |
|
45 |
#Note that "transform" output is a data.frame not spatial object |
|
46 |
#set.seed(100) |
|
47 |
dates <-readLines(paste(path,"/",infile2, sep="")) |
|
48 |
LST_dates <-readLines(paste(path,"/",infile3, sep="")) |
|
49 |
models <-readLines(paste(path,"/",infile4, sep="")) |
|
50 |
|
|
51 |
#results <- matrix(1,length(dates),14) #This is a matrix containing the diagnostic measures from the GAM models. |
|
52 |
|
|
53 |
results_AIC<- matrix(1,length(dates),length(models)+2) |
|
54 |
results_GCV<- matrix(1,length(dates),length(models)+2) |
|
55 |
results_DEV<- matrix(1,length(dates),length(models)+2) |
|
56 |
results_RMSE<- matrix(1,length(dates),length(models)+2) |
|
57 |
RMSE_run<-matrix(1,length(dates),1) |
|
58 |
cor_LST_LC1<-matrix(1,length(dates),1) #correlation LST-LC1 |
|
59 |
cor_LST_LC3<-matrix(1,length(dates),1) #correlation LST-LC3 |
|
60 |
cor_LST_tmax<-matrix(1,length(dates),1) #correlation LST-tmax |
|
61 |
#Screening for bad values |
|
62 |
RMSE_run<-matrix(1,lenth) |
|
63 |
results_RMSE_all<- matrix(1,length(dates)*n_runs,length(models)+3) |
|
64 |
|
|
65 |
ghcn_all<-ghcn |
|
66 |
ghcn_test<-subset(ghcn,ghcn$tmax>-150 & ghcn$tmax<400) |
|
67 |
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0) |
|
68 |
ghcn<-ghcn_test2 |
|
69 |
|
|
70 |
#month_var<-c("mm_01","mm_02","mm_03","mm_04","mm_05","mm_06","mm_07","mm_08","mm_09", "mm_10", "mm_11", "mm_12") |
|
71 |
|
|
72 |
## looping through the dates... |
|
73 |
#Changed this section into a nested loop, looping through the number of models |
|
74 |
|
|
75 |
for(j in 1:n_runs){ |
|
76 |
|
|
77 |
ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subset data |
|
78 |
|
|
79 |
for(i in 1:length(dates)){ # start of the for loop #1 |
|
80 |
date<-strptime(dates[i], "%Y%m%d") |
|
81 |
month<-strftime(date, "%m") |
|
82 |
LST_month<-paste("mm_",month,sep="") |
|
83 |
###Regression part 1: Creating a validation dataset by creating training and testing datasets |
|
84 |
|
|
85 |
mod <-ghcn.subsets[[i]][,match(LST_month, names(ghcn.subsets[[i]]))] |
|
86 |
ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod) |
|
87 |
#Screening LST values |
|
88 |
#ghcn.subsets[[i]]<-subset(ghcn.subsets[[i]],ghcn.subsets[[i]]$LST> 258 & ghcn.subsets[[i]]$LST<313) |
|
89 |
n<-nrow(ghcn.subsets[[i]]) |
|
90 |
ns<-n-round(n*prop) #Create a sample from the data frame with 70% of the rows |
|
91 |
nv<-n-ns #create a sample for validation with prop of the rows |
|
92 |
ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly |
|
93 |
ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training) |
|
94 |
data_s <- ghcn.subsets[[i]][ind.training, ] |
|
95 |
data_v <- ghcn.subsets[[i]][ind.testing, ] |
|
96 |
|
|
97 |
####Regression part 2: GAM models |
|
98 |
|
|
99 |
mod1<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s) |
|
100 |
mod2<- gam(tmax~ s(lat,lon,ELEV_SRTM), data=data_s) |
|
101 |
mod3<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) + s (Northness)+ s (Eastness) + s(DISTOC), data=data_s) |
|
102 |
mod4<- gam(tmax~ s(lat) + s (lon) + s(ELEV_SRTM) + s(Northness) + s (Eastness) + s(DISTOC) + s(LST), data=data_s) |
|
103 |
mod5<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST), data=data_s) |
|
104 |
mod6<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC1), data=data_s) |
|
105 |
mod7<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC3), data=data_s) |
|
106 |
|
|
107 |
####Regression part 3: Calculating and storing diagnostic measures |
|
108 |
|
|
109 |
results_AIC[i,1]<- j |
|
110 |
results_AIC[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
111 |
results_AIC[i,2]<- ns #number of stations used in the training stage |
|
112 |
results_AIC[i,3]<- AIC (mod1) |
|
113 |
results_AIC[i,4]<- AIC (mod2) |
|
114 |
results_AIC[i,5]<- AIC (mod3) |
|
115 |
results_AIC[i,6]<- AIC (mod4) |
|
116 |
results_AIC[i,7]<- AIC (mod5) |
|
117 |
results_AIC[i,8]<- AIC (mod6) |
|
118 |
results_AIC[i,9]<- AIC (mod7) |
|
119 |
|
|
120 |
results_GCV[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
121 |
results_GCV[i,2]<- ns #number of stations used in the training stage |
|
122 |
results_GCV[i,3]<- mod1$gcv.ubre |
|
123 |
results_GCV[i,4]<- mod2$gcv.ubre |
|
124 |
results_GCV[i,5]<- mod3$gcv.ubre |
|
125 |
results_GCV[i,6]<- mod4$gcv.ubre |
|
126 |
results_GCV[i,7]<- mod5$gcv.ubre |
|
127 |
results_GCV[i,8]<- mod6$gcv.ubre |
|
128 |
results_GCV[i,9]<- mod7$gcv.ubre |
|
129 |
|
|
130 |
results_DEV[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
131 |
results_DEV[i,2]<- ns #number of stations used in the training stage |
|
132 |
results_DEV[i,3]<- mod1$deviance |
|
133 |
results_DEV[i,4]<- mod2$deviance |
|
134 |
results_DEV[i,5]<- mod3$deviance |
|
135 |
results_DEV[i,6]<- mod4$deviance |
|
136 |
results_DEV[i,7]<- mod5$deviance |
|
137 |
results_DEV[i,8]<- mod6$deviance |
|
138 |
results_DEV[i,9]<- mod7$deviance |
|
139 |
|
|
140 |
#####VALIDATION: Prediction checking the results using the testing data######## |
|
141 |
|
|
142 |
y_mod1<- predict(mod1, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values. |
|
143 |
y_mod2<- predict(mod2, newdata=data_v, se.fit = TRUE) |
|
144 |
y_mod3<- predict(mod3, newdata=data_v, se.fit = TRUE) |
|
145 |
y_mod4<- predict(mod4, newdata=data_v, se.fit = TRUE) |
|
146 |
y_mod5<- predict(mod5, newdata=data_v, se.fit = TRUE) |
|
147 |
y_mod6<- predict(mod6, newdata=data_v, se.fit = TRUE) |
|
148 |
y_mod7<- predict(mod7, newdata=data_v, se.fit = TRUE) |
|
149 |
|
|
150 |
res_mod1<- data_v$tmax - y_mod1$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation |
|
151 |
res_mod2<- data_v$tmax - y_mod2$fit #Residuals for GAM model that resembles the PRISM interpolation |
|
152 |
res_mod3<- data_v$tmax - y_mod3$fit |
|
153 |
res_mod4<- data_v$tmax - y_mod4$fit |
|
154 |
res_mod5<- data_v$tmax - y_mod5$fit |
|
155 |
res_mod6<- data_v$tmax - y_mod6$fit |
|
156 |
res_mod7<- data_v$tmax - y_mod7$fit |
|
157 |
|
|
158 |
RMSE_mod1 <- sqrt(sum(res_mod1^2)/nv) |
|
159 |
RMSE_mod2 <- sqrt(sum(res_mod2^2)/nv) |
|
160 |
RMSE_mod3 <- sqrt(sum(res_mod3^2)/nv) |
|
161 |
RMSE_mod4 <- sqrt(sum(res_mod4^2)/nv) |
|
162 |
RMSE_mod5 <- sqrt(sum(res_mod5^2)/nv) |
|
163 |
RMSE_mod6 <- sqrt(sum(res_mod6^2)/nv) |
|
164 |
RMSE_mod7 <- sqrt(sum(res_mod7^2)/nv) |
|
165 |
|
|
166 |
results_RMSE[i,1]<- dates[i] #storing the interpolation dates in the first column |
|
167 |
results_RMSE[i,2]<- ns #number of stations used in the training stage |
|
168 |
results_RMSE[i,3]<- RMSE_mod1 |
|
169 |
results_RMSE[i,4]<- RMSE_mod2 |
|
170 |
results_RMSE[i,5]<- RMSE_mod3 |
|
171 |
results_RMSE[i,6]<- RMSE_mod4 |
|
172 |
results_RMSE[i,7]<- RMSE_mod5 |
|
173 |
results_RMSE[i,8]<- RMSE_mod6 |
|
174 |
results_RMSE[i,9]<- RMSE_mod7 |
|
175 |
#Saving dataset in dataframes |
|
176 |
data_name<-paste("ghcn_v_",dates[[i]],sep="") |
|
177 |
assign(data_name,data_v) |
|
178 |
data_name<-paste("ghcn_s_",dates[[i]],sep="") |
|
179 |
assign(data_name,data_s) |
|
180 |
#ghcn_v<-ls(pattern="ghcn_v_") |
|
181 |
RMSE_run[i]<-j |
|
182 |
# end of the for loop #2 (nested in loop #1) |
|
183 |
} |
|
184 |
results_RMSE_all |
|
185 |
results_RMSEnum <-results_RMSE |
|
186 |
results_AICnum <-results_AIC |
|
187 |
mode(results_RMSEnum)<- "numeric" |
|
188 |
mode(results_AICnum)<- "numeric" |
|
189 |
# Make it numeric first |
|
190 |
# Now turn it into a data.frame... |
|
191 |
|
|
192 |
results_table_RMSE<-as.data.frame(results_RMSEnum) |
|
193 |
results_table_AIC<-as.data.frame(results_AICnum) |
|
194 |
colnames(results_table_RMSE)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6", "mod7") |
|
195 |
colnames(results_table_AIC)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6", "mod7") |
|
196 |
|
|
197 |
write.table(results_table_RMSE, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep=""), sep=",", append=TRUE) |
|
198 |
#write.table(results_table_AIC, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep=""),sep=",", append=TRUE) |
|
199 |
|
|
200 |
Print(j) #This is the run umber printed to the console. |
|
201 |
} #end of loop 1 |
|
202 |
|
|
203 |
# End of script########## |
|
204 |
|
|
205 |
#Selecting dates and files based on names |
|
206 |
#cor_LST_LC<-matrix(1,10,1) |
|
207 |
# for(i in 1:length(dates)){ |
|
208 |
# cor_LST_LC1[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC1) |
|
209 |
# } |
|
210 |
# for(i in 1:length(dates)){ |
|
211 |
# cor_LST_LC3[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC3) |
|
212 |
# } |
|
213 |
|
Also available in: Unified diff
GAM LST, first code to test effect of sampling on GAM, task #409