/climate/research/oregon/interpolation/GAM.R - Annotate - Environment and organisms - NCEAS Projects

4c29b739

Adam M. Wilson

####################Interpolation of Tmax for 10 dates.#####################

2

#This script interpolates station tmax values for the Oregon case study.It provides a mean to asssess the effect of random sampling and proportion

3

# of validation hold out on the RMSE.This program loads the station data from a csv file

4

#and perform one type of regression:  general additive model (GAM) with different variables:

5

# Lat, long, ELEV_SRTM, Eastness, Northness, DISTOC, mean_LST_monthly, Land Cover proportions.

6

#Note that this program:

7

#1)assumes that the csv file is in the current working

8

#2)extract relevant variables from raster images before performing the regressions.

9

#3)does not clear memory workspace at the start or end of script.

10

#This scripts predicts tmax using GAM and LST derived from MOD11A1.

11

#Interactions terms are also included and assessed using the RMSE from validation dataset.

12

#There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile.

13

#Script created by Benoit Parmentier on April 25, 2012.

14

15

###Loading r library and packages                                                      # loading the raster package

16

library(gtools)                                                                        # loading ...

17

library(mgcv)

18

library(sp)

19

library(spdep)

20

library(rgdal)

21

library(multicore)  # if installed allows easy parallelization

22

library(reshape)    # very useful for switching from 'wide' to 'long' data formats

23

24

###Parameters and arguments

25

26

infile1<-"ghcn_or_ppt_covariates_20120705_OR83M.shp"

27

path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations"

28

#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations"

29

                                        #path<-"H:/Data/IPLANT_project/data_Oregon_stations"

30

path="/home/wilson/data"

31

setwd(path)

32

infile2<-"dates_interpolation_03052012.txt"               #List of 10 dates for the regression

33

prop<-0.3  #Proportion of testing retained for validation

34

n_runs<- 30 #Number of runs

35

out_prefix<-"_20120705_run01_LST"

36

infile3<-"LST_dates_var_names.txt"     #List of LST averages.

37

infile4<-"models_interpolation_05142012.txt"

38

39

#######START OF THE SCRIPT #############

40

41

###Reading the station data and setting up for models' comparison

42

filename<-sub(".shp","",infile1)              #Removing the extension from file.

43

ghcn<-readOGR(".", filename)                  #reading shapefile

44

45

ghcn = transform(ghcn,Northness = cos(ASPECT*pi/180)) #Adding a variable to the dataframe

46

ghcn = transform(ghcn,Eastness = sin(ASPECT*pi/180))  #adding variable to the dataframe.

47

ghcn = transform(ghcn,Northness_w = sin(slope*pi/180)*cos(ASPECT*pi/180)) #Adding a variable to the dataframe

48

ghcn = transform(ghcn,Eastness_w = sin(slope*pi/180)*sin(ASPECT*pi/180))  #adding variable to the dataframe.

49

                                              #Note that "transform" output is a data.frame not spatial object

50

#set.seed(100) #modify this to a seed variable allowing different runs.

51

52

 dates <-readLines(paste(path,"/",infile2, sep=""))

53

LST_dates <-readLines(paste(path,"/",infile3, sep=""))

54

models <-readLines(paste(path,"/",infile4, sep=""))

55

56

#results <- matrix(1,length(dates),14)            #This is a matrix containing the diagnostic measures from the GAM models.

57

58

####  Define GAM models

59

var="tmax"

60

61

mods=data.frame(formula=c(

62

                  paste(var,"~ s(lat) + s (lon) + s (ELEV_SRTM)",sep=""),

63

                  paste(var,"~ s(lat,lon,ELEV_SRTM)",sep=""),

64

                  paste(var,"~ s(lat) + s (lon) + s (ELEV_SRTM) +  s (Northness)+ s (Eastness) + s(DISTOC)",sep=""),

65

                  paste(var,"~ s(lat) + s (lon) + s(ELEV_SRTM) + s(Northness) + s (Eastness) + s(DISTOC) + s(LST)",sep=""),

66

                  paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST)",sep=""),

67

                  paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC1)",sep=""),

68

                  paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC3)",sep=""),

69

                  paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST) + s(LC1)",sep="")

70

                  ),stringsAsFactors=F)

71

mods$model=paste("mod",1:nrow(mods),sep="")

72

73

74

### subset dataset?

75

ghcn_all<-ghcn

76

ghcn_test<-subset(ghcn,ghcn$tmax>-150 & ghcn$tmax<400)

77

ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0)

78

ghcn<-ghcn_test2

79

80

81

## loop through the dates...

82

83

ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subset data

84

85

 results=do.call(rbind.data.frame,                   # Collect the results in a single data.frame

86

   mclapply(1:length(dates),function(i) {            # loop over dates

87

     date<-strptime(dates[i], "%Y%m%d")              # get date

88

     month<-strftime(date, "%m")                     # get month

89

     LST_month<-paste("mm_",month,sep="")            # get LST_month name

90

91

     ##Regression part 1: Creating a validation dataset by creating training and testing datasets

92

     mod <-ghcn.subsets[[i]][,match(LST_month, names(ghcn.subsets[[i]]))]

93

     ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod)

94

     ##Screening LST values

95

     ##ghcn.subsets[[i]]<-subset(ghcn.subsets[[i]],ghcn.subsets[[i]]$LST> 258 & ghcn.subsets[[i]]$LST<313)

96

     n<-nrow(ghcn.subsets[[i]])

97

     ns<-n-round(n*prop)  #Create a sample from the data frame with 70% of the rows

98

     nv<-n-ns             #create a sample for validation with prop of the rows

99

     ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly

100

     ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)

101

     data_s <- ghcn.subsets[[i]][ind.training, ]

102

     data_v <- ghcn.subsets[[i]][ind.testing, ]

103

104

     ## lapply loops through models for the ith day, calculates the validation metrics, and saves them as R objects

105

     results=do.call(rbind.data.frame,

106

       lapply(1:nrow(mods),function(m,savemodel=F,saveFullPrediction=T) {

107

         ## run the model

108

         mod=gam(formula(mods$formula[m]),data=data_s)

109

110

         ##VALIDATION: Prediction checking the results using the testing data########

111

         y_mod<- predict(mod, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values.

112

         res_mod<- data_v$tmax - y_mod$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation

113

114

         ##Regression part 3: Calculating and storing diagnostic measures

115

         tresults=data.frame(            # build 1-row dataframe for this model-date

116

           date=dates[i],                # interpolation date

117

           model=mods$model[m],          # model number

118

           ns=ns,                        # number of stations used in the training stage

119

           AIC=AIC(mod),                # AIC

120

           GCV=mod$gcv.ubre,             # GCV

121

           DEV=mod$deviance,             # Deviance

122

           RMSE=sqrt(sum(res_mod^2)/nv)  # RMSE

123

124

125

         ## Save the model object if desired

126

         if(savemodel)  save(mod,file= paste(path,"/","results_GAM_Model","_",out_prefix,"_",dates[i],".Rdata",sep=""))

127

128

         ## do the full prediction and save it if desired

129

         if(saveFullPrediction){

130

           s_raster=readRaster(filename=paste(path,"covariates.gri"))

131

           predict(s_raster,mod,filename=paste(outpath,"_",sub("-","",date),"_prediction.tif",sep=""),format="GTiff",progress="text",fun="predict")

132

           predict(s_raster,mod,filename=paste(outpath,"_",sub("-","",date),"_prediction.se.tif",sep=""),format="GTiff",progress="text",fun="predict.se")

133

134

135

         ## print progress

136

         print(paste("Finshed Model:",mods$model[m]," for Date:",dates[i]))

137

         ## return the results table

138

         return(tresults)

139

                                        # end of the for loop #2 (nested in loop #1)

140

}))

141

     print(paste("Finshed Date:",dates[i]))

142

     return(results)

143

144

))

  write.table(results_RMSE_all2, file= paste(path,"/","results_GAM_Assessment",out_prefix,"all.txt",sep=""), sep=",", col.names=TRUE)

149

150

resl=melt(results,id=c("run","date","model","ns"))

151

152

  xyplot(value~date|variable,groups=model,data=resl,scales=list(y=list(relation="free")),auto.key=list()

###Analysing the results from the 365 days run: Summarize by month

157

158

# for(i in 1:nrow(results_table_RMSE)){

159

#   date<-results_table_RMSE$dates[i]

160

#   date<-strptime(date, "%Y%m%d")

161

#   results_table_RMSE$month[i]<-as.integer(strftime(date, "%m"))

162

# }

163

164

# average<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE,mean, na.rm=TRUE)

165

# average<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, FUN=mean)

166

# #average on all the data.frame

167

# averaget<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month),FUN=mean, na.rm=TRUE)

168

# #mediant<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month),FUN=median, na.rm=TRUE)

169

# #average_lowt<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month), FUN=function(v) t.test(v)$conf.int[1])

170

# #average_up<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[2])

171

172

# median<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, median, na.rm=TRUE)

173

# average_low<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[1])

174

# average_up<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[2])

175

176

# mod<-names(averaget)

177

# mod<-mod[4:11]

178

# #Saving graphic plots

179

# for(i in 1:length(mod)){

180

#   X11(width=14,height=10)

181

#   name<-mod[i]

182

#   barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main="Mean RMSE per month", names.arg=c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep","Oct", "Nov", "Dec"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)

183

#   #title(paste("Sampling RMSE for mod",i,sep=""))

184

#   savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")

185

#   dev.off()

186

# }

187

188

189

# for(i in 1:length(mod)){

190

#   X11(width=14,height=10)

191

#   name<-mod[i]

192

#   barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main=paste(" Mean RMSE per month ",name, sep=""), names.arg=c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep","Oct", "Nov", "Dec"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)

193

#   #title(paste("Sampling RMSE for mod",i,sep=""))

194

#   savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")

195

#   dev.off()

196

197

#   X11(width=14,height=10)

198

#   name<-mod[i]

199

#   hist(results_table_RMSE[[name]],breaks=15, main=paste(" Histogram RMSE_",name, sep=""),xlab=paste("RMSE ",name, sep=""))

200

#   savePlot(paste("Hist_results_RMSE_365_",name,out_prefix,".png", sep=""), type="png")

201

#   dev.off()

202

203

# }

204

205

# for(i in 1:length(mod)){

206

#   X11(width=14,height=10)

207

#   name<-mod[i]

208

#   hist(results_table_RMSE[[name]],breaks=15, main=paste(" Histogram RMSE_",name, sep=""),xlab=paste("RMSE ",name, sep=""))

209

#   savePlot(paste("Hist_results_RMSE_365_",name,out_prefix,".png", sep=""), type="png")

210

#   dev.off()

211

# }

212

213

# r<-(results_RMSE_all2[,4:10]) #selecting only the columns related to models...

214

215

# mean_r<-sapply(r, mean)

216

# median_r<-sapply(r, median)

217

# sd_r<-sapply(r, sd)

218

219

# barplot(mean_r,ylim=c(20,26),ylab="RMSE in tenth deg C")

220

# barplot(median_r,ylim=c(20,26),ylab="RMSE in tenth deg C",add=TRUE,inside=FALSE,beside=TRUE) # put both on the same plot

221

# barplot(sd_r,ylim=c(6,8),ylab="RMSE in tenth deg C") # put both on the same plot

222

223

# height<-rbind(mean_r,median_r)

224

# barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE,legend=rownames(height))

225

# barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE, col=c("darkblue","red"),legend=rownames(height)) # put both on the same plot

226

# PNG(paste("Barplot_results_RMSE_sampling_",out_prefix,".png", sep=""))

227

228

# barplot2(mean_r,median_r,ylim=c(23,26),ylab="RMSE in tenth deg C") # put both on the same plot

229

# #Collect var explained and p values for each var...

230

231

# mod<-names(mean_r)

232

# average<-mean_r

233

# average_low<-mean_r-sd_r

234

# average_up<-mean_r+sd_r

235

236

# for(i in 1:length(mod)){

237

#   #X11(width=14,height=10)

238

#   name<-mod[i]

239

#   barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main=paste(" Mean RMSE per month ",name, sep=""), names.arg=c("mod1", "mod2", "mod3", "mod4", "mod5", "mod6","mod7"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)

240

#   #title(paste("Sampling RMSE for mod",i,sep=""))

241

#   #savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")

242

#   #dev.off()

243

#   }

244

245

# End of script##########

246

247

#Selecting dates and files based on names

248

#cor_LST_LC<-matrix(1,10,1)

249

# for(i in 1:length(dates)){

250

#   cor_LST_LC1[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC1)

251

# }

252

# for(i in 1:length(dates)){

253

#   cor_LST_LC3[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC3)

254

# }

Project

General

Profile

Project

General

Profile

root/climate/research/oregon/interpolation/GAM.R @ 4c29b739