Project

General

Profile

« Previous | Next » 

Revision 4c29b739

Added by Adam M. Wilson over 12 years ago

Modified GAM.R interpolation script to 1) simplify validation results, 2) add full prediction (rather than just stations) and 3) some other (hopefully) simplifications and improvements.

View differences:

climate/research/oregon/interpolation/Extraction_raster_covariates_study_area.R
24 24

  
25 25
path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations"             #Path to all datasets on Atlas
26 26
setwd(path)                                                                   #Setting the working directory
27

  
27 28
infile1<-"ghcn_data_TMAXy2010_2010_OR_0626012.shp"                            #Weather station location with interp. var. (TMAX, TMIN or PRCP)
28
#inlistf<-"list_files_04252012.txt"                                           #Covariates as list of raster files and output names separated by space
29
                                                                              #Name of raster files should come with extension    
30
outfile<-'ghcn_or_tmax_covariates_06262012_OR83M.shp'                         #Name of the new shapefile created with covariates extracted at station locations
29
infile1<-"/home/wilson/data/ghcn_data_PRCPy2010_OR_20110705.shp"              #User defined output prefix
30

  
31
inlistf<-"list_files_05032012.txt"                                           #Covariates as list of raster files and output names separated by space
32
                                                                             #Name of raster files should come with extension    
33
outfile<-'ghcn_or_ppt_covariates_20120705_OR83M.shp'                         #Name of the new shapefile created with covariates extracted at station locations
34
outpath="/home/wilson/data/"
31 35

  
32 36
#######START OF THE SCRIPT #############
33 37

  
34 38
###Reading the station data
35 39
filename<-sub(".shp","",infile1)                                             #Removing the extension from file.
36
ghcn3<-readOGR(".", filename)                                                #Reading shape file using rgdal library
40
ghcn3<-readOGR(dirname(infile1),layer=basename(infile1))                                                #Reading shape file using rgdal library
37 41
 
38 42
###Extracting the variables values from the raster files                                             
39

  
40 43
lines<-read.table(paste(path,"/",inlistf,sep=""), sep=" ")                  #Column 1 contains the names of raster files
41 44
inlistvar<-lines[,1]
42 45
inlistvar<-paste(path,"/",as.character(inlistvar),sep="")
......
46 49
layerNames(s_raster)<-covar_names                                            #Assigning names to the raster layers
47 50
stat_val<- extract(s_raster, ghcn3)                                          #Extracting values from the raster stack for every point location in coords data frame.
48 51

  
52

  
53
#TODO:  Add lon and lat as layers to make easier predictions
54
#TODO: subset list to only those used (drop number of obs, etc.)
55

  
49 56
#create a shape file and data_frame with names
50 57

  
51 58
data_RST<-as.data.frame(stat_val)                                            #This creates a data frame with the values extracted
......
55 62
CRS<-proj4string(ghcn3)
56 63
proj4string(data_RST_SDF)<-CRS  #Need to assign coordinates...
57 64

  
58
#Creating a date column
59
date1<-ISOdate(data_RST_SDF$year,data_RST_SDF$month,data_RST_SDF$day) #Creating a date object from 3 separate column
60
date2<-gsub("-","",as.character(as.Date(date1)))
61
data_RST_SDF$date<-date2                                              #Date format (year,month,day) is the following: "20100627"
62

  
63 65
#write out a new shapefile (including .prj component)
64 66
outfile<-sub(".shp","",outfile)   #Removing extension if it is present
65 67

  
68
## save the raster stack for prediction
69
writeRaster(s_raster,filename=paste(outpath,"covariates",sep=""))
70

  
66 71
#Save a textfile and shape file of all the subset data
67 72
write.table(as.data.frame(data_RST_SDF),paste(outfile,".txt",sep=""), sep=",")
68
writeOGR(data_RST_SDF, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
73
writeOGR(data_RST_SDF, paste(outpath,outfile, ".shp", sep=""), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
69 74

  
70
##### END OF SCRIPT ##########
75
##### END OF SCRIPT ##########
climate/research/oregon/interpolation/GAM.R
1
####################Interpolation of Tmax for 10 dates.#####################
2
#This script interpolates station tmax values for the Oregon case study.It provides a mean to asssess the effect of random sampling and proportion
3
# of validation hold out on the RMSE.This program loads the station data from a csv file 
4
#and perform one type of regression:  general additive model (GAM) with different variables: 
5
# Lat, long, ELEV_SRTM, Eastness, Northness, DISTOC, mean_LST_monthly, Land Cover proportions.
6
#Note that this program:
7
#1)assumes that the csv file is in the current working 
8
#2)extract relevant variables from raster images before performing the regressions. 
9
#3)does not clear memory workspace at the start or end of script.
10
#This scripts predicts tmax using GAM and LST derived from MOD11A1.
11
#Interactions terms are also included and assessed using the RMSE from validation dataset.
12
#There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile.
13
#Script created by Benoit Parmentier on April 25, 2012. 
14

  
15
###Loading r library and packages                                                      # loading the raster package
16
library(gtools)                                                                        # loading ...
17
library(mgcv)
18
library(sp)
19
library(spdep)
20
library(rgdal)
21
library(multicore)  # if installed allows easy parallelization
22
library(reshape)    # very useful for switching from 'wide' to 'long' data formats
23

  
24
###Parameters and arguments
25

  
26
infile1<-"ghcn_or_ppt_covariates_20120705_OR83M.shp"
27
path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations"
28
#path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations"
29
                                        #path<-"H:/Data/IPLANT_project/data_Oregon_stations"
30
path="/home/wilson/data"
31
setwd(path) 
32
infile2<-"dates_interpolation_03052012.txt"               #List of 10 dates for the regression
33
prop<-0.3  #Proportion of testing retained for validation   
34
n_runs<- 30 #Number of runs
35
out_prefix<-"_20120705_run01_LST"
36
infile3<-"LST_dates_var_names.txt"     #List of LST averages.
37
infile4<-"models_interpolation_05142012.txt"
38

  
39
#######START OF THE SCRIPT #############
40

  
41
###Reading the station data and setting up for models' comparison
42
filename<-sub(".shp","",infile1)              #Removing the extension from file.
43
ghcn<-readOGR(".", filename)                  #reading shapefile 
44
                  
45
ghcn = transform(ghcn,Northness = cos(ASPECT*pi/180)) #Adding a variable to the dataframe
46
ghcn = transform(ghcn,Eastness = sin(ASPECT*pi/180))  #adding variable to the dataframe.
47
ghcn = transform(ghcn,Northness_w = sin(slope*pi/180)*cos(ASPECT*pi/180)) #Adding a variable to the dataframe
48
ghcn = transform(ghcn,Eastness_w = sin(slope*pi/180)*sin(ASPECT*pi/180))  #adding variable to the dataframe.
49
                                              #Note that "transform" output is a data.frame not spatial object 
50
#set.seed(100) #modify this to a seed variable allowing different runs.
51

  
52
 dates <-readLines(paste(path,"/",infile2, sep=""))
53
LST_dates <-readLines(paste(path,"/",infile3, sep=""))
54
models <-readLines(paste(path,"/",infile4, sep=""))
55

  
56
#results <- matrix(1,length(dates),14)            #This is a matrix containing the diagnostic measures from the GAM models.
57

  
58
####  Define GAM models
59
var="tmax"
60

  
61
mods=data.frame(formula=c(    
62
                  paste(var,"~ s(lat) + s (lon) + s (ELEV_SRTM)",sep=""),
63
                  paste(var,"~ s(lat,lon,ELEV_SRTM)",sep=""),
64
                  paste(var,"~ s(lat) + s (lon) + s (ELEV_SRTM) +  s (Northness)+ s (Eastness) + s(DISTOC)",sep=""),
65
                  paste(var,"~ s(lat) + s (lon) + s(ELEV_SRTM) + s(Northness) + s (Eastness) + s(DISTOC) + s(LST)",sep=""),
66
                  paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST)",sep=""),
67
                  paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC1)",sep=""),
68
                  paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC3)",sep=""),
69
                  paste(var,"~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST) + s(LC1)",sep="")
70
                  ),stringsAsFactors=F)
71
mods$model=paste("mod",1:nrow(mods),sep="")
72

  
73

  
74
### subset dataset?
75
ghcn_all<-ghcn
76
ghcn_test<-subset(ghcn,ghcn$tmax>-150 & ghcn$tmax<400)
77
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0)
78
ghcn<-ghcn_test2
79

  
80

  
81
## loop through the dates...
82
  
83
ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subset data
84
  
85
 results=do.call(rbind.data.frame,                   # Collect the results in a single data.frame
86
   mclapply(1:length(dates),function(i) {            # loop over dates
87
     date<-strptime(dates[i], "%Y%m%d")              # get date
88
     month<-strftime(date, "%m")                     # get month
89
     LST_month<-paste("mm_",month,sep="")            # get LST_month name
90

  
91
     ##Regression part 1: Creating a validation dataset by creating training and testing datasets
92
     mod <-ghcn.subsets[[i]][,match(LST_month, names(ghcn.subsets[[i]]))]
93
     ghcn.subsets[[i]] = transform(ghcn.subsets[[i]],LST = mod)
94
     ##Screening LST values
95
     ##ghcn.subsets[[i]]<-subset(ghcn.subsets[[i]],ghcn.subsets[[i]]$LST> 258 & ghcn.subsets[[i]]$LST<313)
96
     n<-nrow(ghcn.subsets[[i]])
97
     ns<-n-round(n*prop)  #Create a sample from the data frame with 70% of the rows
98
     nv<-n-ns             #create a sample for validation with prop of the rows
99
     ind.training <- sample(nrow(ghcn.subsets[[i]]), size=ns, replace=FALSE) #This selects the index position for 70% of the rows taken randomly
100
     ind.testing <- setdiff(1:nrow(ghcn.subsets[[i]]), ind.training)
101
     data_s <- ghcn.subsets[[i]][ind.training, ]
102
     data_v <- ghcn.subsets[[i]][ind.testing, ]
103
     
104
     ## lapply loops through models for the ith day, calculates the validation metrics, and saves them as R objects
105
     results=do.call(rbind.data.frame,
106
       lapply(1:nrow(mods),function(m,savemodel=F,saveFullPrediction=T) {  
107
         ## run the model
108
         mod=gam(formula(mods$formula[m]),data=data_s)
109
         
110
         ##VALIDATION: Prediction checking the results using the testing data########
111
         y_mod<- predict(mod, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values.
112
         res_mod<- data_v$tmax - y_mod$fit #Residuals for GMA model that resembles the ANUSPLIN interpolation
113
         
114
         ##Regression part 3: Calculating and storing diagnostic measures
115
         tresults=data.frame(            # build 1-row dataframe for this model-date
116
           date=dates[i],                # interpolation date
117
           model=mods$model[m],          # model number
118
           ns=ns,                        # number of stations used in the training stage
119
           AIC=AIC(mod),                # AIC
120
           GCV=mod$gcv.ubre,             # GCV
121
           DEV=mod$deviance,             # Deviance
122
           RMSE=sqrt(sum(res_mod^2)/nv)  # RMSE
123
           )
124
         
125
         ## Save the model object if desired
126
         if(savemodel)  save(mod,file= paste(path,"/","results_GAM_Model","_",out_prefix,"_",dates[i],".Rdata",sep=""))
127

  
128
         ## do the full prediction and save it if desired
129
         if(saveFullPrediction){
130
           s_raster=readRaster(filename=paste(path,"covariates.gri"))
131
           predict(s_raster,mod,filename=paste(outpath,"_",sub("-","",date),"_prediction.tif",sep=""),format="GTiff",progress="text",fun="predict")
132
           predict(s_raster,mod,filename=paste(outpath,"_",sub("-","",date),"_prediction.se.tif",sep=""),format="GTiff",progress="text",fun="predict.se")
133
         }
134
         
135
         ## print progress
136
         print(paste("Finshed Model:",mods$model[m]," for Date:",dates[i]))
137
         ## return the results table
138
         return(tresults)
139
                                        # end of the for loop #2 (nested in loop #1)
140
       }))
141
     print(paste("Finshed Date:",dates[i]))
142
     return(results)
143
   }
144
            ))
145

  
146

  
147

  
148
  write.table(results_RMSE_all2, file= paste(path,"/","results_GAM_Assessment",out_prefix,"all.txt",sep=""), sep=",", col.names=TRUE)
149

  
150
resl=melt(results,id=c("run","date","model","ns"))
151

  
152
  xyplot(value~date|variable,groups=model,data=resl,scales=list(y=list(relation="free")),auto.key=list()
153

  
154
  
155

  
156
###Analysing the results from the 365 days run: Summarize by month
157
# 
158
# for(i in 1:nrow(results_table_RMSE)){
159
#   date<-results_table_RMSE$dates[i]
160
#   date<-strptime(date, "%Y%m%d")
161
#   results_table_RMSE$month[i]<-as.integer(strftime(date, "%m"))
162
# }
163
# 
164
# average<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE,mean, na.rm=TRUE)
165
# average<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, FUN=mean)
166
# #average on all the data.frame
167
# averaget<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month),FUN=mean, na.rm=TRUE)
168
# #mediant<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month),FUN=median, na.rm=TRUE)
169
# #average_lowt<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month), FUN=function(v) t.test(v)$conf.int[1])
170
# #average_up<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[2])
171
# 
172
# median<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, median, na.rm=TRUE)
173
# average_low<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[1])
174
# average_up<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[2])
175
# 
176
# mod<-names(averaget)
177
# mod<-mod[4:11]
178
# #Saving graphic plots
179
# for(i in 1:length(mod)){
180
#   X11(width=14,height=10)
181
#   name<-mod[i]
182
#   barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main="Mean RMSE per month", names.arg=c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep","Oct", "Nov", "Dec"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
183
#   #title(paste("Sampling RMSE for mod",i,sep=""))
184
#   savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
185
#   dev.off() 
186
# }
187
# 
188
# 
189
# for(i in 1:length(mod)){
190
#   X11(width=14,height=10)
191
#   name<-mod[i]
192
#   barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main=paste(" Mean RMSE per month ",name, sep=""), names.arg=c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep","Oct", "Nov", "Dec"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
193
#   #title(paste("Sampling RMSE for mod",i,sep=""))
194
#   savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
195
#   dev.off() 
196
#   
197
#   X11(width=14,height=10)
198
#   name<-mod[i]
199
#   hist(results_table_RMSE[[name]],breaks=15, main=paste(" Histogram RMSE_",name, sep=""),xlab=paste("RMSE ",name, sep=""))
200
#   savePlot(paste("Hist_results_RMSE_365_",name,out_prefix,".png", sep=""), type="png")
201
#   dev.off()
202
#   
203
# }
204
# 
205
# for(i in 1:length(mod)){
206
#   X11(width=14,height=10)
207
#   name<-mod[i]
208
#   hist(results_table_RMSE[[name]],breaks=15, main=paste(" Histogram RMSE_",name, sep=""),xlab=paste("RMSE ",name, sep=""))
209
#   savePlot(paste("Hist_results_RMSE_365_",name,out_prefix,".png", sep=""), type="png")
210
#   dev.off()
211
# }
212
# 
213
# r<-(results_RMSE_all2[,4:10]) #selecting only the columns related to models...
214
# 
215
# mean_r<-sapply(r, mean)
216
# median_r<-sapply(r, median)
217
# sd_r<-sapply(r, sd)
218
# 
219
# barplot(mean_r,ylim=c(20,26),ylab="RMSE in tenth deg C")
220
# barplot(median_r,ylim=c(20,26),ylab="RMSE in tenth deg C",add=TRUE,inside=FALSE,beside=TRUE) # put both on the same plot
221
# barplot(sd_r,ylim=c(6,8),ylab="RMSE in tenth deg C") # put both on the same plot
222
# 
223
# height<-rbind(mean_r,median_r)
224
# barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE,legend=rownames(height))
225
# barplot(height,ylim=c(20,26),ylab="RMSE in tenth deg C",beside=TRUE, col=c("darkblue","red"),legend=rownames(height)) # put both on the same plot
226
# PNG(paste("Barplot_results_RMSE_sampling_",out_prefix,".png", sep=""))
227
# 
228
# barplot2(mean_r,median_r,ylim=c(23,26),ylab="RMSE in tenth deg C") # put both on the same plot
229
# #Collect var explained and p values for each var...
230
# 
231
# mod<-names(mean_r)
232
# average<-mean_r
233
# average_low<-mean_r-sd_r
234
# average_up<-mean_r+sd_r
235
# 
236
# for(i in 1:length(mod)){
237
#   #X11(width=14,height=10)
238
#   name<-mod[i]
239
#   barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main=paste(" Mean RMSE per month ",name, sep=""), names.arg=c("mod1", "mod2", "mod3", "mod4", "mod5", "mod6","mod7"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
240
#   #title(paste("Sampling RMSE for mod",i,sep=""))
241
#   #savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
242
#   #dev.off()
243
#   }
244

  
245
# End of script##########
246

  
247
#Selecting dates and files based on names
248
#cor_LST_LC<-matrix(1,10,1)
249
# for(i in 1:length(dates)){
250
#   cor_LST_LC1[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC1)
251
# }
252
# for(i in 1:length(dates)){
253
#   cor_LST_LC3[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC3)
254
# }
255

  
climate/research/oregon/interpolation/GHCND_stations_extraction_study_area.R
35 35
#path<-"H:/Data/IPLANT_project/data_Oregon_stations"                      #Jupiter Location on XANDERS
36 36

  
37 37
outpath=path                                                              # create different output path because we don't have write access to other's home dirs
38
#outpath="/home/wilson/data"
39 38
setwd(path) 
40 39
out_prefix<-"y2010_2010_OR_20110705"                                                 #User defined output prefix
41 40

  
41
#for Adam
42
#outpath="/home/wilson/data"
43
#out_prefix<-"y2010_OR_20110705"                                                 #User defined output prefix
42 44

  
43 45
############ START OF THE SCRIPT #################
44 46

  

Also available in: Unified diff