/ - Diff - Environment and organisms - NCEAS Projects

« Previous | Next »

Revision c9e2af49

Added by Benoit Parmentier over 12 years ago

ID c9e2af49bf2f6f6b5d8c3ff1b2861f5cd6ed647b
Parent 75151566
Child ab8957ca

GAM LST, major modifications to create GAM+Kriging code, task #364

     ####################Interpolation of Tmax for 10 dates.#####################
     #This script interpolates station values for the Oregon case study. This program loads the station data from a shapefile
     #and perform 8 regressions using the general additive model (GAM). Note that this program:
     #1)assumes that the shapefile in the current working
     #This script interpolates station values for the Oregon case study using a two stage regression.
     #This program loads the station data from a shape file and performs 8 regressions using a generalized additive model (GAM), followed by kriging of the residuals.
     #It uses LST monthly averages as input variables.
     #Note that this program:
     #1)assumes that the shape file is in the current working directory
     #2)extracts relevant variables from raster images before performing the regressions.
     #This script predicts tmax using GAM and LST derived from MOD11A1.
     #This script predicts tmax using GAM and LST derived from MOD11A1.
     #Interactions terms are also included and assessed using the RMSE from validation dataset.
     #There are 10 dates used for the GAM interpolation. The dates must be provided as a textfile.
     #Script created by Benoit Parmentier on April 4, 2012.
     #Script created by Benoit Parmentier on May 6, 2012.
     ###Loading r library and packages                                                      # loading the raster package
     library(gtools)                                                                        # loading ...
-...
     library(sp)
     library(spdep)
     library(rgdal)
     library(gstat)
     ###Parameters and arguments
     infile1<-"ghcn_or_tmax_b_04142012_OR83M.shp"
     path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations"
     #path<-"H:/Data/IPLANT_project/data_Oregon_stations"
     #path<-"/data/computer/parmentier/Data/IPLANT_project/data_Oregon_stations"
     path<-"H:/Data/IPLANT_project/data_Oregon_stations"
     setwd(path)
     infile2<-"dates_interpolation_03052012.txt"                                          #List of 10 dates for the regression
     infile2<-"list_365_dates_04212012.txt"
     infile2<-"dates_interpolation_03052012_2dates_test.txt"                                          #List of 10 dates for the regression
     prop<-0.3                                                                            #Proportion of testing retained for validation
     out_prefix<-"_05012012_mod8_LST"
     out_prefix<-"_05062012_Kr_LST"
     infile3<-"LST_dates_var_names.txt"
     infile4<-"models_interpolation_04032012b.txt"
-...
     filename<-sub(".shp","",infile1)              #Removing the extension from file.
     ghcn<-readOGR(".", filename)                  #reading shapefile
     proj4string(ghcn) #This retrieves the coordinate system for the SDF
     CRS_ghcn<-proj4string(ghcn) #this can be assigned to mean_LST!!!
     CRS<-proj4string(ghcn)
     mean_LST<- readGDAL("mean_day244_rescaled.rst")  #This reads the whole raster in memory and provide a grid for kriging
     proj4string(mean_LST)<-CRS #Assigning coordinates information
     ghcn = transform(ghcn,Northness = cos(ASPECT)) #Adding a variable to the dataframe
     ghcn = transform(ghcn,Eastness = sin(ASPECT))  #adding variable to the dataframe.
-...
     LST_dates <-readLines(paste(path,"/",infile3, sep=""))
     models <-readLines(paste(path,"/",infile4, sep=""))
     results <- matrix(1,length(dates),15)            #This is a matrix containing the diagnostic measures from the GAM models.
     results <- matrix(1,length(dates),14)            #This is a matrix containing the diagnostic measures from the GAM models.
     results_AIC<- matrix(1,length(dates),length(models)+3)
     results_GCV<- matrix(1,length(dates),length(models)+3)
     results_DEV<- matrix(1,length(dates),length(models)+3)
     results_RMSE<- matrix(1,length(dates),length(models)+3)
     cor_LST_LC1<-matrix(1,length(dates),1)      #correlation LST-LC1
     cor_LST_LC3<-matrix(1,length(dates),1)      #correlation LST-LC3
     cor_LST_tmax<-matrix(1,length(dates),1)    #correlation LST-tmax
     results_AIC<- matrix(1,length(dates),length(models)+2)
     results_GCV<- matrix(1,length(dates),length(models)+2)
     results_DEV<- matrix(1,length(dates),length(models)+2)
     results_RMSE<- matrix(1,length(dates),length(models)+2)
     results_RMSE_kr<- matrix(1,length(dates),length(models)+2)
     # One row per interpolation date. The row count follows length(dates), like
     # every other results_* matrix above, instead of a hard-coded 10, so the
     # script works unchanged for the 2-date test list and the 365-date list.
     cor_LST_LC1<-matrix(1,length(dates),1)      #correlation LST-LC1
     cor_LST_LC3<-matrix(1,length(dates),1)      #correlation LST-LC3
     cor_LST_tmax<-matrix(1,length(dates),1)    #correlation LST-tmax
     #Screening for bad values
     ghcn_all<-ghcn
     ghcn_test<-subset(ghcn,ghcn$tmax>-150 & ghcn$tmax<400)
     ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0)
     ghcn<-ghcn_test2
     #coords<- ghcn[,c('x_OR83M','y_OR83M')]
     month_var<-c("mm_01","mm_02","mm_03","mm_04","mm_05","mm_06","mm_07","mm_08","mm_09", "mm_10", "mm_11", "mm_12")
     ghcn.subsets <-lapply(dates, function(d) subset(ghcn, date==d)) #this creates a list of 10 subsets data
-...
       ####Regression part 2: GAM models
       mod1<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM), data=data_s)
       #mod2<- gam(tmax~ s(lat,lon,ELEV_SRTM), data=data_s)
       mod2<- gam(tmax~ s(lat,lon) + s(ELEV_SRTM), data=data_s)
       mod2<- gam(tmax~ s(lat,lon,ELEV_SRTM), data=data_s)
       mod3<- gam(tmax~ s(lat) + s (lon) + s (ELEV_SRTM) +  s (Northness)+ s (Eastness) + s(DISTOC), data=data_s)
       mod4<- gam(tmax~ s(lat) + s (lon) + s(ELEV_SRTM) + s(Northness) + s (Eastness) + s(DISTOC) + s(LST), data=data_s)
       mod5<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST), data=data_s)
       mod6<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC1), data=data_s)
       mod7<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST,LC3), data=data_s)
       mod8<- gam(tmax~ s(lat,lon) +s(ELEV_SRTM) + s(Northness,Eastness) + s(DISTOC) + s(LST) + s(LC1), data=data_s)
       ####Regression part 3: Calculating and storing diagnostic measures
       results_AIC[i,1]<- dates[i]  #storing the interpolation dates in the first column
       results_AIC[i,2]<- ns        #number of stations used in the training stage
       results_AIC[i,3]<- AIC (mod1)
-...
       results_AIC[i,7]<- AIC (mod5)
       results_AIC[i,8]<- AIC (mod6)
       results_AIC[i,9]<- AIC (mod7)
       results_AIC[i,10]<- AIC (mod8)
       results_GCV[i,1]<- dates[i]  #storing the interpolation dates in the first column
       results_GCV[i,2]<- ns        #number of stations used in the training stage
-...
       # Store the GCV/UBRE score of models 5 to 8 for date i (columns 7 to 10).
       results_GCV[i,7]<- mod5$gcv.ubre
       results_GCV[i,8]<- mod6$gcv.ubre
       results_GCV[i,9]<- mod7$gcv.ubre
       # Column 10 belongs to mod8 (it previously repeated the mod7 score).
       results_GCV[i,10]<- mod8$gcv.ubre
       results_DEV[i,1]<- dates[i]  #storing the interpolation dates in the first column
       results_DEV[i,2]<- ns        #number of stations used in the training stage
-...
       results_DEV[i,7]<- mod5$deviance
       results_DEV[i,8]<- mod6$deviance
       results_DEV[i,9]<- mod7$deviance
       results_DEV[i,10]<- mod8$deviance
       #####VALIDATION: Prediction checking the results using the testing data########
       #Automate this using a data frame of size??
       y_mod1<- predict(mod1, newdata=data_v, se.fit = TRUE) #Using the coeff to predict new values.
       y_mod2<- predict(mod2, newdata=data_v, se.fit = TRUE)
       y_mod3<- predict(mod3, newdata=data_v, se.fit = TRUE)
-...
       y_mod5<- predict(mod5, newdata=data_v, se.fit = TRUE)
       y_mod6<- predict(mod6, newdata=data_v, se.fit = TRUE)
       y_mod7<- predict(mod7, newdata=data_v, se.fit = TRUE)
       y_mod8<- predict(mod8, newdata=data_v, se.fit = TRUE)
       # Validation residuals: observed tmax at the validation stations minus the
       # value predicted by each GAM model.
       res_mod1<- data_v$tmax - y_mod1$fit #Residuals for GAM model that resembles the ANUSPLIN interpolation
       res_mod2<- data_v$tmax - y_mod2$fit   #Residuals for GAM model that resembles the PRISM interpolation
-...
       res_mod5<- data_v$tmax - y_mod5$fit
       res_mod6<- data_v$tmax - y_mod6$fit
       res_mod7<- data_v$tmax - y_mod7$fit
       res_mod8<- data_v$tmax - y_mod8$fit
       RMSE_mod1 <- sqrt(sum(res_mod1^2)/nv)
       RMSE_mod2 <- sqrt(sum(res_mod2^2)/nv)
-...
       RMSE_mod5 <- sqrt(sum(res_mod5^2)/nv)
       RMSE_mod6 <- sqrt(sum(res_mod6^2)/nv)
       RMSE_mod7 <- sqrt(sum(res_mod7^2)/nv)
       RMSE_mod8 <- sqrt(sum(res_mod8^2)/nv)
       results_RMSE[i,1]<- dates[i]  #storing the interpolation dates in the first column
       results_RMSE[i,2]<- ns        #number of stations used in the training stage
-...
       results_RMSE[i,7]<- RMSE_mod5
       results_RMSE[i,8]<- RMSE_mod6
       results_RMSE[i,9]<- RMSE_mod7
       results_RMSE[i,10]<- RMSE_mod8
       #Saving dataset in dataframes
       data_name<-paste("ghcn_v_",dates[[i]],sep="")
       #Saving dataset in dataframes: residuals from RMSE
     #   data_v$mod1<-y_mod1$fit
     #   data_v$mod2<-y_mod2$fit
     #   data_v$mod3<-y_mod3$fit
     #   data_v$mod4<-y_mod4$fit
     #   data_v$mod5<-y_mod5$fit
     #   data_v$mod6<-y_mod6$fit
     #   data_v$mod7<-y_mod7$fit
+    #
     #   data_s$mod1<-mod1$fit
     #   data_s$mod2<-mod2$fit
     #   data_s$mod3<-mod3$fit
     #   data_s$mod4<-mod4$fit
     #   data_s$mod5<-mod5$fit
     #   data_s$mod6<-mod6$fit
     #   data_s$mod7<-mod7$fit
+    #
       data_v$res_mod1<-as.numeric(res_mod1)
       data_v$res_mod2<-as.numeric(res_mod2)
       data_v$res_mod3<-as.numeric(res_mod3)
       data_v$res_mod4<-as.numeric(res_mod4)
       data_v$res_mod5<-as.numeric(res_mod5)
       data_v$res_mod6<-as.numeric(res_mod6)
       data_v$res_mod7<-as.numeric(res_mod7)
       data_s$res_mod1<-as.numeric(mod1$residuals)
       data_s$res_mod2<-as.numeric(mod2$residuals)
       data_s$res_mod3<-as.numeric(mod3$residuals)
       data_s$res_mod4<-as.numeric(mod4$residuals)
       data_s$res_mod5<-as.numeric(mod5$residuals)
       data_s$res_mod6<-as.numeric(mod6$residuals)
       data_s$res_mod7<-as.numeric(mod7$residuals)
       ###BEFORE kriging, the data objects must be transformed to SDF (spatial data frames)
       # The same steps are applied to the validation (data_v) and training (data_s)
       # sets: take the x_OR83M/y_OR83M columns as coordinates, then attach the
       # projection string stored in CRS (read from the ghcn shapefile).
       coords<- data_v[,c('x_OR83M','y_OR83M')]
       coordinates(data_v)<-coords
       proj4string(data_v)<-CRS  #Need to assign coordinates...
       coords<- data_s[,c('x_OR83M','y_OR83M')]
       coordinates(data_s)<-coords
       proj4string(data_s)<-CRS  #Need to assign coordinates..
       #Kriging residuals!!
       # For every GAM model j: krige the training residuals onto the mean_LST
       # grid, sample the kriged surface at the training and validation stations,
       # and store the resulting RMSE for date i in results_RMSE_kr.
       # seq_along() keeps the loop from running when models is empty.
       for (j in seq_along(models)){
         name<-paste("res_mod",j,sep="")
         data_s$residuals<-data_s[[name]]     #Residuals of model j become the variable to krige
         X11()
         # Lattice-based plots are not drawn automatically inside a loop; they must be print()-ed.
         print(hscat(residuals~1,data_s,(0:9)*20000)) # 9 lag classes with 20,000m width
         v<-variogram(residuals~1, data_s)
         print(plot(v))
         v.fit<-fit.variogram(v,vgm(1,"Sph", 150000,1))
         res_krige<-krige(residuals~1, data_s,mean_LST, v.fit)#mean_LST provides the data grid/raster image for the kriging locations.

         # Kriging visualization of Residuals fit over space
         #spplot.vcov(co_kriged_surf)                           #Visualizing the covariance structure

         res_krig1_s <- overlay(res_krige,data_s)             #Kriged surface sampled at the training stations
         res_krig1_v <- overlay(res_krige,data_v)             #Kriged surface sampled at the validation stations

         name2<-paste("pred_kr_mod",j,sep="")
         #Adding the results back into the original dataframes.
         data_s[[name2]]<-res_krig1_s$var1.pred
         data_v[[name2]]<-res_krig1_v$var1.pred

         # NOTE(review): pred_kr_mod<j> holds kriged GAM *residuals*, so tmax minus
         # that value is presumably not the residual of a GAM+kriging prediction
         # (that would be res_mod<j> - pred_kr_mod<j>). Confirm before relying on
         # the RMSE values below; the formula is left unchanged here.
         res_mod_kr_s<- data_s$tmax - data_s[[name2]]           #Training stations
         res_mod_kr_v<- data_v$tmax - data_v[[name2]]           #Validation stations

         # RMSE over the non-missing residuals. mean(..., na.rm=TRUE) divides by the
         # number of valid values of each vector; the training RMSE used to be
         # divided by the validation count nv.
         RMSE_mod_kr_s <- sqrt(mean(res_mod_kr_s^2,na.rm=TRUE))  #RMSE at training stations
         RMSE_mod_kr_v <- sqrt(mean(res_mod_kr_v^2,na.rm=TRUE))  #RMSE at validation stations

         #Writing out results
         results_RMSE_kr[i,1]<- dates[i]  #storing the interpolation dates in the first column
         results_RMSE_kr[i,2]<- ns        #number of stations used in the training stage
         results_RMSE_kr[i,j+2]<- RMSE_mod_kr_v
         #results_RMSE_kr[i,3]<- res_mod_kr_v

         name3<-paste("res_kr_mod",j,sep="")
         data_s[[name3]]<-res_mod_kr_s
         data_v[[name3]]<-res_mod_kr_v #Writing residuals from kriging
       }
       data_name<-paste("ghcn_v_",out_prefix,"_",dates[[i]],sep="")
       assign(data_name,data_v)
       data_name<-paste("ghcn_s_",dates[[i]],sep="")
       assign(data_name,data_s)
       #ghcn_v<-ls(pattern="ghcn_v_")
       write.table(data_v, file= paste(path,"/",data_name,".txt",sep=""), sep=" ")
       #write out a new shapefile (including .prj component)
       outfile<-sub(".shp","",data_name)   #Removing extension if it is present
       writeOGR(data_v,".", outfile, driver ="ESRI Shapefile")
       cor_LST_LC1[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC1)
       cor_LST_LC3[i]<-cor(ghcn.subsets[[i]]$LST,ghcn.subsets[[i]]$LC3)
       data_name<-paste("ghcn_s_",out_prefix,"_",dates[[i]],sep="")
       assign(data_name,data_s)
       write.table(data_s, file= paste(path,"/",data_name,".txt",sep=""), sep=" ")
       outfile<-sub(".shp","",data_name)   #Removing extension if it is present
       writeOGR(data_s,".", outfile, driver ="ESRI Shapefile")
       #end of the for loop #1
       # end of the for loop #1
+      }
     ## Plotting and saving diagnostic measures
     results_RMSEnum <-results_RMSE
     results_AICnum <-results_AIC
     mode(results_RMSEnum)<- "numeric"
     mode(results_AICnum)<- "numeric"
     # Make it numeric first
     # Now turn it into a data.frame...
     mode(results_RMSEnum)<- "numeric"         # Make it numeric first
     mode(results_AICnum)<- "numeric"          # Now turn it into a data.frame...
     results_table_RMSE<-as.data.frame(results_RMSEnum)
     results_table_AIC<-as.data.frame(results_AICnum)
     colnames(results_table_RMSE)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6", "mod7", "mod8")
     colnames(results_table_AIC)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6", "mod7", "mod8")
     colnames(results_table_RMSE)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6", "mod7")
     colnames(results_table_AIC)<-c("dates","ns","mod1", "mod2","mod3", "mod4", "mod5", "mod6", "mod7")
     results_RMSE_kr_num <-results_RMSE_kr
     mode(results_RMSE_kr_num)<- "numeric"         # Make it numeric first
     results_table_RMSE_kr<-as.data.frame(results_RMSE_kr_num)
     colnames(results_table_RMSE_kr)<-c("dates","ns","mod1k", "mod2k","mod3k", "mod4k", "mod5k", "mod6k", "mod7k")
     #results_table_RMSE
     write.table(results_table_RMSE, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep=""), sep=",")
     write.table(results_table_AIC, file= paste(path,"/","results_GAM_Assessment",out_prefix,".txt",sep=""),sep=",", append=TRUE)
     write.table(results_table_RMSE_kr, file= paste(path,"/","results_GAM_Assessment_kr",out_prefix,".txt",sep=""), sep=",")
     ###Analysing the results from the 365 days run: Summarize by month
     # Derive the calendar month (as an integer) of every row from its YYYYMMDD
     # date so the RMSE values can be summarised by month below.
     # Vectorised over the whole column: no row-by-row loop, an empty table is
     # handled without error, and no variable named `date` masks base::date.
     dates_parsed<-strptime(results_table_RMSE$dates, "%Y%m%d")
     results_table_RMSE$month<-as.integer(strftime(dates_parsed, "%m"))
     average<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE,mean, na.rm=TRUE)
     average<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, FUN=mean)
     #average on all the data.frame
     averaget<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month),FUN=mean, na.rm=TRUE)
     #mediant<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month),FUN=median, na.rm=TRUE)
     #average_lowt<-aggregate(results_table_RMSE, by=list(results_table_RMSE$month), FUN=function(v) t.test(v)$conf.int[1])
     #average_up<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[2])
     median<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, median, na.rm=TRUE)
     average_low<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[1])
     average_up<-aggregate(cbind(mod1,mod2,mod3,mod4,mod5,mod6,mod7,mod8)~month,data=results_table_RMSE, function(v) t.test(v)$conf.int[2])
     ##Visualization of results##
     mod<-names(averaget)
     mod<-mod[4:11]
     #Saving graphic plots
     # For each model name in mod: open a 14x10 X11 device, draw the monthly mean
     # RMSE as a bar plot with the confidence limits taken from average_low and
     # average_up, save the window to a PNG named after the model and out_prefix,
     # then close the device.
     for(i in 1:length(mod)){
       X11(width=14,height=10)
       name<-mod[i]
       # names.arg lists 12 month labels, so `average` is assumed to hold one row per calendar month - TODO confirm
       barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main="Mean RMSE per month", names.arg=c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep","Oct", "Nov", "Dec"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
       #title(paste("Sampling RMSE for mod",i,sep=""))
       savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
       dev.off()
+    }
     for(i in 1:length(mod)){
       X11(width=14,height=10)
       name<-mod[i]
       barplot2(average[[name]],plot.ci=TRUE, ci.l=average_low[[name]], ci.u=average_up[[name]],main=paste(" Mean RMSE per month ",name, sep=""), names.arg=c("Jan", "Feb", "Mar", "Apr", "May", "Jun","Jul", "Aug", "Sep","Oct", "Nov", "Dec"),ylim=c(20,30),ylab="RMSE in tenth deg C",xlab=name)
       #title(paste("Sampling RMSE for mod",i,sep=""))
       savePlot(paste("barplot_results_RMSE_month_",name,out_prefix,".png", sep=""), type="png")
       dev.off()
       X11(width=14,height=10)
       name<-mod[i]
       hist(results_table_RMSE[[name]],breaks=15, main=paste(" Histogram RMSE_",name, sep=""),xlab=paste("RMSE ",name, sep=""))
       savePlot(paste("Hist_results_RMSE_365_",name,out_prefix,".png", sep=""), type="png")
       dev.off()
     for(i in 1:length(dates)){
       X11()
       RMSE_kr<-results_table_RMSE_kr[i,]
       RMSE_ga<-results_table_RMSE[i,]
+    }
     for(i in 1:length(mod)){
       X11(width=14,height=10)
       name<-mod[i]
       hist(results_table_RMSE[[name]],breaks=15, main=paste(" Histogram RMSE_",name, sep=""),xlab=paste("RMSE ",name, sep=""))
       savePlot(paste("Hist_results_RMSE_365_",name,out_prefix,".png", sep=""), type="png")
       dev.off()
+    }
     # Per-model summary statistics of the RMSE over all dates.
     r<-(results_table_RMSE[,3:10]) #selecting only the columns related to models...
     # mean() applied directly to a data.frame does not return column means in
     # current R, so the mean is computed column by column, like median and sd.
     # vapply() guarantees a named numeric vector with one value per model column.
     mean_r<-vapply(r, mean, numeric(1))
     median_r<-vapply(r, median, numeric(1))
     sd_r<-vapply(r, sd, numeric(1))
     barplot(mean_r,ylim=c(23,26),ylab="RMSE in tenth deg C")
     barplot(median_r,ylim=c(23,26),ylab="RMSE in tenth deg C",add=TRUE,inside=FALSE,beside=TRUE) # put both on the same plot
     barplot(sd_r,ylim=c(6,8),ylab="RMSE in tenth deg C") # put both on the same plot
     height<-rbind(mean_r,median_r)
     barplot(height,ylim=c(23,26),ylab="RMSE in tenth deg C",beside=TRUE,legend=rownames(height))
     barplot(height,ylim=c(23,26),ylab="RMSE in tenth deg C",beside=TRUE, col=c("darkblue","red"),legend=rownames(height)) # put both on the same plot
       RMSE_kr<-RMSE_kr[,1:length(models)+2]
       RMSE_ga<-RMSE_ga[,1:length(models)+2]
       colnames(RMSE_kr)<-names(RMSE_ga)
       height<-rbind(RMSE_ga,RMSE_kr)
       rownames(height)<-c("GAM","GAM_KR")
       height<-as.matrix(height)
       barplot(height,ylim=c(0,105),ylab="RMSE in tenth deg C",beside=TRUE,
               legend.text=rownames(height),
               args.legend=list(x="topright"),
               main=paste("RMSE for date ",dates[i], sep=""))
       savePlot(paste("Barplot_results_RMSE_GAM_KR_",dates[i],out_prefix,".png", sep=""), type="png")
+      }
     # End of script##########
     barplot2(mean_r,median_r,ylim=c(23,26),ylab="RMSE in tenth deg C") # put both on the same plot
     #Collect var explained and p values for each var...
     # End of script##########
     names(results_table_RMSE)

Also available in: Unified diff

Project

General

Profile

Revision c9e2af49

Added by Benoit Parmentier over 12 years ago