Revision 1c15fc49

Added by Adam M. Wilson over 10 years ago

Revert "Merge branch 'ag/interp' of code.nceas.ucsb.edu:environmental-layers into aw/precip"

This reverts commit f9c712987ba814b83b2c2cc058c6c1f9b07933c1, reversing
changes made to f0375becc9fb9e13e55c5b7a48be5c5f98064f87.

View differences:

climate/research/oregon/interpolation/AAG2013_conference_Oregon_interpolation.R
####################################  INTERPOLATION TEMPERATURES  #######################################
############################  AAG 2013 and OREGON transition script #######################################
#This script reads information concerning the Oregon case study to adapt data for the revised 
# interpolation code.
#Figures and data for the AAG conference are also produced.
#AUTHOR: Benoit Parmentier                                                                      #
#DATE: 04/08/2013            
#Version: 1
#PROJECT: Environmental Layers project                                       #
#################################################################################################

###Loading R library and packages                                                      
library(gtools)                              # loading some useful tools 
library(mgcv)                                # GAM package by Simon Wood
library(sp)                                  # Spatial package with class definition by Bivand et al.
library(spdep)                               # Spatial package with methods and spatial stat. by Bivand et al.
library(rgdal)                               # GDAL wrapper for R, spatial utilities
library(gstat)                               # Kriging and co-kriging by Pebesma et al.
library(fields)                              # NCAR Spatial Interpolation methods such as kriging, splines
library(raster)                              # Hijmans et al. package for raster processing
library(gdata)                               # various tools, including xls reading
library(rasterVis)
library(parallel)
library(maptools)
library(maps)
library(reshape)
library(plotrix)

#### FUNCTIONS USED IN SCRIPT

create_modis_tiles_region<-function(tiles,modis_sp){
  #This function returns a subset of tiles from the MODIS grid.
  #Arguments: list of tiles, MODIS grid (spatial polygons data frame)
  #Output: spatial polygons data frame of the subset of tiles
  
  h_list<-lapply(tiles,substr,start=2,stop=3) #passing multiple arguments
  v_list<-lapply(tiles,substr,start=5,stop=6) #passing multiple arguments
  selected_tiles<-subset(subset(modis_sp,subset = h %in% as.numeric (h_list) ),
                         subset = v %in% as.numeric(v_list)) 
  return(selected_tiles)
}
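
#Illustrative example (not part of the original script): how the substr() calls in
#create_modis_tiles_region parse MODIS tile IDs of the form "hXXvYY" into numeric
#horizontal (h) and vertical (v) indices. The tile IDs below are the Oregon tiles used later.
as.numeric(sapply(c("h08v04","h09v04"),substr,start=2,stop=3)) #horizontal indices: 8 9
as.numeric(sapply(c("h08v04","h09v04"),substr,start=5,stop=6)) #vertical indices: 4 4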

### Parameters and arguments

lc_path<-"/home/layers/data/land-cover/lc-consensus-global"
infile_modis_grid<-"modis_sinusoidal_grid_world.shp"
infile_elev<-"/home/layers/data/terrain/dem-cgiar-srtm-1km-tif/srtm_1km.tif"  #this is the global file: replace later with the input produced by the DEM team
infile_canheight<-"Simard_Pinto_3DGlobalVeg_JGR.tif"              #Canopy height
#list_tiles_modis = c('h11v08','h11v07','h12v07','h12v08','h10v07','h10v08') #tiles for Venezuela and surrounding area
list_tiles_modis = c('h08v04','h09v04')

#infile_reg_outline=""  #input region outline defined by polygon
infile_reg_outline= "OR83M_state_outline.shp"
infile_countries_sinusoidal<-"countries_sinusoidal_world.shp"

#CRS_interp<-"+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs";
CRS_interp <-"+proj=lcc +lat_1=43 +lat_2=45.5 +lat_0=41.75 +lon_0=-120.5 +x_0=400000 +y_0=0 +ellps=GRS80 +units=m +no_defs";
CRS_locs_WGS84<-CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +towgs84=0,0,0") #Station coords WGS84
out_region_name<-"Oregon_region" #generated on the fly
out_prefix<-"_OR_04052013"
ref_rast_name<- "mean_day244_rescaled.rst"                     #local raster name defining resolution, extent, local projection -- set on the fly??
infile_covariates<-"covariates__venezuela_region__VE_01292013.tif" #this is an output from covariate script and used in stage 3 and stage 4

#The names of covariates can be changed...these names should be output/input from covar script!!!
rnames<-c("x","y","lon","lat","N","E","N_w","E_w","elev","slope","aspect","CANHEIGHT","DISTOC")
lc_names<-c("LC1","LC2","LC3","LC4","LC5","LC6","LC7","LC8","LC9","LC10","LC11","LC12")
lst_names<-c("mm_01","mm_02","mm_03","mm_04","mm_05","mm_06","mm_07","mm_08","mm_09","mm_10","mm_11","mm_12",
             "nobs_01","nobs_02","nobs_03","nobs_04","nobs_05","nobs_06","nobs_07","nobs_08",
             "nobs_09","nobs_10","nobs_11","nobs_12")
covar_names<-c(rnames,lc_names,lst_names)
infile2<-"/home/layers/data/climate/ghcn/v2.92-upd-2012052822/ghcnd-stations.txt"                              #This is the textfile of station locations from GHCND

in_path<- "/home/parmentier/Data/IPLANT_project/Oregon_interpolation/Oregon_covariates"

#c("Oregon", c("h08v04","h09v04","h08v05","h09v05"))
study_area_list_tiles <- vector("list",6)
study_area_list_tiles[[1]] <-list("Oregon", c("h08v04","h09v04"))
study_area_list_tiles[[2]] <-list("Venezuela",c("h10v07", "h10v08", "h11v7", "h11v08", "h12v07", "h12v08"))
study_area_list_tiles[[3]] <-list("Norway",c("h18v02","h18v03", "h19v02", "h19v03"))
study_area_list_tiles[[4]] <-list("East_Africa",c("h20v08", "h21v08", "h22v08", "h20v09", "h21v09", "h22v09", "h21v10"))
study_area_list_tiles[[5]] <-list("South_Africa",c("h19v11", "h20v11", "h19v12", "h20v12"))
study_area_list_tiles[[6]] <-list("Queensland",c("h31v10", "h31v10", "h32v10", "h30v11", "h31v11"))

#######################################################################################
###########################      BEGIN SCRIPT    ######################################


setwd(in_path)


####### PART I: Prepare data for figures and for Oregon interpolation ##########

### Read in Venezuela covariate stack

s_raster_Ven<-brick(infile_covariates) #read brick 
names(s_raster_Ven)<-covar_names #assign names
mm_01_Ven<-subset(s_raster_Ven,"mm_01") #select LST January month average

### Read in world map to show study areas. 

world_sp <- getData("countries")  # different resolutions available
outfile2<-file.path(in_path,paste("word_countries.shp",sep=""))  #Name of the file
writeOGR(world_sp,dsn= dirname(outfile2),layer= sub(".shp","",basename(outfile2)), driver="ESRI Shapefile",overwrite_layer=TRUE)

### Read in sinusoidal grid and world countries
filename<-sub(".shp","",infile_modis_grid)       #Removing the extension from file.
modis_grid<-readOGR(".", filename)     #Reading shape file using rgdal library

### Create list of ALL STUDY AREAS/TEST SITES

## Create list of study area regions:
list_tiles<-lapply(1:length(study_area_list_tiles),function(k) study_area_list_tiles[[k]][[2]])
modis_reg_outlines<-lapply(list_tiles,FUN=create_modis_tiles_region,modis_sp=modis_grid) #problem...this does not 

#writeOGR(modis_reg_outline,dsn= ".",layer= paste("outline",out_region_name,"_",out_suffix,sep=""), 
#         driver="ESRI Shapefile",overwrite_layer="TRUE")

####################################################
#Read in GHCND database station locations

dat_stat <- read.fwf(infile2, 
                     widths = c(11,9,10,7,3,31,4,4,6),fill=TRUE)  
colnames(dat_stat)<-c("STAT_ID","lat","lon","elev","state","name","GSNF","HCNF","WMOID")
coords<- dat_stat[,c('lon','lat')]
coordinates(dat_stat)<-coords
proj4string(dat_stat)<-CRS_locs_WGS84 #this is the WGS84 projection
#Save shapefile for later
outfile1<-file.path(in_path,paste("ghcnd_stations.shp",sep=""))  #Name of the file
writeOGR(dat_stat,dsn= dirname(outfile1),layer= sub(".shp","",basename(outfile1)), driver="ESRI Shapefile",overwrite_layer=TRUE)

interp_area <- readOGR(dsn=in_path,sub(".shp","",infile_reg_outline))
interp_area_WGS84 <-spTransform(interp_area,CRS_locs_WGS84)         # Project the region outline to WGS84 (lat/long)

# Spatial query to find relevant stations

inside <- !is.na(over(dat_stat, as(interp_area_WGS84, "SpatialPolygons")))  #Finding stations contained in the current interpolation area
stat_reg_OR<-dat_stat[inside,]              #Selecting stations contained in the current interpolation area

stat_reg_OR <-spTransform(stat_reg_OR,CRS(proj4string(interp_area)))         # Project from WGS84 to the local coord. system

#Now Venezuela
interp_area_Ven_WGS84 <-spTransform(modis_reg_outlines[[2]],CRS_locs_WGS84)         # Project the region outline to WGS84 (lat/long)
inside <- !is.na(over(dat_stat, as(interp_area_Ven_WGS84, "SpatialPolygons")))  #Finding stations contained in the current interpolation area
stat_reg_Ven <-dat_stat[inside,]              #Selecting stations contained in the current interpolation area

## Get the data in the local projection
stat_reg_Ven <-spTransform(stat_reg_Ven,CRS(proj4string(modis_reg_outlines[[2]])))         # Project from WGS84 to the local coord. system

### READ IN COVARIATES FILES FOR OREGON AND MAKE IT A MULTI-BAND FILE

inlistf<-"list_files_covariates_04032013.txt"
lines<-read.table(paste(inlistf,sep=""), sep=" ")                  #Column 1 contains the names of raster files
inlistvar<-lines[,1]
inlistvar<-as.character(inlistvar)
covar_names_OR<-as.character(lines[,2])                                         #Column two contains short names for covariates

s_raster_OR<- stack(inlistvar)                                                  #Creating a stack of raster images from the list of variables.
layerNames(s_raster_OR)<-covar_names_OR                                            #Assigning names to the raster layers

aspect<-subset(s_raster_OR,"aspect")             #Select layer from stack
slope<-subset(s_raster_OR,"slope")             #Select layer from stack
distoc<-subset(s_raster_OR,"DISTOC")  

N<-cos(aspect*pi/180)
E<-sin(aspect*pi/180)
Nw<-sin(slope*pi/180)*cos(aspect*pi/180)   #Adding a variable to the dataframe
Ew<-sin(slope*pi/180)*sin(aspect*pi/180)   #Adding variable to the dataframe.
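
#Illustrative check (not part of the original script): the transforms above convert
#aspect/slope in degrees into "northness" (N), "eastness" (E) and their slope-weighted
#versions (N_w, E_w). For example, an east-facing cell (aspect=90) on a 30-degree slope:
cos(90*pi/180)                  #northness N, approximately 0
sin(90*pi/180)                  #eastness E, equal to 1
sin(30*pi/180)*sin(90*pi/180)   #slope-weighted eastness E_w, equal to 0.5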

xy <-coordinates(slope)  #get x and y projected coordinates...
xy_latlon<-project(xy, CRS_interp, inv=TRUE) # find lat long for projected coordinates (or pixels...)

x <-init(slope,v="x")
y <-init(slope,v="y")
lon<-x
lat<-y
lon <-setValues(lon,xy_latlon[,1]) #longitude for every pixel in the processing tile/region
lat <-setValues(lat,xy_latlon[,2]) #latitude for every pixel in the processing tile/region
CANHEIGHT<-subset(s_raster_OR,"CANHEIGHT")
CANHEIGHT[is.na(CANHEIGHT)]<-0

elev<-subset(s_raster_OR,"elev")
elev[elev==-9999] <- NA

r<-stack(x,y,lon,lat,N,E,Nw,Ew,elev,slope,aspect,CANHEIGHT,distoc)
rnames<-c("x","y","lon","lat","N","E","N_w","E_w","elev","slope","aspect","CANHEIGHT","DISTOC")
s_raster<-r
#Add landcover layers
lc_names<-c("LC1","LC2","LC3","LC4","LC5","LC6","LC7","LC8","LC9","LC10")
lc_reg_s<-subset(s_raster_OR,lc_names)
#Should be a function...ok for now??
test<-vector("list",nlayers(lc_reg_s))
for (k in 1:nlayers(lc_reg_s)){
  LC<-raster(lc_reg_s,layer=k)             #Select layer from stack
  LC[is.na(LC)]<-0
  test[[k]]<-LC
}
#tmp_df<-freq(lc_reg_s,merge=TRUE) #check to see if it worked
#head(tmp_df)
lc_reg_s<-stack(test)
s_raster<-addLayer(s_raster, lc_reg_s)

lst_names<-c("mm_01","mm_02","mm_03","mm_04","mm_05","mm_06","mm_07","mm_08","mm_09","mm_10","mm_11","mm_12",
             "nobs_01","nobs_02","nobs_03","nobs_04","nobs_05","nobs_06","nobs_07","nobs_08",
             "nobs_09","nobs_10","nobs_11","nobs_12")
lst_mm_names<-c("mm_01","mm_02","mm_03","mm_04","mm_05","mm_06","mm_07","mm_08","mm_09","mm_10","mm_11","mm_12")
lst_mm_s<-subset(s_raster_OR,lst_mm_names)
lst_mm_s <- lst_mm_s - 273.16 
lst_nobs_names<-c("nobs_01","nobs_02","nobs_03","nobs_04","nobs_05","nobs_06","nobs_07","nobs_08",
                "nobs_09","nobs_10","nobs_11","nobs_12")
lst_nobs_s<-subset(s_raster_OR,lst_nobs_names)

s_raster<-addLayer(s_raster, lst_mm_s)
s_raster<-addLayer(s_raster, lst_nobs_s)
covar_names<-c(rnames,lc_names,lst_names)
names(s_raster)<-covar_names

# create mask!!! Should combine with mask of elev
LC10<-subset(s_raster,"LC10")

LC10_mask<-LC10
LC10_mask[is.na(LC10_mask)]<- 0
LC10_mask[LC10==100]<- NA
LC10_mask[LC10_mask<100]<- 1
LC10_mask[is.na(LC10_mask)]<- 0
mask_land_NA<-LC10_mask
mask_land_NA[mask_land_NA==0]<-NA


##### SAVE AS MULTIBAND...make this a function...

#list of files...
file_format<-".tif"
NA_val<- -9999
band_order<- "BSQ"
var<-"TMAX"
data_name<-paste("covariates_",out_region_name,"_",sep="")
raster_name<-paste(data_name,var,"_",out_prefix,file_format, sep="")
#writeRaster(s_raster, filename=raster_name,NAflag=-999,bylayer=FALSE,bandorder="BSQ",overwrite=TRUE)  #Writing the data in a raster file format...
#if mask
#stat_val<- extract(s_raster, ghcn3)                                          #Extracting values from the raster stack for every point location in coords data frame.
s_raster_m<-mask(s_raster,mask_land_NA,filename=raster_name,
                 overwrite=TRUE,NAflag=NA_val,bylayer=FALSE,bandorder=band_order)
#if no mask
#writeRaster(s_raster, filename=raster_name,NAflag=-999,bylayer=FALSE,bandorder="BSQ",overwrite=TRUE)  #Writing the data in a raster file format...

############# PART II: PRODUCE FIGURES #######

### CREATE FIGURE TEST SITES

dat_stat_sinusoidal <- spTransform(dat_stat,CRS(proj4string(modis_grid)))
world_sinusoidal <- readOGR(dsn=".",sub(".shp","",infile_countries_sinusoidal))

png(paste("Study_area_modis_grid",out_prefix,".png",sep=""))
plot(world_sinusoidal)
plot(dat_stat_sinusoidal,cex=0.2,pch=16,col=c("blue"),add=TRUE)
plot(modis_grid,add=TRUE)
for (k in 1:length(modis_reg_outlines)){
  plot(modis_reg_outlines[[k]],border=c("red"),lwd=2.5,add=TRUE)
}
title("Study area for temperature and precipitation predictions")
#legend
dev.off()

### CREATE FIGURE MEAN DAILY AND MEAN MONTHLY: AAG 2013  ####

lst_md<-raster(ref_rast_name)
lst_mm_09<-subset(s_raster,"mm_09")
plot(stack(lst_md,lst_mm_09))

lst_md<-raster("mean_day001_rescaled.rst")
lst_md<- lst_md - 273.16
lst_mm_01<-subset(s_raster,"mm_01")

png(filename=paste("Comparison_daily_monthly_mean_lst",out_prefix,".png",sep=""),width=960,height=480)
par(mfrow=c(1,2))
plot(lst_md)
plot(interp_area,add=TRUE)
title("Mean January 1")
plot(lst_mm_01)
plot(interp_area,add=TRUE)
title("Mean for month of January")
dev.off()

### CREATE FIGURE NUMBER OF STATIONS PER SITE 

png(paste("stations_for_Venezuela_Oregon_areas",out_prefix,".png",sep=""),width=960,height=480)
par(mfrow=c(1,2))
#Oregon data
plot(lst_mm_01)
plot(interp_area,add=TRUE)
plot(stat_reg_OR,add=TRUE)
title("Stations located in Oregon from GHCND")
plot(mm_01_Ven)
plot(modis_reg_outlines[[2]],add=TRUE)
plot(stat_reg_Ven,add=TRUE)
title("Stations located in Venezuela from GHCND")
dev.off()

### CREATE FIGURE NUMBER OF STATIONS PER SITE AND SPECIFIC MONTH... 

#png(paste("stations_for_Venezuela_Oregon_areas_per_month",out_prefix,".png",sep=""))
#par(mfrow=c(1,2))
#plot(interp_area_WGS84)
#plot(stat_reg_OR,add=TRUE)
#plot(modis_reg_outlines[[2]])
#plot(stat_reg_Ven,add=TRUE)
#dev.off()

############ PART III: SCREENING OF COVARIATES #############

### SCREENING FUNCTION for covariate stack and GHCND data base to add later in the functions

#Screen for extreme values: this needs more thought, min and max val vary with regions
#min_val<-(-15+273.16) #if values less than -15C then screen out (note the Kelvin units that will need to be changed later in all datasets)
#r1[r1 < (min_val)]<-NA
#s_raster<-addLayer(s_raster,LST)            #Adding current month

nel<-25 #number of valid-range entries defined below
#tab_range<-data.frame(varname=character(nel),varterm=character(nel),vmin=numeric(nel),vmax=numeric(nel))
val_range<-vector("list",nel) #list of one-row data.frames
val_rst<-vector("list",nel) #list of screened raster layers
val_range[[1]]<-data.frame(varname="lon",vmin=-180,vmax=180)
val_range[[2]]<-data.frame(varname="lat",vmin=-90,vmax=90)
val_range[[3]]<-data.frame(varname="N",vmin=-1,vmax=1)
val_range[[4]]<-data.frame(varname="E",vmin=-1,vmax=1)
val_range[[5]]<-data.frame(varname="N_w",vmin=-1,vmax=1)
val_range[[6]]<-data.frame(varname="E_w",vmin=-1,vmax=1)
val_range[[7]]<-data.frame(varname="elev",vmin=0,vmax=6000)
val_range[[8]]<-data.frame(varname="slope",vmin=0,vmax=90)
val_range[[9]]<-data.frame(varname="aspect",vmin=0,vmax=360)
val_range[[10]]<-data.frame(varname="DISTOC",vmin=-0,vmax=10000000)
val_range[[11]]<-data.frame(varname="CANHEIGHT",vmin=0,vmax=255)
val_range[[12]]<-data.frame(varname="LC1",vmin=0,vmax=100)
val_range[[13]]<-data.frame(varname="LC3",vmin=0,vmax=100)
val_range[[14]]<-data.frame(varname="mm_01",vmin=-15,vmax=50)
val_range[[15]]<-data.frame(varname="mm_02",vmin=-15,vmax=50)
val_range[[16]]<-data.frame(varname="mm_03",vmin=-15,vmax=50)
val_range[[17]]<-data.frame(varname="mm_04",vmin=-15,vmax=50)
val_range[[18]]<-data.frame(varname="mm_05",vmin=-15,vmax=50)
val_range[[19]]<-data.frame(varname="mm_06",vmin=-15,vmax=50)
val_range[[20]]<-data.frame(varname="mm_07",vmin=-15,vmax=50)
val_range[[21]]<-data.frame(varname="mm_08",vmin=-15,vmax=50)
val_range[[22]]<-data.frame(varname="mm_09",vmin=-15,vmax=50)
val_range[[23]]<-data.frame(varname="mm_10",vmin=-15,vmax=50)
val_range[[24]]<-data.frame(varname="mm_11",vmin=-15,vmax=50)
val_range[[25]]<-data.frame(varname="mm_12",vmin=-15,vmax=50)

tab_range<-do.call(rbind,val_range) #entries must share the same columns for rbind to work

#pos<-match("ELEV_SRTM",layerNames(s_raster)) #Find column with the current month for instance mm12
#ELEV_SRTM<-raster(s_raster,pos)

screening_val_covariates_fun<-function(tab_range,r_stack){
  #Screening of a raster stack: values outside [vmin,vmax] are set to NA for each listed layer
  #
  val_rst<-vector("list",nrow(tab_range)) #list holding the screened layers
  for (k in 1:nrow(tab_range)){
    avl<-c(-Inf,tab_range$vmin[k],NA, tab_range$vmax[k],+Inf,NA)   #Reclassification vector: values below vmin or above vmax become NA
    rclmat<-matrix(avl,ncol=3,byrow=TRUE)
    #s_raster_r<-raster(r_stack,match(tab_range$varterm[k],names(r_stack))) #select relevant layer from stack
    s_raster_r<-raster(r_stack,match(tab_range$varname[k],names(r_stack)))
    s_raster_r<-reclassify(s_raster_r,rclmat)  #now reclassify values 
    r_stack<-dropLayer(r_stack,match(tab_range$varname[k],names(r_stack))) #drop the unscreened layer before adding the screened one back
    names(s_raster_r)<-tab_range$varname[k] #Layer name is lost when reclassifying
    val_rst[[k]]<-s_raster_r
  }
  s_rst_m<-stack(val_rst) #This is a raster stack with the valid range of values
  r_stack<-addLayer(r_stack,s_rst_m) #add back the layers that were screened
  return(r_stack)
}
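
#Illustrative sketch (not part of the original script): a tiny, self-contained check of
#the reclassification used above. For the "elev" entry (vmin=0, vmax=6000) the matrix
#maps values outside [0,6000] to NA; in the toy raster below only 150 and 500 survive.
rclmat_demo<-matrix(c(-Inf,0,NA, 6000,Inf,NA),ncol=3,byrow=TRUE) #rows are (from,to,becomes)
r_demo<-reclassify(raster(matrix(c(-9999,150,7000,500),nrow=2,ncol=2)),rclmat_demo)
values(r_demo) #out-of-range cells are NA; 150 and 500 remain
#The screening function itself would be applied as, e.g.:
#s_raster_screened<-screening_val_covariates_fun(tab_range,s_raster)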

#### ADD SCREENING FUNCTION FOR GHCND extracted!!!

#Remove NA for LC and CANHEIGHT
#ghcn$LC1[is.na(ghcn$LC1)]<-0
#ghcn$LC3[is.na(ghcn$LC3)]<-0
#ghcn$CANHEIGHT[is.na(ghcn$CANHEIGHT)]<-0
#ghcn$LC4[is.na(ghcn$LC4)]<-0
#ghcn$LC6[is.na(ghcn$LC6)]<-0
##ghcn$ELEV_SRTM[ghcn$ELEV_SRTM==-9999]<-NA
#dst<-subset(dst,dst$TMax>-15 & dst$TMax<40)
#dst<-subset(dst,dst$ELEV_SRTM>0) #This will drop two stations...or 24 rows
climate/research/oregon/interpolation/Database_stations_covariates_processing_function.R
##################    Data preparation for interpolation   #######################################
############################ Extraction of station data ##########################################


database_covariates_preparation<-function(list_param_prep){
  #This function performs queries on the Postgres ghcnd database for stations matching the             
  #interpolation area. It requires 13 inputs:                                           
  # 1)  db.name :  Postgres database name containing the meteorological stations
  # 2)  var: the variable of interest - "TMAX","TMIN" or "PRCP" 
  # 3)  range_years: range of records used in the daily interpolation, note that upper bound year is not included               
  # 4)  range_years_clim: range of records used in the monthly climatology interpolation, note that upper bound is not included
  # 5)  infile_reg_outline: region outline as a shape file - used in the interpolation  stage too                              
  # 6)  infile_ghncd_data: ghcnd stations locations as a textfile name with lat-long fields                                                                                   
  # 7)  infile_covariates: tif file of raster covariates for the interpolation area: it should have a local projection                                                                                           
  # 8)  CRS_locs_WGS84: longlat EPSG 4326 used as coordinates reference system (proj4) for stations locations
  # 9)  in_path: input path for covariates data and other files, this is also the output?
  # 10) out_path: output path created in master script
  # 11) covar_names: names of covariates used for the interpolation --may be removed later? (should be stored in the brick)
  # 12) qc_flags_stations: flags used to screen the data; at this stage only two values...
  # 13) out_prefix: output suffix added to output names--it is the same in the interpolation script
  #
  #The output is a list of six shapefile names produced by the function:
  #1) loc_stations: locations of stations as shapefile in EPSG 4326
  #2) loc_stations_ghcn: ghcn daily data for the year range of interpolation (locally projected)
  #3) daily_query_ghcn_data: ghcn daily data from daily query before application of quality flag
  #4) daily_covar_ghcn_data: ghcn daily data with covariates for the year range of interpolation (locally projected)
  #5) monthly_query_ghcn_data: ghcn daily data from monthly query before application of quality flag
  #6) monthly_covar_ghcn_data: ghcn monthly averaged data with covariates for the year range of interpolation (locally projected)
  
  #AUTHOR: Benoit Parmentier                                                                       
  #DATE: 05/21/2013                                                                                 
  #PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363--     
  #Comments and TODO
  #-Add buffer option...
  #-Add screening for value predicted: var
  ##################################################################################################
  
  ###Loading R library and packages: should it be read in before???   
  
  library(RPostgreSQL)
  library(sp)                                           # Spatial package with class definition by Bivand et al.
  library(spdep)                                          # Spatial package with methods and spatial stat. by Bivand et al.
  library(rgdal)                                          # GDAL wrapper for R, spatial utilities
  library(rgeos)
  library(rgdal)
  library(raster)
  library(rasterVis)
  library(maps)
  library(maptools)
  
  ### Functions used in the script
  
  format_s <-function(s_ID){
    #Format station IDs into a parenthesized tuple string that is used in a psql query.
    # Argument 1: vector of station IDs
    # Return: character string of quoted, comma-separated station IDs
    tx2<-s_ID
    tx2<-as.character(tx2)
    stat_list<-tx2
    temp<-shQuote(stat_list)
    t<-paste(temp, collapse= " ")
    t1<-gsub(" ", ",",t)
    sf_ID<-paste("(",t1,")",sep="") #string containing the station IDs to query
    return(sf_ID)
  }
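  
  #Illustrative example (not part of the original code): the string produced by format_s
  #for two hypothetical station IDs, ready to be used in the SQL "IN" clause below.
  #format_s(c("STATION_0001","STATION_0002"))  #-> "('STATION_0001','STATION_0002')"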
  
  #parsing input arguments
  
  db.name <- list_param_prep$db.name             #name of the Postgres database
  var <- list_param_prep$var                     #name of the variables to keep: TMIN, TMAX or PRCP
  year_start <-list_param_prep$range_years[1] #"2010"               #starting year for the query (included)
  year_end <-list_param_prep$range_years[2] #"2011"                 #end year for the query (excluded)
  year_start_clim <-list_param_prep$range_years_clim[1] #starting year for the monthly query to calculate the climatology (right bound not included in the range!!)
  year_end_clim <-list_param_prep$range_years_clim[2] #end year for the monthly query to calculate the climatology (right bound not included in the range!!)
  infile_reg_outline<- list_param_prep$infile_reg_outline  #This is the shape file of outline of the study area                                                      #It is an input/output of the covariate script
  infile_ghncd_data<-list_param_prep$infile_ghncd_data      #"/home/layers/data/climate/ghcn/v2.92-upd-2012052822/ghcnd-stations.txt"                              #This is the textfile of station locations from GHCND
  infile_covariates<-list_param_prep$infile_covariates #"covariates__venezuela_region__VE_01292013.tif" #this is an output from covariate script
  CRS_locs_WGS84<-list_param_prep$CRS_locs_WGS84 #Station coords WGS84: same as earlier
  in_path <- list_param_prep$in_path #e.g. "/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data/"
  out_path <- list_param_prep$out_path #e.g. "/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data/"
  covar_names<-list_param_prep$covar_names # names should be written in the tif file!!!
  qc_flags_stations <- list_param_prep$qc_flags_stations #flags allowed for the query from the GHCND??
  out_prefix<-list_param_prep$out_prefix #"_365d_GAM_fus5_all_lstd_03012013"                #User defined output prefix
  
  ## working directory is the same for input and output for this function  
  #setwd(in_path) 
  setwd(out_path)
  ##### STEP 1: Select stations in the study area
  
  filename<-sub(".shp","",infile_reg_outline)             #Removing the extension from file.
  interp_area <- readOGR(dsn=dirname(filename),basename(filename))
  CRS_interp<-proj4string(interp_area)         #Storing the coordinate information: geographic coordinates longlat WGS84
  
  #Read in GHCND database station locations
  dat_stat <- read.fwf(infile_ghncd_data, 
                       widths = c(11,9,10,7,3,31,4,4,6),fill=TRUE)
  colnames(dat_stat)<-c("STAT_ID","latitude","longitude","elev","state","name","GSNF","HCNF","WMOID")
  coords<- dat_stat[,c('longitude','latitude')]
  coordinates(dat_stat)<-coords
  proj4string(dat_stat)<-CRS_locs_WGS84 #this is the WGS84 projection
  #proj4string(dat_stat)<-CRS_interp
  interp_area_WGS84 <-spTransform(interp_area,CRS_locs_WGS84)         # Project the region outline to WGS84 (lat/long)
  
  # Spatial query to find relevant stations
  
  inside <- !is.na(over(dat_stat, as(interp_area_WGS84, "SpatialPolygons")))  #Finding stations contained in the current interpolation area
  stat_reg<-dat_stat[inside,]              #Selecting stations contained in the current interpolation area
  
  ####
  ##TODO: Add buffer option? 
  ####
  
  #### STEP 2: Connecting to the database and querying for relevant data 
  
  drv <- dbDriver("PostgreSQL")
  db <- dbConnect(drv, dbname=db.name)
  
  time1<-proc.time()    #Start stopwatch
  list_s<-format_s(stat_reg$STAT_ID)
  data2<-dbGetQuery(db, paste("SELECT *
                              FROM ghcn
                              WHERE element=",shQuote(var),
                              "AND year>=",year_start,
                              "AND year<",year_end,
                              "AND station IN ",list_s,";",sep=""))  #Selecting stations using a SQL query
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
  time_minutes<-time_duration[3]/60
  dbDisconnect(db)
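
  #For illustration (not part of the original code): with var="TMAX", year_start=2010 and
  #year_end=2011, the paste() call above assembles a query equivalent to
  #  SELECT * FROM ghcn WHERE element='TMAX' AND year>=2010 AND year<2011 AND station IN ('STATION_0001',...);
  #where the IN list comes from format_s() and the station IDs shown are hypothetical.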

  data_table<-merge(data2,as.data.frame(stat_reg), by.x = "station", by.y = "STAT_ID")
  
  #Transform the subset data frame into a spatial data frame and reproject
  data_reg<-data_table                               #Make a copy of the data frame
  coords<- data_reg[c('longitude','latitude')]              #Define coordinates in a data frame: clean up here!!
  coordinates(data_reg)<-coords                      #Assign coordinates to the data frame
  proj4string(data_reg)<-CRS_locs_WGS84                #Assign coordinates reference system in PROJ4 format
  data_reg<-spTransform(data_reg,CRS(CRS_interp))     #Project from WGS84 to new coord. system
  
  data_d <-data_reg  #data_d: daily data containing the query without screening
  #data_reg <-subset(data_d,mflag=="0" | mflag=="S") #should be input arguments!!
  #Make the screening depend on the quality flags supplied as arguments
  
  data_reg <-subset(data_d, mflag %in% qc_flags_stations) #screening using flags
  #data_reg2 <-subset(data_d,mflag==qc_flags_stations[1] | mflag==qc_flags_stations[2]) #screening using flags
  
  ##################################################################
  ### STEP 3: Save results and output in text files and shapefiles
  #browser()
  #Save shapefiles of the locations of meteorological stations in the study area
  #outfile1<-file.path(in_path,paste("stations","_",out_prefix,".shp",sep=""))
  outfile1<-file.path(out_path,paste("stations","_",out_prefix,".shp",sep=""))
  writeOGR(stat_reg,dsn= dirname(outfile1),layer= sub(".shp","",basename(outfile1)), driver="ESRI Shapefile",overwrite_layer=TRUE)
  #writeOGR(dst,dsn= ".",layer= sub(".shp","",outfile4), driver="ESRI Shapefile",overwrite_layer=TRUE)
  
  outfile2<-file.path(out_path,paste("ghcn_data_query_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file
  #writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
  writeOGR(data_d,dsn= dirname(outfile2),layer= sub(".shp","",basename(outfile2)), driver="ESRI Shapefile",overwrite_layer=TRUE)
  
  outfile3<-file.path(out_path,paste("ghcn_data_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file
  #writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
  writeOGR(data_reg,dsn= dirname(outfile3),layer= sub(".shp","",basename(outfile3)), driver="ESRI Shapefile",overwrite_layer=TRUE)
  
  ###################################################################
  ### STEP 4: Extract values at stations from the covariate stack of raster images
  #Eventually this step may be skipped if the covariates information is stored in the database...
  
  #s_raster<-stack(file.path(in_path,infile_covariates))                   #read in the data stack
  s_raster<-brick(infile_covariates)                   #read in the data stack
  names(s_raster)<-covar_names               #Assigning names to the raster layers: making sure it is included in the extraction
  stat_val<- extract(s_raster, data_reg)        #Extracting values from the raster stack for every point location in coords data frame.
  #stat_val_test<- extract(s_raster, data_reg,def=TRUE)
  
  #create a shape file and data_frame with names
  
  data_RST<-as.data.frame(stat_val)                                            #This creates a data frame with the values extracted
  data_RST_SDF<-cbind(data_reg,data_RST)
  coordinates(data_RST_SDF)<-coordinates(data_reg) #Transforming data_RST_SDF into a spatial point dataframe
  CRS_reg<-proj4string(data_reg)
  proj4string(data_RST_SDF)<-CRS_reg  #Need to assign coordinates...
  
  #Creating a date column
  date1<-ISOdate(data_RST_SDF$year,data_RST_SDF$month,data_RST_SDF$day) #Creating a date object from 3 separate columns
  date2<-gsub("-","",as.character(as.Date(date1)))
  data_RST_SDF$date<-date2                                              #Date format (year,month,day) is the following: "20100627"
  
  #This allows changing only one column name of the data.frame
  pos<-match("value",names(data_RST_SDF)) #Find column with name "value"
  if (var=="TMAX"){
    #names(data_RST_SDF)[pos]<-c("TMax")
    data_RST_SDF$value<-data_RST_SDF$value/10                #GHCN daily TMAX is stored in tenths of degrees C
  }
  
  if (var=="TMIN"){
    #names(data_RST_SDF)[pos]<-c("TMin")
    data_RST_SDF$value<-data_RST_SDF$value/10                #GHCN daily TMIN is stored in tenths of degrees C
  }
  
  #write out a new shapefile (including .prj component)
  outfile4<-file.path(out_path,paste("daily_covariates_ghcn_data_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file (year range from the daily query)
  writeOGR(data_RST_SDF,dsn= dirname(outfile4),layer= sub(".shp","",basename(outfile4)), driver="ESRI Shapefile",overwrite_layer=TRUE)
201
  
202
  ###############################################################
203
  ######## STEP 5: Preparing monthly averages from the ProstGres database
204
  
205
  drv <- dbDriver("PostgreSQL")
206
  db <- dbConnect(drv, dbname=db.name)
207
  
208
  #year_start_clim: set at the start of the script
209
  time1<-proc.time()    #Start stop watch
210
  list_s<-format_s(stat_reg$STAT_ID)
211
  data_m<-dbGetQuery(db, paste("SELECT *
212
                               FROM ghcn
213
                               WHERE element=",shQuote(var),
214
                               "AND year>=",year_start_clim,
215
                               "AND year<",year_end_clim,
216
                               "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
217
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
218
  time_minutes<-time_duration[3]/60
219
  dbDisconnect(db)
220
  #Clean out this section!!
221
  date1<-ISOdate(data_m$year,data_m$month,data_m$day) #Creating a date object from 3 separate column
222
  date2<-as.POSIXlt(as.Date(date1))
223
  data_m$date<-date2
224
  #Save the query data here...
225
  data_m<-merge(data_m, stat_reg, by.x="station", by.y="STAT_ID")   #Inner join all columns are retained
226
  #Extracting covariates from stack for the monthly dataset...
227
  coords<- data_m[c('longitude','latitude')]              #Define coordinates in a data frame
228
  coordinates(data_m)<-coords                      #Assign coordinates to the data frame
229
  proj4string(data_m)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
230
  data_m<-spTransform(data_m,CRS(CRS_interp))     #Project from WGS84 to new coord. system
231
  outfile5<-file.path(out_path,paste("monthly_query_ghcn_data_",var,"_",year_start_clim,"_",year_end_clim,out_prefix,".shp",sep=""))  #Name of the file
232
  writeOGR(data_m,dsn= dirname(outfile5),layer= sub(".shp","",basename(outfile5)), driver="ESRI Shapefile",overwrite_layer=TRUE)
233
  
234
  #In Venezuela and other regions where there are not many stations...mflag==S should be added..see Durenne etal.2010.
235

  
236
  #d<-subset(data_m,mflag==qc_flags_stations[1] | mflag==qc_flags_stations[2])
237
  d<-subset(data_m,mflag %in% qc_flags_stations)
238
  
239
  #Add screening here ...May need some screeing??? i.e. range of temp and elevation...
240
  
241
  d1<-aggregate(value~station+month, data=d, mean)  #Calculate monthly mean for every station in OR
242
  #d2<-aggregate(value~station+month, data=d, length)  #Calculate monthly mean for every station in OR
243
  is_not_na_fun<-function(x) sum(!is.na(x)) #count the number of available observation
244
  d2<-aggregate(value~station+month, data=d, is_not_na_fun)  #Calculate monthly mean for every station in OR
245
  #names(d2)[names(d2)=="value"] <-"nobs_station"
246
  d1$nobs_station<-d2$value
247
  dst<-merge(d1, stat_reg, by.x="station", by.y="STAT_ID")   #Inner join all columns are retained
248
  
249
  #This allows to change only one name of the data.frame
250
  pos<-match("value",names(dst)) #Find column with name "value"
251
  if (var=="TMAX"){
252
    names(dst)[pos]<-c("TMax")           #renaming the column "value" extracted from the Postgres database
253
    dst$TMax<-dst$TMax/10                #TMax is the average max temp for monthy data
254
  }
255
  
256
  if (var=="TMIN"){
257
    names(dst)[pos]<-c("TMin")
258
    dst$TMin<-dst$TMin/10                #TMin is the average min temp for monthy data
259
  }
260
  
261
  #Extracting covariates from stack for the monthly dataset...
262
  #names(dst)[5:6] <-c('latitude','longitude')
263
  coords<- dst[c('longitude','latitude')]              #Define coordinates in a data frame
264
  
265
  coordinates(dst)<-coords                      #Assign coordinates to the data frame
266
  proj4string(dst)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
267
  dst_month<-spTransform(dst,CRS(CRS_interp))     #Project from WGS84 to new coord. system
268
  
269
  stations_val<-extract(s_raster,dst_month,df=TRUE)  #extraction of the information at station location in a data frame
270
  #dst_extract<-spCbind(dst_month,stations_val) #this is in sinusoidal from the raster stack
271
  dst_extract<-cbind(dst_month,stations_val) #this is in sinusoidal from the raster stack
272
  dst<-dst_extract #problem!!! two column named elev!!! use elev_s??
273
  
274
  #browser()
275
  coords<- dst[c('x','y')]              #Define coordinates in a data frame, this is the local x,y
276
  index<-!is.na(coords$x)               #remove if NA, may need to revisit at a later stage
277
  dst<-dst[index,]
278
  coords<- dst[c('x','y')]              #Define coordinates in a data frame, this is the local x,y
279
  coordinates(dst)<-coords                    #Assign coordinates to the data frame
280
  proj4string(dst)<-CRS_interp        #Assign coordinates reference system in PROJ4 format
281
  
282
  ### ADD SCREENING HERE BEFORE WRITING OUT DATA
283
  #Covariates ok since screening done in covariate script
284
  #screening on var i.e. value, TMIN, TMAX...
285
  
286
  ####
287
  
288
  ####
289
  #write out a new shapefile (including .prj component)
290
  dst$OID<-1:nrow(dst) #need a unique ID?
291
  outfile6<-file.path(out_path,paste("monthly_covariates_ghcn_data_",var,"_",year_start_clim,"_",year_end_clim,out_prefix,".shp",sep=""))  #Name of the file
292
  writeOGR(dst,dsn= dirname(outfile6),layer= sub(".shp","",basename(outfile6)), driver="ESRI Shapefile",overwrite_layer=TRUE)
293
  
294
  ### list of outputs return
295
  
296
  outfiles_obj<-list(outfile1,outfile2,outfile3,outfile4,outfile5,outfile6)
297
  names(outfiles_obj)<- c("loc_stations","loc_stations_ghcn","daily_query_ghcn_data","daily_covar_ghcn_data","monthly_query_ghcn_data","monthly_covar_ghcn_data")
298
  save(outfiles_obj,file= file.path(out_path,paste("met_stations_outfiles_obj_",interpolation_method,"_", out_prefix,".RData",sep="")))
299
  
300
  return(outfiles_obj)
301
  
302
  #END OF FUNCTION # 
303
}
304

  
305
########## END OF SCRIPT ##########
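
#Illustrative sketch (not part of the original code): how a master script might assemble
#list_param_prep and call the function above. All values shown are hypothetical
#placeholders; only the element names follow the documented inputs 1-13.
#list_param_prep <- list(
#  db.name="ghcn", var="TMAX",
#  range_years=c("2010","2011"), range_years_clim=c("2000","2011"),
#  infile_reg_outline="OR83M_state_outline.shp",
#  infile_ghncd_data="/home/layers/data/climate/ghcn/v2.92-upd-2012052822/ghcnd-stations.txt",
#  infile_covariates="covariates__venezuela_region__VE_01292013.tif",
#  CRS_locs_WGS84=CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +towgs84=0,0,0"),
#  in_path=getwd(), out_path=getwd(),
#  covar_names=covar_names, qc_flags_stations=c("0","S"),
#  out_prefix="_test_run")
#met_obj <- database_covariates_preparation(list_param_prep)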
climate/research/oregon/interpolation/Database_stations_extraction_raster_covariates_processing.R
##################    Data preparation for interpolation   #######################################
############################ Extraction of station data ##########################################


database_covaratiates_preparation<-function(list_param_prep){
  #This function performs queries on the Postgres ghcnd database for stations matching the             
  #interpolation area. It requires 11 inputs:                                           
  # 1) db.name :  Postgres database name containing the meteorological stations
  # 2) var: the variable of interest - "TMAX","TMIN" or "PRCP" 
  # 3) range_years: range of records used in the daily interpolation, note that upper bound year is not included               
  # 4) range_years_clim: range of records used in the monthly climatology interpolation, note that upper bound is not included
  # 5) infile1: region outline as a shape file - used in the interpolation  stage too                              
  # 6) infile2: ghcnd stations locations as a textfile name with lat-long fields                                                                                   
  # 7) infile_covariates: tif file of raster covariates for the interpolation area: it should have a local projection                                                                                           
  # 8) CRS_locs_WGS84: longlat EPSG 4326 used as coordinates reference system (proj4) for stations locations
  # 9) in_path: input path for covariates data and other files, this is also the output?
  # 10) covar_names: names of covariates used for the interpolation --may be removed later? (should be stored in the brick)
  # 11) out_prefix: output suffix added to output names--it is the same in the interpolation script
  #
  #The output is a list of four shapefile names produced by the function:
  #1) loc_stations: locations of stations as shapefile in EPSG 4326
  #2) loc_stations_ghcn: ghcn daily data for the year range of interpolation (locally projected)
  #3) daily_covar_ghcn_data: ghcn daily data with covariates for the year range of interpolation (locally projected)
  #4) monthly_covar_ghcn_data: ghcn monthly data with covariates for the year range of interpolation (locally projected)
  
  #AUTHOR: Benoit Parmentier                                                                       
  #DATE: 03/01/2013                                                                                 
  #PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363--     
  #Comments and TODO
  #-Add buffer option...
  #-Add output path argument option
  #-Add qc flag options
  ##################################################################################################
  
  ###Loading R library and packages: should it be read in before???   
  
  library(RPostgreSQL)
  library(sp)                                           # Spatial package with class definition by Bivand et al.
  library(spdep)                                          # Spatial package with methods and spatial stat. by Bivand et al.
  library(rgdal)                                          # GDAL wrapper for R, spatial utilities
  library(rgeos)
  library(rgdal)
  library(raster)
  library(rasterVis)
  
  ### Functions used in the script
  
  format_s <-function(s_ID){
    #Format station IDs into a parenthesized tuple string that is used in a psql query.
    # Argument 1: vector of station IDs
    # Return: character string of quoted, comma-separated station IDs
    tx2<-s_ID
    tx2<-as.character(tx2)
    stat_list<-tx2
    temp<-shQuote(stat_list)
    t<-paste(temp, collapse= " ")
    t1<-gsub(" ", ",",t)
    sf_ID<-paste("(",t1,")",sep="") #string containing the station IDs to query
    return(sf_ID)
  }
  
  #parsing input arguments
  
  db.name <- list_param_prep$db.name             #name of the Postgres database
  var <- list_param_prep$var                     #name of the variables to keep: TMIN, TMAX or PRCP
  year_start <-list_param_prep$range_years[1] #"2010"               #starting year for the query (included)
  year_end <-list_param_prep$range_years[2] #"2011"                 #end year for the query (excluded)
  year_start_clim <-list_param_prep$range_years_clim[1] #starting year for the monthly query to calculate the climatology (right bound not included in the range!!)
  infile1<- list_param_prep$infile1  #This is the shape file of outline of the study area                                                      #It is an input/output of the covariate script
  infile2<-list_param_prep$infile2      #"/home/layers/data/climate/ghcn/v2.92-upd-2012052822/ghcnd-stations.txt"                              #This is the textfile of station locations from GHCND
  infile3<-list_param_prep$infile_covariates #"covariates__venezuela_region__VE_01292013.tif" #this is an output from covariate script
  CRS_locs_WGS84<-list_param_prep$CRS_locs_WGS84 #Station coords WGS84: same as earlier
  in_path <- list_param_prep$in_path #e.g. "/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data/"
  out_prefix<-list_param_prep$out_prefix #"_365d_GAM_fus5_all_lstd_03012013"                #User defined output prefix
  #qc_flags<-list_param_prep$qc_flags    #flags allowed for the query from the GHCND??
  covar_names<-list_param_prep$covar_names # names should be written in the tif file!!!
  
  ## working directory is the same for input and output for this function  
  setwd(in_path) 
  
  ##### STEP 1: Select stations in the study area
  
  filename<-sub(".shp","",infile1)             #Removing the extension from file.
  interp_area <- readOGR(".",filename)
  CRS_interp<-proj4string(interp_area)         #Storing the coordinate information: geographic coordinates longlat WGS84
  
  #Read in GHCND database station locations
  dat_stat <- read.fwf(infile2, 
                       widths = c(11,9,10,7,3,31,4,4,6),fill=TRUE)
  colnames(dat_stat)<-c("STAT_ID","lat","lon","elev","state","name","GSNF","HCNF","WMOID")
  coords<- dat_stat[,c('lon','lat')]
  coordinates(dat_stat)<-coords
  proj4string(dat_stat)<-CRS_locs_WGS84 #this is the WGS84 projection
  #proj4string(dat_stat)<-CRS_interp
  dat_stat2<-spTransform(dat_stat,CRS(CRS_interp))         # Project from WGS84 to new coord. system
  
  # Spatial query to find relevant stations
  inside <- !is.na(over(dat_stat2, as(interp_area, "SpatialPolygons")))  #Finding stations contained in the current interpolation area
  stat_reg<-dat_stat2[inside,]              #Selecting stations contained in the current interpolation area
  
  ####
  ##TODO: Add buffer option? 
  ####
  
  #### STEP 2: Connecting to the database and querying for relevant data 
  
  drv <- dbDriver("PostgreSQL")
  db <- dbConnect(drv, dbname=db.name)
  
  time1<-proc.time()    #Start stopwatch
  list_s<-format_s(stat_reg$STAT_ID)
  data2<-dbGetQuery(db, paste("SELECT *
                              FROM ghcn
                              WHERE element=",shQuote(var),
                              "AND year>=",year_start,
                              "AND year<",year_end,
                              "AND station IN ",list_s,";",sep=""))  #Selecting stations using a SQL query
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
  time_minutes<-time_duration[3]/60
  dbDisconnect(db)
  ###
  #Add month query and averages here...
  ###
  
  #data2 contains only 46 stations for the Venezuela area??
  data_table<-merge(data2,as.data.frame(stat_reg), by.x = "station", by.y = "STAT_ID")
  
  #Transform the subset data frame into a spatial data frame and reproject
  data_reg<-data_table                               #Make a copy of the data frame
  coords<- data_reg[c('lon','lat')]              #Define coordinates in a data frame: clean up here!!
  #Wrong label...it is in fact projected...
  coordinates(data_reg)<-coords                      #Assign coordinates to the data frame
  #proj4string(data3)<-locs_coord                  #Assign coordinates reference system in PROJ4 format
  proj4string(data_reg)<-CRS_locs_WGS84                #Assign coordinates reference system in PROJ4 format
  data_reg<-spTransform(data_reg,CRS(CRS_interp))     #Project from WGS84 to new coord. system
  
  ##################################################################
  ### STEP 3: Save results and output in text files and shapefiles
  #browser()
  #Save shapefiles of the locations of meteorological stations in the study area
  outfile1<-file.path(in_path,paste("stations","_",out_prefix,".shp",sep=""))
  writeOGR(stat_reg,dsn= dirname(outfile1),layer= sub(".shp","",basename(outfile1)), driver="ESRI Shapefile",overwrite_layer=TRUE)
  #writeOGR(dst,dsn= ".",layer= sub(".shp","",outfile4), driver="ESRI Shapefile",overwrite_layer=TRUE)
  
  outfile2<-file.path(in_path,paste("ghcn_data_",var,"_",year_start_clim,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file
  #writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
  writeOGR(data_reg,dsn= dirname(outfile2),layer= sub(".shp","",basename(outfile2)), driver="ESRI Shapefile",overwrite_layer=TRUE)
  
  ###################################################################
  ### STEP 4: Extract values at stations from the covariate stack of raster images
  #Eventually this step may be skipped if the covariates information is stored in the database...
  
  s_raster<-stack(infile3)                   #read in the data stack
  names(s_raster)<-covar_names               #Assigning names to the raster layers: making sure it is included in the extraction
  stat_val<- extract(s_raster, data_reg)        #Extracting values from the raster stack for every point location in coords data frame.
  
  #create a shape file and data_frame with names
  
  data_RST<-as.data.frame(stat_val)                                            #This creates a data frame with the values extracted
  data_RST_SDF<-cbind(data_reg,data_RST)
  coordinates(data_RST_SDF)<-coordinates(data_reg) #Transforming data_RST_SDF into a spatial point dataframe
  CRS_reg<-proj4string(data_reg)
  proj4string(data_RST_SDF)<-CRS_reg  #Need to assign coordinates...
  
  #Creating a date column
  date1<-ISOdate(data_RST_SDF$year,data_RST_SDF$month,data_RST_SDF$day) #Creating a date object from 3 separate columns
  date2<-gsub("-","",as.character(as.Date(date1)))
  data_RST_SDF$date<-date2                                              #Date format (year,month,day) is the following: "20100627"
  
  #This allows changing only one column name of the data.frame
  pos<-match("value",names(data_RST_SDF)) #Find column with name "value"
  if (var=="TMAX"){
    #names(data_RST_SDF)[pos]<-c("TMax")
    data_RST_SDF$value<-data_RST_SDF$value/10                #GHCN daily TMAX is stored in tenths of degrees C
  }
  
  #write out a new shapefile (including .prj component)
  outfile3<-file.path(in_path,paste("daily_covariates_ghcn_data_",var,"_",year_start,"_",year_end,out_prefix,".shp",sep=""))         #Name of the file (year range from the daily query)
  writeOGR(data_RST_SDF,dsn= dirname(outfile3),layer= sub(".shp","",basename(outfile3)), driver="ESRI Shapefile",overwrite_layer=TRUE)
  
  ###############################################################
  ######## STEP 5: Preparing monthly averages from the Postgres database
  
  drv <- dbDriver("PostgreSQL")
  db <- dbConnect(drv, dbname=db.name)
  
  #year_start_clim: set at the start of the script
  year_end<-2011
  time1<-proc.time()    #Start stopwatch
  list_s<-format_s(stat_reg$STAT_ID)
  data_m<-dbGetQuery(db, paste("SELECT *
                               FROM ghcn
                               WHERE element=",shQuote(var),
                               "AND year>=",year_start_clim,
                               "AND year<",year_end,
                               "AND station IN ",list_s,";",sep=""))  #Selecting stations using a SQL query
  time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
  time_minutes<-time_duration[3]/60
  dbDisconnect(db)
  #Clean out this section!!
  date1<-ISOdate(data_m$year,data_m$month,data_m$day) #Creating a date object from 3 separate columns
  date2<-as.POSIXlt(as.Date(date1))
  data_m$date<-date2
  #In Venezuela and other regions where there are not many stations...mflag==S should be added...see Durenne et al. 2010.
  #d<-subset(data_m,year>=2000 & mflag=="0" ) #Selecting dataset 2000-2010 with good quality: 193 stations
  d<-subset(data_m,mflag=="0" | mflag=="S") #should be input arguments!!
  #May need some screening??? i.e. range of temp and elevation...
  d1<-aggregate(value~station+month, data=d, mean)  #Calculate the monthly mean for every station
  id<-as.data.frame(unique(d1$station))     #Unique stations for 2000-2010: 193, but 7 lose their monthly avg    
  
  dst<-merge(d1, stat_reg, by.x="station", by.y="STAT_ID")   #Inner join; all columns are retained
  
  #This allows changing only one column name of the data.frame
  pos<-match("value",names(dst)) #Find column with name "value"
  if (var=="TMAX"){
    names(dst)[pos]<-c("TMax")
    dst$TMax<-dst$TMax/10                #TMax is the average max temp for monthly data
  }
  
  #Extracting covariates from stack for the monthly dataset...
  coords<- dst[c('lon','lat')]              #Define coordinates in a data frame
  coordinates(dst)<-coords                      #Assign coordinates to the data frame
  proj4string(dst)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
  dst_month<-spTransform(dst,CRS(CRS_interp))     #Project from WGS84 to new coord. system
  
  stations_val<-extract(s_raster,dst_month)  #extraction of the information at station locations
  stations_val<-as.data.frame(stations_val)
  dst_extract<-cbind(dst_month,stations_val) #this is in the local projection of the raster stack
  dst<-dst_extract
  
  coords<- dst[c('x','y')]              #Define coordinates in a data frame, this is the local x,y
  coordinates(dst)<-coords                    #Assign coordinates to the data frame
  proj4string(dst)<-projection(s_raster)        #Assign coordinates reference system in PROJ4 format
  
  ####
  #write out a new shapefile (including .prj component)
  dst$OID<-1:nrow(dst) #need a unique ID?
  outfile4<-file.path(in_path,paste("monthly_covariates_ghcn_data_",var,"_",year_start_clim,"_",year_end,out_prefix,".shp",sep=""))  #Name of the file
  writeOGR(dst,dsn= dirname(outfile4),layer= sub(".shp","",basename(outfile4)), driver="ESRI Shapefile",overwrite_layer=TRUE)
  
  ### List of outputs returned
  
  outfiles_obj<-list(outfile1,outfile2,outfile3,outfile4)
  names(outfiles_obj)<- c("loc_stations","loc_stations_ghcn","daily_covar_ghcn_data","monthly_covar_ghcn_data")
  return(outfiles_obj)
  
  #END OF FUNCTION # 
}

##### END OF SCRIPT ##########
climate/research/oregon/interpolation/Database_stations_extraction_raster_covariates_processing_region.R
1
##################    Data preparation for interpolation   #######################################
2
############################ Extraction of station data ##########################################
3
#This script perform queries on the Postgres database ghcn for stations matching the             
4
#interpolation area. It requires the following inputs:                                           
5
# 1)the text file ofGHCND  stations from NCDC matching the database version release              
6
# 2) a shape file of the study area with geographic coordinates: lonlat WGS84
7
# 3) a new coordinate system, which can be provided as an argument
8
# 4) the variable of interest: "TMAX", "TMIN" or "PRCP"
9
# 5) the location of the raster covariate stack.
10
#The outputs are text files and a shape file of a time subset of the database                    
11
#AUTHOR: Benoit Parmentier                                                                       
12
#DATE: 02/08/2013                                                                                 
13
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#363--     
14
#Comments and TODO
15
#-Add buffer option...
16
#-Add calculation of monthly mean...
17
##################################################################################################
18

  
19
###Loading R library and packages   
20

  
21
library(RPostgreSQL)
22
library(sp)                                           # Spatial package with class definitions by Bivand et al.
23
library(spdep)                                          # Spatial package with methods and spatial statistics by Bivand et al.
24
library(rgdal)                                          # GDAL wrapper for R, spatial utilities
25
library(rgeos)
26
library(rgdal)
27
library(raster)
28
library(rasterVis)
29

  
30
### Parameters and arguments
31

  
32
db.name <- "ghcn"                #name of the Postgres database
33
var <- "TMAX"                    #name of the variables to keep: TMIN, TMAX or PRCP
34
year_start<-"2010"               #starting year for the query (included)
35
year_end<-"2011"                 #end year for the query (excluded)
36
infile1<- "outline_venezuela_region__VE_01292013.shp"      #This is the shape file of outline of the study area.                                              #It is projected alreaday
37
infile2<-"ghcnd-stations.txt"                             #This is the textfile of station locations from GHCND
38
infile3<-"covariates__venezuela_region__VE_01292013.tif" #this is an output from covariate script
39

  
40
new_proj<-"+proj=sinu +lon_0=0 +x_0=0 +y_0=0 +a=6371007.181 +b=6371007.181 +units=m +no_defs"
41
locs_coord<-CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +towgs84=0,0,0")
42
CRS_locs_WGS84<-"+proj=longlat +ellps=WGS84 +datum=WGS84 +towgs84=0,0,0"
43
##Paths to inputs and output
44
in_path <- "/home/parmentier/Data/benoit_test"
45
in_path <- "/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/input_data/"
46
out_path<- "/home/parmentier/Data/IPLANT_project/Venezuela_interpolation/Venezuela_01142013/output_data/"
47
ghcnd_path<- "/home/layers/data/climate/ghcn/v2.92-upd-2012052822"
48
setwd(in_path) 
49
out_suffix<-"y2010_2010_VE_02082013"                                                 #User defined output prefix
50
out_region_name<-"_venezuela_region"
51
#out_suffix<-"_VE_01292013"
52

  
53
### Functions used in the script
54

  
55
format_s <-function(s_ID){
56
  #Format station IDs into a parenthesized tuple string that is used in a psql query.
57
  # Argument 1: vector of station IDs
58
  # Return: character string of station IDs formatted as a SQL tuple
59
  tx2<-s_ID
60
  tx2<-as.character(tx2)
61
  stat_list<-tx2
62
  temp<-shQuote(stat_list)
63
  t<-paste(temp, collapse= " ")
64
  t1<-gsub(" ", ",",t)
65
  sf_ID<-paste("(",t1,")",sep="") #string containing the station IDs to query
66
  return(sf_ID)
67
}
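#Example of the returned format (hypothetical station IDs):
#format_s(c("USC00351234","USC00355678"))   #returns "('USC00351234','USC00355678')"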
68

  
69
############ BEGIN: START OF THE SCRIPT #################
70

  
71
##### STEP 1: Select station in the study area
72

  
73
filename<-sub(".shp","",infile1)             #Removing the extension from file.
74
interp_area <- readOGR(".",filename)
75
CRS_interp<-proj4string(interp_area)         #Storing the coordinate reference system of the study area outline
76

  
77
dat_stat <- read.fwf(file.path(ghcnd_path,"ghcnd-stations.txt"), widths = c(11,9,10,7,3,31,4,4,6),fill=TRUE)
78
colnames(dat_stat)<-c("STAT_ID","lat","lon","elev","state","name","GSNF","HCNF","WMOID")
79
coords<- dat_stat[,c('lon','lat')]
80
coordinates(dat_stat)<-coords
81
proj4string(dat_stat)<-locs_coord #this is the WGS84 projection
82
#proj4string(dat_stat)<-CRS_interp
83
dat_stat2<-spTransform(dat_stat,CRS(new_proj))         # Project from WGS84 to new coord. system
84

  
85
# Spatial query to find relevant stations
86
inside <- !is.na(over(dat_stat2, as(interp_area, "SpatialPolygons")))  #Finding stations contained in the current interpolation area
87
stat_reg<-dat_stat2[inside,]              #Selecting stations contained in the current interpolation area
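#Quick check (a sketch, not in the original script):
#sum(inside)        #number of stations retained; equivalently nrow(stat_reg)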
88

  
89
#Quick visualization of station locations
90
plot(interp_area, axes =TRUE)
91
plot(stat_reg, pch=1, col="red", cex= 0.7, add=TRUE)
92
#plot(data3,pch=1,col="blue",cex=3,add=TRUE)
93
#legend("topleft", pch=1,col="red",bty="n",title= "Stations",cex=1.6)
94
#only 357 stations for Venezuela??
95

  
96
####
97
##Add buffer option? 
98
####
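#Hedged sketch of the buffer option flagged above (not in the original script). It assumes
#interp_area is in the projected system (new_proj, metres), as the over() call above implies,
#and uses a hypothetical buffer distance 'buffer_dist':
#buffer_dist <- 100000                                       #e.g. 100 km, an assumed value
#interp_area_buf <- gBuffer(interp_area, width=buffer_dist)  #rgeos is loaded above
#inside_buf <- !is.na(over(dat_stat2, interp_area_buf))      #stations within the buffered area
#stat_reg_buf <- dat_stat2[inside_buf,]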
99

  
100
#### STEP 2: Connecting to the database and query for relevant data 
101

  
102
drv <- dbDriver("PostgreSQL")
103
db <- dbConnect(drv, dbname=db.name)
104

  
105
time1<-proc.time()    #Start stop watch
106
list_s<-format_s(stat_reg$STAT_ID)
107
data2<-dbGetQuery(db, paste("SELECT *
108
      FROM ghcn
109
      WHERE element=",shQuote(var),
110
      "AND year>=",year_start,
111
      "AND year<",year_end,
112
      "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
113
time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
114
time_minutes<-time_duration[3]/60
115

  
116
###
117
#Add month query and averages here...
118
###
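#Hedged sketch of the monthly query suggested above (assumes the same ghcn table and columns as in STEP 2;
#the averaging is pushed into the database with an aggregate query):
#data_month <- dbGetQuery(db, paste("SELECT station, month, AVG(value) AS value FROM ghcn",
#                                   " WHERE element=", shQuote(var),
#                                   " AND year>=", year_start, " AND year<", year_end,
#                                   " AND station IN ", list_s,
#                                   " GROUP BY station, month;", sep=""))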
119

  
120
#data2 contains only 46 stations for the Venezuela area??
121
data_table<-merge(data2,as.data.frame(stat_reg), by.x = "station", by.y = "STAT_ID")
122

  
123
#Transform the subset data frame in a spatial data frame and reproject
124
data_reg<-data_table                               #Make a copy of the data frame
125
coords<- data_reg[c('lon','lat')]              #Define coordinates in a data frame: clean up here!!
126
                                                   #Note: despite the 'lon','lat' names, these columns are in fact projected coordinates...
127
coordinates(data_reg)<-coords                      #Assign coordinates to the data frame
128
#proj4string(data3)<-locs_coord                  #Assign coordinates reference system in PROJ4 format
129
proj4string(data_reg)<-locs_coord                #Assign coordinates reference system in PROJ4 format
130
data_reg<-spTransform(data_reg,CRS(new_proj))     #Project from WGS84 to new coord. system
131

  
132
plot(interp_area, axes =TRUE)
133
plot(stat_reg, pch=1, col="red", cex= 0.7, add=TRUE)
134
plot(data_reg,pch=2,col="blue",cex=2,add=TRUE)
135

  
136
##################################################################
137
### STEP 3: Save results and output to a text file and a shape file
138

  
139
#Save a textfile of the locations of meteorological stations in the study area
140
write.table(as.data.frame(stat_reg), file=file.path(in_path,paste("stations",out_region_name,"_",
141
                                                          out_suffix,".txt",sep="")),sep=",")
142
outfile<-paste("stations",out_region_name,"_",
143
               out_suffix,sep="")
144
writeOGR(stat_reg,dsn= ".",layer= outfile, driver="ESRI Shapefile",overwrite_layer=TRUE)
145

  
146
outfile<-paste("ghcn_data_",var,out_suffix,sep="")         #Name of the file
147
#writeOGR(data_proj, paste(outfile, "shp", sep="."), outfile, driver ="ESRI Shapefile") #Note that the layer name is the file name without extension
148
writeOGR(data_reg,dsn= ".",layer= outfile, driver="ESRI Shapefile",overwrite_layer=TRUE)
149

  
150
###################################################################
151
### STEP 4: Extract values at stations from covariates stack of raster images
152
#Eventually this step may be skipped if the covariates information is stored in the database...
153

  
154
#The names of covariates can be changed...
155
rnames<-c("x","y","lon","lat","N","E","N_w","E_w","elev","slope","aspect","CANHEIGHT","DISTOC")
156
lc_names<-c("LC1","LC2","LC3","LC4","LC5","LC6","LC7","LC8","LC9","LC10","LC11","LC12")
157
lst_names<-c("mm_01","mm_02","mm_03","mm_04","mm_05","mm_06","mm_07","mm_08","mm_09","mm_10","mm_11","mm_12",
158
             "nobs_01","nobs_02","nobs_03","nobs_04","nobs_05","nobs_06","nobs_07","nobs_08",
159
             "nobs_09","nobs_10","nobs_11","nobs_12")
160

  
161
covar_names<-c(rnames,lc_names,lst_names)
162

  
163
s_raster<-stack(infile3)                   #read in the data stack
164
names(s_raster)<-covar_names               #Assigning names to the raster layers so they are included in the extraction output
165
stat_val<- extract(s_raster, data_reg)        #Extracting values from the raster stack for every point location in coords data frame.
166

  
167
#create a shape file and data_frame with names
168

  
169
data_RST<-as.data.frame(stat_val)                                            #This creates a data frame with the values extracted
170
data_RST_SDF<-cbind(data_reg,data_RST)
171
coordinates(data_RST_SDF)<-coordinates(data_reg) #Transforming data_RST_SDF into a spatial point dataframe
172
CRS_reg<-proj4string(data_reg)
173
proj4string(data_RST_SDF)<-CRS_reg  #Need to assign the coordinate reference system...
174

  
175
#Creating a date column
176
date1<-ISOdate(data_RST_SDF$year,data_RST_SDF$month,data_RST_SDF$day) #Creating a date object from 3 separate columns
177
date2<-gsub("-","",as.character(as.Date(date1)))
178
data_RST_SDF$date<-date2                                              #Date format (year,month,day) is the following: "20100627"
179

  
180
#This allows to change only one name of the data.frame
181
pos<-match("value",names(data_RST_SDF)) #Find column with name "value"
182
if (var=="TMAX"){
183
  #names(data_RST_SDF)[pos]<-c("TMax")
184
  data_RST_SDF$value<-data_RST_SDF$value/10                #GHCN TMAX values are stored in tenths of degrees C; convert to degrees C
185
}
186

  
187
#write out a new shapefile (including .prj component)
188
outfile<-paste("daily_covariates_ghcn_data_",var,out_suffix,sep="")         #Name of the file
189
writeOGR(data_RST_SDF,dsn= ".",layer= outfile, driver="ESRI Shapefile",overwrite_layer=TRUE)
190

  
191
###############################################################
192
######## STEP 5: Preparing monthly averages from the Postgres database
193

  
194
drv <- dbDriver("PostgreSQL")
195
db <- dbConnect(drv, dbname=db.name)
196

  
197
year_start<-2000
198
year_end<-2011
199
time1<-proc.time()    #Start stop watch
200
list_s<-format_s(stat_reg$STAT_ID)
201
data_m<-dbGetQuery(db, paste("SELECT *
202
                            FROM ghcn
203
                            WHERE element=",shQuote(var),
204
                            "AND year>=",year_start,
205
                            "AND year<",year_end,
206
                            "AND station IN ",list_s,";",sep=""))  #Selecting station using a SQL query
207
time_duration<-proc.time()-time1             #Time for the query may be long given the size of the database
208
time_minutes<-time_duration[3]/60
209

  
210
# do this work outside of (before) this function
211
# to avoid making a copy of the data frame inside the function call
212
date1<-ISOdate(data_m$year,data_m$month,data_m$day) #Creating a date object from 3 separate columns
213
date2<-as.POSIXlt(as.Date(date1))
214
data_m$date<-date2
215
#In Venezuela and other regions where there are not many stations, mflag=="S" should also be included; see Durenne et al. 2010.
216
#d<-subset(data_m,year>=2000 & mflag=="0" ) #Selecting dataset 2000-2010 with good quality: 193 stations
217
d<-subset(data_m,mflag=="0" | mflag=="S")
218
#May need some screening, e.g. on the range of temperature and elevation...
219
d1<-aggregate(value~station+month, data=d, mean)  #Calculate monthly mean for every station in the region
220
id<-as.data.frame(unique(d1$station))     #Unique stations for years 2000-2010: 193 in OR, but 7 are lost in the monthly averaging
221

  
222
dst<-merge(d1, stat_reg, by.x="station", by.y="STAT_ID")   #Inner join; all columns are retained
223

  
224
#This allows changing only one column name of the data.frame
225
pos<-match("value",names(dst)) #Find column with name "value"
226
if (var=="TMAX"){
227
  names(dst)[pos]<-c("TMax")
228
  dst$TMax<-dst$TMax/10                #TMax is the monthly average of daily max temp; GHCN values are in tenths of degrees C
229
}
230
#dstjan=dst[dst$month==9,]  #dst contains the monthly averages for tmax for every station over 2000-2010
231

  
232
#Extracting covariates from stack for the monthly dataset...
233
coords<- dst[c('lon','lat')]              #Define coordinates in a data frame
234
coordinates(dst)<-coords                      #Assign coordinates to the data frame
235
proj4string(dst)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
236
dst_month<-spTransform(dst,CRS(CRS_interp))     #Project from WGS84 to new coord. system
237

  
238
stations_val<-extract(s_raster,dst_month)  #extraction of the information at station locations
239
stations_val<-as.data.frame(stations_val)
240
dst_extract<-cbind(dst_month,stations_val) #the extracted values are in sinusoidal, the projection of the raster stack
241
dst<-dst_extract
242

  
243
coords<- dst[c('x','y')]              #Define coordinates in a data frame, this is the local x,y
244
coordinates(dst)<-coords                    #Assign coordinates to the data frame
245
proj4string(dst)<-projection(s_raster)        #Assign coordinates reference system in PROJ4 format
246

  
247
####
248
#write out a new shapefile (including .prj component)
249
outfile<-paste("monthly_covariates_ghcn_data_",var,out_suffix,sep="")         #Name of the file
250
dst$OID<-1:nrow(dst) #need a unique ID?
251
writeOGR(dst,dsn= ".",layer= outfile, driver="ESRI Shapefile",overwrite_layer=TRUE)
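#Hedged check (not in the original script): the shapefile just written can be read back to verify it, e.g.
#dst_check <- readOGR(".", outfile)    #'outfile' is the layer name defined above, without the .shp extension
#summary(dst_check)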
252

  
253
##### END OF SCRIPT ##########
climate/research/oregon/interpolation/GAM_CAI_analysis_raster_prediction_multisampling.R
1
######################################## METHOD COMPARISON #######################################
2
############################ Constant sampling for GAM CAI method #####################################
3
#This script interpolates tmax values using MODIS LST and GHCND station data                     #
4
#for the interpolation area. It requires the text file of stations and a shape file of the study area.   #
5
#Note that the projection for both GHCND and study area is lonlat WGS84.                         #
6
#The method is assessed using constant sampling, with variation of the validation sample under different  #
7
#hold out proportions.                                                                           #
8
#AUTHOR: Benoit Parmentier                                                                       #
9
#DATE: 12/27/2012                                                                                #
10
#PROJECT: NCEAS INPLANT: Environment and Organisms --TASK#491--                                  #
11
###################################################################################################
12

  
13
###Loading R library and packages                                                      
14
library(gtools)                                         # loading some useful tools 
15
library(mgcv)                                           # GAM package by Simon Wood
16
library(sp)                                             # Spatial package with class definitions by Bivand et al.
17
library(spdep)                               # Spatial package with methods and spatial statistics by Bivand et al.
18
library(rgdal)                               # GDAL wrapper for R, spatial utilities
19
library(gstat)                               # Kriging and co-kriging by Pebesma et al.
20
library(fields)                              # NCAR Spatial Interpolation methods such as kriging, splines
21
library(raster)                              # Hijmans et al. package for raster processing
22
library(rasterVis)
23
library(parallel)                            # Urbanek S. and Ripley B., package for multi-core & parallel processing
24
library(reshape)
25

  
26
### Parameters and argument
27

  
28
infile1<- "ghcn_or_tmax_covariates_06262012_OR83M.shp"             #GHCN shapefile containing variables for modeling 2010                 
29
#infile2<-"list_10_dates_04212012.txt"                     #List of 10 dates for the regression
30
infile2<-"list_365_dates_04212012.txt"
31
infile3<-"LST_dates_var_names.txt"                        #LST dates name
32
infile4<-"models_interpolation_05142012.txt"              #Interpolation model names
33
infile5<-"mean_day244_rescaled.rst"                       #Raster or grid for the locations of predictions
34
#infile6<-"lst_climatology.txt"
35
infile6<-"LST_files_monthly_climatology.txt"
36
inlistf<-"list_files_05032012.txt"                        #Stack of images containing the Covariates
37

  
38
path<-"/home/parmentier/Data/IPLANT_project/data_Oregon_stations_10242012_CAI" #Atlas location
39
setwd(path)
40

  
41
#Station location for the study area
42
stat_loc<-read.table(paste(path,"/","location_study_area_OR_0602012.txt",sep=""),sep=",", header=TRUE)
43
#GHCN Database for 1980-2010 for study area (OR) 
44
data3<-read.table(paste(path,"/","ghcn_data_TMAXy1980_2010_OR_0602012.txt",sep=""),sep=",", header=TRUE)
45

  
46
nmodels<-9                #number of models running
47
y_var_name<-"dailyTmax"   #climate variable interpolated
48
climgam<-1                                             #if 1, then GAM is run on the climatology rather than the daily deviation surface...
49
predval<-1                                              #if 1, produce raster prediction
50
prop<-0.3                                               #Proportion of data withheld for validation
51

  
52
seed_number<- 100                                             #Seed number for random sampling; if seed_number<0, no seed is used
53
#out_prefix<-"_365d_GAM_CAI2_const_10222012_"                 #User defined output prefix
54
#out_prefix<-"_365d_GAM_CAI2_const_all_lstd_10272012"         #User defined output prefix
55
out_prefix<-"_365d_GAM_CAI4_all_12272012"               #User defined output prefix
56

  
57
bias_val<-0            #if value 1 then daily training data is used in the bias surface rather than all monthly stations (added on 07/11/2012)
58
bias_prediction<-1     #if value 1 then use GAM for the BIAS prediction, otherwise GAM predicts y_var (daily tmax) directly
59
nb_sample<-1           #number of times random sampling is repeated for every hold-out proportion
60
prop_min<-0.3          #if prop_min=prop_max and step=0 then predictions are done for the number of dates...
61
prop_max<-0.3
62
step<-0         
63
constant<-0            #if value 1 then use the same sample used in the first date for interpolation over the set of dates
64
#projection used in the interpolation of the study area
65
CRS_interp<-"+proj=lcc +lat_1=43 +lat_2=45.5 +lat_0=41.75 +lon_0=-120.5 +x_0=400000 +y_0=0 +ellps=GRS80 +units=m +no_defs";
66
CRS_locs_WGS84<-CRS("+proj=longlat +ellps=WGS84 +datum=WGS84 +towgs84=0,0,0") #Station coords WGS84
67

  
68
#This can be entered as a text file or option later; OK for running now (12/07/2012)
69
list_formulas<-vector("list",nmodels)
70

  
71
list_formulas[[1]] <- as.formula("y_var~ s(ELEV_SRTM)", env=.GlobalEnv)
72
list_formulas[[2]] <- as.formula("y_var~ s(LST)", env=.GlobalEnv)
73
list_formulas[[3]] <- as.formula("y_var~ s(ELEV_SRTM,LST)", env=.GlobalEnv)
74
list_formulas[[4]] <- as.formula("y_var~ s(lat)+s(lon)+s(ELEV_SRTM)", env=.GlobalEnv)
75
list_formulas[[5]] <- as.formula("y_var~ s(lat,lon,ELEV_SRTM)", env=.GlobalEnv)
76
list_formulas[[6]] <- as.formula("y_var~ s(lat,lon)+s(ELEV_SRTM)+s(Northness_w,Eastness_w)+s(LST)", env=.GlobalEnv)
77
list_formulas[[7]] <- as.formula("y_var~ s(lat,lon)+s(ELEV_SRTM)+s(Northness_w,Eastness_w)+s(LST)+s(LC1)", env=.GlobalEnv)
78
list_formulas[[8]] <- as.formula("y_var~ s(lat,lon)+s(ELEV_SRTM)+s(Northness_w,Eastness_w)+s(LST)+s(LC3)", env=.GlobalEnv)
79
list_formulas[[9]] <- as.formula("y_var~ s(x_OR83M,y_OR83M)", env=.GlobalEnv)
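#Hedged sketch (not in the original script): each formula can be fitted with mgcv::gam on a
#training subset; 'data_s' is a hypothetical data frame holding y_var and the covariates named above.
#mod1 <- gam(list_formulas[[1]], data=data_s)
#summary(mod1)$r.sq      #adjusted R-squared of the fit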
80

  
81
#source("GAM_CAI_function_multisampling_10252012.R")
82
source("GAM_CAI_function_multisampling_12072012.R")
83

  
84
############ START OF THE SCRIPT ##################
85

  
86
###Reading the station data and setting up for models' comparison
87
filename<-sub(".shp","",infile1)             #Removing the extension from file.
88
ghcn<-readOGR(".", filename)                 #reading shapefile 
89

  
90
CRS<-proj4string(ghcn)                       #Storing projection information (ellipsoid, datum,etc.)
91

  
92
mean_LST<- readGDAL(infile5)                 #Reading the whole raster in memory. This provides a grid for kriging
93
proj4string(mean_LST)<-CRS                   #Assigning coordinate information to prediction grid.
94

  
95
ghcn <- transform(ghcn,Northness = cos(ASPECT*pi/180)) #Adding a variable to the dataframe
96
ghcn <- transform(ghcn,Eastness = sin(ASPECT*pi/180))  #adding variable to the dataframe.
97
ghcn <- transform(ghcn,Northness_w = sin(slope*pi/180)*cos(ASPECT*pi/180)) #Adding a variable to the dataframe
98
ghcn <- transform(ghcn,Eastness_w = sin(slope*pi/180)*sin(ASPECT*pi/180))  #adding variable to the dataframe.
99

  
100
#Remove NA for LC and CANHEIGHT
101
ghcn$LC1[is.na(ghcn$LC1)]<-0
102
ghcn$LC3[is.na(ghcn$LC3)]<-0
103
ghcn$CANHEIGHT[is.na(ghcn$CANHEIGHT)]<-0
104
ghcn$LC4[is.na(ghcn$LC4)]<-0
105
ghcn$LC6[is.na(ghcn$LC6)]<-0
106

  
107
#Use file.path to construct paths for OS-independent code? !!!
108
dates<-readLines(file.path(path,infile2))
109
#dates <-readLines(paste(path,"/",infile2, sep=""))
110
LST_dates <-readLines(paste(path,"/",infile3, sep=""))
111
models <-readLines(paste(path,"/",infile4, sep=""))
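#Hedged sketch of the file.path idea noted above (equivalent, OS-independent path construction):
#LST_dates <- readLines(file.path(path, infile3))
#models    <- readLines(file.path(path, infile4))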
112

  
113
##Extracting the variables values from the raster files                                             
114

  
115
lines<-read.table(paste(path,"/",inlistf,sep=""), sep=" ")                  #Column 1 contains the names of raster files
116
inlistvar<-lines[,1]
117
inlistvar<-paste(path,"/",as.character(inlistvar),sep="")
118
covar_names<-as.character(lines[,2])                                         #Column two contains short names for covariates
119

  
120
s_raster<- stack(inlistvar)                                                  #Creating a stack of raster images from the list of variables.
121
layerNames(s_raster)<-covar_names                                            #Assigning names to the raster layers
122
projection(s_raster)<-CRS
123

  
124
#stat_val<- extract(s_raster, ghcn3)                                          #Extracting values from the raster stack for every point location in coords data frame.
125
pos<-match("ASPECT",layerNames(s_raster)) #Find column with name "value"
126
r1<-raster(s_raster,layer=pos)             #Select layer from stack
127
pos<-match("slope",layerNames(s_raster)) #Find column with name "value"
128
r2<-raster(s_raster,layer=pos)             #Select layer from stack
129
N<-cos(r1*pi/180)
130
E<-sin(r1*pi/180)
131
Nw<-sin(r2*pi/180)*cos(r1*pi/180)   #Adding a variable to the dataframe
132
Ew<-sin(r2*pi/180)*sin(r1*pi/180)   #Adding variable to the dataframe.
133

  
134
pos<-match("LC1",layerNames(s_raster)) #Find column with name "value"
135
LC1<-raster(s_raster,layer=pos)             #Select layer from stack
136
s_raster<-dropLayer(s_raster,pos)
137
LC1[is.na(LC1)]<-0                      #NA must be set to zero.
138
pos<-match("LC3",layerNames(s_raster)) #Find column with name "value"
139
LC3<-raster(s_raster,layer=pos)             #Select layer from stack
140
s_raster<-dropLayer(s_raster,pos)
141
LC3[is.na(LC3)]<-0
142

  
143
#Modification added to account for other land cover
144

  
145
pos<-match("LC4",layerNames(s_raster)) #Find column with name "value"
146
LC4<-raster(s_raster,layer=pos)             #Select layer from stack
147
s_raster<-dropLayer(s_raster,pos)
148
LC4[is.na(LC4)]<-0
149

  
150
pos<-match("LC6",layerNames(s_raster)) #Find column with name "value"
151
LC6<-raster(s_raster,layer=pos)             #Select layer from stack
152
s_raster<-dropLayer(s_raster,pos)
153
LC6[is.na(LC6)]<-0
154

  
155
LC_s<-stack(LC1,LC3,LC4,LC6)
156
layerNames(LC_s)<-c("LC1_forest","LC3_grass","LC4_crop","LC6_urban")
157
#plot(LC_s)
158

  
159
pos<-match("CANHEIGHT",layerNames(s_raster)) #Find column with name "value"
160
CANHEIGHT<-raster(s_raster,layer=pos)             #Select layer from stack
161
s_raster<-dropLayer(s_raster,pos)
162
CANHEIGHT[is.na(CANHEIGHT)]<-0
163
pos<-match("ELEV_SRTM",layerNames(s_raster)) #Find column with name "ELEV_SRTM"
164
ELEV_SRTM<-raster(s_raster,layer=pos)             #Select layer from stack on 10/30
165
s_raster<-dropLayer(s_raster,pos)
166
ELEV_SRTM[ELEV_SRTM <0]<-NA
167

  
168
xy<-coordinates(r1)  #get x and y projected coordinates...
169
xy_latlon<-project(xy, CRS, inv=TRUE) # find lat/long for projected coordinates (or pixels...)
170
lon<-raster(xy_latlon) #Transform the matrix into a raster object; ncol, nrow and extent are set from r1 below
171
ncol(lon)<-ncol(r1)
172
nrow(lon)<-nrow(r1)
173
extent(lon)<-extent(r1)
174
projection(lon)<-CRS  #At this stage this is still an empty raster with nrow=536 and ncol=745
175
lat<-lon
176
values(lon)<-xy_latlon[,1]
177
values(lat)<-xy_latlon[,2]
178

  
179
r<-stack(N,E,Nw,Ew,lon,lat,LC1,LC3,LC4,LC6, CANHEIGHT,ELEV_SRTM)
180
rnames<-c("Northness","Eastness","Northness_w","Eastness_w", "lon","lat","LC1","LC3","LC4","LC6","CANHEIGHT","ELEV_SRTM")
181
layerNames(r)<-rnames
182
s_raster<-addLayer(s_raster, r)
183

  
184
#s_sgdf<-as(s_raster,"SpatialGridDataFrame") #Conversion to spatial grid data frame
185

  
186
####### Preparing LST stack of climatology...
187

  
188
#l=list.files(pattern="mean_month.*rescaled.rst")
189
l <-readLines(paste(path,"/",infile6, sep=""))
190
molst<-stack(l)  #Creating a raster stack...
191
#setwd(old)
192
molst<-molst-273.15  #K->C (0 deg C = 273.15 K)          #LST stack of monthly averages...
193
idx <- seq(as.Date('2010-01-15'), as.Date('2010-12-15'), 'month')
194
molst <- setZ(molst, idx)
195
layerNames(molst) <- month.abb
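#Hedged sketch (not in the original script): the monthly LST climatology can be inspected with rasterVis, e.g.
#levelplot(molst)    #one panel per month, using the month.abb layer names set above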
196

  
197
######  Preparing tables for model assessment: specific diagnostic/metrics
198

  
199
#Model assessment: specific diagnostics/metrics
200
results_m1<- matrix(1,1,nmodels+3)  
201
results_m2<- matrix(1,1,nmodels+3)
202
results_m3<- matrix(1,1,nmodels+3)
203
#results_RMSE_f<- matrix(1,length(models)+3)
204

  
205
#Model assessment: general diagnostic/metrics 
206
results_RMSE <- matrix(1,1,nmodels+4)
207
results_MAE <- matrix(1,1,nmodels+4)
208
results_ME <- matrix(1,1,nmodels+4)       #There are 8+1 models
209
results_R2 <- matrix(1,1,nmodels+4)       #Coef. of determination for the validation dataset
210

  
211
results_RMSE_f<- matrix(1,1,nmodels+4)    #RMSE fit, RMSE for the training dataset
212
results_MAE_f <- matrix(1,1,nmodels+4)
213
results_R2_f<-matrix(1,1,nmodels+4)
214
######## Preparing monthly averages from the Postgres database
215

  
216
# do this work outside of (before) this function
217
# to avoid making a copy of the data frame inside the function call
218
date1<-ISOdate(data3$year,data3$month,data3$day) #Creating a date object from 3 separate columns
219
date2<-as.POSIXlt(as.Date(date1))
220
data3$date<-date2
221
d<-subset(data3,year>=2000 & mflag=="0" ) #Selecting dataset 2000-2010 with good quality: 193 stations
222
#May need some screening, e.g. on the range of temperature and elevation...
223
d1<-aggregate(value~station+month, data=d, mean)  #Calculate monthly mean for every station in OR
224
id<-as.data.frame(unique(d1$station))     #Unique stations in OR for years 2000-2010: 193, but 7 are lost in the monthly averaging
225

  
226
dst<-merge(d1, stat_loc, by.x="station", by.y="STAT_ID")   #Inner join; all columns are retained
227

  
228
#This allows changing only one column name of the data.frame
229
pos<-match("value",names(dst)) #Find column with name "value"
230
names(dst)[pos]<-c("TMax")
231
dst$TMax<-dst$TMax/10                #TMax is the monthly average of daily max temp; GHCN values are in tenths of degrees C
232
#dstjan=dst[dst$month==9,]  #dst contains the monthly averages for tmax for every station over 2000-2010
233

  
234
#Extracting covariates from stack for the monthly dataset...
235
coords<- dst[c('lon','lat')]              #Define coordinates in a data frame
236
coordinates(dst)<-coords                      #Assign coordinates to the data frame
237
proj4string(dst)<-CRS_locs_WGS84                  #Assign coordinates reference system in PROJ4 format
238
dst_month<-spTransform(dst,CRS(CRS_interp))     #Project from WGS84 to new coord. system
239

  
240
stations_val<-extract(s_raster,dst_month)  #extraction of the information at station locations
241
stations_val<-as.data.frame(stations_val)
242
dst_extract<-cbind(dst_month,stations_val)
243
dst<-dst_extract
244
#Now clean and screen monthly values
245
dst_all<-dst
246
dst<-subset(dst,dst$TMax>-15 & dst$TMax<40)
247
dst<-subset(dst,dst$ELEV_SRTM>0) #This will drop two stations...or 24 rows
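#Hedged check (not in the original script): nrow(dst_all)-nrow(dst) gives the number of monthly
#records dropped by the two screening steps above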
248

  
249
######### Preparing daily values for training and testing
250

  
251
#Screening for bad values: value is tmax in this case
252
#ghcn$value<-as.numeric(ghcn$value)
253
ghcn_all<-ghcn
254
ghcn_test<-subset(ghcn,ghcn$value>-150 & ghcn$value<400)
255
ghcn_test2<-subset(ghcn_test,ghcn_test$ELEV_SRTM>0)
256
ghcn<-ghcn_test2
257
#coords<- ghcn[,c('x_OR83M','y_OR83M')]
258

  
259
##Sampling: training and testing sites...
260

  
261
if (seed_number>0) {
262
  set.seed(seed_number)                        #Using a seed number allows results based on random numbers to be compared...
263
}
264
nel<-length(dates)
265
dates_list<-vector("list",nel) #list of one row data.frame
266

  
267
prop_range<-(seq(from=prop_min,to=prop_max,by=step))*100
268
sn<-length(dates)*nb_sample*length(prop_range)
269

  
270
for(i in 1:length(dates)){
271
  d_tmp<-rep(dates[i],nb_sample*length(prop_range)) #repeating same date
272
  s_nb<-rep(1:nb_sample,length(prop_range))         #number of random samples per proportion
273
  prop_tmp<-sort(rep(prop_range, nb_sample))
... This diff was truncated because it exceeds the maximum size that can be displayed.
