#*********************************************************
#  Written by: Bradley University
#  Dr. Steven Dolins
#  Use: Transform and Load Raw Specimen
#       GBIF Data from GBIF file into the
# 		Staging Table (StagingRawSpecimen).
#  Version: 2.0
#  Date: November 09, 2009
#*********************************************************

<?php	
## ======== Connect to database
include('connect.php');

## ======== Spacing for buttons in index.php
echo "<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br>";

## ======== Identify data file
$Validation_check = 0;
$GBIFFile = 'GBIF.txt';
$duplicateEntries=0;
$successEntries=0;
$validation_errors=0;
$recordCount = 0;

## ======== Load file as a character array - each entry is one line of text with newline still attached
$GBIFFile = fopen($GBIFFile, 'r');

##==== Pull off a single line from file and add it to array ====
	
//$line = fgets($GBIFFile);	//UNCOMMENT ONLY IF TITLES ARE INCLUDED IN FIRST LINE OF FILE
while($line = fgets($GBIFFile))
{
	$line = trim($line);
	$arr = explode("\t", $line);
	$recordCount++;

	## ======== Ensure that data enetered is a full tuple (has all columns associated with GBIF data)
	
	if(count($arr) != 30)
	{
		reportError(NULL, NULL, $recordCount, "Incomplete tuple, ignored");
		continue;
	}

	## ======== Clean up values in array and check data consistency
	
	checkArray();

	## ======== Populate an array which holds only the values to be inserted

	$arrayInsert['InstitutionCD'] = $arr[5];
	$arrayInsert['CollectionCD'] = $arr[6];
	$arrayInsert['CatalogNO'] = $arr[7];
	$arrayInsert['DBSourceName'] = "GBIF";
	$arrayInsert['LastDateModified'] = $arr[8];
	$arrayInsert['ScientificName'] = $arr[12];
	$arrayInsert['ScientificNameOriginal'] = $arr[10];
	$arrayInsert['Family'] = $arr[14];
	$arrayInsert['Genus'] = $arr[16];
	$arrayInsert['AuthorOfScientificName'] = $arr[11];
	$arrayInsert['Country'] = $arr[17];
	$arrayInsert['StateProvince'] = $arr[20];
	$arrayInsert['County'] = $arr[19];
	$arrayInsert['MinimumElevationInMeter'] = $arr[26];
	$arrayInsert['MaximumElevationInMeter'] = $arr[27];
	$arrayInsert['LatestDateCollected'] = $arr[4];
	$arrayInsert['Collector'] = $arr[2];
	$arrayInsert['DecimalLatitude'] = $arr[21];
	$arrayInsert['DecimalLongitude'] = $arr[23];
	$arrayInsert['Locality'] = $arr[18];
	$arrayInsert['IdentifiedBy'] = $arr[9];
	$arrayInsert['VerbatimLatitude'] = $arr[22];
	$arrayInsert['VerbatimLongitude'] = $arr[24];
	$arrayInsert['SpecificEpithet'] = $arr[28];
	$arrayInsert['InfraspecificEpithet'] = $arr[29];
	$arrayInsert['GBIFFamilyOriginal'] = $arr[13];
	$arrayInsert['GBIFGenusOriginal'] = $arr[15];
	$arrayInsert['InfraspecificRank'] = $arr[30];
	

	## ======== Ensure primary key values and not null values (CollectionCD, CatalogNO, InstitutionCD) are not null

	if(is_null($arrayInsert['InstitutionCD']) OR is_null($arrayInsert['CatalogNO']) OR is_null($arrayInsert['InstitutionCD']))
	{
		reportError(NULL, NULL, NULL, "One or more non-null value is NULL");
		continue;
	} 

	## ======== Filter out MOBOT and NYBG Data since those are inserted from another file

	if ( strstr($arrayInsert['InstitutionCD'], "MOBOT") or strstr($arrayInsert['InstitutionCD'] , "NYBG"))
	{
		reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], $arrayInsert['InstitutionCD'], "Institution code contains either MOBOT or NYBG");
		continue;
	}
		
	## ======== Generate SQL code to insert desired information

	$sql = sprintf("INSERT INTO StagingRawSpecimen	
			(
				InstitutionCD,
				CollectionCD,
				CatalogNO,
				DBSourceName,
				LastDateModified,
				ScientificName,
				ScientificNameOriginal,
				GBIFFamilyOriginal,
				Family,
				GBIFGenusOriginal,
				Genus,
				AuthorOfScientificName,
				Country,
				StateProvince,
				County,
				MinimumElevationInMeter,
				MaximumElevationInMeter,
				LatestDateCollected,
				Collector,
				DecimalLatitude,
				DecimalLongitude,
				Locality,
				IdentifiedBy,
				VerbatimLatitude,
				VerbatimLongitude,
				SpecificEpithet,
				InfraspecificEpithet,
				InfraspecificRank
				) 
			VALUES
				(
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s,
				%s
			)
			",
			insertSQLData($arrayInsert['InstitutionCD']),
			insertSQLData($arrayInsert['CollectionCD']),
			insertSQLData($arrayInsert['CatalogNO']),
			insertSQLData($arrayInsert['DBSourceName']),
			insertSQLData($arrayInsert['LastDateModified']),
			insertSQLData($arrayInsert['ScientificName']),
			insertSQLData($arrayInsert['ScientificNameOriginal']),
			insertSQLData($arrayInsert['GBIFFamilyOriginal']),
			insertSQLData($arrayInsert['Family']),
			insertSQLData($arrayInsert['GBIFGenusOriginal']),
			insertSQLData($arrayInsert['Genus']),
			insertSQLData($arrayInsert['AuthorOfScientificName']),
			insertSQLData($arrayInsert['Country']),
			insertSQLData($arrayInsert['StateProvince']),
			insertSQLData($arrayInsert['County']),
			insertSQLData($arrayInsert['MinimumElevationInMeter']),
			insertSQLData($arrayInsert['MaximumElevationInMeter']),
			insertSQLData($arrayInsert['LatestDateCollected']),
			insertSQLData($arrayInsert['Collector']),
			insertSQLData($arrayInsert['DecimalLatitude']),
			insertSQLData($arrayInsert['DecimalLongitude']),
			insertSQLData($arrayInsert['Locality']),
			insertSQLData($arrayInsert['IdentifiedBy']),
			insertSQLData($arrayInsert['VerbatimLatitude']),
			insertSQLData($arrayInsert['VerbatimLongitude']),
			insertSQLData($arrayInsert['SpecificEpithet']),
			insertSQLData($arrayInsert['InfraspecificEpithet']),
			insertSQLData($arrayInsert['InfraspecificRank'])
		);
		
		//echo("<br> <br> $sql <br> <br>");

	## ======== Check whether a tuple with the variables you're trying to enter has already been entered (trying to insert duplicate tuple)

	if(mysql_query($sql))
	{
		$successEntries++;
	}
	else if(mysql_errno() == 1062)
	{
		$duplicateEntries++;
		//reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, mysql_error());
		continue;
	}
	else
	{
		reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, mysql_error());
	}
		

	## ======== Validate data entered
	if ($Validation_check) // Validation Check status
{	
	$validation_query = sprintf(
				"SELECT * FROM StagingRawSpecimen WHERE
				CollectionCD %s
				AND CatalogNO %s",
			selectSQLData($arrayInsert['CollectionCD']),
			selectSQLData($arrayInsert['CatalogNO'])
			);

	//echo("<br> <br> $validation_query <br> <br>");
		
	$validation_result = mysql_query($validation_query) OR reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, mysql_error());
	
	if(mysql_num_rows($validation_result) > 1)	## IF more than 1 record has these properties, then NOT UNIQUE				 
	{
		reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, "Validation result not unique");
		continue;	//error, skip the rest of validation
	}

	if(mysql_num_rows($validation_result) < 1)
	{
		reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, "Validation result does not exist");
		continue;	//error, skip the rest of validation
	}		

	else //(num_rows == 1)
	{
		$validation_result_row = mysql_fetch_assoc($validation_result) OR DIE("Error - no result found");		
	
		$CatalogNO = $arrayInsert['CatalogNO'];
		$CollectionCD = $arrayInsert['CollectionCD'];
		foreach($arrayInsert AS $key => $data)
		{
			## ======== check if the result found in the database matches that entered (Check both exact string match and numerical equality)

			if((strcasecmp($validation_result_row[$key], $data) != 0) AND (!checkDifference($data, $validation_result_row[$key], 5)))
			{
				reportError($CollectionCD, $CatalogNO, $key, "Result does not match data entered |--| $data |--| $validation_result_row[$key] |--|");	
			}
		}
	}
}	//end while loop
}	
## ======== Display results to user
	
if($duplicateEntries >= 1)
{
	echo "<br>" . $duplicateEntries . " record(s) has been skipped, key already exists<br>";
}

if($successEntries >= 1)
{
	echo "<br>" . $successEntries . " record(s) has been successfully inserted<br>";
}
echo("<br> $validation_errors Validation Errors");

// =============================
// ======== END PROGRAM ========
// =============================
	

## ======================================
## Name: checkArray
## Function: Checks the data form the file and formats it for entry into the staging table
## Internal Variables: 
## Global Variables: $arr
## Calls Function: None
## Called by: Main loop
## Written by: Andrew Becker, Mark Sheehan
## Last Modified: 07-22-09
## =======================================
	 
function checkArray()
{
	global $arr;
	for($a=0; $a<31; $a++)
	{
		if(!(array_key_exists($a, $arr)))
		{
			$arr[$a] = ' ';
		}

		## ==== Trim white spaces and newline off of front and end of string (necessary for values at end of line)
			
		$arr[$a] = trim($arr[$a], ' ');
		
		## ==== Replace Empty values ('') with null keyword
		
		if($arr[$a] == '' OR $arr[$a] == '\N')	
		{
			$arr[$a] = NULL;
		}
			
		if(($a == 4 OR $a == 8) AND $arr[$a] != NULL)	## CHANGED JUN 22 09 - IF date/time AND NOT NULL, then convert it to a form that mysql/PHP recognizes - YYYY-MM-DD HH-MM-SS
		{
			if($a == 8)
			{
				//==== Actual data in GBIF file in format "MM-DD-YYYY HH:MM"
				//Start formatting date
				
				$dateFormat = explode(' ', $arr[$a]);
				$date = $dateFormat[0];
				$time = $dateFormat[1];
				$date = explode('-', $date);	//break date up into MM DD YYYY
				$month = str_pad($date[1], 2, '0', STR_PAD_LEFT);	//if only entered as 5/27/2008, need it in form 05/27/2008 so need padding on left
				$day = str_pad($date[2], 2, '0', STR_PAD_LEFT);
				$year = $date[0];
			
				$time = explode(':', $time);
				$hour = $time[0];
				$hour = str_pad($hour, 2, '0', STR_PAD_LEFT);	//same issue as above
				$minute = $time[1];
				$second = "00";	//seconds not given, assume 00
				$arr[$a] = sprintf("%s-%s-%s %s:%s:%s", $year, $month, $day, $hour, $minute, $second); 
				//echo(" $arr[$a] <br>");
			}
			else if($a == 4)
			{
				
				//==== Actual data in GBIF file in format MM/DD/YYYY OR YYYY-MM-DD
				
				if(stripos($arr[$a], '/'))	// if in MM/DD/YYYY format
				{
					$dateFormat = explode('/', $arr[$a]);
					$month = str_pad($dateFormat[0], 2, '0', STR_PAD_LEFT);
					$day = str_pad($dateFormat[1], 2, '0', STR_PAD_LEFT);
					$year = $dateFormat[2];

					$arr[$a] = sprintf("%s-%s-%s", $year, $month, $day);	
				}
				else if(stripos($arr[$a], '-'))	//if in YYYY-MM-DD format
				{
					$dateFormat = explode('-', $arr[$a]);
					$month = str_pad($dateFormat[1], 2, '0', STR_PAD_LEFT);
					$day = str_pad($dateFormat[2], 2, '0', STR_PAD_LEFT);
					$year = $dateFormat[0];
						
					$arr[$a] = sprintf("%s-%s-%s", $year, $month, $day);
				}
				else
				{
					reportError($arr[6], $arr[7], $arr[$a], "Date format for ||$arr[$a]|| not found.");
				}
			}
		}
		else if($a == 26) // if elevation range is given as XXX-YYY in one field instead of as minimum and maximum
		{
			$minMax = explode('-', $arr[$a]);
			if(sizeof($minMax) > 1)
			{
				$arr[26] = $minMax[0];
				$arr[27] = $minMax[1];
			}
		}
		else if ($a == 12) //  To get the species and subspeices from the scientific name, Dhaval Shah
		{
			$spe_subspe = explode(' ',$arr[$a]);
			if(sizeof($spe_subspe) > 1 and !(is_numeric($spe_subspe[1])))
			{
				$arr[28] = trim($spe_subspe[1]);
				if(sizeof($spe_subspe) > 3 and !(is_numeric($spe_subspe[2])))
				{
					
					if ((strstr($spe_subspe[2], "var") or strstr($spe_subspe[2], "subsp") or strstr($spe_subspe[2], "ssp") or strstr($spe_subspe[2], "sbsp") or strstr($spe_subspe[2], "forma") or strstr($spe_subspe[2], "fo") or strstr($spe_subspe[2], "f") or strstr($spe_subspe[2], "cv")) and (strlen($spe_subspe[2]) < 6))
					{
						if ( strstr($spe_subspe[2], "ssp"))
						{
							$spe_subspe[2] = "subsp.";
						}
						$arr[30] = trim($spe_subspe[2]);
						$arr[29] = trim($spe_subspe[3]);
					}
					else
					{
						$arr[30] = NULL;
						$arr[29] = NULL;
					}
				}
				else
				{
					$arr[30] = NULL;
					$arr[29] = NULL;
				}
									
			}
			else
			{
				$arr[28] = NULL;
				$arr[29] = NULL;
				$arr[30] = NULL;
			}
		}
	}
}

## ========================================
## Name: reportError
## Function: creates a detailed error report in errorLog
## Internal Variables:
## Globals: $recordCount, $GBIFFile
## Calls Function: None
## Called by: Main Loop
## Written by: Mark Sheehan
## Last Updated: 07-22-09
## =======================================
	
function reportError($CollectionCD, $CatalogNo, $columnName, $Error)
{
	GLOBAL $recordCount;
	GLOBAL $GBIFFile;
	//echo("Creating error log for $CollectionCD | $CatalogNo | $Error <br><br>");
	$errorReport = sprintf(
		"INSERT INTO ErrorLog (CollectionCD, CatalogNO, RawDataFileName, RawRecordNO, StagingTableName, StagingColumnName, ErrorDescription)
		VALUES (%s, %s, %s, %s, %s, %s, %s)",
		insertSQLData($CollectionCD), 
		insertSQLData($CatalogNo),
		insertSQLData("GBIFFile.txt"),
		insertSQLData($recordCount),
		insertSQLData("StagingRawSpecimen"),
		insertSQLData($columnName),
		insertSQLData($Error)
	);
	//echo ("<br> $errorReport <br>");
	mysql_query($errorReport) OR DIE("Failed to enter log into error report." . mysql_error());
}

## =======================================
## Name: checkDifference
## Function: determines whether the difference between two numbers is close to 0 at given precision
## Internal Variables:
## Globals: None
## Calls Function: None
## Called by: main loop
## Written by: Mark Sheehan
## Last updated: 07-29-09
## ======================================

function checkDifference($firstNum, $secondNum, $precision)
{
	if(abs($firstNum - $secondNum) < 2 * pow(10, -$precision))
	{
		return TRUE;
	}
	else
	{
		return FALSE;
	}
}

## =======================================
## Name: InsertSQLData
## Function: Formats data to be inserted for SQL insert query
## Internal Variables:
## Globals: None
## Calls Function: None
## Called by: Main Loop
## Written by: Mark Sheehan
## Last Updated: 07-22-09
## ======================================

function InsertSQLData($string)
{
	if($string == '' OR $string == NULL)
	{
		return('NULL');
	}
	else
	{
		return(sprintf("'%s'", mysql_real_escape_string($string)));
	}
}

## =======================================
## Name: SelectSQLData
## Function: Formats data to be inserted into SQL select query
## Internal Variables:
## Globals: None
## Calls Function: None
## Called by: Main Loop
## Written by: Mark Sheehan
## Last Updated: 07-22-09
## ========================================

function SelectSQLData($string)
{
	if($string == '' OR $string == NULL)
	{
		return("IS NULL");
	}
	else
	{
		return(sprintf("= '%s'", mysql_real_escape_string($string)));
	}
}
	
?>
