Project

General

Profile

1
#*********************************************************
2
#  Written by: Bradley University
3
#  Dr. Steven Dolins
4
#  Use: Transform and Load Raw Specimen
5
#       GBIF Data from GBIF file into the
6
# 		Staging Table (StagingRawSpecimen).
7
#  Version: 2.0
8
#  Date: November 09, 2009
9
#*********************************************************
10

    
11
<?php	
12
## ======== Connect to database
13
include('connect.php');
14

    
15
## ======== Spacing for buttons in index.php
16
echo "<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br>";
17

    
18
## ======== Identify data file
19
$Validation_check = 0;
20
$GBIFFile = 'GBIF.txt';
21
$duplicateEntries=0;
22
$successEntries=0;
23
$validation_errors=0;
24
$recordCount = 0;
25

    
26
## ======== Load file as a character array - each entry is one line of text with newline still attached
27
$GBIFFile = fopen($GBIFFile, 'r');
28

    
29
##==== Pull off a single line from file and add it to array ====
30
	
31
//$line = fgets($GBIFFile);	//UNCOMMENT ONLY IF TITLES ARE INCLUDED IN FIRST LINE OF FILE
32
while($line = fgets($GBIFFile))
33
{
34
	$line = trim($line);
35
	$arr = explode("\t", $line);
36
	$recordCount++;
37

    
38
	## ======== Ensure that data enetered is a full tuple (has all columns associated with GBIF data)
39
	
40
	if(count($arr) != 30)
41
	{
42
		reportError(NULL, NULL, $recordCount, "Incomplete tuple, ignored");
43
		continue;
44
	}
45

    
46
	## ======== Clean up values in array and check data consistency
47
	
48
	checkArray();
49

    
50
	## ======== Populate an array which holds only the values to be inserted
51

    
52
	$arrayInsert['InstitutionCD'] = $arr[5];
53
	$arrayInsert['CollectionCD'] = $arr[6];
54
	$arrayInsert['CatalogNO'] = $arr[7];
55
	$arrayInsert['DBSourceName'] = "GBIF";
56
	$arrayInsert['LastDateModified'] = $arr[8];
57
	$arrayInsert['ScientificName'] = $arr[12];
58
	$arrayInsert['ScientificNameOriginal'] = $arr[10];
59
	$arrayInsert['Family'] = $arr[14];
60
	$arrayInsert['Genus'] = $arr[16];
61
	$arrayInsert['AuthorOfScientificName'] = $arr[11];
62
	$arrayInsert['Country'] = $arr[17];
63
	$arrayInsert['StateProvince'] = $arr[20];
64
	$arrayInsert['County'] = $arr[19];
65
	$arrayInsert['MinimumElevationInMeter'] = $arr[26];
66
	$arrayInsert['MaximumElevationInMeter'] = $arr[27];
67
	$arrayInsert['LatestDateCollected'] = $arr[4];
68
	$arrayInsert['Collector'] = $arr[2];
69
	$arrayInsert['DecimalLatitude'] = $arr[21];
70
	$arrayInsert['DecimalLongitude'] = $arr[23];
71
	$arrayInsert['Locality'] = $arr[18];
72
	$arrayInsert['IdentifiedBy'] = $arr[9];
73
	$arrayInsert['VerbatimLatitude'] = $arr[22];
74
	$arrayInsert['VerbatimLongitude'] = $arr[24];
75
	$arrayInsert['SpecificEpithet'] = $arr[28];
76
	$arrayInsert['InfraspecificEpithet'] = $arr[29];
77
	$arrayInsert['GBIFFamilyOriginal'] = $arr[13];
78
	$arrayInsert['GBIFGenusOriginal'] = $arr[15];
79
	$arrayInsert['InfraspecificRank'] = $arr[30];
80
	
81

    
82
	## ======== Ensure primary key values and not null values (CollectionCD, CatalogNO, InstitutionCD) are not null
83

    
84
	if(is_null($arrayInsert['InstitutionCD']) OR is_null($arrayInsert['CatalogNO']) OR is_null($arrayInsert['InstitutionCD']))
85
	{
86
		reportError(NULL, NULL, NULL, "One or more non-null value is NULL");
87
		continue;
88
	} 
89

    
90
	## ======== Filter out MOBOT and NYBG Data since those are inserted from another file
91

    
92
	if ( strstr($arrayInsert['InstitutionCD'], "MOBOT") or strstr($arrayInsert['InstitutionCD'] , "NYBG"))
93
	{
94
		reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], $arrayInsert['InstitutionCD'], "Institution code contains either MOBOT or NYBG");
95
		continue;
96
	}
97
		
98
	## ======== Generate SQL code to insert desired information
99

    
100
	$sql = sprintf("INSERT INTO StagingRawSpecimen	
101
			(
102
				InstitutionCD,
103
				CollectionCD,
104
				CatalogNO,
105
				DBSourceName,
106
				LastDateModified,
107
				ScientificName,
108
				ScientificNameOriginal,
109
				GBIFFamilyOriginal,
110
				Family,
111
				GBIFGenusOriginal,
112
				Genus,
113
				AuthorOfScientificName,
114
				Country,
115
				StateProvince,
116
				County,
117
				MinimumElevationInMeter,
118
				MaximumElevationInMeter,
119
				LatestDateCollected,
120
				Collector,
121
				DecimalLatitude,
122
				DecimalLongitude,
123
				Locality,
124
				IdentifiedBy,
125
				VerbatimLatitude,
126
				VerbatimLongitude,
127
				SpecificEpithet,
128
				InfraspecificEpithet,
129
				InfraspecificRank
130
				) 
131
			VALUES
132
				(
133
				%s,
134
				%s,
135
				%s,
136
				%s,
137
				%s,
138
				%s,
139
				%s,
140
				%s,
141
				%s,
142
				%s,
143
				%s,
144
				%s,
145
				%s,
146
				%s,
147
				%s,
148
				%s,
149
				%s,
150
				%s,
151
				%s,
152
				%s,
153
				%s,
154
				%s,
155
				%s,
156
				%s,
157
				%s,
158
				%s,
159
				%s,
160
				%s
161
			)
162
			",
163
			insertSQLData($arrayInsert['InstitutionCD']),
164
			insertSQLData($arrayInsert['CollectionCD']),
165
			insertSQLData($arrayInsert['CatalogNO']),
166
			insertSQLData($arrayInsert['DBSourceName']),
167
			insertSQLData($arrayInsert['LastDateModified']),
168
			insertSQLData($arrayInsert['ScientificName']),
169
			insertSQLData($arrayInsert['ScientificNameOriginal']),
170
			insertSQLData($arrayInsert['GBIFFamilyOriginal']),
171
			insertSQLData($arrayInsert['Family']),
172
			insertSQLData($arrayInsert['GBIFGenusOriginal']),
173
			insertSQLData($arrayInsert['Genus']),
174
			insertSQLData($arrayInsert['AuthorOfScientificName']),
175
			insertSQLData($arrayInsert['Country']),
176
			insertSQLData($arrayInsert['StateProvince']),
177
			insertSQLData($arrayInsert['County']),
178
			insertSQLData($arrayInsert['MinimumElevationInMeter']),
179
			insertSQLData($arrayInsert['MaximumElevationInMeter']),
180
			insertSQLData($arrayInsert['LatestDateCollected']),
181
			insertSQLData($arrayInsert['Collector']),
182
			insertSQLData($arrayInsert['DecimalLatitude']),
183
			insertSQLData($arrayInsert['DecimalLongitude']),
184
			insertSQLData($arrayInsert['Locality']),
185
			insertSQLData($arrayInsert['IdentifiedBy']),
186
			insertSQLData($arrayInsert['VerbatimLatitude']),
187
			insertSQLData($arrayInsert['VerbatimLongitude']),
188
			insertSQLData($arrayInsert['SpecificEpithet']),
189
			insertSQLData($arrayInsert['InfraspecificEpithet']),
190
			insertSQLData($arrayInsert['InfraspecificRank'])
191
		);
192
		
193
		//echo("<br> <br> $sql <br> <br>");
194

    
195
	## ======== Check whether a tuple with the variables you're trying to enter has already been entered (trying to insert duplicate tuple)
196

    
197
	if(mysql_query($sql))
198
	{
199
		$successEntries++;
200
	}
201
	else if(mysql_errno() == 1062)
202
	{
203
		$duplicateEntries++;
204
		//reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, mysql_error());
205
		continue;
206
	}
207
	else
208
	{
209
		reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, mysql_error());
210
	}
211
		
212

    
213
	## ======== Validate data entered
214
	if ($Validation_check) // Validation Check status
215
{	
216
	$validation_query = sprintf(
217
				"SELECT * FROM StagingRawSpecimen WHERE
218
				CollectionCD %s
219
				AND CatalogNO %s",
220
			selectSQLData($arrayInsert['CollectionCD']),
221
			selectSQLData($arrayInsert['CatalogNO'])
222
			);
223

    
224
	//echo("<br> <br> $validation_query <br> <br>");
225
		
226
	$validation_result = mysql_query($validation_query) OR reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, mysql_error());
227
	
228
	if(mysql_num_rows($validation_result) > 1)	## IF more than 1 record has these properties, then NOT UNIQUE				 
229
	{
230
		reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, "Validation result not unique");
231
		continue;	//error, skip the rest of validation
232
	}
233

    
234
	if(mysql_num_rows($validation_result) < 1)
235
	{
236
		reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, "Validation result does not exist");
237
		continue;	//error, skip the rest of validation
238
	}		
239

    
240
	else //(num_rows == 1)
241
	{
242
		$validation_result_row = mysql_fetch_assoc($validation_result) OR DIE("Error - no result found");		
243
	
244
		$CatalogNO = $arrayInsert['CatalogNO'];
245
		$CollectionCD = $arrayInsert['CollectionCD'];
246
		foreach($arrayInsert AS $key => $data)
247
		{
248
			## ======== check if the result found in the database matches that entered (Check both exact string match and numerical equality)
249

    
250
			if((strcasecmp($validation_result_row[$key], $data) != 0) AND (!checkDifference($data, $validation_result_row[$key], 5)))
251
			{
252
				reportError($CollectionCD, $CatalogNO, $key, "Result does not match data entered |--| $data |--| $validation_result_row[$key] |--|");	
253
			}
254
		}
255
	}
256
}	//end while loop
257
}	
258
## ======== Display results to user
259
	
260
if($duplicateEntries >= 1)
261
{
262
	echo "<br>" . $duplicateEntries . " record(s) has been skipped, key already exists<br>";
263
}
264

    
265
if($successEntries >= 1)
266
{
267
	echo "<br>" . $successEntries . " record(s) has been successfully inserted<br>";
268
}
269
echo("<br> $validation_errors Validation Errors");
270

    
271
// =============================
272
// ======== END PROGRAM ========
273
// =============================
274
	
275

    
276
## ======================================
277
## Name: checkArray
278
## Function: Checks the data form the file and formats it for entry into the staging table
279
## Internal Variables: 
280
## Global Variables: $arr
281
## Calls Function: None
282
## Called by: Main loop
283
## Written by: Andrew Becker, Mark Sheehan
284
## Last Modified: 07-22-09
285
## =======================================
286
	 
287
function checkArray()
288
{
289
	global $arr;
290
	for($a=0; $a<31; $a++)
291
	{
292
		if(!(array_key_exists($a, $arr)))
293
		{
294
			$arr[$a] = ' ';
295
		}
296

    
297
		## ==== Trim white spaces and newline off of front and end of string (necessary for values at end of line)
298
			
299
		$arr[$a] = trim($arr[$a], ' ');
300
		
301
		## ==== Replace Empty values ('') with null keyword
302
		
303
		if($arr[$a] == '' OR $arr[$a] == '\N')	
304
		{
305
			$arr[$a] = NULL;
306
		}
307
			
308
		if(($a == 4 OR $a == 8) AND $arr[$a] != NULL)	## CHANGED JUN 22 09 - IF date/time AND NOT NULL, then convert it to a form that mysql/PHP recognizes - YYYY-MM-DD HH-MM-SS
309
		{
310
			if($a == 8)
311
			{
312
				//==== Actual data in GBIF file in format "MM-DD-YYYY HH:MM"
313
				//Start formatting date
314
				
315
				$dateFormat = explode(' ', $arr[$a]);
316
				$date = $dateFormat[0];
317
				$time = $dateFormat[1];
318
				$date = explode('-', $date);	//break date up into MM DD YYYY
319
				$month = str_pad($date[1], 2, '0', STR_PAD_LEFT);	//if only entered as 5/27/2008, need it in form 05/27/2008 so need padding on left
320
				$day = str_pad($date[2], 2, '0', STR_PAD_LEFT);
321
				$year = $date[0];
322
			
323
				$time = explode(':', $time);
324
				$hour = $time[0];
325
				$hour = str_pad($hour, 2, '0', STR_PAD_LEFT);	//same issue as above
326
				$minute = $time[1];
327
				$second = "00";	//seconds not given, assume 00
328
				$arr[$a] = sprintf("%s-%s-%s %s:%s:%s", $year, $month, $day, $hour, $minute, $second); 
329
				//echo(" $arr[$a] <br>");
330
			}
331
			else if($a == 4)
332
			{
333
				
334
				//==== Actual data in GBIF file in format MM/DD/YYYY OR YYYY-MM-DD
335
				
336
				if(stripos($arr[$a], '/'))	// if in MM/DD/YYYY format
337
				{
338
					$dateFormat = explode('/', $arr[$a]);
339
					$month = str_pad($dateFormat[0], 2, '0', STR_PAD_LEFT);
340
					$day = str_pad($dateFormat[1], 2, '0', STR_PAD_LEFT);
341
					$year = $dateFormat[2];
342

    
343
					$arr[$a] = sprintf("%s-%s-%s", $year, $month, $day);	
344
				}
345
				else if(stripos($arr[$a], '-'))	//if in YYYY-MM-DD format
346
				{
347
					$dateFormat = explode('-', $arr[$a]);
348
					$month = str_pad($dateFormat[1], 2, '0', STR_PAD_LEFT);
349
					$day = str_pad($dateFormat[2], 2, '0', STR_PAD_LEFT);
350
					$year = $dateFormat[0];
351
						
352
					$arr[$a] = sprintf("%s-%s-%s", $year, $month, $day);
353
				}
354
				else
355
				{
356
					reportError($arr[6], $arr[7], $arr[$a], "Date format for ||$arr[$a]|| not found.");
357
				}
358
			}
359
		}
360
		else if($a == 26) // if elevation range is given as XXX-YYY in one field instead of as minimum and maximum
361
		{
362
			$minMax = explode('-', $arr[$a]);
363
			if(sizeof($minMax) > 1)
364
			{
365
				$arr[26] = $minMax[0];
366
				$arr[27] = $minMax[1];
367
			}
368
		}
369
		else if ($a == 12) //  To get the species and subspeices from the scientific name, Dhaval Shah
370
		{
371
			$spe_subspe = explode(' ',$arr[$a]);
372
			if(sizeof($spe_subspe) > 1 and !(is_numeric($spe_subspe[1])))
373
			{
374
				$arr[28] = trim($spe_subspe[1]);
375
				if(sizeof($spe_subspe) > 3 and !(is_numeric($spe_subspe[2])))
376
				{
377
					
378
					if ((strstr($spe_subspe[2], "var") or strstr($spe_subspe[2], "subsp") or strstr($spe_subspe[2], "ssp") or strstr($spe_subspe[2], "sbsp") or strstr($spe_subspe[2], "forma") or strstr($spe_subspe[2], "fo") or strstr($spe_subspe[2], "f") or strstr($spe_subspe[2], "cv")) and (strlen($spe_subspe[2]) < 6))
379
					{
380
						if ( strstr($spe_subspe[2], "ssp"))
381
						{
382
							$spe_subspe[2] = "subsp.";
383
						}
384
						$arr[30] = trim($spe_subspe[2]);
385
						$arr[29] = trim($spe_subspe[3]);
386
					}
387
					else
388
					{
389
						$arr[30] = NULL;
390
						$arr[29] = NULL;
391
					}
392
				}
393
				else
394
				{
395
					$arr[30] = NULL;
396
					$arr[29] = NULL;
397
				}
398
									
399
			}
400
			else
401
			{
402
				$arr[28] = NULL;
403
				$arr[29] = NULL;
404
				$arr[30] = NULL;
405
			}
406
		}
407
	}
408
}
409

    
410
## ========================================
411
## Name: reportError
412
## Function: creates a detailed error report in errorLog
413
## Internal Variables:
414
## Globals: $recordCount, $GBIFFile
415
## Calls Function: None
416
## Called by: Main Loop
417
## Written by: Mark Sheehan
418
## Last Updated: 07-22-09
419
## =======================================
420
	
421
function reportError($CollectionCD, $CatalogNo, $columnName, $Error)
422
{
423
	GLOBAL $recordCount;
424
	GLOBAL $GBIFFile;
425
	//echo("Creating error log for $CollectionCD | $CatalogNo | $Error <br><br>");
426
	$errorReport = sprintf(
427
		"INSERT INTO ErrorLog (CollectionCD, CatalogNO, RawDataFileName, RawRecordNO, StagingTableName, StagingColumnName, ErrorDescription)
428
		VALUES (%s, %s, %s, %s, %s, %s, %s)",
429
		insertSQLData($CollectionCD), 
430
		insertSQLData($CatalogNo),
431
		insertSQLData("GBIFFile.txt"),
432
		insertSQLData($recordCount),
433
		insertSQLData("StagingRawSpecimen"),
434
		insertSQLData($columnName),
435
		insertSQLData($Error)
436
	);
437
	//echo ("<br> $errorReport <br>");
438
	mysql_query($errorReport) OR DIE("Failed to enter log into error report." . mysql_error());
439
}
440

    
441
## =======================================
442
## Name: checkDifference
443
## Function: determines whether the difference between two numbers is close to 0 at given precision
444
## Internal Variables:
445
## Globals: None
446
## Calls Function: None
447
## Called by: main loop
448
## Written by: Mark Sheehan
449
## Last updated: 07-29-09
450
## ======================================
451

    
452
function checkDifference($firstNum, $secondNum, $precision)
453
{
454
	if(abs($firstNum - $secondNum) < 2 * pow(10, -$precision))
455
	{
456
		return TRUE;
457
	}
458
	else
459
	{
460
		return FALSE;
461
	}
462
}
463

    
464
## =======================================
465
## Name: InsertSQLData
466
## Function: Formats data to be inserted for SQL insert query
467
## Internal Variables:
468
## Globals: None
469
## Calls Function: None
470
## Called by: Main Loop
471
## Written by: Mark Sheehan
472
## Last Updated: 07-22-09
473
## ======================================
474

    
475
function InsertSQLData($string)
476
{
477
	if($string == '' OR $string == NULL)
478
	{
479
		return('NULL');
480
	}
481
	else
482
	{
483
		return(sprintf("'%s'", mysql_real_escape_string($string)));
484
	}
485
}
486

    
487
## =======================================
488
## Name: SelectSQLData
489
## Function: Formats data to be inserted into SQL select query
490
## Internal Variables:
491
## Globals: None
492
## Calls Function: None
493
## Called by: Main Loop
494
## Written by: Mark Sheehan
495
## Last Updated: 07-22-09
496
## ========================================
497

    
498
function SelectSQLData($string)
499
{
500
	if($string == '' OR $string == NULL)
501
	{
502
		return("IS NULL");
503
	}
504
	else
505
	{
506
		return(sprintf("= '%s'", mysql_real_escape_string($string)));
507
	}
508
}
509
	
510
?>
(1-1/7)