1
|
#*********************************************************
|
2
|
# Written by: Bradley University
|
3
|
# Dr. Steven Dolins
|
4
|
# Use: Transform and Load Raw Specimen
|
5
|
# GBIF Data from GBIF file into the
|
6
|
# Staging Table (StagingRawSpecimen).
|
7
|
# Version: 2.0
|
8
|
# Date: November 09, 2009
|
9
|
#*********************************************************
|
10
|
|
11
|
<?php
|
12
|
## ======== Connect to database
|
13
|
include('connect.php');
|
14
|
|
15
|
## ======== Spacing for buttons in index.php
|
16
|
echo "<br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br><br>";
|
17
|
|
18
|
## ======== Identify data file
|
19
|
$Validation_check = 0;
|
20
|
$GBIFFile = 'GBIF.txt';
|
21
|
$duplicateEntries=0;
|
22
|
$successEntries=0;
|
23
|
$validation_errors=0;
|
24
|
$recordCount = 0;
|
25
|
|
26
|
## ======== Load file as a character array - each entry is one line of text with newline still attached
|
27
|
$GBIFFile = fopen($GBIFFile, 'r');
|
28
|
|
29
|
##==== Pull off a single line from file and add it to array ====
|
30
|
|
31
|
//$line = fgets($GBIFFile); //UNCOMMENT ONLY IF TITLES ARE INCLUDED IN FIRST LINE OF FILE
|
32
|
while($line = fgets($GBIFFile))
|
33
|
{
|
34
|
$line = trim($line);
|
35
|
$arr = explode("\t", $line);
|
36
|
$recordCount++;
|
37
|
|
38
|
## ======== Ensure that data enetered is a full tuple (has all columns associated with GBIF data)
|
39
|
|
40
|
if(count($arr) != 30)
|
41
|
{
|
42
|
reportError(NULL, NULL, $recordCount, "Incomplete tuple, ignored");
|
43
|
continue;
|
44
|
}
|
45
|
|
46
|
## ======== Clean up values in array and check data consistency
|
47
|
|
48
|
checkArray();
|
49
|
|
50
|
## ======== Populate an array which holds only the values to be inserted
|
51
|
|
52
|
$arrayInsert['InstitutionCD'] = $arr[5];
|
53
|
$arrayInsert['CollectionCD'] = $arr[6];
|
54
|
$arrayInsert['CatalogNO'] = $arr[7];
|
55
|
$arrayInsert['DBSourceName'] = "GBIF";
|
56
|
$arrayInsert['LastDateModified'] = $arr[8];
|
57
|
$arrayInsert['ScientificName'] = $arr[12];
|
58
|
$arrayInsert['ScientificNameOriginal'] = $arr[10];
|
59
|
$arrayInsert['Family'] = $arr[14];
|
60
|
$arrayInsert['Genus'] = $arr[16];
|
61
|
$arrayInsert['AuthorOfScientificName'] = $arr[11];
|
62
|
$arrayInsert['Country'] = $arr[17];
|
63
|
$arrayInsert['StateProvince'] = $arr[20];
|
64
|
$arrayInsert['County'] = $arr[19];
|
65
|
$arrayInsert['MinimumElevationInMeter'] = $arr[26];
|
66
|
$arrayInsert['MaximumElevationInMeter'] = $arr[27];
|
67
|
$arrayInsert['LatestDateCollected'] = $arr[4];
|
68
|
$arrayInsert['Collector'] = $arr[2];
|
69
|
$arrayInsert['DecimalLatitude'] = $arr[21];
|
70
|
$arrayInsert['DecimalLongitude'] = $arr[23];
|
71
|
$arrayInsert['Locality'] = $arr[18];
|
72
|
$arrayInsert['IdentifiedBy'] = $arr[9];
|
73
|
$arrayInsert['VerbatimLatitude'] = $arr[22];
|
74
|
$arrayInsert['VerbatimLongitude'] = $arr[24];
|
75
|
$arrayInsert['SpecificEpithet'] = $arr[28];
|
76
|
$arrayInsert['InfraspecificEpithet'] = $arr[29];
|
77
|
$arrayInsert['GBIFFamilyOriginal'] = $arr[13];
|
78
|
$arrayInsert['GBIFGenusOriginal'] = $arr[15];
|
79
|
$arrayInsert['InfraspecificRank'] = $arr[30];
|
80
|
|
81
|
|
82
|
## ======== Ensure primary key values and not null values (CollectionCD, CatalogNO, InstitutionCD) are not null
|
83
|
|
84
|
if(is_null($arrayInsert['InstitutionCD']) OR is_null($arrayInsert['CatalogNO']) OR is_null($arrayInsert['InstitutionCD']))
|
85
|
{
|
86
|
reportError(NULL, NULL, NULL, "One or more non-null value is NULL");
|
87
|
continue;
|
88
|
}
|
89
|
|
90
|
## ======== Filter out MOBOT and NYBG Data since those are inserted from another file
|
91
|
|
92
|
if ( strstr($arrayInsert['InstitutionCD'], "MOBOT") or strstr($arrayInsert['InstitutionCD'] , "NYBG"))
|
93
|
{
|
94
|
reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], $arrayInsert['InstitutionCD'], "Institution code contains either MOBOT or NYBG");
|
95
|
continue;
|
96
|
}
|
97
|
|
98
|
## ======== Generate SQL code to insert desired information
|
99
|
|
100
|
$sql = sprintf("INSERT INTO StagingRawSpecimen
|
101
|
(
|
102
|
InstitutionCD,
|
103
|
CollectionCD,
|
104
|
CatalogNO,
|
105
|
DBSourceName,
|
106
|
LastDateModified,
|
107
|
ScientificName,
|
108
|
ScientificNameOriginal,
|
109
|
GBIFFamilyOriginal,
|
110
|
Family,
|
111
|
GBIFGenusOriginal,
|
112
|
Genus,
|
113
|
AuthorOfScientificName,
|
114
|
Country,
|
115
|
StateProvince,
|
116
|
County,
|
117
|
MinimumElevationInMeter,
|
118
|
MaximumElevationInMeter,
|
119
|
LatestDateCollected,
|
120
|
Collector,
|
121
|
DecimalLatitude,
|
122
|
DecimalLongitude,
|
123
|
Locality,
|
124
|
IdentifiedBy,
|
125
|
VerbatimLatitude,
|
126
|
VerbatimLongitude,
|
127
|
SpecificEpithet,
|
128
|
InfraspecificEpithet,
|
129
|
InfraspecificRank
|
130
|
)
|
131
|
VALUES
|
132
|
(
|
133
|
%s,
|
134
|
%s,
|
135
|
%s,
|
136
|
%s,
|
137
|
%s,
|
138
|
%s,
|
139
|
%s,
|
140
|
%s,
|
141
|
%s,
|
142
|
%s,
|
143
|
%s,
|
144
|
%s,
|
145
|
%s,
|
146
|
%s,
|
147
|
%s,
|
148
|
%s,
|
149
|
%s,
|
150
|
%s,
|
151
|
%s,
|
152
|
%s,
|
153
|
%s,
|
154
|
%s,
|
155
|
%s,
|
156
|
%s,
|
157
|
%s,
|
158
|
%s,
|
159
|
%s,
|
160
|
%s
|
161
|
)
|
162
|
",
|
163
|
insertSQLData($arrayInsert['InstitutionCD']),
|
164
|
insertSQLData($arrayInsert['CollectionCD']),
|
165
|
insertSQLData($arrayInsert['CatalogNO']),
|
166
|
insertSQLData($arrayInsert['DBSourceName']),
|
167
|
insertSQLData($arrayInsert['LastDateModified']),
|
168
|
insertSQLData($arrayInsert['ScientificName']),
|
169
|
insertSQLData($arrayInsert['ScientificNameOriginal']),
|
170
|
insertSQLData($arrayInsert['GBIFFamilyOriginal']),
|
171
|
insertSQLData($arrayInsert['Family']),
|
172
|
insertSQLData($arrayInsert['GBIFGenusOriginal']),
|
173
|
insertSQLData($arrayInsert['Genus']),
|
174
|
insertSQLData($arrayInsert['AuthorOfScientificName']),
|
175
|
insertSQLData($arrayInsert['Country']),
|
176
|
insertSQLData($arrayInsert['StateProvince']),
|
177
|
insertSQLData($arrayInsert['County']),
|
178
|
insertSQLData($arrayInsert['MinimumElevationInMeter']),
|
179
|
insertSQLData($arrayInsert['MaximumElevationInMeter']),
|
180
|
insertSQLData($arrayInsert['LatestDateCollected']),
|
181
|
insertSQLData($arrayInsert['Collector']),
|
182
|
insertSQLData($arrayInsert['DecimalLatitude']),
|
183
|
insertSQLData($arrayInsert['DecimalLongitude']),
|
184
|
insertSQLData($arrayInsert['Locality']),
|
185
|
insertSQLData($arrayInsert['IdentifiedBy']),
|
186
|
insertSQLData($arrayInsert['VerbatimLatitude']),
|
187
|
insertSQLData($arrayInsert['VerbatimLongitude']),
|
188
|
insertSQLData($arrayInsert['SpecificEpithet']),
|
189
|
insertSQLData($arrayInsert['InfraspecificEpithet']),
|
190
|
insertSQLData($arrayInsert['InfraspecificRank'])
|
191
|
);
|
192
|
|
193
|
//echo("<br> <br> $sql <br> <br>");
|
194
|
|
195
|
## ======== Check whether a tuple with the variables you're trying to enter has already been entered (trying to insert duplicate tuple)
|
196
|
|
197
|
if(mysql_query($sql))
|
198
|
{
|
199
|
$successEntries++;
|
200
|
}
|
201
|
else if(mysql_errno() == 1062)
|
202
|
{
|
203
|
$duplicateEntries++;
|
204
|
//reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, mysql_error());
|
205
|
continue;
|
206
|
}
|
207
|
else
|
208
|
{
|
209
|
reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, mysql_error());
|
210
|
}
|
211
|
|
212
|
|
213
|
## ======== Validate data entered
|
214
|
if ($Validation_check) // Validation Check status
|
215
|
{
|
216
|
$validation_query = sprintf(
|
217
|
"SELECT * FROM StagingRawSpecimen WHERE
|
218
|
CollectionCD %s
|
219
|
AND CatalogNO %s",
|
220
|
selectSQLData($arrayInsert['CollectionCD']),
|
221
|
selectSQLData($arrayInsert['CatalogNO'])
|
222
|
);
|
223
|
|
224
|
//echo("<br> <br> $validation_query <br> <br>");
|
225
|
|
226
|
$validation_result = mysql_query($validation_query) OR reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, mysql_error());
|
227
|
|
228
|
if(mysql_num_rows($validation_result) > 1) ## IF more than 1 record has these properties, then NOT UNIQUE
|
229
|
{
|
230
|
reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, "Validation result not unique");
|
231
|
continue; //error, skip the rest of validation
|
232
|
}
|
233
|
|
234
|
if(mysql_num_rows($validation_result) < 1)
|
235
|
{
|
236
|
reportError($arrayInsert['CollectionCD'], $arrayInsert['CatalogNO'], NULL, "Validation result does not exist");
|
237
|
continue; //error, skip the rest of validation
|
238
|
}
|
239
|
|
240
|
else //(num_rows == 1)
|
241
|
{
|
242
|
$validation_result_row = mysql_fetch_assoc($validation_result) OR DIE("Error - no result found");
|
243
|
|
244
|
$CatalogNO = $arrayInsert['CatalogNO'];
|
245
|
$CollectionCD = $arrayInsert['CollectionCD'];
|
246
|
foreach($arrayInsert AS $key => $data)
|
247
|
{
|
248
|
## ======== check if the result found in the database matches that entered (Check both exact string match and numerical equality)
|
249
|
|
250
|
if((strcasecmp($validation_result_row[$key], $data) != 0) AND (!checkDifference($data, $validation_result_row[$key], 5)))
|
251
|
{
|
252
|
reportError($CollectionCD, $CatalogNO, $key, "Result does not match data entered |--| $data |--| $validation_result_row[$key] |--|");
|
253
|
}
|
254
|
}
|
255
|
}
|
256
|
} //end while loop
|
257
|
}
|
258
|
## ======== Display results to user
|
259
|
|
260
|
if($duplicateEntries >= 1)
|
261
|
{
|
262
|
echo "<br>" . $duplicateEntries . " record(s) has been skipped, key already exists<br>";
|
263
|
}
|
264
|
|
265
|
if($successEntries >= 1)
|
266
|
{
|
267
|
echo "<br>" . $successEntries . " record(s) has been successfully inserted<br>";
|
268
|
}
|
269
|
echo("<br> $validation_errors Validation Errors");
|
270
|
|
271
|
// =============================
|
272
|
// ======== END PROGRAM ========
|
273
|
// =============================
|
274
|
|
275
|
|
276
|
## ======================================
|
277
|
## Name: checkArray
|
278
|
## Function: Checks the data form the file and formats it for entry into the staging table
|
279
|
## Internal Variables:
|
280
|
## Global Variables: $arr
|
281
|
## Calls Function: None
|
282
|
## Called by: Main loop
|
283
|
## Written by: Andrew Becker, Mark Sheehan
|
284
|
## Last Modified: 07-22-09
|
285
|
## =======================================
|
286
|
|
287
|
function checkArray()
|
288
|
{
|
289
|
global $arr;
|
290
|
for($a=0; $a<31; $a++)
|
291
|
{
|
292
|
if(!(array_key_exists($a, $arr)))
|
293
|
{
|
294
|
$arr[$a] = ' ';
|
295
|
}
|
296
|
|
297
|
## ==== Trim white spaces and newline off of front and end of string (necessary for values at end of line)
|
298
|
|
299
|
$arr[$a] = trim($arr[$a], ' ');
|
300
|
|
301
|
## ==== Replace Empty values ('') with null keyword
|
302
|
|
303
|
if($arr[$a] == '' OR $arr[$a] == '\N')
|
304
|
{
|
305
|
$arr[$a] = NULL;
|
306
|
}
|
307
|
|
308
|
if(($a == 4 OR $a == 8) AND $arr[$a] != NULL) ## CHANGED JUN 22 09 - IF date/time AND NOT NULL, then convert it to a form that mysql/PHP recognizes - YYYY-MM-DD HH-MM-SS
|
309
|
{
|
310
|
if($a == 8)
|
311
|
{
|
312
|
//==== Actual data in GBIF file in format "MM-DD-YYYY HH:MM"
|
313
|
//Start formatting date
|
314
|
|
315
|
$dateFormat = explode(' ', $arr[$a]);
|
316
|
$date = $dateFormat[0];
|
317
|
$time = $dateFormat[1];
|
318
|
$date = explode('-', $date); //break date up into MM DD YYYY
|
319
|
$month = str_pad($date[1], 2, '0', STR_PAD_LEFT); //if only entered as 5/27/2008, need it in form 05/27/2008 so need padding on left
|
320
|
$day = str_pad($date[2], 2, '0', STR_PAD_LEFT);
|
321
|
$year = $date[0];
|
322
|
|
323
|
$time = explode(':', $time);
|
324
|
$hour = $time[0];
|
325
|
$hour = str_pad($hour, 2, '0', STR_PAD_LEFT); //same issue as above
|
326
|
$minute = $time[1];
|
327
|
$second = "00"; //seconds not given, assume 00
|
328
|
$arr[$a] = sprintf("%s-%s-%s %s:%s:%s", $year, $month, $day, $hour, $minute, $second);
|
329
|
//echo(" $arr[$a] <br>");
|
330
|
}
|
331
|
else if($a == 4)
|
332
|
{
|
333
|
|
334
|
//==== Actual data in GBIF file in format MM/DD/YYYY OR YYYY-MM-DD
|
335
|
|
336
|
if(stripos($arr[$a], '/')) // if in MM/DD/YYYY format
|
337
|
{
|
338
|
$dateFormat = explode('/', $arr[$a]);
|
339
|
$month = str_pad($dateFormat[0], 2, '0', STR_PAD_LEFT);
|
340
|
$day = str_pad($dateFormat[1], 2, '0', STR_PAD_LEFT);
|
341
|
$year = $dateFormat[2];
|
342
|
|
343
|
$arr[$a] = sprintf("%s-%s-%s", $year, $month, $day);
|
344
|
}
|
345
|
else if(stripos($arr[$a], '-')) //if in YYYY-MM-DD format
|
346
|
{
|
347
|
$dateFormat = explode('-', $arr[$a]);
|
348
|
$month = str_pad($dateFormat[1], 2, '0', STR_PAD_LEFT);
|
349
|
$day = str_pad($dateFormat[2], 2, '0', STR_PAD_LEFT);
|
350
|
$year = $dateFormat[0];
|
351
|
|
352
|
$arr[$a] = sprintf("%s-%s-%s", $year, $month, $day);
|
353
|
}
|
354
|
else
|
355
|
{
|
356
|
reportError($arr[6], $arr[7], $arr[$a], "Date format for ||$arr[$a]|| not found.");
|
357
|
}
|
358
|
}
|
359
|
}
|
360
|
else if($a == 26) // if elevation range is given as XXX-YYY in one field instead of as minimum and maximum
|
361
|
{
|
362
|
$minMax = explode('-', $arr[$a]);
|
363
|
if(sizeof($minMax) > 1)
|
364
|
{
|
365
|
$arr[26] = $minMax[0];
|
366
|
$arr[27] = $minMax[1];
|
367
|
}
|
368
|
}
|
369
|
else if ($a == 12) // To get the species and subspeices from the scientific name, Dhaval Shah
|
370
|
{
|
371
|
$spe_subspe = explode(' ',$arr[$a]);
|
372
|
if(sizeof($spe_subspe) > 1 and !(is_numeric($spe_subspe[1])))
|
373
|
{
|
374
|
$arr[28] = trim($spe_subspe[1]);
|
375
|
if(sizeof($spe_subspe) > 3 and !(is_numeric($spe_subspe[2])))
|
376
|
{
|
377
|
|
378
|
if ((strstr($spe_subspe[2], "var") or strstr($spe_subspe[2], "subsp") or strstr($spe_subspe[2], "ssp") or strstr($spe_subspe[2], "sbsp") or strstr($spe_subspe[2], "forma") or strstr($spe_subspe[2], "fo") or strstr($spe_subspe[2], "f") or strstr($spe_subspe[2], "cv")) and (strlen($spe_subspe[2]) < 6))
|
379
|
{
|
380
|
if ( strstr($spe_subspe[2], "ssp"))
|
381
|
{
|
382
|
$spe_subspe[2] = "subsp.";
|
383
|
}
|
384
|
$arr[30] = trim($spe_subspe[2]);
|
385
|
$arr[29] = trim($spe_subspe[3]);
|
386
|
}
|
387
|
else
|
388
|
{
|
389
|
$arr[30] = NULL;
|
390
|
$arr[29] = NULL;
|
391
|
}
|
392
|
}
|
393
|
else
|
394
|
{
|
395
|
$arr[30] = NULL;
|
396
|
$arr[29] = NULL;
|
397
|
}
|
398
|
|
399
|
}
|
400
|
else
|
401
|
{
|
402
|
$arr[28] = NULL;
|
403
|
$arr[29] = NULL;
|
404
|
$arr[30] = NULL;
|
405
|
}
|
406
|
}
|
407
|
}
|
408
|
}
|
409
|
|
410
|
## ========================================
|
411
|
## Name: reportError
|
412
|
## Function: creates a detailed error report in errorLog
|
413
|
## Internal Variables:
|
414
|
## Globals: $recordCount, $GBIFFile
|
415
|
## Calls Function: None
|
416
|
## Called by: Main Loop
|
417
|
## Written by: Mark Sheehan
|
418
|
## Last Updated: 07-22-09
|
419
|
## =======================================
|
420
|
|
421
|
function reportError($CollectionCD, $CatalogNo, $columnName, $Error)
|
422
|
{
|
423
|
GLOBAL $recordCount;
|
424
|
GLOBAL $GBIFFile;
|
425
|
//echo("Creating error log for $CollectionCD | $CatalogNo | $Error <br><br>");
|
426
|
$errorReport = sprintf(
|
427
|
"INSERT INTO ErrorLog (CollectionCD, CatalogNO, RawDataFileName, RawRecordNO, StagingTableName, StagingColumnName, ErrorDescription)
|
428
|
VALUES (%s, %s, %s, %s, %s, %s, %s)",
|
429
|
insertSQLData($CollectionCD),
|
430
|
insertSQLData($CatalogNo),
|
431
|
insertSQLData("GBIFFile.txt"),
|
432
|
insertSQLData($recordCount),
|
433
|
insertSQLData("StagingRawSpecimen"),
|
434
|
insertSQLData($columnName),
|
435
|
insertSQLData($Error)
|
436
|
);
|
437
|
//echo ("<br> $errorReport <br>");
|
438
|
mysql_query($errorReport) OR DIE("Failed to enter log into error report." . mysql_error());
|
439
|
}
|
440
|
|
441
|
## =======================================
|
442
|
## Name: checkDifference
|
443
|
## Function: determines whether the difference between two numbers is close to 0 at given precision
|
444
|
## Internal Variables:
|
445
|
## Globals: None
|
446
|
## Calls Function: None
|
447
|
## Called by: main loop
|
448
|
## Written by: Mark Sheehan
|
449
|
## Last updated: 07-29-09
|
450
|
## ======================================
|
451
|
|
452
|
function checkDifference($firstNum, $secondNum, $precision)
|
453
|
{
|
454
|
if(abs($firstNum - $secondNum) < 2 * pow(10, -$precision))
|
455
|
{
|
456
|
return TRUE;
|
457
|
}
|
458
|
else
|
459
|
{
|
460
|
return FALSE;
|
461
|
}
|
462
|
}
|
463
|
|
464
|
## =======================================
|
465
|
## Name: InsertSQLData
|
466
|
## Function: Formats data to be inserted for SQL insert query
|
467
|
## Internal Variables:
|
468
|
## Globals: None
|
469
|
## Calls Function: None
|
470
|
## Called by: Main Loop
|
471
|
## Written by: Mark Sheehan
|
472
|
## Last Updated: 07-22-09
|
473
|
## ======================================
|
474
|
|
475
|
function InsertSQLData($string)
|
476
|
{
|
477
|
if($string == '' OR $string == NULL)
|
478
|
{
|
479
|
return('NULL');
|
480
|
}
|
481
|
else
|
482
|
{
|
483
|
return(sprintf("'%s'", mysql_real_escape_string($string)));
|
484
|
}
|
485
|
}
|
486
|
|
487
|
## =======================================
|
488
|
## Name: SelectSQLData
|
489
|
## Function: Formats data to be inserted into SQL select query
|
490
|
## Internal Variables:
|
491
|
## Globals: None
|
492
|
## Calls Function: None
|
493
|
## Called by: Main Loop
|
494
|
## Written by: Mark Sheehan
|
495
|
## Last Updated: 07-22-09
|
496
|
## ========================================
|
497
|
|
498
|
function SelectSQLData($string)
|
499
|
{
|
500
|
if($string == '' OR $string == NULL)
|
501
|
{
|
502
|
return("IS NULL");
|
503
|
}
|
504
|
else
|
505
|
{
|
506
|
return(sprintf("= '%s'", mysql_real_escape_string($string)));
|
507
|
}
|
508
|
}
|
509
|
|
510
|
?>
|