1 |
1
|
wheeler
|
<?php
|
2 |
|
|
#############################################################################
|
3 |
|
|
# Driver file for taiper2flatClient, a simple script to
|
4 |
|
|
# consume taiper web services and save the results in a delimited file,
|
5 |
1597
|
aaronmk
|
# one record per line. The current delimiter is comma ',' but
|
6 |
1
|
wheeler
|
# that can be changed by changing the $seperator variable located
|
7 |
|
|
# in configurableParams.php.
|
8 |
|
|
#
|
9 |
|
|
# Command line usage:
|
10 |
|
|
# php taiper2flat.php
|
11 |
|
|
#
|
12 |
|
|
# dependencies:
|
13 |
|
|
# php 5+
|
14 |
|
|
# pear
|
15 |
|
|
# HTTP_Request
|
16 |
|
|
# I had to install pear and HTTP_Request before it would work (on Ubuntu):
|
17 |
|
|
# sudo apt-get install php-http-request
|
18 |
|
|
# This should install all dependencies, but just in case, the full
|
19 |
|
|
# dependency list is:
|
20 |
|
|
# php-net-socket
|
21 |
|
|
# php-net-url
|
22 |
|
|
# php-pear
|
23 |
|
|
# php-http-request
|
24 |
|
|
#############################################################################
|
25 |
|
|
|
26 |
|
|
require_once('HTTP/Request.php'); // pear package
|
27 |
|
|
require_once('tapirRequestTemplate.php');
|
28 |
|
|
require_once('getAllConcepts.php');
|
29 |
|
|
|
30 |
|
|
// Settings that are not assigned in this file. The header comment places
// $seperator in configurableParams.php; $url is presumably defined alongside
// it (NOTE(review): confirm where $url is set).
global $url, $seperator;

// Working files, all relative to the current directory.
$error_log_filename = "error.log";       // failures are appended here
$record_num_filename = "recordnum.dat";  // resume checkpoint
$flat_filename = "specieslink.specimens.csv"; // delimited output

// Control characters (bytes 0-31) that are stripped from service responses
// before XML parsing. Tab, newline and carriage return are kept.
$ctrl_chars = array_values(array_diff(
    array_map("chr", range(0, 31)),
    array("\t", "\n", "\r")
));
|
41 |
|
|
|
42 |
1
|
wheeler
|
// Ask the service which concepts it supports. The keys of the returned array
// are used later as the output column names and as the element names looked
// up inside each record.
$supportedConcepts = getAllConcepts();
if (!is_array($supportedConcepts)) {
    // Nothing can be harvested without the concept list: record why and stop.
    file_put_contents(
        $error_log_filename,
        "Unable to retrieve supported concepts from service, failing.\n",
        FILE_APPEND
    );
    // Exit with a non-zero status so a calling shell or build script can
    // tell that the harvest did not run.
    exit(1);
}
|
49 |
|
|
|
50 |
|
|
// Counter for consecutive failed requests; reset after every successful page.
$successive_failures = 0;

// Record to begin first request (ordinal position, not ID)
$start_record = 0;
if (file_exists($record_num_filename)) {
    // Resume from the checkpoint left by a previous run. The content is
    // normalised to an integer so that an empty, whitespace-padded or
    // unreadable checkpoint resumes from 0 instead of handing a non-numeric
    // string to the request builder and to the loop's numeric comparison.
    $start_record = (int) trim((string) file_get_contents($record_num_filename));
}

// Number of records per request; cannot exceed 1000
$request_limit = 1000;

// According to TAPIR docs, the estimate from the service
// is an upper bound on the number of records. Start out high.
$estimated_max_from_service = 4000000;
|
62 |
|
|
|
63 |
|
|
// Start a fresh output file with a header row naming every supported concept.
// An existing file is left untouched so that a resumed run keeps appending.
if (!file_exists($flat_filename)) {
    $flat_file = fopen($flat_filename, "a");
    if ($flat_file === false) {
        // Without an output file no record can be saved; stop here.
        file_put_contents(
            $error_log_filename,
            "Unable to open " . $flat_filename . " for writing, failing.\n",
            FILE_APPEND
        );
        exit(1);
    }

    // Write the header with fputcsv(), exactly as the data rows are written,
    // so delimiter handling and quoting are the same for both.
    fputcsv($flat_file, array_keys($supportedConcepts), $seperator);
    fclose($flat_file);
}
|
74 |
|
|
|
75 |
|
|
// ---------------------------------------------------------------------------
// Main harvest loop.
//
// Requests pages of up to $request_limit records starting at $start_record,
// appends each record to $flat_filename as one delimited line, and stores the
// next start position in $record_num_filename after every successful page.
//
// Any failure (transport error, non-200 status, <error> element in the
// response, missing <summary>) is appended to $error_log_filename and the
// same request is repeated. The limit of three successive failures that once
// ended the script is intentionally disabled, so retries are unbounded.
// NOTE(review): retries happen immediately; consider a pause between attempts
// if the service can fail fast.
// ---------------------------------------------------------------------------
$finished = false;
$itrNum = 0;

$http_request = new HTTP_Request();

// Reason phrases used to annotate non-200 responses in the error log.
$http_status_labels = array(
    201 => 'Created',
    202 => 'Accepted',
    203 => 'Non-Authoritative Information',
    204 => 'No Content',
    205 => 'Reset Content',
    206 => 'Partial Content',
    300 => 'Multiple Choices',
    301 => 'Moved Permanently',
    302 => 'Found',
    303 => 'See Other',
    304 => 'Not Modified',
    305 => 'Use Proxy',
    307 => 'Temporary Redirect',
    400 => 'Bad Request',
    401 => 'Unauthorized',
    402 => 'Payment Required',
    403 => 'Forbidden',
    404 => 'Not Found',
    405 => 'Method Not Allowed',
    406 => 'Not Acceptable',
    407 => 'Proxy Authentication Required',
    408 => 'Request Timeout',
    409 => 'Conflict',
    410 => 'Gone',
    411 => 'Length Required',
    412 => 'Precondition Failed',
    413 => 'Request Entity Too Large',
    414 => 'Request-URI Too Long',
    415 => 'Unsupported Media Type',
    416 => 'Requested Range Not Satisfiable',
    417 => 'Expectation Failed',
    500 => 'Internal Server Error',
    501 => 'Not Implemented',
    502 => 'Bad Gateway',
    503 => 'Service Unavailable',
    504 => 'Gateway Timeout',
    505 => 'HTTP Version Not Supported',
);

while (!$finished && $start_record < $estimated_max_from_service) {
    $itrNum++;

    $body = buildRequest($start_record, $request_limit, $supportedConcepts);

    $http_request->setMethod('POST');
    $http_request->addHeader('Content-Type', 'text/xml');
    $http_request->addRawPostData($body);
    $http_request->setURL($url);
    $http_request->_timeout = 300;
    $http_request->_readTimeout = 300;

    // This can be used to see the entire request
    #$raw_request = $http_request->_buildRequest();
    #echo "\n\n" . $raw_request;

    // sendRequest() reports transport problems through its return value.
    // Check it so that a failed send is logged and retried instead of being
    // judged by whatever response state the request object still holds.
    $send_result = $http_request->sendRequest();
    if ($send_result instanceof PEAR_Error) {
        file_put_contents(
            $error_log_filename,
            "Request failed: " . $send_result->getMessage() . "\n",
            FILE_APPEND
        );

        //Should repeat the same request
        continue;
    }

    $response = $http_request->getResponseBody();
    $code = $http_request->getResponseCode();

    if ($code != 200) { // 200 = OK
        $label = isset($http_status_labels[$code])
            ? $http_status_labels[$code]
            : 'Unknown Error';

        file_put_contents(
            $error_log_filename,
            "Service responded with HTTP ".$code." code: ".$label."\n",
            FILE_APPEND
        );

        //Should repeat the same request
        continue;
    }

    // Drop control characters and encode high bytes before parsing the XML.
    $response = filter_var(str_replace($ctrl_chars, "", $response),
        FILTER_UNSAFE_RAW, FILTER_FLAG_ENCODE_HIGH);

    $xmlDoc = new DOMDocument();
    // loadXML() rejects an empty string outright on PHP 8, so only parse a
    // non-empty body. An empty or unparsable body leaves the document empty
    // and is reported by the missing-summary check below.
    if (is_string($response) && $response !== '') {
        $xmlDoc->loadXML($response);
    }

    // Errors reported by the service inside an otherwise successful response.
    $errors = $xmlDoc->getElementsByTagName("error");
    if ($errors->length > 0) {
        $error_text = '';
        foreach ($errors as $error) {
            $error_text .= $error->nodeValue . "\n";
        }
        file_put_contents($error_log_filename, $error_text, FILE_APPEND);

        //Should repeat the same request
        continue;
    }

    $summary = $xmlDoc->getElementsByTagName("summary");
    if ($summary->length == 0) {
        file_put_contents(
            $error_log_filename,
            "No summary node, assuming there's a missed error.\n",
            FILE_APPEND
        );
        continue;
    }
    $summary_node = $summary->item(0);

    $records = $xmlDoc->getElementsByTagName("record");

    // getAttribute() returns "" for an absent attribute. A summary without
    // "next" is treated as the last page: the checkpoint moves past the
    // records just received instead of being overwritten with an empty value.
    $next_attr = $summary_node->getAttribute("next");
    $has_next = ($next_attr !== "");
    $start_record_temp = $has_next
        ? $next_attr
        : (string) ((int) $start_record + $records->length);

    // Only replace the estimate when the service actually supplied one; an
    // empty value would make the loop condition fail after the first page.
    $total_matched = $summary_node->getAttribute("totalMatched");
    if ($total_matched !== "") {
        $estimated_max_from_service = $total_matched;
    }

    $flat_file = fopen($flat_filename, "a");
    if ($flat_file === false) {
        // Stop rather than advance the checkpoint past records that could
        // not be written.
        file_put_contents(
            $error_log_filename,
            "Unable to open " . $flat_filename . " for writing, failing.\n",
            FILE_APPEND
        );
        exit(1);
    }

    // One output line per record, one column per supported concept; a concept
    // missing from a record yields an empty field.
    $concept_names = array_keys($supportedConcepts);
    foreach ($records as $record) {
        $fields = array();
        foreach ($concept_names as $key) {
            $element = $record->getElementsByTagName($key);
            $fields[] = $element->length > 0 ? $element->item(0)->nodeValue : "";
        }
        fputcsv($flat_file, $fields, $seperator);
    }
    fclose($flat_file);

    $successive_failures = 0;

    // Persist the resume position only after the page has been written.
    file_put_contents($record_num_filename, (string) $start_record_temp);
    $start_record = $start_record_temp;

    //If number of records is less than request_limit, it means
    //the service is out of records. The same holds when the summary
    //carries no "next" position.
    $finished = !$has_next || ($request_limit > $records->length);

    // Progress report: the estimate once, then every tenth iteration.
    if ($itrNum == 1) {
        print "Extimated number of records: $estimated_max_from_service\n";
    } elseif ($itrNum % 10 == 0) {
        print "Pulled $start_record records out of $estimated_max_from_service ".
            "estimated total records.\n";
    }
}
|
241 |
|
|
|
242 |
|
|
?>
|