1
|
#!/usr/bin/env python
|
2
|
# A DiGIR client
|
3
|
|
4
|
import os
|
5
|
import os.path
|
6
|
import sys
|
7
|
import urllib
|
8
|
import urllib2
|
9
|
import xml.dom.minidom as minidom
|
10
|
|
11
|
sys.path.append(os.path.dirname(__file__)+"/../lib")
|
12
|
|
13
|
import dates
|
14
|
import http
|
15
|
import opts
|
16
|
import profiling
|
17
|
import streams
|
18
|
import util
|
19
|
import xml_parse
|
20
|
import xpath
|
21
|
|
22
|
# Config
|
23
|
timeout = 20 # sec
|
24
|
default_chunk_size = 10000 # records
|
25
|
|
26
|
schema = 'http://digir.net/schema/conceptual/darwin/full/2003/1.0/darwin2full.xsd'
|
27
|
request_xml_template = '''\
|
28
|
<?xml version="1.0" encoding="UTF-8"?>
|
29
|
<request
|
30
|
xmlns="http://digir.net/schema/protocol/2003/1.0"
|
31
|
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
|
32
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
33
|
xmlns:digir="http://digir.net/schema/protocol/2003/1.0"
|
34
|
xmlns:darwin="http://digir.net/schema/conceptual/darwin/2003/1.0"
|
35
|
xmlns:dwc="http://digir.net/schema/conceptual/darwin/2003/1.0"
|
36
|
xsi:schemaLocation="http://digir.net/schema/protocol/2003/1.0
|
37
|
http://digir.sourceforge.net/schema/protocol/2003/1.0/digir.xsd
|
38
|
http://digir.net/schema/conceptual/darwin/2003/1.0
|
39
|
http://digir.sourceforge.net/schema/conceptual/darwin/2003/1.0/darwin2.xsd">
|
40
|
<header>
|
41
|
<version>1.0</version>
|
42
|
<sendTime>[time]</sendTime>
|
43
|
<source>[source]</source>
|
44
|
<destination resource="[resource]">[url]</destination>
|
45
|
<type>search</type>
|
46
|
</header>
|
47
|
<search>
|
48
|
<filter>
|
49
|
<equals>
|
50
|
<darwin:Kingdom>plantae</darwin:Kingdom>
|
51
|
</equals>
|
52
|
</filter>
|
53
|
<records limit="[count]" start="[start]">
|
54
|
<structure schemaLocation="[schema]"/>
|
55
|
</records>
|
56
|
<count>true</count>
|
57
|
</search>
|
58
|
</request>
|
59
|
'''
|
60
|
|
61
|
diags_start = '<diagnostics>'
|
62
|
diags_end = '</diagnostics>'
|
63
|
|
64
|
class InputError(Exception): pass
|
65
|
|
66
|
def main():
|
67
|
# Usage
|
68
|
env_names = []
|
69
|
def usage_err():
|
70
|
raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
|
71
|
+sys.argv[0]+' 2>>log')
|
72
|
|
73
|
# Get config from env vars
|
74
|
url = opts.get_env_var('url', None, env_names)
|
75
|
resource = opts.get_env_var('resource', None, env_names)
|
76
|
start = util.cast(int, opts.get_env_var('start', 0, env_names))
|
77
|
count = util.cast(int, opts.get_env_var('n', None, env_names))
|
78
|
chunk_size = util.cast(int, opts.get_env_var('chunk_size',
|
79
|
default_chunk_size, env_names))
|
80
|
debug = opts.env_flag('debug', False, env_names)
|
81
|
if url == None or resource == None: usage_err()
|
82
|
|
83
|
# Logging
|
84
|
def clear_line(): sys.stderr.write('\n')
|
85
|
log_indent = 0
|
86
|
def log(msg, line_ending='\n'): sys.stderr.write(msg+line_ending)
|
87
|
def debug_log(str_, label=None):
|
88
|
if debug:
|
89
|
if label != None: sys.stderr.write(label+':\n')
|
90
|
sys.stderr.write(str_+'\n')
|
91
|
|
92
|
# Request XML
|
93
|
self_dir = os.path.dirname(__file__)
|
94
|
source = os.popen(self_dir+"/local_ip").read().strip()
|
95
|
this_request_xml_template = (request_xml_template
|
96
|
.replace('[source]', source)
|
97
|
.replace('[url]', url)
|
98
|
.replace('[resource]', resource)
|
99
|
.replace('[schema]', schema)
|
100
|
)
|
101
|
|
102
|
# Stats
|
103
|
total = 0
|
104
|
def print_status(line_ending='\n'):
|
105
|
log('Processed '+str(total)+' record(s)', line_ending)
|
106
|
match_ct = None
|
107
|
|
108
|
profiler = profiling.ItersProfiler(start_now=True, iter_text='record')
|
109
|
|
110
|
# Retrieve data
|
111
|
while count == None or total < count:
|
112
|
# Adjust chunk size if last chunk
|
113
|
this_count = chunk_size
|
114
|
if count != None: this_count = min(this_count, count - total)
|
115
|
|
116
|
# Request XML
|
117
|
time = dates.strftime('%Y-%m-%d %H:%M:%S %Z', dates.now())
|
118
|
request_xml = (this_request_xml_template
|
119
|
.replace('[count]', str(this_count))
|
120
|
.replace('[start]', str(start))
|
121
|
.replace('[time]', time)
|
122
|
)
|
123
|
debug_log(request_xml, 'request')
|
124
|
|
125
|
# Send request
|
126
|
this_url = url+'?'+urllib.urlencode({'request': request_xml})
|
127
|
stream = streams.CaptureStream(streams.TimeoutInputStream(
|
128
|
urllib2.urlopen(this_url), timeout), diags_start, diags_end)
|
129
|
|
130
|
# Retrieve response
|
131
|
streams.copy(stream, sys.stdout)
|
132
|
# Make sure output ends in a newline so that consecutive XML documents
|
133
|
# are on different lines
|
134
|
sys.stdout.write('\n')
|
135
|
stream.close()
|
136
|
|
137
|
# Parse diagnostics
|
138
|
diags_str = stream.matches[0]
|
139
|
debug_log(diags_str, 'diagnostics')
|
140
|
diags = xml_parse.parse_str(diags_str)
|
141
|
def get_diag(name):
|
142
|
return xpath.get_value(diags, 'diagnostic[@code='+name+']')
|
143
|
|
144
|
# Process match count
|
145
|
this_match_ct = util.cast(int, get_diag('MATCH_COUNT'))
|
146
|
if this_match_ct != match_ct: # first or updated match count
|
147
|
match_ct = this_match_ct
|
148
|
log('Found '+str(match_ct)+' record(s)')
|
149
|
|
150
|
# Process record count
|
151
|
this_ct = util.cast(int, get_diag('RECORD_COUNT'))
|
152
|
if this_ct == None: raise InputError('Missing RECORD_COUNT diagnostic')
|
153
|
total += this_ct
|
154
|
start += this_ct # advance start to fetch next set
|
155
|
print_status('\r') # CR at end so next print overwrites msg
|
156
|
|
157
|
# Decide if done
|
158
|
if this_ct == 0 or get_diag('END_OF_RECORDS') == 'true': break
|
159
|
|
160
|
print_status()
|
161
|
profiler.stop(total)
|
162
|
log(profiler.msg())
|
163
|
|
164
|
main()
|