1 |
1674
|
aaronmk
|
#!/usr/bin/env python
|
2 |
|
|
# A DiGIR client
|
3 |
|
|
|
4 |
1678
|
aaronmk
|
import os
|
5 |
1674
|
aaronmk
|
import os.path
|
6 |
|
|
import sys
|
7 |
|
|
import urllib
|
8 |
|
|
import urllib2
|
9 |
1690
|
aaronmk
|
import xml.dom.minidom as minidom
|
10 |
1674
|
aaronmk
|
|
11 |
|
|
# Make the project's lib directory importable. The script location is resolved
# to an absolute path first: when the script is started by bare file name,
# os.path.dirname(__file__) is '' and plain concatenation would produce
# "/../lib" (i.e. /lib) rather than the lib directory next to this script's
# directory.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),
    os.pardir, "lib"))
|
12 |
|
|
|
13 |
1678
|
aaronmk
|
import dates
|
14 |
1697
|
aaronmk
|
import http
|
15 |
1674
|
aaronmk
|
import opts
|
16 |
1699
|
aaronmk
|
import profiling
|
17 |
1674
|
aaronmk
|
import streams
|
18 |
|
|
import util
|
19 |
1709
|
aaronmk
|
import xml_parse
|
20 |
1690
|
aaronmk
|
import xpath
|
21 |
1674
|
aaronmk
|
|
22 |
|
|
# Config

# Read timeout applied to the HTTP response stream, in seconds
timeout = 20
# Number of records requested per call unless overridden
default_chunk_size = 10000

# Schema location placed in the <structure> element of each request
schema = ('http://digir.net/schema/conceptual/darwin/full/2003/1.0/'
    'darwin2full.xsd')

# Search request skeleton. The bracketed placeholders ([time], [source],
# [resource], [url], [count], [start], [schema]) are substituted with
# str.replace() before the request is sent.
request_xml_template = '''\
<?xml version="1.0" encoding="UTF-8"?>
<request
xmlns="http://digir.net/schema/protocol/2003/1.0"
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:digir="http://digir.net/schema/protocol/2003/1.0"
xmlns:darwin="http://digir.net/schema/conceptual/darwin/2003/1.0"
xmlns:dwc="http://digir.net/schema/conceptual/darwin/2003/1.0"
xsi:schemaLocation="http://digir.net/schema/protocol/2003/1.0
http://digir.sourceforge.net/schema/protocol/2003/1.0/digir.xsd
http://digir.net/schema/conceptual/darwin/2003/1.0
http://digir.sourceforge.net/schema/conceptual/darwin/2003/1.0/darwin2.xsd">
<header>
<version>1.0</version>
<sendTime>[time]</sendTime>
<source>[source]</source>
<destination resource="[resource]">[url]</destination>
<type>search</type>
</header>
<search>
<filter>
<equals>
<darwin:Kingdom>plantae</darwin:Kingdom>
</equals>
</filter>
<records limit="[count]" start="[start]">
<structure schemaLocation="[schema]"/>
</records>
<count>true</count>
</search>
</request>
'''

# Delimiters of the diagnostics section captured from each response
diags_start = '<diagnostics>'
diags_end = '</diagnostics>'
|
63 |
|
|
|
64 |
1690
|
aaronmk
|
class InputError(Exception):
    """Signals that a server response lacks required data.

    Raised, for example, when the RECORD_COUNT diagnostic is missing.
    """
|
65 |
1685
|
aaronmk
|
|
66 |
1674
|
aaronmk
|
def main():
    """Fetch records from a DiGIR provider in chunks and copy them to stdout.

    Configuration is read through opts.get_env_var() / opts.env_flag():
        url         provider endpoint (required)
        resource    resource name placed in the request header (required)
        start       index of the first record to request (default 0)
        n           maximum number of records to fetch (default: no limit)
        chunk_size  records requested per call (default default_chunk_size)
        debug       when set, echo each request and its diagnostics to stderr

    Each response is copied to stdout with streams.copy() and followed by a
    newline; status and profiling messages are written to stderr.

    Raises:
        SystemExit: with a usage message if url or resource is not set.
        InputError: if a response contains no diagnostics section or no
            RECORD_COUNT diagnostic.
    """
    # Imported locally: only the request header substitution below needs it
    from xml.sax.saxutils import escape

    # Usage
    env_names = []  # handed to every opts.* lookup, then to opts.env_usage()

    def usage_err():
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
            +sys.argv[0]+' 2>>log')

    # Get config from env vars
    url = opts.get_env_var('url', None, env_names)
    resource = opts.get_env_var('resource', None, env_names)
    start = util.cast(int, opts.get_env_var('start', 0, env_names))
    count = util.cast(int, opts.get_env_var('n', None, env_names))
    chunk_size = util.cast(int, opts.get_env_var('chunk_size',
        default_chunk_size, env_names))
    debug = opts.env_flag('debug', False, env_names)
    if url is None or resource is None:
        usage_err()

    # Logging (stderr only, so stdout carries nothing but the responses)
    def log(msg, line_ending='\n'):
        sys.stderr.write(msg+line_ending)

    def debug_log(str_, label=None):
        if debug:
            if label is not None:
                sys.stderr.write(label+':\n')
            sys.stderr.write(str_+'\n')

    # Request XML
    # Use an absolute directory: dirname(__file__) is '' when the script is
    # started by bare file name, which would turn the helper path into
    # "/local_ip".
    self_dir = os.path.dirname(os.path.abspath(__file__))
    source = os.popen(self_dir+"/local_ip").read().strip()
    # url and resource are user-supplied and end up inside XML text and an
    # XML attribute respectively, so markup characters must be escaped.
    this_request_xml_template = (request_xml_template
        .replace('[source]', source)
        .replace('[url]', escape(url))
        .replace('[resource]', escape(resource, {'"': '&quot;'}))
        .replace('[schema]', schema)
        )

    # Stats
    total = 0

    def print_status(line_ending='\n'):
        # Reads the current value of total each time it is called
        log('Processed '+str(total)+' record(s)', line_ending)

    match_ct = None

    profiler = profiling.ItersProfiler(start_now=True, iter_text='record')

    # Retrieve data
    while count is None or total < count:
        # Adjust chunk size if last chunk
        this_count = chunk_size
        if count is not None:
            this_count = min(this_count, count - total)

        # Request XML
        send_time = dates.strftime('%Y-%m-%d %H:%M:%S %Z', dates.now())
        request_xml = (this_request_xml_template
            .replace('[count]', str(this_count))
            .replace('[start]', str(start))
            .replace('[time]', send_time)
            )
        debug_log(request_xml, 'request')

        # Send request
        this_url = url+'?'+urllib.urlencode({'request': request_xml})
        stream = streams.CaptureStream(streams.TimeoutInputStream(
            urllib2.urlopen(this_url), timeout), diags_start, diags_end)

        # Retrieve response; always release the connection, even if copying
        # fails part-way through
        try:
            streams.copy(stream, sys.stdout)
            # Make sure output ends in a newline so that consecutive XML
            # documents are on different lines
            sys.stdout.write('\n')
        finally:
            stream.close()

        # Parse diagnostics
        # NOTE(review): assumes stream.matches is an empty sequence when no
        # diagnostics section was captured -- confirm against CaptureStream
        if not stream.matches:
            raise InputError('Missing diagnostics')
        diags_str = stream.matches[0]
        debug_log(diags_str, 'diagnostics')
        diags = xml_parse.parse_str(diags_str)

        def get_diag(name):
            return xpath.get_value(diags, 'diagnostic[@code='+name+']')

        # Process match count
        this_match_ct = util.cast(int, get_diag('MATCH_COUNT'))
        if this_match_ct != match_ct:  # first or updated match count
            match_ct = this_match_ct
            log('Found '+str(match_ct)+' record(s)')

        # Process record count
        this_ct = util.cast(int, get_diag('RECORD_COUNT'))
        if this_ct is None:
            raise InputError('Missing RECORD_COUNT diagnostic')
        total += this_ct
        start += this_ct  # advance start to fetch next set
        print_status('\r')  # CR at end so next print overwrites msg

        # Decide if done
        if this_ct == 0 or get_diag('END_OF_RECORDS') == 'true':
            break

    print_status()
    profiler.stop(total)
    log(profiler.msg())
|
163 |
1674
|
aaronmk
|
|
164 |
|
|
# Run the client; this call is unconditional, so it executes whenever the file
# is run (or imported)
main()
|