Revision 4918
Added by Aaron Marcuse-Kubitza about 12 years ago
bin/tnrs_client
#!/usr/bin/env python
# A TNRS client
# Note that obtaining an actual CSV requires four (!) steps: submit, retrieve,
# prepare download, and download. The output of the retrieve step is unusable
# because the array has different lengths depending on the taxonomic ranks
# present in the provided taxon name.

import os.path
import re
import sys
import time
import urllib2

sys.path.append(os.path.dirname(__file__)+"/../lib")

import opts
import streams
#import util

# Config
initial_pause = 0.35 # sec
pause_growth_factor = 1.3
max_pause = 5 # sec
assert initial_pause <= max_pause

# Protocol params
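# The request bodies below are raw GWT-RPC payloads for the tnrsdemo
# SearchService; the '[taxon]' and '[key]' markers are placeholders that
# main() substitutes before sending each request.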
url_base = 'http://tnrs.iplantcollaborative.org/tnrsdemo/'
url = url_base+'search'
initial_headers = {
    'Content-Type': 'text/x-gwt-rpc; charset=utf-8',
    'X-GWT-Module-Base': url_base,
    'X-GWT-Permutation': '574AA16D15D917C7704646FD92AFF6B3',
}
submission_request_template = ('7|0|7|'+url_base+
'||org.iplantc.tnrs.demo.client.SearchService|doSearch|\
java.lang.String/2004016611|{"sources":"gcc,tropicos,usda", "names":"[taxon]", \
"type":"matching", "taxonomic":"true", "classification":"tropicos", \
"match_to_rank":"true"}|0.05|1|2|3|4|2|5|5|6|7|')
submission_response_pattern = r'^//OK\[1,\["(\w+)"\],0,7\]$'
retrieval_request_template = ('7|0|15|'+url_base+
'|1E87C78041CEFBF0992F46BDF84D7D60|org.iplantc.tnrs.demo.client.SearchService\
|getRemoteData|com.extjs.gxt.ui.client.data.PagingLoadConfig|\
java.lang.String/2004016611|com.extjs.gxt.ui.client.data.BasePagingLoadConfig/\
2011366567|com.extjs.gxt.ui.client.data.RpcMap/3441186752|sortField|sortDir|\
com.extjs.gxt.ui.client.Style$SortDir/640452531|offset|java.lang.Integer/\
3438268394|limit|{"email":"tnrs@lka5jjs.orv", "key":"[key]", \
"taxonomic_constraint":"false", "source_sorting":"false", "first":"false"}\
|1|2|3|4|2|5|6|7|0|1|8|4|9|0|10|11|0|12|13|0|14|13|100|15|')
retrieval_response_pattern = '^//OK\[.*?\["com.extjs.gxt.ui.client.data.\
BasePagingLoadResult/496878394","java.util.ArrayList/4159755760","org.iplantc.\
tnrs.demo.shared.BeanTNRSEntry/1039545748",".*"\],0,7\]$'
retrieval_response_info_pattern = r'(?ms).*^Set-Cookie: JSESSIONID=(\w+);'
download_request_template = ('7|0|6|'+url_base+
'|1E87C78041CEFBF0992F46BDF84D7D60|org.iplantc.tnrs.demo.client.SearchService|\
downloadRemoteResults|java.lang.String/2004016611|{"name":"tnrs_results.txt", \
"mode":"Best", "type":"Detailed", "encoding":"utf8", "dirty":"false", \
"sources":"false", "taxonomic":"false", "email":"tnrs@lka5jjs.orv", \
"key":"[key]"}|1|2|3|4|1|5|6|')
download_response_pattern = '^//OK\[1,\["(.*)"\],0,7\]$'
download_url_suffix = '&name=tnrs_results.txt&encoding=utf8'

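# gwt_encode collapses any run of characters outside [\w.() -] into a single
# space, presumably so that raw '|' or quote characters cannot corrupt the
# pipe-delimited GWT-RPC payload.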
def gwt_encode(str_): return re.sub(r'[^\w.() -]+', r' ', str_)

def parse_response(name, pattern, response):
    match = re.match(pattern, response)
    if not match: raise SystemExit('Invalid '+name+' response:\n'+response)
    return match.groups()

def main():
    # Usage
    env_names = []
    def usage_err():
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
            +sys.argv[0]+' 2>>log')

    # Get config from env vars
    taxon = opts.get_env_var('taxon', None, env_names)
    debug = opts.env_flag('debug', False, env_names)
    if taxon == None: usage_err()

    # Logging
    def debug_log(label, str_=''):
        if debug: sys.stderr.write('\n'+label+':\n'+str_+'\n')

    ## HTTP
    headers = initial_headers

    def do_request(request=None, url=url, headers=headers):
        debug_log('request', str(request))
        response = urllib2.urlopen(urllib2.Request(url, request, headers))
        response_str = streams.read_all(response)
        response_info = str(response.info())
        debug_log('response info', response_info)
        debug_log('response str', response_str)
        return response_str, response_info

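    # do_repeated_request polls until the TNRS job completes: each attempt
    # sleeps `pause` seconds, growing by pause_growth_factor (0.35s, 0.455s,
    # ...); once the cumulative wait would exceed max_pause, the last
    # HTTPError is re-raised.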
    def do_repeated_request(request):
        pause = initial_pause
        total_pause = 0
        while True:
            total_pause += pause
            if total_pause > max_pause: raise # error is not temporary
            debug_log('total_pause', str(total_pause)+'s')
            time.sleep(pause) # wait for job to complete

            try: return do_request(request)
            except urllib2.HTTPError: pass # try again
            pause *= pause_growth_factor

    debug_log('Submit')
    request = submission_request_template.replace('[taxon]', gwt_encode(taxon))
    response, response_info = do_request(request)
    key, = parse_response('submission', submission_response_pattern, response)
    debug_log('key', key)
    key_enc = gwt_encode(key)

    debug_log('Retrieve')
    request = retrieval_request_template.replace('[key]', key_enc)
    response, response_info = do_repeated_request(request)
    parse_response('retrieval', retrieval_response_pattern, response)
    session_id, = parse_response('retrieval info',
        retrieval_response_info_pattern, response_info)
    debug_log('session_id', session_id)
    headers['Cookie'] = 'JSESSIONID='+session_id

    debug_log('Prepare download')
    request = download_request_template.replace('[key]', key_enc)
    response, response_info = do_request(request)
    csv_url, = parse_response('download', download_response_pattern, response)
    csv_url += download_url_suffix
    debug_log('csv_url', csv_url)

    debug_log('Download')
    response, response_info = do_request(url=csv_url, headers={})
    sys.stdout.write(response)

main()
Added tnrs_client. Note that obtaining an actual CSV requires four (!) steps: submit, retrieve, prepare download, and download. The output of the retrieve step is unusable because the array has different lengths depending on the taxonomic ranks present in the provided taxon name. This initial version runs one name at a time, but could later be expanded to batch process because TNRS can run multiple names at once.
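Since the client takes its input through the taxon environment variable (plus an optional debug flag) and writes the result CSV to stdout, processing several names today means invoking it once per name. Below is a minimal sketch of such a wrapper, assuming the script lives at bin/tnrs_client; the resolve_names helper is hypothetical and is not the batch mode described above, which would instead submit all names in a single TNRS job.

# Hypothetical wrapper (not part of tnrs_client): runs the client once per
# name by setting the 'taxon' environment variable and collects each result
# CSV from the client's stdout.
import os
import subprocess
import sys

def resolve_names(names, client='bin/tnrs_client'):
    csvs = []
    for name in names:
        env = dict(os.environ, taxon=name) # tnrs_client reads the name from $taxon
        csvs.append(subprocess.check_output([client], env=env))
    return csvs

if __name__ == '__main__':
    for csv in resolve_names(['Poa annua', 'Quercus alba']):
        sys.stdout.write(csv)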