Revision 4920
Added by Aaron Marcuse-Kubitza about 12 years ago
bin/tnrs_client | ||
---|---|---|
1 | 1 |
#!/usr/bin/env python |
2 | 2 |
# A TNRS client |
3 |
# When using xargs to pass many names, note that xargs will by default split its |
|
4 |
# arguments into chunks of 5000. You can change this using the -n option. |
|
3 | 5 |
# Note that obtaining an actual CSV requires four (!) steps: submit, retrieve, |
4 | 6 |
# prepare download, and download. The output of the retrieve step is unusable |
5 | 7 |
# because the array has different lengths depending on the taxonomic ranks |
... | ... | |
20 | 22 |
# Config |
21 | 23 |
initial_pause = 0.35 # sec |
22 | 24 |
pause_growth_factor = 1.3 |
23 |
max_pause = 5 # sec
|
|
25 |
max_pause = 600 # sec; = 10 min
|
|
24 | 26 |
assert initial_pause <= max_pause |
27 |
max_taxons = 5000# according to http://tnrs.iplantcollaborative.org/TNRSapp.html |
|
25 | 28 |
|
26 | 29 |
# Protocol params |
27 | 30 |
url_base = 'http://tnrs.iplantcollaborative.org/tnrsdemo/' |
... | ... | |
33 | 36 |
} |
34 | 37 |
submission_request_template = ('7|0|7|'+url_base+ |
35 | 38 |
'||org.iplantc.tnrs.demo.client.SearchService|doSearch|\ |
36 |
java.lang.String/2004016611|{"sources":"gcc,tropicos,usda", "names":"[taxon]", \
|
|
37 |
"type":"matching", "taxonomic":"true", "classification":"tropicos", \ |
|
39 |
java.lang.String/2004016611|{"sources":"gcc,tropicos,usda", "names":"[taxons]"\
|
|
40 |
, "type":"matching", "taxonomic":"true", "classification":"tropicos", \
|
|
38 | 41 |
"match_to_rank":"true"}|0.05|1|2|3|4|2|5|5|6|7|') |
39 | 42 |
submission_response_pattern = r'^//OK\[1,\["(\w+)"\],0,7\]$' |
40 | 43 |
retrieval_request_template = ('7|0|15|'+url_base+ |
... | ... | |
67 | 70 |
return match.groups() |
68 | 71 |
|
69 | 72 |
def main(): |
70 |
# Usage
|
|
73 |
# Input
|
|
71 | 74 |
env_names = [] |
72 |
def usage_err(): |
|
73 |
raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' ' |
|
74 |
+sys.argv[0]+' 2>>log') |
|
75 |
|
|
76 |
# Get config from env vars |
|
77 |
taxon = opts.get_env_var('taxon', None, env_names) |
|
78 | 75 |
debug = opts.env_flag('debug', False, env_names) |
79 |
if taxon == None: usage_err() |
|
76 |
taxons = sys.argv[1:] |
|
77 |
if not taxons: raise SystemExit('Usage: '+opts.env_usage(env_names, True) |
|
78 |
+' '+sys.argv[0]+' taxon... >out 2>>log') |
|
79 |
assert len(taxons) <= max_taxons |
|
80 | 80 |
|
81 | 81 |
# Logging |
82 | 82 |
def debug_log(label, str_=''): |
... | ... | |
108 | 108 |
pause *= pause_growth_factor |
109 | 109 |
|
110 | 110 |
debug_log('Submit') |
111 |
request = submission_request_template.replace('[taxon]', gwt_encode(taxon)) |
|
111 |
request = submission_request_template.replace('[taxons]', |
|
112 |
r'\\n'.join(map(gwt_encode, taxons))) # double-escape \n |
|
112 | 113 |
response, response_info = do_request(request) |
113 | 114 |
key, = parse_response('submission', submission_response_pattern, response) |
114 | 115 |
debug_log('key', key) |
Also available in: Unified diff
tnrs_client: Support parsing multiple taxons at once, by specifying each as a command-line argument. Increased max_pause to 10 min to support large batches. Limited the batch size to 5000 names, per the limit stated at <http://tnrs.iplantcollaborative.org/TNRSapp.html>. Note that when using xargs to pass many names, xargs will by default split its arguments into chunks of 5000; you can change this using the -n option.