Project

General

Profile

« Previous | Next » 

Revision 4920

tnrs_client: Support parsing multiple taxons at once, by specifying each as a command-line argument. Increased the max_pause to 10 min to support large batches. Limited the batch size to 5000 names, using the limit at <http://tnrs.iplantcollaborative.org/TNRSapp.html>. Note that when using xargs to pass many names, xargs will by default split its arguments into chunks of 5000. You can change this using the -n option.

View differences:

bin/tnrs_client
1 1
#!/usr/bin/env python
2 2
# A TNRS client
3
# When using xargs to pass many names, note that xargs will by default split its
4
# arguments into chunks of 5000. You can change this using the -n option.
3 5
# Note that obtaining an actual CSV requires four (!) steps: submit, retrieve,
4 6
# prepare download, and download. The output of the retrieve step is unusable
5 7
# because the array has different lengths depending on the taxonomic ranks
......
20 22
# Config
21 23
initial_pause = 0.35 # sec
22 24
pause_growth_factor = 1.3
23
max_pause = 5 # sec
25
max_pause = 600 # sec; = 10 min
24 26
assert initial_pause <= max_pause
27
max_taxons = 5000# according to http://tnrs.iplantcollaborative.org/TNRSapp.html
25 28

  
26 29
# Protocol params
27 30
url_base = 'http://tnrs.iplantcollaborative.org/tnrsdemo/'
......
33 36
}
34 37
submission_request_template = ('7|0|7|'+url_base+
35 38
'||org.iplantc.tnrs.demo.client.SearchService|doSearch|\
36
java.lang.String/2004016611|{"sources":"gcc,tropicos,usda", "names":"[taxon]", \
37
"type":"matching", "taxonomic":"true", "classification":"tropicos", \
39
java.lang.String/2004016611|{"sources":"gcc,tropicos,usda", "names":"[taxons]"\
40
, "type":"matching", "taxonomic":"true", "classification":"tropicos", \
38 41
"match_to_rank":"true"}|0.05|1|2|3|4|2|5|5|6|7|')
39 42
submission_response_pattern = r'^//OK\[1,\["(\w+)"\],0,7\]$'
40 43
retrieval_request_template = ('7|0|15|'+url_base+
......
67 70
    return match.groups()
68 71

  
69 72
def main():
70
    # Usage
73
    # Input
71 74
    env_names = []
72
    def usage_err():
73
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
74
            +sys.argv[0]+' 2>>log')
75
    
76
    # Get config from env vars
77
    taxon = opts.get_env_var('taxon', None, env_names)
78 75
    debug = opts.env_flag('debug', False, env_names)
79
    if taxon == None: usage_err()
76
    taxons = sys.argv[1:]
77
    if not taxons: raise SystemExit('Usage: '+opts.env_usage(env_names, True)
78
        +' '+sys.argv[0]+' taxon... >out 2>>log')
79
    assert len(taxons) <= max_taxons
80 80
    
81 81
    # Logging
82 82
    def debug_log(label, str_=''):
......
108 108
            pause *= pause_growth_factor
109 109
    
110 110
    debug_log('Submit')
111
    request = submission_request_template.replace('[taxon]', gwt_encode(taxon))
111
    request = submission_request_template.replace('[taxons]',
112
        r'\\n'.join(map(gwt_encode, taxons))) # double-escape \n
112 113
    response, response_info = do_request(request)
113 114
    key, = parse_response('submission', submission_response_pattern, response)
114 115
    debug_log('key', key)

Also available in: Unified diff