Revision 4918

Added tnrs_client. Note that obtaining an actual CSV requires four (!) steps: submit, retrieve, prepare download, and download. The output of the retrieve step is unusable on its own because the returned array's length varies with the taxonomic ranks present in the provided taxon name. This initial version processes one name at a time, but it could later be extended to batch processing, since TNRS can run multiple names at once.
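The script takes its input from environment variables and writes the downloaded CSV to stdout. A typical invocation might look like the following (a sketch based on the script's usage message; the example taxon is made up, and the exact flag syntax accepted by opts.env_flag is an assumption):

    taxon='Poa annua' bin/tnrs_client >results.csv 2>>log
    taxon='Poa annua' debug=1 bin/tnrs_client >results.csv 2>>log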

View differences:

bin/tnrs_client

#!/usr/bin/env python
# A TNRS client
# Note that obtaining an actual CSV requires four (!) steps: submit, retrieve,
# prepare download, and download. The output of the retrieve step is unusable
# because the array has different lengths depending on the taxonomic ranks
# present in the provided taxon name.

import os.path
import re
import sys
import time
import urllib2

sys.path.append(os.path.dirname(__file__)+"/../lib")

import opts
import streams
#import util

# Config
initial_pause = 0.35 # sec
pause_growth_factor = 1.3
max_pause = 5 # sec
assert initial_pause <= max_pause
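# With these values, do_repeated_request (below) sleeps 0.35s before the first
# poll and grows each pause by 1.3x (0.455s, 0.59s, ...), giving up once the
# cumulative wait would exceed max_pause: at most six polls, ~4.5s total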

# Protocol params
url_base = 'http://tnrs.iplantcollaborative.org/tnrsdemo/'
url = url_base+'search'
initial_headers = {
    'Content-Type': 'text/x-gwt-rpc; charset=utf-8',
    'X-GWT-Module-Base': url_base,
    'X-GWT-Permutation': '574AA16D15D917C7704646FD92AFF6B3',
}
submission_request_template = ('7|0|7|'+url_base+
'||org.iplantc.tnrs.demo.client.SearchService|doSearch|\
java.lang.String/2004016611|{"sources":"gcc,tropicos,usda", "names":"[taxon]", \
"type":"matching", "taxonomic":"true", "classification":"tropicos", \
"match_to_rank":"true"}|0.05|1|2|3|4|2|5|5|6|7|')
submission_response_pattern = r'^//OK\[1,\["(\w+)"\],0,7\]$'
retrieval_request_template = ('7|0|15|'+url_base+
'|1E87C78041CEFBF0992F46BDF84D7D60|org.iplantc.tnrs.demo.client.SearchService\
|getRemoteData|com.extjs.gxt.ui.client.data.PagingLoadConfig|\
java.lang.String/2004016611|com.extjs.gxt.ui.client.data.BasePagingLoadConfig/\
2011366567|com.extjs.gxt.ui.client.data.RpcMap/3441186752|sortField|sortDir|\
com.extjs.gxt.ui.client.Style$SortDir/640452531|offset|java.lang.Integer/\
3438268394|limit|{"email":"tnrs@lka5jjs.orv", "key":"[key]", \
"taxonomic_constraint":"false", "source_sorting":"false", "first":"false"}\
|1|2|3|4|2|5|6|7|0|1|8|4|9|0|10|11|0|12|13|0|14|13|100|15|')
retrieval_response_pattern = '^//OK\[.*?\["com.extjs.gxt.ui.client.data.\
BasePagingLoadResult/496878394","java.util.ArrayList/4159755760","org.iplantc.\
tnrs.demo.shared.BeanTNRSEntry/1039545748",".*"\],0,7\]$'
retrieval_response_info_pattern = r'(?ms).*^Set-Cookie: JSESSIONID=(\w+);'
download_request_template = ('7|0|6|'+url_base+
'|1E87C78041CEFBF0992F46BDF84D7D60|org.iplantc.tnrs.demo.client.SearchService|\
downloadRemoteResults|java.lang.String/2004016611|{"name":"tnrs_results.txt", \
"mode":"Best", "type":"Detailed", "encoding":"utf8", "dirty":"false", \
"sources":"false", "taxonomic":"false", "email":"tnrs@lka5jjs.orv", \
"key":"[key]"}|1|2|3|4|1|5|6|')
download_response_pattern = '^//OK\[1,\["(.*)"\],0,7\]$'
download_url_suffix = '&name=tnrs_results.txt&encoding=utf8'
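# The request templates above are opaque GWT-RPC payloads (pipe-delimited
# string tables), apparently captured from the TNRS demo web UI. The [taxon]
# and [key] placeholders are filled in via str.replace before each request,
# and the //OK[...] responses are parsed with the regexes defined alongside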

# Replace each run of characters that could corrupt the pipe-delimited GWT
# payload with a space
def gwt_encode(str_): return re.sub(r'[^\w.() -]+', r' ', str_)

# Match a response against its expected pattern and return the captured
# groups, aborting with the full response text if it doesn't match
def parse_response(name, pattern, response):
    match = re.match(pattern, response)
    if not match: raise SystemExit('Invalid '+name+' response:\n'+response)
    return match.groups()

def main():
    # Usage
    env_names = []
    def usage_err():
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
            +sys.argv[0]+' 2>>log')
    
    # Get config from env vars
    taxon = opts.get_env_var('taxon', None, env_names)
    debug = opts.env_flag('debug', False, env_names)
    if taxon is None: usage_err()
    
    # Logging
    def debug_log(label, str_=''):
        if debug: sys.stderr.write('\n'+label+':\n'+str_+'\n')
    
    ## HTTP
    headers = initial_headers
    
    def do_request(request=None, url=url, headers=headers):
        debug_log('request', str(request))
        response = urllib2.urlopen(urllib2.Request(url, request, headers))
        response_str = streams.read_all(response)
        response_info = str(response.info())
        debug_log('response info', response_info)
        debug_log('response str', response_str)
        return response_str, response_info
    
    def do_repeated_request(request):
        pause = initial_pause
        total_pause = 0
        while True:
            total_pause += pause
            # the bare raise re-raises the last HTTPError from do_request,
            # which Python 2 keeps in scope after the except clause below
            if total_pause > max_pause: raise # error is not temporary
            debug_log('total_pause', str(total_pause)+'s')
            time.sleep(pause) # wait for job to complete
            
            try: return do_request(request)
            except urllib2.HTTPError: pass # try again
            pause *= pause_growth_factor

    debug_log('Submit')
    request = submission_request_template.replace('[taxon]', gwt_encode(taxon))
    response, response_info = do_request(request)
    key, = parse_response('submission', submission_response_pattern, response)
    debug_log('key', key)
    key_enc = gwt_encode(key)
    
    debug_log('Retrieve')
    request = retrieval_request_template.replace('[key]', key_enc)
    response, response_info = do_repeated_request(request)
    parse_response('retrieval', retrieval_response_pattern, response)
    session_id, = parse_response('retrieval info',
        retrieval_response_info_pattern, response_info)
    debug_log('session_id', session_id)
    headers['Cookie'] = 'JSESSIONID='+session_id
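    # Note: do_request's headers default argument is bound to this same dict,
    # so the JSESSIONID cookie set here is sent automatically on the remaining
    # GWT-RPC requests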

    debug_log('Prepare download')
    request = download_request_template.replace('[key]', key_enc)
    response, response_info = do_request(request)
    csv_url, = parse_response('download', download_response_pattern, response)
    csv_url += download_url_suffix
    debug_log('csv_url', csv_url)
    
    debug_log('Download')
    response, response_info = do_request(url=csv_url, headers={})
    sys.stdout.write(response)

main()
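
# Possible batch extension (untested sketch): TNRS can run multiple names at
# once, so the submit step could take a list of names instead of one taxon.
# Note that gwt_encode strips most punctuation (including newlines), so a
# batch version would also need to pass the chosen name delimiter through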
