Revision 4918
Added by Aaron Marcuse-Kubitza about 12 years ago
bin/tnrs_client
#!/usr/bin/env python
# A TNRS client
# Note that obtaining an actual CSV requires four (!) steps: submit, retrieve,
# prepare download, and download. The output of the retrieve step is unusable
# because the array has different lengths depending on the taxonomic ranks
# present in the provided taxon name.

import os.path
import re
import sys
import time
import urllib2

sys.path.append(os.path.dirname(__file__)+"/../lib")

import opts
import streams
#import util

# Config
initial_pause = 0.35 # sec
pause_growth_factor = 1.3
max_pause = 5 # sec
assert initial_pause <= max_pause

# Protocol params
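# The request bodies below are raw GWT-RPC payloads for the tnrsdemo
# SearchService; the '[taxon]' and '[key]' markers are placeholders that
# main() substitutes before sending each request.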
url_base = 'http://tnrs.iplantcollaborative.org/tnrsdemo/'
url = url_base+'search'
initial_headers = {
    'Content-Type': 'text/x-gwt-rpc; charset=utf-8',
    'X-GWT-Module-Base': url_base,
    'X-GWT-Permutation': '574AA16D15D917C7704646FD92AFF6B3',
}
submission_request_template = ('7|0|7|'+url_base+
'||org.iplantc.tnrs.demo.client.SearchService|doSearch|\
java.lang.String/2004016611|{"sources":"gcc,tropicos,usda", "names":"[taxon]", \
"type":"matching", "taxonomic":"true", "classification":"tropicos", \
"match_to_rank":"true"}|0.05|1|2|3|4|2|5|5|6|7|')
submission_response_pattern = r'^//OK\[1,\["(\w+)"\],0,7\]$'
retrieval_request_template = ('7|0|15|'+url_base+
'|1E87C78041CEFBF0992F46BDF84D7D60|org.iplantc.tnrs.demo.client.SearchService\
|getRemoteData|com.extjs.gxt.ui.client.data.PagingLoadConfig|\
java.lang.String/2004016611|com.extjs.gxt.ui.client.data.BasePagingLoadConfig/\
2011366567|com.extjs.gxt.ui.client.data.RpcMap/3441186752|sortField|sortDir|\
com.extjs.gxt.ui.client.Style$SortDir/640452531|offset|java.lang.Integer/\
3438268394|limit|{"email":"tnrs@lka5jjs.orv", "key":"[key]", \
"taxonomic_constraint":"false", "source_sorting":"false", "first":"false"}\
|1|2|3|4|2|5|6|7|0|1|8|4|9|0|10|11|0|12|13|0|14|13|100|15|')
retrieval_response_pattern = '^//OK\[.*?\["com.extjs.gxt.ui.client.data.\
BasePagingLoadResult/496878394","java.util.ArrayList/4159755760","org.iplantc.\
tnrs.demo.shared.BeanTNRSEntry/1039545748",".*"\],0,7\]$'
retrieval_response_info_pattern = r'(?ms).*^Set-Cookie: JSESSIONID=(\w+);'
download_request_template = ('7|0|6|'+url_base+
'|1E87C78041CEFBF0992F46BDF84D7D60|org.iplantc.tnrs.demo.client.SearchService|\
downloadRemoteResults|java.lang.String/2004016611|{"name":"tnrs_results.txt", \
"mode":"Best", "type":"Detailed", "encoding":"utf8", "dirty":"false", \
"sources":"false", "taxonomic":"false", "email":"tnrs@lka5jjs.orv", \
"key":"[key]"}|1|2|3|4|1|5|6|')
download_response_pattern = '^//OK\[1,\["(.*)"\],0,7\]$'
download_url_suffix = '&name=tnrs_results.txt&encoding=utf8'

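# gwt_encode collapses any run of characters outside [\w.() -] into a single
# space, presumably so that raw '|' or quote characters cannot corrupt the
# pipe-delimited GWT-RPC payload.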
def gwt_encode(str_): return re.sub(r'[^\w.() -]+', r' ', str_)

def parse_response(name, pattern, response):
    match = re.match(pattern, response)
    if not match: raise SystemExit('Invalid '+name+' response:\n'+response)
    return match.groups()

def main():
    # Usage
    env_names = []
    def usage_err():
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
            +sys.argv[0]+' 2>>log')

    # Get config from env vars
    taxon = opts.get_env_var('taxon', None, env_names)
    debug = opts.env_flag('debug', False, env_names)
    if taxon == None: usage_err()

    # Logging
    def debug_log(label, str_=''):
        if debug: sys.stderr.write('\n'+label+':\n'+str_+'\n')

    ## HTTP
    headers = initial_headers

    def do_request(request=None, url=url, headers=headers):
        debug_log('request', str(request))
        response = urllib2.urlopen(urllib2.Request(url, request, headers))
        response_str = streams.read_all(response)
        response_info = str(response.info())
        debug_log('response info', response_info)
        debug_log('response str', response_str)
        return response_str, response_info

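    # do_repeated_request polls until the TNRS job completes: each attempt
    # sleeps `pause` seconds, growing by pause_growth_factor (0.35s, 0.455s,
    # ...); once the cumulative wait would exceed max_pause, the last
    # HTTPError is re-raised.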
    def do_repeated_request(request):
        pause = initial_pause
        total_pause = 0
        while True:
            total_pause += pause
            if total_pause > max_pause: raise # error is not temporary
            debug_log('total_pause', str(total_pause)+'s')
            time.sleep(pause) # wait for job to complete

            try: return do_request(request)
            except urllib2.HTTPError: pass # try again
            pause *= pause_growth_factor

    debug_log('Submit')
    request = submission_request_template.replace('[taxon]', gwt_encode(taxon))
    response, response_info = do_request(request)
    key, = parse_response('submission', submission_response_pattern, response)
    debug_log('key', key)
    key_enc = gwt_encode(key)

    debug_log('Retrieve')
    request = retrieval_request_template.replace('[key]', key_enc)
    response, response_info = do_repeated_request(request)
    parse_response('retrieval', retrieval_response_pattern, response)
    session_id, = parse_response('retrieval info',
        retrieval_response_info_pattern, response_info)
    debug_log('session_id', session_id)
    headers['Cookie'] = 'JSESSIONID='+session_id

    debug_log('Prepare download')
    request = download_request_template.replace('[key]', key_enc)
    response, response_info = do_request(request)
    csv_url, = parse_response('download', download_response_pattern, response)
    csv_url += download_url_suffix
    debug_log('csv_url', csv_url)

    debug_log('Download')
    response, response_info = do_request(url=csv_url, headers={})
    sys.stdout.write(response)

main()
Added tnrs_client. Note that obtaining an actual CSV requires four (!) steps: submit, retrieve, prepare download, and download. The output of the retrieve step is unusable because the array has different lengths depending on the taxonomic ranks present in the provided taxon name. This initial version runs one name at a time, but could later be expanded to batch process because TNRS can run multiple names at once.
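Since the client takes its input through the taxon environment variable (plus an optional debug flag) and writes the result CSV to stdout, processing several names today means invoking it once per name. Below is a minimal sketch of such a wrapper, assuming the script lives at bin/tnrs_client; the resolve_names helper is hypothetical and is not the batch mode described above, which would instead submit all names in a single TNRS job.

# Hypothetical wrapper (not part of tnrs_client): runs the client once per
# name by setting the 'taxon' environment variable and collects each result
# CSV from the client's stdout.
import os
import subprocess
import sys

def resolve_names(names, client='bin/tnrs_client'):
    csvs = []
    for name in names:
        env = dict(os.environ, taxon=name) # tnrs_client reads the name from $taxon
        csvs.append(subprocess.check_output([client], env=env))
    return csvs

if __name__ == '__main__':
    for csv in resolve_names(['Poa annua', 'Quercus alba']):
        sys.stdout.write(csv)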