/bin/tnrs_db - BIEN 3 - NCEAS Projects

root/bin/tnrs_db @ 5097

       #!/usr/bin/env python
       # Scrubs the taxonpaths in VegBIEN using TNRS.
       # Runs continuously until no new rows are added after max_pause.
       import os.path
       import StringIO
       import sys
       import time
       sys.path.append(os.path.dirname(__file__)+"/../lib")
       import csvs
       import opts
       import sql
       import sql_gen
       import sql_io
       import streams
       import strings
       import tnrs
       # Config
       # How long to sleep between fetches when no unscrubbed rows were found
       pause = 60 # sec
       # Total time to keep waiting for new rows before main() gives up
       max_pause = 2*60*60 # sec; = 2 hr; must be >= max import time of one partition
       # Otherwise the very first empty fetch would already exceed max_pause
       assert pause <= max_pause
       # Max # rows fetched (and sent to TNRS) per iteration
       max_taxons = 500 # less than the limit to avoid slowing down the TNRS server
       # Table that the TNRS response CSV is appended to
       tnrs_data = sql_gen.Table('tnrs')
       def main():
           '''Fetches taxonpaths in sets of max_taxons, runs each set through TNRS,
           and appends the TNRS response CSV to the tnrs_data table.
           When a fetch returns no rows, sleeps `pause` sec and retries; returns once
           the accumulated wait exceeds max_pause.
           Config is obtained via opts.get_env_vars()/opts.get_env_var().
           @raise SystemExit With a usage message if db_config has no 'engine'
           '''
           # Input
           env_names = []
           db_config = opts.get_env_vars(sql.db_config_names, None, env_names)
           verbosity = float(opts.get_env_var('verbosity', 3, env_names))
           if 'engine' not in db_config: raise SystemExit('Usage: '
               +opts.env_usage(env_names)+' '+sys.argv[0]+' 2>>log')
           def log(msg, level=1):
               '''Writes msg (with exactly one trailing newline) to stderr.
               Higher level -> more verbose'''
               if level <= verbosity:
                   sys.stderr.write(strings.to_raw_str(msg.rstrip('\n')+'\n'))
           # Connect to DB
           db = sql.connect(db_config, log_debug=log)
           # Iterate over unscrubbed verbatim taxonpaths
           # NOTE(review): start is passed as the offset of a query filtered on
           # canon_taxonpath_id; if storing the TNRS data changes which rows match
           # that filter, advancing the offset could skip rows -- confirm
           start = 0
           total_pause = 0
           while True:
               # Fetch next set
               cur = sql.select(db, 'taxonpath', ['taxonomicnamewithauthor'],
                   [('canon_taxonpath_id', None)], limit=max_taxons, start=start,
                   cacheable=False)
               this_ct = cur.rowcount
               start += this_ct # advance start to fetch next set
               if this_ct == 0:
                   total_pause += pause
                   if total_pause > max_pause: break
                   log('Waited '+str(total_pause)+' sec. Waiting...')
                   time.sleep(pause) # wait for more rows
                   continue # try again
               # otherwise, rows found
               total_pause = 0 # only consecutive empty fetches count toward max_pause
               # Run TNRS
               log('Making TNRS request')
               try: stream = tnrs.repeated_tnrs_request(list(sql.values(cur)))
               # The exception must be qualified with its module: a bare
               # InvalidResponse is not bound here and would raise NameError instead
               # of skipping the set. Assumes the class is defined in tnrs -- TODO
               # confirm
               except tnrs.InvalidResponse:
                   # skip this set in case it caused the error
                   log('Invalid TNRS response; skipping this set')
               else:
                   log('Storing TNRS response data')
                   # stream_info is computed from the raw stream before it is wrapped
                   stream_info = csvs.stream_info(stream, parse_header=True)
                   stream = streams.ProgressInputStream(stream, sys.stderr, n=1000)
                   sql_io.append_csv(db, tnrs_data, stream_info, stream)
       # Script entry point: runs unconditionally when this file is executed
       main()

(56-56/61)

Project

General

Profile