Project

General

Profile

1
#!/usr/bin/env python
2
# Scrubs the taxonlabels in VegBIEN using TNRS.
3

    
4
# to estimate total runtime:
5
# bin/psql_vegbien <<<'SELECT COUNT(*) FROM tnrs_input_name'
6
# (number of names from above) * 1.5 multiplier for scrubbing accepted names
7
# (the test_taxonomic_names sample from Brad produces 8 accepted names for
8
# 15 input names)
9
# * ((# ms/name from log file * 1 sec/1000 ms) + (# sec to run
10
#   `SELECT * FROM "tnrs_input_name"` in log file / tnrs.max_names names/batch))
11
# * 1 hr / 3600 sec * 1 day / 24 hr = # days
12

    
13
import os.path
14
import sys
15

    
16
sys.path.append(os.path.dirname(__file__)+"/../lib")
17

    
18
import csvs
19
import opts
20
import profiling
21
import sql
22
import sql_gen
23
import sql_io
24
import strings
25
import tnrs
26

    
27
# Table that main() selects each batch of names from
tnrs_input = sql_gen.Table('tnrs_input_name')
28
# Table that main() appends the TNRS CSV response rows to
tnrs_data = sql_gen.Table('tnrs')
29

    
30
def main():
    '''Run the names in tnrs_input through TNRS and store the responses.

    Reads the DB settings and the verbosity via the opts env-var helpers and
    exits with a usage message if no 'engine' setting is present. Then
    repeatedly selects up to tnrs.max_names rows from tnrs_input, submits them
    to TNRS, and appends the returned CSV to tnrs_data, stopping as soon as a
    select returns no rows.

    NOTE(review): termination relies on stored results removing names from
    tnrs_input_name (presumably a view of unscrubbed names) -- confirm.
    '''
    # Configuration
    # env_names is handed to each lookup and later to env_usage() for the
    # usage message
    env_names = []
    db_config = opts.get_env_vars(sql.db_config_names, None, env_names)
    verbosity = float(opts.get_env_var('verbosity', 3, env_names))
    if 'engine' not in db_config:
        usage = 'Usage: '+opts.env_usage(env_names)+' '+sys.argv[0]+' 2>>log'
        raise SystemExit(usage)
    
    def log(msg, level=1):
        '''Write msg to stderr when level is within the verbosity setting
        (higher level -> more verbose)'''
        if level <= verbosity:
            line = msg.rstrip('\n')+'\n' # exactly one trailing newline
            sys.stderr.write(strings.to_raw_str(line))
    
    # Database connection
    db = sql.connect(db_config, log_debug=log)
    
    # A single profiler is shared by every TNRS request made in this run
    tnrs_profiler = profiling.ItersProfiler(iter_text='name')
    
    # Work through the input names one batch at a time
    while True:
        # Next batch
        cursor = sql.select(db, tnrs_input, limit=tnrs.max_names,
            cacheable=False)
        batch_size = cursor.rowcount
        log('Processing '+str(batch_size)+' taxonlabels')
        if batch_size == 0: break # nothing left to scrub
        
        names = list(sql.values(cursor))
        
        def process():
            # Submit the batch to TNRS
            log('Making TNRS request')
            response = tnrs.tnrs_request(names,
                cumulative_profiler=tnrs_profiler)
            
            # Save the CSV that came back
            log('Storing TNRS response data')
            reader_and_header = csvs.reader_and_header(response)
            sql_io.append_csv(db, tnrs_data, *reader_and_header)
        
        # The transaction is started *before* the data is submitted, so that
        # Time_submitted reflects the submission time rather than the insertion
        # time. The two can be several minutes apart when TNRS is slow.
        sql.with_savepoint(db, process)
70

    
71
# Only run when executed as a script, so that importing this module (e.g. for
# inspection or testing) does not start a scrubbing run against the DB
if __name__ == '__main__':
    main()
(73-73/81)