Project

General

Profile

1 5079 aaronmk
#!/usr/bin/env python
2 5640 aaronmk
# Scrubs the taxonlabels in VegBIEN using TNRS.
3 5079 aaronmk
4 9998 aaronmk
# runtime: 162 ms/name ("real 458m50.126s" for "169,539 name(s)" [1])
5
# [1] $ tail -c +12953807 ../inputs/.TNRS/tnrs/logs/tnrs.make.log.sql|head -15
6
7 9530 aaronmk
# to estimate total runtime:
8
# bin/psql_vegbien <<<'SELECT COUNT(*) FROM tnrs_input_name'
9
# # names from above * 1.5 multiplier for scrubbing accepted names
10
# (the test_taxonomic_names sample from Brad produces 8 accepted names for
11
# 15 input names)
12
# * ((# ms/name from log file * 1 sec/1000 ms) + (# sec to run
13
#   `SELECT * FROM "tnrs_input_name"` in log file / tnrs.max_names names/batch))
14
# * 1 hr / 3600 sec * 1 day / 24 hr = # days
15
16 5079 aaronmk
import os.path
17
import sys
18
19
sys.path.append(os.path.dirname(__file__)+"/../lib")
20
21
import csvs
22
import opts
23 5098 aaronmk
import profiling
24 5079 aaronmk
import sql
25
import sql_gen
26
import sql_io
27
import strings
28
import tnrs
29
30 5669 aaronmk
# Table supplying the names to scrub; main() selects batches of rows from it
tnrs_input = sql_gen.Table('tnrs_input_name')
31 5079 aaronmk
# Table that main() appends the TNRS response CSV data to
tnrs_data = sql_gen.Table('tnrs')
32
33
def main():
    '''Scrubs the names in the tnrs_input table using TNRS, one batch at a time.
    
    Configuration is read from environment variables: the DB connection
    settings named by sql.db_config_names, plus `verbosity` (default 3).
    Exits with a usage message if no DB `engine` was provided.
    
    Each batch of up to tnrs.max_names names is sent to TNRS and the response
    is appended to the tnrs_data table. The loop ends when the select on
    tnrs_input returns no rows (presumably tnrs_input_name only lists names
    that have not been scrubbed yet -- verify against the schema).
    '''
    # Input
    env_names = []
    db_config = opts.get_env_vars(sql.db_config_names, None, env_names)
    verbosity = float(opts.get_env_var('verbosity', 3, env_names))
    if 'engine' not in db_config:
        usage = ('Usage: '+opts.env_usage(env_names)+' '+sys.argv[0]
            +' 2>>log')
        raise SystemExit(usage)
    
    def log(msg, level=1):
        '''Writes msg to stderr if level is within the configured verbosity.
        Higher level -> more verbose'''
        if level > verbosity:
            return
        line = msg.rstrip('\n')+'\n'
        sys.stderr.write(strings.to_raw_str(line))
    
    # Connect to DB
    db = sql.connect(db_config, log_debug=log)
    
    # shared across all batches so the per-name timing is cumulative
    tnrs_profiler = profiling.ItersProfiler(iter_text='name')
    
    def scrub_batch(batch):
        '''Sends one batch of names to TNRS and stores the response'''
        log('Making TNRS request')
        response = tnrs.tnrs_request(batch, cumulative_profiler=tnrs_profiler)
        
        log('Storing TNRS response data')
        reader, header = csvs.reader_and_header(response)
        sql_io.append_csv(db, tnrs_data, reader, header)
    
    # Iterate over unscrubbed verbatim taxonlabels
    while True:
        # Fetch next set
        cursor = sql.select(db, tnrs_input, limit=tnrs.max_names,
            cacheable=False)
        batch_size = cursor.rowcount
        log('Processing '+str(batch_size)+' taxonlabels')
        if batch_size == 0:
            break # nothing left to scrub
        
        names = list(sql.values(cursor))
        
        def process():
            scrub_batch(names)
        
        # start transaction *before* submitting data, so Time_submitted is
        # correctly set to the submission time rather than the insertion time.
        # these may differ by several minutes if TNRS is slow.
        sql.with_savepoint(db, process)
73 5079 aaronmk
74
# NOTE: runs unconditionally, i.e. also if this file is imported as a module
main()