#!/usr/bin/env python
# Scrubs the taxonlabels in VegBIEN using TNRS.

# runtime: 162 ms/name ("real 458m50.126s" for "169,539 name(s)" [1])
# [1] $ tail -c +12953807 ../inputs/.TNRS/tnrs/logs/tnrs.make.log.sql|head -15

# total runtime: 10 days ("Rows (counted) 5221748" (TNRS.tnrs @r9998)
# * 162 ms/name (above) * 1s/1000ms * 1h/3600s * 1day/24h = 9.79 days)

# to estimate total runtime:
# bin/psql_vegbien <<<'SELECT COUNT(*) FROM tnrs_input_name'
# (# of names from above) * 1.5 (multiplier for scrubbing accepted names:
# the test_taxonomic_names sample from Brad produces 8 accepted names for
# 15 input names)
# * ((# ms/name from log file * 1 sec/1000 ms) + (# sec to run
#   `SELECT * FROM "tnrs_input_name"` in log file / tnrs.max_names names/batch))
# * 1 hr / 3600 sec * 1 day / 24 hr = # days
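
# The estimate above, expressed as a small standalone helper. This is an
# illustrative sketch only (it is not called by this script): estimate_scrub_days()
# and its parameter names are additions, and in the example call below only the
# 162 ms/name figure and the 1.5 multiplier come from the notes above -- the
# input-name count, SELECT time, and batch size are made-up placeholders, so
# substitute the values from your own log file and tnrs.max_names.
def estimate_scrub_days(input_name_ct, ms_per_name, select_sec, names_per_batch,
    accepted_multiplier=1.5):
    '''Estimated total runtime in days, per the formula above'''
    total_names = input_name_ct*accepted_multiplier
    sec_per_name = (ms_per_name/1000.)+(float(select_sec)/names_per_batch)
    return total_names*sec_per_name/3600./24.
# e.g. estimate_scrub_days(1000000, 162, 50, 500) -> ~4.5 days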

    
import os.path
import sys

sys.path.append(os.path.dirname(__file__)+"/../lib")

import csvs
import opts
import profiling
import sql
import sql_gen
import sql_io
import strings
import tnrs

    
# tables used by this script
tnrs_input = sql_gen.Table('tnrs_input_name')
tnrs_batch = sql_gen.Table('batch')
tnrs_data = sql_gen.Table('tnrs')

    
def main():
    # Input
    env_names = []
    db_config = opts.get_env_vars(sql.db_config_names, None, env_names)
    verbosity = float(opts.get_env_var('verbosity', 3, env_names))
    if 'engine' not in db_config: raise SystemExit('Usage: '
        +opts.env_usage(env_names)+' '+sys.argv[0]+' 2>>log')
    
    def log(msg, level=1):
        '''Higher level -> more verbose'''
        if level <= verbosity:
            sys.stderr.write(strings.to_raw_str(msg.rstrip('\n')+'\n'))

    # Connect to DB
    db = sql.connect(db_config, log_debug=log)

    cumulative_tnrs_profiler = profiling.ItersProfiler(iter_text='name')

    # Iterate over unscrubbed verbatim taxonlabels
    while True:
        # Fetch the next batch of input names
        cur = sql.select(db, tnrs_input, limit=tnrs.max_names, cacheable=False)
        this_ct = cur.rowcount
        log('Processing '+str(this_ct)+' taxonlabels')
        if this_ct == 0: break
        # otherwise, rows found
        names = list(sql.values(cur))

        def process():
            # Run TNRS
            log('Making TNRS request')
            stream = tnrs.tnrs_request(names,
                cumulative_profiler=cumulative_tnrs_profiler)

            log('Storing TNRS response data')
            sql.insert(db, tnrs_batch, []) # time_submitted is autopopulated
            sql_io.append_csv(db, tnrs_data, *csvs.reader_and_header(stream))
        # start the transaction *before* submitting the data, so that
        # time_submitted is set to the submission time rather than the insertion
        # time. these may differ by several minutes if TNRS is slow.
        sql.with_savepoint(db, process)

main()