Project

General

Profile

1 5079 aaronmk
#!/usr/bin/env python
2 5640 aaronmk
# Scrubs the taxonlabels in VegBIEN using TNRS.
3 5079 aaronmk
4
import os.path
5
import StringIO
6
import sys
7
import time
8
9
# Add the ../lib directory (relative to this script's own directory) to the
# module search path. Presumably the project modules imported below live
# there -- confirm against the repository layout.
sys.path.append(os.path.dirname(__file__)+"/../lib")
10
11
import csvs
12 5737 aaronmk
import dates
13 5079 aaronmk
import opts
14 5098 aaronmk
import profiling
15 5079 aaronmk
import sql
16
import sql_gen
17
import sql_io
18
import streams
19
import strings
20
import tnrs
21
22 5669 aaronmk
# Table handle for 'tnrs_input_name'; main() selects each batch of names to
# scrub from it.
tnrs_input = sql_gen.Table('tnrs_input_name')
23 5079 aaronmk
# Table handle for 'tnrs'; main() passes it to sql_io.append_csv() as the
# destination for the TNRS response data.
tnrs_data = sql_gen.Table('tnrs')
24
25
def main():
    '''Scrubs taxonlabels in batches.
    Repeatedly selects up to tnrs.max_names rows from tnrs_input, submits them
    to TNRS, and hands the response CSV to sql_io.append_csv() for tnrs_data.
    Stops as soon as a select returns no rows.
    Configuration is read from environment variables; exits with a usage
    message if the DB config has no 'engine' entry.
    '''
    # Input
    env_names = []
    db_config = opts.get_env_vars(sql.db_config_names, None, env_names)
    verbosity = float(opts.get_env_var('verbosity', 3, env_names))
    if 'engine' not in db_config:
        usage = ('Usage: '+opts.env_usage(env_names)+' '+sys.argv[0]
            +' 2>>log')
        raise SystemExit(usage)

    def log(msg, level=1):
        '''Writes msg to stderr, newline-terminated, when level does not
        exceed the configured verbosity (higher level -> more verbose)'''
        if level <= verbosity:
            line = msg.rstrip('\n')+'\n'
            sys.stderr.write(strings.to_raw_str(line))

    # Connect to DB
    db = sql.connect(db_config, log_debug=log)

    profiler = profiling.ItersProfiler(iter_text='name')

    # Iterate over unscrubbed verbatim taxonlabels
    while True:
        # Fetch next set
        cursor = sql.select(db, tnrs_input, limit=tnrs.max_names,
            cacheable=False)
        batch_size = cursor.rowcount
        log('Processing '+str(batch_size)+' taxonlabels')
        if batch_size == 0:
            break
        # otherwise, rows found
        names = list(sql.values(cursor))

        def process():
            '''Submits the current batch to TNRS and stores the response'''
            # Run TNRS
            log('Making TNRS request')
            profiler.start()
            try:
                stream = tnrs.repeated_tnrs_request(names)
            finally:
                # record the timing even if the request failed
                profiler.stop(iter_ct=batch_size)
                log('Cumulatively: '+profiler.msg())

            log('Storing TNRS response data')
            reader_and_header = csvs.reader_and_header(stream)
            sql_io.append_csv(db, tnrs_data, *reader_and_header)

        # start transaction *before* submitting data, so Time_submitted is
        # correctly set to the submission time rather than the insertion time.
        # these may differ by several minutes if TNRS is slow.
        sql.with_savepoint(db, process)
68 5079 aaronmk
69
# Script entry point. There is no __main__ guard, so this also runs if the file
# is imported as a module.
main()