Project

General

Profile

1
#!/usr/bin/env python
2
# Scrubs the taxonlabels in VegBIEN using TNRS.
3

    
4
import os.path
5
import StringIO
6
import sys
7
import time
8

    
9
sys.path.append(os.path.dirname(__file__)+"/../lib")
10

    
11
import csvs
12
import dates
13
import opts
14
import profiling
15
import sql
16
import sql_gen
17
import sql_io
18
import streams
19
import strings
20
import tnrs
21

    
22
# Table that main() selects each batch of names to scrub from
tnrs_input = sql_gen.Table('tnrs_input_name')
23
# Table that main() appends the TNRS response CSV to
tnrs_data = sql_gen.Table('tnrs')
24

    
25
def main():
    '''Scrubs taxonlabels batch by batch until tnrs_input yields no more rows.
    
    Settings are read through opts.get_env_vars()/opts.get_env_var(): the DB
    config (sql.db_config_names) and `verbosity` (default 3).
    Raises SystemExit with a usage message if the DB config has no 'engine'.
    '''
    # Settings
    env_names = []
    db_config = opts.get_env_vars(sql.db_config_names, None, env_names)
    verbosity = float(opts.get_env_var('verbosity', 3, env_names))
    if 'engine' not in db_config:
        usage = 'Usage: '+opts.env_usage(env_names)+' '+sys.argv[0]+' 2>>log'
        raise SystemExit(usage)
    
    def log(msg, level=1):
        '''Writes msg to stderr with exactly one trailing newline, but only
        when level <= verbosity (higher level -> more verbose)'''
        if level <= verbosity:
            line = msg.rstrip('\n')+'\n'
            sys.stderr.write(strings.to_raw_str(line))
    
    # DB connection; its debug output goes through log()
    db = sql.connect(db_config, log_debug=log)
    
    tnrs_profiler = profiling.ItersProfiler(iter_text='name')
    
    def scrub_batch(names):
        '''Submits one batch of names to TNRS and stores the response'''
        log('Making TNRS request')
        tnrs_profiler.start()
        try: stream = tnrs.tnrs_request(names)
        finally:
            # stop the profiler and report even if the request raised
            tnrs_profiler.stop(iter_ct=len(names))
            log('Cumulatively: '+tnrs_profiler.msg())
        
        log('Storing TNRS response data')
        csv_args = csvs.reader_and_header(stream)
        sql_io.append_csv(db, tnrs_data, *csv_args)
    
    # Keep fetching unscrubbed verbatim taxonlabels until a fetch comes back
    # empty. NOTE(review): this presumably relies on stored responses removing
    # names from tnrs_input -- confirm.
    while True:
        batch = sql.select(db, tnrs_input, limit=tnrs.max_names,
            cacheable=False)
        batch_size = batch.rowcount
        log('Processing '+str(batch_size)+' taxonlabels')
        if batch_size == 0: break
        # otherwise, rows found
        names = list(sql.values(batch))
        
        def process(): scrub_batch(names)
        # start transaction *before* submitting data, so Time_submitted is
        # correctly set to the submission time rather than the insertion time.
        # these may differ by several minutes if TNRS is slow.
        sql.with_savepoint(db, process)
68

    
69
# Called unconditionally at module level (no __name__ guard), so this also runs
# if the file is imported
main()
(73-73/81)