Revision 5079
Added by Aaron Marcuse-Kubitza over 12 years ago
bin/tnrs_db | ||
---|---|---|
1 |
#!/usr/bin/env python |
|
2 |
# Scrubs the taxonpaths in VegBIEN using TNRS. |
|
3 |
# Runs continuously until no new rows are added after max_pause. |
|
4 |
|
|
5 |
import os.path |
|
6 |
import StringIO |
|
7 |
import sys |
|
8 |
import time |
|
9 |
|
|
10 |
sys.path.append(os.path.dirname(__file__)+"/../lib") |
|
11 |
|
|
12 |
import csvs |
|
13 |
import opts |
|
14 |
import sql |
|
15 |
import sql_gen |
|
16 |
import sql_io |
|
17 |
import streams |
|
18 |
import strings |
|
19 |
import tnrs |
|
20 |
|
|
21 |
# Config
# Table that main() appends the TNRS output to
tnrs_data = sql_gen.Table('tnrs')

# Polling intervals, in seconds
pause = 60  # sec; how long main() sleeps after a fetch that returns no rows
max_pause = 2 * 60 * 60  # sec; = 2 hr; must be >= max import time of one partition
assert pause <= max_pause
|
27 |
|
|
28 |
def main():
    '''Send unscrubbed taxonpaths to TNRS and store the results.

    Repeatedly selects up to tnrs.max_taxons `taxonomicnamewithauthor` values
    from the `taxonpath` table, using the condition
    ('canon_taxonpath_id', None) (presumably "IS NULL" -- confirm against
    sql.select()), passes them to tnrs.tnrs_request(), and appends the
    returned stream to the `tnrs` table via sql_io.append_csv().

    When a fetch returns no rows, sleeps `pause` seconds and tries again.
    Returns once the accumulated wait exceeds `max_pause`; any fetch that
    does return rows resets the accumulated wait to 0.

    Raises SystemExit with a usage message if the DB config has no 'engine'.
    '''
    # Input
    # env_names is handed to every opts lookup and then to opts.env_usage(),
    # presumably so the usage message can list the variables -- confirm in opts
    env_names = []
    db_config = opts.get_env_vars(sql.db_config_names, None, env_names)
    verbosity = float(opts.get_env_var('verbosity', 3, env_names))
    if 'engine' not in db_config:
        usage = ('Usage: '+opts.env_usage(env_names)+' '+sys.argv[0]
            +' 2>>log')
        raise SystemExit(usage)

    def log(msg, level=1):
        '''Write msg to stderr if level <= verbosity (higher = more verbose)'''
        if level <= verbosity:
            # Ensure exactly one trailing newline
            line = msg.rstrip('\n')+'\n'
            sys.stderr.write(strings.to_raw_str(line))

    # Connect to DB
    db = sql.connect(db_config, log_debug=log)

    # Iterate over unscrubbed verbatim taxonpaths
    offset = 0  # passed as start= to the next fetch
    waited = 0  # sec accumulated over consecutive empty fetches
    while True:
        # Fetch next set
        cursor = sql.select(db, 'taxonpath', ['taxonomicnamewithauthor'],
            [('canon_taxonpath_id', None)], limit=tnrs.max_taxons,
            start=offset, cacheable=False)
        fetched = cursor.rowcount
        offset += fetched  # advance past the rows just fetched

        if fetched == 0:
            # Nothing new: give up after max_pause, otherwise wait and retry
            waited += pause
            if waited > max_pause:
                break
            log('Waited %s sec. Waiting...' % (waited,))
            time.sleep(pause)  # wait for more rows
            continue

        # Rows found, so restart the wait budget
        waited = 0

        # Run TNRS
        names = list(sql.values(cursor))
        tnrs_stream = tnrs.tnrs_request(names)
        # Header is parsed from the raw stream before it is wrapped below
        header_info = csvs.stream_info(tnrs_stream, parse_header=True)
        progress_stream = streams.ProgressInputStream(tnrs_stream, sys.stderr,
            n=1000)
        sql_io.append_csv(db, tnrs_data, header_info, progress_stream)
|
68 |
|
|
69 |
# Run the scrubbing loop only when this file is executed as a script, so that
# loading it as a module does not connect to the DB or start polling.
if __name__ == '__main__':
    main()
|
0 | 70 |
Also available in: Unified diff
Added tnrs_db to scrub the taxonpaths in VegBIEN using TNRS