Revision 9518
Added by Aaron Marcuse-Kubitza over 11 years ago
tnrs_db | ||
---|---|---|
1 | 1 |
#!/usr/bin/env python |
2 | 2 |
# Scrubs the taxonlabels in VegBIEN using TNRS. |
3 |
# Runs continuously until no new rows are added after max_pause. |
|
4 | 3 |
|
5 | 4 |
import os.path |
6 | 5 |
import StringIO |
... | ... | |
20 | 19 |
import strings |
21 | 20 |
import tnrs |
22 | 21 |
|
23 |
# Config |
|
24 |
pause = 2*60*60 # sec; = 2 hr |
|
25 |
max_pause = 9*60*60 # sec; = 9 hr; must be >= max partition import time (1.5 hr) |
|
26 |
assert pause <= max_pause |
|
27 |
|
|
28 | 22 |
tnrs_input = sql_gen.Table('tnrs_input_name') |
29 | 23 |
tnrs_data = sql_gen.Table('tnrs') |
30 | 24 |
|
... | ... | |
33 | 27 |
env_names = [] |
34 | 28 |
db_config = opts.get_env_vars(sql.db_config_names, None, env_names) |
35 | 29 |
verbosity = float(opts.get_env_var('verbosity', 3, env_names)) |
36 |
wait = opts.env_flag('wait', False, env_names) |
|
37 | 30 |
if not 'engine' in db_config: raise SystemExit('Usage: ' |
38 | 31 |
+opts.env_usage(env_names)+' '+sys.argv[0]+' 2>>log') |
39 | 32 |
|
... | ... | |
48 | 41 |
tnrs_profiler = profiling.ItersProfiler(iter_text='name') |
49 | 42 |
|
50 | 43 |
# Iterate over unscrubbed verbatim taxonlabels |
51 |
total_pause = 0 |
|
52 | 44 |
while True: |
53 | 45 |
# Fetch next set |
54 | 46 |
cur = sql.select(db, tnrs_input, limit=tnrs.max_names, cacheable=False) |
55 | 47 |
this_ct = cur.rowcount |
56 | 48 |
log('Processing '+str(this_ct)+' taxonlabels') |
57 |
if this_ct == 0: |
|
58 |
if not wait: break |
|
59 |
log('Waited '+str(total_pause)+' sec total') |
|
60 |
total_pause += pause |
|
61 |
if total_pause > max_pause: break |
|
62 |
log('Waiting '+str(pause)+' sec...') |
|
63 |
time.sleep(pause) # wait for more rows |
|
64 |
continue # try again |
|
49 |
if this_ct == 0: break |
|
65 | 50 |
# otherwise, rows found |
66 |
total_pause = 0 |
|
67 | 51 |
names = list(sql.values(cur)) |
68 | 52 |
|
69 | 53 |
def process(): |
Also available in: Unified diff
bin/tnrs_db: removed the no-longer-used $wait flag (which caused tnrs_db to wait up to max_pause for new rows to be added), because tnrs_db is now invoked automatically after each import by the import_scrub target (in inputs/input.Makefile) and does not need to run as a daemon. Note that when scrub is invoked, it is possible that a previous datasource's import has already scrubbed the names for this import, because tnrs_db runs until all rows in tnrs_input_name are scrubbed.
This also removes clutter in tnrs_db, making it clearer which operations it performs beyond those of the library function tnrs.repeated_tnrs_request() (namely, interfacing with the DB and profiling the TNRS request).