Revision 1644
Added by Aaron Marcuse-Kubitza almost 13 years ago
inputs/REMIB/src/nodes.all.specimens.csv.make | ||
---|---|---|
1 |
#!/usr/bin/env python |
|
2 |
# Downloads REMIB data for all nodes |
|
3 |
# Usage: self |
|
4 |
|
|
5 |
import csv |
|
6 |
import itertools |
|
7 |
import os.path |
|
8 |
import StringIO |
|
9 |
import sys |
|
10 |
import urllib2 |
|
11 |
|
|
12 |
sys.path.append(os.path.dirname(__file__)+"/../../../lib") |
|
13 |
|
|
14 |
import exc |
|
15 |
import profiling |
|
16 |
import streams |
|
17 |
import strings |
|
18 |
import timeout |
|
19 |
import util |
|
20 |
|
|
21 |
# Timeout, in seconds, handed to streams.TimeoutInputStream for each download.
timeout_ = 20 # sec

# Uppercase letters 'A'..'Z'; pairs of these form the taxon prefixes queried.
alphabet = [chr(code) for code in range(ord('A'), ord('Z')+1)]
|
24 |
|
|
25 |
class InputException(Exception):
    """Raised when a downloaded response has an invalid metadata row or
    contains a line flagged with a leading tab-tab prefix."""
|
26 |
|
|
27 |
def is_ignore(line):
    """Return True if `line` should be skipped rather than copied as data.

    After its line ending is removed, a line is ignorable when it is empty,
    starts with a tab, or contains no comma.
    """
    stripped = strings.remove_line_ending(line)
    if stripped == '' or stripped.startswith('\t'):
        return True
    return ',' not in stripped
|
30 |
|
|
31 |
def main():
    """Download REMIB specimen data for every node into per-node CSV files.

    For each node ID (counting up from 2) and each two-letter uppercase
    family-name prefix, fetches the CSV export from the REMIB CGI endpoint
    and appends its data rows to `node.<node_id>.specimens.csv` in the
    directory containing this script. Progress and errors are written to
    stderr.

    The loop over nodes ends once a response runs out before a metadata row
    can be read; that is taken to mean there are no more nodes.

    InputException and timeout.TimeoutException are logged and the run
    continues with the next prefix; any other exception propagates after the
    open stream and output file have been closed.
    """
    def clear_line():
        sys.stderr.write('\n')

    log_indent = 0

    def log(msg, line_ending='\n'):
        sys.stderr.write((' '*log_indent)+msg+line_ending)

    # abspath() first: when the script is invoked by bare filename, dirname()
    # of __file__ is '' and os.chdir('') raises.
    os.chdir(os.path.dirname(os.path.abspath(__file__))) # dir of output files

    # Get by family ('familia') because that is the most general level at which
    # an identification can be made. This assumes all records have a family.
    url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/'
        'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&'
        'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados'
        '&coleccion=id%3D[node_id]')
    done = False
    for node_id in itertools.count(2):
        if done: break
        log('Processing node #'+str(node_id)+'...')
        log_indent += 1
        profiler = profiling.ItersProfiler(start_now=True, iter_text='row')

        filename = 'node.'+str(node_id)+'.specimens.csv'
        node_url_template = url_template.replace('[node_id]', str(node_id))

        # The with-block closes (and flushes) the node's file even if a
        # download fails with an unexpected exception.
        with open(filename, 'w') as out_file:
            out = streams.LineCountOutputStream(out_file)

            for prefix_chars in itertools.product(alphabet, repeat=2):
                if done: break
                prefix = ''.join(prefix_chars)
                log('Processing prefix '+prefix+'...')
                row_ct = 0
                def print_status(line_ending='\n'):
                    log('Processed '+str(row_ct)+' row(s)', line_ending)
                log_indent += 1

                url = node_url_template.replace('[prefix]', prefix)
                stream = streams.StreamIter(streams.TimeoutInputStream(
                    urllib2.urlopen(url), timeout_))

                # Copy lines
                try:
                    util.skip(stream, is_ignore) # skip header
                    try:
                        metadata_row = next(csv.reader(stream))
                        if metadata_row[0] != 'COLLECTION':
                            raise InputException(
                                'Invalid metadata row: '+str(metadata_row))
                    except StopIteration:
                        done = True # empty response means no more nodes

                    for line in stream:
                        if is_ignore(line):
                            # A line whose '\t\t' prefix can be removed is
                            # raised as an error; any other ignorable line
                            # just ends the data section.
                            error = strings.remove_prefix('\t\t', line)
                            if len(error) != len(line):
                                raise InputException(error)
                            break
                        if row_ct % 100 == 0: print_status('\r')
                            # CR at end so next print overwrites msg
                        out.write(line)
                        row_ct += 1
                except (InputException, timeout.TimeoutException) as e:
                    clear_line()
                    log('! Output line '+str(out.line_num)+': '+exc.str_(e))
                finally:
                    # Always release the HTTP connection, even on errors that
                    # are not handled above.
                    stream.close()

                profiler.add_iters(row_ct)
                print_status()
                log_indent -= 1

        profiler.stop()
        log(profiler.msg())
        log_indent -= 1
|
101 |
|
|
102 |
# Run the download only when this file is executed as a script, so that
# merely importing it does not start network and file I/O.
if __name__ == '__main__':
    main()
|
103 | 0 |
inputs/REMIB/src/nodes.make | ||
---|---|---|
1 |
#!/usr/bin/env python |
|
2 |
# Downloads REMIB data for all nodes |
|
3 |
# Usage: self |
|
4 |
|
|
5 |
import csv |
|
6 |
import itertools |
|
7 |
import os.path |
|
8 |
import StringIO |
|
9 |
import sys |
|
10 |
import urllib2 |
|
11 |
|
|
12 |
sys.path.append(os.path.dirname(__file__)+"/../../../lib") |
|
13 |
|
|
14 |
import exc |
|
15 |
import profiling |
|
16 |
import streams |
|
17 |
import strings |
|
18 |
import timeout |
|
19 |
import util |
|
20 |
|
|
21 |
# Timeout, in seconds, handed to streams.TimeoutInputStream for each download.
timeout_ = 20 # sec

# Uppercase letters 'A'..'Z'; pairs of these form the taxon prefixes queried.
alphabet = [chr(code) for code in range(ord('A'), ord('Z')+1)]
|
24 |
|
|
25 |
class InputException(Exception):
    """Raised when a downloaded response has an invalid metadata row or
    contains a line flagged with a leading tab-tab prefix."""
|
26 |
|
|
27 |
def is_ignore(line):
    """Return True if `line` should be skipped rather than copied as data.

    After its line ending is removed, a line is ignorable when it is empty,
    starts with a tab, or contains no comma.
    """
    stripped = strings.remove_line_ending(line)
    if stripped == '' or stripped.startswith('\t'):
        return True
    return ',' not in stripped
|
30 |
|
|
31 |
def main():
    """Download REMIB specimen data for every node into per-node CSV files.

    For each node ID (counting up from 1) and each two-letter uppercase
    family-name prefix, fetches the CSV export from the REMIB CGI endpoint
    and appends its data rows to `node.<node_id>.specimens.csv` in the
    directory containing this script. Progress and errors are written to
    stderr.

    The loop over nodes ends once a response runs out before a metadata row
    can be read; that is taken to mean there are no more nodes.

    InputException and timeout.TimeoutException are logged and the run
    continues with the next prefix; any other exception propagates after the
    open stream and output file have been closed.
    """
    def clear_line():
        sys.stderr.write('\n')

    log_indent = 0

    def log(msg, line_ending='\n'):
        sys.stderr.write((' '*log_indent)+msg+line_ending)

    # abspath() first: when the script is invoked by bare filename, dirname()
    # of __file__ is '' and os.chdir('') raises.
    os.chdir(os.path.dirname(os.path.abspath(__file__))) # dir of output files

    # Get by family ('familia') because that is the most general level at which
    # an identification can be made. This assumes all records have a family.
    url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/'
        'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&'
        'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados'
        '&coleccion=id%3D[node_id]')
    done = False
    for node_id in itertools.count(1):
        if done: break
        log('Processing node #'+str(node_id)+'...')
        log_indent += 1
        profiler = profiling.ItersProfiler(start_now=True, iter_text='row')

        filename = 'node.'+str(node_id)+'.specimens.csv'
        node_url_template = url_template.replace('[node_id]', str(node_id))

        # The with-block closes (and flushes) the node's file even if a
        # download fails with an unexpected exception.
        with open(filename, 'w') as out_file:
            out = streams.LineCountOutputStream(out_file)

            for prefix_chars in itertools.product(alphabet, repeat=2):
                if done: break
                prefix = ''.join(prefix_chars)
                log('Processing prefix '+prefix+'...')
                row_ct = 0
                def print_status(line_ending='\n'):
                    log('Processed '+str(row_ct)+' row(s)', line_ending)
                log_indent += 1

                url = node_url_template.replace('[prefix]', prefix)
                stream = streams.StreamIter(streams.TimeoutInputStream(
                    urllib2.urlopen(url), timeout_))

                # Copy lines
                try:
                    util.skip(stream, is_ignore) # skip header
                    try:
                        metadata_row = next(csv.reader(stream))
                        if metadata_row[0] != 'COLLECTION':
                            raise InputException(
                                'Invalid metadata row: '+str(metadata_row))
                    except StopIteration:
                        done = True # empty response means no more nodes

                    for line in stream:
                        if is_ignore(line):
                            # A line whose '\t\t' prefix can be removed is
                            # raised as an error; any other ignorable line
                            # just ends the data section.
                            error = strings.remove_prefix('\t\t', line)
                            if len(error) != len(line):
                                raise InputException(error)
                            break
                        if row_ct % 100 == 0: print_status('\r')
                            # CR at end so next print overwrites msg
                        out.write(line)
                        row_ct += 1
                except (InputException, timeout.TimeoutException) as e:
                    clear_line()
                    log('! Output line '+str(out.line_num)+': '+exc.str_(e))
                finally:
                    # Always release the HTTP connection, even on errors that
                    # are not handled above.
                    stream.close()

                profiler.add_iters(row_ct)
                print_status()
                log_indent -= 1

        profiler.stop()
        log(profiler.msg())
        log_indent -= 1
|
101 |
|
|
102 |
# Run the download only when this file is executed as a script, so that
# merely importing it does not start network and file I/O.
if __name__ == '__main__':
    main()
|
0 | 103 |
Also available in: Unified diff
Renamed inputs/REMIB/src/nodes.all.specimens.csv.make to inputs/REMIB/src/nodes.make, since it will not be used to generate nodes.all.specimens.csv. It can still be run via the `src/%.make` make target, but doing so will generate a dummy, empty output file named "nodes".