Revision 4883
Added by Aaron Marcuse-Kubitza about 12 years ago
inputs/REMIB/nodes.make | ||
---|---|---|
1 |
#!/usr/bin/env python |
|
2 |
# Downloads REMIB data for all nodes |
|
3 |
# Usage: env [start=...] [n=...] self |
|
4 |
|
|
5 |
import csv |
|
6 |
import itertools |
|
7 |
import os.path |
|
8 |
import StringIO |
|
9 |
import sys |
|
10 |
import urllib2 |
|
11 |
|
|
12 |
sys.path.append(os.path.dirname(__file__)+"/../../lib") |
|
13 |
|
|
14 |
import exc |
|
15 |
import opts |
|
16 |
import profiling |
|
17 |
import streams |
|
18 |
import strings |
|
19 |
import timeout |
|
20 |
import util |
|
21 |
|
|
22 |
# Config
timeout = 20 # sec
    # FIXME: this assignment rebinds the name `timeout`, shadowing the
    # `timeout` module imported above; `timeout.TimeoutException` in main()
    # will therefore raise AttributeError ('int' object has no attribute
    # 'TimeoutException') instead of catching request timeouts
max_consec_empty_responses = 10
    # an unbounded scan (no `n` env var) stops after this many consecutive
    # nodes whose responses come back empty

# 'A'..'Z'; two-letter taxon-name prefixes are built from this list
# (Python 2 map returns a list)
alphabet = map(chr, xrange(ord('A'), ord('Z')+1))
|
27 |
|
|
28 |
class InputException(Exception):
    """Base class for errors found in a REMIB server response."""
    pass
|
29 |
|
|
30 |
class EmptyResponseException(InputException):
    """Raised when the server returns no CSV data rows at all."""
    pass
|
31 |
|
|
32 |
def is_ignore(line):
    """Return True for response lines that are not CSV data rows."""
    text = strings.remove_line_ending(line)
    if text == '':
        return True # blank separator line
    if text.startswith('\t'):
        return True # tab-indented server message/metadata
    return text.find(',') < 0 # no comma => not a CSV row
|
35 |
|
|
36 |
def main(): |
|
37 |
# Get config from env vars |
|
38 |
start = util.cast(int, opts.get_env_var('start', 1)) |
|
39 |
end = util.cast(int, util.none_if(opts.get_env_var('n', None), u'')) |
|
40 |
if end != None: end += start |
|
41 |
|
|
42 |
log_ = open(sys.argv[0]+'.log', 'a') |
|
43 |
def clear_line(): log_.write('\n') |
|
44 |
log_indent = 0 |
|
45 |
def log(msg, line_ending='\n'): |
|
46 |
log_.write((' '*log_indent)+msg+line_ending) |
|
47 |
|
|
48 |
os.chdir(os.path.dirname(__file__)) # dir of output files |
|
49 |
|
|
50 |
# Get by family ('familia') because that is the most general level at which |
|
51 |
# an identification can be made. This assumes all records have a family. |
|
52 |
url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/' |
|
53 |
'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&' |
|
54 |
'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados' |
|
55 |
'&coleccion=id%3D[node_id]') |
|
56 |
|
|
57 |
if end != None: node_ids = xrange(start, end) |
|
58 |
else: node_ids = itertools.count(start) |
|
59 |
consec_empty_responses = 0 |
|
60 |
for node_id in node_ids: |
|
61 |
if consec_empty_responses > max_consec_empty_responses: break |
|
62 |
|
|
63 |
log('Processing node #'+str(node_id)+'...') |
|
64 |
log_indent += 1 |
|
65 |
profiler = profiling.ItersProfiler(start_now=True, iter_text='row') |
|
66 |
|
|
67 |
filename = 'Specimen/node.'+str(node_id)+'.csv' |
|
68 |
out = streams.LineCountStream(open(filename, 'w')) |
|
69 |
def log_ex(e): |
|
70 |
clear_line() |
|
71 |
log('! Output line '+str(out.line_num)+': '+exc.str_(e)) |
|
72 |
start_line_num = out.line_num |
|
73 |
node_url_template = url_template.replace('[node_id]', str(node_id)) |
|
74 |
|
|
75 |
for prefix_chars in itertools.product(alphabet, repeat=2): |
|
76 |
prefix = ''.join(prefix_chars) |
|
77 |
log('Processing prefix '+prefix+'...') |
|
78 |
row_ct = 0 |
|
79 |
def print_status(line_ending='\n'): |
|
80 |
log('Processed '+str(row_ct)+' row(s)', line_ending) |
|
81 |
log_indent += 1 |
|
82 |
|
|
83 |
url = node_url_template.replace('[prefix]', prefix) |
|
84 |
stream = streams.StreamIter(streams.TimeoutInputStream( |
|
85 |
urllib2.urlopen(url), timeout)) |
|
86 |
|
|
87 |
is_empty_response = False |
|
88 |
try: |
|
89 |
util.skip(stream, is_ignore) # skip header |
|
90 |
try: metadata_row = csv.reader(stream).next() |
|
91 |
except StopIteration: raise EmptyResponseException() |
|
92 |
if metadata_row[0] != 'COLLECTION': raise InputException( |
|
93 |
'Invalid metadata row: '+str(metadata_row)) |
|
94 |
|
|
95 |
# Copy lines |
|
96 |
for line in stream: |
|
97 |
if is_ignore(line): |
|
98 |
error = strings.remove_prefix('\t\t', line) |
|
99 |
if len(error) != len(line): raise InputException(error) |
|
100 |
break |
|
101 |
out.write(line) |
|
102 |
|
|
103 |
row_ct += 1 |
|
104 |
if row_ct % 100 == 0: print_status('\r') |
|
105 |
# CR at end so next print overwrites msg |
|
106 |
except EmptyResponseException, e: # must come before InputException |
|
107 |
is_empty_response = True |
|
108 |
log_ex(e) |
|
109 |
break # assume node doesn't exist, so abort node |
|
110 |
except InputException, e: log_ex(e) |
|
111 |
except timeout.TimeoutException, e: |
|
112 |
log_ex(e) |
|
113 |
break # assume node is down, so abort node |
|
114 |
finally: # still run if break is called |
|
115 |
stream.close() |
|
116 |
|
|
117 |
profiler.add_iters(row_ct) |
|
118 |
print_status() |
|
119 |
log_indent -= 1 |
|
120 |
|
|
121 |
if is_empty_response: consec_empty_responses += 1 |
|
122 |
else: consec_empty_responses = 0 # reset count |
|
123 |
|
|
124 |
profiler.stop() |
|
125 |
log(profiler.msg()) |
|
126 |
log_indent -= 1 |
|
127 |
|
|
128 |
# Guard the entry point so importing this script (e.g. for testing) does
# not immediately start a download run
if __name__ == '__main__':
    main()
|
129 | 0 |
inputs/REMIB/Specimen.src/nodes.make | ||
---|---|---|
1 |
#!/usr/bin/env python |
|
2 |
# Downloads REMIB data for all nodes |
|
3 |
# Usage: env [start=...] [n=...] self |
|
4 |
|
|
5 |
import csv |
|
6 |
import itertools |
|
7 |
import os.path |
|
8 |
import StringIO |
|
9 |
import sys |
|
10 |
import urllib2 |
|
11 |
|
|
12 |
sys.path.append(os.path.dirname(__file__)+"/../../lib") |
|
13 |
|
|
14 |
import exc |
|
15 |
import opts |
|
16 |
import profiling |
|
17 |
import streams |
|
18 |
import strings |
|
19 |
import timeout |
|
20 |
import util |
|
21 |
|
|
22 |
# Config
timeout = 20 # sec
    # FIXME: this assignment rebinds the name `timeout`, shadowing the
    # `timeout` module imported above; `timeout.TimeoutException` in main()
    # will therefore raise AttributeError ('int' object has no attribute
    # 'TimeoutException') instead of catching request timeouts
max_consec_empty_responses = 10
    # an unbounded scan (no `n` env var) stops after this many consecutive
    # nodes whose responses come back empty

# 'A'..'Z'; two-letter taxon-name prefixes are built from this list
# (Python 2 map returns a list)
alphabet = map(chr, xrange(ord('A'), ord('Z')+1))
|
27 |
|
|
28 |
class InputException(Exception):
    """Signals invalid or error content in a REMIB server response."""
|
29 |
|
|
30 |
class EmptyResponseException(InputException):
    """Signals a response that contained no CSV data rows."""
|
31 |
|
|
32 |
def is_ignore(line):
    """Decide whether a response line should be skipped rather than
    treated as a CSV data row."""
    content = strings.remove_line_ending(line)
    # Blank lines, tab-indented server messages, and comma-less lines are
    # all non-data.
    return content == '' or content.startswith('\t') or ',' not in content
|
35 |
|
|
36 |
def main(): |
|
37 |
# Get config from env vars |
|
38 |
start = util.cast(int, opts.get_env_var('start', 1)) |
|
39 |
end = util.cast(int, util.none_if(opts.get_env_var('n', None), u'')) |
|
40 |
if end != None: end += start |
|
41 |
|
|
42 |
log_ = open(sys.argv[0]+'.log', 'a') |
|
43 |
def clear_line(): log_.write('\n') |
|
44 |
log_indent = 0 |
|
45 |
def log(msg, line_ending='\n'): |
|
46 |
log_.write((' '*log_indent)+msg+line_ending) |
|
47 |
|
|
48 |
os.chdir(os.path.dirname(__file__)) # dir of output files |
|
49 |
|
|
50 |
# Get by family ('familia') because that is the most general level at which |
|
51 |
# an identification can be made. This assumes all records have a family. |
|
52 |
url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/' |
|
53 |
'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&' |
|
54 |
'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados' |
|
55 |
'&coleccion=id%3D[node_id]') |
|
56 |
|
|
57 |
if end != None: node_ids = xrange(start, end) |
|
58 |
else: node_ids = itertools.count(start) |
|
59 |
consec_empty_responses = 0 |
|
60 |
for node_id in node_ids: |
|
61 |
if consec_empty_responses > max_consec_empty_responses: break |
|
62 |
|
|
63 |
log('Processing node #'+str(node_id)+'...') |
|
64 |
log_indent += 1 |
|
65 |
profiler = profiling.ItersProfiler(start_now=True, iter_text='row') |
|
66 |
|
|
67 |
filename = 'node.'+str(node_id)+'.csv' |
|
68 |
out = streams.LineCountStream(open(filename, 'w')) |
|
69 |
def log_ex(e): |
|
70 |
clear_line() |
|
71 |
log('! Output line '+str(out.line_num)+': '+exc.str_(e)) |
|
72 |
start_line_num = out.line_num |
|
73 |
node_url_template = url_template.replace('[node_id]', str(node_id)) |
|
74 |
|
|
75 |
for prefix_chars in itertools.product(alphabet, repeat=2): |
|
76 |
prefix = ''.join(prefix_chars) |
|
77 |
log('Processing prefix '+prefix+'...') |
|
78 |
row_ct = 0 |
|
79 |
def print_status(line_ending='\n'): |
|
80 |
log('Processed '+str(row_ct)+' row(s)', line_ending) |
|
81 |
log_indent += 1 |
|
82 |
|
|
83 |
url = node_url_template.replace('[prefix]', prefix) |
|
84 |
stream = streams.StreamIter(streams.TimeoutInputStream( |
|
85 |
urllib2.urlopen(url), timeout)) |
|
86 |
|
|
87 |
is_empty_response = False |
|
88 |
try: |
|
89 |
util.skip(stream, is_ignore) # skip header |
|
90 |
try: metadata_row = csv.reader(stream).next() |
|
91 |
except StopIteration: raise EmptyResponseException() |
|
92 |
if metadata_row[0] != 'COLLECTION': raise InputException( |
|
93 |
'Invalid metadata row: '+str(metadata_row)) |
|
94 |
|
|
95 |
# Copy lines |
|
96 |
for line in stream: |
|
97 |
if is_ignore(line): |
|
98 |
error = strings.remove_prefix('\t\t', line) |
|
99 |
if len(error) != len(line): raise InputException(error) |
|
100 |
break |
|
101 |
out.write(line) |
|
102 |
|
|
103 |
row_ct += 1 |
|
104 |
if row_ct % 100 == 0: print_status('\r') |
|
105 |
# CR at end so next print overwrites msg |
|
106 |
except EmptyResponseException, e: # must come before InputException |
|
107 |
is_empty_response = True |
|
108 |
log_ex(e) |
|
109 |
break # assume node doesn't exist, so abort node |
|
110 |
except InputException, e: log_ex(e) |
|
111 |
except timeout.TimeoutException, e: |
|
112 |
log_ex(e) |
|
113 |
break # assume node is down, so abort node |
|
114 |
finally: # still run if break is called |
|
115 |
stream.close() |
|
116 |
|
|
117 |
profiler.add_iters(row_ct) |
|
118 |
print_status() |
|
119 |
log_indent -= 1 |
|
120 |
|
|
121 |
if is_empty_response: consec_empty_responses += 1 |
|
122 |
else: consec_empty_responses = 0 # reset count |
|
123 |
|
|
124 |
profiler.stop() |
|
125 |
log(profiler.msg()) |
|
126 |
log_indent -= 1 |
|
127 |
|
|
128 |
# Guard the entry point so importing this script (e.g. for testing) does
# not immediately start a download run
if __name__ == '__main__':
    main()
|
0 | 129 |
Also available in: Unified diff
inputs/REMIB/: Moved nodes.make into Specimen.src/ so it's with the data it generates