Revision 1643
Added by Aaron Marcuse-Kubitza over 12 years ago
inputs/REMIB/src/nodes.all.specimens.csv.make | ||
---|---|---|
34 | 34 |
def log(msg, line_ending='\n'): |
35 | 35 |
sys.stderr.write((' '*log_indent)+msg+line_ending) |
36 | 36 |
|
37 |
stdout = streams.LineCountOutputStream(sys.stdout)
|
|
37 |
os.chdir(os.path.dirname(__file__)) # dir of output files
|
|
38 | 38 |
|
39 | 39 |
# Get by family ('familia') because that is the most general level at which |
40 | 40 |
# an identification can be made. This assumes all records have a family. |
... | ... | |
43 | 43 |
'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados' |
44 | 44 |
'&coleccion=id%3D[node_id]') |
45 | 45 |
done = False |
46 |
for node_id in itertools.count(1):
|
|
46 |
for node_id in itertools.count(2):
|
|
47 | 47 |
if done: break |
48 | 48 |
log('Processing node #'+str(node_id)+'...') |
49 | 49 |
log_indent += 1 |
50 |
start_line_num = stdout.line_num |
|
51 | 50 |
profiler = profiling.ItersProfiler(start_now=True, iter_text='row') |
52 | 51 |
|
52 |
filename = 'node.'+str(node_id)+'.specimens.csv' |
|
53 |
out = streams.LineCountOutputStream(open(filename, 'w')) |
|
54 |
start_line_num = out.line_num |
|
53 | 55 |
node_url_template = url_template.replace('[node_id]', str(node_id)) |
56 |
|
|
54 | 57 |
for prefix_chars in itertools.product(alphabet, repeat=2): |
55 | 58 |
if done: break |
56 | 59 |
prefix = ''.join(prefix_chars) |
... | ... | |
69 | 72 |
util.skip(stream, is_ignore) # skip header |
70 | 73 |
try: |
71 | 74 |
metadata_row = csv.reader(stream).next() |
72 |
if metadata_row[1] != 'COLLECTION': raise InputException(
|
|
75 |
if metadata_row[0] != 'COLLECTION': raise InputException(
|
|
73 | 76 |
'Invalid metadata row: '+str(metadata_row)) |
74 | 77 |
except StopIteration: |
75 | 78 |
done = True # empty response means no more nodes |
... | ... | |
81 | 84 |
break |
82 | 85 |
if row_ct % 100 == 0: print_status('\r') |
83 | 86 |
# CR at end so next print overwrites msg |
84 |
stdout.write(line)
|
|
87 |
out.write(line) |
|
85 | 88 |
row_ct += 1 |
86 | 89 |
except (InputException, timeout.TimeoutException), e: |
87 | 90 |
clear_line() |
88 |
log('! Output line '+str(stdout.line_num)+': '+exc.str_(e))
|
|
91 |
log('! Output line '+str(out.line_num)+': '+exc.str_(e)) |
|
89 | 92 |
stream.close() |
90 | 93 |
|
91 | 94 |
profiler.add_iters(row_ct) |
... | ... | |
94 | 97 |
|
95 | 98 |
profiler.stop() |
96 | 99 |
log(profiler.msg()) |
97 |
|
|
98 |
if stdout.line_num == start_line_num: line_range = '<none>' |
|
99 |
else: line_range = str(start_line_num)+'-'+str(stdout.line_num-1) |
|
100 |
log('Used output lines '+line_range) |
|
101 |
|
|
102 | 100 |
log_indent -= 1 |
103 | 101 |
|
104 | 102 |
main() |
Also available in: Unified diff
inputs/REMIB/src/nodes.all.specimens.csv.make: Write each node to a separate output file