Project

General

Profile

1 1631 aaronmk
#!/usr/bin/env python
2
# Downloads REMIB data for all nodes
3
# Usage: self
4
5 1633 aaronmk
import csv
6
import itertools
7 1631 aaronmk
import os.path
8
import StringIO
9
import sys
10
import urllib2
11
12
sys.path.append(os.path.dirname(__file__)+"/../../../lib")
13
14 1637 aaronmk
import exc
15 1631 aaronmk
import profiling
16
import streams
17
import strings
18 1637 aaronmk
import timeout
19 1631 aaronmk
import util
20
21 1638 aaronmk
# Timeout, in seconds, passed to streams.TimeoutInputStream for each response
timeout_ = 20

# The uppercase letters 'A'..'Z'. Built as a real list (not a lazy iterator)
# because it is iterated again for every node when forming two-letter prefixes.
alphabet = [chr(code) for code in range(ord('A'), ord('Z')+1)]
24
25 1637 aaronmk
class InputException(Exception):
    '''Raised when a response line carries an error message instead of data
    (i.e. an ignorable line that starts with two tabs).'''
26
27 1631 aaronmk
def is_ignore(line):
    '''Returns whether a response line should be skipped rather than copied.
    
    A line is ignorable if, once strings.remove_line_ending() has been applied
    to it, it is empty, starts with a tab, or contains no comma.
    '''
    line = strings.remove_line_ending(line)
    return line == '' or line.startswith('\t') or ',' not in line
30
31
def main():
    '''Downloads the CSV records of every REMIB node and copies them to stdout.
    
    Nodes are requested in increasing ID order starting at 1, and each node is
    queried once per two-letter family-name prefix (AA..ZZ). Progress messages
    and input errors are written to stderr. Processing stops after the first
    response that has no metadata row, which is taken to mean that there are no
    more nodes.
    '''
    def clear_line(): sys.stderr.write('\n')
    log_indent = 0
    def log(msg, line_ending='\n'):
        # Reads the enclosing log_indent at call time, so messages follow the
        # current nesting level
        sys.stderr.write(('    '*log_indent)+msg+line_ending)
    
    stdout = streams.LineCountOutputStream(sys.stdout)
    
    # Get by family ('familia') because that is the most general level at which
    # an identification can be made. This assumes all records have a family.
    url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/'
        'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&'
        'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados'
        '&coleccion=id%3D[node_id]')
    done = False
    for node_id in itertools.count(1):
        if done: break
        log('Processing node #'+str(node_id)+'...')
        log_indent += 1
        start_line_num = stdout.line_num
        profiler = profiling.ItersProfiler(start_now=True, iter_text='row')
        
        node_url_template = url_template.replace('[node_id]', str(node_id))
        for prefix_chars in itertools.product(alphabet, repeat=2):
            if done: break
            prefix = ''.join(prefix_chars)
            log('Processing prefix '+prefix+'...')
            row_ct = 0
            def print_status(line_ending='\n'):
                log('Processed '+str(row_ct)+' row(s)', line_ending)
            log_indent += 1
            
            url = node_url_template.replace('[prefix]', prefix)
            stream = streams.StreamIter(streams.TimeoutInputStream(
                urllib2.urlopen(url), timeout_))
            
            # Copy lines
            try:
                util.skip(stream, is_ignore) # skip header
                try:
                    metadata_row = next(csv.reader(stream))
                    # NOTE(review): format check is an assert, so it is
                    # stripped under -O and otherwise aborts the whole run
                    assert metadata_row[0] == 'COLLECTION'
                except StopIteration:
                    done = True # empty response means no more nodes
                
                for line in stream:
                    if is_ignore(line):
                        # A line starting with two tabs is an error message;
                        # any other ignorable line ends the data
                        error = strings.remove_prefix('\t\t', line)
                        if len(error) != len(line): raise InputException(error)
                        break
                    if row_ct % 100 == 0: print_status('\r')
                        # CR at end so next print overwrites msg
                    stdout.write(line)
                    row_ct += 1
            except (InputException, timeout.TimeoutException) as e:
                clear_line()
                log('! Output line '+str(stdout.line_num)+': '+exc.str_(e))
            finally:
                # Close the connection even if an unexpected error propagates
                stream.close()
            
            profiler.add_iters(row_ct)
            print_status()
            log_indent -= 1
        
        profiler.stop()
        log(profiler.msg())
        
        if stdout.line_num == start_line_num: line_range = '<none>'
        else: line_range = str(start_line_num)+'-'+str(stdout.line_num-1)
        log('Used output lines '+line_range)
        
        log_indent -= 1
102
103
# Only start downloading when run as a script, not when imported
if __name__ == '__main__':
    main()