Project

General

Profile

1 1631 aaronmk
#!/usr/bin/env python
2
# Downloads REMIB data for all nodes
3 4415 aaronmk
# Usage: env [start=...] [n=...] self
4 1631 aaronmk
5 1633 aaronmk
import csv
6
import itertools
7 1631 aaronmk
import os.path
8
import StringIO
9
import sys
10
import urllib2
11
12 4416 aaronmk
# Make the project's lib/ dir importable. The path is anchored on this file's
# absolute location so that it stays valid when the script is started by bare
# name, in which case os.path.dirname(__file__) is '' and a plain string
# concatenation would yield the unrelated absolute path "/../../lib".
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../../lib")
13 1631 aaronmk
14 1637 aaronmk
import exc
15 1646 aaronmk
import opts
16 1631 aaronmk
import profiling
17
import streams
18
import strings
19 1637 aaronmk
import timeout
20 1631 aaronmk
import util
21
22 1669 aaronmk
# Config
# NOTE: `timeout` below rebinds the name of the `timeout` module imported
# above; code that needs the module itself must import it under another name.
timeout = 20 # sec
max_consec_empty_responses = 10

# The uppercase letters 'A'..'Z', used to build taxon name prefixes
alphabet = list(chr(code) for code in xrange(ord('A'), ord('Z')+1))
27
28 1637 aaronmk
class InputException(Exception):
    """Signals a problem with the data received from the server."""
29
30 1651 aaronmk
class EmptyResponseException(InputException):
    """Signals that a response contained no metadata row at all.

    Because this derives from InputException, handlers that treat the two
    differently must list this class first.
    """
31
32 1631 aaronmk
def is_ignore(line):
    """Tell whether `line` should be skipped rather than treated as a CSV row.

    A line is skipped when, once its line ending has been removed, it is
    empty, starts with a tab, or contains no comma.
    """
    stripped = strings.remove_line_ending(line)
    if stripped == '': return True
    if stripped.startswith('\t'): return True
    return ',' not in stripped
35
36
def main():
    """Download the CSV data of each REMIB node into one file per node.

    Config is read from env vars: `start` is the first node ID (default 1) and
    `n` is the number of nodes to process (when it yields no value, node IDs
    are tried without an upper bound). For every node, one request is made per
    two-letter uppercase taxon prefix (AA..ZZ) and the data lines of each
    response are written to `node.<node_id>.csv` in this script's directory.
    Progress and errors are appended to `<sys.argv[0]>.log`.

    The run stops once more than `max_consec_empty_responses` consecutive
    empty responses have been seen.
    """
    # The module-level config value `timeout` (sec) rebinds the name of the
    # `timeout` module imported at the top of the file, so the module is
    # imported again here under an alias to reach its TimeoutException.
    import timeout as timeout_module

    # Get config from env vars
    start = util.cast(int, opts.get_env_var('start', 1))
    end = util.cast(int, util.none_if(opts.get_env_var('n', None), u''))
    if end is not None: end += start # n is a count, so convert to an end ID

    log_ = open(sys.argv[0]+'.log', 'a')
    def clear_line(): log_.write('\n')
    log_indent = 0
    def log(msg, line_ending='\n'):
        log_.write(('    '*log_indent)+msg+line_ending)

    # dir of output files. abspath() is needed because dirname() of a bare
    # script name is '', which os.chdir() rejects.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    # Get by family ('familia') because that is the most general level at which
    # an identification can be made. This assumes all records have a family.
    url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/'
        'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&'
        'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados'
        '&coleccion=id%3D[node_id]')

    if end is not None: node_ids = xrange(start, end)
    else: node_ids = itertools.count(start)
    consec_empty_responses = 0
    for node_id in node_ids:
        if consec_empty_responses > max_consec_empty_responses: break

        log('Processing node #'+str(node_id)+'...')
        log_indent += 1
        profiler = profiling.ItersProfiler(start_now=True, iter_text='row')

        filename = 'node.'+str(node_id)+'.csv'
        out_file = open(filename, 'w')
        out = streams.LineCountStream(out_file)
        def log_ex(e):
            clear_line()
            log('! Output line '+str(out.line_num)+': '+exc.str_(e))
        node_url_template = url_template.replace('[node_id]', str(node_id))

        try:
            for prefix_chars in itertools.product(alphabet, repeat=2):
                prefix = ''.join(prefix_chars)
                log('Processing prefix '+prefix+'...')
                row_ct = 0
                def print_status(line_ending='\n'):
                    log('Processed '+str(row_ct)+' row(s)', line_ending)
                log_indent += 1

                url = node_url_template.replace('[prefix]', prefix)
                # `timeout` here is the config value in seconds
                stream = streams.StreamIter(streams.TimeoutInputStream(
                    urllib2.urlopen(url), timeout))

                is_empty_response = False
                try:
                    util.skip(stream, is_ignore) # skip header
                    try: metadata_row = next(csv.reader(stream))
                    except StopIteration: raise EmptyResponseException()
                    if metadata_row[0] != 'COLLECTION': raise InputException(
                        'Invalid metadata row: '+str(metadata_row))

                    # Copy lines
                    for line in stream:
                        if is_ignore(line):
                            # A line starting with two tabs carries an error
                            # message; any other ignorable line ends the data
                            error = strings.remove_prefix('\t\t', line)
                            if len(error) != len(line):
                                raise InputException(error)
                            break
                        out.write(line)

                        row_ct += 1
                        if row_ct % 100 == 0: print_status('\r')
                            # CR at end so next print overwrites msg
                except EmptyResponseException as e:
                    # must come before InputException
                    is_empty_response = True
                    log_ex(e)
                    break # assume node doesn't exist, so abort node
                except InputException as e: log_ex(e)
                except timeout_module.TimeoutException as e:
                    log_ex(e)
                    break # assume node is down, so abort node
                finally: # still run if break is called
                    stream.close()

                    profiler.add_iters(row_ct)
                    print_status()
                    log_indent -= 1

                    if is_empty_response: consec_empty_responses += 1
                    else: consec_empty_responses = 0 # reset count
        finally:
            # Release the node's output file even if a request fails.
            # NOTE(review): closes the underlying file directly; confirm that
            # streams.LineCountStream does no buffering of its own.
            out_file.close()

        profiler.stop()
        log(profiler.msg())
        log_indent -= 1
127
128
# Runs unconditionally at module load; there is no __name__ guard
main()