Project

General

Profile

1 1631 aaronmk
#!/usr/bin/env python
2
# Downloads REMIB data for all nodes
3 1646 aaronmk
# Usage: env [start=...] [n=...] self 2>>log
4 1631 aaronmk
5 1633 aaronmk
import csv
6
import itertools
7 1631 aaronmk
import os.path
8
import StringIO
9
import sys
10
import urllib2
11
12
# Make the project's shared lib dir importable. The path is anchored to this
# script's absolute location so that it still resolves when the script is run
# by bare name (empty dirname) and after the working directory is changed.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),
    "..", "..", "..", "lib"))
13
14 1637 aaronmk
import exc
15 1646 aaronmk
import opts
16 1631 aaronmk
import profiling
17
import streams
18
import strings
19 1637 aaronmk
import timeout
20 1631 aaronmk
import util
21
22 1669 aaronmk
# Config
# NOTE: this assignment rebinds the module-level name `timeout`, which up to
# this point referred to the imported `timeout` module.
timeout = 20 # sec; passed to streams.TimeoutInputStream for each request
max_consec_empty_responses = 10 # node loop stops once this count is exceeded

# The uppercase letters 'A'..'Z', used to build two-letter taxon prefixes
alphabet = list(map(chr, range(ord('A'), ord('Z') + 1)))
27
28 1637 aaronmk
class InputException(Exception):
    """Raised when a downloaded response is invalid or contains an error."""
29
30 1651 aaronmk
class EmptyResponseException(InputException):
    """Raised when a response ends before any metadata row is found.

    Because this subclasses InputException, handlers for it must be listed
    before handlers for InputException.
    """
31
32 1631 aaronmk
def is_ignore(line):
    """Return whether `line` should be skipped rather than treated as a row.

    A line is ignored when, after its line ending is removed, it is empty,
    starts with a tab, or contains no comma.
    """
    content = strings.remove_line_ending(line)
    if content == '': return True
    if content.startswith('\t'): return True
    return ',' not in content
35
36
def main():
    """Download the specimen CSV rows of each REMIB node into per-node files.

    Configuration is read from environment variables:
      start -- ID of the first node to fetch (default 1)
      n     -- number of nodes to fetch; when it is not given, node IDs are
               tried in increasing order until more than
               max_consec_empty_responses consecutive responses are empty

    For each node, every two-letter family-name prefix (AA..ZZ) is requested
    and the returned data lines are appended to node.<id>.specimens.csv in
    this script's directory. Progress and errors are written to stderr.
    """
    # The module-level config value `timeout` (seconds) rebinds the name of the
    # imported `timeout` module, so the module is re-imported here under a
    # distinct local name. Without this, evaluating
    # `timeout.TimeoutException` would raise AttributeError on an int.
    import timeout as timeout_lib

    # Get config from env vars
    start = util.cast(int, opts.get_env_var('start', 1))
    end = util.cast(int, util.none_if(opts.get_env_var('n', None), u''))
    if end is not None: end += start # n is a count, so turn it into an end ID

    def clear_line(): sys.stderr.write('\n')
    log_indent = 0
    def log(msg, line_ending='\n'):
        sys.stderr.write(('    '*log_indent)+msg+line_ending)

    # dir of output files; abspath() keeps this working when the script is run
    # by bare name, where dirname(__file__) would be '' and chdir() would fail
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    # Get by family ('familia') because that is the most general level at which
    # an identification can be made. This assumes all records have a family.
    url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/'
        'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&'
        'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados'
        '&coleccion=id%3D[node_id]')

    if end is not None: node_ids = xrange(start, end)
    else: node_ids = itertools.count(start)
    consec_empty_responses = 0
    for node_id in node_ids:
        if consec_empty_responses > max_consec_empty_responses: break

        log('Processing node #'+str(node_id)+'...')
        log_indent += 1
        profiler = profiling.ItersProfiler(start_now=True, iter_text='row')

        filename = 'node.'+str(node_id)+'.specimens.csv'
        # Keep a handle on the raw file so it can be closed explicitly once
        # the node is done.
        # NOTE(review): assumes LineCountStream writes straight through to the
        # wrapped file without buffering of its own -- confirm.
        out_file = open(filename, 'w')
        out = streams.LineCountStream(out_file)
        def log_ex(e):
            clear_line()
            log('! Output line '+str(out.line_num)+': '+exc.str_(e))
        node_url_template = url_template.replace('[node_id]', str(node_id))

        try:
            for prefix_chars in itertools.product(alphabet, repeat=2):
                prefix = ''.join(prefix_chars)
                log('Processing prefix '+prefix+'...')
                row_ct = 0
                def print_status(line_ending='\n'):
                    log('Processed '+str(row_ct)+' row(s)', line_ending)
                log_indent += 1

                url = node_url_template.replace('[prefix]', prefix)
                stream = streams.StreamIter(streams.TimeoutInputStream(
                    urllib2.urlopen(url), timeout))

                is_empty_response = False
                try:
                    util.skip(stream, is_ignore) # skip header
                    try: metadata_row = next(csv.reader(stream))
                    except StopIteration: raise EmptyResponseException()
                    if metadata_row[0] != 'COLLECTION': raise InputException(
                        'Invalid metadata row: '+str(metadata_row))

                    # Copy lines
                    for line in stream:
                        if is_ignore(line):
                            # A line that starts with two tabs carries an
                            # error message; any other ignored line just ends
                            # the data section.
                            error = strings.remove_prefix('\t\t', line)
                            if len(error) != len(line):
                                raise InputException(error)
                            break
                        out.write(line)

                        row_ct += 1
                        if row_ct % 100 == 0: print_status('\r')
                            # CR at end so next print overwrites msg
                except EmptyResponseException as e:
                    # must come before InputException
                    is_empty_response = True
                    log_ex(e)
                    break # assume node doesn't exist, so abort node
                except InputException as e: log_ex(e)
                except timeout_lib.TimeoutException as e:
                    log_ex(e)
                    break # assume node is down, so abort node
                finally: # still run if break is called
                    stream.close()

                    profiler.add_iters(row_ct)
                    print_status()
                    log_indent -= 1

                    if is_empty_response: consec_empty_responses += 1
                    else: consec_empty_responses = 0 # reset count
        finally:
            # Release the node's output file even if a prefix aborted the node
            # or an unexpected error propagates.
            out_file.close()

        profiler.stop()
        log(profiler.msg())
        log_indent -= 1
126
127
# Run only when executed as a script, so that merely importing this file does
# not start a download.
if __name__ == '__main__': main()