Project

General

Profile

1 1631 aaronmk
#!/usr/bin/env python
2
# Downloads REMIB data for all nodes
3 1646 aaronmk
# Usage: env [start=...] [n=...] self 2>>log
4 1631 aaronmk
5 1633 aaronmk
import csv
6
import itertools
7 1631 aaronmk
import os.path
8
import StringIO
9
import sys
10
import urllib2
11
12
# Make the lib dir three levels above this script importable. The script's dir
# is made absolute first because dirname(__file__) is '' when the script is run
# by bare name from its own dir, which would turn the path into /../../../lib.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)),
    "..", "..", "..", "lib"))
13
14 1637 aaronmk
import exc
15 1646 aaronmk
import opts
16 1631 aaronmk
import profiling
17
import streams
18
import strings
19 1637 aaronmk
import timeout
20 1631 aaronmk
import util
21
22 1638 aaronmk
timeout_ = 20 # sec; passed to streams.TimeoutInputStream for each download

# Uppercase letters A-Z. Built as a real list (not the result of map()) because
# main() iterates it again for every node, and a list is always re-iterable.
alphabet = [chr(code) for code in range(ord('A'), ord('Z')+1)]
25
26 1637 aaronmk
class InputException(Exception):
    """Signals that downloaded data was not in the expected form."""


class EmptyResponseException(InputException):
    """Signals that a response ended before its metadata row was found."""
29
30 1631 aaronmk
def is_ignore(line):
    """Returns whether a response line should be ignored.

    After its line ending is removed, a line is ignorable if it is empty,
    starts with a tab, or contains no comma.
    """
    content = strings.remove_line_ending(line)
    if content == '': return True
    if content.startswith('\t'): return True
    return ',' not in content
33
34
def main():
    """Downloads the specimen rows of each REMIB node into CSV files.

    Config is read from env vars:
    start: ID of the first node to download (default 1)
    n: number of nodes to download. If it is not given, node IDs up to (but
        not including) 150 are tried.

    For each node, one request is made per two-letter family name prefix
    (AA..ZZ) and the returned rows are appended to
    node.<node_id>.specimens.csv in this script's dir. Progress and errors are
    logged to stderr.
    """
    # Get config from env vars
    start = util.cast(int, opts.get_env_var('start', 1))
    end = util.cast(int, util.none_if(opts.get_env_var('n', None), u''))
    if end is None: end = 150 # about 120 nodes listed on the web form
    else: end += start # n is a count, so turn it into an exclusive end ID

    def clear_line(): sys.stderr.write('\n')
    log_indent = 0
    def log(msg, line_ending='\n'):
        sys.stderr.write(('    '*log_indent)+msg+line_ending)

    # Switch to the dir of output files. abspath() is needed because
    # dirname(__file__) is '' when the script is run by bare name from its own
    # dir, and os.chdir('') raises OSError.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    # Get by family ('familia') because that is the most general level at which
    # an identification can be made. This assumes all records have a family.
    url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/'
        'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&'
        'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados'
        '&coleccion=id%3D[node_id]')

    for node_id in xrange(start, end):
        log('Processing node #'+str(node_id)+'...')
        log_indent += 1
        profiler = profiling.ItersProfiler(start_now=True, iter_text='row')

        filename = 'node.'+str(node_id)+'.specimens.csv'
        # Keep a handle on the raw file so it can be closed once the node is
        # done
        out_file = open(filename, 'w')
        out = streams.LineCountOutputStream(out_file)
        def log_ex(e):
            clear_line()
            log('! Output line '+str(out.line_num)+': '+exc.str_(e))
        node_url_template = url_template.replace('[node_id]', str(node_id))

        for prefix_chars in itertools.product(alphabet, repeat=2):
            prefix = ''.join(prefix_chars)
            log('Processing prefix '+prefix+'...')
            row_ct = 0
            def print_status(line_ending='\n'):
                log('Processed '+str(row_ct)+' row(s)', line_ending)
            log_indent += 1

            url = node_url_template.replace('[prefix]', prefix)
            stream = streams.StreamIter(streams.TimeoutInputStream(
                urllib2.urlopen(url), timeout_))

            try:
                util.skip(stream, is_ignore) # skip header
                try: metadata_row = next(csv.reader(stream))
                except StopIteration: raise EmptyResponseException()
                if metadata_row[0] != 'COLLECTION': raise InputException(
                    'Invalid metadata row: '+str(metadata_row))

                # Copy lines
                for line in stream:
                    if is_ignore(line):
                        # An ignorable line that starts with two tabs carries
                        # an error message; any other one just ends the data
                        error = strings.remove_prefix('\t\t', line)
                        if len(error) != len(line): raise InputException(error)
                        break
                    out.write(line)

                    row_ct += 1
                    if row_ct % 100 == 0: print_status('\r')
                        # CR at end so next print overwrites msg
            except EmptyResponseException as e: # must come before InputException
                log_ex(e)
                break # assume node doesn't exist, so abort node
            except InputException as e: log_ex(e) # go on to the next prefix
            except timeout.TimeoutException as e:
                log_ex(e)
                break # assume node is down, so abort node
            finally: # still run if break is called
                stream.close()

                profiler.add_iters(row_ct)
                print_status()
                log_indent -= 1

        # All rows for this node have been written, so release the file handle
        # instead of leaving it to the garbage collector
        out_file.close()

        profiler.stop()
        log(profiler.msg())
        log_indent -= 1
115
116
# Run the download. This call is unguarded, so it also runs if the file is
# imported.
main()