Project

General

Profile

1 1631 aaronmk
#!/usr/bin/env python
2
# Downloads REMIB data for all nodes
3 1646 aaronmk
# Usage: env [start=...] [n=...] self 2>>log
4 1631 aaronmk
5 1633 aaronmk
import csv
6
import itertools
7 1631 aaronmk
import os.path
8
import StringIO
9
import sys
10
import urllib2
11
12
# Make the project's lib dir importable. The script's dir is made absolute first
# because dirname(__file__) is '' when the script is run by bare filename (which
# would yield "/../../../lib"), and because a relative entry would stop
# resolving correctly once main() changes the working dir.
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../../../lib")
13
14 1637 aaronmk
import exc
15 1646 aaronmk
import opts
16 1631 aaronmk
import profiling
17
import streams
18
import strings
19 1637 aaronmk
import timeout
20 1631 aaronmk
import util
21
22 1638 aaronmk
timeout_ = 20 # sec; passed to streams.TimeoutInputStream for each HTTP response
23 1637 aaronmk
24 1633 aaronmk
# The uppercase letters 'A'..'Z'. main() combines these into two-letter prefixes
# (AA..ZZ) and issues one query per prefix.
alphabet = map(chr, xrange(ord('A'), ord('Z')+1))
25
26 1637 aaronmk
class InputException(Exception):
    '''Raised by main() when a response has an invalid metadata row or contains
    an error message line (a line prefixed with two tabs)'''
27
28 1631 aaronmk
def is_ignore(line):
    '''Returns whether a response line is not a CSV data row.
    A line is ignored if, after removing its line ending, it is empty, starts
    with a tab, or contains no comma.
    '''
    stripped = strings.remove_line_ending(line)
    if stripped == '': return True # blank line
    if stripped.startswith('\t'): return True # tab-indented line
    return ',' not in stripped # no field separator, so not a CSV row
31
32
def main():
    '''Downloads the specimen rows of each REMIB node into its own CSV file.
    
    Config is read from env vars:
    - start: ID of the first node to download (default 1)
    - n: # of nodes to download. If not provided, nodes are downloaded until
      one returns an empty response.
    
    Each node is queried once per two-letter prefix (AA..ZZ), and the data rows
    of every response are written to node.<id>.specimens.csv in the dir
    containing this script. Progress and errors are logged to stderr.
    Errors opening a URL are not caught and abort the whole run.
    '''
    # Get config from env vars
    start = util.cast(int, opts.get_env_var('start', 1))
    end = util.cast(int, util.none_if(opts.get_env_var('n', None), u''))
    if end is not None: end += start # n is a count, so make it an end ID
    
    def clear_line(): sys.stderr.write('\n')
    log_indent = 0
    def log(msg, line_ending='\n'):
        # Uses the value log_indent has at the time of the call
        sys.stderr.write(('    '*log_indent)+msg+line_ending)
    
    # Change to the dir of output files. abspath() is needed because
    # dirname(__file__) is '' when the script is run by bare filename, and
    # chdir('') raises OSError.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    
    # Get by family ('familia') because that is the most general level at which
    # an identification can be made. This assumes all records have a family.
    url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/'
        'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&'
        'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados'
        '&coleccion=id%3D[node_id]')
    
    if end is None: node_ids = itertools.count(start) # until empty response
    else: node_ids = xrange(start, end)
    done = False # set when a node returns an empty response
    for node_id in node_ids:
        if done: break
        log('Processing node #'+str(node_id)+'...')
        log_indent += 1
        profiler = profiling.ItersProfiler(start_now=True, iter_text='row')
        
        filename = 'node.'+str(node_id)+'.specimens.csv'
        out_file = open(filename, 'w')
        try: # ensure the node's output file is closed, even on error
            # NOTE(review): assumes LineCountOutputStream writes through to
            # out_file without buffering of its own -- confirm
            out = streams.LineCountOutputStream(out_file)
            def log_ex(e):
                clear_line() # don't overwrite the last '\r'-terminated status
                log('! Output line '+str(out.line_num)+': '+exc.str_(e))
            node_url_template = url_template.replace('[node_id]', str(node_id))
            
            for prefix_chars in itertools.product(alphabet, repeat=2):
                prefix = ''.join(prefix_chars)
                log('Processing prefix '+prefix+'...')
                row_ct = 0
                def print_status(line_ending='\n'):
                    log('Processed '+str(row_ct)+' row(s)', line_ending)
                log_indent += 1
                
                url = node_url_template.replace('[prefix]', prefix)
                stream = streams.StreamIter(streams.TimeoutInputStream(
                    urllib2.urlopen(url), timeout_))
                
                try:
                    util.skip(stream, is_ignore) # skip header
                    try: metadata_row = next(csv.reader(stream))
                    except StopIteration:
                        done = True # empty response means no more nodes
                        break
                    if metadata_row[0] != 'COLLECTION': raise InputException(
                        'Invalid metadata row: '+str(metadata_row))
                    
                    # Copy lines
                    for line in stream:
                        if is_ignore(line):
                            # A line prefixed with two tabs is reported as an
                            # error; any other ignorable line ends the data
                            error = strings.remove_prefix('\t\t', line)
                            if len(error) != len(line):
                                raise InputException(error)
                            break
                        out.write(line)
                        
                        row_ct += 1
                        if row_ct % 100 == 0: print_status('\r')
                            # CR at end so next print overwrites msg
                except InputException as e: log_ex(e) # continue w/ next prefix
                except timeout.TimeoutException as e:
                    log_ex(e)
                    break # assume node is down, so abort node
                finally: # still run if break is called
                    stream.close()
                    
                    profiler.add_iters(row_ct)
                    print_status()
                    log_indent -= 1
            
            profiler.stop()
            log(profiler.msg())
            log_indent -= 1
        finally: out_file.close()
115
116
# Runs the download whenever this file is executed. There is no __main__ guard,
# so importing the file also runs it.
main()