Project

General

Profile

1 1631 aaronmk
#!/usr/bin/env python
2
# Downloads REMIB data for all nodes
3 1646 aaronmk
# Usage: env [start=...] [n=...] self 2>>log
4 1631 aaronmk
5 1633 aaronmk
import csv
6
import itertools
7 1631 aaronmk
import os.path
8
import StringIO
9
import sys
10
import urllib2
11
12
# Make the project's shared lib dir importable. The path is anchored on the
# absolute location of this script: when the script is started by bare filename,
# os.path.dirname(__file__) is '' and the entry would otherwise resolve relative
# to the filesystem root instead of the script's directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__))+"/../../../lib")
13
14 1637 aaronmk
import exc
15 1646 aaronmk
import opts
16 1631 aaronmk
import profiling
17
import streams
18
import strings
19 1637 aaronmk
import timeout
20 1631 aaronmk
import util
21
22 1638 aaronmk
# Timeout, in seconds, passed to the input stream that wraps each HTTP response
timeout_ = 20
23 1637 aaronmk
24 1633 aaronmk
# Uppercase letters 'A'..'Z'; main() pairs them up into two-letter taxon
# prefixes. Built as an explicit list (rather than the result of map()) because
# it is iterated again for every node and so must be a reusable sequence.
alphabet = [chr(code) for code in range(ord('A'), ord('Z')+1)]
25
26 1637 aaronmk
class InputException(Exception):
    """Signals that the downloaded input is invalid or contains an error."""
    pass
27
28 1631 aaronmk
def is_ignore(line):
    """Return whether a line of the response is not a data row.

    After its line ending is removed, a line is ignored when it is empty,
    starts with a tab, or contains no comma.
    """
    text = strings.remove_line_ending(line)
    if text == '': return True
    if text.startswith('\t'): return True
    return ',' not in text
31
32
def main():
    """Download the specimen records of each REMIB node into CSV files.

    Configuration is read from environment variables:
    - start: ID of the first node to download (default 1)
    - n: number of nodes to download. Without a limit, node IDs count up from
      start until a node's response contains no metadata row.

    Each node is written to node.<node_id>.specimens.csv in the directory
    containing this script. Progress, per-node profiling info and input errors
    are written to stderr.
    """
    # Get config from env vars
    start = util.cast(int, opts.get_env_var('start', 1))
    # presumably none_if() maps an empty n to None (no limit) -- confirm in util
    end = util.cast(int, util.none_if(opts.get_env_var('n', None), u''))
    if end is not None: end += start # n is a count, so turn it into an end ID

    def clear_line(): sys.stderr.write('\n')
    log_indent = 0
    def log(msg, line_ending='\n'):
        sys.stderr.write(('    '*log_indent)+msg+line_ending)

    # dir of output files. abspath() is needed because dirname() of a bare
    # script filename is '', which os.chdir() rejects.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    # Get by family ('familia') because that is the most general level at which
    # an identification can be made. This assumes all records have a family.
    url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/'
        'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&'
        'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados'
        '&coleccion=id%3D[node_id]')

    if end is None: node_ids = itertools.count(start)
    else: node_ids = xrange(start, end)
    done = False # set once a response is empty, i.e. there are no more nodes
    for node_id in node_ids:
        if done: break
        log('Processing node #'+str(node_id)+'...')
        log_indent += 1
        profiler = profiling.ItersProfiler(start_now=True, iter_text='row')

        filename = 'node.'+str(node_id)+'.specimens.csv'
        node_url_template = url_template.replace('[node_id]', str(node_id))
        # The with block closes (and so flushes) the node's file as soon as the
        # node is finished, even if an unexpected error aborts the download
        with open(filename, 'w') as out_file:
            out = streams.LineCountOutputStream(out_file)

            # Query the node once per two-letter family-name prefix
            for prefix_chars in itertools.product(alphabet, repeat=2):
                if done: break
                prefix = ''.join(prefix_chars)
                log('Processing prefix '+prefix+'...')
                row_ct = 0
                def print_status(line_ending='\n'):
                    log('Processed '+str(row_ct)+' row(s)', line_ending)
                log_indent += 1

                url = node_url_template.replace('[prefix]', prefix)
                stream = streams.StreamIter(streams.TimeoutInputStream(
                    urllib2.urlopen(url), timeout_))

                # Copy lines
                try:
                    util.skip(stream, is_ignore) # skip header
                    try:
                        metadata_row = next(csv.reader(stream))
                        if metadata_row[0] != 'COLLECTION':
                            raise InputException('Invalid metadata row: '
                                +str(metadata_row))
                    except StopIteration:
                        done = True # empty response means no more nodes

                    for line in stream:
                        if is_ignore(line):
                            # A non-data line ends the data rows. If it starts
                            # with two tabs, the rest is reported as an error.
                            error = strings.remove_prefix('\t\t', line)
                            if len(error) != len(line):
                                raise InputException(error)
                            break
                        out.write(line)

                        row_ct += 1
                        if row_ct % 100 == 0: print_status('\r')
                            # CR at end so next print overwrites msg
                except (InputException, timeout.TimeoutException) as e:
                    # Report the problem and move on to the next prefix
                    clear_line()
                    log('! Output line '+str(out.line_num)+': '+exc.str_(e))
                finally:
                    stream.close() # also on errors that are not handled here

                profiler.add_iters(row_ct)
                print_status()
                log_indent -= 1

        profiler.stop()
        log(profiler.msg())
        log_indent -= 1
111
112
# Script entry point: the download starts as soon as this file is executed
main()