#!/usr/bin/env python
# Downloads REMIB data for all nodes
# Usage: run with no arguments; CSV rows are written to stdout and progress
# messages to stderr
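#
# Example invocation (hypothetical file name, assuming a POSIX shell):
#   python download_remib_data.py >remib_data.csv 2>progress.log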

import csv
import itertools
import os.path
import sys
import urllib2

# Make the project's shared lib directory importable
sys.path.append(os.path.dirname(__file__)+"/../../../lib")

import profiling
import streams
import strings
import util

alphabet = map(chr, xrange(ord('A'), ord('Z')+1))
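# (alphabet is ['A', 'B', ..., 'Z']; itertools.product(alphabet, repeat=2) in
# main() below yields the 676 two-letter prefixes 'AA' through 'ZZ')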

def is_ignore(line):
    # Ignore blank lines, tab-indented lines (error text), and lines without
    # a comma, i.e. anything that is not a CSV data row
    line = strings.remove_line_ending(line)
    return line == '' or line.startswith('\t') or line.find(',') < 0

def main():
    def clear_line(): sys.stderr.write('\n')
    log_indent = 0
    def log(msg, line_ending='\n'):
        sys.stderr.write(('    '*log_indent)+msg+line_ending)
    
    stdout = streams.LineCountOutputStream(sys.stdout)
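    # (LineCountOutputStream, from the project lib, appears to wrap a stream
    # and track the number of lines written; both its .line_num and .write
    # are relied on below)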
    
    # Get by family ('familia') because that is the most general level at which
    # an identification can be made. This assumes all records have a family.
    url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/'
        'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&'
        'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados'
        '&coleccion=id%3D[node_id]')
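    # (For example, node 1 with prefix 'AA' expands to a URL containing
    # taxon=AA%25 and coleccion=id%3D1; '%25' is a URL-encoded '%', used here
    # as a trailing wildcard so 'AA%' matches every family starting with 'AA')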
    done = False
    # Node ids are assumed consecutive from 1; iteration stops at the first
    # node whose response contains no metadata row (see below)
    for node_id in itertools.count(1):
        if done: break
        log('Processing node #'+str(node_id)+'...')
        log_indent += 1
        start_line_num = stdout.line_num
        profiler = profiling.ItersProfiler(start_now=True, iter_text='row')
        
        node_url_template = url_template.replace('[node_id]', str(node_id))
        # Query each two-letter prefix separately: AA, AB, ..., ZZ
        for prefix_chars in itertools.product(alphabet, repeat=2):
            if done: break
            prefix = ''.join(prefix_chars)
            log('Processing prefix '+prefix+'...')
            row_ct = 0
            def print_status(line_ending='\n'):
                log('Processed '+str(row_ct)+' row(s)', line_ending)
            log_indent += 1
            
            url = node_url_template.replace('[prefix]', prefix)
            stream = streams.StreamIter(urllib2.urlopen(url))
            
            util.skip(stream, is_ignore) # skip header
            try:
                metadata_row = csv.reader(stream).next()
                assert metadata_row[0] == 'COLLECTION'
            except StopIteration: done = True # empty response = no more nodes
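            # (On an empty response the stream is already exhausted, so the
            # copy loop below writes nothing and the node loop then exits)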
            
            # Copy data lines to stdout until a non-data line is reached
            for line in stream:
                if is_ignore(line):
                    # A line prefixed with two tabs carries an error message
                    error = strings.remove_prefix('\t\t', line)
                    if len(error) != len(line):
                        clear_line()
                        log('! Line '+str(stdout.line_num)+': '+error.rstrip())
                    break
                # CR at end so the next status message overwrites this one
                if row_ct % 100 == 0: print_status('\r')
                stdout.write(line)
                row_ct += 1
            stream.close()
            
            profiler.add_iters(row_ct)
            print_status()
            log_indent -= 1
            
        profiler.stop()
        log(profiler.msg())
        
        # Report the (inclusive) range of output lines this node produced
        if stdout.line_num == start_line_num: line_range = '<none>'
        else: line_range = str(start_line_num)+'-'+str(stdout.line_num-1)
        log('Used output lines '+line_range)
        
        log_indent -= 1

if __name__ == '__main__':
    main()