#!/usr/bin/env python
# Downloads REMIB data for all nodes
# Usage: run with no arguments; CSV rows are written to stdout and progress
# messages to stderr
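#
# Example invocation (hypothetical file name, assuming a POSIX shell):
#   python download_remib_data.py >remib_data.csv 2>progress.log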

import csv
import itertools
import os.path
import sys
import urllib2

# Make the project's shared lib directory importable
sys.path.append(os.path.dirname(__file__)+"/../../../lib")

import profiling
import streams
import strings
import util

alphabet = map(chr, xrange(ord('A'), ord('Z')+1))
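# (alphabet is ['A', 'B', ..., 'Z']; itertools.product(alphabet, repeat=2) in
# main() below yields the 676 two-letter prefixes 'AA' through 'ZZ')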

def is_ignore(line):
    # Ignore blank lines, tab-indented lines (error text), and lines without
    # a comma, i.e. anything that is not a CSV data row
    line = strings.remove_line_ending(line)
    return line == '' or line.startswith('\t') or line.find(',') < 0

def main():
    def clear_line(): sys.stderr.write('\n')
    log_indent = 0
    def log(msg, line_ending='\n'):
        sys.stderr.write(('    '*log_indent)+msg+line_ending)
    
    stdout = streams.LineCountOutputStream(sys.stdout)
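    # (LineCountOutputStream, from the project lib, appears to wrap a stream
    # and track the number of lines written; both its .line_num and .write
    # are relied on below)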
    
    # Get by family ('familia') because that is the most general level at which
    # an identification can be made. This assumes all records have a family.
    url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/'
        'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&'
        'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados'
        '&coleccion=id%3D[node_id]')
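    # (For example, node 1 with prefix 'AA' expands to a URL containing
    # taxon=AA%25 and coleccion=id%3D1; '%25' is a URL-encoded '%', used here
    # as a trailing wildcard so 'AA%' matches every family starting with 'AA')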
    done = False
    # Node ids are assumed consecutive from 1; iteration stops at the first
    # node whose response contains no metadata row (see below)
    for node_id in itertools.count(1):
        if done: break
        log('Processing node #'+str(node_id)+'...')
        log_indent += 1
        start_line_num = stdout.line_num
        profiler = profiling.ItersProfiler(start_now=True, iter_text='row')
        
        node_url_template = url_template.replace('[node_id]', str(node_id))
        # Query each two-letter prefix separately: AA, AB, ..., ZZ
        for prefix_chars in itertools.product(alphabet, repeat=2):
            if done: break
            prefix = ''.join(prefix_chars)
            log('Processing prefix '+prefix+'...')
            row_ct = 0
            def print_status(line_ending='\n'):
                log('Processed '+str(row_ct)+' row(s)', line_ending)
            log_indent += 1
            
            url = node_url_template.replace('[prefix]', prefix)
            stream = streams.StreamIter(urllib2.urlopen(url))
            
            util.skip(stream, is_ignore) # skip header
            try:
                metadata_row = csv.reader(stream).next()
                assert metadata_row[0] == 'COLLECTION'
            except StopIteration: done = True # empty response = no more nodes
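            # (On an empty response the stream is already exhausted, so the
            # copy loop below writes nothing and the node loop then exits)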
            
            # Copy data lines to stdout until a non-data line is reached
            for line in stream:
                if is_ignore(line):
                    # A line prefixed with two tabs carries an error message
                    error = strings.remove_prefix('\t\t', line)
                    if len(error) != len(line):
                        clear_line()
                        log('! Line '+str(stdout.line_num)+': '+error.rstrip())
                    break
                # CR at end so the next status message overwrites this one
                if row_ct % 100 == 0: print_status('\r')
                stdout.write(line)
                row_ct += 1
            stream.close()
            
            profiler.add_iters(row_ct)
            print_status()
            log_indent -= 1
            
        profiler.stop()
        log(profiler.msg())
        
        # Report the (inclusive) range of output lines this node produced
        if stdout.line_num == start_line_num: line_range = '<none>'
        else: line_range = str(start_line_num)+'-'+str(stdout.line_num-1)
        log('Used output lines '+line_range)
        
        log_indent -= 1

if __name__ == '__main__':
    main()