/inputs/REMIB/src/nodes.all.specimens.csv.make - BIEN 3 - NCEAS Projects

root/inputs/REMIB/src/nodes.all.specimens.csv.make @ 1631

       #!/usr/bin/env python
       # Downloads REMIB data for all nodes
       # Usage: self
       import os.path
       import StringIO
       import sys
       import urllib2
       sys.path.append(os.path.dirname(__file__)+"/../../../lib")
       import csv
       import profiling
       import streams
       import strings
       import term
       import util
       def is_ignore(line):
           '''Returns whether a line should be skipped rather than treated as a row.
           
           A line is ignored if, once its line ending has been removed, it is empty,
           starts with a tab, or contains no comma.
           '''
           content = strings.remove_line_ending(line)
           if content == '': return True
           if content.startswith('\t'): return True
           return ',' not in content
       def main():
           '''Downloads REMIB specimen records node by node and copies them to stdout.
           
           For each node ID and each family-name prefix, requests the CSV export from
           the REMIB web service, skips the leading ignorable lines and the metadata
           row, and then writes each following line to stdout until the first
           ignorable line (see is_ignore()). If that terminating line carries the
           '\\t\\t' prefix, the rest of it is logged as an error (presumably the
           server's error message format). Progress, timing and the range of output
           lines used by each node are logged to stderr.
           
           Raises AssertionError if the first non-ignored row does not start with
           'COLLECTION'. Errors raised while opening or reading the URL are not
           caught, but the response stream is always closed.
           '''
           def clear_line(): sys.stderr.write(term.clear_line)
           
           log_indent = 0
           def log(msg, line_ending='\n'):
               # Reads log_indent from main()'s scope at call time, so the indent
               # changes made below take effect on the next message
               sys.stderr.write(('    '*log_indent)+msg+line_ending)
           
           stdout = streams.LineCountOutputStream(sys.stdout)
           
           # Get by family ('familia') because that is the most general level at which
           # an identification can be made. This assumes all records have a family.
           url_template = ('http://www.conabio.gob.mx/remib/cgi-bin/'
               'remib_distribucion.cgi?lengua=EN&niveltax=familia&taxon=[prefix]%25&'
               'pais=Todos&pais_otro=&estado=100&formato=csv&mapa=no&mapabase=estados'
               '&coleccion=id%3D[node_id]')
           
           # NOTE(review): the file header says "all nodes", but only node IDs 1-2 and
           # the single prefix 'AC' are requested here -- confirm this is intended
           for node_id in xrange(1, 3):
               log('Processing node #'+str(node_id)+'...')
               log_indent += 1
               profiler = profiling.ItersProfiler(start_now=True, iter_text='row')
               row_ct = 0
               def print_status(line_ending='\n'):
                   log('Processed '+str(row_ct)+' row(s)', line_ending)
               # Captured once per node (not per prefix) because the "Used output
               # lines" summary below is reported per node
               start_line_num = stdout.line_num
               node_url_template = url_template.replace('[node_id]', str(node_id))
               
               for prefix in ['AC']:
                   log('Processing prefix '+prefix+'...')
                   log_indent += 1
                   url = node_url_template.replace('[prefix]', prefix)
                   stream = streams.StreamIter(urllib2.urlopen(url))
                   try:
                       util.skip(stream, is_ignore) # skip header
                       metadata_row = csv.reader(stream).next()
                       # Explicit raise instead of assert so that the check is not
                       # stripped when Python runs with -O
                       if metadata_row[0] != 'COLLECTION':
                           raise AssertionError('Unexpected metadata row: '
                               +repr(metadata_row))
                       
                       # Copy lines
                       for line in stream:
                           if is_ignore(line):
                               # The first ignorable line ends the data; if it had the
                               # '\t\t' prefix, the remainder is logged as an error
                               error = strings.remove_prefix('\t\t', line)
                               if len(error) != len(line):
                                   clear_line()
                                   log('! Line '+str(stdout.line_num)+': '
                                       +error.rstrip())
                               break
                           # CR at end so next print overwrites msg
                           if row_ct % 100 == 0: print_status('\r')
                           stdout.write(line)
                           row_ct += 1
                   finally:
                       # Close the response even if the download fails part-way
                       stream.close()
                   clear_line()
                   log_indent -= 1
               
               profiler.stop(row_ct)
               print_status()
               log(profiler.msg())
               if stdout.line_num == start_line_num: line_range = '<none>'
               else: line_range = str(start_line_num)+'-'+str(stdout.line_num-1)
               log('Used output lines '+line_range)
               log_indent -= 1
       # Script entry point: runs unconditionally whenever this file is executed
       main()

« Previous
1
2
Next »

(1-1/2)

Project

General

Profile

root/inputs/REMIB/src/nodes.all.specimens.csv.make @ 1631