/lib/db_xml.py - Diff - BIEN 3 - NCEAS Projects

« Previous | Next »

Revision 3194

Added by Aaron Marcuse-Kubitza over 12 years ago

db_xml.py: put_table(): Subsetting in_table: Don't count # rows because this takes a while for large datasets. Instead, use the chunking algorithm in digir_client, which ends the loop when a partial or empty partition is encountered.

     def put_table(db, node, in_table, in_row_ct_ref=None,
         row_ins_ct_ref=None, limit=None, start=0, on_error=exc.raise_,
         parent_ids_loc=None, next=None):
         parent_ids_loc=None, next=None, top_call=True):
         '''
         @param node The XML tree that transforms the input to the output. Similar to
             put()'s node param, but with the input column name prefixed by
             input_col_prefix in place of the column value.
         @return (table, col) Where the pkeys (from INSERT RETURNING) are made
         @return sql_gen.Col Where the pkeys (from INSERT RETURNING) are made
             available
         '''
         in_table = sql_gen.as_Table(in_table)
-...
         def put_table_(node, in_row_ct_ref=None):
             return put_table(db, node, in_table, in_row_ct_ref, row_ins_ct_ref,
                 on_error=on_error, parent_ids_loc=parent_ids_loc, next=next)
                 None, 0, on_error, parent_ids_loc, next, False)
         # Subset and/or partition in_table if needed
         in_row_ct = sql.table_row_count(db, in_table)
         if limit != None or start != 0 or in_row_ct > partition_size:
             end = in_row_ct
             if limit != None: end = min(start + limit, end)
         if top_call:
             full_in_table = in_table
             for start_ in xrange(start, end, partition_size):
                 limit_ = min(end - start_, partition_size)
             total = 0
             while limit == None or total < limit:
                 # Adjust partition size if last partition
                 this_limit = partition_size
                 if limit != None: this_limit = min(this_limit, limit - total)
                 # Row # is internally 0-based, but 1-based to the user
                 db.log_debug('********** Partition: rows '+str(start_+1)+'-'
                     +str(start_+limit_)+' **********', level=1.2)
                 db.log_debug('********** Partition: rows '+str(start+1)+'-'
                     +str(start+this_limit)+' **********', level=1.2)
                 # Subset in_table
                 in_table = copy.copy(full_in_table) # don't modify input!
                 in_table.name = str(in_table) # prepend schema
                 sql.run_query_into(db, sql.mk_select(db, full_in_table,
                     limit=limit_, start=start_), into=in_table)
                 cur = sql.run_query_into(db, sql.mk_select(db, full_in_table,
                     limit=this_limit, start=start), into=in_table)
                     # full_in_table will be shadowed (hidden) by created temp table
                 this_ct = cur.rowcount
                 total += this_ct
                 start += this_ct # advance start to fetch next set
                 if this_ct == 0: break # if in_table size is exact multiple of limit
                 sql.add_pkey(db, in_table)
                 # Recurse
-...
                 sql.empty_temp(db, in_table)
                 if this_ct < partition_size: break # partial partition = last
                 # Work around PostgreSQL's temp table disk space leak
                 db.reconnect()

Also available in: Unified diff

Project

General

Profile

Revision 3194

Added by Aaron Marcuse-Kubitza over 12 years ago