/ - Diff - BIEN 3 - NCEAS Projects

« Previous | Next »

Revision 3651

Added by Aaron Marcuse-Kubitza over 12 years ago

db_xml.py: Moved put() before the functions that use it

                     elif child_name == name: return child
         return None
     def get(db, node, limit=None, start=None):
         '''
         Selects rows from the table named by node's first child.
         Each element child of that table node is interpreted as follows:
         - empty element: its name is added to the fields to select
         - text element: its name and value become a filter condition
         - anything else: rejected, because it would require a join
         @param node XML node whose firstChild names the table to query
         @param limit Passed through to sql.select()
         @param start Passed through to sql.select()
         @return The result of sql.select()
         @raise Exception If a child element is neither empty nor text
         '''
         node = node.firstChild
         table = name_of(node)
         pkey_ = sql.pkey(db, table) # looked up once and reused below
         fields = []
         conds = {}
         for child in xml_dom.NodeElemIter(node):
             child_name = name_of(child)
             if xml_dom.is_empty(child): fields.append(child_name)
             elif xml_dom.is_text(child): conds[child_name] = xml_dom.value(child)
             else: raise Exception('Joins not supported yet')
         id_ = xml_dom.get_id(node)
         # An explicit id on the table node overrides any pkey condition that was
         # given as a child element
         if id_ is not None: conds[pkey_] = id_
         # With no fields requested, select just the pkey
         if not fields: fields.append(pkey_)
         return sql.select(db, table, fields, conds, limit, start)
     class ColRef:
         '''A reference to a table column'''
         def __init__(self, name, idx):
-...
         def __str__(self): return self.name
     # Controls when and how put_table() will partition the input table
     partition_size = 500000 # max rows that put_table() imports per partition
     # Marks a value in put_table()'s XML tree as the name of an input column
     # rather than a literal column value (see put_table()'s @param node)
     input_col_prefix = '$'
     # NOTE(review): presumably the names of functions that put_table()'s import
     # treats specially; not referenced in the code visible here -- confirm
     put_table_special_funcs = set(['_simplifyPath'])
     def put_table(db, node, in_table, in_row_ct_ref=None, row_ins_ct_ref=None,
         limit=None, start=0, on_error=exc.raise_, col_defaults={}):
         '''
         Imports the rows of in_table via put(), one partition of at most
         partition_size rows at a time.
         @param node The XML tree that transforms the input to the output. Similar to
             put()'s node param, but with the input column name prefixed by
             input_col_prefix in place of the column value.
         @param in_table The input table; converted with sql_gen.as_Table()
         @param in_row_ct_ref If set, an indexable whose element [0] is incremented
             by the # of input rows in each partition
         @param row_ins_ct_ref Passed through to put()
         @param limit Max total # of rows to import, or None for no limit
         @param start 0-based index of the first input row to import
         @param on_error Passed through to put()
         @param col_defaults dict of column -> XML node. Each node is imported with
             put() once, before partitioning. The dict passed in is not modified.
         @return sql_gen.Col Where the pkeys (from INSERT RETURNING) are made
             available. This is the value for the last partition imported, or
             None if no partition contained any rows.
         '''
         in_table = sql_gen.as_Table(in_table)
         in_table.set_srcs([in_table], overwrite=False)
         db.src = str(in_table)
         db.autoanalyze = True # but don't do this in row-based import
         db.autoexplain = True # but don't do this in row-based import
         # Import col_defaults into a new dict, so that neither the caller's dict
         # nor the shared default argument is overwritten with the put() results
         col_defaults = dict((col, put(db, node_, row_ins_ct_ref, on_error))
             for col, node_ in col_defaults.items())
         # Subset and partition in_table
         # OK to do even if table already the right size because it takes <1 sec.
         full_in_table = in_table
         total = 0
         # Stays None if no partition has rows (limit of 0 or empty input table),
         # instead of leaving the name unbound at the return statement
         pkeys_loc = None
         while limit is None or total < limit:
             # Adjust partition size if last partition
             this_limit = partition_size
             if limit is not None: this_limit = min(this_limit, limit - total)
             # Row # is internally 0-based, but 1-based to the user
             db.log_debug('********** Partition: rows '+str(start+1)+'-'
                 +str(start+this_limit)+' **********', level=1.2)
             # Subset in_table
             in_table = copy.copy(full_in_table) # don't modify input!
             in_table.name = str(in_table) # prepend schema
             cur = sql.run_query_into(db, sql.mk_select(db, full_in_table,
                 limit=this_limit, start=start), into=in_table, add_pkey_=True)
                 # full_in_table will be shadowed (hidden) by created temp table
             this_ct = cur.rowcount
             total += this_ct
             start += this_ct # advance start to fetch next set
             if this_ct == 0: break # in_table size is multiple of partition_size
             # Recurse
             pkeys_loc = put(db, node, row_ins_ct_ref, on_error,
                 col_defaults, in_table)
             if in_row_ct_ref is not None: in_row_ct_ref[0] += this_ct
             sql.empty_temp(db, in_table)
             if this_ct < partition_size: break # partial partition = last
             # Work around PostgreSQL's temp table disk space leak
             db.reconnect()
         return pkeys_loc
     def put(db, node, row_ins_ct_ref=None, on_error=exc.raise_,
         col_defaults={}, in_table=None, parent_ids_loc=None, next=None):
         def put_(node, in_row_ct_ref=None):
-...
         for child in children: put_(child)
         return pkeys_loc
     def get(db, node, limit=None, start=None):
         '''
         Selects rows from the table named by node's first child.
         Each element child of that table node is interpreted as follows:
         - empty element: its name is added to the fields to select
         - text element: its name and value become a filter condition
         - anything else: rejected, because it would require a join
         @param node XML node whose firstChild names the table to query
         @param limit Passed through to sql.select()
         @param start Passed through to sql.select()
         @return The result of sql.select()
         @raise Exception If a child element is neither empty nor text
         '''
         node = node.firstChild
         table = name_of(node)
         pkey_ = sql.pkey(db, table) # looked up once and reused below
         fields = []
         conds = {}
         for child in xml_dom.NodeElemIter(node):
             child_name = name_of(child)
             if xml_dom.is_empty(child): fields.append(child_name)
             elif xml_dom.is_text(child): conds[child_name] = xml_dom.value(child)
             else: raise Exception('Joins not supported yet')
         id_ = xml_dom.get_id(node)
         # An explicit id on the table node overrides any pkey condition that was
         # given as a child element
         if id_ is not None: conds[pkey_] = id_
         # With no fields requested, select just the pkey
         if not fields: fields.append(pkey_)
         return sql.select(db, table, fields, conds, limit, start)
     # Controls when and how put_table() will partition the input table
     partition_size = 500000 # max rows that put_table() imports per partition
     # Marks a value in put_table()'s XML tree as the name of an input column
     # rather than a literal column value (see put_table()'s @param node)
     input_col_prefix = '$'
     def put_table(db, node, in_table, in_row_ct_ref=None, row_ins_ct_ref=None,
         limit=None, start=0, on_error=exc.raise_, col_defaults={}):
         '''
         Imports the rows of in_table via put(), one partition of at most
         partition_size rows at a time.
         @param node The XML tree that transforms the input to the output. Similar to
             put()'s node param, but with the input column name prefixed by
             input_col_prefix in place of the column value.
         @param in_table The input table; converted with sql_gen.as_Table()
         @param in_row_ct_ref If set, an indexable whose element [0] is incremented
             by the # of input rows in each partition
         @param row_ins_ct_ref Passed through to put()
         @param limit Max total # of rows to import, or None for no limit
         @param start 0-based index of the first input row to import
         @param on_error Passed through to put()
         @param col_defaults dict of column -> XML node. Each node is imported with
             put() once, before partitioning. The dict passed in is not modified.
         @return sql_gen.Col Where the pkeys (from INSERT RETURNING) are made
             available. This is the value for the last partition imported, or
             None if no partition contained any rows.
         '''
         in_table = sql_gen.as_Table(in_table)
         in_table.set_srcs([in_table], overwrite=False)
         db.src = str(in_table)
         db.autoanalyze = True # but don't do this in row-based import
         db.autoexplain = True # but don't do this in row-based import
         # Import col_defaults into a new dict, so that neither the caller's dict
         # nor the shared default argument is overwritten with the put() results
         col_defaults = dict((col, put(db, node_, row_ins_ct_ref, on_error))
             for col, node_ in col_defaults.items())
         # Subset and partition in_table
         # OK to do even if table already the right size because it takes <1 sec.
         full_in_table = in_table
         total = 0
         # Stays None if no partition has rows (limit of 0 or empty input table),
         # instead of leaving the name unbound at the return statement
         pkeys_loc = None
         while limit is None or total < limit:
             # Adjust partition size if last partition
             this_limit = partition_size
             if limit is not None: this_limit = min(this_limit, limit - total)
             # Row # is internally 0-based, but 1-based to the user
             db.log_debug('********** Partition: rows '+str(start+1)+'-'
                 +str(start+this_limit)+' **********', level=1.2)
             # Subset in_table
             in_table = copy.copy(full_in_table) # don't modify input!
             in_table.name = str(in_table) # prepend schema
             cur = sql.run_query_into(db, sql.mk_select(db, full_in_table,
                 limit=this_limit, start=start), into=in_table, add_pkey_=True)
                 # full_in_table will be shadowed (hidden) by created temp table
             this_ct = cur.rowcount
             total += this_ct
             start += this_ct # advance start to fetch next set
             if this_ct == 0: break # in_table size is multiple of partition_size
             # Recurse
             pkeys_loc = put(db, node, row_ins_ct_ref, on_error,
                 col_defaults, in_table)
             if in_row_ct_ref is not None: in_row_ct_ref[0] += this_ct
             sql.empty_temp(db, in_table)
             if this_ct < partition_size: break # partial partition = last
             # Work around PostgreSQL's temp table disk space leak
             db.reconnect()
         return pkeys_loc

Also available in: Unified diff

Project

General

Profile

Revision 3651

Added by Aaron Marcuse-Kubitza over 12 years ago