Project

General

Profile

« Previous | Next » 

Revision 3194

db_xml.py: put_table(): Subsetting in_table: Don't count # rows because this takes awhile for large datasets. Instead, use the chunking algorithm in digir_client, which ends the loop when a partial or empty partition is encountered.

View differences:

db_xml.py
138 138

  
139 139
def put_table(db, node, in_table, in_row_ct_ref=None,
140 140
    row_ins_ct_ref=None, limit=None, start=0, on_error=exc.raise_,
141
    parent_ids_loc=None, next=None):
141
    parent_ids_loc=None, next=None, top_call=True):
142 142
    '''
143 143
    @param node The XML tree that transforms the input to the output. Similar to
144 144
        put()'s node param, but with the input column name prefixed by
145 145
        input_col_prefix in place of the column value.
146
    @return (table, col) Where the pkeys (from INSERT RETURNING) are made
146
    @return sql_gen.Col Where the pkeys (from INSERT RETURNING) are made
147 147
        available
148 148
    '''
149 149
    in_table = sql_gen.as_Table(in_table)
......
152 152
    
153 153
    def put_table_(node, in_row_ct_ref=None):
154 154
        return put_table(db, node, in_table, in_row_ct_ref, row_ins_ct_ref,
155
            on_error=on_error, parent_ids_loc=parent_ids_loc, next=next)
155
            None, 0, on_error, parent_ids_loc, next, False)
156 156
    
157 157
    # Subset and/or partition in_table if needed
158
    in_row_ct = sql.table_row_count(db, in_table)
159
    if limit != None or start != 0 or in_row_ct > partition_size:
160
        end = in_row_ct
161
        if limit != None: end = min(start + limit, end)
162
        
158
    if top_call:
163 159
        full_in_table = in_table
164 160
        
165
        for start_ in xrange(start, end, partition_size):
166
            limit_ = min(end - start_, partition_size)
161
        total = 0
162
        while limit == None or total < limit:
163
            # Adjust partition size if last partition
164
            this_limit = partition_size
165
            if limit != None: this_limit = min(this_limit, limit - total)
167 166
            
168 167
            # Row # is interally 0-based, but 1-based to the user
169
            db.log_debug('********** Partition: rows '+str(start_+1)+'-'
170
                +str(start_+limit_)+' **********', level=1.2)
168
            db.log_debug('********** Partition: rows '+str(start+1)+'-'
169
                +str(start+this_limit)+' **********', level=1.2)
171 170
            
172 171
            # Subset in_table
173 172
            in_table = copy.copy(full_in_table) # don't modify input!
174 173
            in_table.name = str(in_table) # prepend schema
175
            sql.run_query_into(db, sql.mk_select(db, full_in_table,
176
                limit=limit_, start=start_), into=in_table)
174
            cur = sql.run_query_into(db, sql.mk_select(db, full_in_table,
175
                limit=this_limit, start=start), into=in_table)
177 176
                # full_in_table will be shadowed (hidden) by created temp table
177
            
178
            this_ct = cur.rowcount
179
            total += this_ct
180
            start += this_ct # advance start to fetch next set
181
            if this_ct == 0: break # if in_table size is exact multiple of limit
182
            
178 183
            sql.add_pkey(db, in_table)
179 184
            
180 185
            # Recurse
......
182 187
            
183 188
            sql.empty_temp(db, in_table)
184 189
            
190
            if this_ct < partition_size: break # partial partition = last
191
            
185 192
            # Work around PostgreSQL's temp table disk space leak
186 193
            db.reconnect()
187 194
        

Also available in: Unified diff