/lib/sql.py - BIEN 3 - NCEAS Projects

root/lib/sql.py @ 2674

       # Database access
       import copy
       import operator
       import re
       import warnings
       import exc
       import dicts
       import iters
       import lists
       from Proxy import Proxy
       import rand
       import sql_gen
       import strings
       import util
       ##### Exceptions
       def get_cur_query(cur, input_query=None, input_params=None):
           '''Returns the text of the query associated with a cursor.
           Uses the cursor's `query` attribute if it has one, otherwise its
           `_last_executed` attribute. If the attribute found is None (or neither
           exists), returns a description built from the input query and params.
           @param cur The cursor to inspect
           @param input_query The query as passed in, used only for the fallback
           @param input_params The params as passed in, used only for the fallback
           '''
           executed = None
           for attr in ('query', '_last_executed'):
               if hasattr(cur, attr):
                   executed = getattr(cur, attr)
                   break # only the first attribute present is consulted
           if executed != None: return executed
           return '[input] '+strings.ustr(input_query)+' % '+repr(input_params)
       def _add_cursor_info(e, *args, **kw_args):
           '''Adds a "query: ..." message to an exception.
           For the params after e, see get_cur_query()'''
           query_text = str(get_cur_query(*args, **kw_args))
           exc.add_msg(e, 'query: '+query_text)
       class DbException(exc.ExceptionWithCause):
           '''Base class of the database exceptions defined in this module.
           @param msg The exception message
           @param cause The underlying exception, if any
           @param cur If given, the cursor's query is added to the message
           '''
           def __init__(self, msg, cause=None, cur=None):
               exc.ExceptionWithCause.__init__(self, msg, cause, cause_newline=True)
               if cur == None: return # no cursor info to add
               _add_cursor_info(self, cur)
       class ExceptionWithName(DbException):
           '''A DbException about a named object. The name is stored in self.name.'''
           def __init__(self, name, cause=None):
               msg = 'for name: '+strings.as_tt(str(name))
               DbException.__init__(self, msg, cause)
               self.name = name
       class ExceptionWithNameValue(DbException):
           '''A DbException about a named object and a value.
           The name and value are stored in self.name and self.value.'''
           def __init__(self, name, value, cause=None):
               name_part = 'for name: '+strings.as_tt(str(name))
               value_part = '; value: '+strings.as_tt(repr(value))
               DbException.__init__(self, name_part+value_part, cause)
               self.name = name
               self.value = value
       class ConstraintException(DbException):
           '''A constraint violation.
           @param name The constraint's name, stored in self.name
           @param cols The names of the columns involved, stored in self.cols
           '''
           def __init__(self, name, cols, cause=None):
               constraint_part = 'Violated '+strings.as_tt(name)
               cols_part = ' constraint on columns: '+strings.as_tt(', '.join(cols))
               DbException.__init__(self, constraint_part+cols_part, cause)
               self.name = name
               self.cols = cols
       class MissingCastException(DbException):
           '''A column needs a cast to a type.
           @param type_ The type name, stored in self.type
           @param col The column name, stored in self.col
           '''
           def __init__(self, type_, col, cause=None):
               type_part = 'Missing cast to type '+strings.as_tt(type_)
               col_part = ' on column: '+strings.as_tt(col)
               DbException.__init__(self, type_part+col_part, cause)
               self.type = type_
               self.col = col
       class NameException(DbException):
           '''A DbException subclass that adds no behavior of its own'''
       class DuplicateKeyException(ConstraintException):
           '''Raised by run_query() for a unique constraint violation'''
       class NullValueException(ConstraintException):
           '''Raised by run_query() for a not-null constraint violation'''
       class FunctionValueException(ExceptionWithNameValue):
           '''Raised by run_query() when a function received an invalid input value'''
       class DuplicateTableException(ExceptionWithName):
           '''Raised by run_query() when a relation already exists'''
       class DuplicateFunctionException(ExceptionWithName):
           '''Raised by run_query() when a function already exists'''
       class EmptyRowException(DbException):
           '''A DbException subclass that adds no behavior of its own'''
       ##### Warnings
       class DbWarning(UserWarning):
           '''Warning category used by this module, e.g. by mk_select()'''
       ##### Result retrieval
       def col_names(cur):
           '''Iterates over the first element (the name) of each entry in
           cur.description'''
           description = cur.description
           return (col_desc[0] for col_desc in description)
       def rows(cur):
           '''Iterates over a cursor's remaining rows, stopping when fetchone()
           returns None'''
           def fetch(): return cur.fetchone()
           return iter(fetch, None)
       def consume_rows(cur):
           '''Fetches all the remaining rows so that the result will be cached'''
           remaining = rows(cur)
           iters.consume_iter(remaining)
       def next_row(cur):
           '''Returns the next row. Raises StopIteration if there are none left.'''
           row_iter = rows(cur)
           return row_iter.next()
       def row(cur):
           '''Returns the next row, then fetches and discards the rows after it'''
           first = next_row(cur)
           consume_rows(cur)
           return first
       def next_value(cur):
           '''Returns the first column of the next row'''
           next_ = next_row(cur)
           return next_[0]
       def value(cur):
           '''Returns the first column of the next row, consuming the rows after
           it (see row())'''
           only_row = row(cur)
           return only_row[0]
       def values(cur):
           '''Returns iters.func_iter() applied to a function that fetches the next
           row's first column.
           NOTE(review): presumably this iterates until next_value() raises
           StopIteration -- confirm against iters.func_iter()
           '''
           def fetch_value(): return next_value(cur)
           return iters.func_iter(fetch_value)
       def value_or_none(cur):
           '''Like value(), but returns None if there are no rows left'''
           try: result = value(cur)
           except StopIteration: result = None
           return result
       ##### Input validation
       def esc_name_by_module(module, name):
           '''Escapes an identifier using the quote character of a DB-API module.
           @param module 'psycopg2' or None (double quote) or 'MySQLdb' (backtick)
           @raise NotImplementedError for any other module
           '''
           if module == 'MySQLdb': quote = '`'
           elif module in ('psycopg2', None): quote = '"'
           else:
               raise NotImplementedError("Can't escape name for "+module+' database')
           return sql_gen.esc_name(name, quote)
       def esc_name_by_engine(engine, name, **kw_args):
           '''Escapes an identifier for an engine named in db_engines.
           For other params, see esc_name_by_module()'''
           module_name = db_engines[engine][0]
           return esc_name_by_module(module_name, name, **kw_args)
       def esc_name(db, name, **kw_args):
           '''Escapes an identifier for the module that db's connection comes from.
           For other params, see esc_name_by_module()'''
           module = util.root_module(db.db)
           return esc_name_by_module(module, name, **kw_args)
       def qual_name(db, schema, table):
           '''Returns the escaped table name, prefixed with the escaped schema and a
           "." if schema is not None'''
           quoted_table = esc_name(db, table)
           if schema == None: return quoted_table
           return esc_name(db, schema)+'.'+quoted_table
       ##### Database connections
       # Names of db_config keys. NOTE(review): not referenced in the visible code;
       # presumably the full set of supported keys -- confirm against callers.
       db_config_names = ['engine', 'host', 'user', 'password', 'database', 'schemas']
       # Maps each engine name to (DB-API module name, {db_config key: name that key
       # is renamed to before the config is passed to the module's connect()})
       db_engines = {
           'MySQL': ('MySQLdb', {'password': 'passwd', 'database': 'db'}),
           'PostgreSQL': ('psycopg2', {}),
       }
       # Exception classes counted as database errors. _add_module() adds each
       # loaded DB-API module's DatabaseError class to this set.
       DatabaseErrors_set = set((DbException,))
       # Tuple snapshot of DatabaseErrors_set, rebuilt by _add_module()
       DatabaseErrors = tuple(DatabaseErrors_set)
       def _add_module(module):
           '''Adds a DB-API module's DatabaseError class to DatabaseErrors_set and
           rebuilds the DatabaseErrors tuple'''
           global DatabaseErrors
           DatabaseErrors_set.add(module.DatabaseError)
           DatabaseErrors = tuple(DatabaseErrors_set)
       def db_config_str(db_config):
           '''Returns "<engine> database <database>" for a db_config dict'''
           engine = db_config['engine']
           database = db_config['database']
           return engine+' database '+database
       def _query_lookup(query, params):
           '''Returns the (query, hashable params) key used to look up a query in
           the result cache'''
           hashable_params = dicts.make_hashable(params)
           return (query, hashable_params)
       def log_debug_none(msg, level=2):
           '''Debug logger that ignores its arguments. DbConn compares its log_debug
           callback against this function to decide whether debugging is on.'''
           return None
       class DbConn:
           '''Database connection wrapper that connects lazily and can cache query
           results.
           The underlying DB-API connection is opened the first time the `db`
           attribute is read (see _db()).
           '''
           def __init__(self, db_config, serializable=True, autocommit=False,
               caching=True, log_debug=log_debug_none):
               '''
               @param db_config dict with an 'engine' key (a key of db_engines), an
                   optional 'schemas' key (comma-separated schema names to put at
                   the front of the search_path), and the connect() settings
               @param serializable Whether to set the SERIALIZABLE isolation level
                   on connecting. Only applied if autocommit is off.
               @param autocommit Whether to commit after each query that is run
                   outside a savepoint
               @param caching If false, no query is treated as cacheable
               @param log_debug Callback taking (msg, level) for debug messages
               '''
               self.db_config = db_config
               self.serializable = serializable
               self.autocommit = autocommit
               self.caching = caching
               self.log_debug = log_debug
               self.debug = log_debug != log_debug_none
               self.__db = None # the connection; opened by _db()
               self.query_results = {} # _query_lookup() key -> CacheCursor
               self._savepoint = 0 # savepoint nesting depth
               self._notices_seen = set() # notices already logged
           def __getattr__(self, name):
               '''Provides the lazily-connected `db` attribute'''
               if name == '__dict__': raise Exception('getting __dict__')
               if name == 'db': return self._db()
               else: raise AttributeError(name) # name the attribute in the message
           def __getstate__(self):
               '''Returns the pickle state, without the debug callback and the
               connection'''
               state = copy.copy(self.__dict__) # shallow copy
               state['log_debug'] = None # don't pickle the debug callback
               state['_DbConn__db'] = None # don't pickle the connection
               return state
           def connected(self): return self.__db != None
           def _db(self):
               '''Returns the connection, opening and configuring it on first use'''
               if self.__db == None:
                   # Process db_config
                   db_config = self.db_config.copy() # don't modify input!
                   schemas = db_config.pop('schemas', None)
                   module_name, mappings = db_engines[db_config.pop('engine')]
                   module = __import__(module_name)
                   _add_module(module)
                   # Rename config keys to the names used by this module
                   for orig, new in mappings.iteritems():
                       try: util.rename_key(db_config, orig, new)
                       except KeyError: pass # this key wasn't provided
                   # Connect
                   self.__db = module.connect(**db_config)
                   # Configure connection
                   if self.serializable and not self.autocommit: run_raw_query(self,
                       'SET TRANSACTION ISOLATION LEVEL SERIALIZABLE')
                   if schemas != None:
                       # Each escaped schema is followed by ', ' so the list can be
                       # prepended directly to the current search_path
                       schemas_ = ''.join((esc_name(self, s)+', '
                           for s in schemas.split(',')))
                       run_raw_query(self, "SELECT set_config('search_path', \
       %s || current_setting('search_path'), false)", [schemas_])
               return self.__db
           class DbCursor(Proxy):
               '''Cursor proxy that records the rows fetched and stores the finished
               result (or the exception raised) in the connection's query_results'''
               def __init__(self, outer):
                   Proxy.__init__(self, outer.db.cursor())
                   self.outer = outer
                   self.query_results = outer.query_results
                   self.query_lookup = None
                   self.result = []
               def execute(self, query, params=None):
                   '''Runs the query on the wrapped cursor. If it raises, the
                   exception gets the query added to it and is cached as the
                   result before being re-raised.'''
                   self._is_insert = query.upper().find('INSERT') >= 0
                   self.query_lookup = _query_lookup(query, params)
                   try:
                       try:
                           return_value = self.inner.execute(query, params)
                           self.outer.do_autocommit()
                       finally: self.query = get_cur_query(self.inner, query, params)
                   except Exception as e:
                       _add_cursor_info(e, self, query, params)
                       self.result = e # cache the exception as the result
                       self._cache_result()
                       raise
                   # Fetch all rows so result will be cached
                   if self.rowcount == 0 and not self._is_insert: consume_rows(self)
                   return return_value
               def fetchone(self):
                   '''Fetches a row and records it. The result is cached once
                   there are no more rows.'''
                   row = self.inner.fetchone()
                   if row != None: self.result.append(row)
                   # otherwise, fetched all rows
                   else: self._cache_result()
                   return row
               def _cache_result(self):
                   # For inserts, only cache exceptions since inserts are not
                   # idempotent, but an invalid insert will always be invalid
                   if self.query_results != None and (not self._is_insert
                       or isinstance(self.result, Exception)):
                       assert self.query_lookup != None
                       self.query_results[self.query_lookup] = self.CacheCursor(
                           util.dict_subset(dicts.AttrsDictView(self),
                           ['query', 'result', 'rowcount', 'description']))
               class CacheCursor:
                   '''Replays a cached result. Its attributes are the cached
                   cursor attributes.'''
                   def __init__(self, cached_result): self.__dict__ = cached_result
                   def execute(self, *args, **kw_args):
                       '''Re-raises a cached exception, or restarts iteration over
                       the cached rows. The arguments are ignored.'''
                       if isinstance(self.result, Exception): raise self.result
                       # otherwise, result is a rows list
                       self.iter = iter(self.result)
                   def fetchone(self):
                       try: return self.iter.next()
                       except StopIteration: return None
           def esc_value(self, value):
               '''Escapes a value for use in a query and returns it as unicode.
               Falls back to _mysql.escape_string() for MySQLdb, which can't
               mogrify.'''
               try: str_ = self.mogrify('%s', [value])
               except NotImplementedError as e:
                   module = util.root_module(self.db)
                   if module == 'MySQLdb':
                       import _mysql
                       str_ = _mysql.escape_string(value)
                   else: raise e
               return strings.to_unicode(str_)
           def esc_name(self, name): return esc_name(self, name) # calls global func
           def can_mogrify(self):
               '''Whether mogrify() is supported, i.e. the module is psycopg2'''
               module = util.root_module(self.db)
               return module == 'psycopg2'
           def mogrify(self, query, params=None):
               '''Returns the query with the params substituted in.
               @raise NotImplementedError if not can_mogrify()
               '''
               if self.can_mogrify(): return self.db.cursor().mogrify(query, params)
               else: raise NotImplementedError("Can't mogrify query")
           def print_notices(self):
               '''Logs each connection notice that hasn't been logged before'''
               # Only some modules' connections (e.g. psycopg2's) have a notices
               # list, so treat a missing attribute as no notices
               for msg in getattr(self.db, 'notices', []):
                   if msg not in self._notices_seen:
                       self._notices_seen.add(msg)
                       self.log_debug(msg, level=2)
           def run_query(self, query, params=None, cacheable=False, log_level=2,
               debug_msg_ref=None):
               '''Runs a query, using the cached result if there is one and the
               query is cacheable.
               @param params The query params, passed to the cursor's execute()
               @param cacheable Whether the result may be taken from and stored in
                   the cache. Treated as false if caching is off.
               @param log_level The level to log the query at
               @param debug_msg_ref If specified, the log message will be returned in
                   this instead of being output. This allows you to filter log messages
                   depending on the result of the query.
               @return The cursor that the query was run on
               '''
               assert query != None
               if not self.caching: cacheable = False
               used_cache = False
               def log_msg(query):
                   if used_cache: cache_status = 'cache hit'
                   elif cacheable: cache_status = 'cache miss'
                   else: cache_status = 'non-cacheable'
                   return 'DB query: '+cache_status+':\n'+strings.as_code(query, 'SQL')
               # Bound before the try so the finally clause can use it even if
               # getting a cursor fails
               cur = None
               try:
                   # Get cursor
                   if cacheable:
                       query_lookup = _query_lookup(query, params)
                       try:
                           cur = self.query_results[query_lookup]
                           used_cache = True
                       except KeyError: cur = self.DbCursor(self)
                   else: cur = self.db.cursor()
                   # Log query
                   if self.debug and debug_msg_ref == None: # log before running
                       self.log_debug(log_msg(query), log_level)
                   # Run query
                   cur.execute(query, params)
               finally:
                   self.print_notices()
                   if self.debug and debug_msg_ref != None: # return after running
                       debug_msg_ref[0] = log_msg(str(get_cur_query(cur, query,
                           params)))
               return cur
           def is_cached(self, query, params=None):
               return _query_lookup(query, params) in self.query_results
           def with_savepoint(self, func):
               '''Runs func() inside a savepoint, which is rolled back if func()
               raises and released otherwise.
               @return The return value of func()
               '''
               savepoint = 'level_'+str(self._savepoint)
               self.run_query('SAVEPOINT '+savepoint, log_level=4)
               self._savepoint += 1
               try:
                   try: return_val = func()
                   finally:
                       self._savepoint -= 1
                       assert self._savepoint >= 0
               except: # any exception must roll back before propagating
                   self.run_query('ROLLBACK TO SAVEPOINT '+savepoint, log_level=4)
                   raise
               else:
                   self.run_query('RELEASE SAVEPOINT '+savepoint, log_level=4)
                   self.do_autocommit()
                   return return_val
           def do_autocommit(self):
               '''Autocommits if outside savepoint'''
               assert self._savepoint >= 0
               if self.autocommit and self._savepoint == 0:
                   self.log_debug('Autocommitting')
                   self.db.commit()
           def col_default(self, col):
               '''Returns a column's default, read from
               information_schema.columns, as sql_gen code'''
               table = sql_gen.Table('columns', 'information_schema')
               conds = [('table_name', col.table.name), ('column_name', col.name)]
               schema = col.table.schema
               if schema != None: conds.append(('table_schema', schema))
               return sql_gen.as_Code(value(select(self, table, ['column_default'],
                   conds, order_by='table_schema', limit=1, log_level=3)))
                   # TODO: order_by search_path schema order
       connect = DbConn
       ##### Querying
       def run_raw_query(db, *args, **kw_args):
           '''Runs a query by calling db.run_query() directly.
           For params, see DbConn.run_query()'''
           cur = db.run_query(*args, **kw_args)
           return cur
       def mogrify(db, query, params):
           '''Returns the query with the params substituted in.
           @raise NotImplementedError if the connection's module is not psycopg2
           '''
           module = util.root_module(db.db)
           if module != 'psycopg2':
               raise NotImplementedError("Can't mogrify query for "+module+
                   ' database')
           return db.db.cursor().mogrify(query, params)
       ##### Recoverable querying
       def with_savepoint(db, func):
           '''Runs func() inside a savepoint. See DbConn.with_savepoint().'''
           return_val = db.with_savepoint(func)
           return return_val
       def run_query(db, query, params=None, recover=None, cacheable=False,
           log_level=2, log_ignore_excs=None, **kw_args):
           '''Runs a query, translating recognized database error messages into
           this module's exception classes.
           For params not listed here, see run_raw_query()
           @param recover Whether to run the query in a savepoint (unless its
               result is already cached) and to parse any exception it raises.
               If false (the default), exceptions are re-raised unchanged.
           @param log_ignore_excs The log_level will be increased by 2 if the query
               throws one of these exceptions.
           '''
           if recover == None: recover = False
           if log_ignore_excs == None: log_ignore_excs = ()
           log_ignore_excs = tuple(log_ignore_excs)
           debug_msg_ref = None # usually, db.run_query() logs query before running it
           # But if filtering with log_ignore_excs, wait until after exception parsing
           if log_ignore_excs != () or not db.can_mogrify(): debug_msg_ref = [None]
           try:
               try:
                   def run(): return run_raw_query(db, query, params, cacheable,
                       log_level, debug_msg_ref, **kw_args)
                   if recover and not db.is_cached(query, params):
                       return with_savepoint(db, run)
                   else: return run() # don't need savepoint if cached
               except Exception, e:
                   if not recover: raise # need savepoint to run index_cols()
                   # Match the message against each known error, in this order
                   msg = exc.str_(e)
                   # Unique constraint violation. The table name is taken to be the
                   # part of the constraint name before the first inner "_".
                   match = re.search(r'duplicate key value violates unique constraint '
                       r'"((_?[^\W_]+)_.+?)"', msg)
                   if match:
                       constraint, table = match.groups()
                       try: cols = index_cols(db, table, constraint)
                       except NotImplementedError: raise e
                       else: raise DuplicateKeyException(constraint, cols, e)
                   # NOT NULL violation
                   match = re.search(r'null value in column "(.+?)" violates not-null'
                       r' constraint', msg)
                   if match: raise NullValueException('NOT NULL', [match.group(1)], e)
                   # Invalid input value inside a function: captures the quoted
                   # value, then the name of the function mentioned later on
                   match = re.search(r'\b(?:invalid input (?:syntax|value)\b.*?'
                       r'|date/time field value out of range): "(.+?)"\n'
                       r'(?:(?s).*?)\bfunction "(.+?)"', msg)
                   if match:
                       value, name = match.groups()
                       raise FunctionValueException(name, strings.to_unicode(value), e)
                   # Expression type doesn't match the column type
                   match = re.search(r'column "(.+?)" is of type (.+?) but expression '
                       r'is of type', msg)
                   if match:
                       col, type_ = match.groups()
                       raise MissingCastException(type_, col, e)
                   # Table or function already exists
                   match = re.search(r'relation "(.+?)" already exists', msg)
                   if match: raise DuplicateTableException(match.group(1), e)
                   match = re.search(r'function "(.+?)" already exists', msg)
                   if match: raise DuplicateFunctionException(match.group(1), e)
                   raise # no specific exception raised
           except log_ignore_excs:
               log_level += 2 # the finally clause below logs at the raised level
               raise
           finally:
               # Output the log message that db.run_query() returned instead of
               # logging, if any
               if debug_msg_ref != None and debug_msg_ref[0] != None:
                   db.log_debug(debug_msg_ref[0], log_level)
       ##### Basic queries
       def next_version(name):
           '''Returns the name with a "#<version>" suffix added via
           sql_gen.add_suffix(). A name already ending in "#<n>" gets version
           n+1 in place of n; any other name gets version 1.'''
           versioned = re.match(r'^(.*)#(\d+)$', name)
           if versioned:
               base, prev_version = versioned.groups()
               version = int(prev_version)+1
           else:
               base = name
               version = 1 # first existing name was version 0
           return sql_gen.add_suffix(base, '#'+str(version))
       def run_query_into(db, query, params, into=None, *args, **kw_args):
           '''Runs a query, optionally storing its rows in a new table.
           If the table name is already taken, into.name is changed to the next
           version of the name (see next_version()) until creation succeeds.
           For other params, see run_query().
           @param into sql_gen.Table|None The table to create with CREATE TABLE AS.
               If None, the query is just run. The table is TEMP, and into.schema
               is set to None, unless db is in autocommit mode.
           '''
           if into == None: return run_query(db, query, params, *args, **kw_args)
           # Otherwise, place rows in temp table
           assert isinstance(into, sql_gen.Table)
           kw_args['recover'] = True
           kw_args.setdefault('log_ignore_excs', (DuplicateTableException,))
           temp = not db.autocommit # tables are permanent in autocommit mode
           # "temporary tables cannot specify a schema name", so remove schema
           if temp:
               into.schema = None
               create_prefix = 'CREATE TEMP TABLE '
           else: create_prefix = 'CREATE TABLE '
           while True:
               try:
                   create_query = create_prefix+into.to_str(db)+' AS\n'+query
                   # CREATE TABLE AS sets rowcount to # rows in query
                   return run_query(db, create_query, params, *args, **kw_args)
               except DuplicateTableException:
                   # try again with next version of name
                   into.name = next_version(into.name)
       order_by_pkey = object() # tells mk_select() to order by the pkey
       distinct_on_all = object() # tells mk_select() to SELECT DISTINCT ON all columns
       def mk_select(db, tables, fields=None, conds=None, distinct_on=[], limit=None,
           start=None, order_by=order_by_pkey, default_table=None):
           '''Builds a SELECT query. Warns with a DbWarning if the query has no
           WHERE conditions and neither limit nor start was given.
           @param tables The single table to select from, or a list of tables to join
               together, with tables after the first being sql_gen.Join objects
           @param fields Use None to select all fields in the table
           @param conds WHERE conditions: [(compare_left_side, compare_right_side),...]
               * container can be any iterable type
               * compare_left_side: sql_gen.Code|str (for col name)
               * compare_right_side: sql_gen.ValueCond|literal value
           @param distinct_on The columns to SELECT DISTINCT ON, or distinct_on_all to
               use all columns
           @param limit int|None The LIMIT
           @param start int|None The OFFSET. An OFFSET clause is only added if this
               is non-zero.
           @param order_by The column to ORDER BY, None for no ordering, or
               order_by_pkey to use the first table's pkey. With order_by_pkey, no
               ordering is added if distinct_on is set.
           @param default_table Passed to sql_gen.as_Col() when parsing the fields
               and distinct_on columns
           @return tuple(query, params)
           '''
           # Parse tables param
           if not lists.is_seq(tables): tables = [tables]
           tables = list(tables) # don't modify input! (list() copies input)
           table0 = sql_gen.as_Table(tables.pop(0)) # first table is separate
           # Parse other params
           if conds == None: conds = []
           elif dicts.is_dict(conds): conds = conds.items()
           conds = list(conds) # don't modify input! (list() copies input)
           assert limit == None or type(limit) == int
           assert start == None or type(start) == int
           if order_by is order_by_pkey:
               if distinct_on != []: order_by = None
               else: order_by = pkey(db, table0, recover=True)
           query = 'SELECT'
           def parse_col(col): return sql_gen.as_Col(col, default_table).to_str(db)
           # DISTINCT ON columns
           if distinct_on != []:
               query += '\nDISTINCT'
               if distinct_on is not distinct_on_all:
                   query += ' ON ('+(', '.join(map(parse_col, distinct_on)))+')'
           # Columns
           query += '\n'
           if fields == None: query += '*'
           else: query += '\n, '.join(map(parse_col, fields))
           # Main table
           query += '\nFROM '+table0.to_str(db)
           # Add joins
           left_table = table0
           for join_ in tables:
               table = join_.table
               # Parse special values
               if join_.type_ is sql_gen.filter_out: # filter no match
                   conds.append((sql_gen.Col(table_not_null_col(db, table), table),
                       None))
               query += '\n'+join_.to_str(db, left_table)
               left_table = table
           missing = True # whether there is no WHERE, LIMIT, or OFFSET setting
           if conds != []:
               query += '\n'+sql_gen.combine_conds([sql_gen.ColValueCond(l, r)
                   .to_str(db) for l, r in conds], 'WHERE')
               missing = False
           if order_by != None:
               query += '\nORDER BY '+sql_gen.as_Col(order_by, table0).to_str(db)
           if limit != None: query += '\nLIMIT '+str(limit); missing = False
           if start != None:
               if start != 0: query += '\nOFFSET '+str(start)
               missing = False # start=0 still counts as having set an offset
           if missing: warnings.warn(DbWarning(
               'SELECT statement missing a WHERE, LIMIT, or OFFSET clause: '+query))
           return (query, [])
       def select(db, *args, **kw_args):
           '''Builds a SELECT query with mk_select() and runs it with run_query().
           The recover, cacheable (default True), and log_level keyword args are
           passed to run_query(). For other params, see mk_select().'''
           run_opts = {}
           for name, fallback in (('recover', None), ('cacheable', True),
               ('log_level', 2)):
               run_opts[name] = kw_args.pop(name, fallback)
           query, params = mk_select(db, *args, **kw_args)
           return run_query(db, query, params, **run_opts)
       def mk_insert_select(db, table, cols=None, select_query=None, params=None,
           returning=None, embeddable=False):
           '''Builds an INSERT query whose rows come from a SELECT or VALUES query.
           @param cols The columns to insert into. None or [] omits the column list.
           @param select_query The query providing the rows. If None, DEFAULT VALUES
               is used.
           @param params The params for select_query
           @param returning str|None An inserted column (such as pkey) to return
           @param embeddable Whether the query should be embeddable as a nested SELECT.
               Warning: If you set this and cacheable=True when the query is run, the
               query will be fully cached, not just if it raises an exception.
               Requires returning. The INSERT is wrapped in a newly created SQL
               function, and a SELECT from that function is returned instead.
           @return tuple(query, params)
           '''
           table = sql_gen.as_Table(table)
           if cols == []: cols = None # no cols (all defaults) = unknown col names
           if cols != None:
               cols = [sql_gen.to_name_only_col(v, table).to_str(db) for v in cols]
           if select_query == None: select_query = 'DEFAULT VALUES'
           if returning != None: returning = sql_gen.as_Col(returning, table)
           # Build query
           first_line = 'INSERT INTO '+table.to_str(db)
           query = first_line
           if cols != None: query += '\n('+', '.join(cols)+')'
           query += '\n'+select_query
           if returning != None:
               # RETURNING uses the column name without its table
               returning_name = copy.copy(returning)
               returning_name.table = None
               returning_name = returning_name.to_str(db)
               query += '\nRETURNING '+returning_name
           if embeddable:
               assert returning != None
               # Create function
               function_name = sql_gen.clean_name(first_line)
               return_type = 'SETOF '+returning.to_str(db)+'%TYPE'
               while True:
                   try:
                       # Outside autocommit mode, the function goes in pg_temp
                       func_schema = None
                       if not db.autocommit: func_schema = 'pg_temp'
                       function = sql_gen.Table(function_name, func_schema).to_str(db)
                       # The params are substituted into the function body here,
                       # so the query returned below doesn't use them
                       function_query = '''\
       CREATE FUNCTION '''+function+'''()
       RETURNS '''+return_type+'''
       LANGUAGE sql
       AS $$
       '''+mogrify(db, query, params)+''';
       $$;
       '''
                       run_query(db, function_query, recover=True, cacheable=True,
                           log_ignore_excs=(DuplicateFunctionException,))
                       break # this version was successful
                   except DuplicateFunctionException, e:
                       function_name = next_version(function_name)
                       # try again with next version of name
               # Return query that uses function
               func_table = sql_gen.NamedTable('f', sql_gen.CustomCode(function+'()'),
                   [returning_name]) # AS clause requires function alias
               return mk_select(db, func_table, start=0, order_by=None)
           return (query, params)
       def insert_select(db, *args, **kw_args):
           '''Builds an INSERT query with mk_insert_select() and runs it with
           run_query_into(). The recover, cacheable (default True), and log_level
           keyword args are passed to run_query_into().
           For other params, see mk_insert_select() and run_query_into()
           @param into sql_gen.Table with suggested name of temp table to put RETURNING
               values in. Setting this makes the query embeddable.
           '''
           into = kw_args.pop('into', None)
           if into != None: kw_args['embeddable'] = True
           run_opts = {}
           for name, fallback in (('recover', None), ('cacheable', True),
               ('log_level', 2)):
               run_opts[name] = kw_args.pop(name, fallback)
           query, params = mk_insert_select(db, *args, **kw_args)
           return run_query_into(db, query, params, into, **run_opts)
       default = object() # tells insert() to use the default value for a column
       def insert(db, table, row, *args, **kw_args):
           '''Inserts one row into a table.
           For other params, see insert_select()
           @param row A sequence of values, or a dict of column name to value. Use
               the `default` object as a value to insert that column's default.
           '''
           if lists.is_seq(row): cols = None
           else:
               cols = row.keys()
               row = row.values()
           row = list(row) # ensure that "!= []" works
           # Check for special values
           labels = [] # one placeholder or DEFAULT keyword per column
           params = [] # the values for the placeholders
           for val in row:
               val = sql_gen.remove_col_rename(sql_gen.as_Value(val)).value
               if val is default: labels.append('DEFAULT')
               else:
                   labels.append('%s')
                   params.append(val)
           # Build query
           # Test labels rather than params: a row containing only `default`
           # markers has no params, but with a column list it still needs a VALUES
           # clause, because "INSERT INTO t (cols) DEFAULT VALUES" is invalid SQL
           if labels != []: query = 'VALUES ('+(', '.join(labels))+')'
           else: query = None # empty row: insert_select() will use DEFAULT VALUES
           return insert_select(db, table, cols, query, params, *args, **kw_args)
       def mk_update(db, table, changes=None, cond=None):
           '''Builds an UPDATE query.
           @param changes [(col, new_value),...]
               * container can be any iterable type
               * col: sql_gen.Code|str (for col name)
               * new_value: sql_gen.Code|literal value
           @param cond sql_gen.Code WHERE condition. e.g. use sql_gen.*Cond objects.
           @return str query
           '''
           def assignment(col, new_value):
               # One "col = value" entry of the SET clause
               return (sql_gen.to_name_only_col(col, table).to_str(db)+' = '
                   +sql_gen.as_Value(new_value).to_str(db))
           header = 'UPDATE '+sql_gen.as_Table(table).to_str(db)+'\nSET\n'
           assignments = [assignment(col, new_value) for col, new_value in changes]
           query = header+',\n'.join(assignments)
           if cond != None: query += '\nWHERE\n'+cond.to_str(db)
           return query
       def update(db, *args, **kw_args):
           '''Builds an UPDATE query with mk_update() and runs it with run_query().
           The recover keyword arg is passed to run_query().
           For other params, see mk_update()'''
           recover = kw_args.pop('recover', None)
           query = mk_update(db, *args, **kw_args)
           return run_query(db, query, [], recover)
       def last_insert_id(db):
           '''Returns the ID generated by the last insert on this connection.
           @return lastval() for psycopg2, the connection's insert_id() for MySQLdb,
               or None for any other module
           '''
           module = util.root_module(db.db)
           if module == 'psycopg2': return value(run_query(db, 'SELECT lastval()'))
           # insert_id() is a method of the underlying connection (db.db), not of
           # the DbConn wrapper, whose __getattr__ only provides `db`
           elif module == 'MySQLdb': return db.db.insert_id()
           else: return None
       def truncate(db, table, schema='public'):
           '''Runs TRUNCATE ... CASCADE on the schema-qualified table'''
           target = qual_name(db, schema, table)
           return run_query(db, 'TRUNCATE '+target+' CASCADE')
       def mk_flatten_mapping(db, into, cols, preserve=[], as_items=False):
           '''Creates a mapping from original column names (which may have collisions)
           to names that will be distinct among the columns' tables.
           This is meant to be used for several tables that are being joined together.
           @param cols The columns to combine. Duplicates will be removed.
           @param into The table for the new columns.
           @param preserve [sql_gen.Col...] Columns not to rename. Note that these
               columns will be included in the mapping even if they are not in cols.
               The tables of the provided Col objects will be changed to into, so make
               copies of them if you want to keep the original tables.
           @param as_items Whether to return a list of dict items instead of a dict
           @return dict(orig_col=new_col, ...)
               * orig_col: sql_gen.Col(orig_col_name, orig_table)
               * new_col: sql_gen.Col(orig_col_name, into)
               * All mappings use the into table so its name can easily be
                 changed for all columns at once
           '''
           unique_cols = lists.uniqify(cols)
           mapping = []
           # Preserved columns keep their names, but are moved to the into table
           for kept in preserve:
               mapping.append((copy.copy(kept), kept)) # copy keeps the orig table
               kept.table = into
           kept_set = set(preserve) # built after the tables were changed
           # Other columns are renamed to their string form
           mapping.extend([(col, sql_gen.Col(str(col), into)) for col in unique_cols
               if col not in kept_set])
           if as_items: return mapping
           return dict(mapping)
       def flatten(db, into, joins, cols, limit=None, start=None, **kw_args):
           '''Selects the columns from the joined tables into the into table, each
           under its mapped name.
           For params, see mk_flatten_mapping()
           @return See return value of mk_flatten_mapping()
           '''
           mapping_items = mk_flatten_mapping(db, into, cols, as_items=True,
               **kw_args)
           renamed = [sql_gen.NamedCol(new.name, old) for old, new in mapping_items]
           query, params = mk_select(db, joins, renamed, limit=limit, start=start)
           run_query_into(db, query, params, into=into)
           return dict(mapping_items)
       ##### Database structure queries
       def table_row_count(db, table, recover=None):
           '''Returns the value of a query selecting sql_gen.row_count from the
           table'''
           query, params = mk_select(db, table, [sql_gen.row_count], order_by=None,
               start=0)
           cur = run_query(db, query, params, recover=recover, log_level=3)
           return value(cur)
       def table_cols(db, table, recover=None):
           '''Returns the table's column names, taken from a SELECT with LIMIT 0'''
           cur = select(db, table, limit=0, order_by=None, recover=recover,
               log_level=4)
           return list(col_names(cur))
       def pkey(db, table, recover=None):
           '''Returns the table's pkey, which is assumed to be its first column.
           @param recover Passed through to table_cols()
           '''
           col_list = table_cols(db, table, recover)
           return col_list[0]
       # Column name that table_not_null_col() looks for
       not_null_col = 'not_null_col'
       def table_not_null_col(db, table, recover=None):
           '''Returns the name in not_null_col if the table has a column with that
           name; otherwise falls back to the table's pkey.
           '''
           if not_null_col not in table_cols(db, table, recover):
               return pkey(db, table, recover)
           return not_null_col
       def index_cols(db, table, index):
           '''Lists the columns used by an index, ordered by attnum.
           Can also use this for UNIQUE constraints, because a UNIQUE index is
           automatically created. When you don't know whether something is a UNIQUE
           constraint or a UNIQUE index, use this function.
           @param table Name of the table the index belongs to (matched on relname)
           @param index Name of the index (matched on relname)
           @return list of the attname values returned by the query
           @raise NotImplementedError if the DB driver module is not psycopg2
           '''
           module = util.root_module(db.db)
           if module == 'psycopg2':
               # The first half of the UNION picks up columns listed in indkey; the
               # second half picks up columns referenced inside index expressions,
               # by extracting the ":varattno N" entries from indexprs.
               # Each subquery must directly follow FROM: a stray character before
               # the opening parenthesis makes the statement invalid SQL.
               return list(values(run_query(db, '''\
       SELECT attname
       FROM
       (
               SELECT attnum, attname
               FROM pg_index
               JOIN pg_class index ON index.oid = indexrelid
               JOIN pg_class table_ ON table_.oid = indrelid
               JOIN pg_attribute ON attrelid = indrelid AND attnum = ANY (indkey)
               WHERE
                   table_.relname = %(table)s
                   AND index.relname = %(index)s
           UNION
               SELECT attnum, attname
               FROM
               (
                   SELECT
                       indrelid
                       , (regexp_matches(indexprs, E':varattno (\\\\d+)', 'g'))[1]::int
                           AS indkey
                   FROM pg_index
                   JOIN pg_class index ON index.oid = indexrelid
                   JOIN pg_class table_ ON table_.oid = indrelid
                   WHERE
                       table_.relname = %(table)s
                       AND index.relname = %(index)s
               ) s
               JOIN pg_attribute ON attrelid = indrelid AND attnum = indkey
       ) s
       ORDER BY attnum
       ''',
                   {'table': table, 'index': index}, cacheable=True, log_level=4)))
           else: raise NotImplementedError("Can't list index columns for "+module+
               ' database')
       def constraint_cols(db, table, constraint):
           '''Lists the columns of a named constraint on a table, ordered by attnum.
           @param table Name of the table (matched on relname)
           @param constraint Name of the constraint (matched on conname)
           @return list of the attname values returned by the query
           @raise NotImplementedError if the DB driver module is not psycopg2
           '''
           driver = util.root_module(db.db)
           if driver != 'psycopg2':
               raise NotImplementedError("Can't list constraint columns for "+driver+
                   ' database')
           query = '''\
       SELECT attname
       FROM pg_constraint
       JOIN pg_class ON pg_class.oid = conrelid
       JOIN pg_attribute ON attrelid = conrelid AND attnum = ANY (conkey)
       WHERE
           relname = %(table)s
           AND conname = %(constraint)s
       ORDER BY attnum
       '''
           query_params = {'table': table, 'constraint': constraint}
           return list(values(run_query(db, query, query_params)))
       # Name of the serial column that add_row_num() adds to a table
       row_num_col = '_row_num'
       def add_index(db, expr):
           '''Adds an index on a column or expression if it doesn't already exist.
           Currently, only function calls are supported as expressions.
           @param expr sql_gen.Col|sql_gen.FunctionCall The column to index, or a
               function call whose first argument is the column. Not modified.
           '''
           # Must be a deep copy: a shallow copy of a FunctionCall still shares its
           # args with the caller, so clearing col.table below would otherwise
           # strip the table from the caller's column
           expr = copy.deepcopy(expr) # don't modify input!
           # Extract col
           if isinstance(expr, sql_gen.FunctionCall):
               col = expr.args[0]
               expr = sql_gen.Expr(expr)
           else: col = expr
           assert sql_gen.is_table_col(col)
           # The index name is taken from the expression while it still includes
           # the table
           index = sql_gen.as_Table(str(expr))
           table = col.table
           col.table = None # the table is named in the ON clause instead
           try: run_query(db, 'CREATE INDEX '+index.to_str(db)+' ON '+table.to_str(db)
               +' ('+expr.to_str(db)+')', recover=True, cacheable=True, log_level=3)
           except DuplicateTableException: pass # index already existed
       def add_pkey(db, table, recover=None):
           '''Makes the first column in a table the primary key.
           The constraint is named after the table with a '_pkey' suffix. If that
           name is already taken, the next version of the name is tried until the
           constraint can be created.
           @param recover Passed through to pkey() when looking up the first column
           @pre The table must not already have a primary key.
           '''
           table = sql_gen.as_Table(table)
           index = sql_gen.as_Table(sql_gen.add_suffix(table.name, '_pkey'))
           col = sql_gen.to_name_only_col(pkey(db, table, recover))
           while True:
               try:
                   run_query(db, 'ALTER TABLE '+table.to_str(db)+' ADD CONSTRAINT '
                       +index.to_str(db)+' PRIMARY KEY('+col.to_str(db)+')',
                       recover=True, cacheable=True, log_level=3,
                       log_ignore_excs=(DuplicateTableException,))
                   break # constraint created
               except DuplicateTableException:
                   index.name = next_version(index.name)
                   # try again with next version of name
       def add_row_num(db, table):
           '''Adds a serial NOT NULL column named by row_num_col to a table and makes
           it the primary key.'''
           table_str = sql_gen.as_Table(table).to_str(db)
           alter_stmt = ('ALTER TABLE '+table_str+' ADD COLUMN '+row_num_col
               +' serial NOT NULL PRIMARY KEY')
           run_query(db, alter_stmt, log_level=3)
       def tables(db, schema_like='public', table_like='%'):
           '''Lists the names of the tables matching the given LIKE patterns.
           @param schema_like LIKE pattern for the schema (used only by the psycopg2
               query)
           @param table_like LIKE pattern for the table name
           @raise NotImplementedError if the DB driver module is neither psycopg2
               nor MySQLdb
           '''
           driver = util.root_module(db.db)
           like_params = {'schema_like': schema_like, 'table_like': table_like}
           if driver == 'MySQLdb':
               return values(run_query(db, 'SHOW TABLES LIKE %(table_like)s',
                   like_params, cacheable=True))
           if driver != 'psycopg2':
               raise NotImplementedError("Can't list tables for "+driver+' database')
           query = '''\
       SELECT tablename
       FROM pg_tables
       WHERE
           schemaname LIKE %(schema_like)s
           AND tablename LIKE %(table_like)s
       ORDER BY tablename
       '''
           return values(run_query(db, query, like_params, cacheable=True))
       ##### Database management
       def empty_db(db, schema='public', **kw_args):
           '''Truncates every table that tables() lists for the schema.
           For kw_args, see tables()'''
           for table_name in tables(db, schema, **kw_args):
               truncate(db, table_name, schema)
       ##### Heuristic queries
       def put(db, table, row, pkey_=None, row_ct_ref=None):
           '''Recovers from errors.
           Only works under PostgreSQL (uses INSERT RETURNING).
           '''
           row = sql_gen.ColDict(db, table, row)
           if pkey_ == None: pkey_ = pkey(db, table, recover=True)
           try:
               cur = insert(db, table, row, pkey_, recover=True)
               if row_ct_ref != None and cur.rowcount >= 0:
                   row_ct_ref[0] += cur.rowcount
               return value(cur)
           except DuplicateKeyException, e:
               row = sql_gen.ColDict(db, table,
                   util.dict_subset_right_join(row, e.cols))
               return value(select(db, table, [pkey_], row, recover=True))
       def get(db, table, row, pkey, row_ct_ref=None, create=False):
           '''Selects the pkey column of the first row matching the given row.
           Recovers from errors.
           @param create Whether to put() the row if no matching row is found
           @param row_ct_ref Passed through to put() when creating the row
           @raise StopIteration if no row matches and create is not set
           '''
           try:
               matches = select(db, table, [pkey], row, limit=1, recover=True)
               return value(matches)
           except StopIteration:
               if create: return put(db, table, row, pkey, row_ct_ref) # insert new row
               raise
       def is_func_result(col):
           '''Whether a column is named 'result' and its table's name contains a
           '(' character.'''
           table_name = col.table.name
           return '(' in table_name and col.name == 'result'
       def into_table_name(out_table, in_tables0, mapping, is_func):
           '''Builds the default name of the table that holds put_table()'s output.
           @param out_table The output table or function; str() of it starts the name
           @param in_tables0 The main input table; columns from it are written
               without their table
           @param mapping dict(out_col=in_col, ...); used only when is_func is set
           @param is_func If set, the name is out_table followed by a parenthesized
               argument list; otherwise it is out_table followed by '_pkeys'
           '''
           def in_col_str(in_col):
               in_col = sql_gen.remove_col_rename(in_col)
               if not isinstance(in_col, sql_gen.Col): return str(in_col)
               src_table = in_col.table
               if src_table == in_tables0:
                   in_col = sql_gen.to_name_only_col(in_col)
               elif is_func_result(in_col): in_col = src_table # omit col name
               return str(in_col)
           name = str(out_table)
           if not is_func: return name+'_pkeys'
           # A 'value' entry is shown on its own; otherwise list every entry as k=v
           try: value_in_col = mapping['value']
           except KeyError:
               args_str = ', '.join((str(out_col)+'='+in_col_str(in_col)
                   for out_col, in_col in mapping.iteritems()))
           else: args_str = in_col_str(value_in_col)
           return name+'('+args_str+')'
       def put_table(db, out_table, in_tables, mapping, row_ct_ref=None, into=None,
           default=None, is_func=False):
           '''Inserts the mapped input columns into out_table using INSERT ... SELECT,
           rerunning the insert with additional constraints after each handled
           error, and builds a table pairing each input pkey with an output pkey.
           Recovers from errors.
           Only works under PostgreSQL (uses INSERT RETURNING).
           @param in_tables The main input table to select from, followed by a list of
               tables to join with it using the main input table's pkey
           @param mapping dict(out_table_col=in_table_col, ...)
               * out_table_col: str (*not* sql_gen.Col)
               * in_table_col: sql_gen.Col|literal-value
           @param row_ct_ref Optional one-element list; its [0] is incremented by
               the rowcount of the last insert cursor when that is non-negative
           @param into The table to contain the output and input pkeys.
               Defaults to `out_table.name+'_pkeys'`.
           @param default The *output* column to use as the pkey for missing rows.
               If this output column does not exist in the mapping, uses None.
           @param is_func Whether out_table is the name of a SQL function, not a table
           @return sql_gen.Col Where the output pkeys are made available
           '''
           out_table = sql_gen.as_Table(out_table)
           def log_debug(msg): db.log_debug(msg, level=1.5)
           def col_ustr(str_):
               return strings.repr_no_u(sql_gen.remove_col_rename(str_))
           log_debug('********** New iteration **********')
           log_debug('Inserting these input columns into '+strings.as_tt(
               out_table.to_str(db))+':\n'+strings.as_table(mapping, ustr=col_ustr))
           # Create input joins from list of input tables
           in_tables_ = in_tables[:] # don't modify input!
           in_tables0 = in_tables_.pop(0) # first table is separate
           in_pkey = pkey(db, in_tables0, recover=True)
           in_pkey_col = sql_gen.as_Col(in_pkey, in_tables0)
           input_joins = [in_tables0]+[sql_gen.Join(v,
               {in_pkey: sql_gen.join_same_not_null}) for v in in_tables_]
           if into == None:
               into = into_table_name(out_table, in_tables0, mapping, is_func)
           into = sql_gen.as_Table(into)
           log_debug('Joining together input tables into temp table')
           # Place in new table for speed and so don't modify input if values edited
           in_table = sql_gen.Table('in')
           flatten_cols = filter(sql_gen.is_table_col, mapping.values())
           mapping = dicts.join(mapping, flatten(db, in_table, input_joins,
               flatten_cols, preserve=[in_pkey_col], start=0))
           # From here on, all input comes from the flattened table only
           input_joins = [in_table]
           db.log_debug('Temp table: '+strings.as_tt(in_table.to_str(db)), level=2)
           mapping = sql_gen.ColDict(db, out_table, mapping)
               # after applying dicts.join() because that returns a plain dict
           # Resolve default value column
           try: default = mapping[default]
           except KeyError:
               if default != None:
                   db.log_debug('Default value column '
                       +strings.as_tt(strings.repr_no_u(default))
                       +' does not exist in mapping, falling back to None', level=2.1)
                   default = None
           out_pkey = pkey(db, out_table, recover=True)
           out_pkey_col = sql_gen.as_Col(out_pkey, out_table)
           pkeys_names = [in_pkey, out_pkey]
           pkeys_cols = [in_pkey_col, out_pkey_col]
           # One-element lists are used as mutable cells so that the nested
           # functions below can update state shared with this function
           pkeys_table_exists_ref = [False]
           def insert_into_pkeys(joins, cols):
               # The first call creates the pkeys table; later calls append to it
               query, params = mk_select(db, joins, cols, order_by=None, start=0)
               if pkeys_table_exists_ref[0]:
                   insert_select(db, into, pkeys_names, query, params)
               else:
                   run_query_into(db, query, params, into=into)
                   pkeys_table_exists_ref[0] = True
           limit_ref = [None]
           conds = set()
           distinct_on = []
           def mk_main_select(joins, cols):
               # Uses the conds, distinct_on and limit accumulated so far
               return mk_select(db, joins, cols, conds, distinct_on,
                   limit=limit_ref[0], start=0)
           exc_strs = set()
           def log_exc(e):
               e_str = exc.str_(e, first_line_only=True)
               log_debug('Caught exception: '+e_str)
               assert e_str not in exc_strs # avoid infinite loops
               exc_strs.add(e_str)
           def remove_all_rows():
               log_debug('Returning NULL for all rows')
               limit_ref[0] = 0 # just create an empty pkeys table
           def ignore(in_col, value):
               in_col_str = strings.as_tt(repr(in_col))
               db.log_debug('Adding index on '+in_col_str+' to enable fast filtering',
                   level=2.5)
               add_index(db, in_col)
               log_debug('Ignoring rows with '+in_col_str+' = '
                   +strings.as_tt(repr(value)))
           def remove_rows(in_col, value):
               # Excludes rows with this value from all later main selects
               ignore(in_col, value)
               cond = (in_col, sql_gen.CompareCond(value, '!='))
               assert cond not in conds # avoid infinite loops
               conds.add(cond)
           def invalid2null(in_col, value):
               # Replaces this value with NULL in the flattened input table
               ignore(in_col, value)
               update(db, in_table, [(in_col, None)],
                   sql_gen.ColValueCond(in_col, value))
           def insert_pkeys_table(which):
               return sql_gen.Table(sql_gen.add_suffix(in_table.name,
                   '_insert_'+which+'_pkeys'))
           insert_out_pkeys = insert_pkeys_table('out')
           insert_in_pkeys = insert_pkeys_table('in')
           # Do inserts and selects
           join_cols = sql_gen.ColDict(db, out_table)
           while True:
               if limit_ref[0] == 0: # special case
                   log_debug('Creating an empty pkeys table')
                   cur = run_query_into(db, *mk_select(db, out_table, [out_pkey],
                       limit=limit_ref[0]), into=insert_out_pkeys)
                   break # don't do main case
               has_joins = join_cols != {}
               # Prepare to insert new rows
               insert_joins = input_joins[:] # don't modify original!
               insert_args = dict(recover=True, cacheable=False)
               if has_joins:
                   distinct_on = [v.to_Col() for v in join_cols.values()]
                   insert_joins.append(sql_gen.Join(out_table, join_cols,
                       sql_gen.filter_out))
               else:
                   insert_args.update(dict(returning=out_pkey, into=insert_out_pkeys))
               main_select = mk_main_select(insert_joins, mapping.values())[0]
               log_debug('Trying to insert new rows')
               try:
                   cur = insert_select(db, out_table, mapping.keys(), main_select,
                       **insert_args)
                   break # insert successful
               except DuplicateKeyException, e:
                   # Join on the duplicated columns so existing rows are filtered out
                   log_exc(e)
                   old_join_cols = join_cols.copy()
                   join_cols.update(util.dict_subset_right_join(mapping, e.cols))
                   log_debug('Ignoring existing rows, comparing on these columns:\n'
                       +strings.as_inline_table(join_cols, ustr=col_ustr))
                   assert join_cols != old_join_cols # avoid infinite loops
               except NullValueException, e:
                   # Skip input rows that are NULL in the column mapped to out_col
                   log_exc(e)
                   out_col, = e.cols
                   try: in_col = mapping[out_col]
                   except KeyError:
                       log_debug('Missing mapping for NOT NULL column '+out_col)
                       remove_all_rows()
                   else: remove_rows(in_col, None)
               except FunctionValueException, e:
                   # Set the offending value to NULL in every mapped input column
                   log_exc(e)
                   func_name = e.name
                   value = e.value
                   for out_col, in_col in mapping.iteritems():
                       invalid2null(sql_gen.unwrap_func_call(in_col, func_name), value)
               except MissingCastException, e:
                   # Wrap the input column in a call to the function named by e.type
                   log_exc(e)
                   out_col = e.col
                   mapping[out_col] = sql_gen.wrap_in_func(e.type, mapping[out_col])
               except DatabaseErrors, e:
                   # Any other database error: warn and give up on all rows
                   log_exc(e)
                   msg = 'No handler for exception: '+exc.str_(e)
                   warnings.warn(DbWarning(msg))
                   log_debug(msg)
                   remove_all_rows()
               # after exception handled, rerun loop with additional constraints
           if row_ct_ref != None and cur.rowcount >= 0:
               row_ct_ref[0] += cur.rowcount
           if has_joins:
               select_joins = input_joins+[sql_gen.Join(out_table, join_cols)]
               log_debug('Getting output table pkeys of existing/inserted rows')
               insert_into_pkeys(select_joins, pkeys_cols)
           else:
               # No join columns: pair input and output pkeys by row number instead
               add_row_num(db, insert_out_pkeys) # for joining with input pkeys
               log_debug('Getting input table pkeys of inserted rows')
               run_query_into(db, *mk_main_select(input_joins, [in_pkey]),
                   into=insert_in_pkeys)
               add_row_num(db, insert_in_pkeys) # for joining with output pkeys
               assert table_row_count(db, insert_out_pkeys) == table_row_count(db,
                   insert_in_pkeys)
               log_debug('Combining output and input pkeys in inserted order')
               pkey_joins = [insert_in_pkeys, sql_gen.Join(insert_out_pkeys,
                   {row_num_col: sql_gen.join_same_not_null})]
               insert_into_pkeys(pkey_joins, pkeys_names)
           db.log_debug('Adding pkey on pkeys table to enable fast joins', level=2.5)
           add_pkey(db, into)
           log_debug('Setting pkeys of missing rows to '+strings.as_tt(repr(default)))
           missing_rows_joins = input_joins+[sql_gen.Join(into,
               {in_pkey: sql_gen.join_same_not_null}, sql_gen.filter_out)]
               # must use join_same_not_null or query will take forever
           insert_into_pkeys(missing_rows_joins,
               [in_pkey_col, sql_gen.NamedCol(out_pkey, default)])
           # Every input row must now have an entry in the pkeys table
           assert table_row_count(db, into) == table_row_count(db, in_table)
           return sql_gen.Col(out_pkey, into)
       ##### Data cleanup
       def cleanup_table(db, table, cols):
           '''Trims whitespace from the given columns of every row in a table, and
           sets values that are then empty or equal to the literal \\N to NULL.
           Does nothing if no columns are given.
           @param cols Iterable of column names to clean up
           '''
           cols = [esc_name(db, name) for name in cols]
           # An UPDATE with no SET assignments is invalid SQL
           if not cols: return
           table = sql_gen.as_Table(table).to_str(db)
           run_query(db, 'UPDATE '+table+' SET\n'+(',\n'.join(('\n'+col
               +' = nullif(nullif(trim(both from '+col+"), %(null0)s), %(null1)s)"
                   for col in cols))),
               dict(null0='', null1=r'\N'))

(24-24/36)

Project

General

Profile