# Database access
import copy
import re
import warnings

import dicts
import exc
import iters
import lists
import rand
import sql_gen
import strings
import util
from Proxy import Proxy
##### Exceptions

def get_cur_query(cur, input_query=None):
    '''Returns the text of the query most recently run on a cursor.
    
    Tries the driver-specific attributes that record the executed query
    (psycopg2 cursors expose `query`; MySQLdb exposes `_last_executed`).
    Falls back to the caller-supplied input_query, marked as '[input]'.
    '''
    raw_query = None
    if hasattr(cur, 'query'): raw_query = cur.query
    elif hasattr(cur, '_last_executed'): raw_query = cur._last_executed
    
    if raw_query is not None: return raw_query
    else: return '[input] '+strings.ustr(input_query)
def _add_cursor_info(e, *args, **kw_args):
    '''Annotates an exception with the query that caused it.
    For params, see get_cur_query().
    '''
    exc.add_msg(e, 'query: '+strings.ustr(get_cur_query(*args, **kw_args)))
class DbException(exc.ExceptionWithCause):
    '''Base class for database errors; optionally records the failed query.'''
    def __init__(self, msg, cause=None, cur=None):
        exc.ExceptionWithCause.__init__(self, msg, cause, cause_newline=True)
        # attach the query text when a cursor is available
        if cur is not None: _add_cursor_info(self, cur)
class ExceptionWithName(DbException):
    '''DbException annotated with the name of the offending object.'''
    def __init__(self, name, cause=None):
        DbException.__init__(self, 'for name: '+strings.as_tt(str(name)), cause)
        self.name = name
class ExceptionWithValue(DbException):
    '''DbException annotated with the value that caused the error.'''
    def __init__(self, value, cause=None):
        DbException.__init__(self, 'for value: '+strings.as_tt(repr(value)),
            cause)
        self.value = value
class ExceptionWithNameType(DbException):
    '''DbException annotated with an object's type and name.'''
    def __init__(self, type_, name, cause=None):
        DbException.__init__(self, 'for type: '+strings.as_tt(str(type_))
            +'; name: '+strings.as_tt(name), cause)
        self.type = type_
        self.name = name
class ConstraintException(DbException):
    '''Raised when a database constraint is violated.'''
    def __init__(self, name, cols, cause=None):
        DbException.__init__(self, 'Violated '+strings.as_tt(name)
            +' constraint on columns: '+strings.as_tt(', '.join(cols)), cause)
        self.name = name
        self.cols = cols
class MissingCastException(DbException):
    '''Raised when a value must be cast to a column's type.'''
    def __init__(self, type_, col, cause=None):
        DbException.__init__(self, 'Missing cast to type '+strings.as_tt(type_)
            +' on column: '+strings.as_tt(col), cause)
        self.type = type_
        self.col = col
# Specific error categories, distinguished only by type

class NameException(DbException): pass

class DuplicateKeyException(ConstraintException): pass

class NullValueException(ConstraintException): pass

class InvalidValueException(ExceptionWithValue): pass

class DuplicateException(ExceptionWithNameType): pass

class EmptyRowException(DbException): pass
##### Warnings

class DbWarning(UserWarning):
    '''Category for warnings issued by this module (see warnings.warn()).'''
    pass
##### Result retrieval

def col_names(cur):
    '''Returns a generator of the column names in a cursor's result set.
    Uses the DB-API `description` attribute (first element of each entry).
    '''
    return (col[0] for col in cur.description)
def rows(cur): return iter(lambda: cur.fetchone(), None)
89

    
90
def consume_rows(cur):
91
    '''Used to fetch all rows so result will be cached'''
92
    iters.consume_iter(rows(cur))
93

    
94
def next_row(cur): return rows(cur).next()
95

    
96
def row(cur):
97
    row_ = next_row(cur)
98
    consume_rows(cur)
99
    return row_
100

    
101
def next_value(cur): return next_row(cur)[0]
102

    
103
def value(cur): return row(cur)[0]
104

    
105
def values(cur): return iters.func_iter(lambda: next_value(cur))
106

    
107
def value_or_none(cur):
108
    try: return value(cur)
109
    except StopIteration: return None
110

    
111
##### Escaping
112

    
113
def esc_name_by_module(module, name):
114
    if module == 'psycopg2' or module == None: quote = '"'
115
    elif module == 'MySQLdb': quote = '`'
116
    else: raise NotImplementedError("Can't escape name for "+module+' database')
117
    return sql_gen.esc_name(name, quote)
118

    
119
def esc_name_by_engine(engine, name, **kw_args):
120
    return esc_name_by_module(db_engines[engine][0], name, **kw_args)
121

    
122
def esc_name(db, name, **kw_args):
123
    return esc_name_by_module(util.root_module(db.db), name, **kw_args)
124

    
125
def qual_name(db, schema, table):
126
    def esc_name_(name): return esc_name(db, name)
127
    table = esc_name_(table)
128
    if schema != None: return esc_name_(schema)+'.'+table
129
    else: return table
130

    
131
##### Database connections

# Keys that may appear in a db_config dict
db_config_names = ['engine', 'host', 'user', 'password', 'database', 'schemas']

# engine name -> (DB-API module name, db_config key renamings for that module)
db_engines = {
    'MySQL': ('MySQLdb', {'password': 'passwd', 'database': 'db'}),
    'PostgreSQL': ('psycopg2', {}),
}

# All exception types that indicate a database error.
# DatabaseErrors is the tuple form for use in except clauses; it is rebuilt
# by _add_module() whenever a driver registers its DatabaseError.
DatabaseErrors_set = set([DbException])
DatabaseErrors = tuple(DatabaseErrors_set)
def _add_module(module):
144
    DatabaseErrors_set.add(module.DatabaseError)
145
    global DatabaseErrors
146
    DatabaseErrors = tuple(DatabaseErrors_set)
147

    
148
def db_config_str(db_config):
149
    return db_config['engine']+' database '+db_config['database']
150

    
151
log_debug_none = lambda msg, level=2: None
152

    
153
class DbConn:
154
    def __init__(self, db_config, autocommit=True, caching=True,
155
        log_debug=log_debug_none, debug_temp=False):
156
        '''
157
        @param debug_temp Whether temporary objects should instead be permanent.
158
            This assists in debugging the internal objects used by the program.
159
        '''
160
        self.db_config = db_config
161
        self.autocommit = autocommit
162
        self.caching = caching
163
        self.log_debug = log_debug
164
        self.debug = log_debug != log_debug_none
165
        self.debug_temp = debug_temp
166
        self.autoanalyze = False
167
        
168
        self._reset()
169
    
170
    def __getattr__(self, name):
171
        if name == '__dict__': raise Exception('getting __dict__')
172
        if name == 'db': return self._db()
173
        else: raise AttributeError()
174
    
175
    def __getstate__(self):
176
        state = copy.copy(self.__dict__) # shallow copy
177
        state['log_debug'] = None # don't pickle the debug callback
178
        state['_DbConn__db'] = None # don't pickle the connection
179
        return state
180
    
181
    def clear_cache(self): self.query_results = {}
182
    
183
    def _reset(self):
184
        self.clear_cache()
185
        self._savepoint = 0
186
        self._notices_seen = set()
187
        self.__db = None
188
    
189
    def connected(self): return self.__db != None
190
    
191
    def close(self):
192
        if not self.connected(): return
193
        
194
        self.db.close()
195
        self._reset()
196
    
197
    def _db(self):
198
        if self.__db == None:
199
            # Process db_config
200
            db_config = self.db_config.copy() # don't modify input!
201
            schemas = db_config.pop('schemas', None)
202
            module_name, mappings = db_engines[db_config.pop('engine')]
203
            module = __import__(module_name)
204
            _add_module(module)
205
            for orig, new in mappings.iteritems():
206
                try: util.rename_key(db_config, orig, new)
207
                except KeyError: pass
208
            
209
            # Connect
210
            self.__db = module.connect(**db_config)
211
            
212
            # Configure connection
213
            if hasattr(self.db, 'set_isolation_level'):
214
                import psycopg2.extensions
215
                self.db.set_isolation_level(
216
                    psycopg2.extensions.ISOLATION_LEVEL_READ_COMMITTED)
217
            if schemas != None:
218
                search_path = [self.esc_name(s) for s in schemas.split(',')]
219
                search_path.append(value(run_query(self, 'SHOW search_path',
220
                    log_level=4)))
221
                run_query(self, 'SET search_path TO '+(','.join(search_path)),
222
                    log_level=3)
223
        
224
        return self.__db
225
    
226
    class DbCursor(Proxy):
227
        def __init__(self, outer):
228
            Proxy.__init__(self, outer.db.cursor())
229
            self.outer = outer
230
            self.query_results = outer.query_results
231
            self.query_lookup = None
232
            self.result = []
233
        
234
        def execute(self, query):
235
            self._is_insert = query.startswith('INSERT')
236
            self.query_lookup = query
237
            try:
238
                try:
239
                    cur = self.inner.execute(query)
240
                    self.outer.do_autocommit()
241
                finally: self.query = get_cur_query(self.inner, query)
242
            except Exception, e:
243
                _add_cursor_info(e, self, query)
244
                self.result = e # cache the exception as the result
245
                self._cache_result()
246
                raise
247
            
248
            # Always cache certain queries
249
            if query.startswith('CREATE') or query.startswith('ALTER'):
250
                # structural changes
251
                # Rest of query must be unique in the face of name collisions,
252
                # so don't cache ADD COLUMN unless it has distinguishing comment
253
                if query.find('ADD COLUMN') < 0 or query.endswith('*/'):
254
                    self._cache_result()
255
            elif self.rowcount == 0 and query.startswith('SELECT'): # empty
256
                consume_rows(self) # fetch all rows so result will be cached
257
            
258
            return cur
259
        
260
        def fetchone(self):
261
            row = self.inner.fetchone()
262
            if row != None: self.result.append(row)
263
            # otherwise, fetched all rows
264
            else: self._cache_result()
265
            return row
266
        
267
        def _cache_result(self):
268
            # For inserts that return a result set, don't cache result set since
269
            # inserts are not idempotent. Other non-SELECT queries don't have
270
            # their result set read, so only exceptions will be cached (an
271
            # invalid query will always be invalid).
272
            if self.query_results != None and (not self._is_insert
273
                or isinstance(self.result, Exception)):
274
                
275
                assert self.query_lookup != None
276
                self.query_results[self.query_lookup] = self.CacheCursor(
277
                    util.dict_subset(dicts.AttrsDictView(self),
278
                    ['query', 'result', 'rowcount', 'description']))
279
        
280
        class CacheCursor:
281
            def __init__(self, cached_result): self.__dict__ = cached_result
282
            
283
            def execute(self, *args, **kw_args):
284
                if isinstance(self.result, Exception): raise self.result
285
                # otherwise, result is a rows list
286
                self.iter = iter(self.result)
287
            
288
            def fetchone(self):
289
                try: return self.iter.next()
290
                except StopIteration: return None
291
    
292
    def esc_value(self, value):
293
        try: str_ = self.mogrify('%s', [value])
294
        except NotImplementedError, e:
295
            module = util.root_module(self.db)
296
            if module == 'MySQLdb':
297
                import _mysql
298
                str_ = _mysql.escape_string(value)
299
            else: raise e
300
        return strings.to_unicode(str_)
301
    
302
    def esc_name(self, name): return esc_name(self, name) # calls global func
303
    
304
    def std_code(self, str_):
305
        '''Standardizes SQL code.
306
        * Ensures that string literals are prefixed by `E`
307
        '''
308
        if str_.startswith("'"): str_ = 'E'+str_
309
        return str_
310
    
311
    def can_mogrify(self):
312
        module = util.root_module(self.db)
313
        return module == 'psycopg2'
314
    
315
    def mogrify(self, query, params=None):
316
        if self.can_mogrify(): return self.db.cursor().mogrify(query, params)
317
        else: raise NotImplementedError("Can't mogrify query")
318
    
319
    def print_notices(self):
320
        if hasattr(self.db, 'notices'):
321
            for msg in self.db.notices:
322
                if msg not in self._notices_seen:
323
                    self._notices_seen.add(msg)
324
                    self.log_debug(msg, level=2)
325
    
326
    def run_query(self, query, cacheable=False, log_level=2,
327
        debug_msg_ref=None):
328
        '''
329
        @param log_ignore_excs The log_level will be increased by 2 if the query
330
            throws one of these exceptions.
331
        @param debug_msg_ref If specified, the log message will be returned in
332
            this instead of being output. This allows you to filter log messages
333
            depending on the result of the query.
334
        '''
335
        assert query != None
336
        
337
        if not self.caching: cacheable = False
338
        used_cache = False
339
        
340
        def log_msg(query):
341
            if used_cache: cache_status = 'cache hit'
342
            elif cacheable: cache_status = 'cache miss'
343
            else: cache_status = 'non-cacheable'
344
            return 'DB query: '+cache_status+':\n'+strings.as_code(query, 'SQL')
345
        
346
        try:
347
            # Get cursor
348
            if cacheable:
349
                try:
350
                    cur = self.query_results[query]
351
                    used_cache = True
352
                except KeyError: cur = self.DbCursor(self)
353
            else: cur = self.db.cursor()
354
            
355
            # Log query
356
            if self.debug and debug_msg_ref == None: # log before running
357
                self.log_debug(log_msg(query), log_level)
358
            
359
            # Run query
360
            cur.execute(query)
361
        finally:
362
            self.print_notices()
363
            if self.debug and debug_msg_ref != None: # return after running
364
                debug_msg_ref[0] = log_msg(str(get_cur_query(cur, query)))
365
        
366
        return cur
367
    
368
    def is_cached(self, query): return query in self.query_results
369
    
370
    def with_autocommit(self, func):
371
        import psycopg2.extensions
372
        
373
        prev_isolation_level = self.db.isolation_level
374
        self.db.set_isolation_level(
375
            psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
376
        try: return func()
377
        finally: self.db.set_isolation_level(prev_isolation_level)
378
    
379
    def with_savepoint(self, func):
380
        savepoint = 'level_'+str(self._savepoint)
381
        self.run_query('SAVEPOINT '+savepoint, log_level=4)
382
        self._savepoint += 1
383
        try: return func()
384
        except:
385
            self.run_query('ROLLBACK TO SAVEPOINT '+savepoint, log_level=4)
386
            raise
387
        finally:
388
            # Always release savepoint, because after ROLLBACK TO SAVEPOINT,
389
            # "The savepoint remains valid and can be rolled back to again"
390
            # (http://www.postgresql.org/docs/8.3/static/sql-rollback-to.html).
391
            self.run_query('RELEASE SAVEPOINT '+savepoint, log_level=4)
392
            
393
            self._savepoint -= 1
394
            assert self._savepoint >= 0
395
            
396
            self.do_autocommit() # OK to do this after ROLLBACK TO SAVEPOINT
397
    
398
    def do_autocommit(self):
399
        '''Autocommits if outside savepoint'''
400
        assert self._savepoint >= 0
401
        if self.autocommit and self._savepoint == 0:
402
            self.log_debug('Autocommitting', level=4)
403
            self.db.commit()
404
    
405
    def col_info(self, col):
406
        table = sql_gen.Table('columns', 'information_schema')
407
        type_ = sql_gen.Coalesce(sql_gen.Nullif(sql_gen.Col('data_type'),
408
            'USER-DEFINED'), sql_gen.Col('udt_name'))
409
        cols = [type_, 'column_default',
410
            sql_gen.Cast('boolean', sql_gen.Col('is_nullable'))]
411
        
412
        conds = [('table_name', col.table.name), ('column_name', col.name)]
413
        schema = col.table.schema
414
        if schema != None: conds.append(('table_schema', schema))
415
        
416
        type_, default, nullable = row(select(self, table, cols, conds,
417
            order_by='table_schema', limit=1, cacheable=False, log_level=4))
418
            # TODO: order_by search_path schema order
419
        default = sql_gen.as_Code(default, self)
420
        
421
        return sql_gen.TypedCol(col.name, type_, default, nullable)
422
    
423
    def TempFunction(self, name):
424
        if self.debug_temp: schema = None
425
        else: schema = 'pg_temp'
426
        return sql_gen.Function(name, schema)
427

    
connect = DbConn # alias: the module's connection factory
##### Recoverable querying

def with_savepoint(db, func):
    '''Runs func() inside a savepoint on db (see DbConn.with_savepoint())'''
    return db.with_savepoint(func)
def run_query(db, query, recover=None, cacheable=False, log_level=2,
435
    log_ignore_excs=None, **kw_args):
436
    '''For params, see DbConn.run_query()'''
437
    if recover == None: recover = False
438
    if log_ignore_excs == None: log_ignore_excs = ()
439
    log_ignore_excs = tuple(log_ignore_excs)
440
    
441
    debug_msg_ref = None # usually, db.run_query() logs query before running it
442
    # But if filtering with log_ignore_excs, wait until after exception parsing
443
    if log_ignore_excs != () or not db.can_mogrify(): debug_msg_ref = [None]
444
    
445
    try:
446
        try:
447
            def run(): return db.run_query(query, cacheable, log_level,
448
                debug_msg_ref, **kw_args)
449
            if recover and not db.is_cached(query):
450
                return with_savepoint(db, run)
451
            else: return run() # don't need savepoint if cached
452
        except Exception, e:
453
            msg = strings.ustr(e.args[0])
454
            
455
            match = re.match(r'^duplicate key value violates unique constraint '
456
                r'"((_?[^\W_]+(?=[._]))?.+?)"', msg)
457
            if match:
458
                constraint, table = match.groups()
459
                cols = []
460
                if recover: # need auto-rollback to run index_cols()
461
                    try: cols = index_cols(db, table, constraint)
462
                    except NotImplementedError: pass
463
                raise DuplicateKeyException(constraint, cols, e)
464
            
465
            match = re.match(r'^null value in column "(.+?)" violates not-null'
466
                r' constraint', msg)
467
            if match: raise NullValueException('NOT NULL', [match.group(1)], e)
468
            
469
            match = re.match(r'^(?:invalid input (?:syntax|value)\b.*?'
470
                r'|.+? field value out of range): "(.+?)"', msg)
471
            if match:
472
                value, = match.groups()
473
                raise InvalidValueException(strings.to_unicode(value), e)
474
            
475
            match = re.match(r'^column "(.+?)" is of type (.+?) but expression '
476
                r'is of type', msg)
477
            if match:
478
                col, type_ = match.groups()
479
                raise MissingCastException(type_, col, e)
480
            
481
            match = re.match(r'^(\S+) "(.+?)".*? already exists', msg)
482
            if match:
483
                type_, name = match.groups()
484
                raise DuplicateException(type_, name, e)
485
            
486
            raise # no specific exception raised
487
    except log_ignore_excs:
488
        log_level += 2
489
        raise
490
    finally:
491
        if debug_msg_ref != None and debug_msg_ref[0] != None:
492
            db.log_debug(debug_msg_ref[0], log_level)
493

    
494
##### Basic queries
495

    
496
def next_version(name):
497
    version = 1 # first existing name was version 0
498
    match = re.match(r'^(.*)#(\d+)$', name)
499
    if match:
500
        name, version = match.groups()
501
        version = int(version)+1
502
    return sql_gen.concat(name, '#'+str(version))
503

    
504
def lock_table(db, table, mode):
505
    table = sql_gen.as_Table(table)
506
    run_query(db, 'LOCK TABLE '+table.to_str(db)+' IN '+mode+' MODE')
507

    
508
def run_query_into(db, query, into=None, add_indexes_=False, **kw_args):
509
    '''Outputs a query to a temp table.
510
    For params, see run_query().
511
    '''
512
    if into == None: return run_query(db, query, **kw_args)
513
    
514
    assert isinstance(into, sql_gen.Table)
515
    
516
    into.is_temp = True
517
    # "temporary tables cannot specify a schema name", so remove schema
518
    into.schema = None
519
    
520
    kw_args['recover'] = True
521
    kw_args.setdefault('log_ignore_excs', (DuplicateException,))
522
    
523
    temp = not db.debug_temp # tables are permanent in debug_temp mode
524
    
525
    # Create table
526
    while True:
527
        create_query = 'CREATE'
528
        if temp: create_query += ' TEMP'
529
        create_query += ' TABLE '+into.to_str(db)+' AS\n'+query
530
        
531
        try:
532
            cur = run_query(db, create_query, **kw_args)
533
                # CREATE TABLE AS sets rowcount to # rows in query
534
            break
535
        except DuplicateException, e:
536
            into.name = next_version(into.name)
537
            # try again with next version of name
538
    
539
    if add_indexes_: add_indexes(db, into)
540
    
541
    # According to the PostgreSQL doc, "The autovacuum daemon cannot access and
542
    # therefore cannot vacuum or analyze temporary tables. [...] if a temporary
543
    # table is going to be used in complex queries, it is wise to run ANALYZE on
544
    # the temporary table after it is populated."
545
    # (http://www.postgresql.org/docs/9.1/static/sql-createtable.html)
546
    # If into is not a temp table, ANALYZE is useful but not required.
547
    analyze(db, into)
548
    
549
    return cur
550

    
551
order_by_pkey = object() # tells mk_select() to order by the pkey
552

    
553
distinct_on_all = object() # tells mk_select() to SELECT DISTINCT ON all columns
554

    
555
def mk_select(db, tables, fields=None, conds=None, distinct_on=[], limit=None,
556
    start=None, order_by=order_by_pkey, default_table=None):
557
    '''
558
    @param tables The single table to select from, or a list of tables to join
559
        together, with tables after the first being sql_gen.Join objects
560
    @param fields Use None to select all fields in the table
561
    @param conds WHERE conditions: [(compare_left_side, compare_right_side),...]
562
        * container can be any iterable type
563
        * compare_left_side: sql_gen.Code|str (for col name)
564
        * compare_right_side: sql_gen.ValueCond|literal value
565
    @param distinct_on The columns to SELECT DISTINCT ON, or distinct_on_all to
566
        use all columns
567
    @return query
568
    '''
569
    # Parse tables param
570
    tables = lists.mk_seq(tables)
571
    tables = list(tables) # don't modify input! (list() copies input)
572
    table0 = sql_gen.as_Table(tables.pop(0)) # first table is separate
573
    
574
    # Parse other params
575
    if conds == None: conds = []
576
    elif dicts.is_dict(conds): conds = conds.items()
577
    conds = list(conds) # don't modify input! (list() copies input)
578
    assert limit == None or type(limit) == int
579
    assert start == None or type(start) == int
580
    if order_by is order_by_pkey:
581
        if distinct_on != []: order_by = None
582
        else: order_by = pkey(db, table0, recover=True)
583
    
584
    query = 'SELECT'
585
    
586
    def parse_col(col): return sql_gen.as_Col(col, default_table).to_str(db)
587
    
588
    # DISTINCT ON columns
589
    if distinct_on != []:
590
        query += '\nDISTINCT'
591
        if distinct_on is not distinct_on_all:
592
            query += ' ON ('+(', '.join(map(parse_col, distinct_on)))+')'
593
    
594
    # Columns
595
    if fields == None:
596
        if query.find('\n') >= 0: whitespace = '\n'
597
        else: whitespace = ' '
598
        query += whitespace+'*'
599
    else:
600
        assert fields != []
601
        query += '\n'+('\n, '.join(map(parse_col, fields)))
602
    
603
    # Main table
604
    query += '\nFROM '+table0.to_str(db)
605
    
606
    # Add joins
607
    left_table = table0
608
    for join_ in tables:
609
        table = join_.table
610
        
611
        # Parse special values
612
        if join_.type_ is sql_gen.filter_out: # filter no match
613
            conds.append((sql_gen.Col(table_not_null_col(db, table), table),
614
                sql_gen.CompareCond(None, '~=')))
615
        
616
        query += '\n'+join_.to_str(db, left_table)
617
        
618
        left_table = table
619
    
620
    missing = True
621
    if conds != []:
622
        if len(conds) == 1: whitespace = ' '
623
        else: whitespace = '\n'
624
        query += '\n'+sql_gen.combine_conds([sql_gen.ColValueCond(l, r)
625
            .to_str(db) for l, r in conds], 'WHERE')
626
        missing = False
627
    if order_by != None:
628
        query += '\nORDER BY '+sql_gen.as_Col(order_by, table0).to_str(db)
629
    if limit != None: query += '\nLIMIT '+str(limit); missing = False
630
    if start != None:
631
        if start != 0: query += '\nOFFSET '+str(start)
632
        missing = False
633
    if missing: warnings.warn(DbWarning(
634
        'SELECT statement missing a WHERE, LIMIT, or OFFSET clause: '+query))
635
    
636
    return query
637

    
638
def select(db, *args, **kw_args):
639
    '''For params, see mk_select() and run_query()'''
640
    recover = kw_args.pop('recover', None)
641
    cacheable = kw_args.pop('cacheable', True)
642
    log_level = kw_args.pop('log_level', 2)
643
    
644
    return run_query(db, mk_select(db, *args, **kw_args), recover, cacheable,
645
        log_level=log_level)
646

    
647
def mk_insert_select(db, table, cols=None, select_query=None, returning=None,
648
    embeddable=False, ignore=False):
649
    '''
650
    @param returning str|None An inserted column (such as pkey) to return
651
    @param embeddable Whether the query should be embeddable as a nested SELECT.
652
        Warning: If you set this and cacheable=True when the query is run, the
653
        query will be fully cached, not just if it raises an exception.
654
    @param ignore Whether to ignore duplicate keys.
655
    '''
656
    table = sql_gen.remove_table_rename(sql_gen.as_Table(table))
657
    if cols == []: cols = None # no cols (all defaults) = unknown col names
658
    if cols != None: cols = [sql_gen.to_name_only_col(c, table) for c in cols]
659
    if select_query == None: select_query = 'DEFAULT VALUES'
660
    if returning != None: returning = sql_gen.as_Col(returning, table)
661
    
662
    first_line = 'INSERT INTO '+table.to_str(db)
663
    
664
    def mk_insert(select_query):
665
        query = first_line
666
        if cols != None:
667
            query += '\n('+(', '.join((c.to_str(db) for c in cols)))+')'
668
        query += '\n'+select_query
669
        
670
        if returning != None:
671
            returning_name_col = sql_gen.to_name_only_col(returning)
672
            query += '\nRETURNING '+returning_name_col.to_str(db)
673
        
674
        return query
675
    
676
    return_type = 'unknown'
677
    if returning != None: return_type = returning.to_str(db)+'%TYPE'
678
    
679
    lang = 'sql'
680
    if ignore:
681
        # Always return something to set the correct rowcount
682
        if returning == None: returning = sql_gen.NamedCol('NULL', None)
683
        
684
        embeddable = True # must use function
685
        lang = 'plpgsql'
686
        
687
        if cols == None:
688
            row = [sql_gen.Col(sql_gen.all_cols, 'row')]
689
            row_vars = [sql_gen.Table('row')]
690
        else:
691
            row_vars = row = [sql_gen.Col(c.name, 'row') for c in cols]
692
        
693
        query = '''\
694
DECLARE
695
    row '''+table.to_str(db)+'''%ROWTYPE;
696
BEGIN
697
    /* Need an EXCEPTION block for each individual row because "When an error is
698
    caught by an EXCEPTION clause, [...] all changes to persistent database
699
    state within the block are rolled back."
700
    This is unfortunate because "A block containing an EXCEPTION clause is
701
    significantly more expensive to enter and exit than a block without one."
702
    (http://www.postgresql.org/docs/8.3/static/plpgsql-control-structures.html\
703
#PLPGSQL-ERROR-TRAPPING)
704
    */
705
    FOR '''+(', '.join((v.to_str(db) for v in row_vars)))+''' IN
706
'''+select_query+'''
707
    LOOP
708
        BEGIN
709
            RETURN QUERY
710
'''+mk_insert(sql_gen.Values(row).to_str(db))+'''
711
;
712
        EXCEPTION
713
            WHEN unique_violation THEN NULL; -- continue to next row
714
        END;
715
    END LOOP;
716
END;\
717
'''
718
    else: query = mk_insert(select_query)
719
    
720
    if embeddable:
721
        # Create function
722
        function_name = sql_gen.clean_name(first_line)
723
        while True:
724
            try:
725
                function = db.TempFunction(function_name)
726
                
727
                function_query = '''\
728
CREATE FUNCTION '''+function.to_str(db)+'''()
729
RETURNS SETOF '''+return_type+'''
730
LANGUAGE '''+lang+'''
731
AS $$
732
'''+query+'''
733
$$;
734
'''
735
                run_query(db, function_query, recover=True, cacheable=True,
736
                    log_ignore_excs=(DuplicateException,))
737
                break # this version was successful
738
            except DuplicateException, e:
739
                function_name = next_version(function_name)
740
                # try again with next version of name
741
        
742
        # Return query that uses function
743
        cols = None
744
        if returning != None: cols = [returning]
745
        func_table = sql_gen.NamedTable('f', sql_gen.FunctionCall(function),
746
            cols) # AS clause requires function alias
747
        return mk_select(db, func_table, start=0, order_by=None)
748
    
749
    return query
750

    
751
def insert_select(db, table, *args, **kw_args):
752
    '''For params, see mk_insert_select() and run_query_into()
753
    @param into sql_gen.Table with suggested name of temp table to put RETURNING
754
        values in
755
    '''
756
    into = kw_args.pop('into', None)
757
    if into != None: kw_args['embeddable'] = True
758
    recover = kw_args.pop('recover', None)
759
    if kw_args.get('ignore', False): recover = True
760
    cacheable = kw_args.pop('cacheable', True)
761
    log_level = kw_args.pop('log_level', 2)
762
    
763
    cur = run_query_into(db, mk_insert_select(db, table, *args, **kw_args),
764
        into, recover=recover, cacheable=cacheable, log_level=log_level)
765
    autoanalyze(db, table)
766
    return cur
767

    
768
default = sql_gen.default # tells insert() to use the default value for a column
769

    
770
def insert(db, table, row, *args, **kw_args):
771
    '''For params, see insert_select()'''
772
    if lists.is_seq(row): cols = None
773
    else:
774
        cols = row.keys()
775
        row = row.values()
776
    row = list(row) # ensure that "== []" works
777
    
778
    if row == []: query = None
779
    else: query = sql_gen.Values(row).to_str(db)
780
    
781
    return insert_select(db, table, cols, query, *args, **kw_args)
782

    
783
def mk_update(db, table, changes=None, cond=None, in_place=False):
    '''Generates an UPDATE query (or an equivalent in-place ALTER TABLE).
    @param changes [(col, new_value),...]
        * container can be any iterable type
        * col: sql_gen.Code|str (for col name)
        * new_value: sql_gen.Code|literal value
    @param cond sql_gen.Code WHERE condition. e.g. use sql_gen.*Cond objects.
    @param in_place If set, locks the table and updates rows in place.
        This avoids creating dead rows in PostgreSQL.
        * cond must be None
    @return str query
    '''
    table = sql_gen.as_Table(table)
    changes = [(sql_gen.to_name_only_col(col, table), sql_gen.as_Value(value))
        for col, value in changes]
    
    if in_place:
        assert cond == None
        
        # Rewriting each column via ALTER COLUMN ... TYPE ... USING updates
        # rows in place instead of leaving dead row versions behind
        alters = ('ALTER COLUMN '+col.to_str(db)+' TYPE '
            +db.col_info(sql_gen.with_default_table(col, table)).type
            +'\nUSING '+value.to_str(db) for col, value in changes)
        query = 'ALTER TABLE '+table.to_str(db)+'\n'+',\n'.join(alters)
    else:
        assignments = (col.to_str(db)+' = '+value.to_str(db)
            for col, value in changes)
        query = 'UPDATE '+table.to_str(db)+'\nSET\n'+',\n'.join(assignments)
        if cond != None: query += '\nWHERE\n'+cond.to_str(db)
    
    return query
813

    
814
def update(db, table, *args, **kw_args):
    '''Runs mk_update(). For params, see mk_update() and run_query().'''
    recover = kw_args.pop('recover', None)
    cacheable = kw_args.pop('cacheable', False)
    log_level = kw_args.pop('log_level', 2)
    
    query = mk_update(db, table, *args, **kw_args)
    cur = run_query(db, query, recover, cacheable, log_level=log_level)
    autoanalyze(db, table) # row contents changed, so refresh planner stats
    return cur
824

    
825
def last_insert_id(db):
    '''Returns the last sequence-generated id, or None if the driver has no
    known way to retrieve it.'''
    module = util.root_module(db.db)
    if module == 'psycopg2': return value(run_query(db, 'SELECT lastval()'))
    if module == 'MySQLdb': return db.insert_id()
    return None
830

    
831
def mk_flatten_mapping(db, into, cols, preserve=[], as_items=False):
    '''Creates a mapping from original column names (which may have collisions)
    to names that will be distinct among the columns' tables.
    This is meant to be used for several tables that are being joined together.
    @param cols The columns to combine. Duplicates will be removed.
    @param into The table for the new columns.
    @param preserve [sql_gen.Col...] Columns not to rename. Note that these
        columns will be included in the mapping even if they are not in cols.
        The tables of the provided Col objects will be changed to into, so make
        copies of them if you want to keep the original tables.
    @param as_items Whether to return a list of dict items instead of a dict
    @return dict(orig_col=new_col, ...)
        * orig_col: sql_gen.Col(orig_col_name, orig_table)
        * new_col: sql_gen.Col(orig_col_name, into)
        * All mappings use the into table so its name can easily be
          changed for all columns at once
    '''
    cols = lists.uniqify(cols)
    
    items = []
    for col in preserve: # preserved cols keep their names but move to into
        orig_col = copy.copy(col)
        col.table = into # note: deliberately modifies the caller's Col
        items.append((orig_col, col))
    preserve = set(preserve)
    items += [(col, sql_gen.Col(str(col), into, col.srcs))
        for col in cols if col not in preserve]
    
    if not as_items: items = dict(items)
    return items
862

    
863
def flatten(db, into, joins, cols, limit=None, start=None, **kw_args):
    '''Flattens the given joins into one table, renaming columns to avoid
    collisions. For params, see mk_flatten_mapping().
    @return See return value of mk_flatten_mapping()
    '''
    items = mk_flatten_mapping(db, into, cols, as_items=True, **kw_args)
    select_cols = [sql_gen.NamedCol(new.name, old) for old, new in items]
    query = mk_select(db, joins, select_cols, limit=limit, start=start)
    run_query_into(db, query, into=into, add_indexes_=True)
    return dict(items)
872

    
873
##### Database structure introspection
874

    
875
#### Tables
876

    
877
def tables(db, schema_like='public', table_like='%', exact=False):
    '''Lists the names of matching tables.
    @param exact If set, matches names exactly instead of with LIKE
    '''
    compare = '=' if exact else 'LIKE'
    
    module = util.root_module(db.db)
    if module == 'psycopg2':
        conds = [
            ('schemaname', sql_gen.CompareCond(schema_like, compare)),
            ('tablename', sql_gen.CompareCond(table_like, compare)),
        ]
        return values(select(db, 'pg_tables', ['tablename'], conds,
            order_by='tablename', log_level=4))
    if module == 'MySQLdb':
        # note: MySQL branch cannot filter by schema
        query = 'SHOW TABLES LIKE '+db.esc_value(table_like)
        return values(run_query(db, query, cacheable=True, log_level=4))
    raise NotImplementedError("Can't list tables for "+module+' database')
891

    
892
def table_exists(db, table):
    '''Whether the given table exists in the database.'''
    table = sql_gen.as_Table(table)
    matches = tables(db, table.schema, table.name, exact=True)
    return list(matches) != []
895

    
896
def table_row_count(db, table, recover=None):
    '''Returns the number of rows in the table.'''
    query = mk_select(db, table, [sql_gen.row_count], order_by=None, start=0)
    return value(run_query(db, query, recover=recover, log_level=3))
899

    
900
def table_cols(db, table, recover=None):
    '''Returns the table's column names, in table order.'''
    # limit=0 fetches no rows; only the cursor's column metadata is used
    cur = select(db, table, limit=0, order_by=None, recover=recover,
        log_level=4)
    return list(col_names(cur))
903

    
904
def pkey(db, table, recover=None):
    '''Returns the table's primary key, assumed to be its first column.'''
    return table_cols(db, table, recover)[0]
907

    
908
# column name that table_not_null_col() looks for in a table
not_null_col = 'not_null_col'
909

    
910
def table_not_null_col(db, table, recover=None):
    '''Name assumed to be the value of not_null_col. If not found, uses pkey.'''
    cols = table_cols(db, table, recover)
    if not_null_col in cols: return not_null_col
    return pkey(db, table, recover)
914

    
915
def index_cols(db, table, index):
    '''Returns the column names covered by an index, in index order.
    Can also use this for UNIQUE constraints, because a UNIQUE index is
    automatically created. When you don't know whether something is a UNIQUE
    constraint or a UNIQUE index, use this function.'''
    module = util.root_module(db.db)
    if module == 'psycopg2':
        # The UNION's first branch handles plain column indexes (attnum listed
        # directly in pg_index.indkey); the second branch extracts column
        # numbers from pg_index.indexprs (via the :varattno markers) for
        # expression indexes, then joins them back to pg_attribute.
        return list(values(run_query(db, '''\
SELECT attname
FROM
(
        SELECT attnum, attname
        FROM pg_index
        JOIN pg_class index ON index.oid = indexrelid
        JOIN pg_class table_ ON table_.oid = indrelid
        JOIN pg_attribute ON attrelid = indrelid AND attnum = ANY (indkey)
        WHERE
            table_.relname = '''+db.esc_value(table)+'''
            AND index.relname = '''+db.esc_value(index)+'''
    UNION
        SELECT attnum, attname
        FROM
        (
            SELECT
                indrelid
                , (regexp_matches(indexprs, E':varattno (\\\\d+)', 'g'))[1]::int
                    AS indkey
            FROM pg_index
            JOIN pg_class index ON index.oid = indexrelid
            JOIN pg_class table_ ON table_.oid = indrelid
            WHERE
                table_.relname = '''+db.esc_value(table)+'''
                AND index.relname = '''+db.esc_value(index)+'''
        ) s
        JOIN pg_attribute ON attrelid = indrelid AND attnum = indkey
) s
ORDER BY attnum
'''
            , cacheable=True, log_level=4)))
    else: raise NotImplementedError("Can't list index columns for "+module+
        ' database')
955

    
956
def constraint_cols(db, table, constraint):
    '''Returns the column names covered by a table constraint, in order.'''
    module = util.root_module(db.db)
    if module != 'psycopg2':
        raise NotImplementedError("Can't list constraint columns for "+module+
            ' database')
    
    query = '''\
SELECT attname
FROM pg_constraint
JOIN pg_class ON pg_class.oid = conrelid
JOIN pg_attribute ON attrelid = conrelid AND attnum = ANY (conkey)
WHERE
    relname = '''+db.esc_value(table)+'''
    AND conname = '''+db.esc_value(constraint)+'''
ORDER BY attnum
'''
    return list(values(run_query(db, query)))
972

    
973
#### Functions
974

    
975
def function_exists(db, function):
    '''Whether a directly-callable (non-trigger) function with this name
    exists.'''
    function = sql_gen.as_Function(function)
    
    info_table = sql_gen.Table('routines', 'information_schema')
    conds = [('routine_name', function.name)]
    if function.schema != None:
        conds.append(('routine_schema', function.schema))
    # Exclude trigger functions, since they cannot be called directly
    conds.append(('data_type', sql_gen.CompareCond('trigger', '!=')))
    
    matches = values(select(db, info_table, ['routine_name'], conds,
        order_by='routine_schema', limit=1, log_level=4))
        # TODO: order_by search_path schema order
    return list(matches) != []
988

    
989
##### Structural changes
990

    
991
#### Columns
992

    
993
def add_col(db, table, col, comment=None, **kw_args):
    '''Adds a column, renaming it on collisions.
    @param col TypedCol Name may be versioned, so be sure to propagate any
        renaming back to any source column for the TypedCol.
    @param comment None|str SQL comment used to distinguish columns of the same
        name from each other when they contain different data, to allow the
        ADD COLUMN query to be cached. If not set, query will not be cached.
    '''
    assert isinstance(col, sql_gen.TypedCol)
    
    while True:
        query = 'ALTER TABLE '+table.to_str(db)+' ADD COLUMN '+col.to_str(db)
        if comment != None: query += ' '+sql_gen.esc_comment(comment)
        
        try: run_query(db, query, recover=True, cacheable=True, **kw_args)
        except DuplicateException:
            col.name = next_version(col.name)
            continue # try again with next version of name
        return
1013

    
1014
def add_not_null(db, col):
    '''Adds a NOT NULL constraint to an existing column.'''
    table = col.table
    col = sql_gen.to_name_only_col(col)
    query = ('ALTER TABLE '+table.to_str(db)+' ALTER COLUMN '+col.to_str(db)
        +' SET NOT NULL')
    run_query(db, query, cacheable=True, log_level=3)
1019

    
1020
# name of the row-number column created by add_row_num()
row_num_col = '_row_num'
1021

    
1022
# column definition used by add_row_num(): an autoincrementing serial pkey
row_num_typed_col = sql_gen.TypedCol(row_num_col, 'serial', nullable=False,
    constraints='PRIMARY KEY')
1024

    
1025
def add_row_num(db, table):
    '''Adds a row number column to a table. Its name is in row_num_col. It will
    be the primary key.
    '''
    # Pass a copy: add_col() mutates col.name when the name collides
    # (DuplicateException), which would otherwise permanently rename the
    # shared module-level row_num_typed_col for all later calls.
    add_col(db, table, copy.copy(row_num_typed_col), log_level=3)
1029

    
1030
#### Indexes
1031

    
1032
def add_pkey(db, table, cols=None, recover=None):
    '''Adds a primary key.
    @param cols [sql_gen.Col,...] The columns in the primary key.
        Defaults to the first column in the table.
    @pre The table must not already have a primary key.
    '''
    table = sql_gen.as_Table(table)
    if cols == None: cols = [pkey(db, table, recover)]
    col_strs = [sql_gen.to_name_only_col(col).to_str(db) for col in cols]
    
    query = ('ALTER TABLE '+table.to_str(db)+' ADD PRIMARY KEY ('
        +', '.join(col_strs)+')')
    run_query(db, query, recover=True, cacheable=True, log_level=3,
        log_ignore_excs=(DuplicateException,))
1045

    
1046
def add_index(db, exprs, table=None, unique=False, ensure_not_null_=True):
    '''Adds an index on column(s) or expression(s) if it doesn't already exist.
    Currently, only function calls are supported as expressions.
    @param ensure_not_null_ If set, translates NULL values to sentinel values.
        This allows indexes to be used for comparisons where NULLs are equal.
    '''
    exprs = lists.mk_seq(exprs)
    
    # Parse exprs
    old_exprs = exprs[:]
    exprs = []
    cols = []
    for i, expr in enumerate(old_exprs):
        expr = sql_gen.as_Col(expr, table)
        
        # Handle nullable columns
        if ensure_not_null_:
            try: expr = ensure_not_null(db, expr)
            except KeyError: pass # unknown type, so just create plain index
        
        # Extract col
        expr = copy.deepcopy(expr) # don't modify input!
        if isinstance(expr, sql_gen.FunctionCall):
            # index the function's first argument's column, but keep the full
            # call as the indexed expression
            col = expr.args[0]
            expr = sql_gen.Expr(expr)
        else: col = expr
        assert isinstance(col, sql_gen.Col)
        
        # Extract table
        # note: once set here, table is reused for the remaining exprs
        if table == None:
            assert sql_gen.is_table_col(col)
            table = col.table
        
        # unqualify the col: the CREATE INDEX column list must not be
        # table-qualified
        col.table = None
        
        exprs.append(expr)
        cols.append(col)
    
    table = sql_gen.as_Table(table)
    # index name combines the col names; DuplicateException versioning below
    # handles any name collisions
    index = sql_gen.Table(str(sql_gen.Col(','.join(map(str, cols)), table)))
    
    # Add index
    while True:
        str_ = 'CREATE'
        if unique: str_ += ' UNIQUE'
        str_ += ' INDEX '+index.to_str(db)+' ON '+table.to_str(db)+' ('+(
            ', '.join((v.to_str(db) for v in exprs)))+')'
        
        try:
            run_query(db, str_, recover=True, cacheable=True, log_level=3,
                log_ignore_excs=(DuplicateException,))
            break
        except DuplicateException:
            index.name = next_version(index.name)
            # try again with next version of name
1101

    
1102
def add_index_col(db, col, suffix, expr, nullable=True):
    '''Adds, populates, and indexes a derived column holding expr's value for
    col, then registers it as col's index column.
    @param suffix str appended to col's name to form the new column's name
    @param nullable If False, also adds a NOT NULL constraint.
    '''
    if sql_gen.index_col(col) != None: return # already has index col
    
    new_col = sql_gen.suffixed_col(col, suffix)
    
    # Add column
    new_typed_col = sql_gen.TypedCol(new_col.name, db.col_info(col).type)
    add_col(db, col.table, new_typed_col, comment='src: '+repr(col),
        log_level=3)
    new_col.name = new_typed_col.name # propagate any renaming
    
    # Populate from expr; in_place avoids creating dead rows
    update(db, col.table, [(new_col, expr)], in_place=True, cacheable=True,
        log_level=3)
    if not nullable: add_not_null(db, new_col)
    add_index(db, new_col)
    
    # register the mapping so sql_gen.index_col() finds it next time
    col.table.index_cols[col.name] = new_col.name
1119

    
1120
# Controls when ensure_not_null() will use index columns
1121
not_null_index_cols_min_rows = 0 # rows; initially always use index columns
1122

    
1123
def ensure_not_null(db, col):
    '''For params, see sql_gen.ensure_not_null()'''
    expr = sql_gen.ensure_not_null(db, col)
    
    # If a nullable column in a temp table, add a separate index column
    # instead. Note that for small datasources, this adds 6-25% to the total
    # import time.
    use_index_col = (sql_gen.is_temp_col(col)
        and isinstance(expr, sql_gen.EnsureNotNull)
        and table_row_count(db, col.table) >= not_null_index_cols_min_rows)
    if use_index_col:
        add_index_col(db, col, '::NOT NULL', expr, nullable=False)
        expr = sql_gen.index_col(col)
    
    return expr
1135

    
1136
# sentinel for add_indexes()'s has_pkey param
already_indexed = object() # tells add_indexes() the pkey has already been added
1137

    
1138
def add_indexes(db, table, has_pkey=True):
    '''Adds an index on all columns in a table.
    @param has_pkey bool|already_indexed Whether a pkey instead of a regular
        index should be added on the first column.
        * If already_indexed, the pkey is assumed to have already been added
    '''
    index_cols_ = table_cols(db, table)
    if has_pkey:
        if has_pkey is not already_indexed: add_pkey(db, table)
        index_cols_ = index_cols_[1:] # pkey already covers the first column
    for col in index_cols_:
        add_index(db, col, table)
1149

    
1150
#### Tables
1151

    
1152
### Maintenance
1153

    
1154
def analyze(db, table):
    '''Refreshes the query planner's statistics for the table.'''
    query = 'ANALYZE '+sql_gen.as_Table(table).to_str(db)
    run_query(db, query, log_level=3)
1157

    
1158
def autoanalyze(db, table):
    '''Runs analyze(), but only if enabled on the connection.'''
    if not db.autoanalyze: return
    analyze(db, table)
1160

    
1161
def vacuum(db, table):
    '''VACUUM ANALYZEs the table. Runs in autocommit mode because VACUUM
    cannot run inside a transaction block.'''
    table = sql_gen.as_Table(table)
    def run(): run_query(db, 'VACUUM ANALYZE '+table.to_str(db), log_level=3)
    db.with_autocommit(run)
1165

    
1166
### Lifecycle
1167

    
1168
def drop_table(db, table):
    '''Drops the table if it exists, cascading to dependent objects.'''
    table = sql_gen.as_Table(table)
    query = 'DROP TABLE IF EXISTS '+table.to_str(db)+' CASCADE'
    return run_query(db, query)
1171

    
1172
def create_table(db, table, cols=[], has_pkey=True, col_indexes=True,
    like=None):
    '''Creates a table, renaming it on name collisions.
    @param cols [sql_gen.TypedCol,...] The column names and types
    @param has_pkey If set, the first column becomes the primary key.
    @param col_indexes bool|[ref]
        * If True, indexes will be added on all non-pkey columns.
        * If a list reference, [0] will be set to a function to do this.
          This can be used to delay index creation until the table is populated.
    @param like sql_gen.Table|None Existing table whose structure to copy.
    '''
    table = sql_gen.as_Table(table)
    
    # Copy cols before modifying it: the original code assigned cols[0] in
    # place, which mutated the caller's list (and the shared [] default).
    cols = list(cols)
    if like != None:
        cols.insert(0,
            sql_gen.CustomCode('LIKE '+like.to_str(db)+' INCLUDING ALL'))
    if has_pkey:
        # copy the TypedCol too, so the caller's object is not modified
        # (pkey_col, not pkey, to avoid shadowing the module-level pkey())
        cols[0] = pkey_col = copy.copy(cols[0])
        pkey_col.constraints = 'PRIMARY KEY'
    
    temp = table.is_temp and not db.debug_temp
        # temp tables permanent in debug_temp mode
    
    # Create table
    while True:
        str_ = 'CREATE'
        if temp: str_ += ' TEMP'
        str_ += ' TABLE '+table.to_str(db)+' (\n'
        str_ += '\n, '.join(c.to_str(db) for c in cols)
        str_ += '\n);\n'
        
        try:
            run_query(db, str_, cacheable=True, log_level=2,
                log_ignore_excs=(DuplicateException,))
            break
        except DuplicateException:
            table.name = next_version(table.name)
            # try again with next version of name
    
    # Add indexes
    if has_pkey: has_pkey = already_indexed
    def add_indexes_(): add_indexes(db, table, has_pkey)
    if isinstance(col_indexes, list): col_indexes[0] = add_indexes_ # defer
    elif col_indexes: add_indexes_() # add now
1215

    
1216
def copy_table_struct(db, src, dest):
    '''Creates a structure-only copy of a table. (Does not copy data.)
    @param src sql_gen.Table the table whose structure to copy
    @param dest sql_gen.Table the new table to create
    '''
    create_table(db, dest, has_pkey=False, col_indexes=False, like=src)
1219

    
1220
### Data
1221

    
1222
def truncate(db, table, schema='public', **kw_args):
    '''Empties a table, cascading to tables that reference it.
    For other params, see run_query().'''
    table = sql_gen.as_Table(table, schema)
    query = 'TRUNCATE '+table.to_str(db)+' CASCADE'
    return run_query(db, query, **kw_args)
1226

    
1227
def empty_temp(db, tables):
    '''Truncates the given temp table(s), unless temp debugging is on.'''
    if db.debug_temp: return # leave temp tables there for debugging
    for table in lists.mk_seq(tables):
        truncate(db, table, log_level=3)
1231

    
1232
def empty_db(db, schema='public', **kw_args):
    '''Truncates every table in the schema. For kw_args, see tables().'''
    for table in tables(db, schema, **kw_args):
        truncate(db, table, schema)
1235

    
1236
def distinct_table(db, table, distinct_on):
    '''Creates a copy of a temp table which is distinct on the given columns.
    The old and new tables will both get an index on these columns, to
    facilitate merge joins.
    @param distinct_on If empty, creates a table with one row. This is useful if
        your distinct_on columns are all literal values.
    @return The new table.
    '''
    new_table = sql_gen.suffixed_table(table, '_distinct')
    
    copy_table_struct(db, table, new_table)
    
    limit = None
    if distinct_on == []:
        limit = 1 # one sample row
    else:
        # the unique index enforces distinctness (duplicates are ignored on
        # insert); the plain index on the source is for join optimization
        add_index(db, distinct_on, new_table, unique=True)
        add_index(db, distinct_on, table)
    
    select_query = mk_select(db, table, start=0, limit=limit)
    insert_select(db, new_table, None, select_query, ignore=True)
    analyze(db, new_table)
    
    return new_table
(24-24/37)