/lib/sql.py - BIEN 3 - NCEAS Projects

root/lib/sql.py @ 2460

       # Database access
       import copy
       import operator
       import re
       import warnings
       import exc
       import dicts
       import iters
       import lists
       from Proxy import Proxy
       import rand
       import sql_gen
       import strings
       import util
       ##### Exceptions
       def get_cur_query(cur, input_query=None, input_params=None):
           '''Returns the query associated with a cursor, for use in messages.
           Uses cur.query if the cursor has that attribute, otherwise
           cur._last_executed. If no value is found this way, returns a description
           built from the caller-supplied query and params instead.
           @param cur The cursor to inspect
           @param input_query The query as provided by the caller
           @param input_params The query params as provided by the caller
           '''
           # Only the first attribute that exists is consulted, even if it is None
           for attr in ('query', '_last_executed'):
               if hasattr(cur, attr):
                   raw_query = getattr(cur, attr)
                   break
           else: raw_query = None
           if raw_query != None: return raw_query
           return '[input] '+strings.ustr(input_query)+' % '+repr(input_params)
       def _add_cursor_info(e, *args, **kw_args):
           '''Appends the query of a cursor to an exception's message.
           @param e The exception to annotate
           For the other params, see get_cur_query()
           '''
           query = get_cur_query(*args, **kw_args)
           exc.add_msg(e, 'query: '+str(query))
       class DbException(exc.ExceptionWithCause):
           '''Base class for the database exceptions defined in this module.'''
           def __init__(self, msg, cause=None, cur=None):
               '''
               @param msg The error message
               @param cause The underlying exception, if any
               @param cur If provided, a cursor whose query is added to the message
               '''
               exc.ExceptionWithCause.__init__(self, msg, cause,
                   cause_newline=True)
               if cur != None:
                   _add_cursor_info(self, cur)
       class ExceptionWithName(DbException):
           '''A DbException about a named object. The name is put in the message
           and stored in self.name.'''
           def __init__(self, name, cause=None):
               msg = 'for name: '+str(name)
               DbException.__init__(self, msg, cause)
               self.name = name
       class ExceptionWithNameValue(DbException):
           '''A DbException about a named object and an associated value. Both are
           put in the message and stored in self.name and self.value.'''
           def __init__(self, name, value, cause=None):
               msg = 'for name: '+str(name)+'; value: '+repr(value)
               DbException.__init__(self, msg, cause)
               self.name = name
               self.value = value
       class ConstraintException(DbException):
           '''A DbException for a violated constraint.
           @param name The constraint name (or kind), used in the message
           @param cols The names of the columns the constraint is on
           '''
           def __init__(self, name, cols, cause=None):
               msg = 'Violated '+name+ ' constraint on columns: '
               msg += ', '.join(cols)
               DbException.__init__(self, msg, cause)
               self.name = name
               self.cols = cols
       class NameException(DbException):
           '''Raised by check_name() for a name with disallowed characters.'''
       class DuplicateKeyException(ConstraintException):
           '''Raised by run_query() for a unique constraint violation.'''
       class NullValueException(ConstraintException):
           '''Raised by run_query() for a NOT NULL constraint violation.'''
       class FunctionValueException(ExceptionWithNameValue):
           '''Raised by run_query() when a function was given an invalid value.'''
       class DuplicateTableException(ExceptionWithName):
           '''Raised by run_query() when a relation already exists.'''
       class DuplicateFunctionException(ExceptionWithName):
           '''Raised by run_query() when a function already exists.'''
       class EmptyRowException(DbException):
           '''A DbException subclass for empty rows.'''
       ##### Warnings
       class DbWarning(UserWarning):
           '''Warning category used by this module (e.g. by mk_select()).'''
       ##### Result retrieval
       def col_names(cur):
           '''Returns an iterator over the first item (the column name) of each
           entry in cur.description.'''
           description = cur.description
           return (col_desc[0] for col_desc in description)
       def rows(cur):
           '''Returns an iterator that yields cur.fetchone() until it returns None.'''
           def fetch(): return cur.fetchone()
           return iter(fetch, None)
       def consume_rows(cur):
           '''Fetches all the remaining rows, so that the result will be cached'''
           remaining = rows(cur)
           iters.consume_iter(remaining)
       def next_row(cur):
           '''Returns the next row of the cursor.
           @raise StopIteration if there are no more rows
           '''
           row_iter = rows(cur)
           return row_iter.next()
       def row(cur):
           '''Returns the next row of the cursor, then fetches all the rows after it
           (see consume_rows()).'''
           first = next_row(cur)
           consume_rows(cur)
           return first
       def next_value(cur):
           '''Returns the first column of the cursor's next row'''
           next_ = next_row(cur)
           return next_[0]
       def value(cur):
           '''Returns the first column of the row returned by row()'''
           row_ = row(cur)
           return row_[0]
       def values(cur):
           '''Returns an iterator built by iters.func_iter() from repeated calls to
           next_value() on the cursor.'''
           def get_next(): return next_value(cur)
           return iters.func_iter(get_next)
       def value_or_none(cur):
           '''Like value(), but returns None if StopIteration is raised (i.e. when
           there is no row).'''
           try: result = value(cur)
           except StopIteration: result = None
           return result
       ##### Input validation
       def clean_name(name):
           '''Turns "." into "_", then removes all remaining non-word characters.'''
           with_underscores = name.replace('.', '_')
           return re.sub(r'\W', r'', with_underscores)
       def check_name(name):
           '''Validates that a name contains only word characters.
           @raise NameException if the name contains any other character
           '''
           if re.search(r'\W', name) is None: return # name is valid
           raise NameException('Name "'+name
               +'" may contain only alphanumeric characters and _')
       def esc_name_by_module(module, name, ignore_case=False):
           '''Escapes a name using the quoting of a database driver module.
           Any occurrences of the quote character inside the name are removed.
           @param module 'psycopg2'|'MySQLdb'|None (None is treated as psycopg2)
           @param ignore_case For psycopg2/None, returns the name unquoted after
               validating it with check_name(). Not used for MySQLdb.
           @raise NotImplementedError for any other module
           '''
           if module == 'MySQLdb': quote = '`'
           elif module == 'psycopg2' or module == None:
               if ignore_case:
                   # Quoting the name would disable case-insensitivity, so it is
                   # validated and returned as-is instead
                   check_name(name)
                   return name
               quote = '"'
           else: raise NotImplementedError("Can't escape name for "+module+' database')
           stripped = name.replace(quote, '')
           return quote + stripped + quote
       def esc_name_by_engine(engine, name, **kw_args):
           '''Escapes a name for an engine listed in db_engines.
           For the other params, see esc_name_by_module()
           '''
           module_name = db_engines[engine][0]
           return esc_name_by_module(module_name, name, **kw_args)
       def esc_name(db, name, **kw_args):
           '''Escapes a name for the driver module of db's connection.
           For the other params, see esc_name_by_module()
           '''
           module = util.root_module(db.db)
           return esc_name_by_module(module, name, **kw_args)
       def qual_name(db, schema, table):
           '''Returns the escaped table name, prefixed by the escaped schema and a
           "." if a schema is provided.'''
           parts = [esc_name(db, table)]
           if schema != None: parts.insert(0, esc_name(db, schema))
           return '.'.join(parts)
       ##### Database connections
       # The keys that a db_config dict may contain
       db_config_names = ['engine', 'host', 'user', 'password', 'database', 'schemas']
       # Maps engine name -> (driver module name, {db_config key: name of the
       # corresponding key to pass to the driver's connect()})
       db_engines = {
           'MySQL': ('MySQLdb', {'password': 'passwd', 'database': 'db'}),
           'PostgreSQL': ('psycopg2', {}),
       }
       # The exception classes considered database errors. _add_module() adds each
       # loaded driver module's DatabaseError and rebuilds the tuple.
       DatabaseErrors_set = set([DbException])
       DatabaseErrors = tuple(DatabaseErrors_set)
       def _add_module(module):
           '''Registers a driver module's DatabaseError class in DatabaseErrors_set
           and rebuilds the module-level DatabaseErrors tuple from it.'''
           global DatabaseErrors
           DatabaseErrors_set.add(module.DatabaseError)
           DatabaseErrors = tuple(DatabaseErrors_set)
       def db_config_str(db_config):
           '''Returns a short description of a db_config: its engine and database'''
           prefix = db_config['engine']+' database '
           return prefix+db_config['database']
       def _query_lookup(query, params):
           '''Returns the key under which a query's result is stored in a DbConn's
           query_results: the query paired with a hashable form of its params.'''
           hashable_params = dicts.make_hashable(params)
           return (query, hashable_params)
       def log_debug_none(msg, level=2):
           '''A debug logging callback that does nothing. DbConn uses it as the
           default log_debug, and treats it as meaning that debugging is off.'''
           return None
       class DbConn:
           def __init__(self, db_config, serializable=True, autocommit=False,
               caching=True, log_debug=log_debug_none):
               self.db_config = db_config
               self.serializable = serializable
               self.autocommit = autocommit
               self.caching = caching
               self.log_debug = log_debug
               self.debug = log_debug != log_debug_none
               self.__db = None
               self.query_results = {}
               self._savepoint = 0
           def __getattr__(self, name):
               if name == '__dict__': raise Exception('getting __dict__')
               if name == 'db': return self._db()
               else: raise AttributeError()
           def __getstate__(self):
               state = copy.copy(self.__dict__) # shallow copy
               state['log_debug'] = None # don't pickle the debug callback
               state['_DbConn__db'] = None # don't pickle the connection
               return state
           def connected(self): return self.__db != None
           def _db(self):
               if self.__db == None:
                   # Process db_config
                   db_config = self.db_config.copy() # don't modify input!
                   schemas = db_config.pop('schemas', None)
                   module_name, mappings = db_engines[db_config.pop('engine')]
                   module = __import__(module_name)
                   _add_module(module)
                   for orig, new in mappings.iteritems():
                       try: util.rename_key(db_config, orig, new)
                       except KeyError: pass
                   # Connect
                   self.__db = module.connect(**db_config)
                   # Configure connection
                   if self.serializable and not self.autocommit: run_raw_query(self,
                       'SET TRANSACTION ISOLATION LEVEL SERIALIZABLE')
                   if schemas != None:
                       schemas_ = ''.join((esc_name(self, s)+', '
                           for s in schemas.split(',')))
                       run_raw_query(self, "SELECT set_config('search_path', \
       %s || current_setting('search_path'), false)", [schemas_])
               return self.__db
           class DbCursor(Proxy):
               def __init__(self, outer):
                   Proxy.__init__(self, outer.db.cursor())
                   self.outer = outer
                   self.query_results = outer.query_results
                   self.query_lookup = None
                   self.result = []
               def execute(self, query, params=None):
                   self._is_insert = query.upper().find('INSERT') >= 0
                   self.query_lookup = _query_lookup(query, params)
                   try:
                       try:
                           return_value = self.inner.execute(query, params)
                           self.outer.do_autocommit()
                       finally: self.query = get_cur_query(self.inner)
                   except Exception, e:
                       _add_cursor_info(e, self, query, params)
                       self.result = e # cache the exception as the result
                       self._cache_result()
                       raise
                   # Fetch all rows so result will be cached
                   if self.rowcount == 0 and not self._is_insert: consume_rows(self)
                   return return_value
               def fetchone(self):
                   row = self.inner.fetchone()
                   if row != None: self.result.append(row)
                   # otherwise, fetched all rows
                   else: self._cache_result()
                   return row
               def _cache_result(self):
                   # For inserts, only cache exceptions since inserts are not
                   # idempotent, but an invalid insert will always be invalid
                   if self.query_results != None and (not self._is_insert
                       or isinstance(self.result, Exception)):
                       assert self.query_lookup != None
                       self.query_results[self.query_lookup] = self.CacheCursor(
                           util.dict_subset(dicts.AttrsDictView(self),
                           ['query', 'result', 'rowcount', 'description']))
               class CacheCursor:
                   def __init__(self, cached_result): self.__dict__ = cached_result
                   def execute(self, *args, **kw_args):
                       if isinstance(self.result, Exception): raise self.result
                       # otherwise, result is a rows list
                       self.iter = iter(self.result)
                   def fetchone(self):
                       try: return self.iter.next()
                       except StopIteration: return None
           def esc_value(self, value):
               module = util.root_module(self.db)
               if module == 'psycopg2': str_ = self.db.cursor().mogrify('%s', [value])
               elif module == 'MySQLdb':
                   import _mysql
                   str_ = _mysql.escape_string(value)
               else: raise NotImplementedError("Can't escape value for "+module
                   +' database')
               return strings.to_unicode(str_)
           def esc_name(self, name): return esc_name(self, name) # calls global func
           def run_query(self, query, params=None, cacheable=False, log_level=2,
               exc_log_level=None):
               '''
               @param exc_log_level The log_level if the query throws an exception.
                   Defaults to the value of log_level.
               '''
               assert query != None
               if exc_log_level == None: exc_log_level = log_level
               if not self.caching: cacheable = False
               used_cache = False
               success = False
               try:
                   # Get cursor
                   if cacheable:
                       query_lookup = _query_lookup(query, params)
                       try:
                           cur = self.query_results[query_lookup]
                           used_cache = True
                       except KeyError: cur = self.DbCursor(self)
                   else: cur = self.db.cursor()
                   # Run query
                   cur.execute(query, params)
                   success = True
               finally:
                   if self.debug: # only compute msg if needed
                       if not success: log_level = exc_log_level
                       if used_cache: cache_status = 'Cache hit'
                       elif cacheable: cache_status = 'Cache miss'
                       else: cache_status = 'Non-cacheable'
                       self.log_debug(cache_status+': '+strings.one_line(
                           str(get_cur_query(cur, query, params))), log_level)
               return cur
           def is_cached(self, query, params=None):
               return _query_lookup(query, params) in self.query_results
           def with_savepoint(self, func):
               savepoint = 'level_'+str(self._savepoint)
               self.run_query('SAVEPOINT '+savepoint, log_level=4)
               self._savepoint += 1
               try:
                   try: return_val = func()
                   finally:
                       self._savepoint -= 1
                       assert self._savepoint >= 0
               except:
                   self.run_query('ROLLBACK TO SAVEPOINT '+savepoint, log_level=4)
                   raise
               else:
                   self.run_query('RELEASE SAVEPOINT '+savepoint, log_level=4)
                   self.do_autocommit()
                   return return_val
           def do_autocommit(self):
               '''Autocommits if outside savepoint'''
               assert self._savepoint >= 0
               if self.autocommit and self._savepoint == 0:
                   self.log_debug('Autocommiting')
                   self.db.commit()
       # Alias, so that a connection can be created with connect(db_config, ...)
       connect = DbConn
       ##### Querying
       def run_raw_query(db, *args, **kw_args):
           '''Runs a query directly on a connection, with no error translation.
           For params, see DbConn.run_query()
           '''
           runner = db.run_query
           return runner(*args, **kw_args)
       def mogrify(db, query, params):
           '''Returns the query with the params filled in, using the mogrify() of
           a psycopg2 cursor.
           @raise NotImplementedError for any other driver
           '''
           module = util.root_module(db.db)
           if module != 'psycopg2':
               raise NotImplementedError("Can't mogrify query for "+module+
                   ' database')
           return db.db.cursor().mogrify(query, params)
       ##### Recoverable querying
       def with_savepoint(db, func):
           '''Runs func() inside a savepoint. See DbConn.with_savepoint().'''
           return db.with_savepoint(func)
       def run_query(db, query, params=None, recover=None, cacheable=False, **kw_args):
           '''Runs a query. When recover is set, recognized database error messages
           are translated into this module's specific exception classes.
           For params, see run_raw_query()
           @param recover Whether to run the query in a savepoint (unless its result
               is already cached) and translate errors. None is treated as False.
           @raise DuplicateKeyException for a unique constraint violation (if the
               constraint's columns can be listed for this driver)
           @raise NullValueException for a NOT NULL constraint violation
           @raise FunctionValueException for an invalid value reported inside a
               function
           @raise DuplicateTableException if a relation already exists
           @raise DuplicateFunctionException if a function already exists
           '''
           if recover == None: recover = False
           try:
               def run(): return run_raw_query(db, query, params, cacheable, **kw_args)
               if recover and not db.is_cached(query, params):
                   return with_savepoint(db, run)
               else: return run() # don't need savepoint if cached
           except Exception, e:
               if not recover: raise # need savepoint to run index_cols()
               msg = exc.str_(e)
               # The patterns below are tried in order; the first one that matches
               # determines the exception raised
               # Group 1 is the whole constraint name; group 2 is its leading
               # part, which is used as the table name
               match = re.search(r'duplicate key value violates unique constraint '
                   r'"((_?[^\W_]+)_[^"]+?)"', msg)
               if match:
                   constraint, table = match.groups()
                   try: cols = index_cols(db, table, constraint)
                   except NotImplementedError: raise e
                   else: raise DuplicateKeyException(constraint, cols, e)
               match = re.search(r'null value in column "(\w+?)" violates not-null '
                   r'constraint', msg)
               if match: raise NullValueException('NOT NULL', [match.group(1)], e)
               # Group 1 is the offending value; group 2 is the function name
               match = re.search(r'\b(?:invalid input (?:syntax|value)\b.*?'
                   r'|date/time field value out of range): "(.+?)"\n'
                   r'(?:(?s).*?)\bfunction "(\w+?)".*?\bat assignment', msg)
               if match:
                   value, name = match.groups()
                   raise FunctionValueException(name, strings.to_unicode(value), e)
               match = re.search(r'relation "(\w+?)" already exists', msg)
               if match: raise DuplicateTableException(match.group(1), e)
               match = re.search(r'function "(\w+?)" already exists', msg)
               if match: raise DuplicateFunctionException(match.group(1), e)
               raise # no specific exception raised
       ##### Basic queries
       def next_version(name):
           '''Returns the name with its version # incremented. The version # is a
           "v#_" prefix, so that it won't be removed if the name is truncated.
           A name without a version prefix becomes version 1.
           '''
           match = re.match(r'^v(\d+)_(.*)$', name)
           if match:
               prev_version, base = match.groups()
               version = int(prev_version)+1
           else:
               base = name
               version = 1 # first existing name was version 0
           return 'v'+str(version)+'_'+base
       def run_query_into(db, query, params, into=None, *args, **kw_args):
           '''Runs a query, optionally storing its rows in a new table.
           If a table with the requested name already exists, the next version of
           the name is tried (see next_version()) until creation succeeds. Note
           that this changes into.name, and in non-debug mode into.schema.
           For params, see run_query().
           @param into sql_gen.Table to create, or None to just run the query
           '''
           if into == None: return run_query(db, query, params, *args, **kw_args)
           # Otherwise, place rows in temp table
           assert isinstance(into, sql_gen.Table)
           kw_args['recover'] = True
           kw_args.setdefault('exc_log_level', kw_args.get('log_level', 2) + 2)
               # by default, will have exc_log_level=4
           create_prefix = 'CREATE'
           if not db.debug: # tables are created as permanent in debug mode
               # "temporary tables cannot specify a schema name", so remove schema
               into.schema = None
               create_prefix += ' TEMP'
           create_prefix += ' TABLE '
           while True:
               try:
                   create_query = create_prefix+into.to_str(db)+' AS '+query
                   # CREATE TABLE AS sets rowcount to # rows in query
                   return run_query(db, create_query, params, *args, **kw_args)
               except DuplicateTableException:
                   # try again with next version of name
                   into.name = next_version(into.name)
       # Sentinel values for mk_select(), which recognizes them by identity
       order_by_pkey = object() # tells mk_select() to order by the pkey
       distinct_on_all = object() # tells mk_select() to SELECT DISTINCT ON all columns
       def mk_select(db, tables, fields=None, conds=None, distinct_on=[], limit=None,
           start=None, order_by=order_by_pkey, default_table=None):
           '''Builds a SELECT query.
           @param tables The single table to select from, or a list of tables to join
               together, with tables after the first being sql_gen.Join objects
           @param fields Use None to select all fields in the table
           @param conds WHERE conditions: [(compare_left_side, compare_right_side),...]
               * container can be any iterable type
               * compare_left_side: sql_gen.Code|str (for col name)
               * compare_right_side: sql_gen.ValueCond|literal value
           @param distinct_on The columns to SELECT DISTINCT ON, or distinct_on_all to
               use all columns
           @param limit int|None The max # of rows (LIMIT clause)
           @param start int|None The row to start at (OFFSET clause, which is left
               out when start is 0)
           @param order_by The column to ORDER BY, None for no ordering, or
               order_by_pkey to use the first table's pkey (in which case no
               ordering is added if distinct_on is used)
           @param default_table The table to use for columns given without a table
           @return tuple(query, params)
               * params is always an empty list
           '''
           # Parse tables param
           if not lists.is_seq(tables): tables = [tables]
           tables = list(tables) # don't modify input! (list() copies input)
           table0 = sql_gen.as_Table(tables.pop(0)) # first table is separate
           # Parse other params
           if conds == None: conds = []
           elif isinstance(conds, dict): conds = conds.items()
           conds = list(conds) # don't modify input! (list() copies input)
           assert limit == None or type(limit) == int
           assert start == None or type(start) == int
           if order_by is order_by_pkey:
               if distinct_on != []: order_by = None
               else: order_by = pkey(db, table0, recover=True)
           query = 'SELECT'
           def parse_col(col): return sql_gen.as_Col(col, default_table).to_str(db)
           # DISTINCT ON columns
           if distinct_on != []:
               query += ' DISTINCT'
               if distinct_on is not distinct_on_all:
                   query += ' ON ('+(', '.join(map(parse_col, distinct_on)))+')'
           # Columns
           query += ' '
           if fields == None: query += '*'
           else: query += ', '.join(map(parse_col, fields))
           # Main table
           query += ' FROM '+table0.to_str(db)
           # Add joins
           left_table = table0 # each join is made against the table before it
           for join_ in tables:
               table = join_.table
               # Parse special values
               if join_.type_ is sql_gen.filter_out: # filter no match
                   conds.append((sql_gen.Col(table_not_null_col(db, table), table),
                       None))
               query += ' '+join_.to_str(db, left_table)
               left_table = table
           # Tracks whether the query has no WHERE, LIMIT, or OFFSET param
           missing = True
           if conds != []:
               query += ' WHERE '+(' AND '.join(('('+sql_gen.ColValueCond(l, r)
                   .to_str(db)+')' for l, r in conds)))
               missing = False
           if order_by != None:
               query += ' ORDER BY '+sql_gen.as_Col(order_by, table0).to_str(db)
           if limit != None: query += ' LIMIT '+str(limit); missing = False
           if start != None:
               if start != 0: query += ' OFFSET '+str(start)
               missing = False # start=0 also counts, even though it adds no clause
           if missing: warnings.warn(DbWarning(
               'SELECT statement missing a WHERE, LIMIT, or OFFSET clause: '+query))
           return (query, [])
       def select(db, *args, **kw_args):
           '''Builds and runs a SELECT query. Cacheable by default.
           For params, see mk_select() and run_query()
           '''
           # Separate out the params for run_query()
           run_opts = {
               'recover': kw_args.pop('recover', None),
               'cacheable': kw_args.pop('cacheable', True),
               'log_level': kw_args.pop('log_level', 2),
           }
           query, params = mk_select(db, *args, **kw_args)
           return run_query(db, query, params, **run_opts)
       def mk_insert_select(db, table, cols=None, select_query=None, params=None,
           returning=None, embeddable=False):
           '''
           @param returning str|None An inserted column (such as pkey) to return
           @param embeddable Whether the query should be embeddable as a nested SELECT.
               Warning: If you set this and cacheable=True when the query is run, the
               query will be fully cached, not just if it raises an exception.
           '''
           table = sql_gen.as_Table(table)
           if cols == []: cols = None # no cols (all defaults) = unknown col names
           if cols != None: cols = [sql_gen.as_Col(v).to_str(db) for v in cols]
           if select_query == None: select_query = 'DEFAULT VALUES'
           if returning != None: returning = sql_gen.as_Col(returning, table)
           # Build query
           query = 'INSERT INTO '+table.to_str(db)
           if cols != None: query += ' ('+', '.join(cols)+')'
           query += ' '+select_query
           if returning != None:
               returning_name = copy.copy(returning)
               returning_name.table = None
               returning_name = returning_name.to_str(db)
               query += ' RETURNING '+returning_name
           if embeddable:
               assert returning != None
               # Create function
               function_name = '_'.join(['insert', table.name] + cols)
               return_type = 'SETOF '+returning.to_str(db)+'%TYPE'
               while True:
                   try:
                       func_schema = None
                       if not db.debug: func_schema = 'pg_temp'
                       function = sql_gen.Table(function_name, func_schema).to_str(db)
                       function_query = '''\
       CREATE FUNCTION '''+function+'''() RETURNS '''+return_type+'''
           LANGUAGE sql
           AS $$'''+mogrify(db, query, params)+''';$$;
       '''
                       run_query(db, function_query, recover=True, cacheable=True,
                           exc_log_level=4)
                       break # this version was successful
                   except DuplicateFunctionException, e:
                       function_name = next_version(function_name)
                       # try again with next version of name
               # Return query that uses function
               func_table = sql_gen.NamedTable('f', sql_gen.CustomCode(function+'()'),
                   [returning_name]) # AS clause requires function alias
               return mk_select(db, func_table, start=0, order_by=None)
           return (query, params)
       def insert_select(db, *args, **kw_args):
           '''Builds and runs an INSERT query. Cacheable by default.
           For params, see mk_insert_select() and run_query_into()
           @param into sql_gen.Table with suggested name of temp table to put RETURNING
               values in. Providing it makes the query embeddable.
           '''
           # Separate out the params for run_query_into()
           run_opts = {
               'recover': kw_args.pop('recover', None),
               'cacheable': kw_args.pop('cacheable', True),
           }
           into = kw_args.pop('into', None)
           if into != None: kw_args['embeddable'] = True
           query, params = mk_insert_select(db, *args, **kw_args)
           return run_query_into(db, query, params, into, **run_opts)
       # Sentinel value, recognized by identity. insert() puts DEFAULT in the query
       # in place of a query param for it.
       default = object() # tells insert() to use the default value for a column
       def insert(db, table, row, *args, **kw_args):
           '''Inserts a single row.
           For params, see insert_select()
           @param row A sequence of values (for which no column names are used), or
               a mapping of column name to value. Use the default sentinel as a
               value to get the column's default.
           '''
           cols = None
           if not lists.is_seq(row):
               cols = row.keys()
               row = row.values()
           # Check for special values
           placeholders = []
           query_params = []
           for cell in list(row):
               if cell is default: placeholders.append('DEFAULT')
               else:
                   placeholders.append('%s')
                   query_params.append(cell)
           # Build query
           if query_params: query = ' VALUES ('+(', '.join(placeholders))+')'
           else: query = None # no params, so fall back to insert_select()'s default
           return insert_select(db, table, cols, query, query_params, *args,
               **kw_args)
       def mk_update(db, table, changes=None, cond=None):
           '''Builds an UPDATE query.
           @param changes [(col, new_value),...]
               * container can be any iterable type
               * col: sql_gen.Code|str (for col name)
               * new_value: sql_gen.Code|literal value
           @param cond sql_gen.Code WHERE condition. e.g. use sql_gen.*Cond objects.
           @return str query
           '''
           query = 'UPDATE '+sql_gen.as_Table(table).to_str(db)+'\nSET\n'
           assignments = []
           for col, new_value in changes:
               target = sql_gen.to_name_only_col(col, table).to_str(db)
               assignments.append(target+' = '
                   +sql_gen.as_Value(new_value).to_str(db))
           query += ',\n'.join(assignments)
           if cond != None: query += ' WHERE '+cond.to_str(db)
           return query
       def update(db, *args, **kw_args):
           '''Builds and runs an UPDATE query.
           For params, see mk_update() and run_query()
           '''
           recover = kw_args.pop('recover', None)
           query = mk_update(db, *args, **kw_args)
           return run_query(db, query, [], recover)
       def last_insert_id(db):
           '''Returns the id generated by the last insert.
           @param db DbConn
           @return For psycopg2, the value of SELECT lastval(). For MySQLdb, the
               insert_id() of the driver connection. None for any other driver.
           '''
           module = util.root_module(db.db)
           if module == 'psycopg2': return value(run_query(db, 'SELECT lastval()'))
           elif module == 'MySQLdb':
               # insert_id() is a method of the driver connection (db.db). DbConn
               # itself doesn't forward attributes other than db.
               return db.db.insert_id()
           else: return None
       def truncate(db, table, schema='public'):
           '''Runs TRUNCATE ... CASCADE on a table'''
           target = qual_name(db, schema, table)
           return run_query(db, 'TRUNCATE '+target+' CASCADE')
       def mk_flatten_mapping(db, into, cols, preserve=[], as_items=False):
           '''Creates a mapping from original column names (which may have collisions)
           to names that will be distinct among the columns' tables.
           This is meant to be used for several tables that are being joined together.
           @param cols The columns to combine. Duplicates will be removed.
           @param into The table for the new columns.
           @param preserve [sql_gen.Col...] Columns not to rename. Note that these
               columns will be included in the mapping even if they are not in cols.
               The tables of the provided Col objects will be changed to into, so make
               copies of them if you want to keep the original tables.
           @param as_items Whether to return a list of dict items instead of a dict
           @return dict(orig_col=new_col, ...)
               * orig_col: sql_gen.Col(orig_col_name, orig_table)
               * new_col: sql_gen.Col(orig_col_name, into)
               * All mappings use the into table so its name can easily be
                 changed for all columns at once
           '''
           cols = lists.uniqify(cols)
           items = []
           # Preserved columns keep their name and are just moved to the into table
           for kept in preserve:
               items.append((copy.copy(kept), kept)) # copy has the original table
               kept.table = into
           kept_set = set(preserve)
           # The other columns get a new name, made from their full string form
           items += [(col, sql_gen.Col(clean_name(str(col)), into)) for col in cols
               if col not in kept_set]
           if as_items: return items
           return dict(items)
       def flatten(db, into, joins, cols, limit=None, start=None, **kw_args):
           '''Selects the columns of joined tables into a new table, under the names
           created by mk_flatten_mapping().
           For params, see mk_flatten_mapping()
           @return See return value of mk_flatten_mapping()
           '''
           items = mk_flatten_mapping(db, into, cols, as_items=True, **kw_args)
           named_cols = [sql_gen.NamedCol(new.name, old) for old, new in items]
           query, params = mk_select(db, joins, named_cols, limit=limit,
               start=start)
           run_query_into(db, query, params, into=into)
           return dict(items)
       ##### Database structure queries
       def table_row_count(db, table, recover=None):
           '''Returns the value of a SELECT of sql_gen.row_count on the table'''
           query, params = mk_select(db, table, [sql_gen.row_count], order_by=None,
               start=0)
           cur = run_query(db, query, params, recover=recover, log_level=3)
           return value(cur)
       def table_cols(db, table, recover=None):
           '''Returns the list of a table's column names, taken from the result
           description of a SELECT with LIMIT 0.'''
           cur = select(db, table, limit=0, order_by=None, recover=recover,
               log_level=4)
           return list(col_names(cur))
       def pkey(db, table, recover=None):
           '''Returns a table's pkey, which is assumed to be its first column'''
           cols = table_cols(db, table, recover)
           return cols[0]
       # The name of the column that table_not_null_col() looks for in a table
       not_null_col = 'not_null'
       def table_not_null_col(db, table, recover=None):
           '''Returns the value of not_null_col if the table has a column with that
           name. Otherwise, returns the table's pkey.'''
           if not_null_col not in table_cols(db, table, recover):
               return pkey(db, table, recover)
           return not_null_col
       def index_cols(db, table, index):
           '''Returns the names of the columns in an index, ordered by attnum.
           Can also use this for UNIQUE constraints, because a UNIQUE index is
           automatically created. When you don't know whether something is a UNIQUE
           constraint or a UNIQUE index, use this function.
           @param table str The name of the table the index is on
           @param index str The name of the index
           @raise NotImplementedError for drivers other than psycopg2
           '''
           module = util.root_module(db.db)
           if module == 'psycopg2':
               # The first subquery gets the columns listed in indkey. The second
               # gets the columns referenced (as varattno) in the index's
               # expressions.
               return list(values(run_query(db, '''\
       SELECT attname
       FROM
       (
               SELECT attnum, attname
               FROM pg_index
               JOIN pg_class index ON index.oid = indexrelid
               JOIN pg_class table_ ON table_.oid = indrelid
               JOIN pg_attribute ON attrelid = indrelid AND attnum = ANY (indkey)
               WHERE
                   table_.relname = %(table)s
                   AND index.relname = %(index)s
           UNION
               SELECT attnum, attname
               FROM
               (
                   SELECT
                       indrelid
                       , (regexp_matches(indexprs, E':varattno (\\\\d+)', 'g'))[1]::int
                           AS indkey
                   FROM pg_index
                   JOIN pg_class index ON index.oid = indexrelid
                   JOIN pg_class table_ ON table_.oid = indrelid
                   WHERE
                       table_.relname = %(table)s
                       AND index.relname = %(index)s
               ) s
               JOIN pg_attribute ON attrelid = indrelid AND attnum = indkey
       ) s
       ORDER BY attnum
       ''',
                   {'table': table, 'index': index}, cacheable=True, log_level=4)))
           else: raise NotImplementedError("Can't list index columns for "+module+
               ' database')
       def constraint_cols(db, table, constraint):
           '''Returns the names of the columns in a constraint, ordered by attnum.
           @param table str The name of the table the constraint is on
           @param constraint str The name of the constraint
           @raise NotImplementedError for drivers other than psycopg2
           '''
           module = util.root_module(db.db)
           if module != 'psycopg2':
               raise NotImplementedError("Can't list constraint columns for "+module+
                   ' database')
           params = {'table': table, 'constraint': constraint}
           cur = run_query(db, '''\
       SELECT attname
       FROM pg_constraint
       JOIN pg_class ON pg_class.oid = conrelid
       JOIN pg_attribute ON attrelid = conrelid AND attnum = ANY (conkey)
       WHERE
           relname = %(table)s
           AND conname = %(constraint)s
       ORDER BY attnum
       ''', params)
           return list(values(cur))
       # Name of the serial primary key column that add_row_num() adds to a table
       row_num_col = '_row_num'
       def index_col(db, col):
           '''Creates an index on one table column; does nothing if creating it raises
           DuplicateTableException (i.e. the index already existed).
           The index is named clean_name(str(col)).
           @param col A column for which sql_gen.is_table_col() is true
           '''
           assert sql_gen.is_table_col(col)
           owner = col.table
           index = sql_gen.as_Table(clean_name(str(col)))
           name_only = sql_gen.to_name_only_col(col)
           try:
               query = ('CREATE INDEX '+index.to_str(db)+' ON '+owner.to_str(db)
                   +' ('+name_only.to_str(db)+')')
               run_query(db, query, recover=True, cacheable=True, log_level=3)
           except DuplicateTableException:
               pass # index already existed
       def index_pkey(db, table, recover=None):
           '''Adds a PRIMARY KEY constraint named <table name>_pkey on the column that
           pkey() reports for the table (described by the original author as the
           table's first column).
           @pre The table must not already have a primary key.
           @param recover Passed through to both pkey() and run_query()
           '''
           table = sql_gen.as_Table(table)
           constraint = sql_gen.as_Table(table.name+'_pkey')
           pkey_col = sql_gen.to_name_only_col(pkey(db, table, recover))
           query = ('ALTER TABLE '+table.to_str(db)+' ADD CONSTRAINT '
               +constraint.to_str(db)+' PRIMARY KEY('+pkey_col.to_str(db)+')')
           run_query(db, query, recover=recover, log_level=3)
       def add_row_num(db, table):
           '''Appends a serial NOT NULL column, named by row_num_col, to a table and
           declares it the table's primary key.'''
           table_str = sql_gen.as_Table(table).to_str(db)
           query = ('ALTER TABLE '+table_str+' ADD COLUMN '+row_num_col
               +' serial NOT NULL PRIMARY KEY')
           run_query(db, query, log_level=3)
       def tables(db, schema='public', table_like='%'):
           '''Lists the tables whose names match a LIKE pattern.
           @param schema Schema to search; only referenced by the psycopg2 query
           @param table_like LIKE pattern the table names must match
           @return The result of values() on the listing query
           @raise NotImplementedError For modules other than psycopg2 and MySQLdb
           '''
           module = util.root_module(db.db)
           params = dict(schema=schema, table_like=table_like)
           if module == 'MySQLdb':
               query = 'SHOW TABLES LIKE %(table_like)s'
           elif module == 'psycopg2':
               query = '''\
       SELECT tablename
       FROM pg_tables
       WHERE
           schemaname = %(schema)s
           AND tablename LIKE %(table_like)s
       ORDER BY tablename
       '''
           else: raise NotImplementedError("Can't list tables for "+module+' database')
           return values(run_query(db, query, params, cacheable=True))
       ##### Database management
       def empty_db(db, schema='public', **kw_args):
           '''Truncates every table that tables() lists for the schema.
           For kw_args, see tables()'''
           for name in tables(db, schema, **kw_args):
               truncate(db, name, schema)
       ##### Heuristic queries
       def put(db, table, row, pkey_=None, row_ct_ref=None):
           '''Recovers from errors.
           Only works under PostgreSQL (uses INSERT RETURNING).
           '''
           if pkey_ == None: pkey_ = pkey(db, table, recover=True)
           try:
               cur = insert(db, table, row, pkey_, recover=True)
               if row_ct_ref != None and cur.rowcount >= 0:
                   row_ct_ref[0] += cur.rowcount
               return value(cur)
           except DuplicateKeyException, e:
               return value(select(db, table, [pkey_],
                   util.dict_subset_right_join(row, e.cols), recover=True))
       def get(db, table, row, pkey, row_ct_ref=None, create=False):
           '''Returns the value() of selecting column pkey from at most one row that
           matches row.
           Recovers from errors.
           @param create If true, a StopIteration from the lookup causes the row to
               be inserted with put() and put()'s result returned; otherwise the
               StopIteration propagates
           @param row_ct_ref Passed through to put()
           '''
           try:
               cur = select(db, table, [pkey], row, limit=1, recover=True)
               return value(cur)
           except StopIteration:
               if create:
                   return put(db, table, row, pkey, row_ct_ref) # insert new row
               raise
       def put_table(db, out_table, in_tables, mapping, row_ct_ref=None):
           '''Inserts rows selected from the input tables into out_table and builds a
           <out_table.name>_pkeys table that pairs each input pkey with its output
           pkey (NULL for input rows that ended up with no output row).
           Recovers from errors.
           Only works under PostgreSQL (uses INSERT RETURNING).
           @param in_tables The main input table to select from, followed by a list of
               tables to join with it using the main input table's pkey
           @param mapping dict(out_table_col=in_table_col, ...)
               * out_table_col: sql_gen.Col|str
               * in_table_col: sql_gen.Col Wrap literal values in a sql_gen.NamedCol
           @param row_ct_ref If not None, a list whose first element is incremented
               by the rowcount of the insert that succeeded (when it is >= 0)
           @return sql_gen.Col Where the output pkeys are made available
           '''
           out_table = sql_gen.as_Table(out_table)
           for in_table_col in mapping.itervalues():
               assert isinstance(in_table_col, sql_gen.Col)
           def log_debug(msg): db.log_debug(msg, level=1.5)
           # All helper tables are named after the output table
           temp_prefix = out_table.name
           pkeys = sql_gen.Table(temp_prefix+'_pkeys')
           # Create input joins from list of input tables
           in_tables_ = in_tables[:] # don't modify input!
           in_tables0 = in_tables_.pop(0) # first table is separate
           in_pkey = pkey(db, in_tables0, recover=True)
           in_pkey_col = sql_gen.as_Col(in_pkey, in_tables0)
           input_joins = [in_tables0]+[sql_gen.Join(v,
               {in_pkey: sql_gen.join_same_not_null}) for v in in_tables_]
           log_debug('Joining together input tables')
           # Place in new table for speed and so don't modify input if values edited
           in_table = sql_gen.Table(temp_prefix+'_in')
           flatten_cols = filter(sql_gen.is_table_col, mapping.values())
           mapping = dicts.join(mapping, flatten(db, in_table, input_joins,
               flatten_cols, preserve=[in_pkey_col], start=0))
           # From here on, select only from the flattened copy
           input_joins = [in_table]
           out_pkey = pkey(db, out_table, recover=True)
           out_pkey_col = sql_gen.as_Col(out_pkey, out_table)
           pkeys_names = [in_pkey, out_pkey]
           pkeys_cols = [in_pkey_col, out_pkey_col]
           # One-element list so that insert_into_pkeys() can update the flag
           pkeys_table_exists_ref = [False]
           def insert_into_pkeys(joins, cols):
               query, params = mk_select(db, joins, cols, order_by=None, start=0)
               # The first call stores the query's result into the pkeys table; later
               # calls insert into it
               if pkeys_table_exists_ref[0]:
                   insert_select(db, pkeys, pkeys_names, query, params)
               else:
                   run_query_into(db, query, params, into=pkeys)
                   pkeys_table_exists_ref[0] = True
           # Restrictions read by mk_main_select(); the exception handlers in the
           # loop below change them before the insert is retried
           limit_ref = [None]
           conds = set()
           distinct_on = []
           def mk_main_select(joins, cols):
               return mk_select(db, joins, cols, conds, distinct_on,
                   limit=limit_ref[0], start=0)
           def log_exc(e):
               log_debug('Caught exception: '+exc.str_(e, first_line_only=True))
           def remove_all_rows():
               log_debug('Returning NULL for all rows')
               limit_ref[0] = 0 # just create an empty pkeys table
           def ignore(in_col, value):
               in_col_str = str(in_col)
               log_debug('Adding index on '+in_col_str+' to enable fast filtering')
               index_col(db, in_col)
               log_debug('Ignoring rows with '+in_col_str+' = '+repr(value))
           def remove_rows(in_col, value):
               ignore(in_col, value)
               cond = (in_col, sql_gen.CompareCond(value, '!='))
               assert cond not in conds # avoid infinite loops
               conds.add(cond)
           def invalid2null(in_col, value):
               ignore(in_col, value)
               update(db, in_table, [(in_col, None)],
                   sql_gen.ColValueCond(in_col, value))
           # Do inserts and selects
           join_cols = {}
           insert_out_pkeys = sql_gen.Table(temp_prefix+'_insert_out_pkeys')
           insert_in_pkeys = sql_gen.Table(temp_prefix+'_insert_in_pkeys')
           # Retry the insert until it succeeds; each handled exception narrows the
           # select or adds join columns before the next attempt
           while True:
               has_joins = join_cols != {}
               # Prepare to insert new rows
               insert_joins = input_joins[:] # don't modify original!
               insert_args = dict(recover=True, cacheable=False)
               if has_joins:
                   distinct_on = [v.to_Col() for v in join_cols.values()]
                   insert_joins.append(sql_gen.Join(out_table, join_cols,
                       sql_gen.filter_out))
               else:
                   # No joins yet: have the insert return the new pkeys into a table
                   insert_args.update(dict(returning=out_pkey, into=insert_out_pkeys))
               log_debug('Inserting new rows')
               try:
                   cur = insert_select(db, out_table, mapping.keys(),
                       *mk_main_select(insert_joins, mapping.values()), **insert_args)
                   break # insert successful
               except DuplicateKeyException, e:
                   log_exc(e)
                   old_join_cols = join_cols.copy()
                   join_cols.update(util.dict_subset(mapping, e.cols))
                   log_debug('Ignoring existing rows, comparing on '+str(join_cols))
                   assert join_cols != old_join_cols # avoid infinite loops
               except NullValueException, e:
                   log_exc(e)
                   # The unpacking requires e.cols to hold exactly one column
                   out_col, = e.cols
                   try: in_col = mapping[out_col]
                   except KeyError:
                       log_debug('Missing mapping for NOT NULL '+out_col)
                       remove_all_rows()
                   else: remove_rows(in_col, None)
               except FunctionValueException, e:
                   log_exc(e)
                   assert e.name == out_table.name
                   out_col = 'value' # assume function param was named "value"
                   invalid2null(mapping[out_col], e.value)
               except DatabaseErrors, e:
                   log_exc(e)
                   msg = 'No handler for exception: '+exc.str_(e, first_line_only=True)
                   warnings.warn(DbWarning(msg))
                   log_debug(msg)
                   remove_all_rows()
               # after exception handled, rerun loop with additional constraints
           # cur is the cursor of the insert that succeeded
           if row_ct_ref != None and cur.rowcount >= 0:
               row_ct_ref[0] += cur.rowcount
           # has_joins still holds the value from the final loop iteration
           if has_joins:
               select_joins = input_joins+[sql_gen.Join(out_table, join_cols)]
               log_debug('Getting output pkeys of existing/inserted rows')
               insert_into_pkeys(select_joins, pkeys_cols)
           else:
               add_row_num(db, insert_out_pkeys) # for joining with input pkeys
               log_debug('Getting input pkeys for rows in insert')
               run_query_into(db, *mk_main_select(input_joins, [in_pkey]),
                   into=insert_in_pkeys)
               add_row_num(db, insert_in_pkeys) # for joining with output pkeys
               assert table_row_count(db, insert_out_pkeys) == table_row_count(db,
                   insert_in_pkeys)
               log_debug('Joining together output and input pkeys')
               pkey_joins = [insert_in_pkeys, sql_gen.Join(insert_out_pkeys,
                   {row_num_col: sql_gen.join_same_not_null})]
               insert_into_pkeys(pkey_joins, pkeys_names)
           log_debug('Adding pkey on returned pkeys table to enable fast joins')
           index_pkey(db, pkeys)
           log_debug("Setting missing rows' pkeys to NULL")
           missing_rows_joins = input_joins+[sql_gen.Join(pkeys,
               {in_pkey: sql_gen.join_same_not_null}, sql_gen.filter_out)]
               # must use join_same_not_null or query will take forever
           insert_into_pkeys(missing_rows_joins,
               [in_pkey_col, sql_gen.NamedCol(out_pkey, None)])
           # Every row of the flattened input table must now have a pkeys entry
           assert table_row_count(db, pkeys) == table_row_count(db, in_table)
           return sql_gen.Col(out_pkey, pkeys)
       ##### Data cleanup
       def cleanup_table(db, table, cols):
           '''Runs one UPDATE that, for each given column, trims the value on both
           sides and replaces it with NULL when the trimmed value equals the empty
           string or the two-character sequence backslash-N.
           @param cols Names of the columns to clean; each is escaped with esc_name()
           '''
           table_str = sql_gen.as_Table(table).to_str(db)
           escaped = [esc_name(db, name) for name in cols]
           assignments = []
           for col in escaped:
               assignments.append('\n'+col
                   +' = nullif(nullif(trim(both from '+col+"), %(null0)s), %(null1)s)")
           query = 'UPDATE '+table_str+' SET\n'+',\n'.join(assignments)
           run_query(db, query, dict(null0='', null1=r'\N'))

(23-23/35)

Project

General

Profile