Postgres queries

Debugging

debug-print

LANGUAGE sql:

SELECT util.raise('NOTICE', '>>>'||expr);

LANGUAGE plpgsql:

PERFORM util.raise('NOTICE', '>>>'||expr);

debug-print caught exception

PERFORM util.raise('NOTICE', '>>>'||SQLERRM);

EXPLAIN from within function

requires the BIEN util schema

PERFORM util.explain2table($$
query
$$);

Programming constructs

anyelement wrapper around text function

CREATE OR REPLACE FUNCTION function(value anyelement)
  RETURNS anyelement AS
$BODY$
DECLARE
    result value%TYPE := function(value::text)::unknown;
BEGIN
    RETURN result;
END;
$BODY$
  LANGUAGE plpgsql IMMUTABLE
  COST 100;

comma operator

-- (a, b, c) -> c
(SELECT c FROM (SELECT a, b) s)
-- (a, b, void) -> void
(SELECT NULL::void FROM (SELECT
  a
, b
) s)

duplicates

SELECT column, COUNT(*) FROM table GROUP BY column HAVING COUNT(*) > 1 ORDER BY column
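
a self-contained example (the VALUES list stands in for the table):

SELECT col, COUNT(*) FROM (VALUES (1), (2), (2), (3), (3), (3)) v (col) GROUP BY col HAVING COUNT(*) > 1 ORDER BY col
-- returns (2, 2) and (3, 3)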

exception block

exception_type: see PostgreSQL Error Codes

BEGIN
    ...
EXCEPTION
WHEN exception_type THEN ...;
WHEN exception_type THEN NULL; -- suppress
WHEN   exception_type
    OR exception_type
    THEN NULL;
WHEN exception_type THEN -- selective suppress
    IF SQLERRM LIKE ... THEN NULL;
    ELSE RAISE; -- rethrow
    END IF;
WHEN OTHERS THEN ...;
END;
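
eg. a sketch that suppresses only duplicate-key errors on a particular constraint (the table and constraint names are made up):

BEGIN
    INSERT INTO parts (part_id) VALUES (1);
EXCEPTION
WHEN unique_violation THEN -- selective suppress
    IF SQLERRM LIKE '%parts_pkey%' THEN NULL;
    ELSE RAISE; -- rethrow
    END IF;
END;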

EXISTS()

SELECT NOT EXISTS( -- there is no row that is != _
SELECT NULL
FROM _
WHERE _ IS DISTINCT FROM _
LIMIT 1
);
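
eg. to check that every row of a column equals 1 (toy VALUES list; note that a NULL row counts as != 1):

SELECT NOT EXISTS( -- there is no row that is != 1
SELECT NULL
FROM (VALUES (1), (1), (NULL::integer)) v (col)
WHERE col IS DISTINCT FROM 1
LIMIT 1
);
-- returns false, because of the NULL row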

expression substitution

EXPLAIN can be used to substitute constants, rename columns, and simplify expressions. unlike the equivalent Python code for renaming and simplifying, this is fully general and uses only Postgres.

the simplify-only case (where all vars are table columns) is implemented in util.canon_sql()

eg. col_old + (const + 1), with const = 1 and col_old renamed to col_new, simplifies to col_new + 2:

-- the following code requires: renamings, constants, column types, result type

EXPLAIN
-- declare types (use WITH so original names *won't* be inlined, and to avoid `types.` prefix)
-- note that these must use the *new* column names
WITH types AS -- if WITH not supported, use temp table
(
    SELECT
      NULL::/*result type:*/integer AS result_type_null -- when using only table columns, use $$||util.typeof(expr_sql, NULL::table)||$$ for the type
    , /*col types:*/ NULL::/*col_old_type:*/integer AS col_new
    /*, (NULL::table).table_col_old AS table_col_new*/
    /*...*/
)
SELECT *
FROM
-- specify renamings and constants (use nested SELECT so they *will* be inlined)
-- with no renamings or constants, can also use just `types`
(
    SELECT
    -- with no renamings, can also use `*, constants...`
      result_type_null
    , /*renamings:*/ col_new AS col_old -- note reverse order, because we want to substitute col_new for col_old
    , /*constants:*/ 1 AS const -- to use a var from the surrounding code, use $$||util.quote_typed(var)||$$ instead of just var
    /*...*/
    FROM types
) s
WHERE (/*expr:*/ col_old + (const + 1) ) IS DISTINCT FROM result_type_null
    -- can't just use IS NOT NULL, or even `= result_type_null`, because these will simplify NULL exprs too far
;
-- then extract Filter expression from EXPLAIN output: (col_new + 2) = result_type_null
-- then extract result expression from Filter expression: col_new + 2
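
a minimal runnable check of the technique, using a scratch temp table in place of the types CTE (the table and column names here are made up):

CREATE TEMP TABLE expr_test (col_new integer);

EXPLAIN
SELECT *
FROM
(
    SELECT
      NULL::integer AS result_type_null
    , col_new AS col_old
    , 1 AS const
    FROM expr_test
) s
WHERE (col_old + (const + 1)) IS DISTINCT FROM result_type_null
;
-- the plan's Filter line contains the simplified expression, eg.
--   Filter: ((col_new + 2) IS DISTINCT FROM NULL::integer)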

for-loop

DECLARE
    loop_var text = NULL;
BEGIN
    FOR loop_var IN
        SELECT ...
    LOOP
        RETURN QUERY
            -- ...
        ;
    END LOOP;
END;
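
a self-contained sketch of the pattern in a set-returning function (the function and its args are made up):

CREATE OR REPLACE FUNCTION repeat_each(vals integer[], times integer)
  RETURNS SETOF integer AS
$BODY$
DECLARE
    val integer = NULL;
BEGIN
    FOR val IN
        SELECT unnest(vals)
    LOOP
        RETURN QUERY
            SELECT val FROM generate_series(1, times)
        ;
    END LOOP;
    RETURN;
END;
$BODY$
  LANGUAGE plpgsql IMMUTABLE;

SELECT * FROM repeat_each(ARRAY[1, 2], 3); -- 1, 1, 1, 2, 2, 2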

FULL JOIN

SELECT * FROM
(VALUES (1), (2), (4)) v1 (col)
FULL JOIN
(VALUES (1), (3), (4)) v2 (col)
ON v1.col = v2.col -- can't use USING because that includes only the left column in the result
ORDER BY COALESCE(v1.col, v2.col) -- otherwise, the v2-only rows will come after all v1 rows
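
result (the COALESCE keeps the v2-only row for 3 in sorted position):

-- col | col
--   1 |   1
--   2 |
--     |   3
--   4 |   4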

hstore a column list

unlike hstore(ROW(col1, col2)), this preserves the column names and can even apply aliases:

SELECT
(SELECT hstore(s) FROM (SELECT col1, col2 AS col2_alias, value AS col3) s) AS row_hstore
FROM table

if-statement in PL/pgSQL

IF cond THEN
    -- ...
ELSE
    -- ...
END IF;

if-statement in SQL

SELECT CASE WHEN _
THEN util.coalesce( -- perform both statements
      _::text
    , _::text
)::void
END

inlining

to test how an IMMUTABLE function will be inlined:

SELECT * FROM (VALUES (NULL)) v (c)
WHERE
func(c)
IS NOT NULL
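
eg. with a trivial IMMUTABLE sql-language function (add_one is made up), EXPLAIN the test query to see whether the call was expanded:

CREATE OR REPLACE FUNCTION add_one(value integer)
  RETURNS integer AS
$BODY$
SELECT $1 + 1
$BODY$
  LANGUAGE sql IMMUTABLE;

EXPLAIN
SELECT * FROM (VALUES (NULL::integer)) v (c) -- typed NULL so add_one(c) resolves
WHERE
add_one(c)
IS NOT NULL
;
-- if the function was inlined, the Filter shows the expanded body,
-- something like ((column1 + 1) IS NOT NULL), instead of the call add_one(c)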

in-place update

  • with UPDATE/VACUUM:
    UPDATE table SET column = expr(column);
    VACUUM ANALYZE VERBOSE table; -- needed because autovacuum sometimes doesn't run
    
  • with ALTER COLUMN TYPE USING:
    ALTER TABLE table ALTER COLUMN column TYPE curr_type USING expr(column);
    
    note that ALTER columns can't use concurrent ALTER columns' results:
    CREATE TEMP TABLE abc (a text, b text);
    INSERT INTO abc VALUES ('a', 'b');
    
    ALTER TABLE abc
      ALTER COLUMN a TYPE text USING 'x'
    , ALTER COLUMN b TYPE text USING a
    ;
    SELECT * FROM abc;
    -- "x";"a" not "x";"x" as expected
    

is more complete than

the operator contains ? to indicate that you are comparing the NULLity of the values

CREATE OR REPLACE FUNCTION is_more_complete_than("left" anyelement, "right" anyelement)
  RETURNS boolean AS
$BODY$
SELECT $1 IS NOT DISTINCT FROM $2 OR ($1 IS NOT NULL AND $2 IS NULL)
$BODY$
  LANGUAGE sql IMMUTABLE
  COST 100;

CREATE OPERATOR ?>=(
  PROCEDURE = is_more_complete_than,
  LEFTARG = anyelement,
  RIGHTARG = anyelement);
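
usage (both operands must have the same type, since the args are anyelement):

SELECT 1 ?>= NULL::integer; -- true: a non-NULL value is at least as complete as NULL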

is populated more often than

the operator contains ? to indicate that you are comparing the NULLity of the values, and * to indicate a fuzzy match (similar to ~* indicating case-insensitive regexp matching)

CREATE OR REPLACE FUNCTION is_populated_more_often_than("left" anyelement, "right" anyelement)
  RETURNS boolean AS
$BODY$
SELECT ($1 IS NOT NULL) >= ($2 IS NOT NULL) -- true > false (parenthesized so the IS NOT NULL tests bind before >=)
$BODY$
  LANGUAGE sql IMMUTABLE
  COST 100;

CREATE OPERATOR ?*>=(
  PROCEDURE = is_populated_more_often_than,
  LEFTARG = anyelement,
  RIGHTARG = anyelement);
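
usage:

SELECT NULL::integer ?*>= 1; -- false: NULL is not populated at least as often as a non-NULL value
SELECT 1 ?*>= 1; -- true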

maintaining derived-column formulas across column renames

  1. use a CHECK constraint, which will be auto-updated with column renames:
    -- the following code requires: table, col, expr
    
    SELECT util.derived_col_update((('table', 'col'), $$expr$$));
    -- if table has data:
    SELECT util.derived_cols_populate('table');
    
  2. whenever columns are renamed, update the trigger function to use the CHECK constraint's updated formula:
    SELECT util.derived_cols_update('table');
    

nested transaction

should not be necessary, because an error will always abort and roll back all transactions that don't catch it (including any single-command autocommit transaction). because of this, a nested transaction is not automatically created for each function.

BEGIN -- nested transaction
    ...
EXCEPTION
WHEN SQLSTATE /*successful_completion*/'00000' THEN NULL; -- need at least one WHEN clause
END;

query plan fixing

use these commands if a query plan is incorrect:

-- uncomment additional options as needed
SET enable_seqscan = off;
SET join_collapse_limit = 1; -- turn it off
--SET enable_sort = off;
--SET enable_mergejoin = off;
--SET enable_hashjoin = off;
/*query*/
RESET enable_hashjoin;
RESET enable_mergejoin;
RESET enable_sort;
RESET join_collapse_limit;
RESET enable_seqscan;

sandbox block

BEGIN
    __

    -- roll back DB changes
    PERFORM util.raise('ROLBK', 'sandbox block finished and rolled back');
EXCEPTION
WHEN SQLSTATE 'ROLBK' THEN NULL;
END;
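
without the BIEN util schema, the same pattern can be written with plain RAISE (a sketch; the DO wrapper just makes it runnable standalone):

DO $$
BEGIN
    -- DB changes to test go here

    -- roll back DB changes
    RAISE EXCEPTION SQLSTATE 'ROLBK' USING MESSAGE = 'sandbox block finished and rolled back';
EXCEPTION
WHEN SQLSTATE 'ROLBK' THEN NULL;
END$$;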

vars for use in SELECT exprs

SELECT expr(var)
FROM (SELECT _ AS var) s
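
eg. to avoid repeating a computed value:

SELECT var * 2 AS doubled, var * var AS squared
FROM (SELECT 7 AS var) s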

void-returning sql-language functions

SELECT func(_)
FROM inputs
;
SELECT NULL::void; -- don't fold away functions called in previous query

Profiling

function in for loop

SELECT function(v) FROM unnest(array_fill(arg_value, array[iter_ct])) f (v)
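
eg. to profile md5() over 100000 iterations (count() just keeps the output small; \timing is a psql command):

\timing on
SELECT count(md5(v)) FROM unnest(array_fill('test'::text, array[100000])) f (v);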

DB structure

Query columns for pgAdmin export

Need at least one data row for pgAdmin to enable CSV export

SELECT *
FROM
(
    SELECT *, NULL::text AS _
    FROM <query>
    LIMIT 0
) q
RIGHT JOIN (VALUES (NULL)) v (_) USING (_)

Array starting index

SELECT array_lower(field, 1) FROM table WHERE field IS NOT NULL LIMIT 1

pg_temp schemas in use

  1. query:
    SELECT ('"'||schemaname||'"."'||tablename||'"')::regclass AS qual_table
    FROM pg_tables
    WHERE NOT schemaname = ANY (ARRAY[]::text[])
    
  2. If you get an error of the form:
    ERROR:  permission denied for schema pg_temp_#
    
    add this schema inside ARRAY[] above and re-run the query until there are no more errors
  3. The contents of ARRAY[] above are the pg_temp schemas in use

Adding covering indexes on foreign keys

  1. Run the following query (modified from Postgresql: Indexes on Foreign Keys):
    SELECT conrelid::regclass
             ,conname
             ,reltuples::bigint
        FROM pg_constraint
             JOIN pg_class ON (conrelid = pg_class.oid)
       WHERE contype = 'f'
             AND NOT EXISTS (
               SELECT 1
                 FROM pg_index
                WHERE indrelid = conrelid
                      AND conkey[1] = indkey[0] AND indpred IS NULL
             )
             AND reltuples > 0 -- limit results to only tables in use
    ORDER BY reltuples DESC
    ;
    
  2. For each result, add a covering index: CREATE INDEX ON table (field);

SQL_ASCII

CREATE DATABASE database WITH OWNER owner TEMPLATE template0 ENCODING 'SQL_ASCII' LC_COLLATE 'C' LC_CTYPE 'C';

UTF8 takes about 1/3 longer than SQL_ASCII:
129min UTF8 / (5929.75s * 1min/60s = 98.8min) SQL_ASCII = 1.3, ie. roughly 1/3 longer for UTF8

  • UTF8:
    Read 53310244 row(s)
    6889.27user 225.13system 2:09:04elapsed 91%CPU (0avgtext+0avgdata 51824maxresident)k
    0inputs+0outputs (0major+7311minor)pagefaults 0swaps
    
  • SQL_ASCII:
    (after CREATE DATABASE ... SQL_ASCII above)
    rm=1 t ./inputs/GBIF/raw_occurrence_record/run load_data 
         7309.16 real      4581.97 user        32.80 sys
    Read 53310244 row(s)
         5929.75 real      4580.94 user        30.81 sys
    

Table stats

Highest value of integer index column

  • also a fast way to get count(*) using a row number column
SELECT row_num
FROM joins
ORDER BY row_num DESC
LIMIT 1

DDL

Change fkeys to ON DELETE CASCADE on live DB

  1. in DDL export, join each ALTER TABLE ... ADD CONSTRAINT onto a single line: replace text (including the preceding newline and indentation)
    
        ADD CONSTRAINT
    
    with
     ADD CONSTRAINT
    
  2. grep -F "FOREIGN KEY" schema_export.sql >changes.sql
  3. In changes.sql, replace regexp
    ^ALTER TABLE ONLY (\w+) ADD CONSTRAINT (\w+) FOREIGN KEY \((\w+)\) REFERENCES .*(?=;$)
    
    with
    
    ALTER TABLE $1 DROP CONSTRAINT $2;
    $0 ON UPDATE CASCADE ON DELETE CASCADE
    

make COMMENTs start on their own line

makes sure every COMMENT starts and ends on its own line, so that it appears correctly in the formats it's most likely to be read in (ie. in the DDL export, not the COMMENT edit box in pgAdmin)

in DDL export:

  1. replace regexp
    ^(COMMENT .*? IS '(?!$))((?s).*?';)$
    
    with
    $1
    $2
    
  2. replace regexp
    ^(COMMENT .*? IS '(?s:.*?(.)))(';)$
    
    with BeanShell snippet
    _2.equals("\n") ? _0 : _1+"\n"+_3
    

wrap function call in nested SELECT

needed when Postgres incorrectly does not constant-fold (inline) a function, leading to a slowdown if the function is therefore run many times

WARNING: not idempotent
WARNING: does not handle nested () correctly. these need to be manually reviewed and fixed.

replace vars below: [function], [schema]

  1. replace regexp
    \b(?:[schema]\.)?[function]\(\S*?\)
    
    with
    (SELECT $0)
    
  2. search for text
    [function]
    
    1. for each occurrence, check that nested () have been handled correctly

Data import

INSERT/ON DUPLICATE KEY SELECT

COPY FROM column list verification

note that this approach does not require a separate programming language to parse the CSV header row, as is currently done

  1. COPY FROM just the header row into a temp table of text columns, whose names are the same as the columns of the table to insert into
  2. in the temp table's first row, check that the values match the column names:
    1. get an array of the temp table's column names
    2. create a SQL statement that makes an array of each column in the temp table, using the column names list above:
      SELECT ARRAY[col1::text, col2::text, ...] FROM table
      
    3. run this SQL statement to produce a text array
    4. compare the two arrays
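
a sketch of steps 2.1-2.4 in PL/pgSQL, assuming the header row has already been COPYed into a temp table named header_row (step 1):

DO $$
DECLARE
    col_names text[];
    header_values text[];
BEGIN
    -- 2.1: the temp table's column names, in order
    SELECT array_agg(attname::text ORDER BY attnum) INTO col_names
    FROM pg_attribute
    WHERE attrelid = 'pg_temp.header_row'::regclass AND attnum > 0 AND NOT attisdropped;
    -- 2.2/2.3: build and run a statement that makes an array of the first row's values
    EXECUTE 'SELECT ARRAY['||(
        SELECT string_agg(quote_ident(attname)||'::text', ', ' ORDER BY attnum)
        FROM pg_attribute
        WHERE attrelid = 'pg_temp.header_row'::regclass AND attnum > 0 AND NOT attisdropped
    )||'] FROM pg_temp.header_row' INTO header_values;
    -- 2.4: compare the two arrays
    IF header_values IS DISTINCT FROM col_names THEN
        RAISE 'CSV header % does not match the column names %', header_values, col_names;
    END IF;
END$$;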

Data maintenance

merging pkeys which are referenced by fkeys

to merge pkey 2 into pkey 1:

  1. defer the pkey constraint
  2. rename pkey 1 to pkey 2
    • all fkeys of both pkeys will now use pkey 2
    • there are now two copies of pkey 2 in the main table
  3. rename one of the copies of pkey 2 to pkey 1, restricting the UPDATE to a single copy (eg. via a ctid subquery with LIMIT 1)
    • all fkeys will now use pkey 1, because the UPDATE cascades even though it is applied to only one copy of pkey 2
    • the pkey 2 entry now has no fkeys pointing to it
  4. delete pkey 2
    • because there are no fkeys pointing to it, the delete will not cascade
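
a toy walk-through of the steps above (the tables and key values are made up):

CREATE TEMP TABLE main (id integer PRIMARY KEY DEFERRABLE);
CREATE TEMP TABLE child (main_id integer REFERENCES main (id) ON UPDATE CASCADE);
INSERT INTO main VALUES (1), (2);
INSERT INTO child VALUES (1), (2);

BEGIN;
SET CONSTRAINTS ALL DEFERRED; -- 1. defer the pkey constraint
UPDATE main SET id = 2 WHERE id = 1; -- 2. rename pkey 1 to pkey 2 (cascades to child)
UPDATE main SET id = 1
WHERE ctid = (SELECT ctid FROM main WHERE id = 2 LIMIT 1); -- 3. rename one copy back to pkey 1 (also cascades)
DELETE FROM main WHERE id = 2; -- 4. delete pkey 2 (nothing references it anymore)
COMMIT;

SELECT * FROM child; -- both rows now reference pkey 1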

DB info

Data directory path

SHOW data_directory;

log file searching

sudo -s
grep -hE 'pattern' /var/lib/postgresql/9.*/main/pg_log/postgresql-[date]-*

datatype support for hash indexes

CREATE TEMP TABLE t (col datatype);
CREATE INDEX ON t USING hash (col);
-- if error, no hash support for that datatype
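
eg. point has no equality operator, so no hash support:

CREATE TEMP TABLE t (col point);
CREATE INDEX ON t USING hash (col);
-- ERROR:  data type point has no default operator class for access method "hash"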

Disk usage

Total disk usage

in Ubuntu:

(cd /var/lib/postgresql/9.*/; sudo du --summarize --human-readable main/base/)

temporary disk usage

the temporary space mechanism is Postgres's alternative to using the system swap space, which usually has a system limit that's less than the available disk space

in Ubuntu:

Postgres 9.3

WARNING: this also includes permanent disk space, in addition to temporary space used by a disk-intensive operation

note that listing just pgsql_tmp is no longer sufficient to see all the temporary space, because a lot of it is now stored in the database's base/ subfolder

(cd /var/lib/postgresql/9.*/; sudo du --max-depth=1 --all --human-readable main/base/pgsql_tmp/) # temporary space for just temp tables
(cd /var/lib/postgresql/9.*/; sudo du --max-depth=1 --all --human-readable main/base/*/) # temporary space for permanent tables (as well as permanent space)

Postgres 9.1

(cd /var/lib/postgresql/9.*/; sudo du --max-depth=1 --all --human-readable main/base/pgsql_tmp/) # temporary space for temp tables, and possibly also permanent tables

Schema sizes

SELECT
schema
, pg_size_pretty(total) AS total
, pg_size_pretty(relation) AS relation
, pg_size_pretty(indexes) AS indexes
FROM
(
    SELECT
    schema
    , sum(pg_total_relation_size(qual_table))::bigint AS total
    , sum(pg_relation_size(qual_table))::bigint AS relation
    , sum(pg_indexes_size(qual_table))::bigint AS indexes
    FROM
    (
        SELECT
        schemaname AS schema
        , tablename AS table
        , ('"'||schemaname||'"."'||tablename||'"')::regclass AS qual_table
        FROM pg_tables
        WHERE schemaname NOT LIKE 'pg_%'
    ) s
    GROUP BY schema
    ORDER BY total DESC
) s

Index sizes

SELECT
qual_index
, is_unique
, pg_size_pretty(size) AS size
FROM
(
    SELECT
    qual_index
    , is_unique
    , pg_relation_size(qual_index)::bigint AS size
    FROM
    (
        SELECT
        ('"'||nspname||'"."'||relname||'"')::regclass AS qual_index
        , indisunique as is_unique
        FROM pg_index
        JOIN pg_class index ON index.oid = indexrelid
        JOIN pg_namespace ON pg_namespace.oid = relnamespace
        WHERE nspname NOT LIKE 'pg_%'
    ) s
    ORDER BY size DESC
) s

Non-unique index sizes

SELECT
qual_index
, pg_size_pretty(size) AS size
FROM
(
    SELECT
    qual_index
    , pg_relation_size(qual_index)::bigint AS size
    FROM
    (
        SELECT
        ('"'||nspname||'"."'||relname||'"')::regclass AS qual_index
        FROM pg_index
        JOIN pg_class index ON index.oid = indexrelid
        JOIN pg_namespace ON pg_namespace.oid = relnamespace
        WHERE
        nspname NOT LIKE 'pg_%'
        AND NOT indisunique
    ) s
    ORDER BY size DESC
) s