#!/usr/bin/env python
# Loads a command's CSV output stream into a PostgreSQL table.
# The command may be run more than once (the load is retried with a different
# approach if the first attempt fails).
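#
# Example invocation (illustrative values; the DB-connection env vars besides
# "engine" are named by sql.db_config_names and may differ):
#   table=plots schema=public engine=... [other db vars] <this script> cat plots.csv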

import csv
import os.path
import re
import subprocess
import sys

sys.path.append(os.path.dirname(__file__)+"/../lib")

import csvs
import exc
import opts
import sql
import streams
import strings
import util

def main():
    # Usage
    env_names = []
    def usage_err():
        raise SystemExit('Usage: '+opts.env_usage(env_names)+' '+sys.argv[0]
            +' input_cmd [args...]')

    # Parse args
    input_cmd = sys.argv[1:]
    if input_cmd == []: usage_err()

    # Get config from env vars
    table = opts.get_env_var('table', None, env_names)
    schema = opts.get_env_var('schema', 'public', env_names)
    db_config = opts.get_env_vars(sql.db_config_names, None, env_names)
    debug = opts.env_flag('debug', False, env_names)
    if not (table != None and 'engine' in db_config): usage_err()

    # Connect to DB
    db = sql.connect(db_config)

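    # use_copy_from is a one-element list so the nested load_() below can
    # modify the flag (a Python 2 function cannot rebind enclosing-scope names)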
    use_copy_from = [True]

    # Loads data into the table using the currently-selected approach.
    def load_():
        # Open input stream
        proc = subprocess.Popen(input_cmd, stdout=subprocess.PIPE, bufsize=-1)
        in_ = proc.stdout

        # Get format info
        info = csvs.stream_info(in_, parse_header=True)
        dialect = info.dialect
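        # TSV input is routed to the INSERT approach: the COPY FROM statement
        # built below asserts a non-TSV (CSV-style) dialect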
        if csvs.is_tsv(dialect): use_copy_from[0] = False
        cols = info.header
        for i, col in enumerate(cols): # replace empty column names
            if col == '': cols[i] = 'column_'+str(i)

        # Select schema and escape names
        def esc_name(name): return sql.esc_name(db, name)
        sql.run_query(db, 'SET search_path TO '+esc_name(schema))
        esc_table = esc_name(table)
        esc_cols = map(esc_name, cols)

        # Create CREATE TABLE statement
        pkey = esc_name(table+'_pkey')
        create_table = 'CREATE TABLE '+esc_table+' (\n'
        create_table += '    row_num serial NOT NULL PRIMARY KEY\n'
        for esc_col in esc_cols: create_table += '    , '+esc_col+' text\n'
        create_table += ');\n'
        if debug: sys.stderr.write(create_table)

        # Create table
        sql.run_query(db, create_table)

        # Create COPY FROM statement
        if use_copy_from[0]:
            cur = db.db.cursor()
            copy_from = ('COPY '+esc_table+' ('+(', '.join(esc_cols))
                +') FROM STDIN DELIMITER %(delimiter)s NULL %(null)s')
            assert not csvs.is_tsv(dialect)
            copy_from += ' CSV'
            if dialect.quoting != csv.QUOTE_NONE:
                copy_from += ' QUOTE %(quotechar)s'
                if dialect.doublequote: copy_from += ' ESCAPE %(quotechar)s'
            copy_from += ';\n'
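            # mogrify() binds the dialect's delimiter/quote characters as
            # escaped SQL literals, producing the final COPY statement string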
            copy_from = cur.mogrify(copy_from, dict(delimiter=dialect.delimiter,
                null='', quotechar=dialect.quotechar))
            if debug: sys.stderr.write(copy_from)

        # Load the data
        line_in = streams.ProgressInputStream(in_, sys.stderr,
            'Processed %d row(s)', n=10000)
        try:
            if use_copy_from[0]:
                sys.stderr.write('Using COPY FROM\n')
                db.db.cursor().copy_expert(copy_from, line_in)
            else:
                sys.stderr.write('Using INSERT\n')
                cols_ct = len(cols)+1 # +1 for row_num
                for row in csvs.make_reader(line_in, dialect):
                    row = map(strings.to_unicode, row)
                    row.insert(0, sql.default) # leave space for autogen row_num
                    util.list_set_length(row, cols_ct) # truncate extra cols
                    sql.insert(db, esc_table, row, table_is_esc=True)
        finally:
            line_in.close() # also closes proc.stdout
            proc.wait()

        # Clean up the data
        sys.stderr.write('Cleaning up table\n')
        sql.cleanup_table(db, esc_table, cols, table_is_esc=True)
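    # Each attempt runs inside a savepoint so that a failed load can be rolled
    # back without aborting the surrounding transaction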
    load = lambda: sql.with_savepoint(db, load_)
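    # Try the COPY FROM approach first; if the database rejects the data, fall
    # back to row-by-row INSERTs, which are slower but handle input that COPY
    # FROM cannot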
    try: load()
    except sql.DatabaseErrors as e:
        if use_copy_from[0]: # first try
            exc.print_ex(e, plain=True)
            use_copy_from[0] = False
            load() # try again with different approach
        else: raise e
    db.db.commit()

main()