Revision 5903
Added by Aaron Marcuse-Kubitza about 12 years ago
lib/sql.py | ||
---|---|---|
1486 | 1486 |
|
1487 | 1487 |
limit = None |
1488 | 1488 |
if distinct_on == []: limit = 1 # one sample row |
1489 |
else: |
|
1490 |
add_index(db, distinct_on, new_table, unique=True) |
|
1491 |
add_index(db, distinct_on, table) # for join optimization |
|
1489 |
else: add_index(db, distinct_on, table) # for join optimization |
|
1492 | 1490 |
|
1493 |
insert_select(db, new_table, None, mk_select(db, table, order_by=None,
|
|
1494 |
limit=limit), ignore=True)
|
|
1491 |
insert_select(db, new_table, None, mk_select(db, table, |
|
1492 |
distinct_on=distinct_on, order_by=None, limit=limit))
|
|
1495 | 1493 |
analyze(db, new_table) |
1496 | 1494 |
|
1497 | 1495 |
return new_table |
Also available in: Unified diff
sql.py: distinct_table(): Use DISTINCT ON, rather than a unique index combined with insert_select()'s ignore mode, to remove duplicate rows. This lets PostgreSQL use whichever sorting method it deems fastest, instead of requiring a B-tree index. Since most of the slower operations in TNRS's import are distinct_table() calls, this should speed up the TNRS import. The TNRS import is a bottleneck for the DB import as a whole, because it must complete before other datasources can be imported.