Project

General

Profile

1 9826 aaronmk
SELECT util.search_path_append('util');
2
3 11868 aaronmk
SELECT create_if_not_exists($$CREATE INDEX raw_occurrence_record_plants_institution_code
4 12879 aaronmk
ON raw_occurrence_record_plants ("specimen_duplicate_institutions")$$)
5 11868 aaronmk
; -- runtime: 45 min ("2629095.310 ms") @starscream
6 9644 aaronmk
7
-- Remove institutions that we have direct data for
8 11867 aaronmk
-- runtime: 3.5 min ("208377.225 ms") @starscream
9 11887 aaronmk
-- rerun time: ~0 ("Time: 2.271 ms") thanks to index (*after* manual vacuuming)
10 9644 aaronmk
DELETE FROM :table
11 12879 aaronmk
WHERE "specimen_duplicate_institutions" IN (
12 9644 aaronmk
      'ACAD'
13
    , 'ARIZ'
14
    , 'BRIT'
15
    , 'MO'
16
    , 'MT'
17
    , 'NY'
18
    , 'QFA'
19
    , 'TEX'
20
    , 'TRT'
21
    , 'TRTE'
22
    , 'U'
23
    , 'UBC'
24
    , 'WIN'
25
)
26 9857 aaronmk
/* there are 4.5 million duplicates as of r9855 [1]
27 9856 aaronmk
28
[1] 59,998,354 rows before [2] - 55,417,646 rows after [3] = 4,580,708
29
30
[2] 59998354
31
SELECT "raw_occurrence_record-row_num"
32
FROM "GBIF".raw_occurrence_record_plants
33
ORDER BY "raw_occurrence_record-row_num" DESC
34
LIMIT 1
35
Total query runtime: 19436 ms.
36
37
[3] 55417646
38
SELECT COUNT(*) FROM "GBIF".raw_occurrence_record_plants
39
Total query runtime: 23 ms.
40
*/
41 9644 aaronmk
/* list obtained using the following on r9459:
42
SELECT DISTINCT dataprovider
43
FROM sourcelist
44
JOIN provider_count ON provider_count.dataprovider = sourcelist.name
45 12516 aaronmk
WHERE source_id = (SELECT source_by_shortname('GBIF'))
46 9644 aaronmk
ORDER BY dataprovider
47
*/
48
;