Project

General

Profile

1
SELECT util.search_path_append('util');
2

    
3
SELECT create_if_not_exists($$CREATE INDEX raw_occurrence_record_plants_institution_code
4
ON raw_occurrence_record_plants ("specimen_duplicate_institutions")$$)
5
; -- runtime: 45 min ("2629095.310 ms") @starscream
6

    
7
-- Remove institutions that we have direct data for
8
-- runtime: 3.5 min ("208377.225 ms") @starscream
9
-- rerun time: ~0 ("Time: 2.271 ms") thanks to index (*after* manual vacuuming)
10
DELETE FROM :table
11
WHERE "specimen_duplicate_institutions" IN (
12
      'ACAD'
13
    , 'ARIZ'
14
    , 'BRIT'
15
    , 'MO'
16
    , 'MT'
17
    , 'NY'
18
    , 'QFA'
19
    , 'TEX'
20
    , 'TRT'
21
    , 'TRTE'
22
    , 'U'
23
    , 'UBC'
24
    , 'WIN'
25
)
26
/* there are 4.5 million duplicates as of r9855 [1]
27

    
28
[1] 59,998,354 rows before [2] - 55,417,646 rows after [3] = 4,580,708
29

    
30
[2] 59998354
31
SELECT "raw_occurrence_record-row_num"
32
FROM "GBIF".raw_occurrence_record_plants
33
ORDER BY "raw_occurrence_record-row_num" DESC
34
LIMIT 1
35
Total query runtime: 19436 ms.
36

    
37
[3] 55417646
38
SELECT COUNT(*) FROM "GBIF".raw_occurrence_record_plants
39
Total query runtime: 23 ms.
40
*/
41
/* list obtained using the following on r9459:
42
SELECT DISTINCT dataprovider
43
FROM sourcelist
44
JOIN provider_count ON provider_count.dataprovider = sourcelist.name
45
WHERE source_id = (SELECT source_by_shortname('GBIF'))
46
ORDER BY dataprovider
47
*/
48
;
(7-7/11)