1 |
9826
|
aaronmk
|
SELECT util.search_path_append('util');
|
2 |
|
|
|
3 |
11868
|
aaronmk
|
SELECT create_if_not_exists($$CREATE INDEX raw_occurrence_record_plants_institution_code
|
4 |
12879
|
aaronmk
|
ON raw_occurrence_record_plants ("specimen_duplicate_institutions")$$)
|
5 |
11868
|
aaronmk
|
; -- runtime: 45 min ("2629095.310 ms") @starscream
|
6 |
9644
|
aaronmk
|
|
7 |
|
|
-- Remove institutions that we have direct data for
|
8 |
11867
|
aaronmk
|
-- runtime: 3.5 min ("208377.225 ms") @starscream
|
9 |
11887
|
aaronmk
|
-- rerun time: ~0 ("Time: 2.271 ms") thanks to index (*after* manual vacuuming)
|
10 |
9644
|
aaronmk
|
DELETE FROM :table
|
11 |
12879
|
aaronmk
|
WHERE "specimen_duplicate_institutions" IN (
|
12 |
9644
|
aaronmk
|
'ACAD'
|
13 |
|
|
, 'ARIZ'
|
14 |
|
|
, 'BRIT'
|
15 |
|
|
, 'MO'
|
16 |
|
|
, 'MT'
|
17 |
|
|
, 'NY'
|
18 |
|
|
, 'QFA'
|
19 |
|
|
, 'TEX'
|
20 |
|
|
, 'TRT'
|
21 |
|
|
, 'TRTE'
|
22 |
|
|
, 'U'
|
23 |
|
|
, 'UBC'
|
24 |
|
|
, 'WIN'
|
25 |
|
|
)
|
26 |
9857
|
aaronmk
|
/* there are 4.5 million duplicates as of r9855 [1]
|
27 |
9856
|
aaronmk
|
|
28 |
|
|
[1] 59,998,354 rows before [2] - 55,417,646 rows after [3] = 4,580,708
|
29 |
|
|
|
30 |
|
|
[2] 59998354
|
31 |
|
|
SELECT "raw_occurrence_record-row_num"
|
32 |
|
|
FROM "GBIF".raw_occurrence_record_plants
|
33 |
|
|
ORDER BY "raw_occurrence_record-row_num" DESC
|
34 |
|
|
LIMIT 1
|
35 |
|
|
Total query runtime: 19436 ms.
|
36 |
|
|
|
37 |
|
|
[3] 55417646
|
38 |
|
|
SELECT COUNT(*) FROM "GBIF".raw_occurrence_record_plants
|
39 |
|
|
Total query runtime: 23 ms.
|
40 |
|
|
*/
|
41 |
9644
|
aaronmk
|
/* list obtained using the following on r9459:
|
42 |
|
|
SELECT DISTINCT dataprovider
|
43 |
|
|
FROM sourcelist
|
44 |
|
|
JOIN provider_count ON provider_count.dataprovider = sourcelist.name
|
45 |
12516
|
aaronmk
|
WHERE source_id = (SELECT source_by_shortname('GBIF'))
|
46 |
9644
|
aaronmk
|
ORDER BY dataprovider
|
47 |
|
|
*/
|
48 |
|
|
;
|