1
|
SELECT util.search_path_append('util');
|
2
|
|
3
|
SELECT create_if_not_exists($$CREATE INDEX raw_occurrence_record_plants_institution_code
|
4
|
ON raw_occurrence_record_plants ("specimenHolderInstitutions")$$)
|
5
|
; -- runtime: 45 min ("2629095.310 ms") @starscream
|
6
|
|
7
|
-- Remove institutions that we have direct data for
|
8
|
-- runtime: 3.5 min ("208377.225 ms") @starscream
|
9
|
-- rerun time: ~0 ("Time: 2.271 ms") thanks to index (*after* manual vacuuming)
|
10
|
DELETE FROM :table
|
11
|
WHERE "specimenHolderInstitutions" IN (
|
12
|
'ACAD'
|
13
|
, 'ARIZ'
|
14
|
, 'BRIT'
|
15
|
, 'MO'
|
16
|
, 'MT'
|
17
|
, 'NY'
|
18
|
, 'QFA'
|
19
|
, 'TEX'
|
20
|
, 'TRT'
|
21
|
, 'TRTE'
|
22
|
, 'U'
|
23
|
, 'UBC'
|
24
|
, 'WIN'
|
25
|
)
|
26
|
/* there are 4.5 million duplicates as of r9855 [1]
|
27
|
|
28
|
[1] 59,998,354 rows before [2] - 55,417,646 rows after [3] = 4,580,708
|
29
|
|
30
|
[2] 59998354
|
31
|
SELECT "raw_occurrence_record-row_num"
|
32
|
FROM "GBIF".raw_occurrence_record_plants
|
33
|
ORDER BY "raw_occurrence_record-row_num" DESC
|
34
|
LIMIT 1
|
35
|
Total query runtime: 19436 ms.
|
36
|
|
37
|
[3] 55417646
|
38
|
SELECT COUNT(*) FROM "GBIF".raw_occurrence_record_plants
|
39
|
Total query runtime: 23 ms.
|
40
|
*/
|
41
|
/* list obtained using the following on r9459:
|
42
|
SELECT DISTINCT dataprovider
|
43
|
FROM sourcelist
|
44
|
JOIN provider_count ON provider_count.dataprovider = sourcelist.name
|
45
|
WHERE source_id = source_by_shortname('GBIF')
|
46
|
ORDER BY dataprovider
|
47
|
*/
|
48
|
;
|