SELECT util.search_path_append('util'); SELECT create_if_not_exists($$CREATE INDEX raw_occurrence_record_plants_institution_code ON raw_occurrence_record_plants ("specimen_duplicate_institutions")$$) ; -- runtime: 45 min ("2629095.310 ms") @starscream -- Remove institutions that we have direct data for -- runtime: 3.5 min ("208377.225 ms") @starscream -- rerun time: ~0 ("Time: 2.271 ms") thanks to index (*after* manual vacuuming) DELETE FROM :table WHERE "specimen_duplicate_institutions" IN ( 'ACAD' , 'ARIZ' , 'BRIT' , 'MO' , 'MT' , 'NY' , 'QFA' , 'TEX' , 'TRT' , 'TRTE' , 'U' , 'UBC' , 'WIN' ) /* there are 4.5 million duplicates as of r9855 [1] [1] 59,998,354 rows before [2] - 55,417,646 rows after [3] = 4,580,708 [2] 59998354 SELECT "raw_occurrence_record-row_num" FROM "GBIF".raw_occurrence_record_plants ORDER BY "raw_occurrence_record-row_num" DESC LIMIT 1 Total query runtime: 19436 ms. [3] 55417646 SELECT COUNT(*) FROM "GBIF".raw_occurrence_record_plants Total query runtime: 23 ms. */ /* list obtained using the following on r9459: SELECT DISTINCT dataprovider FROM sourcelist JOIN provider_count ON provider_count.dataprovider = sourcelist.name WHERE source_id = (SELECT source_by_shortname('GBIF')) ORDER BY dataprovider */ ;