Revision 10330
Added by Aaron Marcuse-Kubitza over 11 years ago
inputs/REMIB/Specimen/create.sql | ||
---|---|---|
1 |
-- runtime: 7 s ("Time: 6888.394 ms") |
|
2 |
SELECT * |
|
3 |
FROM "Specimen.src" |
|
4 |
WHERE |
|
5 |
acronym IS NOT NULL |
|
6 |
AND coll_year ~ E'^(?:1[7-9]|20)\\d{2}$' |
|
7 |
AND country !~ E'\\d' |
|
8 |
AND (long_deg !~ E'[[:alpha:]]' OR long_deg IN ('RESTRINGIDO')) |
|
1 |
SELECT * FROM "Specimen.src" |
inputs/REMIB/Specimen/postprocess.sql | ||
---|---|---|
2 | 2 |
|
3 | 3 |
SELECT create_if_not_exists($$CREATE INDEX "Specimen.acronym" ON $$||:table_str||$$ (acronym)$$); |
4 | 4 |
|
5 |
SELECT create_if_not_exists($$CREATE INDEX "Specimen.acronym" ON $$||:table_str||$$ (acronym )$$); |
|
6 |
SELECT create_if_not_exists($$CREATE INDEX "Specimen.coll_year" ON $$||:table_str||$$ (coll_year )$$); |
|
7 |
SELECT create_if_not_exists($$CREATE INDEX "Specimen.country_verbatim" ON $$||:table_str||$$ (country_verbatim )$$); |
|
8 |
SELECT create_if_not_exists($$CREATE INDEX "Specimen.longitude_deg_verbatim" ON $$||:table_str||$$ (longitude_deg_verbatim)$$); |
|
9 |
|
|
10 |
-- remove frameshifted rows |
|
11 |
-- rerun time: 6 s ("Time: 5537.211 ms") thanks to index |
|
12 |
DELETE FROM :table |
|
13 |
WHERE |
|
14 |
acronym IS NULL |
|
15 |
OR coll_year !~ E'^(?:1[7-9]|20)\\d{2}$' |
|
16 |
OR country_verbatim ~ E'\\d' |
|
17 |
OR (longitude_deg_verbatim ~ E'[[:alpha:]]' AND longitude_deg_verbatim NOT IN ('RESTRINGIDO')) |
|
18 |
; |
|
19 |
|
|
5 | 20 |
-- Remove institutions that we have direct data for |
6 | 21 |
DELETE FROM :table |
7 | 22 |
WHERE acronym IN ( |
Also available in: Unified diff
inputs/REMIB/Specimen/create.sql: moved filtering out of frameshifted rows to postprocess.sql, where it can happen in an idempotent DELETE. this allows filters to remove additional rows to easily be added on top of the existing filters, without needing to remake Specimen (which takes a long time, because of the many stage I derived columns that get added). the logical inversion inherent in the DELETE condition has been factored through rather than wrapped in NOT (...), because removal of frameshifted rows is more accurately specified as the detection of specific patterns that indicate frameshifting rather than the validation of all fields.