Project

General

Profile

1 7844 aaronmk
CREATE OR REPLACE FUNCTION score_ok(score double precision)
2
  RETURNS boolean AS
3
$BODY$
4
SELECT $1 >= 0.8
5
$BODY$
6 9984 aaronmk
  LANGUAGE sql IMMUTABLE STRICT
7 7844 aaronmk
  COST 100;
8
9 9985 aaronmk
CREATE OR REPLACE FUNCTION family_is_homonym(family text)
10
  RETURNS boolean AS
11
$BODY$
12
SELECT EXISTS(SELECT * FROM "IRMNG".family_homonym_epithet WHERE "taxonNameOrEpithet" = $1)
13
$BODY$
14 9986 aaronmk
  LANGUAGE sql STABLE STRICT
15 9985 aaronmk
  COST 100;
16
17
CREATE OR REPLACE FUNCTION genus_is_homonym(genus text)
18
  RETURNS boolean AS
19
$BODY$
20
SELECT EXISTS(SELECT * FROM "IRMNG".genus_homonym_epithet WHERE "taxonNameOrEpithet" = $1)
21
$BODY$
22 9986 aaronmk
  LANGUAGE sql STABLE STRICT
23 9985 aaronmk
  COST 100;
24
25 9756 aaronmk
/* IMPORTANT: when changing this table's schema, you must regenerate data.sql:
26
$ <this_file>/../test_taxonomic_names/test_scrub
27 9990 aaronmk
you must also make the same changes on vegbiendev: e.g.
28 9994 aaronmk
$ make backups/TNRS.backup-remake # runtime: 3 min ("real 2m44.277s")
29 9990 aaronmk
SELECT util.set_col_types('"TNRS".tnrs', ARRAY[
30
  ('col', 'new_type')
31
]::util.col_cast[]);
32
runtime: 4 min ("242100 ms")
33 9756 aaronmk
*/
34 5183 aaronmk
CREATE TABLE tnrs
35 5110 aaronmk
(
36 9905 aaronmk
  "Time_submitted" timestamp with time zone NOT NULL DEFAULT now(),
37 9908 aaronmk
  "Name_number" integer NOT NULL,
38 5110 aaronmk
  "Name_submitted" text NOT NULL,
39 9913 aaronmk
  "Overall_score" double precision,
40 5110 aaronmk
  "Name_matched" text,
41
  "Name_matched_rank" text,
42 9913 aaronmk
  "Name_score" double precision,
43 5110 aaronmk
  "Name_matched_author" text,
44
  "Name_matched_url" text,
45
  "Author_matched" text,
46 9913 aaronmk
  "Author_score" double precision,
47 5110 aaronmk
  "Family_matched" text,
48 9913 aaronmk
  "Family_score" double precision,
49 9529 aaronmk
  "Name_matched_accepted_family" text,
50 5110 aaronmk
  "Genus_matched" text,
51 9913 aaronmk
  "Genus_score" double precision,
52 5110 aaronmk
  "Specific_epithet_matched" text,
53 9913 aaronmk
  "Specific_epithet_score" double precision,
54 5110 aaronmk
  "Infraspecific_rank" text,
55
  "Infraspecific_epithet_matched" text,
56 9913 aaronmk
  "Infraspecific_epithet_score" double precision,
57 5110 aaronmk
  "Infraspecific_rank_2" text,
58
  "Infraspecific_epithet_2_matched" text,
59 9913 aaronmk
  "Infraspecific_epithet_2_score" double precision,
60 5110 aaronmk
  "Annotations" text,
61
  "Unmatched_terms" text,
62
  "Taxonomic_status" text,
63
  "Accepted_name" text,
64
  "Accepted_name_author" text,
65
  "Accepted_name_rank" text,
66
  "Accepted_name_url" text,
67 9762 aaronmk
  "Accepted_name_species" text,
68
  "Accepted_name_family" text,
69 5110 aaronmk
  "Selected" text,
70
  "Source" text,
71
  "Warnings" text,
72
  "Accepted_name_lsid" text,
73 7133 aaronmk
  "Accepted_scientific_name" text,
74 7293 aaronmk
  "Max_score" double precision,
75 9972 aaronmk
  "Is_homonym" boolean,
76 9763 aaronmk
  "Is_plant" boolean,
77 9907 aaronmk
  CONSTRAINT tnrs_pkey PRIMARY KEY ("Time_submitted" , "Name_number" ),
78 9906 aaronmk
  CONSTRAINT "tnrs_Name_submitted_key" UNIQUE ("Name_submitted" )
79 5110 aaronmk
)
80
WITH (
81
  OIDS=FALSE
82
);
83 7132 aaronmk
84 7844 aaronmk
CREATE UNIQUE INDEX tnrs_score_ok
85
  ON tnrs
86
  USING btree
87
  ("Name_submitted" )
88
  WHERE score_ok("Max_score");
89
90 9765 aaronmk
/* IMPORTANT: when changing this function, you must regenerate the derived cols:
91 9974 aaronmk
UPDATE "TNRS".tnrs SET "Name_submitted" = "Name_submitted"
92 9992 aaronmk
runtime: 16 min ("5052509 rows affected, 980692 ms execution time")
93 9993 aaronmk
VACUUM ANALYZE "TNRS".tnrs -- to remove previous rows
94
runtime: 50 s ("48264 ms")
95 9765 aaronmk
*/
96 9512 aaronmk
CREATE OR REPLACE FUNCTION tnrs_populate_fields()
97 7134 aaronmk
  RETURNS trigger AS
98
$BODY$
99 9763 aaronmk
DECLARE
100
    "Specific_epithet_is_plant" boolean :=
101
        (CASE
102
        WHEN   new."Infraspecific_epithet_matched"   IS NOT NULL
103
            OR new."Infraspecific_epithet_2_matched" IS NOT NULL
104 9914 aaronmk
            OR new."Specific_epithet_score" >= 0.9 -- fuzzy match
105 9763 aaronmk
            THEN true
106
        ELSE NULL -- ambiguous
107
        END);
108 9982 aaronmk
    never_homonym boolean = COALESCE(new."Author_score" >= 0.6, false); -- author disambiguates
109 9988 aaronmk
    family_is_homonym boolean = NOT never_homonym AND "TNRS".family_is_homonym(new."Family_matched");
110
    genus_is_homonym  boolean = NOT never_homonym AND "TNRS".genus_is_homonym( new."Genus_matched" );
111 7134 aaronmk
BEGIN
112 7848 aaronmk
    new."Accepted_scientific_name" = NULLIF(concat_ws(' '
113 9762 aaronmk
        , NULLIF(NULLIF(new."Accepted_name_family", 'Unknown'), new."Accepted_name")
114 7134 aaronmk
        , new."Accepted_name"
115
        , new."Accepted_name_author"
116 7848 aaronmk
    ), '');
117 7293 aaronmk
    new."Max_score" = GREATEST(
118 9914 aaronmk
          new."Overall_score"
119
        , new."Family_score"
120
        , new."Genus_score"
121
        , new."Specific_epithet_score"
122 7293 aaronmk
    );
123 9987 aaronmk
    new."Is_homonym" = COALESCE(genus_is_homonym, family_is_homonym);
124 9763 aaronmk
    new."Is_plant" = (CASE
125 9973 aaronmk
        WHEN new."Family_score" = 1 AND NOT family_is_homonym -- exact match
126
            THEN true
127
        ELSE -- consider genus
128 9763 aaronmk
            (CASE
129 9914 aaronmk
            WHEN new."Genus_score" =  1    -- exact match
130 9973 aaronmk
                THEN
131
                (CASE
132
                WHEN NOT genus_is_homonym THEN true
133
                ELSE "Specific_epithet_is_plant"
134
                END)
135 9914 aaronmk
            WHEN new."Genus_score" >= 0.85 -- fuzzy match
136 9763 aaronmk
                THEN "Specific_epithet_is_plant"
137
            ELSE NULL -- ambiguous
138
            END)
139
        END);
140 7134 aaronmk
141
    RETURN new;
142
END;
143
$BODY$
144
  LANGUAGE plpgsql VOLATILE
145
  COST 100;
146
147 9512 aaronmk
CREATE TRIGGER tnrs_populate_fields
148 7134 aaronmk
  BEFORE INSERT OR UPDATE
149
  ON tnrs
150
  FOR EACH ROW
151 9512 aaronmk
  EXECUTE PROCEDURE tnrs_populate_fields();
152 7251 aaronmk
153
154 9759 aaronmk
CREATE OR REPLACE VIEW "MatchedTaxon" AS
155 7823 aaronmk
SELECT
156
  "Time_submitted" AS "*Name_matched.Time_submitted"
157 7830 aaronmk
, "Name_submitted" AS "concatenatedScientificName"
158 7829 aaronmk
, "Name_matched" AS "matchedTaxonName"
159 7823 aaronmk
, "Name_matched_rank" AS "matchedTaxonRank"
160
, "Name_score" AS "*Name_matched.Name_score"
161
, "Name_matched_author" AS "matchedScientificNameAuthorship"
162
, "Name_matched_url" AS "matchedScientificNameID"
163
, "Author_score" AS "*Name_matched.Author_score"
164
, "Family_score" AS "matchedFamilyConfidence_fraction"
165 9762 aaronmk
, COALESCE("Name_matched_accepted_family", "Accepted_name_family") AS "matchedFamily"
166 7831 aaronmk
, "Genus_matched" AS "matchedGenus"
167 7823 aaronmk
, "Genus_score" AS "matchedGenusConfidence_fraction"
168
, "Specific_epithet_matched" AS "matchedSpecificEpithet"
169
, "Specific_epithet_score" AS "matchedSpeciesConfidence_fraction"
170
, "Infraspecific_epithet_matched" AS "matchedInfraspecificEpithet"
171
, "Infraspecific_epithet_score" AS "*Name_matched.Infraspecific_epithet_score"
172
, "Annotations" AS "identificationQualifier"
173
, "Unmatched_terms" AS "morphospeciesSuffix"
174 7833 aaronmk
, "Taxonomic_status" AS "taxonomicStatus"
175 7823 aaronmk
, "Accepted_name" AS "acceptedTaxonName"
176
, "Accepted_name_author" AS "acceptedScientificNameAuthorship"
177
, "Accepted_name_rank" AS "acceptedTaxonRank"
178
, "Accepted_name_url" AS "acceptedScientificNameID"
179 9762 aaronmk
, "Accepted_name_species" AS "*Name_matched.Accepted_name_species"
180
, "Accepted_name_family" AS "acceptedFamily"
181 7823 aaronmk
, "Selected" AS "*Name_matched.Selected"
182 7828 aaronmk
, "Source" AS "*Name_matched.Source"
183 7823 aaronmk
, "Warnings" AS "*Name_matched.Warnings"
184
, "Accepted_name_lsid" AS "*Name_matched.Accepted_name_lsid"
185
, "Accepted_scientific_name" AS "acceptedScientificName"
186
, "Max_score" AS "matchedTaxonConfidence_fraction"
187 9616 aaronmk
FROM tnrs
188
;
189 7823 aaronmk
190 9759 aaronmk
CREATE OR REPLACE VIEW "ValidMatchedTaxon" AS
191
SELECT *
192
FROM "MatchedTaxon"
193
WHERE score_ok("matchedTaxonConfidence_fraction")
194
;
195
196 7823 aaronmk
CREATE OR REPLACE VIEW "AcceptedTaxon" AS
197
SELECT
198
  "Time_submitted" AS "*Accepted_name.Time_submitted"
199
, "Name_submitted" AS "acceptedScientificName"
200
, "Genus_matched" AS "acceptedGenus"
201
, "Specific_epithet_matched" AS "acceptedSpecificEpithet"
202
, "Infraspecific_epithet_matched" AS "acceptedInfraspecificEpithet"
203
, "Taxonomic_status" AS "acceptedTaxonomicStatus"
204
, "Selected" AS "*Accepted_name.Selected"
205
, "Source" AS "*Accepted_name.Source"
206
, "Warnings" AS "*Accepted_name.Warnings"
207
, "Accepted_name_lsid" AS "*Accepted_name.Accepted_name_lsid"
208 9760 aaronmk
FROM tnrs
209
;
210 7823 aaronmk
211
CREATE OR REPLACE VIEW "ScrubbedTaxon" AS
212
SELECT *
213 9758 aaronmk
FROM "ValidMatchedTaxon"
214 7823 aaronmk
NATURAL LEFT JOIN "AcceptedTaxon"
215
;