Revision 12896
Added by Aaron Marcuse-Kubitza almost 11 years ago
trunk/validation/aggregating/specimens/NY/qualitative_validations_source_db_NYBG.VegCore.sql | ||
---|---|---|
1 | 1 |
-- ------------------------------------------------------------------------- |
2 | 2 |
-- Quantitative validation queries against NYBG source db |
3 | 3 |
-- |
4 |
-- Queries table `nybg_raw` in mysql db `bien2_staging` on nimoy.nceas.ucsb.edu |
|
5 |
-- Verbatim import of NYSpecimenDataAmericas.csv, a DwC format dump of |
|
6 |
-- specimen records from NYBG specimen database, except for the three |
|
7 |
-- columns key, IdentifiedDate and CollectedDate, which where added and |
|
8 |
-- populated by Brad after importing raw data. |
|
4 |
-- Queries staging table vegbien."NY"."Ecatalog_all" @vegbiendev.nceas.ucsb.edu |
|
9 | 5 |
-- ------------------------------------------------------------------------- |
10 | 6 |
|
7 |
SET search_path TO "NY"; |
|
8 |
|
|
11 | 9 |
-- ------------------------------- |
12 | 10 |
-- 1. Count of total records (specimens) in source db |
13 | 11 |
-- Check: full join against equivalent query on BIEN3 db should return 1 row |
14 | 12 |
-- ------------------------------- |
15 | 13 |
SELECT COUNT(*) totalSpecimenRecords |
16 |
FROM nybg_raw;
|
|
14 |
FROM "Ecatalog_all";
|
|
17 | 15 |
|
18 | 16 |
-- ------------------------------- |
19 | 17 |
-- 2. Count of unique (verbatim) non-null families |
20 | 18 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row |
21 | 19 |
-- ------------------------------- |
22 |
SELECT COUNT(DISTINCT Family) AS families FROM nybg_raw WHERE Family IS NOT NULL;
|
|
20 |
SELECT COUNT(DISTINCT Family) AS families FROM "Ecatalog_all" WHERE Family IS NOT NULL;
|
|
23 | 21 |
|
24 | 22 |
-- ------------------------------- |
25 | 23 |
-- 3. List of verbatim families |
26 | 24 |
-- Check: Full inner join to equivalent query on BIEN3 db should return same number of rows |
27 | 25 |
-- ------------------------------- |
28 | 26 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species |
29 |
FROM nybg_raw
|
|
27 |
FROM "Ecatalog_all"
|
|
30 | 28 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL; |
31 | 29 |
|
32 | 30 |
-- ------------------------------- |
33 | 31 |
-- 4. Count of unique (verbatim) non-null species, without author |
34 | 32 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row |
35 | 33 |
-- ------------------------------- |
36 |
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species)) AS subspecies FROM nybg_raw
|
|
34 |
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species)) AS subspecies FROM "Ecatalog_all"
|
|
37 | 35 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL; |
38 | 36 |
|
39 | 37 |
-- ------------------------------- |
40 | 38 |
-- 5. List of verbatim species, excluding author |
41 | 39 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 3335 rows |
42 | 40 |
-- ------------------------------- |
43 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS subspecies FROM nybg_raw
|
|
41 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS subspecies FROM "Ecatalog_all"
|
|
44 | 42 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL; |
45 | 43 |
|
46 | 44 |
-- ------------------------------- |
47 | 45 |
-- 6. Count of unique (verbatim) non-null subspecific taxa, without author |
48 | 46 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row |
49 | 47 |
-- ------------------------------- |
50 |
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species, Subspecies)) AS Species FROM nybg_raw
|
|
48 |
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species, Subspecies)) AS Species FROM "Ecatalog_all"
|
|
51 | 49 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL; |
52 | 50 |
|
53 | 51 |
-- ------------------------------- |
... | ... | |
55 | 53 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 40145 rows |
56 | 54 |
-- ------------------------------- |
57 | 55 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species |
58 |
FROM nybg_raw
|
|
56 |
FROM "Ecatalog_all"
|
|
59 | 57 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL; |
60 | 58 |
|
61 | 59 |
-- ------------------------------- |
... | ... | |
66 | 64 |
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))), |
67 | 65 |
TRIM(IFNULL(ScientificNameAuthor,'')) |
68 | 66 |
))) AS fullScientificNames |
69 |
FROM nybg_raw
|
|
67 |
FROM "Ecatalog_all"
|
|
70 | 68 |
WHERE Genus IS NOT NULL; |
71 | 69 |
|
72 | 70 |
-- ------------------------------- |
... | ... | |
77 | 75 |
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))), |
78 | 76 |
TRIM(IFNULL(ScientificNameAuthor,'')) |
79 | 77 |
)) AS fullScientificName |
80 |
FROM nybg_raw
|
|
78 |
FROM "Ecatalog_all"
|
|
81 | 79 |
WHERE Genus IS NOT NULL; |
82 | 80 |
|
83 | 81 |
-- ------------------------------- |
... | ... | |
86 | 84 |
-- Note: Majority should be from 'NY'; these are standard herbarium acronyms |
87 | 85 |
-- ------------------------------- |
88 | 86 |
SELECT InstitutionCode, COUNT(*) AS records |
89 |
FROM nybg_raw
|
|
87 |
FROM "Ecatalog_all"
|
|
90 | 88 |
GROUP BY InstitutionCode; |
91 | 89 |
|
92 | 90 |
-- ------------------------------- |
... | ... | |
96 | 94 |
-- character set issues, either in source db or in BIEN |
97 | 95 |
-- ------------------------------- |
98 | 96 |
SELECT DISTINCT Country, StateProvince, County |
99 |
FROM nybg_raw;
|
|
97 |
FROM "Ecatalog_all";
|
|
100 | 98 |
|
101 | 99 |
-- ------------------------------- |
102 | 100 |
-- 12. Check distinct Collector names + collection numbers + collection dates, |
... | ... | |
105 | 103 |
-- ------------------------------- |
106 | 104 |
SELECT IFNULL(Collector,'') AS collectorName, IFNULL(FieldNumber,'') AS collectionNumber, |
107 | 105 |
IFNULL(CollectedDate,'') AS dataCollected, COUNT(*) AS specimenRecords |
108 |
FROM nybg_raw
|
|
106 |
FROM "Ecatalog_all"
|
|
109 | 107 |
GROUP BY Collector, FieldNumber, CollectedDate |
110 | 108 |
ORDER BY Collector, FieldNumber, CollectedDate; |
111 | 109 |
|
... | ... | |
117 | 115 |
SELECT |
118 | 116 |
( |
119 | 117 |
SELECT COUNT(*) |
120 |
FROM nybg_raw
|
|
118 |
FROM "Ecatalog_all"
|
|
121 | 119 |
WHERE Latitude IS NOT NULL |
122 | 120 |
) AS allLats, |
123 | 121 |
( |
124 | 122 |
SELECT COUNT(*) |
125 |
FROM nybg_raw
|
|
123 |
FROM "Ecatalog_all"
|
|
126 | 124 |
WHERE Latitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$' |
127 | 125 |
) AS decimalLats, |
128 | 126 |
( |
129 | 127 |
SELECT COUNT(*) |
130 |
FROM nybg_raw
|
|
128 |
FROM "Ecatalog_all"
|
|
131 | 129 |
WHERE Longitude IS NOT NULL |
132 | 130 |
) AS allLongs, |
133 | 131 |
( |
134 | 132 |
SELECT COUNT(*) |
135 |
FROM nybg_raw
|
|
133 |
FROM "Ecatalog_all"
|
|
136 | 134 |
WHERE Longitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$' |
137 | 135 |
) AS decimalLongs; |
138 | 136 |
|
... | ... | |
144 | 142 |
SELECT |
145 | 143 |
( |
146 | 144 |
SELECT COUNT(*) |
147 |
FROM nybg_raw
|
|
145 |
FROM "Ecatalog_all"
|
|
148 | 146 |
WHERE Latitude>90 OR Latitude<-90 |
149 | 147 |
) AS badLats, |
150 | 148 |
( |
151 | 149 |
SELECT COUNT(*) |
152 |
FROM nybg_raw
|
|
150 |
FROM "Ecatalog_all"
|
|
153 | 151 |
WHERE Longitude>180 OR Longitude<-180 |
154 | 152 |
) AS badLongs; |
155 | 153 |
|
... | ... | |
158 | 156 |
-- Check: full join to equivalent query against BIEN3 should return 125592 records |
159 | 157 |
-- ------------------------------- |
160 | 158 |
SELECT DISTINCT Locality AS localityDescription |
161 |
FROM nybg_raw
|
|
159 |
FROM "Ecatalog_all"
|
|
162 | 160 |
WHERE Locality IS NOT NULL; |
163 | 161 |
|
164 | 162 |
-- ------------------------------- |
... | ... | |
167 | 165 |
-- Note: specimens descriptions in nybg extract is in column PlantFungusDescription |
168 | 166 |
-- ------------------------------- |
169 | 167 |
SELECT PlantFungusDescription AS specimenDescription |
170 |
FROM nybg_raw
|
|
168 |
FROM "Ecatalog_all"
|
|
171 | 169 |
WHERE PlantFungusDescription IS NOT NULL; |
172 |
|
|
173 |
|
|
174 |
|
|
175 |
|
|
176 |
|
|
177 |
|
|
178 |
|
|
179 |
|
|
180 |
|
|
181 |
|
Also available in: Unified diff
validation/aggregating/specimens/NY/qualitative_validations_source_db_NYBG.VegCore.sql: use our staging tables instead of the BIEN2 MySQL staging tables