1 |
1 |
-- -------------------------------------------------------------------------
|
2 |
2 |
-- Quantitative validation queries against NYBG source db
|
3 |
|
--
|
|
3 |
--
|
4 |
4 |
-- Queries table `nybg_raw` in mysql db `bien2_staging` on nimoy.nceas.ucsb.edu
|
5 |
|
-- Verbatim import of NYSpecimenDataAmericas.csv, a DwC format dump of
|
|
5 |
-- Verbatim import of NYSpecimenDataAmericas.csv, a DwC format dump of
|
6 |
6 |
-- specimen records from NYBG specimen database, except for the three
|
7 |
|
-- columns key, IdentifiedDate and CollectedDate, which where added and
|
8 |
|
-- populated by Brad after importing raw data.
|
|
7 |
-- columns key, IdentifiedDate and CollectedDate, which where added and
|
|
8 |
-- populated by Brad after importing raw data.
|
9 |
9 |
-- -------------------------------------------------------------------------
|
10 |
10 |
|
11 |
11 |
-- -------------------------------
|
... | ... | |
26 |
26 |
-- Check: Full inner join to equivalent query on BIEN3 db should return same number of rows
|
27 |
27 |
-- -------------------------------
|
28 |
28 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species
|
29 |
|
FROM nybg_raw
|
|
29 |
FROM nybg_raw
|
30 |
30 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL;
|
31 |
31 |
|
32 |
32 |
-- -------------------------------
|
33 |
33 |
-- 4. Count of unique (verbatim) non-null species, without author
|
34 |
34 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
|
35 |
35 |
-- -------------------------------
|
36 |
|
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species)) AS subspecies FROM nybg_raw
|
|
36 |
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species)) AS subspecies FROM nybg_raw
|
37 |
37 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL;
|
38 |
38 |
|
39 |
39 |
-- -------------------------------
|
40 |
40 |
-- 5. List of verbatim species, excluding author
|
41 |
41 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 3335 rows
|
42 |
42 |
-- -------------------------------
|
43 |
|
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS subspecies FROM nybg_raw
|
|
43 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS subspecies FROM nybg_raw
|
44 |
44 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL;
|
45 |
45 |
|
46 |
46 |
-- -------------------------------
|
47 |
47 |
-- 6. Count of unique (verbatim) non-null subspecific taxa, without author
|
48 |
48 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
|
49 |
49 |
-- -------------------------------
|
50 |
|
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species, Subspecies)) AS Species FROM nybg_raw
|
|
50 |
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species, Subspecies)) AS Species FROM nybg_raw
|
51 |
51 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL;
|
52 |
52 |
|
53 |
53 |
-- -------------------------------
|
... | ... | |
55 |
55 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 40145 rows
|
56 |
56 |
-- -------------------------------
|
57 |
57 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species
|
58 |
|
FROM nybg_raw
|
|
58 |
FROM nybg_raw
|
59 |
59 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL;
|
60 |
60 |
|
61 |
61 |
-- -------------------------------
|
... | ... | |
63 |
63 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
|
64 |
64 |
-- -------------------------------
|
65 |
65 |
SELECT COUNT(DISTINCT TRIM(CONCAT_WS(' ',
|
66 |
|
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))),
|
|
66 |
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))),
|
67 |
67 |
TRIM(IFNULL(ScientificNameAuthor,''))
|
68 |
68 |
))) AS fullScientificNames
|
69 |
|
FROM nybg_raw
|
|
69 |
FROM nybg_raw
|
70 |
70 |
WHERE Genus IS NOT NULL;
|
71 |
71 |
|
72 |
72 |
-- -------------------------------
|
... | ... | |
74 |
74 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 45997 rows
|
75 |
75 |
-- -------------------------------
|
76 |
76 |
SELECT DISTINCT TRIM(CONCAT_WS(' ',
|
77 |
|
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))),
|
|
77 |
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))),
|
78 |
78 |
TRIM(IFNULL(ScientificNameAuthor,''))
|
79 |
79 |
)) AS fullScientificName
|
80 |
|
FROM nybg_raw
|
|
80 |
FROM nybg_raw
|
81 |
81 |
WHERE Genus IS NOT NULL;
|
82 |
82 |
|
83 |
83 |
-- -------------------------------
|
... | ... | |
95 |
95 |
-- Note: character set issues may cause mis-matches. This query is a good way to reveal
|
96 |
96 |
-- character set issues, either in source db or in BIEN
|
97 |
97 |
-- -------------------------------
|
98 |
|
SELECT DISTINCT Country, StateProvince, County
|
99 |
|
FROM nybg_raw;
|
|
98 |
SELECT DISTINCT Country, StateProvince, County
|
|
99 |
FROM nybg_raw;
|
100 |
100 |
|
101 |
101 |
-- -------------------------------
|
102 |
|
-- 12. Check distinct Collector names + collection numbers + collection dates,
|
|
102 |
-- 12. Check distinct Collector names + collection numbers + collection dates,
|
103 |
103 |
-- plus total records
|
104 |
104 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 309396 rows
|
105 |
105 |
-- -------------------------------
|
106 |
|
SELECT IFNULL(Collector,'') AS collectorName, IFNULL(FieldNumber,'') AS collectionNumber,
|
|
106 |
SELECT IFNULL(Collector,'') AS collectorName, IFNULL(FieldNumber,'') AS collectionNumber,
|
107 |
107 |
IFNULL(CollectedDate,'') AS dataCollected, COUNT(*) AS specimenRecords
|
108 |
108 |
FROM nybg_raw
|
109 |
109 |
GROUP BY Collector, FieldNumber, CollectedDate
|
110 |
110 |
ORDER BY Collector, FieldNumber, CollectedDate;
|
111 |
111 |
|
112 |
112 |
-- -------------------------------
|
113 |
|
-- 13. Count of all verbatim Latitude and Longitude values, as well as all
|
|
113 |
-- 13. Count of all verbatim Latitude and Longitude values, as well as all
|
114 |
114 |
-- latitude and longitude values that are decimals numbers
|
115 |
115 |
-- Check: full join to equivalent query against BIEN3 should return 1 row
|
116 |
116 |
-- -------------------------------
|
117 |
|
SELECT
|
|
117 |
SELECT
|
118 |
118 |
(
|
119 |
119 |
SELECT COUNT(*)
|
120 |
120 |
FROM nybg_raw
|
... | ... | |
141 |
141 |
-- of decimal latitude or decimal longitude
|
142 |
142 |
-- Check: full join to equivalent query against BIEN3 should return 1 row
|
143 |
143 |
-- -------------------------------
|
144 |
|
SELECT
|
|
144 |
SELECT
|
145 |
145 |
(
|
146 |
146 |
SELECT COUNT(*)
|
147 |
147 |
FROM nybg_raw
|
validation/aggregating/specimens/**.sql: removed trailing whitespace, using the steps at wiki.vegpath.org/Aggregating_validations_refactoring#translate-to-Postgres