Revision 12894
Added by Aaron Marcuse-Kubitza almost 11 years ago
trunk/validation/aggregating/specimens/NY/qualitative_validations_source_db_NYBG.sql | ||
---|---|---|
1 |
-- ------------------------------------------------------------------------- |
|
2 |
-- Quantitative validation queries against NYBG source db |
|
3 |
-- |
|
4 |
-- Queries table `nybg_raw` in mysql db `bien2_staging` on nimoy.nceas.ucsb.edu |
|
5 |
-- Verbatim import of NYSpecimenDataAmericas.csv, a DwC format dump of |
|
6 |
-- specimen records from NYBG specimen database, except for the three |
|
7 |
-- columns key, IdentifiedDate and CollectedDate, which where added and |
|
8 |
-- populated by Brad after importing raw data. |
|
9 |
-- ------------------------------------------------------------------------- |
|
10 |
|
|
11 |
-- ------------------------------- |
|
12 |
-- 1. Count of total records (specimens) in source db |
|
13 |
-- Check: full join against equivalent query on BIEN3 db should return 1 row |
|
14 |
-- ------------------------------- |
|
15 |
SELECT COUNT(*) totalSpecimenRecords |
|
16 |
FROM nybg_raw; |
|
17 |
|
|
18 |
-- ------------------------------- |
|
19 |
-- 2. Count of unique (verbatim) non-null families |
|
20 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row |
|
21 |
-- ------------------------------- |
|
22 |
SELECT COUNT(DISTINCT Family) AS families FROM nybg_raw WHERE Family IS NOT NULL; |
|
23 |
|
|
24 |
-- ------------------------------- |
|
25 |
-- 3. List of verbatim families |
|
26 |
-- Check: Full inner join to equivalent query on BIEN3 db should return same number of rows |
|
27 |
-- ------------------------------- |
|
28 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species |
|
29 |
FROM nybg_raw |
|
30 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL; |
|
31 |
|
|
32 |
-- ------------------------------- |
|
33 |
-- 4. Count of unique (verbatim) non-null species, without author |
|
34 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row |
|
35 |
-- ------------------------------- |
|
36 |
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species)) AS subspecies FROM nybg_raw |
|
37 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL; |
|
38 |
|
|
39 |
-- ------------------------------- |
|
40 |
-- 5. List of verbatim species, excluding author |
|
41 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 3335 rows |
|
42 |
-- ------------------------------- |
|
43 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS subspecies FROM nybg_raw |
|
44 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL; |
|
45 |
|
|
46 |
-- ------------------------------- |
|
47 |
-- 6. Count of unique (verbatim) non-null subspecific taxa, without author |
|
48 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row |
|
49 |
-- ------------------------------- |
|
50 |
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species, Subspecies)) AS Species FROM nybg_raw |
|
51 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL; |
|
52 |
|
|
53 |
-- ------------------------------- |
|
54 |
-- 7. List of verbatim subspecific taxa, without author |
|
55 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 40145 rows |
|
56 |
-- ------------------------------- |
|
57 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species |
|
58 |
FROM nybg_raw |
|
59 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL; |
|
60 |
|
|
61 |
-- ------------------------------- |
|
62 |
-- 8. Count of unique (verbatim) taxa including author, for all taxa identified at least to genus |
|
63 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row |
|
64 |
-- ------------------------------- |
|
65 |
SELECT COUNT(DISTINCT TRIM(CONCAT_WS(' ', |
|
66 |
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))), |
|
67 |
TRIM(IFNULL(ScientificNameAuthor,'')) |
|
68 |
))) AS fullScientificNames |
|
69 |
FROM nybg_raw |
|
70 |
WHERE Genus IS NOT NULL; |
|
71 |
|
|
72 |
-- ------------------------------- |
|
73 |
-- 9. List of unique (verbatim) taxa including author, for all taxa identified at least to genus |
|
74 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 45997 rows |
|
75 |
-- ------------------------------- |
|
76 |
SELECT DISTINCT TRIM(CONCAT_WS(' ', |
|
77 |
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))), |
|
78 |
TRIM(IFNULL(ScientificNameAuthor,'')) |
|
79 |
)) AS fullScientificName |
|
80 |
FROM nybg_raw |
|
81 |
WHERE Genus IS NOT NULL; |
|
82 |
|
|
83 |
-- ------------------------------- |
|
84 |
-- 10. Count number of records by institution |
|
85 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 6 rows |
|
86 |
-- Note: Majority should be from 'NY'; these are standard herbarium acronyms |
|
87 |
-- ------------------------------- |
|
88 |
SELECT InstitutionCode, COUNT(*) AS records |
|
89 |
FROM nybg_raw |
|
90 |
GROUP BY InstitutionCode; |
|
91 |
|
|
92 |
-- ------------------------------- |
|
93 |
-- 11. List of three standard political divisions |
|
94 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 5232 rows |
|
95 |
-- Note: character set issues may cause mis-matches. This query is a good way to reveal |
|
96 |
-- character set issues, either in source db or in BIEN |
|
97 |
-- ------------------------------- |
|
98 |
SELECT DISTINCT Country, StateProvince, County |
|
99 |
FROM nybg_raw; |
|
100 |
|
|
101 |
-- ------------------------------- |
|
102 |
-- 12. Check distinct Collector names + collection numbers + collection dates, |
|
103 |
-- plus total records |
|
104 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 309396 rows |
|
105 |
-- ------------------------------- |
|
106 |
SELECT IFNULL(Collector,'') AS collectorName, IFNULL(FieldNumber,'') AS collectionNumber, |
|
107 |
IFNULL(CollectedDate,'') AS dataCollected, COUNT(*) AS specimenRecords |
|
108 |
FROM nybg_raw |
|
109 |
GROUP BY Collector, FieldNumber, CollectedDate |
|
110 |
ORDER BY Collector, FieldNumber, CollectedDate; |
|
111 |
|
|
112 |
-- ------------------------------- |
|
113 |
-- 13. Count of all verbatim Latitude and Longitude values, as well as all |
|
114 |
-- latitude and longitude values that are decimals numbers |
|
115 |
-- Check: full join to equivalent query against BIEN3 should return 1 row |
|
116 |
-- ------------------------------- |
|
117 |
SELECT |
|
118 |
( |
|
119 |
SELECT COUNT(*) |
|
120 |
FROM nybg_raw |
|
121 |
WHERE Latitude IS NOT NULL |
|
122 |
) AS allLats, |
|
123 |
( |
|
124 |
SELECT COUNT(*) |
|
125 |
FROM nybg_raw |
|
126 |
WHERE Latitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$' |
|
127 |
) AS decimalLats, |
|
128 |
( |
|
129 |
SELECT COUNT(*) |
|
130 |
FROM nybg_raw |
|
131 |
WHERE Longitude IS NOT NULL |
|
132 |
) AS allLongs, |
|
133 |
( |
|
134 |
SELECT COUNT(*) |
|
135 |
FROM nybg_raw |
|
136 |
WHERE Longitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$' |
|
137 |
) AS decimalLongs; |
|
138 |
|
|
139 |
-- ------------------------------- |
|
140 |
-- 14. Count of all verbatim Latitude and Longitude values that are not valid values |
|
141 |
-- of decimal latitude or decimal longitude |
|
142 |
-- Check: full join to equivalent query against BIEN3 should return 1 row |
|
143 |
-- ------------------------------- |
|
144 |
SELECT |
|
145 |
( |
|
146 |
SELECT COUNT(*) |
|
147 |
FROM nybg_raw |
|
148 |
WHERE Latitude>90 OR Latitude<-90 |
|
149 |
) AS badLats, |
|
150 |
( |
|
151 |
SELECT COUNT(*) |
|
152 |
FROM nybg_raw |
|
153 |
WHERE Longitude>180 OR Longitude<-180 |
|
154 |
) AS badLongs; |
|
155 |
|
|
156 |
-- ------------------------------- |
|
157 |
-- 15. List distinct non-null locality descriptions |
|
158 |
-- Check: full join to equivalent query against BIEN3 should return 125592 records |
|
159 |
-- ------------------------------- |
|
160 |
SELECT DISTINCT Locality AS localityDescription |
|
161 |
FROM nybg_raw |
|
162 |
WHERE Locality IS NOT NULL; |
|
163 |
|
|
164 |
-- ------------------------------- |
|
165 |
-- 16. List distinct non-null specimen descriptions |
|
166 |
-- Check: full join to equivalent query against BIEN3 should return 158460 records |
|
167 |
-- Note: specimens descriptions in nybg extract is in column PlantFungusDescription |
|
168 |
-- ------------------------------- |
|
169 |
SELECT PlantFungusDescription AS specimenDescription |
|
170 |
FROM nybg_raw |
|
171 |
WHERE PlantFungusDescription IS NOT NULL; |
|
172 |
|
|
173 |
|
|
174 |
|
|
175 |
|
|
176 |
|
|
177 |
|
|
178 |
|
|
179 |
|
|
180 |
|
|
181 |
|
trunk/validation/aggregating/specimens/NY/qualitative_validations_source_db_NYBG.sql.url | ||
---|---|---|
1 |
mailto:bboyle@email.arizona.edu?Brad_Boyle.2013-12-12-9:01.Quantative+validations+for+NYBG |
trunk/validation/aggregating/specimens/NY/_archive/qualitative_validations_source_db_NYBG.sql | ||
---|---|---|
1 |
-- ------------------------------------------------------------------------- |
|
2 |
-- Quantitative validation queries against NYBG source db |
|
3 |
-- |
|
4 |
-- Queries table `nybg_raw` in mysql db `bien2_staging` on nimoy.nceas.ucsb.edu |
|
5 |
-- Verbatim import of NYSpecimenDataAmericas.csv, a DwC format dump of |
|
6 |
-- specimen records from NYBG specimen database, except for the three |
|
7 |
-- columns key, IdentifiedDate and CollectedDate, which where added and |
|
8 |
-- populated by Brad after importing raw data. |
|
9 |
-- ------------------------------------------------------------------------- |
|
10 |
|
|
11 |
-- ------------------------------- |
|
12 |
-- 1. Count of total records (specimens) in source db |
|
13 |
-- Check: full join against equivalent query on BIEN3 db should return 1 row |
|
14 |
-- ------------------------------- |
|
15 |
SELECT COUNT(*) totalSpecimenRecords |
|
16 |
FROM nybg_raw; |
|
17 |
|
|
18 |
-- ------------------------------- |
|
19 |
-- 2. Count of unique (verbatim) non-null families |
|
20 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row |
|
21 |
-- ------------------------------- |
|
22 |
SELECT COUNT(DISTINCT Family) AS families FROM nybg_raw WHERE Family IS NOT NULL; |
|
23 |
|
|
24 |
-- ------------------------------- |
|
25 |
-- 3. List of verbatim families |
|
26 |
-- Check: Full inner join to equivalent query on BIEN3 db should return same number of rows |
|
27 |
-- ------------------------------- |
|
28 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species |
|
29 |
FROM nybg_raw |
|
30 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL; |
|
31 |
|
|
32 |
-- ------------------------------- |
|
33 |
-- 4. Count of unique (verbatim) non-null species, without author |
|
34 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row |
|
35 |
-- ------------------------------- |
|
36 |
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species)) AS subspecies FROM nybg_raw |
|
37 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL; |
|
38 |
|
|
39 |
-- ------------------------------- |
|
40 |
-- 5. List of verbatim species, excluding author |
|
41 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 3335 rows |
|
42 |
-- ------------------------------- |
|
43 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS subspecies FROM nybg_raw |
|
44 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL; |
|
45 |
|
|
46 |
-- ------------------------------- |
|
47 |
-- 6. Count of unique (verbatim) non-null subspecific taxa, without author |
|
48 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row |
|
49 |
-- ------------------------------- |
|
50 |
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species, Subspecies)) AS Species FROM nybg_raw |
|
51 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL; |
|
52 |
|
|
53 |
-- ------------------------------- |
|
54 |
-- 7. List of verbatim subspecific taxa, without author |
|
55 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 40145 rows |
|
56 |
-- ------------------------------- |
|
57 |
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species |
|
58 |
FROM nybg_raw |
|
59 |
WHERE Genus IS NOT NULL AND Species IS NOT NULL; |
|
60 |
|
|
61 |
-- ------------------------------- |
|
62 |
-- 8. Count of unique (verbatim) taxa including author, for all taxa identified at least to genus |
|
63 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row |
|
64 |
-- ------------------------------- |
|
65 |
SELECT COUNT(DISTINCT TRIM(CONCAT_WS(' ', |
|
66 |
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))), |
|
67 |
TRIM(IFNULL(ScientificNameAuthor,'')) |
|
68 |
))) AS fullScientificNames |
|
69 |
FROM nybg_raw |
|
70 |
WHERE Genus IS NOT NULL; |
|
71 |
|
|
72 |
-- ------------------------------- |
|
73 |
-- 9. List of unique (verbatim) taxa including author, for all taxa identified at least to genus |
|
74 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 45997 rows |
|
75 |
-- ------------------------------- |
|
76 |
SELECT DISTINCT TRIM(CONCAT_WS(' ', |
|
77 |
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))), |
|
78 |
TRIM(IFNULL(ScientificNameAuthor,'')) |
|
79 |
)) AS fullScientificName |
|
80 |
FROM nybg_raw |
|
81 |
WHERE Genus IS NOT NULL; |
|
82 |
|
|
83 |
-- ------------------------------- |
|
84 |
-- 10. Count number of records by institution |
|
85 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 6 rows |
|
86 |
-- Note: Majority should be from 'NY'; these are standard herbarium acronyms |
|
87 |
-- ------------------------------- |
|
88 |
SELECT InstitutionCode, COUNT(*) AS records |
|
89 |
FROM nybg_raw |
|
90 |
GROUP BY InstitutionCode; |
|
91 |
|
|
92 |
-- ------------------------------- |
|
93 |
-- 11. List of three standard political divisions |
|
94 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 5232 rows |
|
95 |
-- Note: character set issues may cause mis-matches. This query is a good way to reveal |
|
96 |
-- character set issues, either in source db or in BIEN |
|
97 |
-- ------------------------------- |
|
98 |
SELECT DISTINCT Country, StateProvince, County |
|
99 |
FROM nybg_raw; |
|
100 |
|
|
101 |
-- ------------------------------- |
|
102 |
-- 12. Check distinct Collector names + collection numbers + collection dates, |
|
103 |
-- plus total records |
|
104 |
-- Check: Full inner join to equivalent query on BIEN3 db should return 309396 rows |
|
105 |
-- ------------------------------- |
|
106 |
SELECT IFNULL(Collector,'') AS collectorName, IFNULL(FieldNumber,'') AS collectionNumber, |
|
107 |
IFNULL(CollectedDate,'') AS dataCollected, COUNT(*) AS specimenRecords |
|
108 |
FROM nybg_raw |
|
109 |
GROUP BY Collector, FieldNumber, CollectedDate |
|
110 |
ORDER BY Collector, FieldNumber, CollectedDate; |
|
111 |
|
|
112 |
-- ------------------------------- |
|
113 |
-- 13. Count of all verbatim Latitude and Longitude values, as well as all |
|
114 |
-- latitude and longitude values that are decimals numbers |
|
115 |
-- Check: full join to equivalent query against BIEN3 should return 1 row |
|
116 |
-- ------------------------------- |
|
117 |
SELECT |
|
118 |
( |
|
119 |
SELECT COUNT(*) |
|
120 |
FROM nybg_raw |
|
121 |
WHERE Latitude IS NOT NULL |
|
122 |
) AS allLats, |
|
123 |
( |
|
124 |
SELECT COUNT(*) |
|
125 |
FROM nybg_raw |
|
126 |
WHERE Latitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$' |
|
127 |
) AS decimalLats, |
|
128 |
( |
|
129 |
SELECT COUNT(*) |
|
130 |
FROM nybg_raw |
|
131 |
WHERE Longitude IS NOT NULL |
|
132 |
) AS allLongs, |
|
133 |
( |
|
134 |
SELECT COUNT(*) |
|
135 |
FROM nybg_raw |
|
136 |
WHERE Longitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$' |
|
137 |
) AS decimalLongs; |
|
138 |
|
|
139 |
-- ------------------------------- |
|
140 |
-- 14. Count of all verbatim Latitude and Longitude values that are not valid values |
|
141 |
-- of decimal latitude or decimal longitude |
|
142 |
-- Check: full join to equivalent query against BIEN3 should return 1 row |
|
143 |
-- ------------------------------- |
|
144 |
SELECT |
|
145 |
( |
|
146 |
SELECT COUNT(*) |
|
147 |
FROM nybg_raw |
|
148 |
WHERE Latitude>90 OR Latitude<-90 |
|
149 |
) AS badLats, |
|
150 |
( |
|
151 |
SELECT COUNT(*) |
|
152 |
FROM nybg_raw |
|
153 |
WHERE Longitude>180 OR Longitude<-180 |
|
154 |
) AS badLongs; |
|
155 |
|
|
156 |
-- ------------------------------- |
|
157 |
-- 15. List distinct non-null locality descriptions |
|
158 |
-- Check: full join to equivalent query against BIEN3 should return 125592 records |
|
159 |
-- ------------------------------- |
|
160 |
SELECT DISTINCT Locality AS localityDescription |
|
161 |
FROM nybg_raw |
|
162 |
WHERE Locality IS NOT NULL; |
|
163 |
|
|
164 |
-- ------------------------------- |
|
165 |
-- 16. List distinct non-null specimen descriptions |
|
166 |
-- Check: full join to equivalent query against BIEN3 should return 158460 records |
|
167 |
-- Note: specimens descriptions in nybg extract is in column PlantFungusDescription |
|
168 |
-- ------------------------------- |
|
169 |
SELECT PlantFungusDescription AS specimenDescription |
|
170 |
FROM nybg_raw |
|
171 |
WHERE PlantFungusDescription IS NOT NULL; |
|
172 |
|
|
173 |
|
|
174 |
|
|
175 |
|
|
176 |
|
|
177 |
|
|
178 |
|
|
179 |
|
|
180 |
|
|
181 |
|
trunk/validation/aggregating/specimens/NY/_archive/qualitative_validations_source_db_NYBG.sql.url | ||
---|---|---|
1 |
mailto:bboyle@email.arizona.edu?Brad_Boyle.2013-12-12-9:01.Quantative+validations+for+NYBG |
Also available in: Unified diff
archived validation/aggregating/specimens/NY/qualitative_validations_source_db_NYBG.sql