Project

General

Profile

« Previous | Next » 

Revision 12894

archived validation/aggregating/specimens/NY/qualitative_validations_source_db_NYBG.sql

View differences:

trunk/validation/aggregating/specimens/NY/qualitative_validations_source_db_NYBG.sql
1
-- -------------------------------------------------------------------------
2
--  Quantitative validation queries against NYBG source db
3
-- 
4
-- Queries table `nybg_raw` in mysql db `bien2_staging` on nimoy.nceas.ucsb.edu
5
-- Verbatim import of NYSpecimenDataAmericas.csv, a DwC format dump of 
6
-- specimen records from NYBG specimen database, except for the three
7
-- columns key, IdentifiedDate and CollectedDate, which where added and 
8
-- populated by Brad after importing raw data. 
9
-- -------------------------------------------------------------------------
10

  
11
-- -------------------------------
12
-- 1. Count of total records (specimens) in source db
13
-- Check: full join against equivalent query on BIEN3 db should return 1 row
14
-- -------------------------------
15
SELECT COUNT(*) totalSpecimenRecords
16
FROM nybg_raw;
17

  
18
-- -------------------------------
19
-- 2. Count of unique (verbatim) non-null families
20
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
21
-- -------------------------------
22
SELECT COUNT(DISTINCT Family) AS families FROM nybg_raw WHERE Family IS NOT NULL;
23

  
24
-- -------------------------------
25
-- 3. List of verbatim families
26
-- Check: Full inner join to equivalent query on BIEN3 db should return same number of rows
27
-- -------------------------------
28
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species
29
FROM nybg_raw 
30
WHERE Genus IS NOT NULL AND Species IS NOT NULL;
31

  
32
-- -------------------------------
33
-- 4. Count of unique (verbatim) non-null species, without author
34
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
35
-- -------------------------------
36
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species)) AS subspecies FROM nybg_raw 
37
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL;
38

  
39
-- -------------------------------
40
-- 5. List of verbatim species, excluding author
41
-- Check: Full inner join to equivalent query on BIEN3 db should return 3335 rows
42
-- -------------------------------
43
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS subspecies FROM nybg_raw 
44
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL;
45

  
46
-- -------------------------------
47
-- 6. Count of unique (verbatim) non-null subspecific taxa, without author
48
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
49
-- -------------------------------
50
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species, Subspecies)) AS Species FROM nybg_raw 
51
WHERE Genus IS NOT NULL AND Species IS NOT NULL;
52

  
53
-- -------------------------------
54
-- 7. List of verbatim subspecific taxa, without author
55
-- Check: Full inner join to equivalent query on BIEN3 db should return 40145 rows
56
-- -------------------------------
57
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species
58
FROM nybg_raw 
59
WHERE Genus IS NOT NULL AND Species IS NOT NULL;
60

  
61
-- -------------------------------
62
-- 8. Count of unique (verbatim) taxa including author, for all taxa identified at least to genus
63
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
64
-- -------------------------------
65
SELECT COUNT(DISTINCT TRIM(CONCAT_WS(' ',
66
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))), 
67
TRIM(IFNULL(ScientificNameAuthor,''))
68
))) AS fullScientificNames
69
FROM nybg_raw 
70
WHERE Genus IS NOT NULL;
71

  
72
-- -------------------------------
73
-- 9. List of unique (verbatim) taxa including author, for all taxa identified at least to genus
74
-- Check: Full inner join to equivalent query on BIEN3 db should return 45997 rows
75
-- -------------------------------
76
SELECT DISTINCT TRIM(CONCAT_WS(' ',
77
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))), 
78
TRIM(IFNULL(ScientificNameAuthor,''))
79
)) AS fullScientificName
80
FROM nybg_raw 
81
WHERE Genus IS NOT NULL;
82

  
83
-- -------------------------------
84
-- 10. Count number of records by institution
85
-- Check: Full inner join to equivalent query on BIEN3 db should return 6 rows
86
-- Note: Majority should be from 'NY'; these are standard herbarium acronyms
87
-- -------------------------------
88
SELECT InstitutionCode, COUNT(*) AS records
89
FROM nybg_raw
90
GROUP BY InstitutionCode;
91

  
92
-- -------------------------------
93
-- 11. List of three standard political divisions
94
-- Check: Full inner join to equivalent query on BIEN3 db should return 5232 rows
95
-- Note: character set issues may cause mis-matches. This query is a good way to reveal
96
-- character set issues, either in source db or in BIEN
97
-- -------------------------------
98
SELECT DISTINCT Country, StateProvince, County 
99
FROM nybg_raw; 
100

  
101
-- -------------------------------
102
-- 12. Check distinct Collector names + collection numbers + collection dates, 
103
-- plus total records
104
-- Check: Full inner join to equivalent query on BIEN3 db should return 309396 rows
105
-- -------------------------------
106
SELECT IFNULL(Collector,'') AS collectorName, IFNULL(FieldNumber,'') AS collectionNumber, 
107
IFNULL(CollectedDate,'') AS dataCollected, COUNT(*) AS specimenRecords
108
FROM nybg_raw
109
GROUP BY Collector, FieldNumber, CollectedDate
110
ORDER BY Collector, FieldNumber, CollectedDate;
111

  
112
-- -------------------------------
113
-- 13. Count of all  verbatim Latitude and Longitude values, as well as all 
114
-- latitude and longitude values that are decimals numbers
115
-- Check: full join to equivalent query against BIEN3 should return 1 row
116
-- -------------------------------
117
SELECT 
118
(
119
SELECT COUNT(*)
120
FROM nybg_raw
121
WHERE Latitude IS NOT NULL
122
) AS allLats,
123
(
124
SELECT COUNT(*)
125
FROM nybg_raw
126
WHERE Latitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$'
127
) AS decimalLats,
128
(
129
SELECT COUNT(*)
130
FROM nybg_raw
131
WHERE Longitude IS NOT NULL
132
) AS allLongs,
133
(
134
SELECT COUNT(*)
135
FROM nybg_raw
136
WHERE Longitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$'
137
) AS decimalLongs;
138

  
139
-- -------------------------------
140
-- 14. Count of all verbatim Latitude and Longitude values that are not valid values
141
-- of decimal latitude or decimal longitude
142
-- Check: full join to equivalent query against BIEN3 should return 1 row
143
-- -------------------------------
144
SELECT 
145
(
146
SELECT COUNT(*)
147
FROM nybg_raw
148
WHERE Latitude>90 OR Latitude<-90
149
) AS badLats,
150
(
151
SELECT COUNT(*)
152
FROM nybg_raw
153
WHERE Longitude>180 OR Longitude<-180
154
) AS badLongs;
155

  
156
-- -------------------------------
157
-- 15. List distinct non-null locality descriptions
158
-- Check: full join to equivalent query against BIEN3 should return 125592 records
159
-- -------------------------------
160
SELECT DISTINCT Locality AS localityDescription
161
FROM nybg_raw
162
WHERE Locality IS NOT NULL;
163

  
164
-- -------------------------------
165
-- 16. List distinct non-null specimen descriptions
166
-- Check: full join to equivalent query against BIEN3 should return 158460 records
167
-- Note: specimens descriptions in nybg extract is in column PlantFungusDescription
168
-- -------------------------------
169
SELECT PlantFungusDescription AS specimenDescription
170
FROM nybg_raw
171
WHERE PlantFungusDescription IS NOT NULL;
172

  
173

  
174

  
175

  
176

  
177

  
178

  
179

  
180

  
181

  
trunk/validation/aggregating/specimens/NY/qualitative_validations_source_db_NYBG.sql.url
1
mailto:bboyle@email.arizona.edu?Brad_Boyle.2013-12-12-9:01.Quantative+validations+for+NYBG
trunk/validation/aggregating/specimens/NY/_archive/qualitative_validations_source_db_NYBG.sql
1
-- -------------------------------------------------------------------------
2
--  Quantitative validation queries against NYBG source db
3
-- 
4
-- Queries table `nybg_raw` in mysql db `bien2_staging` on nimoy.nceas.ucsb.edu
5
-- Verbatim import of NYSpecimenDataAmericas.csv, a DwC format dump of 
6
-- specimen records from NYBG specimen database, except for the three
7
-- columns key, IdentifiedDate and CollectedDate, which where added and 
8
-- populated by Brad after importing raw data. 
9
-- -------------------------------------------------------------------------
10

  
11
-- -------------------------------
12
-- 1. Count of total records (specimens) in source db
13
-- Check: full join against equivalent query on BIEN3 db should return 1 row
14
-- -------------------------------
15
SELECT COUNT(*) totalSpecimenRecords
16
FROM nybg_raw;
17

  
18
-- -------------------------------
19
-- 2. Count of unique (verbatim) non-null families
20
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
21
-- -------------------------------
22
SELECT COUNT(DISTINCT Family) AS families FROM nybg_raw WHERE Family IS NOT NULL;
23

  
24
-- -------------------------------
25
-- 3. List of verbatim families
26
-- Check: Full inner join to equivalent query on BIEN3 db should return same number of rows
27
-- -------------------------------
28
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species
29
FROM nybg_raw 
30
WHERE Genus IS NOT NULL AND Species IS NOT NULL;
31

  
32
-- -------------------------------
33
-- 4. Count of unique (verbatim) non-null species, without author
34
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
35
-- -------------------------------
36
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species)) AS subspecies FROM nybg_raw 
37
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL;
38

  
39
-- -------------------------------
40
-- 5. List of verbatim species, excluding author
41
-- Check: Full inner join to equivalent query on BIEN3 db should return 3335 rows
42
-- -------------------------------
43
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS subspecies FROM nybg_raw 
44
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL;
45

  
46
-- -------------------------------
47
-- 6. Count of unique (verbatim) non-null subspecific taxa, without author
48
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
49
-- -------------------------------
50
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species, Subspecies)) AS Species FROM nybg_raw 
51
WHERE Genus IS NOT NULL AND Species IS NOT NULL;
52

  
53
-- -------------------------------
54
-- 7. List of verbatim subspecific taxa, without author
55
-- Check: Full inner join to equivalent query on BIEN3 db should return 40145 rows
56
-- -------------------------------
57
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species
58
FROM nybg_raw 
59
WHERE Genus IS NOT NULL AND Species IS NOT NULL;
60

  
61
-- -------------------------------
62
-- 8. Count of unique (verbatim) taxa including author, for all taxa identified at least to genus
63
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
64
-- -------------------------------
65
SELECT COUNT(DISTINCT TRIM(CONCAT_WS(' ',
66
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))), 
67
TRIM(IFNULL(ScientificNameAuthor,''))
68
))) AS fullScientificNames
69
FROM nybg_raw 
70
WHERE Genus IS NOT NULL;
71

  
72
-- -------------------------------
73
-- 9. List of unique (verbatim) taxa including author, for all taxa identified at least to genus
74
-- Check: Full inner join to equivalent query on BIEN3 db should return 45997 rows
75
-- -------------------------------
76
SELECT DISTINCT TRIM(CONCAT_WS(' ',
77
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))), 
78
TRIM(IFNULL(ScientificNameAuthor,''))
79
)) AS fullScientificName
80
FROM nybg_raw 
81
WHERE Genus IS NOT NULL;
82

  
83
-- -------------------------------
84
-- 10. Count number of records by institution
85
-- Check: Full inner join to equivalent query on BIEN3 db should return 6 rows
86
-- Note: Majority should be from 'NY'; these are standard herbarium acronyms
87
-- -------------------------------
88
SELECT InstitutionCode, COUNT(*) AS records
89
FROM nybg_raw
90
GROUP BY InstitutionCode;
91

  
92
-- -------------------------------
93
-- 11. List of three standard political divisions
94
-- Check: Full inner join to equivalent query on BIEN3 db should return 5232 rows
95
-- Note: character set issues may cause mis-matches. This query is a good way to reveal
96
-- character set issues, either in source db or in BIEN
97
-- -------------------------------
98
SELECT DISTINCT Country, StateProvince, County 
99
FROM nybg_raw; 
100

  
101
-- -------------------------------
102
-- 12. Check distinct Collector names + collection numbers + collection dates, 
103
-- plus total records
104
-- Check: Full inner join to equivalent query on BIEN3 db should return 309396 rows
105
-- -------------------------------
106
SELECT IFNULL(Collector,'') AS collectorName, IFNULL(FieldNumber,'') AS collectionNumber, 
107
IFNULL(CollectedDate,'') AS dataCollected, COUNT(*) AS specimenRecords
108
FROM nybg_raw
109
GROUP BY Collector, FieldNumber, CollectedDate
110
ORDER BY Collector, FieldNumber, CollectedDate;
111

  
112
-- -------------------------------
113
-- 13. Count of all  verbatim Latitude and Longitude values, as well as all 
114
-- latitude and longitude values that are decimals numbers
115
-- Check: full join to equivalent query against BIEN3 should return 1 row
116
-- -------------------------------
117
SELECT 
118
(
119
SELECT COUNT(*)
120
FROM nybg_raw
121
WHERE Latitude IS NOT NULL
122
) AS allLats,
123
(
124
SELECT COUNT(*)
125
FROM nybg_raw
126
WHERE Latitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$'
127
) AS decimalLats,
128
(
129
SELECT COUNT(*)
130
FROM nybg_raw
131
WHERE Longitude IS NOT NULL
132
) AS allLongs,
133
(
134
SELECT COUNT(*)
135
FROM nybg_raw
136
WHERE Longitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$'
137
) AS decimalLongs;
138

  
139
-- -------------------------------
140
-- 14. Count of all verbatim Latitude and Longitude values that are not valid values
141
-- of decimal latitude or decimal longitude
142
-- Check: full join to equivalent query against BIEN3 should return 1 row
143
-- -------------------------------
144
SELECT 
145
(
146
SELECT COUNT(*)
147
FROM nybg_raw
148
WHERE Latitude>90 OR Latitude<-90
149
) AS badLats,
150
(
151
SELECT COUNT(*)
152
FROM nybg_raw
153
WHERE Longitude>180 OR Longitude<-180
154
) AS badLongs;
155

  
156
-- -------------------------------
157
-- 15. List distinct non-null locality descriptions
158
-- Check: full join to equivalent query against BIEN3 should return 125592 records
159
-- -------------------------------
160
SELECT DISTINCT Locality AS localityDescription
161
FROM nybg_raw
162
WHERE Locality IS NOT NULL;
163

  
164
-- -------------------------------
165
-- 16. List distinct non-null specimen descriptions
166
-- Check: full join to equivalent query against BIEN3 should return 158460 records
167
-- Note: specimens descriptions in nybg extract is in column PlantFungusDescription
168
-- -------------------------------
169
SELECT PlantFungusDescription AS specimenDescription
170
FROM nybg_raw
171
WHERE PlantFungusDescription IS NOT NULL;
172

  
173

  
174

  
175

  
176

  
177

  
178

  
179

  
180

  
181

  
trunk/validation/aggregating/specimens/NY/_archive/qualitative_validations_source_db_NYBG.sql.url
1
mailto:bboyle@email.arizona.edu?Brad_Boyle.2013-12-12-9:01.Quantative+validations+for+NYBG

Also available in: Unified diff