Project

General

Profile

« Previous | Next » 

Revision 12896

validation/aggregating/specimens/NY/qualitative_validations_source_db_NYBG.VegCore.sql: use our staging tables instead of the BIEN2 MySQL staging tables

View differences:

trunk/validation/aggregating/specimens/NY/qualitative_validations_source_db_NYBG.VegCore.sql
1 1
-- -------------------------------------------------------------------------
2 2
--  Quantitative validation queries against NYBG source db
3 3
--
4
-- Queries table `nybg_raw` in mysql db `bien2_staging` on nimoy.nceas.ucsb.edu
5
-- Verbatim import of NYSpecimenDataAmericas.csv, a DwC format dump of
6
-- specimen records from NYBG specimen database, except for the three
7
-- columns key, IdentifiedDate and CollectedDate, which where added and
8
-- populated by Brad after importing raw data.
4
-- Queries staging table vegbien."NY"."Ecatalog_all" @vegbiendev.nceas.ucsb.edu
9 5
-- -------------------------------------------------------------------------
10 6

  
7
SET search_path TO "NY";
8

  
11 9
-- -------------------------------
12 10
-- 1. Count of total records (specimens) in source db
13 11
-- Check: full join against equivalent query on BIEN3 db should return 1 row
14 12
-- -------------------------------
15 13
SELECT COUNT(*) totalSpecimenRecords
16
FROM nybg_raw;
14
FROM "Ecatalog_all";
17 15

  
18 16
-- -------------------------------
19 17
-- 2. Count of unique (verbatim) non-null families
20 18
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
21 19
-- -------------------------------
22
SELECT COUNT(DISTINCT Family) AS families FROM nybg_raw WHERE Family IS NOT NULL;
20
SELECT COUNT(DISTINCT Family) AS families FROM "Ecatalog_all" WHERE Family IS NOT NULL;
23 21

  
24 22
-- -------------------------------
25 23
-- 3. List of verbatim families
26 24
-- Check: Full inner join to equivalent query on BIEN3 db should return same number of rows
27 25
-- -------------------------------
28 26
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species
29
FROM nybg_raw
27
FROM "Ecatalog_all"
30 28
WHERE Genus IS NOT NULL AND Species IS NOT NULL;
31 29

  
32 30
-- -------------------------------
33 31
-- 4. Count of unique (verbatim) non-null species, without author
34 32
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
35 33
-- -------------------------------
36
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species)) AS subspecies FROM nybg_raw
34
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species)) AS subspecies FROM "Ecatalog_all"
37 35
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL;
38 36

  
39 37
-- -------------------------------
40 38
-- 5. List of verbatim species, excluding author
41 39
-- Check: Full inner join to equivalent query on BIEN3 db should return 3335 rows
42 40
-- -------------------------------
43
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS subspecies FROM nybg_raw
41
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS subspecies FROM "Ecatalog_all"
44 42
WHERE Genus IS NOT NULL AND Species IS NOT NULL AND Subspecies IS NOT NULL;
45 43

  
46 44
-- -------------------------------
47 45
-- 6. Count of unique (verbatim) non-null subspecific taxa, without author
48 46
-- Check: Full inner join to equivalent query on BIEN3 db should return 1 row
49 47
-- -------------------------------
50
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species, Subspecies)) AS Species FROM nybg_raw
48
SELECT COUNT(DISTINCT CONCAT_WS(' ', Genus, Species, Subspecies)) AS Species FROM "Ecatalog_all"
51 49
WHERE Genus IS NOT NULL AND Species IS NOT NULL;
52 50

  
53 51
-- -------------------------------
......
55 53
-- Check: Full inner join to equivalent query on BIEN3 db should return 40145 rows
56 54
-- -------------------------------
57 55
SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species
58
FROM nybg_raw
56
FROM "Ecatalog_all"
59 57
WHERE Genus IS NOT NULL AND Species IS NOT NULL;
60 58

  
61 59
-- -------------------------------
......
66 64
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))),
67 65
TRIM(IFNULL(ScientificNameAuthor,''))
68 66
))) AS fullScientificNames
69
FROM nybg_raw
67
FROM "Ecatalog_all"
70 68
WHERE Genus IS NOT NULL;
71 69

  
72 70
-- -------------------------------
......
77 75
TRIM(CONCAT_WS(' ', Genus, IFNULL(Species,''), IFNULL(Subspecies,''))),
78 76
TRIM(IFNULL(ScientificNameAuthor,''))
79 77
)) AS fullScientificName
80
FROM nybg_raw
78
FROM "Ecatalog_all"
81 79
WHERE Genus IS NOT NULL;
82 80

  
83 81
-- -------------------------------
......
86 84
-- Note: Majority should be from 'NY'; these are standard herbarium acronyms
87 85
-- -------------------------------
88 86
SELECT InstitutionCode, COUNT(*) AS records
89
FROM nybg_raw
87
FROM "Ecatalog_all"
90 88
GROUP BY InstitutionCode;
91 89

  
92 90
-- -------------------------------
......
96 94
-- character set issues, either in source db or in BIEN
97 95
-- -------------------------------
98 96
SELECT DISTINCT Country, StateProvince, County
99
FROM nybg_raw;
97
FROM "Ecatalog_all";
100 98

  
101 99
-- -------------------------------
102 100
-- 12. Check distinct Collector names + collection numbers + collection dates,
......
105 103
-- -------------------------------
106 104
SELECT IFNULL(Collector,'') AS collectorName, IFNULL(FieldNumber,'') AS collectionNumber,
107 105
IFNULL(CollectedDate,'') AS dataCollected, COUNT(*) AS specimenRecords
108
FROM nybg_raw
106
FROM "Ecatalog_all"
109 107
GROUP BY Collector, FieldNumber, CollectedDate
110 108
ORDER BY Collector, FieldNumber, CollectedDate;
111 109

  
......
117 115
SELECT
118 116
(
119 117
SELECT COUNT(*)
120
FROM nybg_raw
118
FROM "Ecatalog_all"
121 119
WHERE Latitude IS NOT NULL
122 120
) AS allLats,
123 121
(
124 122
SELECT COUNT(*)
125
FROM nybg_raw
123
FROM "Ecatalog_all"
126 124
WHERE Latitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$'
127 125
) AS decimalLats,
128 126
(
129 127
SELECT COUNT(*)
130
FROM nybg_raw
128
FROM "Ecatalog_all"
131 129
WHERE Longitude IS NOT NULL
132 130
) AS allLongs,
133 131
(
134 132
SELECT COUNT(*)
135
FROM nybg_raw
133
FROM "Ecatalog_all"
136 134
WHERE Longitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$'
137 135
) AS decimalLongs;
138 136

  
......
144 142
SELECT
145 143
(
146 144
SELECT COUNT(*)
147
FROM nybg_raw
145
FROM "Ecatalog_all"
148 146
WHERE Latitude>90 OR Latitude<-90
149 147
) AS badLats,
150 148
(
151 149
SELECT COUNT(*)
152
FROM nybg_raw
150
FROM "Ecatalog_all"
153 151
WHERE Longitude>180 OR Longitude<-180
154 152
) AS badLongs;
155 153

  
......
158 156
-- Check: full join to equivalent query against BIEN3 should return 125592 records
159 157
-- -------------------------------
160 158
SELECT DISTINCT Locality AS localityDescription
161
FROM nybg_raw
159
FROM "Ecatalog_all"
162 160
WHERE Locality IS NOT NULL;
163 161

  
164 162
-- -------------------------------
......
167 165
-- Note: specimens descriptions in nybg extract is in column PlantFungusDescription
168 166
-- -------------------------------
169 167
SELECT PlantFungusDescription AS specimenDescription
170
FROM nybg_raw
168
FROM "Ecatalog_all"
171 169
WHERE PlantFungusDescription IS NOT NULL;
172

  
173

  
174

  
175

  
176

  
177

  
178

  
179

  
180

  
181

  

Also available in: Unified diff