Project

General

Profile

NY aggregating validations

current queries


past queries

2011-12-9

NYBG_validations.txt (e-mail from Brad Boyle on 2011-12-9)

nimoy > mysql db bien2_staging > table nybg_raw

SELECT 
(
SELECT COUNT(DISTINCT Family) FROM nybg_raw WHERE Family IS NOT NULL
) AS families, 
(
SELECT COUNT(DISTINCT Genus) FROM nybg_raw WHERE Genus IS NOT NULL
) AS genera,
(
SELECT COUNT(DISTINCT Genus, Species) FROM nybg_raw 
WHERE Genus IS NOT NULL AND Species IS NOT NULL
) AS species;

SELECT DISTINCT CONCAT_WS(' ', Genus, Species) AS Species
FROM nybg_raw 
WHERE Genus IS NOT NULL AND Species IS NOT NULL;

SELECT DISTINCT Genus AS Genera
FROM nybg_raw 
WHERE Genus IS NOT NULL;

SELECT DISTINCT Family as Families
FROM nybg_raw 
WHERE Family IS NOT NULL;

SELECT ScientificName, Genus, Species, Subspecies, ScientificNameAuthor
FROM nybg_raw
LIMIT 25;

SELECT ScientificName, Genus, Species, Subspecies, ScientificNameAuthor
FROM nybg_raw
WHERE Subspecies IS NOT NULL
LIMIT 15;

SELECT InstitutionCode, COUNT(*) AS records
FROM nybg_raw
GROUP BY InstitutionCode;

SELECT DISTINCT CollectionCode 
FROM nybg_raw;

SELECT DISTINCT Country, StateProvince, County 
FROM nybg_raw
LIMIT 25;

SELECT COUNT(*) as totalRecords, COUNT(DISTINCT CatalogNumber) as uniqueCatalogNumbers
FROM nybg_raw;

SELECT CatalogNumber, COUNT(*) AS records
FROM nybg_raw
GROUP BY CatalogNumber
HAVING records>1;

SELECT b.CatalogNumber, DateLastModified, ScientificName, Collector, FieldNumber, 
CollectedDate
FROM nybg_raw AS a JOIN (
SELECT CatalogNumber, COUNT(*) AS records
FROM nybg_raw
WHERE CatalogNumber IS NOT NULL
GROUP BY CatalogNumber
HAVING records>1
) AS b
ON a.CatalogNumber=b.CatalogNumber
ORDER BY b.CatalogNumber ASC, DateLastModified DESC;

SELECT DateLastModified, ScientificName, Collector, FieldNumber, 
CollectedDate
FROM nybg_raw 
WHERE CatalogNumber IS NULL
ORDER BY CatalogNumber ASC, DateLastModified DESC
LIMIT 12;

SELECT 
(
SELECT COUNT(*)
FROM nybg_raw
WHERE Latitude IS NOT NULL
) AS allLats,
(
SELECT COUNT(*)
FROM nybg_raw
WHERE Latitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$'
) AS decimalLats,
(
SELECT COUNT(*)
FROM nybg_raw
WHERE Longitude IS NOT NULL
) AS allLongs,
(
SELECT COUNT(*)
FROM nybg_raw
WHERE Longitude REGEXP '^(-|\\+){0,1}([0-9]+\\.[0-9]*|[0-9]*\\.[0-9]+|[0-9]+)$'
) AS decimalLongs;

SELECT Latitude, Longitude 
FROM nybg_raw
WHERE  Latitude IS NOT NULL AND  Longitude IS NOT NULL
LIMIT 10;

SELECT 
(
SELECT COUNT(*)
FROM nybg_raw
WHERE Latitude>90 OR Latitude<-90
) AS badLats,
(
SELECT COUNT(*)
FROM nybg_raw
WHERE Longitude>180 OR Longitude<-180
) AS badLongs;

SELECT DayCollected, MonthCollected, YearCollected 
FROM nybg_raw
WHERE DayCollected IS NOT NULL OR MonthCollected IS NOT NULL OR YearCollected IS NOT NULL
LIMIT 10;

SELECT
(
SELECT COUNT(*)
FROM nybg_raw
WHERE NOT(DayCollected REGEXP '^-?[0-9]+$')
OR NOT(MonthCollected REGEXP '^-?[0-9]+$')
OR NOT(YearCollected REGEXP '^-?[0-9]+$')
) AS badDateCollected,
(
SELECT COUNT(*)
FROM nybg_raw
WHERE NOT(DayIdentified REGEXP '^-?[0-9]+$')
OR NOT(MonthIdentified REGEXP '^-?[0-9]+$')
OR NOT(YearIdentified REGEXP '^-?[0-9]+$')
) AS badDateIdentified;

SELECT Locality
FROM nybg_raw
WHERE Locality IS NOT NULL
LIMIT 10;

SELECT PlantFungusDescription
FROM nybg_raw
WHERE PlantFungusDescription IS NOT NULL
LIMIT 10;

SELECT Habitat, Vegetation
FROM nybg_raw
WHERE Habitat IS NOT NULL OR Vegetation IS NOT NULL
LIMIT 10;

Query extraction from document

  1. Replace regexp ^SELECT[^~]*?; with BeanShell expression _0.replaceAll("(?m)^","~")
  2. Replace regexp ^(?![~\n]).*\n with text ""
  3. Replace regexp ^~ with text ""
  4. Replace regexp \n{2,} with text \n\n