Project

General

Profile

Task #902

Updated by Aaron Marcuse-Kubitza over 10 years ago

h3. issue 

 * in some queries, the columns being joined on are completely the wrong set (columns that are not even used in any unique index) 
 * this creates unusual (and perhaps unstable) query plans (#884), which in turn trigger a disk space leak in Postgres/Ubuntu (#887) and make the query hang indefinitely 
 * the problem occurs 8 min into the import ("03:00:28" - "02:52:30"; see snapshot below > Mac > @ARIZ.omoccurrences@) 
 * see the following @pg_stat_activity@ snapshots (note the @JOIN@ columns in the @EXPLAIN@ output below each query): 
 <pre><code class="SQL"> 
 config: Ubuntu 12.04 with compatibility version of Postgres 9.3 

 import command: 
 . bin/import_all # all datasources 

 /*NVS.StemObservation*/ 	 CREATE TEMP TABLE "stemobservation_pkeys" AS 
 SELECT 
 "in#11"."row_num" 
 , "stemobservation"."stemobservation_id" AS "out.stemobservation_id" 
 FROM "in#11" 
 JOIN "stemobservation" ON COALESCE("stemobservation"."height_m", CAST('NaN' AS double precision)) = COALESCE("in#11"."NVS.StemObservation.Height::double", CAST('NaN' AS double precision)) 
 /* EXPLAIN: 
 Merge Join    (cost=1045217.21..95909185.15 rows=6323041431 width=8) 
   Merge Cond: ((COALESCE(stemobservation.height_m, 'NaN'::double precision)) = (COALESCE("in#11"."NVS.StemObservation.Height::double", 'NaN'::double precision))) 
   ->    Sort    (cost=30039.56..30492.76 rows=181279 width=12) 
         Sort Key: (COALESCE(stemobservation.height_m, 'NaN'::double precision)) 
         ->    Seq Scan on stemobservation    (cost=0.00..14206.79 rows=181279 width=12) 
   ->    Materialize    (cost=1015177.64..1050057.81 rows=6976033 width=12) 
         ->    Sort    (cost=1015177.64..1032617.73 rows=6976033 width=12) 
               Sort Key: (COALESCE("in#11"."NVS.StemObs 

 /*CVS.taxon_observation.***/ 	 CREATE TEMP TABLE "location_pkeys" AS 
 SELECT 
 "in#5"."row_num" 
 , "location"."location_id" AS "out.location_id" 
 FROM "in#5" 
 JOIN "location" ON ("location"."accesslevel" = "in#5"."CVS.taxon_observation.**.accessRights::accesslevel" OR ("location"."accesslevel" IS NULL AND "in#5"."CVS.taxon_observation.**.accessRights::accesslevel" IS NULL)) 
 /* EXPLAIN: 
 Nested Loop    (cost=0.00..83049883712.45 rows=27683225000 width=8) 
   Join Filter: ((location.accesslevel = "in#5"."CVS.taxon_observation.**.accessRights::accesslevel") OR ((location.accesslevel IS NULL) AND ("in#5"."CVS.taxon_observation.**.accessRights::accesslevel" IS NULL))) 
   ->    Seq Scan on location    (cost=0.00..175778.45 rows=5536645 width=8) 
   ->    Materialize    (cost=0.00..35434.00 rows=1000000 width=8) 
         ->    Seq Scan on "in#5"    (cost=0.00..30434.00 rows=1000000 width=8) 
 */ 
 </code></pre><pre><code class="SQL"> 
 config: Ubuntu 14.04 with official Ubuntu version of Postgres 9.3 

 import command: 
 declare -ax inputs=(inputs/{ARIZ,SALVIAS}/) # SALVIAS because it's part of the test case 
 export version=test_import 
 . bin/import_all 

 /*ARIZ.omoccurrences*/ 	 CREATE TEMP TABLE "location_pkeys" AS 
 SELECT 
 "in#6"."occurrenceID" 
 , "location"."location_id" AS "out.location_id" 
 FROM "in#6" 
 JOIN "location" ON COALESCE("location"."elevation_m", CAST('NaN' AS double precision)) = COALESCE("in#6"."_alt(1=verbatimElevation, 2=_avg(1=minimumElevat.result::double", CAST('NaN' AS double precision)) 
 /* EXPLAIN: 
 Merge Join    (cost=52039.20..3508151.98 rows=230343442 width=8) 
   Merge Cond: ((COALESCE(location.elevation_m, 'NaN'::double precision)) = (COALESCE("in#6"."_alt(1=verbatimElevation, 2=_avg(1=minimumElevat.result::double", 'NaN'::double precision))) 
   ->    Sort    (cost=23780.86..24261.44 rows=192230 width=12) 
         Sort Key: (COALESCE(location.elevation_m, 'NaN'::double precision)) 
         ->    Seq Scan on location    (cost=0.00..6910.30 rows=192230 width=12) 
   ->    Sort    (cost=28258.34..28857.47 rows=239654 width=12) 
         Sort Key: (COALESCE("in#6"."_alt(1=verbatimElevation, 2=_avg(1=minimumElevat.result::double", 'NaN'::double precision)) 
         - 
 */ 
 </code></pre><pre><code class="SQL"> 
 config: Mac OS X 10.8 with the installer version of Postgres 9.3.1 (not the standalone .app) 

 import command: 
 $0 # nested shell to contain the env changes 
 unset TMOUT # TMOUT causes shell to exit even with background processes 
 set -o ignoreeof # prevent Ctrl+D from exiting shell to keep attached jobs 
 unset n # clear any limit set in .profile (unless desired) 
 unset log # allow logging output to go to log files 
 export version=test_import 
 declare -ax inputs; inputs=(inputs/{ARIZ,SALVIAS}/) # or NY; SALVIAS because it's part of the test case 
 . bin/import_all 
 kill % # cancel after_import() 

 backend_start 
 2014-05-09 02:52:29.987578-07 
 query_start 
 2014-05-09 03:00:28.378299-07 
 /*ARIZ.omoccurrences*/ 	 CREATE TEMP TABLE "location_pkeys" AS 
 SELECT 
 "in#6"."occurrenceID" 
 , "location"."location_id" AS "out.location_id" 
 FROM "in#6" 
 JOIN "location" ON COALESCE("location"."elevation_m", CAST('NaN' AS double precision)) = COALESCE("in#6"."_alt(1=verbatimElevation, 2=_avg(1=minimumElevat.result::double", CAST('NaN' AS double precision)) 
 /* EXPLAIN: 
 Merge Join    (cost=86617.89..9234073.27 rows=609750474 width=8) 
   Merge Cond: ((COALESCE("in#6"."_alt(1=verbatimElevation, 2=_avg(1=minimumElevat.result::double", 'NaN'::double precision)) = (COALESCE(location.elevation_m, 'NaN'::double precision))) 
   ->    Sort    (cost=28258.34..28857.47 rows=239654 width=12) 
         Sort Key: (COALESCE("in#6"."_alt(1=verbatimElevation, 2=_avg(1=minimumElevat.result::double", 'NaN'::double precision)) 
         ->    Seq Scan on "in#6"    (cost=0.00..6844.54 rows=239654 width=12) 
   ->    Sort    (cost=58359.55..59631.70 rows=508859 width=12) 
         Sort Key: (COALESCE(location.elevation_m, 'NaN'::double precision)) 
         ->  
 */ 
 </code></pre> 
 * was not an issue in the last successful import 
 * this is definitely the cause of the incorrect query plans (#884), but is _not_ the cause of the disk space leak (#887), as that is a Postgres bug _triggered by_ the unusual query plans (no query plan should ever cause a disk space leak or make the query hang indefinitely). 
 * if this is caused by a Postgres/Ubuntu bug, it may be the same bug that caused the disk space leak (#887) 
 * it is more likely caused by a Postgres/Ubuntu bug than our import code, because the changes we made since the last successful import (2014-2-2) do not look like they would cause radical changes in the joins 

 h3. fix 

 # -check if r13361 fixed this (see explanation in r13392)- _no, problem still occurs_ 
 # see proposed steps in #905

Back