Skip to content

Commit 9e144b2

Browse files
NGA-TRAN and alamb authored
Add reproducer for tpch Q16 deserialization bug (#16662)
* Add reproducer for tpch Q16 deserialization bug
* Add small Parquet file with 20 rows from part table for testing
* Apply suggestions from code review (Co-authored-by: Andrew Lamb <[email protected]>)
* Make the test fail and ignore it until the bug is fixed per review comments
* fix clippy

Co-authored-by: Andrew Lamb <[email protected]>
1 parent ebb8e95 commit 9e144b2

File tree

2 files changed

+44
-0
lines changed

2 files changed

+44
-0
lines changed
4.25 KB
Binary file not shown.

datafusion/proto/tests/cases/roundtrip_physical_plan.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
use std::any::Any;
1919
use std::fmt::{Display, Formatter};
2020
use std::ops::Deref;
21+
2122
use std::sync::Arc;
2223
use std::vec;
2324

@@ -1736,3 +1737,46 @@ async fn roundtrip_physical_plan_node() {
17361737

17371738
let _ = plan.execute(0, ctx.task_ctx()).unwrap();
17381739
}
1740+
1741+
// Failing due to https://github.com/apache/datafusion/pull/16662
1742+
#[ignore]
1743+
#[tokio::test]
1744+
async fn test_tpch_part_in_list_query_with_real_parquet_data() -> Result<()> {
1745+
// Test the specific query: SELECT p_size FROM part WHERE p_size IN (14, 6, 5, 31)
1746+
//
1747+
// NOTE: This test uses a minimal subset of TPC-H part.parquet data (tpch_part_small.parquet)
1748+
// which contains only 20 rows with p_size values in [14, 6, 5, 31] to reproduce the bug.
1749+
// Using alltypes_plain.parquet does NOT reproduce the issue, suggesting the bug
1750+
// is specific to certain characteristics of TPC-H parquet files or their schema.
1751+
1752+
use datafusion_common::test_util::datafusion_test_data;
1753+
1754+
let ctx = SessionContext::new();
1755+
1756+
// Register the TPC-H part table using the local test data
1757+
let test_data = datafusion_test_data();
1758+
let table_sql = format!(
1759+
"CREATE EXTERNAL TABLE part STORED AS PARQUET LOCATION '{test_data}/tpch_part_small.parquet'"
1760+
1761+
);
1762+
ctx.sql(&table_sql).await.map_err(|e| {
1763+
DataFusionError::External(format!("Failed to create part table: {e}").into())
1764+
})?;
1765+
1766+
// Test the exact problematic query
1767+
let sql = "SELECT p_size FROM part WHERE p_size IN (14, 6, 5, 31)";
1768+
1769+
let logical_plan = ctx.sql(sql).await?.into_unoptimized_plan();
1770+
let optimized_plan = ctx.state().optimize(&logical_plan)?;
1771+
let physical_plan = ctx.state().create_physical_plan(&optimized_plan).await?;
1772+
1773+
// Serialize the physical plan - bug may happen here already but not necessarily manifests
1774+
let codec = DefaultPhysicalExtensionCodec {};
1775+
let proto = PhysicalPlanNode::try_from_physical_plan(physical_plan.clone(), &codec)?;
1776+
1777+
// This will fail with the bug, but should succeed when fixed
1778+
let _deserialized_plan =
1779+
proto.try_into_physical_plan(&ctx, ctx.runtime_env().as_ref(), &codec)?;
1780+
1781+
Ok(())
1782+
}

0 commit comments

Comments
 (0)