diff --git a/datafusion/core/tests/data/tpch_customer_small.parquet b/datafusion/core/tests/data/tpch_customer_small.parquet new file mode 100644 index 000000000000..3d5f73ef3a06 Binary files /dev/null and b/datafusion/core/tests/data/tpch_customer_small.parquet differ diff --git a/datafusion/core/tests/data/tpch_lineitem_small.parquet b/datafusion/core/tests/data/tpch_lineitem_small.parquet new file mode 100644 index 000000000000..5e98706669d3 Binary files /dev/null and b/datafusion/core/tests/data/tpch_lineitem_small.parquet differ diff --git a/datafusion/core/tests/data/tpch_nation_small.parquet b/datafusion/core/tests/data/tpch_nation_small.parquet new file mode 100644 index 000000000000..99da99594cf8 Binary files /dev/null and b/datafusion/core/tests/data/tpch_nation_small.parquet differ diff --git a/datafusion/core/tests/data/tpch_orders_small.parquet b/datafusion/core/tests/data/tpch_orders_small.parquet new file mode 100644 index 000000000000..79e043137caf Binary files /dev/null and b/datafusion/core/tests/data/tpch_orders_small.parquet differ diff --git a/datafusion/core/tests/data/tpch_partsupp_small.parquet b/datafusion/core/tests/data/tpch_partsupp_small.parquet new file mode 100644 index 000000000000..711d58dda749 Binary files /dev/null and b/datafusion/core/tests/data/tpch_partsupp_small.parquet differ diff --git a/datafusion/core/tests/data/tpch_region_small.parquet b/datafusion/core/tests/data/tpch_region_small.parquet new file mode 100644 index 000000000000..5e00a1f6da1d Binary files /dev/null and b/datafusion/core/tests/data/tpch_region_small.parquet differ diff --git a/datafusion/core/tests/data/tpch_supplier_small.parquet b/datafusion/core/tests/data/tpch_supplier_small.parquet new file mode 100644 index 000000000000..18323395fcbe Binary files /dev/null and b/datafusion/core/tests/data/tpch_supplier_small.parquet differ diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index 909c2efa67de..2d27a21447b2 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -1738,45 +1738,104 @@ async fn roundtrip_physical_plan_node() { let _ = plan.execute(0, ctx.task_ctx()).unwrap(); } -// Failing due to https://github.com/apache/datafusion/pull/16662 -// Fixed: Column index mismatch during protobuf deserialization -#[tokio::test] -async fn test_tpch_part_in_list_query_with_real_parquet_data() -> Result<()> { - // Test the specific query: SELECT p_size FROM part WHERE p_size IN (14, 6, 5, 31) - // - // NOTE: This test uses a minimal subset of TPC-H part.parquet data (tpch_part_small.parquet) - // which contains only 20 rows with p_size values in [14, 6, 5, 31] to reproduce the bug. - // Using alltypes_plain.parquet does NOT reproduce the issue, suggesting the bug - // is specific to certain characteristics of TPC-H parquet files or their schema. - +/// Helper function to create a SessionContext with all TPC-H tables registered as external tables +async fn tpch_context() -> Result { use datafusion_common::test_util::datafusion_test_data; let ctx = SessionContext::new(); - - // Register the TPC-H part table using the local test data let test_data = datafusion_test_data(); - let table_sql = format!( - "CREATE EXTERNAL TABLE part STORED AS PARQUET LOCATION '{test_data}/tpch_part_small.parquet'" - ); - ctx.sql(&table_sql).await.map_err(|e| { - DataFusionError::External(format!("Failed to create part table: {e}").into()) + // TPC-H table names + let tables = [ + "part", "supplier", "partsupp", "customer", "orders", "lineitem", "nation", + "region", + ]; + + // Create external tables for all TPC-H tables + for table in &tables { + let table_sql = format!( + "CREATE EXTERNAL TABLE {table} STORED AS PARQUET LOCATION '{test_data}/tpch_{table}_small.parquet'" + ); + ctx.sql(&table_sql).await.map_err(|e| { + DataFusionError::External( + format!("Failed to create {table} table: {e}").into(), + ) + })?; + } + + Ok(ctx) +} + +/// Helper function to get TPC-H query SQL +fn get_tpch_query_sql(query: usize) -> Result> { + use std::fs; + + if !(1..=22).contains(&query) { + return Err(DataFusionError::External( + format!("Invalid TPC-H query number: {query}").into(), + )); + } + + let filename = format!("../../benchmarks/queries/q{query}.sql"); + let contents = fs::read_to_string(&filename).map_err(|e| { + DataFusionError::External( + format!("Failed to read query file {filename}: {e}").into(), + ) })?; - // Test the exact problematic query - let sql = "SELECT p_size FROM part WHERE p_size IN (14, 6, 5, 31)"; + Ok(contents + .split(';') + .map(|s| s.trim()) + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) + .collect()) +} - let logical_plan = ctx.sql(sql).await?.into_unoptimized_plan(); - let optimized_plan = ctx.state().optimize(&logical_plan)?; - let physical_plan = ctx.state().create_physical_plan(&optimized_plan).await?; +#[tokio::test] +async fn test_serialize_deserialize_tpch_queries() -> Result<()> { + // Create context with TPC-H tables + let ctx = tpch_context().await?; + + // repeat to run all 22 queries + for query in 1..=22 { + // run all statements in the query + let sql = get_tpch_query_sql(query)?; + for stmt in sql { + let logical_plan = ctx.sql(&stmt).await?.into_unoptimized_plan(); + let optimized_plan = ctx.state().optimize(&logical_plan)?; + let physical_plan = ctx.state().create_physical_plan(&optimized_plan).await?; + + // serialize the physical plan + let codec = DefaultPhysicalExtensionCodec {}; + let proto = + PhysicalPlanNode::try_from_physical_plan(physical_plan.clone(), &codec)?; + + // deserialize the physical plan + let _deserialized_plan = + proto.try_into_physical_plan(&ctx, ctx.runtime_env().as_ref(), &codec)?; + } + } - // Serialize the physical plan - bug may happen here already but not necessarily manifests - let codec = DefaultPhysicalExtensionCodec {}; - let proto = PhysicalPlanNode::try_from_physical_plan(physical_plan.clone(), &codec)?; + Ok(()) +} - // This will fail with the bug, but should succeed when fixed - let _deserialized_plan = - proto.try_into_physical_plan(&ctx, ctx.runtime_env().as_ref(), &codec)?; +// bug: https://github.com/apache/datafusion/issues/16772 +// Only 4 queries pass: q3, q5, q10, q12 +// Ignore the test until the bug is fixed +#[ignore] +#[tokio::test] +async fn test_round_trip_tpch_queries() -> Result<()> { + // Create context with TPC-H tables + let ctx = tpch_context().await?; + + // repeat to run all 22 queries + for query in 1..=22 { + // run all statements in the query + let sql = get_tpch_query_sql(query)?; + for stmt in sql { + roundtrip_test_sql_with_context(&stmt, &ctx).await?; + } + } Ok(()) }