From 8477b235143ec9d233c8285410f944004b349c0d Mon Sep 17 00:00:00 2001 From: Dmitrii Blaginin Date: Tue, 11 Mar 2025 19:42:55 +0000 Subject: [PATCH 1/6] Add insta for df testing --- Cargo.lock | 1 + Cargo.toml | 1 + datafusion-cli/Cargo.toml | 2 +- datafusion/core/Cargo.toml | 2 + datafusion/core/tests/dataframe/mod.rs | 82 ++++++++++++-------------- 5 files changed, 43 insertions(+), 45 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 13f33aab45ae..12c38e0a6090 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1780,6 +1780,7 @@ dependencies = [ "env_logger", "flate2", "futures", + "insta", "itertools 0.14.0", "log", "nix", diff --git a/Cargo.toml b/Cargo.toml index 871377f8dfc0..16a6967910a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -165,6 +165,7 @@ sqlparser = { version = "0.54.0", features = ["visitor"] } tempfile = "3" tokio = { version = "1.43", features = ["macros", "rt", "sync"] } url = "2.5.4" +insta = { version = "1.41.1", features = ["glob", "filters"] } [profile.release] codegen-units = 1 diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 258fd995a73e..fcc28075c9ee 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -67,7 +67,7 @@ url = { workspace = true } [dev-dependencies] assert_cmd = "2.0" ctor = { workspace = true } -insta = { version = "1.41.1", features = ["glob", "filters"] } insta-cmd = "0.6.0" +insta = { workspace = true } predicates = "3.0" rstest = { workspace = true } diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index fd1fd4164da0..c0f487b8b7fc 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -161,6 +161,8 @@ serde_json = { workspace = true } sysinfo = "0.33.1" test-utils = { path = "../../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot", "fs"] } +insta = { workspace = true } + [target.'cfg(not(target_os = "windows"))'.dev-dependencies] nix = { version = "0.29.0", features = ["fs"] } diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index a902cf8ae65b..94886b8ad671 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -38,6 +38,7 @@ use datafusion_functions_aggregate::expr_fn::{ }; use datafusion_functions_nested::make_array::make_array_udf; use datafusion_functions_window::expr_fn::{first_value, row_number}; +use insta::assert_snapshot; use object_store::local::LocalFileSystem; use sqlparser::ast::NullTreatment; use std::collections::HashMap; @@ -82,18 +83,14 @@ use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_plan::{get_plan_string, ExecutionPlanProperties}; // Get string representation of the plan -async fn assert_physical_plan(df: &DataFrame, expected: Vec<&str>) { +async fn physical_plan_to_string(df: &DataFrame) -> String { let physical_plan = df .clone() .create_physical_plan() .await .expect("Error creating physical plan"); - let actual = get_plan_string(&physical_plan); - assert_eq!( - expected, actual, - "\n**Optimized Plan Mismatch\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" - ); + get_plan_string(&physical_plan).join("\n") } pub fn table_with_constraints() -> Arc { @@ -542,14 +539,14 @@ async fn test_aggregate_with_pk() -> Result<()> { // expression even if it is not part of the group by expression and can // select "name" column even though it wasn't explicitly grouped let df = df.select(vec![col("id"), col("name")])?; - assert_physical_plan( - &df, - vec![ - "AggregateExec: 
mode=Single, gby=[id@0 as id, name@1 as name], aggr=[]", - " DataSourceExec: partitions=1, partition_sizes=[1]", - ], - ) - .await; + + assert_snapshot!( + physical_plan_to_string(&df).await, + @r###" + AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[] + DataSourceExec: partitions=1, partition_sizes=[1] + "### + ); let df_results = df.collect().await?; @@ -584,16 +581,15 @@ async fn test_aggregate_with_pk2() -> Result<()> { // id = 1 AND name = 'a' let predicate = col("id").eq(lit(1i32)).and(col("name").eq(lit("a"))); let df = df.filter(predicate)?; - assert_physical_plan( - &df, - vec![ - "CoalesceBatchesExec: target_batch_size=8192", - " FilterExec: id@0 = 1 AND name@1 = a", - " AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[]", - " DataSourceExec: partitions=1, partition_sizes=[1]", - ], - ) - .await; + assert_snapshot!( + physical_plan_to_string(&df).await, + @r###" + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: id@0 = 1 AND name@1 = a + AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[] + DataSourceExec: partitions=1, partition_sizes=[1] + "### + ); // Since id and name are functionally dependant, we can use name among expression // even if it is not part of the group by expression. @@ -633,16 +629,15 @@ async fn test_aggregate_with_pk3() -> Result<()> { // Select expression refers to id, and name columns. // id, name let df = df.select(vec![col("id"), col("name")])?; - assert_physical_plan( - &df, - vec![ - "CoalesceBatchesExec: target_batch_size=8192", - " FilterExec: id@0 = 1", - " AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[]", - " DataSourceExec: partitions=1, partition_sizes=[1]", - ], - ) - .await; + assert_snapshot!( + physical_plan_to_string(&df).await, + @r###" + CoalesceBatchesExec: target_batch_size=8192 + FilterExec: id@0 = 1 + AggregateExec: mode=Single, gby=[id@0 as id, name@1 as name], aggr=[] + DataSourceExec: partitions=1, partition_sizes=[1] + "### + ); // Since id and name are functionally dependant, we can use name among expression // even if it is not part of the group by expression. @@ -684,16 +679,15 @@ async fn test_aggregate_with_pk4() -> Result<()> { // In this case aggregate shouldn't be expanded, since these // columns are not used. 
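+    // The expected plan is kept as an inline `insta` snapshot: the
+    // `@r###"..."###` literal below lives in the source itself, and after an
+    // intentional plan change it can be refreshed with `cargo insta review`
+    // (or `cargo insta accept`) instead of editing expected strings by hand.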
-    assert_physical_plan(
-        &df,
-        vec![
-            "CoalesceBatchesExec: target_batch_size=8192",
-            "  FilterExec: id@0 = 1",
-            "    AggregateExec: mode=Single, gby=[id@0 as id], aggr=[]",
-            "      DataSourceExec: partitions=1, partition_sizes=[1]",
-        ],
-    )
-    .await;
+    assert_snapshot!(
+        physical_plan_to_string(&df).await,
+        @r###"
+    CoalesceBatchesExec: target_batch_size=8192
+      FilterExec: id@0 = 1
+        AggregateExec: mode=Single, gby=[id@0 as id], aggr=[]
+          DataSourceExec: partitions=1, partition_sizes=[1]
+    "###
+    );

     let df_results = df.collect().await?;


From 6bfc50d4acd1bbe03b50ffed0a7a1a9ebda4019a Mon Sep 17 00:00:00 2001
From: Dmitrii Blaginin
Date: Tue, 11 Mar 2025 19:51:05 +0000
Subject: [PATCH 2/6] Do not use `get_plan_string`

---
 datafusion/core/tests/dataframe/mod.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
index 94886b8ad671..6956f27d526a 100644
--- a/datafusion/core/tests/dataframe/mod.rs
+++ b/datafusion/core/tests/dataframe/mod.rs
@@ -80,7 +80,7 @@ use datafusion_expr::{
 use datafusion_physical_expr::expressions::Column;
 use datafusion_physical_expr::Partitioning;
 use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
-use datafusion_physical_plan::{get_plan_string, ExecutionPlanProperties};
+use datafusion_physical_plan::{displayable, ExecutionPlanProperties};

 // Get string representation of the plan
 async fn physical_plan_to_string(df: &DataFrame) -> String {
@@ -90,7 +90,8 @@ async fn physical_plan_to_string(df: &DataFrame) -> String {
         .await
         .expect("Error creating physical plan");

-    get_plan_string(&physical_plan).join("\n")
+    let formatted = displayable(physical_plan.as_ref()).indent(true);
+    formatted.to_string()
 }

 pub fn table_with_constraints() -> Arc<dyn TableProvider> {

From 229e232c099f5dfc54b1c284a68535f98e97c899 Mon Sep 17 00:00:00 2001
From: Dmitrii Blaginin
Date: Tue, 11 Mar 2025 20:21:28 +0000
Subject: [PATCH 3/6] Switch from `assert_batches_eq`

---
 datafusion/common/src/test_util.rs     |  12 +
 datafusion/core/tests/dataframe/mod.rs | 777 +++++++++++++------------
 2 files changed, 428 insertions(+), 361 deletions(-)

diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs
index 298f54389cf8..04979780ddf8 100644
--- a/datafusion/common/src/test_util.rs
+++ b/datafusion/common/src/test_util.rs
@@ -17,6 +17,9 @@

 //! Utility functions to make testing DataFusion based crates easier

+use crate::arrow::util::pretty::pretty_format_batches_with_options;
+use crate::format::DEFAULT_FORMAT_OPTIONS;
+use arrow::array::RecordBatch;
 use std::{error::Error, path::PathBuf};

 /// Compares formatted output of a record batch with an expected
@@ -73,6 +76,15 @@ macro_rules! assert_batches_eq {
     };
 }

+pub fn batches_to_string(batches: &[RecordBatch]) -> String {
+    let actual_lines =
+        pretty_format_batches_with_options(batches, &DEFAULT_FORMAT_OPTIONS)
+            .unwrap()
+            .to_string();
+
+    actual_lines.trim().to_string()
+}
+
 /// Compares formatted output of a record batch with an expected
 /// vector of strings in a way that order does not matter.
/// This is a macro so errors appear on the correct line diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 6956f27d526a..0189d5fb1944 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -47,6 +47,7 @@ use std::sync::Arc; use tempfile::TempDir; use url::Url; +use datafusion::assert_batches_sorted_eq; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::datasource::MemTable; use datafusion::error::Result; @@ -60,8 +61,8 @@ use datafusion::test_util::{ parquet_test_data, populate_csv_partitions, register_aggregate_csv, test_table, test_table_with_name, }; -use datafusion::{assert_batches_eq, assert_batches_sorted_eq}; use datafusion_catalog::TableProvider; +use datafusion_common::test_util::batches_to_string; use datafusion_common::{ assert_contains, Constraint, Constraints, DataFusionError, ParamValues, ScalarValue, TableReference, UnnestOptions, @@ -2205,15 +2206,15 @@ async fn filtered_aggr_with_param_values() -> Result<()> { .with_param_values(ParamValues::List(vec![ScalarValue::from(10u64)])); let df_results = df?.collect().await?; - assert_batches_eq!( - &[ - "+------------------------------------------------+", - "| count(table1.c2) FILTER (WHERE table1.c3 > $1) |", - "+------------------------------------------------+", - "| 54 |", - "+------------------------------------------------+", - ], - &df_results + assert_snapshot!( + batches_to_string(&df_results), + @r###" + +------------------------------------------------+ + | count(table1.c2) FILTER (WHERE table1.c3 > $1) | + +------------------------------------------------+ + | 54 | + +------------------------------------------------+ + "### ); Ok(()) @@ -2259,20 +2260,21 @@ async fn write_parquet_with_order() -> Result<()> { let df = ctx.sql("SELECT * FROM data").await?; let results = df.collect().await?; - assert_batches_eq!( - &[ - "+---+---+", - "| a | b |", - "+---+---+", - "| 1 | 2 |", - "| 2 | 6 |", - "| 3 | 5 |", - "| 5 | 3 |", - "| 7 | 4 |", - "+---+---+", - ], - &results + assert_snapshot!( + batches_to_string(&results), + @r###" + +---+---+ + | a | b | + +---+---+ + | 1 | 2 | + | 2 | 6 | + | 3 | 5 | + | 5 | 3 | + | 7 | 4 | + +---+---+ + "### ); + Ok(()) } @@ -2316,19 +2318,19 @@ async fn write_csv_with_order() -> Result<()> { let df = ctx.sql("SELECT * FROM data").await?; let results = df.collect().await?; - assert_batches_eq!( - &[ - "+---+---+", - "| a | b |", - "+---+---+", - "| 1 | 2 |", - "| 2 | 6 |", - "| 3 | 5 |", - "| 5 | 3 |", - "| 7 | 4 |", - "+---+---+", - ], - &results + assert_snapshot!( + batches_to_string(&results), + @r###" + +---+---+ + | a | b | + +---+---+ + | 1 | 2 | + | 2 | 6 | + | 3 | 5 | + | 5 | 3 | + | 7 | 4 | + +---+---+ + "### ); Ok(()) } @@ -2373,19 +2375,19 @@ async fn write_json_with_order() -> Result<()> { let df = ctx.sql("SELECT * FROM data").await?; let results = df.collect().await?; - assert_batches_eq!( - &[ - "+---+---+", - "| a | b |", - "+---+---+", - "| 1 | 2 |", - "| 2 | 6 |", - "| 3 | 5 |", - "| 5 | 3 |", - "| 7 | 4 |", - "+---+---+", - ], - &results + assert_snapshot!( + batches_to_string(&results), + @r###" + +---+---+ + | a | b | + +---+---+ + | 1 | 2 | + | 2 | 6 | + | 3 | 5 | + | 5 | 3 | + | 7 | 4 | + +---+---+ + "### ); Ok(()) } @@ -2427,19 +2429,19 @@ async fn write_table_with_order() -> Result<()> { let df = ctx.sql("SELECT * FROM data").await?; let results = df.collect().await?; - assert_batches_eq!( - &[ - "+-----------+", - "| tablecol1 |", - 
"+-----------+", - "| a |", - "| b |", - "| c |", - "| x |", - "| z |", - "+-----------+", - ], - &results + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----------+ + | tablecol1 | + +-----------+ + | a | + | b | + | c | + | x | + | z | + +-----------+ + "### ); Ok(()) } @@ -2971,16 +2973,19 @@ async fn sort_on_unprojected_columns() -> Result<()> { .unwrap(); let results = df.collect().await.unwrap(); - #[rustfmt::skip] - let expected = ["+-----+", - "| a |", - "+-----+", - "| 100 |", - "| 10 |", - "| 10 |", - "| 1 |", - "+-----+"]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----+ + | a | + +-----+ + | 100 | + | 10 | + | 10 | + | 1 | + +-----+ + "### + ); Ok(()) } @@ -3015,15 +3020,18 @@ async fn sort_on_distinct_columns() -> Result<()> { .unwrap(); let results = df.collect().await.unwrap(); - #[rustfmt::skip] - let expected = ["+-----+", - "| a |", - "+-----+", - "| 100 |", - "| 10 |", - "| 1 |", - "+-----+"]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----+ + | a | + +-----+ + | 100 | + | 10 | + | 1 | + +-----+ + "### + ); Ok(()) } @@ -3155,14 +3163,17 @@ async fn filter_with_alias_overwrite() -> Result<()> { .unwrap(); let results = df.collect().await.unwrap(); - #[rustfmt::skip] - let expected = ["+------+", - "| a |", - "+------+", - "| true |", - "| true |", - "+------+"]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +------+ + | a | + +------+ + | true | + | true | + +------+ + "### + ); Ok(()) } @@ -3188,16 +3199,19 @@ async fn select_with_alias_overwrite() -> Result<()> { let results = df.collect().await?; - #[rustfmt::skip] - let expected = ["+-------+", - "| a |", - "+-------+", - "| false |", - "| true |", - "| true |", - "| false |", - "+-------+"]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-------+ + | a | + +-------+ + | false | + | true | + | true | + | false | + +-------+ + "### + ); Ok(()) } @@ -3220,24 +3234,26 @@ async fn test_grouping_sets() -> Result<()> { let results = df.collect().await?; - let expected = vec![ - "+-----------+-----+---------------+", - "| a | b | count(test.a) |", - "+-----------+-----+---------------+", - "| | 100 | 1 |", - "| | 10 | 2 |", - "| | 1 | 1 |", - "| abcDEF | | 1 |", - "| abcDEF | 1 | 1 |", - "| abc123 | | 1 |", - "| abc123 | 10 | 1 |", - "| CBAdef | | 1 |", - "| CBAdef | 10 | 1 |", - "| 123AbcDef | | 1 |", - "| 123AbcDef | 100 | 1 |", - "+-----------+-----+---------------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----------+-----+---------------+ + | a | b | count(test.a) | + +-----------+-----+---------------+ + | | 100 | 1 | + | | 10 | 2 | + | | 1 | 1 | + | abcDEF | | 1 | + | abcDEF | 1 | 1 | + | abc123 | | 1 | + | abc123 | 10 | 1 | + | CBAdef | | 1 | + | CBAdef | 10 | 1 | + | 123AbcDef | | 1 | + | 123AbcDef | 100 | 1 | + +-----------+-----+---------------+ + "### + ); Ok(()) } @@ -3261,23 +3277,25 @@ async fn test_grouping_sets_count() -> Result<()> { let results = df.collect().await?; - let expected = vec![ - "+----+----+-----------------+", - "| c1 | c2 | count(Int32(1)) |", - "+----+----+-----------------+", - "| | 5 | 14 |", - "| | 4 | 23 |", - "| | 3 | 19 |", - "| | 2 | 22 |", - "| | 1 | 22 |", - "| e | | 21 |", - "| d | | 18 |", - "| c | | 21 |", - "| b | | 19 |", - "| a | | 21 |", - 
"+----+----+-----------------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +----+----+-----------------+ + | c1 | c2 | count(Int32(1)) | + +----+----+-----------------+ + | | 5 | 14 | + | | 4 | 23 | + | | 3 | 19 | + | | 2 | 22 | + | | 1 | 22 | + | e | | 21 | + | d | | 18 | + | c | | 21 | + | b | | 19 | + | a | | 21 | + +----+----+-----------------+ + "### + ); Ok(()) } @@ -3308,48 +3326,50 @@ async fn test_grouping_set_array_agg_with_overflow() -> Result<()> { let results = df.collect().await?; - let expected = vec![ - "+----+----+--------+---------------------+", - "| c1 | c2 | sum_c3 | avg_c3 |", - "+----+----+--------+---------------------+", - "| | 5 | -194 | -13.857142857142858 |", - "| | 4 | 29 | 1.2608695652173914 |", - "| | 3 | 395 | 20.789473684210527 |", - "| | 2 | 184 | 8.363636363636363 |", - "| | 1 | 367 | 16.681818181818183 |", - "| e | | 847 | 40.333333333333336 |", - "| e | 5 | -22 | -11.0 |", - "| e | 4 | 261 | 37.285714285714285 |", - "| e | 3 | 192 | 48.0 |", - "| e | 2 | 189 | 37.8 |", - "| e | 1 | 227 | 75.66666666666667 |", - "| d | | 458 | 25.444444444444443 |", - "| d | 5 | -99 | -49.5 |", - "| d | 4 | 162 | 54.0 |", - "| d | 3 | 124 | 41.333333333333336 |", - "| d | 2 | 328 | 109.33333333333333 |", - "| d | 1 | -57 | -8.142857142857142 |", - "| c | | -28 | -1.3333333333333333 |", - "| c | 5 | 24 | 12.0 |", - "| c | 4 | -43 | -10.75 |", - "| c | 3 | 190 | 47.5 |", - "| c | 2 | -389 | -55.57142857142857 |", - "| c | 1 | 190 | 47.5 |", - "| b | | -111 | -5.842105263157895 |", - "| b | 5 | -1 | -0.2 |", - "| b | 4 | -223 | -44.6 |", - "| b | 3 | -84 | -42.0 |", - "| b | 2 | 102 | 25.5 |", - "| b | 1 | 95 | 31.666666666666668 |", - "| a | | -385 | -18.333333333333332 |", - "| a | 5 | -96 | -32.0 |", - "| a | 4 | -128 | -32.0 |", - "| a | 3 | -27 | -4.5 |", - "| a | 2 | -46 | -15.333333333333334 |", - "| a | 1 | -88 | -17.6 |", - "+----+----+--------+---------------------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +----+----+--------+---------------------+ + | c1 | c2 | sum_c3 | avg_c3 | + +----+----+--------+---------------------+ + | | 5 | -194 | -13.857142857142858 | + | | 4 | 29 | 1.2608695652173914 | + | | 3 | 395 | 20.789473684210527 | + | | 2 | 184 | 8.363636363636363 | + | | 1 | 367 | 16.681818181818183 | + | e | | 847 | 40.333333333333336 | + | e | 5 | -22 | -11.0 | + | e | 4 | 261 | 37.285714285714285 | + | e | 3 | 192 | 48.0 | + | e | 2 | 189 | 37.8 | + | e | 1 | 227 | 75.66666666666667 | + | d | | 458 | 25.444444444444443 | + | d | 5 | -99 | -49.5 | + | d | 4 | 162 | 54.0 | + | d | 3 | 124 | 41.333333333333336 | + | d | 2 | 328 | 109.33333333333333 | + | d | 1 | -57 | -8.142857142857142 | + | c | | -28 | -1.3333333333333333 | + | c | 5 | 24 | 12.0 | + | c | 4 | -43 | -10.75 | + | c | 3 | 190 | 47.5 | + | c | 2 | -389 | -55.57142857142857 | + | c | 1 | 190 | 47.5 | + | b | | -111 | -5.842105263157895 | + | b | 5 | -1 | -0.2 | + | b | 4 | -223 | -44.6 | + | b | 3 | -84 | -42.0 | + | b | 2 | 102 | 25.5 | + | b | 1 | 95 | 31.666666666666668 | + | a | | -385 | -18.333333333333332 | + | a | 5 | -96 | -32.0 | + | a | 4 | -128 | -32.0 | + | a | 3 | -27 | -4.5 | + | a | 2 | -46 | -15.333333333333334 | + | a | 1 | -88 | -17.6 | + +----+----+--------+---------------------+ + "### + ); Ok(()) } @@ -3625,16 +3645,18 @@ async fn unnest_dict_encoded_columns() -> Result<()> { .unnest_columns(&["make_array_expr"])?; let results = 
df.collect().await.unwrap(); - let expected = [ - "+-----------------+---------+", - "| make_array_expr | column1 |", - "+-----------------+---------+", - "| x | x |", - "| y | y |", - "| z | z |", - "+-----------------+---------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----------------+---------+ + | make_array_expr | column1 | + +-----------------+---------+ + | x | x | + | y | y | + | z | z | + +-----------------+---------+ + "### + ); // make_array(dict_encoded_string,literal string) let make_array_udf_expr2 = make_array_udf().call(vec![ @@ -3651,19 +3673,21 @@ async fn unnest_dict_encoded_columns() -> Result<()> { .unnest_columns(&["make_array_expr"])?; let results = df.collect().await.unwrap(); - let expected = [ - "+-----------------+---------+", - "| make_array_expr | column1 |", - "+-----------------+---------+", - "| x | x |", - "| fixed_string | x |", - "| y | y |", - "| fixed_string | y |", - "| z | z |", - "| fixed_string | z |", - "+-----------------+---------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----------------+---------+ + | make_array_expr | column1 | + +-----------------+---------+ + | x | x | + | fixed_string | x | + | y | y | + | fixed_string | y | + | z | z | + | fixed_string | z | + +-----------------+---------+ + "### + ); Ok(()) } @@ -3671,17 +3695,19 @@ async fn unnest_dict_encoded_columns() -> Result<()> { async fn unnest_column_nulls() -> Result<()> { let df = table_with_lists_and_nulls().await?; let results = df.clone().collect().await?; - let expected = [ - "+--------+----+", - "| list | id |", - "+--------+----+", - "| [1, 2] | A |", - "| | B |", - "| [] | C |", - "| [3] | D |", - "+--------+----+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +--------+----+ + | list | id | + +--------+----+ + | [1, 2] | A | + | | B | + | [] | C | + | [3] | D | + +--------+----+ + "### + ); // Unnest, preserving nulls (row with B is preserved) let options = UnnestOptions::new().with_preserve_nulls(true); @@ -3691,33 +3717,37 @@ async fn unnest_column_nulls() -> Result<()> { .unnest_columns_with_options(&["list"], options)? .collect() .await?; - let expected = [ - "+------+----+", - "| list | id |", - "+------+----+", - "| 1 | A |", - "| 2 | A |", - "| | B |", - "| 3 | D |", - "+------+----+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +------+----+ + | list | id | + +------+----+ + | 1 | A | + | 2 | A | + | | B | + | 3 | D | + +------+----+ + "### + ); let options = UnnestOptions::new().with_preserve_nulls(false); let results = df .unnest_columns_with_options(&["list"], options)? 
.collect() .await?; - let expected = [ - "+------+----+", - "| list | id |", - "+------+----+", - "| 1 | A |", - "| 2 | A |", - "| 3 | D |", - "+------+----+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +------+----+ + | list | id | + +------+----+ + | 1 | A | + | 2 | A | + | 3 | D | + +------+----+ + "### + ); Ok(()) } @@ -4178,22 +4208,24 @@ async fn unnest_multiple_columns() -> Result<()> { // large_list: [null, 1.1], [2.2, 3.3, 4.4], null, [], // fixed_list: null, [1,2], [3,4], null // string: a, b, c, d - let expected = [ - "+------+------------+------------+--------+", - "| list | large_list | fixed_list | string |", - "+------+------------+------------+--------+", - "| 1 | | | a |", - "| 2 | 1.1 | | a |", - "| 3 | | | a |", - "| | 2.2 | 1 | b |", - "| | 3.3 | 2 | b |", - "| | 4.4 | | b |", - "| | | 3 | c |", - "| | | 4 | c |", - "| | | | d |", - "+------+------------+------------+--------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +------+------------+------------+--------+ + | list | large_list | fixed_list | string | + +------+------------+------------+--------+ + | 1 | | | a | + | 2 | 1.1 | | a | + | 3 | | | a | + | | 2.2 | 1 | b | + | | 3.3 | 2 | b | + | | 4.4 | | b | + | | | 3 | c | + | | | 4 | c | + | | | | d | + +------+------------+------------+--------+ + "### + ); // Test with `preserve_nulls = false`` let results = df @@ -4207,21 +4239,23 @@ async fn unnest_multiple_columns() -> Result<()> { // large_list: [null, 1.1], [2.2, 3.3, 4.4], null, [], // fixed_list: null, [1,2], [3,4], null // string: a, b, c, d - let expected = [ - "+------+------------+------------+--------+", - "| list | large_list | fixed_list | string |", - "+------+------------+------------+--------+", - "| 1 | | | a |", - "| 2 | 1.1 | | a |", - "| 3 | | | a |", - "| | 2.2 | 1 | b |", - "| | 3.3 | 2 | b |", - "| | 4.4 | | b |", - "| | | 3 | c |", - "| | | 4 | c |", - "+------+------------+------------+--------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +------+------------+------------+--------+ + | list | large_list | fixed_list | string | + +------+------------+------------+--------+ + | 1 | | | a | + | 2 | 1.1 | | a | + | 3 | | | a | + | | 2.2 | 1 | b | + | | 3.3 | 2 | b | + | | 4.4 | | b | + | | | 3 | c | + | | | 4 | c | + +------+------------+------------+--------+ + "### + ); Ok(()) } @@ -4247,18 +4281,18 @@ async fn unnest_non_nullable_list() -> Result<()> { .collect() .await?; - // Unnesting may produce NULLs even if the list is non-nullable. 
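+    // In the pretty-printed form produced by `batches_to_string`, that NULL
+    // appears as the empty cell in the snapshot below.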
- #[rustfmt::skip] - let expected = [ - "+----+", - "| c1 |", - "+----+", - "| 1 |", - "| 2 |", - "| |", - "+----+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +----+ + | c1 | + +----+ + | 1 | + | 2 | + | | + +----+ + "### + ); Ok(()) } @@ -4695,14 +4729,16 @@ async fn test_array_agg() -> Result<()> { let results = df.collect().await?; - let expected = [ - "+-------------------------------------+", - "| array_agg(test.a) |", - "+-------------------------------------+", - "| [abcDEF, abc123, CBAdef, 123AbcDef] |", - "+-------------------------------------+", - ]; - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-------------------------------------+ + | array_agg(test.a) | + +-------------------------------------+ + | [abcDEF, abc123, CBAdef, 123AbcDef] | + +-------------------------------------+ + "### + ); Ok(()) } @@ -4767,13 +4803,13 @@ async fn test_dataframe_placeholder_missing_param_values() -> Result<()> { ); // N.B., the test is basically `SELECT 1 as a WHERE a = 3;` which returns no results. - #[rustfmt::skip] - let expected = [ - "++", - "++" - ]; - - assert_batches_eq!(expected, &df.collect().await.unwrap()); + assert_snapshot!( + batches_to_string(&df.collect().await.unwrap()), + @r###" + ++ + ++ + "### + ); Ok(()) } @@ -4829,16 +4865,16 @@ async fn test_dataframe_placeholder_column_parameter() -> Result<()> { "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" ); - #[rustfmt::skip] - let expected = [ - "+----+", - "| $1 |", - "+----+", - "| 3 |", - "+----+" - ]; - - assert_batches_eq!(expected, &df.collect().await.unwrap()); + assert_snapshot!( + batches_to_string(&df.collect().await.unwrap()), + @r###" + +----+ + | $1 | + +----+ + | 3 | + +----+ + "### + ); Ok(()) } @@ -4902,16 +4938,16 @@ async fn test_dataframe_placeholder_like_expression() -> Result<()> { "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n" ); - #[rustfmt::skip] - let expected = [ - "+-----+", - "| a |", - "+-----+", - "| foo |", - "+-----+" - ]; - - assert_batches_eq!(expected, &df.collect().await.unwrap()); + assert_snapshot!( + batches_to_string(&df.collect().await.unwrap()), + @r###" + +-----+ + | a | + +-----+ + | foo | + +-----+ + "### + ); Ok(()) } @@ -4966,9 +5002,16 @@ async fn write_partitioned_parquet_results() -> Result<()> { // Check that the c2 column is gone and that c1 is abc. 
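+    // `batches_to_string` (added to `datafusion_common::test_util` earlier in
+    // this series) renders the collected batches as the single string the
+    // snapshot is compared against.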
let results = filter_df.collect().await?; - let expected = ["+-----+", "| c1 |", "+-----+", "| abc |", "+-----+"]; - - assert_batches_eq!(expected, &results); + assert_snapshot!( + batches_to_string(&results), + @r###" + +-----+ + | c1 | + +-----+ + | abc | + +-----+ + "### + ); // Read the entire set of parquet files let df = ctx @@ -5254,32 +5297,37 @@ async fn boolean_dictionary_as_filter() { let df = ctx.table("dict_batch").await.unwrap(); // view_all - let expected = [ - "+---------+", - "| my_dict |", - "+---------+", - "| true |", - "| true |", - "| false |", - "| |", - "| false |", - "| true |", - "| false |", - "+---------+", - ]; - assert_batches_eq!(expected, &df.clone().collect().await.unwrap()); + assert_snapshot!( + batches_to_string(&df.clone().collect().await.unwrap()), + @r###" + +---------+ + | my_dict | + +---------+ + | true | + | true | + | false | + | | + | false | + | true | + | false | + +---------+ + "### + ); let result_df = df.clone().filter(col("my_dict")).unwrap(); - let expected = [ - "+---------+", - "| my_dict |", - "+---------+", - "| true |", - "| true |", - "| true |", - "+---------+", - ]; - assert_batches_eq!(expected, &result_df.collect().await.unwrap()); + + assert_snapshot!( + batches_to_string(&result_df.collect().await.unwrap()), + @r###" + +---------+ + | my_dict | + +---------+ + | true | + | true | + | true | + +---------+ + "### + ); // test nested dictionary let keys = vec![0, 2]; // 0 -> true, 2 -> false @@ -5307,27 +5355,29 @@ async fn boolean_dictionary_as_filter() { let df = ctx.table("nested_dict_batch").await.unwrap(); // view_all - let expected = [ - "+----------------+", - "| my_nested_dict |", - "+----------------+", - "| true |", - "| false |", - "+----------------+", - ]; - - assert_batches_eq!(expected, &df.clone().collect().await.unwrap()); + assert_snapshot!( + batches_to_string(&df.clone().collect().await.unwrap()), + @r###" + +----------------+ + | my_nested_dict | + +----------------+ + | true | + | false | + +----------------+ + "### + ); let result_df = df.clone().filter(col("my_nested_dict")).unwrap(); - let expected = [ - "+----------------+", - "| my_nested_dict |", - "+----------------+", - "| true |", - "+----------------+", - ]; - - assert_batches_eq!(expected, &result_df.collect().await.unwrap()); + assert_snapshot!( + batches_to_string(&result_df.collect().await.unwrap()), + @r###" + +----------------+ + | my_nested_dict | + +----------------+ + | true | + +----------------+ + "### + ); } #[tokio::test] @@ -5732,11 +5782,16 @@ async fn test_insert_into_casting_support() -> Result<()> { .await .unwrap(); - // The result should be the same as the input which is ['a123', 'b456'] - let expected = [ - "+------+", "| a |", "+------+", "| a123 |", "| b456 |", "+------+", - ]; - - assert_batches_eq!(expected, &res); + assert_snapshot!( + batches_to_string(&res), + @r###" + +------+ + | a | + +------+ + | a123 | + | b456 | + +------+ + "### + ); Ok(()) } From 124c0cba74384a533e14d442ff18ed36d1905feb Mon Sep 17 00:00:00 2001 From: Dmitrii Blaginin Date: Tue, 11 Mar 2025 20:56:06 +0000 Subject: [PATCH 4/6] Switch from `assert_batches_sorted_eq` --- datafusion/common/src/test_util.rs | 18 +- datafusion/core/tests/dataframe/mod.rs | 1711 +++++++++++++----------- 2 files changed, 925 insertions(+), 804 deletions(-) diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs index 04979780ddf8..b801c452af2c 100644 --- a/datafusion/common/src/test_util.rs +++ b/datafusion/common/src/test_util.rs @@ -77,12 
+77,28 @@ macro_rules! assert_batches_eq { } pub fn batches_to_string(batches: &[RecordBatch]) -> String { + let actual = pretty_format_batches_with_options(batches, &DEFAULT_FORMAT_OPTIONS) + .unwrap() + .to_string(); + + actual.trim().to_string() +} + +pub fn batches_to_sort_string(batches: &[RecordBatch]) -> String { let actual_lines = pretty_format_batches_with_options(batches, &DEFAULT_FORMAT_OPTIONS) .unwrap() .to_string(); - actual_lines.trim().to_string() + let mut actual_lines: Vec<&str> = actual_lines.trim().lines().collect(); + + // sort except for header + footer + let num_lines = actual_lines.len(); + if num_lines > 3 { + actual_lines.as_mut_slice()[2..num_lines - 1].sort_unstable() + } + + actual_lines.join("\n") } /// Compares formatted output of a record batch with an expected diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index 0189d5fb1944..5a30f713047f 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -47,7 +47,6 @@ use std::sync::Arc; use tempfile::TempDir; use url::Url; -use datafusion::assert_batches_sorted_eq; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::datasource::MemTable; use datafusion::error::Result; @@ -62,7 +61,7 @@ use datafusion::test_util::{ test_table_with_name, }; use datafusion_catalog::TableProvider; -use datafusion_common::test_util::batches_to_string; +use datafusion_common::test_util::{batches_to_sort_string, batches_to_string}; use datafusion_common::{ assert_contains, Constraint, Constraints, DataFusionError, ParamValues, ScalarValue, TableReference, UnnestOptions, @@ -329,9 +328,16 @@ async fn select_with_periods() -> Result<()> { let df_results = df.collect().await?; - assert_batches_sorted_eq!( - ["+------+", "| f.c1 |", "+------+", "| 1 |", "| 10 |", "+------+"], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +------+ + | f.c1 | + +------+ + | 1 | + | 10 | + +------+ + "### ); Ok(()) @@ -428,16 +434,16 @@ async fn drop_with_quotes() -> Result<()> { let df_results = df.collect().await?; - assert_batches_sorted_eq!( - [ - "+------+", - "| f\"c2 |", - "+------+", - "| 2 |", - "| 11 |", - "+------+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +------+ + | f"c2 | + +------+ + | 11 | + | 2 | + +------+ + "### ); Ok(()) @@ -460,9 +466,16 @@ async fn drop_with_periods() -> Result<()> { let df_results = df.collect().await?; - assert_batches_sorted_eq!( - ["+------+", "| f.c2 |", "+------+", "| 2 |", "| 11 |", "+------+"], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +------+ + | f.c2 | + +------+ + | 11 | + | 2 | + +------+ + "### ); Ok(()) @@ -484,18 +497,20 @@ async fn aggregate() -> Result<()> { let df: Vec = df.aggregate(group_expr, aggr_expr)?.collect().await?; - assert_batches_sorted_eq!( - ["+----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+", - "| c1 | min(aggregate_test_100.c12) | max(aggregate_test_100.c12) | avg(aggregate_test_100.c12) | sum(aggregate_test_100.c12) | count(aggregate_test_100.c12) | count(DISTINCT aggregate_test_100.c12) |", - "+----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+", - "| a 
| 0.02182578039211991 | 0.9800193410444061 | 0.48754517466109415 | 10.238448667882977 | 21 | 21 |", - "| b | 0.04893135681998029 | 0.9185813970744787 | 0.41040709263815384 | 7.797734760124923 | 19 | 19 |", - "| c | 0.0494924465469434 | 0.991517828651004 | 0.6600456536439784 | 13.860958726523545 | 21 | 21 |", - "| d | 0.061029375346466685 | 0.9748360509016578 | 0.48855379387549824 | 8.793968289758968 | 18 | 18 |", - "| e | 0.01479305307777301 | 0.9965400387585364 | 0.48600669271341534 | 10.206140546981722 | 21 | 21 |", - "+----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+"], - &df - ); + assert_snapshot!( + batches_to_sort_string(&df), + @r###" + +----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+ + | c1 | min(aggregate_test_100.c12) | max(aggregate_test_100.c12) | avg(aggregate_test_100.c12) | sum(aggregate_test_100.c12) | count(aggregate_test_100.c12) | count(DISTINCT aggregate_test_100.c12) | + +----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+ + | a | 0.02182578039211991 | 0.9800193410444061 | 0.48754517466109415 | 10.238448667882977 | 21 | 21 | + | b | 0.04893135681998029 | 0.9185813970744787 | 0.41040709263815384 | 7.797734760124923 | 19 | 19 | + | c | 0.0494924465469434 | 0.991517828651004 | 0.6600456536439784 | 13.860958726523545 | 21 | 21 | + | d | 0.061029375346466685 | 0.9748360509016578 | 0.48855379387549824 | 8.793968289758968 | 18 | 18 | + | e | 0.01479305307777301 | 0.9965400387585364 | 0.48600669271341534 | 10.206140546981722 | 21 | 21 | + +----+-----------------------------+-----------------------------+-----------------------------+-----------------------------+-------------------------------+----------------------------------------+ + "### + ); Ok(()) } @@ -552,16 +567,16 @@ async fn test_aggregate_with_pk() -> Result<()> { let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!([ - "+----+------+", - "| id | name |", - "+----+------+", - "| 1 | a |", - "+----+------+" - ], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+------+ + | id | name | + +----+------+ + | 1 | a | + +----+------+ + "### + ); Ok(()) } @@ -597,15 +612,16 @@ async fn test_aggregate_with_pk2() -> Result<()> { // even if it is not part of the group by expression. let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!( - ["+----+------+", - "| id | name |", - "+----+------+", - "| 1 | a |", - "+----+------+",], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+------+ + | id | name | + +----+------+ + | 1 | a | + +----+------+ + "### + ); Ok(()) } @@ -645,15 +661,16 @@ async fn test_aggregate_with_pk3() -> Result<()> { // even if it is not part of the group by expression. 
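+    // `batches_to_sort_string` sorts every rendered line except the header
+    // block and the final border, so the comparison stays order-insensitive,
+    // matching the old `assert_batches_sorted_eq!` behavior.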
let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!( - ["+----+------+", - "| id | name |", - "+----+------+", - "| 1 | a |", - "+----+------+",], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+------+ + | id | name | + +----+------+ + | 1 | a | + +----+------+ + "### + ); Ok(()) } @@ -693,15 +710,16 @@ async fn test_aggregate_with_pk4() -> Result<()> { let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!([ - "+----+", - "| id |", - "+----+", - "| 1 |", - "+----+",], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+ + | id | + +----+ + | 1 | + +----+ + "### + ); Ok(()) } @@ -720,20 +738,20 @@ async fn test_aggregate_alias() -> Result<()> { let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!([ - "+----+", - "| c2 |", - "+----+", - "| 2 |", - "| 3 |", - "| 4 |", - "| 5 |", - "| 6 |", - "+----+", - ], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+ + | c2 | + +----+ + | 2 | + | 3 | + | 4 | + | 5 | + | 6 | + +----+ + "### + ); Ok(()) } @@ -767,22 +785,20 @@ async fn test_aggregate_with_union() -> Result<()> { let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!( - [ - "+----+------------+", - "| c1 | sum_result |", - "+----+------------+", - "| a | 84 |", - "| b | 69 |", - "| c | 124 |", - "| d | 126 |", - "| e | 121 |", - "+----+------------+" - ], - &df_results - ); - + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+------------+ + | c1 | sum_result | + +----+------------+ + | a | 84 | + | b | 69 | + | c | 124 | + | d | 126 | + | e | 121 | + +----+------------+ + "### + ); Ok(()) } @@ -805,20 +821,20 @@ async fn test_aggregate_subexpr() -> Result<()> { let df_results = df.collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!([ - "+----------------+------+", - "| c2 + Int32(10) | sum |", - "+----------------+------+", - "| 12 | 431 |", - "| 13 | 248 |", - "| 14 | 453 |", - "| 15 | 95 |", - "| 16 | -146 |", - "+----------------+------+", - ], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----------------+------+ + | c2 + Int32(10) | sum | + +----------------+------+ + | 12 | 431 | + | 13 | 248 | + | 14 | 453 | + | 15 | 95 | + | 16 | -146 | + +----------------+------+ + "### + ); Ok(()) } @@ -899,36 +915,36 @@ async fn window_using_aggregates() -> Result<()> { let df: Vec = df.select(aggr_expr)?.collect().await?; - assert_batches_sorted_eq!( - [ - "+-------------+----------+-----------------+---------------+--------+-----+------+----+------+", - "| first_value | last_val | approx_distinct | approx_median | median | max | min | c2 | c3 |", - "+-------------+----------+-----------------+---------------+--------+-----+------+----+------+", - "| | | | | | | | 1 | -85 |", - "| -85 | -101 | 14 | -12 | -101 | 83 | -101 | 4 | -54 |", - "| -85 | -101 | 17 | -25 | -101 | 83 | -101 | 5 | -31 |", - "| -85 | -12 | 10 | -32 | -12 | 83 | -85 | 3 | 13 |", - "| -85 | -25 | 3 | -56 | -25 | -25 | -85 | 1 | -5 |", - "| -85 | -31 | 18 | -29 | -31 | 83 | -101 | 5 | 36 |", - "| -85 | -38 | 16 | -25 | -38 | 83 | -101 | 4 | 65 |", - "| -85 | -43 | 7 | -43 | -43 | 83 | -85 | 2 | 45 |", - "| -85 | -48 | 6 | -35 | -48 | 83 | -85 | 2 | -43 |", - "| -85 | -5 | 4 | -37 | -5 | -5 | -85 | 1 | 83 |", - "| -85 | -54 | 15 | -17 | -54 | 83 | -101 | 4 | -38 |", - "| -85 | -56 | 2 | 
-70 | -56 | -56 | -85 | 1 | -25 |", - "| -85 | -72 | 9 | -43 | -72 | 83 | -85 | 3 | -12 |", - "| -85 | -85 | 1 | -85 | -85 | -85 | -85 | 1 | -56 |", - "| -85 | 13 | 11 | -17 | 13 | 83 | -85 | 3 | 14 |", - "| -85 | 13 | 11 | -25 | 13 | 83 | -85 | 3 | 13 |", - "| -85 | 14 | 12 | -12 | 14 | 83 | -85 | 3 | 17 |", - "| -85 | 17 | 13 | -11 | 17 | 83 | -85 | 4 | -101 |", - "| -85 | 45 | 8 | -34 | 45 | 83 | -85 | 3 | -72 |", - "| -85 | 65 | 17 | -17 | 65 | 83 | -101 | 5 | -101 |", - "| -85 | 83 | 5 | -25 | 83 | 83 | -85 | 2 | -48 |", - "+-------------+----------+-----------------+---------------+--------+-----+------+----+------+", - ], - &df - ); + assert_snapshot!( + batches_to_sort_string(&df), + @r###" + +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ + | first_value | last_val | approx_distinct | approx_median | median | max | min | c2 | c3 | + +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ + | | | | | | | | 1 | -85 | + | -85 | -101 | 14 | -12 | -101 | 83 | -101 | 4 | -54 | + | -85 | -101 | 17 | -25 | -101 | 83 | -101 | 5 | -31 | + | -85 | -12 | 10 | -32 | -12 | 83 | -85 | 3 | 13 | + | -85 | -25 | 3 | -56 | -25 | -25 | -85 | 1 | -5 | + | -85 | -31 | 18 | -29 | -31 | 83 | -101 | 5 | 36 | + | -85 | -38 | 16 | -25 | -38 | 83 | -101 | 4 | 65 | + | -85 | -43 | 7 | -43 | -43 | 83 | -85 | 2 | 45 | + | -85 | -48 | 6 | -35 | -48 | 83 | -85 | 2 | -43 | + | -85 | -5 | 4 | -37 | -5 | -5 | -85 | 1 | 83 | + | -85 | -54 | 15 | -17 | -54 | 83 | -101 | 4 | -38 | + | -85 | -56 | 2 | -70 | -56 | -56 | -85 | 1 | -25 | + | -85 | -72 | 9 | -43 | -72 | 83 | -85 | 3 | -12 | + | -85 | -85 | 1 | -85 | -85 | -85 | -85 | 1 | -56 | + | -85 | 13 | 11 | -17 | 13 | 83 | -85 | 3 | 14 | + | -85 | 13 | 11 | -25 | 13 | 83 | -85 | 3 | 13 | + | -85 | 14 | 12 | -12 | 14 | 83 | -85 | 3 | 17 | + | -85 | 17 | 13 | -11 | 17 | 83 | -85 | 4 | -101 | + | -85 | 45 | 8 | -34 | 45 | 83 | -85 | 3 | -72 | + | -85 | 65 | 17 | -17 | 65 | 83 | -101 | 5 | -101 | + | -85 | 83 | 5 | -25 | 83 | 83 | -85 | 2 | -48 | + +-------------+----------+-----------------+---------------+--------+-----+------+----+------+ + "### + ); Ok(()) } @@ -981,19 +997,20 @@ async fn test_distinct_sort_by() -> Result<()> { let df_results = plan.clone().collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!( - ["+----+", - "| c1 |", - "+----+", - "| a |", - "| b |", - "| c |", - "| d |", - "| e |", - "+----+"], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+ + | c1 | + +----+ + | a | + | b | + | c | + | d | + | e | + +----+ + "### + ); Ok(()) } @@ -1028,19 +1045,20 @@ async fn test_distinct_on() -> Result<()> { let df_results = plan.clone().collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!( - ["+----+", - "| c1 |", - "+----+", - "| a |", - "| b |", - "| c |", - "| d |", - "| e |", - "+----+"], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+ + | c1 | + +----+ + | a | + | b | + | c | + | d | + | e | + +----+ + "### + ); Ok(()) } @@ -1062,19 +1080,20 @@ async fn test_distinct_on_sort_by() -> Result<()> { let df_results = plan.clone().collect().await?; - #[rustfmt::skip] - assert_batches_sorted_eq!( - ["+----+", - "| c1 |", - "+----+", - "| a |", - "| b |", - "| c |", - "| d |", - "| e |", - "+----+"], - &df_results - ); + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+ + | c1 | + +----+ + | a | + | b | + | c | + | d | + | e | + +----+ + 
"### + ); Ok(()) } @@ -1135,15 +1154,15 @@ async fn join_coercion_unnamed() -> Result<()> { let join = right.join(left, JoinType::LeftAnti, &cols, &cols, filter)?; let results = join.collect().await?; - assert_batches_sorted_eq!( - [ - "+----+------+", - "| id | name |", - "+----+------+", - "| 10 | d |", - "+----+------+", - ], - &results + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----+------+ + | id | name | + +----+------+ + | 10 | d | + +----+------+ + "### ); Ok(()) } @@ -1355,35 +1374,35 @@ async fn register_table() -> Result<()> { .await?; let table_results = &table.aggregate(group_expr, aggr_expr)?.collect().await?; - assert_batches_sorted_eq!( - [ - "+----+-----------------------------+", - "| c1 | sum(aggregate_test_100.c12) |", - "+----+-----------------------------+", - "| a | 10.238448667882977 |", - "| b | 7.797734760124923 |", - "| c | 13.860958726523545 |", - "| d | 8.793968289758968 |", - "| e | 10.206140546981722 |", - "+----+-----------------------------+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+-----------------------------+ + | c1 | sum(aggregate_test_100.c12) | + +----+-----------------------------+ + | a | 10.238448667882977 | + | b | 7.797734760124923 | + | c | 13.860958726523545 | + | d | 8.793968289758968 | + | e | 10.206140546981722 | + +----+-----------------------------+ + "### ); // the results are the same as the results from the view, modulo the leaf table name - assert_batches_sorted_eq!( - [ - "+----+---------------------+", - "| c1 | sum(test_table.c12) |", - "+----+---------------------+", - "| a | 10.238448667882977 |", - "| b | 7.797734760124923 |", - "| c | 13.860958726523545 |", - "| d | 8.793968289758968 |", - "| e | 10.206140546981722 |", - "+----+---------------------+" - ], - table_results + assert_snapshot!( + batches_to_sort_string(table_results), + @r###" + +----+---------------------+ + | c1 | sum(test_table.c12) | + +----+---------------------+ + | a | 10.238448667882977 | + | b | 7.797734760124923 | + | c | 13.860958726523545 | + | d | 8.793968289758968 | + | e | 10.206140546981722 | + +----+---------------------+ + "### ); Ok(()) } @@ -1413,20 +1432,20 @@ async fn with_column() -> Result<()> { // check that new column added let df_results = df.clone().collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+-----+-----+", - "| c1 | c2 | c3 | sum |", - "+----+----+-----+-----+", - "| a | 3 | -12 | -9 |", - "| a | 3 | -72 | -69 |", - "| a | 3 | 13 | 16 |", - "| a | 3 | 13 | 16 |", - "| a | 3 | 14 | 17 |", - "| a | 3 | 17 | 20 |", - "+----+----+-----+-----+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+----+-----+-----+ + | c1 | c2 | c3 | sum | + +----+----+-----+-----+ + | a | 3 | -12 | -9 | + | a | 3 | -72 | -69 | + | a | 3 | 13 | 16 | + | a | 3 | 13 | 16 | + | a | 3 | 14 | 17 | + | a | 3 | 17 | 20 | + +----+----+-----+-----+ + "### ); // check that col with the same name overwritten @@ -1436,20 +1455,20 @@ async fn with_column() -> Result<()> { .collect() .await?; - assert_batches_sorted_eq!( - [ - "+-----+----+-----+-----+", - "| c1 | c2 | c3 | sum |", - "+-----+----+-----+-----+", - "| -69 | 3 | -72 | -69 |", - "| -9 | 3 | -12 | -9 |", - "| 16 | 3 | 13 | 16 |", - "| 16 | 3 | 13 | 16 |", - "| 17 | 3 | 14 | 17 |", - "| 20 | 3 | 17 | 20 |", - "+-----+----+-----+-----+" - ], - &df_results_overwrite + assert_snapshot!( + batches_to_sort_string(&df_results_overwrite), + @r###" + +-----+----+-----+-----+ + | 
c1 | c2 | c3 | sum | + +-----+----+-----+-----+ + | -69 | 3 | -72 | -69 | + | -9 | 3 | -12 | -9 | + | 16 | 3 | 13 | 16 | + | 16 | 3 | 13 | 16 | + | 17 | 3 | 14 | 17 | + | 20 | 3 | 17 | 20 | + +-----+----+-----+-----+ + "### ); // check that col with the same name overwritten using same name as reference @@ -1459,20 +1478,20 @@ async fn with_column() -> Result<()> { .collect() .await?; - assert_batches_sorted_eq!( - [ - "+----+----+-----+-----+", - "| c1 | c2 | c3 | sum |", - "+----+----+-----+-----+", - "| a | 4 | -12 | -9 |", - "| a | 4 | -72 | -69 |", - "| a | 4 | 13 | 16 |", - "| a | 4 | 13 | 16 |", - "| a | 4 | 14 | 17 |", - "| a | 4 | 17 | 20 |", - "+----+----+-----+-----+" - ], - &df_results_overwrite_self + assert_snapshot!( + batches_to_sort_string(&df_results_overwrite_self), + @r###" + +----+----+-----+-----+ + | c1 | c2 | c3 | sum | + +----+----+-----+-----+ + | a | 4 | -12 | -9 | + | a | 4 | -72 | -69 | + | a | 4 | 13 | 16 | + | a | 4 | 13 | 16 | + | a | 4 | 14 | 17 | + | a | 4 | 17 | 20 | + +----+----+-----+-----+ + "### ); Ok(()) @@ -1498,16 +1517,16 @@ async fn test_window_function_with_column() -> Result<()> { assert_eq!(5, df.schema().fields().len()); let df_results = df.clone().collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+-----+-----+---+", - "| c1 | c2 | c3 | s | r |", - "+----+----+-----+-----+---+", - "| c | 2 | 1 | 3 | 1 |", - "| d | 5 | -40 | -35 | 2 |", - "+----+----+-----+-----+---+", - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+----+-----+-----+---+ + | c1 | c2 | c3 | s | r | + +----+----+-----+-----+---+ + | c | 2 | 1 | 3 | 1 | + | d | 5 | -40 | -35 | 2 | + +----+----+-----+-----+---+ + "### ); Ok(()) @@ -1540,15 +1559,15 @@ async fn with_column_join_same_columns() -> Result<()> { .limit(0, Some(1))?; let df_results = df.clone().collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+", - "| c1 | c1 |", - "+----+----+", - "| a | a |", - "+----+----+", - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+----+ + | c1 | c1 | + +----+----+ + | a | a | + +----+----+ + "### ); let df_with_column = df.clone().with_column("new_column", lit(true))?; @@ -1578,16 +1597,17 @@ async fn with_column_join_same_columns() -> Result<()> { let df_results = df_with_column.collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+------------+", - "| c1 | c1 | new_column |", - "+----+----+------------+", - "| a | a | true |", - "+----+----+------------+", - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+----+------------+ + | c1 | c1 | new_column | + +----+----+------------+ + | a | a | true | + +----+----+------------+ + "### ); + Ok(()) } @@ -1633,15 +1653,15 @@ async fn with_column_renamed() -> Result<()> { let batches = &df_sum_renamed.collect().await?; - assert_batches_sorted_eq!( - [ - "+-----+-----+-----+-------+", - "| one | two | c3 | total |", - "+-----+-----+-----+-------+", - "| a | 3 | -72 | -69 |", - "+-----+-----+-----+-------+", - ], - batches + assert_snapshot!( + batches_to_sort_string(batches), + @r###" + +-----+-----+-----+-------+ + | one | two | c3 | total | + +-----+-----+-----+-------+ + | a | 3 | -72 | -69 | + +-----+-----+-----+-------+ + "### ); Ok(()) @@ -1705,15 +1725,15 @@ async fn with_column_renamed_join() -> Result<()> { .limit(0, Some(1))?; let df_results = df.clone().collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+-----+----+----+-----+", - "| c1 | c2 | c3 | c1 
| c2 | c3 |", - "+----+----+-----+----+----+-----+", - "| a | 1 | -85 | a | 1 | -85 |", - "+----+----+-----+----+----+-----+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+----+-----+----+----+-----+ + | c1 | c2 | c3 | c1 | c2 | c3 | + +----+----+-----+----+----+-----+ + | a | 1 | -85 | a | 1 | -85 | + +----+----+-----+----+----+-----+ + "### ); let df_renamed = df.clone().with_column_renamed("t1.c1", "AAA")?; @@ -1741,15 +1761,15 @@ async fn with_column_renamed_join() -> Result<()> { let df_results = df_renamed.collect().await?; - assert_batches_sorted_eq!( - [ - "+-----+----+-----+----+----+-----+", - "| AAA | c2 | c3 | c1 | c2 | c3 |", - "+-----+----+-----+----+----+-----+", - "| a | 1 | -85 | a | 1 | -85 |", - "+-----+----+-----+----+----+-----+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +-----+----+-----+----+----+-----+ + | AAA | c2 | c3 | c1 | c2 | c3 | + +-----+----+-----+----+----+-----+ + | a | 1 | -85 | a | 1 | -85 | + +-----+----+-----+----+----+-----+ + "### ); Ok(()) @@ -1782,15 +1802,15 @@ async fn with_column_renamed_case_sensitive() -> Result<()> { let res = &df_renamed.clone().collect().await?; - assert_batches_sorted_eq!( - [ - "+---------+", - "| CoLuMn1 |", - "+---------+", - "| a |", - "+---------+" - ], - res + assert_snapshot!( + batches_to_sort_string(res), + @r###" + +---------+ + | CoLuMn1 | + +---------+ + | a | + +---------+ + "### ); let df_renamed = df_renamed @@ -1798,9 +1818,15 @@ async fn with_column_renamed_case_sensitive() -> Result<()> { .collect() .await?; - assert_batches_sorted_eq!( - ["+----+", "| c1 |", "+----+", "| a |", "+----+"], - &df_renamed + assert_snapshot!( + batches_to_sort_string(&df_renamed), + @r###" + +----+ + | c1 | + +----+ + | a | + +----+ + "### ); Ok(()) @@ -1816,15 +1842,15 @@ async fn cast_expr_test() -> Result<()> { let df_results = df.clone().collect().await?; df.clone().show().await?; - assert_batches_sorted_eq!( - [ - "+----+----+-----+", - "| c2 | c3 | sum |", - "+----+----+-----+", - "| 2 | 1 | 3 |", - "+----+----+-----+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +----+----+-----+ + | c2 | c3 | sum | + +----+----+-----+ + | 2 | 1 | 3 | + +----+----+-----+ + "### ); Ok(()) @@ -1882,16 +1908,16 @@ async fn with_column_name() -> Result<()> { let df_results = df.collect().await?; - assert_batches_sorted_eq!( - [ - "+------+-------+", - "| f.c1 | f.c2 |", - "+------+-------+", - "| 1 | hello |", - "| 10 | hello |", - "+------+-------+" - ], - &df_results + assert_snapshot!( + batches_to_sort_string(&df_results), + @r###" + +------+-------+ + | f.c1 | f.c2 | + +------+-------+ + | 1 | hello | + | 10 | hello | + +------+-------+ + "### ); Ok(()) @@ -1925,15 +1951,15 @@ async fn cache_test() -> Result<()> { let df_results = df.collect().await?; let cached_df_results = cached_df.collect().await?; - assert_batches_sorted_eq!( - [ - "+----+----+-----+", - "| c2 | c3 | sum |", - "+----+----+-----+", - "| 2 | 1 | 3 |", - "+----+----+-----+" - ], - &cached_df_results + assert_snapshot!( + batches_to_sort_string(&cached_df_results), + @r###" + +----+----+-----+ + | c2 | c3 | sum | + +----+----+-----+ + | 2 | 1 | 3 | + +----+----+-----+ + "### ); assert_eq!(&df_results, &cached_df_results); @@ -3416,16 +3442,17 @@ async fn join_with_alias_filter() -> Result<()> { ); let results = df.collect().await?; - let expected: Vec<&str> = vec![ - "+----+----+---+----+---+---+", - "| a | a | b | c | b | c |", 
- "+----+----+---+----+---+---+", - "| 11 | 13 | c | 30 | c | 3 |", - "| 1 | 3 | a | 10 | a | 1 |", - "+----+----+---+----+---+---+", - ]; - - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----+----+---+----+---+---+ + | a | a | b | c | b | c | + +----+----+---+----+---+---+ + | 1 | 3 | a | 10 | a | 1 | + | 11 | 13 | c | 30 | c | 3 | + +----+----+---+----+---+---+ + "### + ); Ok(()) } @@ -3463,15 +3490,18 @@ async fn right_semi_with_alias_filter() -> Result<()> { ); let results = df.collect().await?; - let expected: Vec<&str> = vec![ - "+-----+---+---+", - "| a | b | c |", - "+-----+---+---+", - "| 10 | b | 2 |", - "| 100 | d | 4 |", - "+-----+---+---+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +-----+---+---+ + | a | b | c | + +-----+---+---+ + | 10 | b | 2 | + | 100 | d | 4 | + +-----+---+---+ + "### + ); + Ok(()) } @@ -3507,15 +3537,18 @@ async fn right_anti_filter_push_down() -> Result<()> { ); let results = df.collect().await?; - let expected: Vec<&str> = vec![ - "+----+---+---+", - "| a | b | c |", - "+----+---+---+", - "| 13 | c | 3 |", - "| 3 | a | 1 |", - "+----+---+---+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----+---+---+ + | a | b | c | + +----+---+---+ + | 13 | c | 3 | + | 3 | a | 1 | + +----+---+---+ + "### + ); + Ok(()) } @@ -3524,33 +3557,39 @@ async fn unnest_columns() -> Result<()> { const NUM_ROWS: usize = 4; let df = table_with_nested_types(NUM_ROWS).await?; let results = df.collect().await?; - let expected = ["+----------+------------------------------------------------+--------------------+", - "| shape_id | points | tags |", - "+----------+------------------------------------------------+--------------------+", - "| 1 | [{x: -3, y: -4}, {x: -3, y: 6}, {x: 2, y: -2}] | [tag1] |", - "| 2 | | [tag1, tag2] |", - "| 3 | [{x: -9, y: 2}, {x: -10, y: -4}] | |", - "| 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | [tag1, tag2, tag3] |", - "+----------+------------------------------------------------+--------------------+"]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+------------------------------------------------+--------------------+ + | shape_id | points | tags | + +----------+------------------------------------------------+--------------------+ + | 1 | [{x: -3, y: -4}, {x: -3, y: 6}, {x: 2, y: -2}] | [tag1] | + | 2 | | [tag1, tag2] | + | 3 | [{x: -9, y: 2}, {x: -10, y: -4}] | | + | 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | [tag1, tag2, tag3] | + +----------+------------------------------------------------+--------------------+ + "### + ); // Unnest tags let df = table_with_nested_types(NUM_ROWS).await?; let results = df.unnest_columns(&["tags"])?.collect().await?; - let expected = [ - "+----------+------------------------------------------------+------+", - "| shape_id | points | tags |", - "+----------+------------------------------------------------+------+", - "| 1 | [{x: -3, y: -4}, {x: -3, y: 6}, {x: 2, y: -2}] | tag1 |", - "| 2 | | tag1 |", - "| 2 | | tag2 |", - "| 3 | [{x: -9, y: 2}, {x: -10, y: -4}] | |", - "| 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag1 |", - "| 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag2 |", - "| 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag3 |", - "+----------+------------------------------------------------+------+", - ]; - 
assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+------------------------------------------------+------+ + | shape_id | points | tags | + +----------+------------------------------------------------+------+ + | 1 | [{x: -3, y: -4}, {x: -3, y: 6}, {x: 2, y: -2}] | tag1 | + | 2 | | tag1 | + | 2 | | tag2 | + | 3 | [{x: -9, y: 2}, {x: -10, y: -4}] | | + | 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag1 | + | 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag2 | + | 4 | [{x: -3, y: 5}, {x: 2, y: -1}] | tag3 | + +----------+------------------------------------------------+------+ + "### + ); // Test aggregate results for tags. let df = table_with_nested_types(NUM_ROWS).await?; @@ -3560,21 +3599,23 @@ async fn unnest_columns() -> Result<()> { // Unnest points let df = table_with_nested_types(NUM_ROWS).await?; let results = df.unnest_columns(&["points"])?.collect().await?; - let expected = [ - "+----------+-----------------+--------------------+", - "| shape_id | points | tags |", - "+----------+-----------------+--------------------+", - "| 1 | {x: -3, y: -4} | [tag1] |", - "| 1 | {x: -3, y: 6} | [tag1] |", - "| 1 | {x: 2, y: -2} | [tag1] |", - "| 2 | | [tag1, tag2] |", - "| 3 | {x: -10, y: -4} | |", - "| 3 | {x: -9, y: 2} | |", - "| 4 | {x: -3, y: 5} | [tag1, tag2, tag3] |", - "| 4 | {x: 2, y: -1} | [tag1, tag2, tag3] |", - "+----------+-----------------+--------------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+-----------------+--------------------+ + | shape_id | points | tags | + +----------+-----------------+--------------------+ + | 1 | {x: -3, y: -4} | [tag1] | + | 1 | {x: -3, y: 6} | [tag1] | + | 1 | {x: 2, y: -2} | [tag1] | + | 2 | | [tag1, tag2] | + | 3 | {x: -10, y: -4} | | + | 3 | {x: -9, y: 2} | | + | 4 | {x: -3, y: 5} | [tag1, tag2, tag3] | + | 4 | {x: 2, y: -1} | [tag1, tag2, tag3] | + +----------+-----------------+--------------------+ + "### + ); // Test aggregate results for points. let df = table_with_nested_types(NUM_ROWS).await?; @@ -3588,26 +3629,28 @@ async fn unnest_columns() -> Result<()> { .unnest_columns(&["tags"])? .collect() .await?; - let expected = vec![ - "+----------+-----------------+------+", - "| shape_id | points | tags |", - "+----------+-----------------+------+", - "| 1 | {x: -3, y: -4} | tag1 |", - "| 1 | {x: -3, y: 6} | tag1 |", - "| 1 | {x: 2, y: -2} | tag1 |", - "| 2 | | tag1 |", - "| 2 | | tag2 |", - "| 3 | {x: -10, y: -4} | |", - "| 3 | {x: -9, y: 2} | |", - "| 4 | {x: -3, y: 5} | tag1 |", - "| 4 | {x: -3, y: 5} | tag2 |", - "| 4 | {x: -3, y: 5} | tag3 |", - "| 4 | {x: 2, y: -1} | tag1 |", - "| 4 | {x: 2, y: -1} | tag2 |", - "| 4 | {x: 2, y: -1} | tag3 |", - "+----------+-----------------+------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+-----------------+------+ + | shape_id | points | tags | + +----------+-----------------+------+ + | 1 | {x: -3, y: -4} | tag1 | + | 1 | {x: -3, y: 6} | tag1 | + | 1 | {x: 2, y: -2} | tag1 | + | 2 | | tag1 | + | 2 | | tag2 | + | 3 | {x: -10, y: -4} | | + | 3 | {x: -9, y: 2} | | + | 4 | {x: -3, y: 5} | tag1 | + | 4 | {x: -3, y: 5} | tag2 | + | 4 | {x: -3, y: 5} | tag3 | + | 4 | {x: 2, y: -1} | tag1 | + | 4 | {x: 2, y: -1} | tag2 | + | 4 | {x: 2, y: -1} | tag3 | + +----------+-----------------+------+ + "### + ); // Test aggregate results for points and tags. 
let df = table_with_nested_types(NUM_ROWS).await?; @@ -3761,19 +3804,21 @@ async fn unnest_fixed_list() -> Result<()> { let df = ctx.table("shapes").await?; let results = df.clone().collect().await?; - let expected = [ - "+----------+----------------+", - "| shape_id | tags |", - "+----------+----------------+", - "| 1 | |", - "| 2 | [tag21, tag22] |", - "| 3 | [tag31, tag32] |", - "| 4 | |", - "| 5 | [tag51, tag52] |", - "| 6 | [tag61, tag62] |", - "+----------+----------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+----------------+ + | shape_id | tags | + +----------+----------------+ + | 1 | | + | 2 | [tag21, tag22] | + | 3 | [tag31, tag32] | + | 4 | | + | 5 | [tag51, tag52] | + | 6 | [tag61, tag62] | + +----------+----------------+ + "### + ); let options = UnnestOptions::new().with_preserve_nulls(true); @@ -3781,23 +3826,25 @@ async fn unnest_fixed_list() -> Result<()> { .unnest_columns_with_options(&["tags"], options)? .collect() .await?; - let expected = vec![ - "+----------+-------+", - "| shape_id | tags |", - "+----------+-------+", - "| 1 | |", - "| 2 | tag21 |", - "| 2 | tag22 |", - "| 3 | tag31 |", - "| 3 | tag32 |", - "| 4 | |", - "| 5 | tag51 |", - "| 5 | tag52 |", - "| 6 | tag61 |", - "| 6 | tag62 |", - "+----------+-------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+-------+ + | shape_id | tags | + +----------+-------+ + | 1 | | + | 2 | tag21 | + | 2 | tag22 | + | 3 | tag31 | + | 3 | tag32 | + | 4 | | + | 5 | tag51 | + | 5 | tag52 | + | 6 | tag61 | + | 6 | tag62 | + +----------+-------+ + "### + ); Ok(()) } @@ -3811,19 +3858,21 @@ async fn unnest_fixed_list_drop_nulls() -> Result<()> { let df = ctx.table("shapes").await?; let results = df.clone().collect().await?; - let expected = [ - "+----------+----------------+", - "| shape_id | tags |", - "+----------+----------------+", - "| 1 | |", - "| 2 | [tag21, tag22] |", - "| 3 | [tag31, tag32] |", - "| 4 | |", - "| 5 | [tag51, tag52] |", - "| 6 | [tag61, tag62] |", - "+----------+----------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+----------------+ + | shape_id | tags | + +----------+----------------+ + | 1 | | + | 2 | [tag21, tag22] | + | 3 | [tag31, tag32] | + | 4 | | + | 5 | [tag51, tag52] | + | 6 | [tag61, tag62] | + +----------+----------------+ + "### + ); let options = UnnestOptions::new().with_preserve_nulls(false); @@ -3831,21 +3880,23 @@ async fn unnest_fixed_list_drop_nulls() -> Result<()> { .unnest_columns_with_options(&["tags"], options)? 
.collect() .await?; - let expected = [ - "+----------+-------+", - "| shape_id | tags |", - "+----------+-------+", - "| 2 | tag21 |", - "| 2 | tag22 |", - "| 3 | tag31 |", - "| 3 | tag32 |", - "| 5 | tag51 |", - "| 5 | tag52 |", - "| 6 | tag61 |", - "| 6 | tag62 |", - "+----------+-------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+-------+ + | shape_id | tags | + +----------+-------+ + | 2 | tag21 | + | 2 | tag22 | + | 3 | tag31 | + | 3 | tag32 | + | 5 | tag51 | + | 5 | tag52 | + | 6 | tag61 | + | 6 | tag62 | + +----------+-------+ + "### + ); Ok(()) } @@ -3878,44 +3929,48 @@ async fn unnest_fixed_list_non_null() -> Result<()> { let df = ctx.table("shapes").await?; let results = df.clone().collect().await?; - let expected = [ - "+----------+----------------+", - "| shape_id | tags |", - "+----------+----------------+", - "| 1 | [tag11, tag12] |", - "| 2 | [tag21, tag22] |", - "| 3 | [tag31, tag32] |", - "| 4 | [tag41, tag42] |", - "| 5 | [tag51, tag52] |", - "| 6 | [tag61, tag62] |", - "+----------+----------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+----------------+ + | shape_id | tags | + +----------+----------------+ + | 1 | [tag11, tag12] | + | 2 | [tag21, tag22] | + | 3 | [tag31, tag32] | + | 4 | [tag41, tag42] | + | 5 | [tag51, tag52] | + | 6 | [tag61, tag62] | + +----------+----------------+ + "### + ); let options = UnnestOptions::new().with_preserve_nulls(true); let results = df .unnest_columns_with_options(&["tags"], options)? .collect() .await?; - let expected = vec![ - "+----------+-------+", - "| shape_id | tags |", - "+----------+-------+", - "| 1 | tag11 |", - "| 1 | tag12 |", - "| 2 | tag21 |", - "| 2 | tag22 |", - "| 3 | tag31 |", - "| 3 | tag32 |", - "| 4 | tag41 |", - "| 4 | tag42 |", - "| 5 | tag51 |", - "| 5 | tag52 |", - "| 6 | tag61 |", - "| 6 | tag62 |", - "+----------+-------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+-------+ + | shape_id | tags | + +----------+-------+ + | 1 | tag11 | + | 1 | tag12 | + | 2 | tag21 | + | 2 | tag22 | + | 3 | tag31 | + | 3 | tag32 | + | 4 | tag41 | + | 4 | tag42 | + | 5 | tag51 | + | 5 | tag52 | + | 6 | tag61 | + | 6 | tag62 | + +----------+-------+ + "### + ); Ok(()) } @@ -3926,18 +3981,20 @@ async fn unnest_aggregate_columns() -> Result<()> { let df = table_with_nested_types(NUM_ROWS).await?; let results = df.select_columns(&["tags"])?.collect().await?; - let expected = [ - r#"+--------------------+"#, - r#"| tags |"#, - r#"+--------------------+"#, - r#"| [tag1] |"#, - r#"| [tag1, tag2] |"#, - r#"| |"#, - r#"| [tag1, tag2, tag3] |"#, - r#"| [tag1, tag2, tag3] |"#, - r#"+--------------------+"#, - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +--------------------+ + | tags | + +--------------------+ + | | + | [tag1, tag2, tag3] | + | [tag1, tag2, tag3] | + | [tag1, tag2] | + | [tag1] | + +--------------------+ + "### + ); let df = table_with_nested_types(NUM_ROWS).await?; let results = df @@ -3945,14 +4002,16 @@ async fn unnest_aggregate_columns() -> Result<()> { .aggregate(vec![], vec![count(col("tags"))])? 
.collect() .await?; - let expected = [ - r#"+-------------+"#, - r#"| count(tags) |"#, - r#"+-------------+"#, - r#"| 9 |"#, - r#"+-------------+"#, - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +-------------+ + | count(tags) | + +-------------+ + | 9 | + +-------------+ + "### + ); Ok(()) } @@ -4022,22 +4081,24 @@ async fn unnest_array_agg() -> Result<()> { assert!(rb.num_rows() > 0); } - let expected = vec![ - "+----------+--------+", - "| shape_id | tag_id |", - "+----------+--------+", - "| 1 | 11 |", - "| 1 | 12 |", - "| 1 | 13 |", - "| 2 | 21 |", - "| 2 | 22 |", - "| 2 | 23 |", - "| 3 | 31 |", - "| 3 | 32 |", - "| 3 | 33 |", - "+----------+--------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+--------+ + | shape_id | tag_id | + +----------+--------+ + | 1 | 11 | + | 1 | 12 | + | 1 | 13 | + | 2 | 21 | + | 2 | 22 | + | 2 | 23 | + | 3 | 31 | + | 3 | 32 | + | 3 | 33 | + +----------+--------+ + "### + ); // Doing an `array_agg` by `shape_id` produces: let results = df @@ -4048,16 +4109,18 @@ async fn unnest_array_agg() -> Result<()> { )? .collect() .await?; - let expected = [ - "+----------+--------------+", - "| shape_id | tag_id |", - "+----------+--------------+", - "| 1 | [11, 12, 13] |", - "| 2 | [21, 22, 23] |", - "| 3 | [31, 32, 33] |", - "+----------+--------------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+--------------+ + | shape_id | tag_id | + +----------+--------------+ + | 1 | [11, 12, 13] | + | 2 | [21, 22, 23] | + | 3 | [31, 32, 33] | + +----------+--------------+ + "### + ); // Unnesting again should produce the original batch. let results = ctx @@ -4070,22 +4133,24 @@ async fn unnest_array_agg() -> Result<()> { .unnest_columns(&["tag_id"])? 
.collect() .await?; - let expected = vec![ - "+----------+--------+", - "| shape_id | tag_id |", - "+----------+--------+", - "| 1 | 11 |", - "| 1 | 12 |", - "| 1 | 13 |", - "| 2 | 21 |", - "| 2 | 22 |", - "| 2 | 23 |", - "| 3 | 31 |", - "| 3 | 32 |", - "| 3 | 33 |", - "+----------+--------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+--------+ + | shape_id | tag_id | + +----------+--------+ + | 1 | 11 | + | 1 | 12 | + | 1 | 13 | + | 2 | 21 | + | 2 | 22 | + | 2 | 23 | + | 3 | 31 | + | 3 | 32 | + | 3 | 33 | + +----------+--------+ + "### + ); Ok(()) } @@ -4112,22 +4177,24 @@ async fn unnest_with_redundant_columns() -> Result<()> { let df = ctx.table("shapes").await?; let results = df.clone().collect().await?; - let expected = vec![ - "+----------+--------+", - "| shape_id | tag_id |", - "+----------+--------+", - "| 1 | 11 |", - "| 1 | 12 |", - "| 1 | 13 |", - "| 2 | 21 |", - "| 2 | 22 |", - "| 2 | 23 |", - "| 3 | 31 |", - "| 3 | 32 |", - "| 3 | 33 |", - "+----------+--------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+--------+ + | shape_id | tag_id | + +----------+--------+ + | 1 | 11 | + | 1 | 12 | + | 1 | 13 | + | 2 | 21 | + | 2 | 22 | + | 2 | 23 | + | 3 | 31 | + | 3 | 32 | + | 3 | 33 | + +----------+--------+ + "### + ); // Doing an `array_agg` by `shape_id` produces: let df = df @@ -4155,22 +4222,24 @@ async fn unnest_with_redundant_columns() -> Result<()> { ); let results = df.collect().await?; - let expected = [ - "+----------+", - "| shape_id |", - "+----------+", - "| 1 |", - "| 1 |", - "| 1 |", - "| 2 |", - "| 2 |", - "| 2 |", - "| 3 |", - "| 3 |", - "| 3 |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----------+ + | shape_id | + +----------+ + | 1 | + | 1 | + | 1 | + | 2 | + | 2 | + | 2 | + | 3 | + | 3 | + | 3 | + +----------+ + "### + ); Ok(()) } @@ -4333,22 +4402,24 @@ async fn test_read_batches() -> Result<()> { ]; let df = ctx.read_batches(batches).unwrap(); df.clone().show().await.unwrap(); - let result = df.collect().await?; - let expected = [ - "+----+--------+", - "| id | number |", - "+----+--------+", - "| 1 | 1.12 |", - "| 2 | 3.4 |", - "| 3 | 2.33 |", - "| 4 | 9.1 |", - "| 5 | 6.66 |", - "| 3 | 1.11 |", - "| 4 | 2.22 |", - "| 5 | 3.33 |", - "+----+--------+", - ]; - assert_batches_sorted_eq!(expected, &result); + let results = df.collect().await?; + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----+--------+ + | id | number | + +----+--------+ + | 1 | 1.12 | + | 2 | 3.4 | + | 3 | 1.11 | + | 3 | 2.33 | + | 4 | 2.22 | + | 4 | 9.1 | + | 5 | 3.33 | + | 5 | 6.66 | + +----+--------+ + "### + ); Ok(()) } #[tokio::test] @@ -4365,9 +4436,14 @@ async fn test_read_batches_empty() -> Result<()> { let batches = vec![]; let df = ctx.read_batches(batches).unwrap(); df.clone().show().await.unwrap(); - let result = df.collect().await?; - let expected = ["++", "++"]; - assert_batches_sorted_eq!(expected, &result); + let results = df.collect().await?; + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + ++ + ++ + "### + ); Ok(()) } @@ -4410,15 +4486,17 @@ async fn consecutive_projection_same_schema() -> Result<()> { .unwrap(); let results = df.collect().await?; - let expected = [ - "+----+----+----+", - "| id | t | t2 |", - "+----+----+----+", - "| 0 | | |", - "| 1 | 10 | 10 
|", - "+----+----+----+", - ]; - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +----+----+----+ + | id | t | t2 | + +----+----+----+ + | 0 | | | + | 1 | 10 | 10 | + +----+----+----+ + "### + ); Ok(()) } @@ -5024,16 +5102,17 @@ async fn write_partitioned_parquet_results() -> Result<()> { // Check that the df has the entire set of data let results = df.collect().await?; - let expected = [ - "+-----+-----+", - "| c1 | c2 |", - "+-----+-----+", - "| abc | 123 |", - "| def | 456 |", - "+-----+-----+", - ]; - - assert_batches_sorted_eq!(expected, &results); + assert_snapshot!( + batches_to_sort_string(&results), + @r###" + +-----+-----+ + | c1 | c2 | + +-----+-----+ + | abc | 123 | + | def | 456 | + +-----+-----+ + "### + ); Ok(()) } @@ -5153,45 +5232,51 @@ async fn sparse_union_is_null() { let df = ctx.table("union_batch").await.unwrap(); // view_all - let expected = [ - "+----------+", - "| my_union |", - "+----------+", - "| {A=1} |", - "| {A=} |", - "| {B=3.2} |", - "| {B=} |", - "| {C=a} |", - "| {C=} |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &df.clone().collect().await.unwrap()); + assert_snapshot!( + batches_to_sort_string(&df.clone().collect().await.unwrap()), + @r###" + +----------+ + | my_union | + +----------+ + | {A=1} | + | {A=} | + | {B=3.2} | + | {B=} | + | {C=a} | + | {C=} | + +----------+ + "### + ); // filter where is null let result_df = df.clone().filter(col("my_union").is_null()).unwrap(); - let expected = [ - "+----------+", - "| my_union |", - "+----------+", - "| {A=} |", - "| {B=} |", - "| {C=} |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &result_df.collect().await.unwrap()); + assert_snapshot!( + batches_to_sort_string(&result_df.collect().await.unwrap()), + @r###" + +----------+ + | my_union | + +----------+ + | {A=} | + | {B=} | + | {C=} | + +----------+ + "### + ); // filter where is not null let result_df = df.filter(col("my_union").is_not_null()).unwrap(); - let expected = [ - "+----------+", - "| my_union |", - "+----------+", - "| {A=1} |", - "| {B=3.2} |", - "| {C=a} |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &result_df.collect().await.unwrap()); + assert_snapshot!( + batches_to_sort_string(&result_df.collect().await.unwrap()), + @r###" + +----------+ + | my_union | + +----------+ + | {A=1} | + | {B=3.2} | + | {C=a} | + +----------+ + "### + ); } #[tokio::test] @@ -5230,45 +5315,51 @@ async fn dense_union_is_null() { let df = ctx.table("union_batch").await.unwrap(); // view_all - let expected = [ - "+----------+", - "| my_union |", - "+----------+", - "| {A=1} |", - "| {A=} |", - "| {B=3.2} |", - "| {B=} |", - "| {C=a} |", - "| {C=} |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &df.clone().collect().await.unwrap()); + assert_snapshot!( + batches_to_sort_string(&df.clone().collect().await.unwrap()), + @r###" + +----------+ + | my_union | + +----------+ + | {A=1} | + | {A=} | + | {B=3.2} | + | {B=} | + | {C=a} | + | {C=} | + +----------+ + "### + ); // filter where is null let result_df = df.clone().filter(col("my_union").is_null()).unwrap(); - let expected = [ - "+----------+", - "| my_union |", - "+----------+", - "| {A=} |", - "| {B=} |", - "| {C=} |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &result_df.collect().await.unwrap()); + assert_snapshot!( + batches_to_sort_string(&result_df.collect().await.unwrap()), + @r###" + +----------+ + | my_union | + +----------+ + | {A=} | + | {B=} | + | 
{C=} | + +----------+ + "### + ); // filter where is not null let result_df = df.filter(col("my_union").is_not_null()).unwrap(); - let expected = [ - "+----------+", - "| my_union |", - "+----------+", - "| {A=1} |", - "| {B=3.2} |", - "| {C=a} |", - "+----------+", - ]; - assert_batches_sorted_eq!(expected, &result_df.collect().await.unwrap()); + assert_snapshot!( + batches_to_sort_string(&result_df.collect().await.unwrap()), + @r###" + +----------+ + | my_union | + +----------+ + | {A=1} | + | {B=3.2} | + | {C=a} | + +----------+ + "### + ); } #[tokio::test] @@ -5405,17 +5496,19 @@ async fn test_alias() -> Result<()> { col("table_alias.a"), col("b") + col("table_alias.one"), ])?; - let expected = [ - "+-----------+---------------------------------+", - "| a | table_alias.b + table_alias.one |", - "+-----------+---------------------------------+", - "| abcDEF | 2 |", - "| abc123 | 11 |", - "| CBAdef | 11 |", - "| 123AbcDef | 101 |", - "+-----------+---------------------------------+", - ]; - assert_batches_sorted_eq!(expected, &df.collect().await?); + assert_snapshot!( + batches_to_sort_string(&df.collect().await.unwrap()), + @r###" + +-----------+---------------------------------+ + | a | table_alias.b + table_alias.one | + +-----------+---------------------------------+ + | 123AbcDef | 101 | + | CBAdef | 11 | + | abc123 | 11 | + | abcDEF | 2 | + +-----------+---------------------------------+ + "### + ); Ok(()) } @@ -5426,17 +5519,19 @@ async fn test_alias_self_join() -> Result<()> { let left = create_test_table("t1").await?; let right = left.clone().alias("t2")?; let joined = left.join(right, JoinType::Full, &["a"], &["a"], None)?; - let expected = [ - "+-----------+-----+-----------+-----+", - "| a | b | a | b |", - "+-----------+-----+-----------+-----+", - "| abcDEF | 1 | abcDEF | 1 |", - "| abc123 | 10 | abc123 | 10 |", - "| CBAdef | 10 | CBAdef | 10 |", - "| 123AbcDef | 100 | 123AbcDef | 100 |", - "+-----------+-----+-----------+-----+", - ]; - assert_batches_sorted_eq!(expected, &joined.collect().await?); + assert_snapshot!( + batches_to_sort_string(&joined.collect().await.unwrap()), + @r###" + +-----------+-----+-----------+-----+ + | a | b | a | b | + +-----------+-----+-----------+-----+ + | 123AbcDef | 100 | 123AbcDef | 100 | + | CBAdef | 10 | CBAdef | 10 | + | abc123 | 10 | abc123 | 10 | + | abcDEF | 1 | abcDEF | 1 | + +-----------+-----+-----------+-----+ + "### + ); Ok(()) } @@ -5451,20 +5546,21 @@ async fn test_alias_empty() -> Result<()> { .display_indent_schema() .to_string(); assert_eq!(plan, expected); - let expected = [ - "+-----------+-----+", - "| a | b |", - "+-----------+-----+", - "| abcDEF | 1 |", - "| abc123 | 10 |", - "| CBAdef | 10 |", - "| 123AbcDef | 100 |", - "+-----------+-----+", - ]; - assert_batches_sorted_eq!( - expected, - &df.select(vec![col("a"), col("b")])?.collect().await? 
+
+    assert_snapshot!(
+        batches_to_sort_string(&df.select(vec![col("a"), col("b")])?.collect().await.unwrap()),
+        @r###"
+    +-----------+-----+
+    | a         | b   |
+    +-----------+-----+
+    | 123AbcDef | 100 |
+    | CBAdef    | 10  |
+    | abc123    | 10  |
+    | abcDEF    | 1   |
+    +-----------+-----+
+    "###
     );
+
     Ok(())
 }
 
@@ -5490,17 +5586,20 @@ async fn test_alias_nested() -> Result<()> {
     let select1 = df
         .clone()
         .select(vec![col("alias2.a"), col("b") + col("alias2.one")])?;
-    let expected = [
-        "+-----------+-----------------------+",
-        "| a         | alias2.b + alias2.one |",
-        "+-----------+-----------------------+",
-        "| 123AbcDef | 101                   |",
-        "| CBAdef    | 11                    |",
-        "| abc123    | 11                    |",
-        "| abcDEF    | 2                     |",
-        "+-----------+-----------------------+",
-    ];
-    assert_batches_sorted_eq!(expected, &select1.collect().await?);
+
+    assert_snapshot!(
+        batches_to_sort_string(&select1.collect().await.unwrap()),
+        @r###"
+    +-----------+-----------------------+
+    | a         | alias2.b + alias2.one |
+    +-----------+-----------------------+
+    | 123AbcDef | 101                   |
+    | CBAdef    | 11                    |
+    | abc123    | 11                    |
+    | abcDEF    | 2                     |
+    +-----------+-----------------------+
+    "###
+    );
 
     // Only the outermost alias is visible
     let select2 = df.select(vec![col("alias1.a")]);
@@ -5665,16 +5764,19 @@ async fn test_fill_null() -> Result<()> {
     )?;
 
     let results = df_filled.collect().await?;
-    let expected = [
-        "+---+---------+",
-        "| a | b       |",
-        "+---+---------+",
-        "| 1 | x       |",
-        "| 0 | default |",
-        "| 3 | z       |",
-        "+---+---------+",
-    ];
-    assert_batches_sorted_eq!(expected, &results);
+    assert_snapshot!(
+        batches_to_sort_string(&results),
+        @r###"
+    +---+---------+
+    | a | b       |
+    +---+---------+
+    | 0 | default |
+    | 1 | x       |
+    | 3 | z       |
+    +---+---------+
+    "###
+    );
+
     Ok(())
 }
 
@@ -5690,32 +5792,35 @@ async fn test_fill_null_all_columns() -> Result<()> {
 
     let results = df_filled.clone().collect().await?;
 
-    let expected = [
-        "+---+---------+",
-        "| a | b       |",
-        "+---+---------+",
-        "| 1 | x       |",
-        "|   | default |",
-        "| 3 | z       |",
-        "+---+---------+",
-    ];
-
-    assert_batches_sorted_eq!(expected, &results);
+    assert_snapshot!(
+        batches_to_sort_string(&results),
+        @r###"
+    +---+---------+
+    | a | b       |
+    +---+---------+
+    |   | default |
+    | 1 | x       |
+    | 3 | z       |
+    +---+---------+
+    "###
+    );
 
     // Fill column "a" null values with a value that can be cast to Int32.
     let df_filled = df_filled.fill_null(ScalarValue::Int32(Some(0)), vec![])?;
 
     let results = df_filled.collect().await?;
-    let expected = [
-        "+---+---------+",
-        "| a | b       |",
-        "+---+---------+",
-        "| 1 | x       |",
-        "| 0 | default |",
-        "| 3 | z       |",
-        "+---+---------+",
-    ];
-    assert_batches_sorted_eq!(expected, &results);
+    assert_snapshot!(
+        batches_to_sort_string(&results),
+        @r###"
+    +---+---------+
+    | a | b       |
+    +---+---------+
+    | 0 | default |
+    | 1 | x       |
+    | 3 | z       |
+    +---+---------+
+    "###
+    );
     Ok(())
 }
 

From 1b0c39834a3dc8b7aec291fb7e524c086a5e50eb Mon Sep 17 00:00:00 2001
From: Dmitrii Blaginin
Date: Tue, 11 Mar 2025 21:24:22 +0000
Subject: [PATCH 5/6] Switch from `assert_eq`

---
 datafusion/core/tests/dataframe/mod.rs | 862 ++++++++++++------------
 1 file changed, 429 insertions(+), 433 deletions(-)

diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
index 5a30f713047f..673966ede878 100644
--- a/datafusion/core/tests/dataframe/mod.rs
+++ b/datafusion/core/tests/dataframe/mod.rs
@@ -856,9 +856,7 @@ async fn test_aggregate_name_collision() -> Result<()> {
         // The select expr has the same display_name as the group_expr,
         // but since they are different expressions, it should fail.
         
.expect_err("Expected error"); - let expected = "Schema error: No field named aggregate_test_100.c2. \ - Valid fields are \"aggregate_test_100.c2 + aggregate_test_100.c3\"."; - assert_eq!(df.strip_backtrace(), expected); + assert_snapshot!(df.strip_backtrace(), @r###"Schema error: No field named aggregate_test_100.c2. Valid fields are "aggregate_test_100.c2 + aggregate_test_100.c3"."###); Ok(()) } @@ -1026,7 +1024,7 @@ async fn test_distinct_sort_by_unprojected() -> Result<()> { // try to sort on some value not present in input to distinct .sort(vec![col("c2").sort(true, true)]) .unwrap_err(); - assert_eq!(err.strip_backtrace(), "Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list"); + assert_snapshot!(err.strip_backtrace(), @"Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list"); Ok(()) } @@ -1113,7 +1111,7 @@ async fn test_distinct_on_sort_by_unprojected() -> Result<()> { // try to sort on some value not present in input to distinct .sort(vec![col("c2").sort(true, true)]) .unwrap_err(); - assert_eq!(err.strip_backtrace(), "Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list"); + assert_snapshot!(err.strip_backtrace(), @"Error during planning: For SELECT DISTINCT, ORDER BY expressions c2 must appear in select list"); Ok(()) } @@ -1181,12 +1179,13 @@ async fn join_on() -> Result<()> { [col("a.c1").not_eq(col("b.c1")), col("a.c2").eq(col("b.c2"))], )?; - let expected_plan = "Inner Join: Filter: a.c1 != b.c1 AND a.c2 = b.c2\ - \n Projection: a.c1, a.c2\ - \n TableScan: a\ - \n Projection: b.c1, b.c2\ - \n TableScan: b"; - assert_eq!(expected_plan, format!("{}", join.logical_plan())); + assert_snapshot!(join.logical_plan(), @r###" + Inner Join: Filter: a.c1 != b.c1 AND a.c2 = b.c2 + Projection: a.c1, a.c2 + TableScan: a + Projection: b.c1, b.c2 + TableScan: b + "###); Ok(()) } @@ -1202,15 +1201,14 @@ async fn join_on_filter_datatype() -> Result<()> { JoinType::Inner, Some(Expr::Literal(ScalarValue::Null)), )?; - let expected_plan = "EmptyRelation"; - assert_eq!(expected_plan, format!("{}", join.into_optimized_plan()?)); + assert_snapshot!(join.into_optimized_plan().unwrap(), @"EmptyRelation"); // JOIN ON expression must be boolean type let join = left.join_on(right, JoinType::Inner, Some(lit("TRUE")))?; - let expected = join.into_optimized_plan().unwrap_err(); - assert_eq!( - expected.strip_backtrace(), - "type_coercion\ncaused by\nError during planning: Join condition must be boolean type, but got Utf8" + let err = join.into_optimized_plan().unwrap_err(); + assert_snapshot!( + err.strip_backtrace(), + @"type_coercion\ncaused by\nError during planning: Join condition must be boolean type, but got Utf8" ); Ok(()) } @@ -1227,8 +1225,7 @@ async fn join_ambiguous_filter() -> Result<()> { let join = left .join_on(right, JoinType::Inner, [col("c1").eq(col("c1"))]) .expect_err("join didn't fail check"); - let expected = "Schema error: Ambiguous reference to unqualified field c1"; - assert_eq!(join.strip_backtrace(), expected); + assert_snapshot!(join.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field c1"); Ok(()) } @@ -1572,27 +1569,29 @@ async fn with_column_join_same_columns() -> Result<()> { let df_with_column = df.clone().with_column("new_column", lit(true))?; - assert_eq!( - "\ - Projection: t1.c1, t2.c1, Boolean(true) AS new_column\ - \n Limit: skip=0, fetch=1\ - \n Sort: t1.c1 ASC NULLS FIRST\ - \n Inner Join: t1.c1 = t2.c1\ - \n TableScan: 
t1\ - \n TableScan: t2", - format!("{}", df_with_column.logical_plan()) + assert_snapshot!( + df_with_column.logical_plan(), + @r###" + Projection: t1.c1, t2.c1, Boolean(true) AS new_column + Limit: skip=0, fetch=1 + Sort: t1.c1 ASC NULLS FIRST + Inner Join: t1.c1 = t2.c1 + TableScan: t1 + TableScan: t2 + "### ); - assert_eq!( - "\ - Projection: t1.c1, t2.c1, Boolean(true) AS new_column\ - \n Sort: t1.c1 ASC NULLS FIRST, fetch=1\ - \n Inner Join: t1.c1 = t2.c1\ - \n SubqueryAlias: t1\ - \n TableScan: aggregate_test_100 projection=[c1]\ - \n SubqueryAlias: t2\ - \n TableScan: aggregate_test_100 projection=[c1]", - format!("{}", df_with_column.clone().into_optimized_plan()?) + assert_snapshot!( + df_with_column.clone().into_optimized_plan().unwrap(), + @r###" + Projection: t1.c1, t2.c1, Boolean(true) AS new_column + Sort: t1.c1 ASC NULLS FIRST, fetch=1 + Inner Join: t1.c1 = t2.c1 + SubqueryAlias: t1 + TableScan: aggregate_test_100 projection=[c1] + SubqueryAlias: t2 + TableScan: aggregate_test_100 projection=[c1] + "### ); let df_results = df_with_column.collect().await?; @@ -1689,8 +1688,7 @@ async fn with_column_renamed_ambiguous() -> Result<()> { // can be t1.c2 or t2.c2 .with_column_renamed("c2", "AAA") .unwrap_err(); - let expected_err = "Schema error: Ambiguous reference to unqualified field c2"; - assert_eq!(actual_err.strip_backtrace(), expected_err); + assert_snapshot!(actual_err.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field c2"); Ok(()) } @@ -1738,25 +1736,29 @@ async fn with_column_renamed_join() -> Result<()> { let df_renamed = df.clone().with_column_renamed("t1.c1", "AAA")?; - assert_eq!("\ - Projection: t1.c1 AS AAA, t1.c2, t1.c3, t2.c1, t2.c2, t2.c3\ - \n Limit: skip=0, fetch=1\ - \n Sort: t1.c1 ASC NULLS FIRST, t1.c2 ASC NULLS FIRST, t1.c3 ASC NULLS FIRST, t2.c1 ASC NULLS FIRST, t2.c2 ASC NULLS FIRST, t2.c3 ASC NULLS FIRST\ - \n Inner Join: t1.c1 = t2.c1\ - \n TableScan: t1\ - \n TableScan: t2", - format!("{}", df_renamed.logical_plan()) + assert_snapshot!( + df_renamed.logical_plan(), + @r###" + Projection: t1.c1 AS AAA, t1.c2, t1.c3, t2.c1, t2.c2, t2.c3 + Limit: skip=0, fetch=1 + Sort: t1.c1 ASC NULLS FIRST, t1.c2 ASC NULLS FIRST, t1.c3 ASC NULLS FIRST, t2.c1 ASC NULLS FIRST, t2.c2 ASC NULLS FIRST, t2.c3 ASC NULLS FIRST + Inner Join: t1.c1 = t2.c1 + TableScan: t1 + TableScan: t2 + "### ); - assert_eq!("\ - Projection: t1.c1 AS AAA, t1.c2, t1.c3, t2.c1, t2.c2, t2.c3\ - \n Sort: t1.c1 ASC NULLS FIRST, t1.c2 ASC NULLS FIRST, t1.c3 ASC NULLS FIRST, t2.c1 ASC NULLS FIRST, t2.c2 ASC NULLS FIRST, t2.c3 ASC NULLS FIRST, fetch=1\ - \n Inner Join: t1.c1 = t2.c1\ - \n SubqueryAlias: t1\ - \n TableScan: aggregate_test_100 projection=[c1, c2, c3]\ - \n SubqueryAlias: t2\ - \n TableScan: aggregate_test_100 projection=[c1, c2, c3]", - format!("{}", df_renamed.clone().into_optimized_plan()?) + assert_snapshot!( + df_renamed.clone().into_optimized_plan().unwrap(), + @r###" + Projection: t1.c1 AS AAA, t1.c2, t1.c3, t2.c1, t2.c2, t2.c3 + Sort: t1.c1 ASC NULLS FIRST, t1.c2 ASC NULLS FIRST, t1.c3 ASC NULLS FIRST, t2.c1 ASC NULLS FIRST, t2.c2 ASC NULLS FIRST, t2.c3 ASC NULLS FIRST, fetch=1 + Inner Join: t1.c1 = t2.c1 + SubqueryAlias: t1 + TableScan: aggregate_test_100 projection=[c1, c2, c3] + SubqueryAlias: t2 + TableScan: aggregate_test_100 projection=[c1, c2, c3] + "### ); let df_results = df_renamed.collect().await?; @@ -1944,9 +1946,9 @@ async fn cache_test() -> Result<()> { let cached_df = df.clone().cache().await?; - assert_eq!( - "TableScan: ?table? 
projection=[c2, c3, sum]", - format!("{}", cached_df.clone().into_optimized_plan()?) + assert_snapshot!( + cached_df.clone().into_optimized_plan().unwrap(), + @"TableScan: ?table? projection=[c2, c3, sum]" ); let df_results = df.collect().await?; @@ -2492,52 +2494,52 @@ async fn test_count_wildcard_on_sort() -> Result<()> { .collect() .await?; - let expected_sql_result = "+---------------+------------------------------------------------------------------------------------------------------------+\ - \n| plan_type | plan |\ - \n+---------------+------------------------------------------------------------------------------------------------------------+\ - \n| logical_plan | Projection: t1.b, count(*) |\ - \n| | Sort: count(Int64(1)) AS count(*) AS count(*) ASC NULLS LAST |\ - \n| | Projection: t1.b, count(Int64(1)) AS count(*), count(Int64(1)) |\ - \n| | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1))]] |\ - \n| | TableScan: t1 projection=[b] |\ - \n| physical_plan | ProjectionExec: expr=[b@0 as b, count(*)@1 as count(*)] |\ - \n| | SortPreservingMergeExec: [count(Int64(1))@2 ASC NULLS LAST] |\ - \n| | SortExec: expr=[count(Int64(1))@2 ASC NULLS LAST], preserve_partitioning=[true] |\ - \n| | ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*), count(Int64(1))@1 as count(Int64(1))] |\ - \n| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))] |\ - \n| | CoalesceBatchesExec: target_batch_size=8192 |\ - \n| | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 |\ - \n| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |\ - \n| | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))] |\ - \n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ - \n| | |\ - \n+---------------+------------------------------------------------------------------------------------------------------------+"; - - assert_eq!( - expected_sql_result, - pretty_format_batches(&sql_results)?.to_string() - ); - - let expected_df_result = "+---------------+--------------------------------------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+--------------------------------------------------------------------------------+\ -\n| logical_plan | Sort: count(*) ASC NULLS LAST |\ -\n| | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1)) AS count(*)]] |\ -\n| | TableScan: t1 projection=[b] |\ -\n| physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST] |\ -\n| | SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] |\ -\n| | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(*)] |\ -\n| | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 |\ -\n| | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 |\ -\n| | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(*)] |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | |\ -\n+---------------+--------------------------------------------------------------------------------+"; + assert_snapshot!( + pretty_format_batches(&sql_results).unwrap(), + @r###" + +---------------+------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: t1.b, count(*) | + | | Sort: count(Int64(1)) AS 
count(*) AS count(*) ASC NULLS LAST | + | | Projection: t1.b, count(Int64(1)) AS count(*), count(Int64(1)) | + | | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1))]] | + | | TableScan: t1 projection=[b] | + | physical_plan | ProjectionExec: expr=[b@0 as b, count(*)@1 as count(*)] | + | | SortPreservingMergeExec: [count(Int64(1))@2 ASC NULLS LAST] | + | | SortExec: expr=[count(Int64(1))@2 ASC NULLS LAST], preserve_partitioning=[true] | + | | ProjectionExec: expr=[b@0 as b, count(Int64(1))@1 as count(*), count(Int64(1))@1 as count(Int64(1))] | + | | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(Int64(1))] | + | | CoalesceBatchesExec: target_batch_size=8192 | + | | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 | + | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(Int64(1))] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+------------------------------------------------------------------------------------------------------------+ + "### + ); - assert_eq!( - expected_df_result, - pretty_format_batches(&df_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&df_results).unwrap(), + @r###" + +---------------+--------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+--------------------------------------------------------------------------------+ + | logical_plan | Sort: count(*) ASC NULLS LAST | + | | Aggregate: groupBy=[[t1.b]], aggr=[[count(Int64(1)) AS count(*)]] | + | | TableScan: t1 projection=[b] | + | physical_plan | SortPreservingMergeExec: [count(*)@1 ASC NULLS LAST] | + | | SortExec: expr=[count(*)@1 ASC NULLS LAST], preserve_partitioning=[true] | + | | AggregateExec: mode=FinalPartitioned, gby=[b@0 as b], aggr=[count(*)] | + | | CoalesceBatchesExec: target_batch_size=8192 | + | | RepartitionExec: partitioning=Hash([b@0], 4), input_partitions=4 | + | | RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | + | | AggregateExec: mode=Partial, gby=[b@0 as b], aggr=[count(*)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+--------------------------------------------------------------------------------+ + "### ); Ok(()) } @@ -2552,27 +2554,27 @@ async fn test_count_wildcard_on_where_in() -> Result<()> { .collect() .await?; - let expected_sql_result = "+---------------+------------------------------------------------------------------------------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+------------------------------------------------------------------------------------------------------------------------+\ -\n| logical_plan | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*) |\ -\n| | TableScan: t1 projection=[a, b] |\ -\n| | SubqueryAlias: __correlated_sq_1 |\ -\n| | Projection: count(Int64(1)) AS count(*) |\ -\n| | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] |\ -\n| | TableScan: t2 projection=[] |\ -\n| physical_plan | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] |\ -\n| | ProjectionExec: expr=[4 as count(*)] |\ -\n| | PlaceholderRowExec |\ -\n| | ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)] |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | 
|\ -\n+---------------+------------------------------------------------------------------------------------------------------------------------+"; - - assert_eq!( - expected_sql_result, - pretty_format_batches(&sql_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&sql_results).unwrap(), + @r###" + +---------------+------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*) | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __correlated_sq_1 | + | | Projection: count(Int64(1)) AS count(*) | + | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | + | | TableScan: t2 projection=[] | + | physical_plan | CoalesceBatchesExec: target_batch_size=8192 | + | | HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] | + | | ProjectionExec: expr=[4 as count(*)] | + | | PlaceholderRowExec | + | | ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+------------------------------------------------------------------------------------------------------------------------+ + "### ); // In the same SessionContext, AliasGenerator will increase subquery_alias id by 1 @@ -2597,27 +2599,27 @@ async fn test_count_wildcard_on_where_in() -> Result<()> { .collect() .await?; - let actual_df_result= "+---------------+------------------------------------------------------------------------------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+------------------------------------------------------------------------------------------------------------------------+\ -\n| logical_plan | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*) |\ -\n| | TableScan: t1 projection=[a, b] |\ -\n| | SubqueryAlias: __correlated_sq_1 |\ -\n| | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] |\ -\n| | TableScan: t2 projection=[] |\ -\n| physical_plan | CoalesceBatchesExec: target_batch_size=8192 |\ -\n| | HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] |\ -\n| | ProjectionExec: expr=[4 as count(*)] |\ -\n| | PlaceholderRowExec |\ -\n| | ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)] |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | |\ -\n+---------------+------------------------------------------------------------------------------------------------------------------------+"; - // make sure sql plan same with df plan - assert_eq!( - actual_df_result, - pretty_format_batches(&df_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&df_results).unwrap(), + @r###" + +---------------+------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | LeftSemi Join: CAST(t1.a AS Int64) = __correlated_sq_1.count(*) | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __correlated_sq_1 | 
+ | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] | + | | TableScan: t2 projection=[] | + | physical_plan | CoalesceBatchesExec: target_batch_size=8192 | + | | HashJoinExec: mode=Partitioned, join_type=RightSemi, on=[(count(*)@0, CAST(t1.a AS Int64)@2)], projection=[a@0, b@1] | + | | ProjectionExec: expr=[4 as count(*)] | + | | PlaceholderRowExec | + | | ProjectionExec: expr=[a@0 as a, b@1 as b, CAST(a@0 AS Int64) as CAST(t1.a AS Int64)] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+------------------------------------------------------------------------------------------------------------------------+ + "### ); Ok(()) @@ -2633,26 +2635,25 @@ async fn test_count_wildcard_on_where_exist() -> Result<()> { .collect() .await?; - let actual_sql_result = - "+---------------+---------------------------------------------------------+\ - \n| plan_type | plan |\ - \n+---------------+---------------------------------------------------------+\ - \n| logical_plan | LeftSemi Join: |\ - \n| | TableScan: t1 projection=[a, b] |\ - \n| | SubqueryAlias: __correlated_sq_1 |\ - \n| | Projection: |\ - \n| | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] |\ - \n| | TableScan: t2 projection=[] |\ - \n| physical_plan | NestedLoopJoinExec: join_type=RightSemi |\ - \n| | ProjectionExec: expr=[] |\ - \n| | PlaceholderRowExec |\ - \n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ - \n| | |\ - \n+---------------+---------------------------------------------------------+"; - - assert_eq!( - actual_sql_result, - pretty_format_batches(&sql_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&sql_results).unwrap(), + @r###" + +---------------+---------------------------------------------------------+ + | plan_type | plan | + +---------------+---------------------------------------------------------+ + | logical_plan | LeftSemi Join: | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __correlated_sq_1 | + | | Projection: | + | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | + | | TableScan: t2 projection=[] | + | physical_plan | NestedLoopJoinExec: join_type=RightSemi | + | | ProjectionExec: expr=[] | + | | PlaceholderRowExec | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+---------------------------------------------------------+ + "### ); let df_results = ctx @@ -2673,25 +2674,25 @@ async fn test_count_wildcard_on_where_exist() -> Result<()> { .collect() .await?; - let actual_df_result = "+---------------+---------------------------------------------------------------------+\ - \n| plan_type | plan |\ - \n+---------------+---------------------------------------------------------------------+\ - \n| logical_plan | LeftSemi Join: |\ - \n| | TableScan: t1 projection=[a, b] |\ - \n| | SubqueryAlias: __correlated_sq_1 |\ - \n| | Projection: |\ - \n| | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] |\ - \n| | TableScan: t2 projection=[] |\ - \n| physical_plan | NestedLoopJoinExec: join_type=RightSemi |\ - \n| | ProjectionExec: expr=[] |\ - \n| | PlaceholderRowExec |\ - \n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ - \n| | |\ - \n+---------------+---------------------------------------------------------------------+"; - - assert_eq!( - actual_df_result, - pretty_format_batches(&df_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&df_results).unwrap(), + @r###" + 
+---------------+---------------------------------------------------------------------+ + | plan_type | plan | + +---------------+---------------------------------------------------------------------+ + | logical_plan | LeftSemi Join: | + | | TableScan: t1 projection=[a, b] | + | | SubqueryAlias: __correlated_sq_1 | + | | Projection: | + | | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] | + | | TableScan: t2 projection=[] | + | physical_plan | NestedLoopJoinExec: join_type=RightSemi | + | | ProjectionExec: expr=[] | + | | PlaceholderRowExec | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+---------------------------------------------------------------------+ + "### ); Ok(()) @@ -2708,22 +2709,22 @@ async fn test_count_wildcard_on_window() -> Result<()> { .collect() .await?; - let actual_sql_result = "+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\ -\n| plan_type | plan |\ -\n+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\ -\n| logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING |\ -\n| | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] |\ -\n| | TableScan: t1 projection=[a] |\ -\n| physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] |\ -\n| | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Ok(Field { name: \"count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(6)), end_bound: Following(UInt32(2)), is_causal: false }], mode=[Sorted] |\ -\n| | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] |\ -\n| | DataSourceExec: partitions=1, partition_sizes=[1] |\ -\n| | |\ -\n+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+"; - - 
assert_eq!( - actual_sql_result, - pretty_format_batches(&sql_results)?.to_string() + assert_snapshot!( + pretty_format_batches(&sql_results).unwrap(), + @r###" + +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | plan_type | plan | + +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + | logical_plan | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING AS count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING | + | | WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]] | + | | TableScan: t1 projection=[a] | + | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(*) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING] | + | | BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Ok(Field { name: "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(6)), end_bound: Following(UInt32(2)), is_causal: false }], mode=[Sorted] | + | | SortExec: expr=[a@0 DESC], preserve_partitioning=[false] | + | | DataSourceExec: partitions=1, partition_sizes=[1] | + | | | + +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + "### ); let df_results = ctx @@ -2742,22 +2743,22 @@ async fn test_count_wildcard_on_window() -> Result<()> { .collect() .await?; - let actual_df_result = "+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\ -\n| plan_type | plan |\ 
-\n+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\
-\n| logical_plan  | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING                                                                                                                                                                                                                                                                                                                                                           |\
-\n|               |   WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]]                                                                                                                                                                                                                                                                                                                                          |\
-\n|               |     TableScan: t1 projection=[a]                                                                                                                                                                                                                                                                                                                                                                                                                                 |\
-\n| physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]                                                                                                                                                                                                                                                |\
-\n|               |   BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Ok(Field { name: \"count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(6)), end_bound: Following(UInt32(2)), is_causal: false }], mode=[Sorted] |\
-\n|               |     SortExec: expr=[a@0 DESC], preserve_partitioning=[false]                                                                                                                                                                                                                                                                                                                                                                                                     |\
-\n|               |       DataSourceExec: partitions=1, partition_sizes=[1]                                                                                                                                                                                                                                                                                                                                                                                                          |\
-\n|               |                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |\
-\n+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+";
-
-    assert_eq!(
-        actual_df_result,
-        pretty_format_batches(&df_results)?.to_string()
+    assert_snapshot!(
+        pretty_format_batches(&df_results).unwrap(),
+        @r###"
+    +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    | plan_type     | plan                                                                                                                                                                                                                                                                                                                                                                                                                                                              |
+    +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    | logical_plan  | Projection: count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING                                                                                                                                                                                                                                                                                                                                                            |
+    |               |   WindowAggr: windowExpr=[[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]]                                                                                                                                                                                                                                                                                                                                           |
+    |               |     TableScan: t1 projection=[a]                                                                                                                                                                                                                                                                                                                                                                                                                                  |
+    | physical_plan | ProjectionExec: expr=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@1 as count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING]                                                                                                                                                                                                                                                 |
+    |               |   BoundedWindowAggExec: wdw=[count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Ok(Field { name: "count(Int64(1)) ORDER BY [t1.a DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(UInt32(6)), end_bound: Following(UInt32(2)), is_causal: false }], mode=[Sorted] |
+    |               |     SortExec: expr=[a@0 DESC], preserve_partitioning=[false]                                                                                                                                                                                                                                                                                                                                                                                                      |
+    |               |       DataSourceExec: partitions=1, partition_sizes=[1]                                                                                                                                                                                                                                                                                                                                                                                                           |
+    |               |                                                                                                                                                                                                                                                                                                                                                                                                                                                                   |
+    +---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    "###
     );
 
     Ok(())
@@ -2775,20 +2776,20 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> {
         .collect()
         .await?;
 
-    let actual_sql_result =
-        "+---------------+-----------------------------------------------------+\
-\n| plan_type     | plan                                                |\
-\n+---------------+-----------------------------------------------------+\
-\n| logical_plan  | Projection: count(Int64(1)) AS count(*)             |\
-\n|               |   Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] |\
-\n|               |     TableScan: t1 projection=[]                     |\
-\n| physical_plan | ProjectionExec: expr=[4 as count(*)]                |\
-\n|               |   PlaceholderRowExec                                |\
-\n|               |                                                     |\
-\n+---------------+-----------------------------------------------------+";
-    assert_eq!(
-        actual_sql_result,
-        pretty_format_batches(&sql_results)?.to_string()
+    assert_snapshot!(
+        pretty_format_batches(&sql_results).unwrap(),
+        @r###"
+    +---------------+-----------------------------------------------------+
+    | plan_type     | plan                                                |
+    +---------------+-----------------------------------------------------+
+    | logical_plan  | Projection: count(Int64(1)) AS count(*)             |
+    |               |   Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] |
+    |               |     TableScan: t1 projection=[]                     |
+    | physical_plan | ProjectionExec: expr=[4 as count(*)]                |
+    |               |   PlaceholderRowExec                                |
+    |               |                                                     |
+    +---------------+-----------------------------------------------------+
+    "###
     );
 
     // add `.select(vec![count_wildcard()])?` to make sure we can analyze all nodes instead of just the top node.
@@ -2801,18 +2802,19 @@ async fn test_count_wildcard_on_aggregate() -> Result<()> {
         .collect()
         .await?;
 
-    let actual_df_result = "+---------------+---------------------------------------------------------------+\
-\n| plan_type     | plan                                                          |\
-\n+---------------+---------------------------------------------------------------+\
-\n| logical_plan  | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] |\
-\n|               |   TableScan: t1 projection=[]                                 |\
-\n| physical_plan | ProjectionExec: expr=[4 as count(*)]                          |\
-\n|               |   PlaceholderRowExec                                          |\
-\n|               |                                                               |\
-\n+---------------+---------------------------------------------------------------+";
-    assert_eq!(
-        actual_df_result,
-        pretty_format_batches(&df_results)?.to_string()
+    assert_snapshot!(
+        pretty_format_batches(&df_results).unwrap(),
+        @r###"
+    +---------------+---------------------------------------------------------------+
+    | plan_type     | plan                                                          |
+    +---------------+---------------------------------------------------------------+
+    | logical_plan  | Aggregate: groupBy=[[]], aggr=[[count(Int64(1)) AS count(*)]] |
+    |               |   TableScan: t1 projection=[]                                 |
+    | physical_plan | ProjectionExec: expr=[4 as count(*)]                          |
+    |               |   PlaceholderRowExec                                          |
+    |               |                                                               |
+    +---------------+---------------------------------------------------------------+
+    "###
     );
 
     Ok(())
@@ -2829,37 +2831,38 @@ async fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> {
         .collect()
         .await?;
 
-    let actual_sql_result = "+---------------+---------------------------------------------------------------------------------------------------------------------------+\
-\n| plan_type     | plan                                                                                                                      |\
-\n+---------------+---------------------------------------------------------------------------------------------------------------------------+\
-\n| logical_plan  | Projection: t1.a, t1.b                                                                                                    |\
-\n|               |   Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0)          |\
-\n|               |     Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true                                           |\
-\n|               |       Left Join: t1.a = __scalar_sq_1.a                                                                                   |\
-\n|               |         TableScan: t1 projection=[a, b]                                                                                   |\
-\n|               |         SubqueryAlias: __scalar_sq_1                                                                                      |\
-\n|               |           Projection: count(Int64(1)) AS count(*), t2.a, Boolean(true) AS __always_true                                   |\
-\n|               |             Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1))]]                                                         |\
-\n|               |               TableScan: t2 projection=[a]                                                                                |\
-\n| physical_plan | CoalesceBatchesExec: target_batch_size=8192                                                                               |\
-\n|               |   FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1]                     |\
-\n|               |     CoalesceBatchesExec: target_batch_size=8192                                                                           |\
-\n|               |       HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, a@1)], projection=[a@0, b@1, count(*)@2, __always_true@4] |\
-\n|               |         CoalesceBatchesExec: target_batch_size=8192                                                                       |\
-\n|               |           RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1                                                |\
-\n|               |             DataSourceExec: partitions=1, partition_sizes=[1]                                                             |\
-\n|               |         ProjectionExec: expr=[count(Int64(1))@1 as count(*), a@0 as a, true as __always_true]                             |\
-\n|               |           AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))]                                    |\
-\n|               |             CoalesceBatchesExec: target_batch_size=8192                                                                   |\
-\n|               |               RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4                                            |\
-\n|               |                 RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1                                      |\
-\n|               |                   AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))]                                     |\
-\n|               |                     DataSourceExec: partitions=1, partition_sizes=[1]                                                     |\
-\n|               |                                                                                                                           |\
-\n+---------------+---------------------------------------------------------------------------------------------------------------------------+";
-    assert_eq!(
-        actual_sql_result,
-        pretty_format_batches(&sql_results)?.to_string()
+    assert_snapshot!(
+        pretty_format_batches(&sql_results).unwrap(),
+        @r###"
+    +---------------+---------------------------------------------------------------------------------------------------------------------------+
+    | plan_type     | plan                                                                                                                      |
+    +---------------+---------------------------------------------------------------------------------------------------------------------------+
+    | logical_plan  | Projection: t1.a, t1.b                                                                                                    |
+    |               |   Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0)          |
+    |               |     Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true                                           |
+    |               |       Left Join: t1.a = __scalar_sq_1.a                                                                                   |
+    |               |         TableScan: t1 projection=[a, b]                                                                                   |
+    |               |         SubqueryAlias: __scalar_sq_1                                                                                      |
+    |               |           Projection: count(Int64(1)) AS count(*), t2.a, Boolean(true) AS __always_true                                   |
+    |               |             Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1))]]                                                         |
+    |               |               TableScan: t2 projection=[a]                                                                                |
+    | physical_plan | CoalesceBatchesExec: target_batch_size=8192                                                                               |
+    |               |   FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1]                     |
+    |               |     CoalesceBatchesExec: target_batch_size=8192                                                                           |
+    |               |       HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, a@1)], projection=[a@0, b@1, count(*)@2, __always_true@4] |
+    |               |         CoalesceBatchesExec: target_batch_size=8192                                                                       |
+    |               |           RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1                                                |
+    |               |             DataSourceExec: partitions=1, partition_sizes=[1]                                                             |
+    |               |         ProjectionExec: expr=[count(Int64(1))@1 as count(*), a@0 as a, true as __always_true]                             |
+    |               |           AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(Int64(1))]                                    |
+    |               |             CoalesceBatchesExec: target_batch_size=8192                                                                   |
+    |               |               RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4                                            |
+    |               |                 RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1                                      |
+    |               |                   AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(Int64(1))]                                     |
+    |               |                     DataSourceExec: partitions=1, partition_sizes=[1]                                                     |
+    |               |                                                                                                                           |
+    +---------------+---------------------------------------------------------------------------------------------------------------------------+
+    "###
     );
 
     // In the same SessionContext, AliasGenerator will increase subquery_alias id by 1
@@ -2887,37 +2890,38 @@ async fn test_count_wildcard_on_where_scalar_subquery() -> Result<()> {
         .collect()
         .await?;
 
-    let actual_df_result = "+---------------+---------------------------------------------------------------------------------------------------------------------------+\
-\n| plan_type     | plan                                                                                                                      |\
-\n+---------------+---------------------------------------------------------------------------------------------------------------------------+\
-\n| logical_plan  | Projection: t1.a, t1.b                                                                                                    |\
-\n|               |   Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0)          |\
-\n|               |     Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true                                           |\
-\n|               |       Left Join: t1.a = __scalar_sq_1.a                                                                                   |\
-\n|               |         TableScan: t1 projection=[a, b]                                                                                   |\
-\n|               |         SubqueryAlias: __scalar_sq_1                                                                                      |\
-\n|               |           Projection: count(*), t2.a, Boolean(true) AS __always_true                                                      |\
-\n|               |             Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1)) AS count(*)]]                                             |\
-\n|               |               TableScan: t2 projection=[a]                                                                                |\
-\n| physical_plan | CoalesceBatchesExec: target_batch_size=8192                                                                               |\
-\n|               |   FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1]                     |\
-\n|               |     CoalesceBatchesExec: target_batch_size=8192                                                                           |\
-\n|               |       HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, a@1)], projection=[a@0, b@1, count(*)@2, __always_true@4] |\
-\n|               |         CoalesceBatchesExec: target_batch_size=8192                                                                       |\
-\n|               |           RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1                                                |\
-\n|               |             DataSourceExec: partitions=1, partition_sizes=[1]                                                             |\
-\n|               |         ProjectionExec: expr=[count(*)@1 as count(*), a@0 as a, true as __always_true]                                    |\
-\n|               |           AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(*)]                                           |\
-\n|               |             CoalesceBatchesExec: target_batch_size=8192                                                                   |\
-\n|               |               RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4                                            |\
-\n|               |                 RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1                                      |\
-\n|               |                   AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(*)]                                            |\
-\n|               |                     DataSourceExec: partitions=1, partition_sizes=[1]                                                     |\
-\n|               |                                                                                                                           |\
-\n+---------------+---------------------------------------------------------------------------------------------------------------------------+";
-    assert_eq!(
-        actual_df_result,
-        pretty_format_batches(&df_results)?.to_string()
+    assert_snapshot!(
+        pretty_format_batches(&df_results).unwrap(),
+        @r###"
+    +---------------+---------------------------------------------------------------------------------------------------------------------------+
+    | plan_type     | plan                                                                                                                      |
+    +---------------+---------------------------------------------------------------------------------------------------------------------------+
+    | logical_plan  | Projection: t1.a, t1.b                                                                                                    |
+    |               |   Filter: CASE WHEN __scalar_sq_1.__always_true IS NULL THEN Int64(0) ELSE __scalar_sq_1.count(*) END > Int64(0)          |
+    |               |     Projection: t1.a, t1.b, __scalar_sq_1.count(*), __scalar_sq_1.__always_true                                           |
+    |               |       Left Join: t1.a = __scalar_sq_1.a                                                                                   |
+    |               |         TableScan: t1 projection=[a, b]                                                                                   |
+    |               |         SubqueryAlias: __scalar_sq_1                                                                                      |
+    |               |           Projection: count(*), t2.a, Boolean(true) AS __always_true                                                      |
+    |               |             Aggregate: groupBy=[[t2.a]], aggr=[[count(Int64(1)) AS count(*)]]                                             |
+    |               |               TableScan: t2 projection=[a]                                                                                |
+    | physical_plan | CoalesceBatchesExec: target_batch_size=8192                                                                               |
+    |               |   FilterExec: CASE WHEN __always_true@3 IS NULL THEN 0 ELSE count(*)@2 END > 0, projection=[a@0, b@1]                     |
+    |               |     CoalesceBatchesExec: target_batch_size=8192                                                                           |
+    |               |       HashJoinExec: mode=Partitioned, join_type=Left, on=[(a@0, a@1)], projection=[a@0, b@1, count(*)@2, __always_true@4] |
+    |               |         CoalesceBatchesExec: target_batch_size=8192                                                                       |
+    |               |           RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=1                                                |
+    |               |             DataSourceExec: partitions=1, partition_sizes=[1]                                                             |
+    |               |         ProjectionExec: expr=[count(*)@1 as count(*), a@0 as a, true as __always_true]                                    |
+    |               |           AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[count(*)]                                           |
+    |               |             CoalesceBatchesExec: target_batch_size=8192                                                                   |
+    |               |               RepartitionExec: partitioning=Hash([a@0], 4), input_partitions=4                                            |
+    |               |                 RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1                                      |
+    |               |                   AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[count(*)]                                            |
+    |               |                     DataSourceExec: partitions=1, partition_sizes=[1]                                                     |
+    |               |                                                                                                                           |
+    +---------------+---------------------------------------------------------------------------------------------------------------------------+
+    "###
     );
 
     Ok(())
@@ -3086,7 +3090,7 @@ async fn sort_on_distinct_unprojected_columns() -> Result<()> {
         .distinct()?
         .sort(vec![Sort::new(col("b"), false, true)])
         .unwrap_err();
-    assert_eq!(err.strip_backtrace(), "Error during planning: For SELECT DISTINCT, ORDER BY expressions b must appear in select list");
+    assert_snapshot!(err.strip_backtrace(), @"Error during planning: For SELECT DISTINCT, ORDER BY expressions b must appear in select list");
 
     Ok(())
 }
@@ -3104,8 +3108,7 @@ async fn sort_on_ambiguous_column() -> Result<()> {
         .sort(vec![col("b").sort(true, true)])
         .unwrap_err();
 
-    let expected = "Schema error: Ambiguous reference to unqualified field b";
-    assert_eq!(err.strip_backtrace(), expected);
+    assert_snapshot!(err.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field b");
 
     Ok(())
 }
@@ -3123,8 +3126,7 @@ async fn group_by_ambiguous_column() -> Result<()> {
         .aggregate(vec![col("b")], vec![max(col("a"))])
         .unwrap_err();
 
-    let expected = "Schema error: Ambiguous reference to unqualified field b";
-    assert_eq!(err.strip_backtrace(), expected);
+    assert_snapshot!(err.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field b");
 
     Ok(())
 }
@@ -3142,8 +3144,7 @@ async fn filter_on_ambiguous_column() -> Result<()> {
         .filter(col("b").eq(lit(1)))
         .unwrap_err();
 
-    let expected = "Schema error: Ambiguous reference to unqualified field b";
-    assert_eq!(err.strip_backtrace(), expected);
+    assert_snapshot!(err.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field b");
 
     Ok(())
 }
@@ -3161,8 +3162,7 @@ async fn select_ambiguous_column() -> Result<()> {
         .select(vec![col("b")])
         .unwrap_err();
 
-    let expected = "Schema error: Ambiguous reference to unqualified field b";
-    assert_eq!(err.strip_backtrace(), expected);
+    assert_snapshot!(err.strip_backtrace(), @"Schema error: Ambiguous reference to unqualified field b");
 
     Ok(())
 }
@@ -3427,18 +3427,16 @@ async fn join_with_alias_filter() -> Result<()> {
     ])?;
 
     let optimized_plan = df.clone().into_optimized_plan()?;
-    let expected = vec![
-        "Projection: t1.a, t2.a, t1.b, t1.c, t2.b, t2.c [a:UInt32, a:UInt32, b:Utf8, c:Int32, b:Utf8, c:Int32]",
-        "  Inner Join: t1.a + UInt32(3) = t2.a + UInt32(1) [a:UInt32, b:Utf8, c:Int32, a:UInt32, b:Utf8, c:Int32]",
-        "    TableScan: t1 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]",
-        "    TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]",
-    ];
-
     let formatted = optimized_plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r###"
+    Projection: t1.a, t2.a, t1.b, t1.c, t2.b, t2.c [a:UInt32, a:UInt32, b:Utf8, c:Int32, b:Utf8, c:Int32]
+      Inner Join: t1.a + UInt32(3) = t2.a + UInt32(1) [a:UInt32, b:Utf8, c:Int32, a:UInt32, b:Utf8, c:Int32]
+        TableScan: t1 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]
+        TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]
+    "###
     );
 
     let results = df.collect().await?;
@@ -3473,20 +3471,19 @@ async fn right_semi_with_alias_filter() -> Result<()> {
         .join(t2, JoinType::RightSemi, &[], &[], Some(filter))?
         .select(vec![col("t2.a"), col("t2.b"), col("t2.c")])?;
     let optimized_plan = df.clone().into_optimized_plan()?;
-    let expected = vec![
-        "RightSemi Join: t1.a = t2.a [a:UInt32, b:Utf8, c:Int32]",
-        "  Projection: t1.a [a:UInt32]",
-        "    Filter: t1.c > Int32(1) [a:UInt32, c:Int32]",
-        "      TableScan: t1 projection=[a, c] [a:UInt32, c:Int32]",
-        "  Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32]",
-        "    TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]",
-    ];
     let formatted = optimized_plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r###"
+    RightSemi Join: t1.a = t2.a [a:UInt32, b:Utf8, c:Int32]
+      Projection: t1.a [a:UInt32]
+        Filter: t1.c > Int32(1) [a:UInt32, c:Int32]
+          TableScan: t1 projection=[a, c] [a:UInt32, c:Int32]
+      Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32]
+        TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]
+    "###
     );
 
     let results = df.collect().await?;
@@ -3521,19 +3518,18 @@ async fn right_anti_filter_push_down() -> Result<()> {
         .join(t2, JoinType::RightAnti, &[], &[], Some(filter))?
         .select(vec![col("t2.a"), col("t2.b"), col("t2.c")])?;
     let optimized_plan = df.clone().into_optimized_plan()?;
-    let expected = vec![
-        "RightAnti Join: t1.a = t2.a Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32]",
-        "  Projection: t1.a [a:UInt32]",
-        "    Filter: t1.c > Int32(1) [a:UInt32, c:Int32]",
-        "      TableScan: t1 projection=[a, c] [a:UInt32, c:Int32]",
-        "  TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]",
-    ];
     let formatted = optimized_plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r###"
+    RightAnti Join: t1.a = t2.a Filter: t2.c > Int32(1) [a:UInt32, b:Utf8, c:Int32]
+      Projection: t1.a [a:UInt32]
+        Filter: t1.c > Int32(1) [a:UInt32, c:Int32]
+          TableScan: t1 projection=[a, c] [a:UInt32, c:Int32]
+      TableScan: t2 projection=[a, b, c] [a:UInt32, b:Utf8, c:Int32]
+    "###
    );
 
     let results = df.collect().await?;
@@ -4207,18 +4203,17 @@ async fn unnest_with_redundant_columns() -> Result<()> {
         .select(vec![col("shape_id")])?;
 
     let optimized_plan = df.clone().into_optimized_plan()?;
-    let expected = vec![
-        "Projection: shapes.shape_id [shape_id:UInt32]",
-        "  Unnest: lists[shape_id2|depth=1] structs[] [shape_id:UInt32, shape_id2:UInt32;N]",
-        "    Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { name: \"item\", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N]",
-        "      TableScan: shapes projection=[shape_id] [shape_id:UInt32]",
-    ];
     let formatted = optimized_plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+    assert_snapshot!(
+        actual,
+        @r###"
+    Projection: shapes.shape_id [shape_id:UInt32]
+      Unnest: lists[shape_id2|depth=1] structs[] [shape_id:UInt32, shape_id2:UInt32;N]
+        Aggregate: groupBy=[[shapes.shape_id]], aggr=[[array_agg(shapes.shape_id) AS shape_id2]] [shape_id:UInt32, shape_id2:List(Field { name: "item", data_type: UInt32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N]
+          TableScan: shapes projection=[shape_id] [shape_id:UInt32]
+    "###
     );
 
     let results = df.collect().await?;
@@ -4836,24 +4831,24 @@ async fn test_dataframe_placeholder_missing_param_values() -> Result<()> {
 
     let logical_plan = df.logical_plan();
     let formatted = logical_plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    let expected = vec![
-        "Filter: a = $0 [a:Int32]",
-        "  Projection: Int32(1) AS a [a:Int32]",
-        "    EmptyRelation []",
-    ];
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+
+    assert_snapshot!(
+        actual,
+        @r###"
+    Filter: a = $0 [a:Int32]
+      Projection: Int32(1) AS a [a:Int32]
+        EmptyRelation []
+    "###
     );
 
     // Executing LogicalPlans with placeholders that don't have bound values
     // should fail.
     let results = df.collect().await;
     let err_msg = results.unwrap_err().strip_backtrace();
-    assert_eq!(
+    assert_snapshot!(
         err_msg,
-        "Execution error: Placeholder '$0' was not provided a value for execution."
+        @"Execution error: Placeholder '$0' was not provided a value for execution."
     );
 
     // Providing a parameter value should resolve the error
@@ -4869,15 +4864,15 @@ async fn test_dataframe_placeholder_missing_param_values() -> Result<()> {
 
     let logical_plan = df.logical_plan();
     let formatted = logical_plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    let expected = vec![
-        "Filter: a = Int32(3) [a:Int32]",
-        "  Projection: Int32(1) AS a [a:Int32]",
-        "    EmptyRelation []",
-    ];
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+
+    assert_snapshot!(
+        actual,
+        @r###"
+    Filter: a = Int32(3) [a:Int32]
+      Projection: Int32(1) AS a [a:Int32]
+        EmptyRelation []
+    "###
     );
 
     // N.B., the test is basically `SELECT 1 as a WHERE a = 3;` which returns no results.
@@ -4900,26 +4895,23 @@ async fn test_dataframe_placeholder_column_parameter() -> Result<()> {
     let df = ctx.read_empty().unwrap().select_exprs(&["$1"]).unwrap();
     let logical_plan = df.logical_plan();
     let formatted = logical_plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-
-    #[rustfmt::skip]
-    let expected = vec![
-        "Projection: $1 [$1:Null;N]",
-        "  EmptyRelation []"
-    ];
-
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+
+    assert_snapshot!(
+        actual,
+        @r###"
+    Projection: $1 [$1:Null;N]
+      EmptyRelation []
+    "###
    );
 
     // Executing LogicalPlans with placeholders that don't have bound values
     // should fail.
     let results = df.collect().await;
     let err_msg = results.unwrap_err().strip_backtrace();
-    assert_eq!(
+    assert_snapshot!(
         err_msg,
-        "Execution error: Placeholder '$1' was not provided a value for execution."
+        @"Execution error: Placeholder '$1' was not provided a value for execution."
     );
 
     // Providing a parameter value should resolve the error
@@ -4933,14 +4925,14 @@ async fn test_dataframe_placeholder_column_parameter() -> Result<()> {
 
     let logical_plan = df.logical_plan();
     let formatted = logical_plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    let expected = vec![
-        "Projection: Int32(3) AS $1 [$1:Null;N]",
-        "  EmptyRelation []",
-    ];
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+
+    assert_snapshot!(
+        actual,
+        @r###"
+    Projection: Int32(3) AS $1 [$1:Null;N]
+      EmptyRelation []
+    "###
     );
 
     assert_snapshot!(
@@ -4972,24 +4964,24 @@ async fn test_dataframe_placeholder_like_expression() -> Result<()> {
 
     let logical_plan = df.logical_plan();
     let formatted = logical_plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    let expected = vec![
-        "Filter: a LIKE $1 [a:Utf8]",
-        "  Projection: Utf8(\"foo\") AS a [a:Utf8]",
-        "    EmptyRelation []",
-    ];
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+
+    assert_snapshot!(
+        actual,
+        @r###"
+    Filter: a LIKE $1 [a:Utf8]
+      Projection: Utf8("foo") AS a [a:Utf8]
+        EmptyRelation []
+    "###
     );
 
     // Executing LogicalPlans with placeholders that don't have bound values
     // should fail.
     let results = df.collect().await;
     let err_msg = results.unwrap_err().strip_backtrace();
-    assert_eq!(
+    assert_snapshot!(
         err_msg,
-        "Execution error: Placeholder '$1' was not provided a value for execution."
+        @"Execution error: Placeholder '$1' was not provided a value for execution."
     );
 
     // Providing a parameter value should resolve the error
@@ -5005,15 +4997,15 @@ async fn test_dataframe_placeholder_like_expression() -> Result<()> {
 
     let logical_plan = df.logical_plan();
     let formatted = logical_plan.display_indent_schema().to_string();
-    let actual: Vec<&str> = formatted.trim().lines().collect();
-    let expected = vec![
-        "Filter: a LIKE Utf8(\"f%\") [a:Utf8]",
-        "  Projection: Utf8(\"foo\") AS a [a:Utf8]",
-        "    EmptyRelation []",
-    ];
-    assert_eq!(
-        expected, actual,
-        "\n\nexpected:\n\n{expected:#?}\nactual:\n\n{actual:#?}\n\n"
+    let actual = formatted.trim();
+
+    assert_snapshot!(
+        actual,
+        @r###"
+    Filter: a LIKE Utf8("f%") [a:Utf8]
+      Projection: Utf8("foo") AS a [a:Utf8]
+        EmptyRelation []
+    "###
     );
 
     assert_snapshot!(
@@ -5481,15 +5473,17 @@ async fn test_alias() -> Result<()> {
     df.schema().columns().iter().for_each(|c| {
         assert_eq!(c.relation, Some("table_alias".into()));
     });
-    let expected = "SubqueryAlias: table_alias [a:Utf8, b:Int32, one:Int32]\
-    \n  Projection: test.a, test.b, Int32(1) AS one [a:Utf8, b:Int32, one:Int32]\
-    \n    TableScan: test [a:Utf8, b:Int32]";
+
     let plan = df
         .clone()
         .into_unoptimized_plan()
         .display_indent_schema()
         .to_string();
-    assert_eq!(plan, expected);
+    assert_snapshot!(plan, @r###"
+    SubqueryAlias: table_alias [a:Utf8, b:Int32, one:Int32]
+      Projection: test.a, test.b, Int32(1) AS one [a:Utf8, b:Int32, one:Int32]
+        TableScan: test [a:Utf8, b:Int32]
+    "###);
 
     // Select over the aliased DataFrame
     let df = df.select(vec![
@@ -5538,14 +5532,15 @@ async fn test_alias_self_join() -> Result<()> {
 #[tokio::test]
 async fn test_alias_empty() -> Result<()> {
     let df = create_test_table("test").await?.alias("")?;
-    let expected = "SubqueryAlias: [a:Utf8, b:Int32]\
-    \n  TableScan: test [a:Utf8, b:Int32]";
     let plan = df
         .clone()
         .into_unoptimized_plan()
         .display_indent_schema()
         .to_string();
-    assert_eq!(plan, expected);
+    assert_snapshot!(plan, @r###"
+    SubqueryAlias: [a:Utf8, b:Int32]
+      TableScan: test [a:Utf8, b:Int32]
+    "###);
 
     assert_snapshot!(
         batches_to_sort_string(&df.select(vec![col("a"), col("b")])?.collect().await.unwrap()),
@@ -5571,16 +5566,18 @@ async fn test_alias_nested() -> Result<()> {
         .select(vec![col("a"), col("test.b"), lit(1).alias("one")])?
         .alias("alias1")?
        .alias("alias2")?;
-    let expected = "SubqueryAlias: alias2 [a:Utf8, b:Int32, one:Int32]\
-    \n  SubqueryAlias: alias1 [a:Utf8, b:Int32, one:Int32]\
-    \n    Projection: test.a, test.b, Int32(1) AS one [a:Utf8, b:Int32, one:Int32]\
-    \n      TableScan: test projection=[a, b] [a:Utf8, b:Int32]";
+
     let plan = df
         .clone()
         .into_optimized_plan()?
         .display_indent_schema()
         .to_string();
-    assert_eq!(plan, expected);
+    assert_snapshot!(plan, @r###"
+    SubqueryAlias: alias2 [a:Utf8, b:Int32, one:Int32]
+      SubqueryAlias: alias1 [a:Utf8, b:Int32, one:Int32]
+        Projection: test.a, test.b, Int32(1) AS one [a:Utf8, b:Int32, one:Int32]
+          TableScan: test projection=[a, b] [a:Utf8, b:Int32]
+    "###);
 
     // Select over the aliased DataFrame
     let select1 = df
@@ -5603,10 +5600,9 @@ async fn test_alias_nested() -> Result<()> {
 
     // Only the outermost alias is visible
     let select2 = df.select(vec![col("alias1.a")]);
-    assert_eq!(
+    assert_snapshot!(
         select2.unwrap_err().strip_backtrace(),
-        "Schema error: No field named alias1.a. \
-        Valid fields are alias2.a, alias2.b, alias2.one."
+        @"Schema error: No field named alias1.a. Valid fields are alias2.a, alias2.b, alias2.one."
     );
     Ok(())
 }

From 7c837bcb743860e68d3dea2a8359b335ee1b8215 Mon Sep 17 00:00:00 2001
From: Dmitrii Blaginin
Date: Tue, 11 Mar 2025 21:28:03 +0000
Subject: [PATCH 6/6] Format toml

---
 Cargo.toml                 | 2 +-
 datafusion-cli/Cargo.toml  | 2 +-
 datafusion/core/Cargo.toml | 3 +--
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 16a6967910a8..0f55d2e1004b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -155,6 +155,7 @@ parquet = { version = "54.2.1", default-features = false, features = [
 pbjson = { version = "0.7.0" }
 pbjson-types = "0.7"
 # Should match arrow-flight's version of prost.
+insta = { version = "1.41.1", features = ["glob", "filters"] } prost = "0.13.1" rand = "0.8.5" recursive = "0.1.1" @@ -165,7 +166,6 @@ sqlparser = { version = "0.54.0", features = ["visitor"] } tempfile = "3" tokio = { version = "1.43", features = ["macros", "rt", "sync"] } url = "2.5.4" -insta = { version = "1.41.1", features = ["glob", "filters"] } [profile.release] codegen-units = 1 diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index fcc28075c9ee..752697c7f6c7 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -67,7 +67,7 @@ url = { workspace = true } [dev-dependencies] assert_cmd = "2.0" ctor = { workspace = true } -insta-cmd = "0.6.0" insta = { workspace = true } +insta-cmd = "0.6.0" predicates = "3.0" rstest = { workspace = true } diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index c0f487b8b7fc..0209e75ee780 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -152,6 +152,7 @@ datafusion-functions-window-common = { workspace = true } datafusion-physical-optimizer = { workspace = true } doc-comment = { workspace = true } env_logger = { workspace = true } +insta = { workspace = true } paste = "^1.0" rand = { workspace = true, features = ["small_rng"] } rand_distr = "0.4.3" @@ -161,8 +162,6 @@ serde_json = { workspace = true } sysinfo = "0.33.1" test-utils = { path = "../../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot", "fs"] } -insta = { workspace = true } - [target.'cfg(not(target_os = "windows"))'.dev-dependencies] nix = { version = "0.29.0", features = ["fs"] }