Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
//! encompasses the former, resulting in fewer checks during query execution.

use datafusion_common::{Column, Result, ScalarValue};
use datafusion_expr::{BinaryExpr, Cast, Expr, Operator};
use datafusion_expr::{BinaryExpr, Expr, Operator};
use std::collections::BTreeMap;

/// Simplifies a list of predicates by removing redundancies.
Expand Down Expand Up @@ -239,8 +239,99 @@ fn find_most_restrictive_predicate(
fn extract_column_from_expr(expr: &Expr) -> Option<Column> {
match expr {
Expr::Column(col) => Some(col.clone()),
// Handle cases where the column might be wrapped in a cast or other operation
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Expr::Cast(Cast { expr, .. }) => extract_column_from_expr(expr),
_ => None,
}
}

#[cfg(test)]
mod tests {
use super::*;
use arrow::datatypes::DataType;
use datafusion_expr::{cast, col, lit};

#[test]
fn test_simplify_predicates_with_cast() {
// Test that predicates on cast expressions are not grouped with predicates on the raw column
// a < 5 AND CAST(a AS varchar) < 'abc' AND a < 6
// Should simplify to:
// a < 5 AND CAST(a AS varchar) < 'abc'

let predicates = vec![
col("a").lt(lit(5i32)),
cast(col("a"), DataType::Utf8).lt(lit("abc")),
col("a").lt(lit(6i32)),
];

let result = simplify_predicates(predicates).unwrap();

// Should have 2 predicates: a < 5 and CAST(a AS varchar) < 'abc'
assert_eq!(result.len(), 2);

// Check that the cast predicate is preserved
let has_cast_predicate = result.iter().any(|p| {
matches!(p, Expr::BinaryExpr(BinaryExpr {
left,
op: Operator::Lt,
right
}) if matches!(left.as_ref(), Expr::Cast(_)) && right == &Box::new(lit("abc")))
});
assert!(has_cast_predicate, "Cast predicate should be preserved");

// Check that we have the more restrictive column predicate (a < 5)
let has_column_predicate = result.iter().any(|p| {
matches!(p, Expr::BinaryExpr(BinaryExpr {
left,
op: Operator::Lt,
right
}) if left == &Box::new(col("a")) && right == &Box::new(lit(5i32)))
});
assert!(has_column_predicate, "Should have a < 5 predicate");
}

#[test]
fn test_extract_column_ignores_cast() {
// Test that extract_column_from_expr does not extract columns from cast expressions
let cast_expr = cast(col("a"), DataType::Utf8);
assert_eq!(extract_column_from_expr(&cast_expr), None);

// Test that it still extracts from direct column references
let col_expr = col("a");
assert_eq!(extract_column_from_expr(&col_expr), Some(Column::from("a")));
}

#[test]
fn test_simplify_predicates_direct_columns_only() {
// Test that only predicates on direct columns are simplified together
let predicates = vec![
col("a").lt(lit(5i32)),
col("a").lt(lit(3i32)),
col("b").gt(lit(10i32)),
col("b").gt(lit(20i32)),
];

let result = simplify_predicates(predicates).unwrap();

// Should have 2 predicates: a < 3 and b > 20 (most restrictive for each column)
assert_eq!(result.len(), 2);

// Check for a < 3
let has_a_predicate = result.iter().any(|p| {
matches!(p, Expr::BinaryExpr(BinaryExpr {
left,
op: Operator::Lt,
right
}) if left == &Box::new(col("a")) && right == &Box::new(lit(3i32)))
});
assert!(has_a_predicate, "Should have a < 3 predicate");

// Check for b > 20
let has_b_predicate = result.iter().any(|p| {
matches!(p, Expr::BinaryExpr(BinaryExpr {
left,
op: Operator::Gt,
right
}) if left == &Box::new(col("b")) && right == &Box::new(lit(20i32)))
});
assert!(has_b_predicate, "Should have b > 20 predicate");
}
}
25 changes: 25 additions & 0 deletions datafusion/sqllogictest/test_files/push_down_filter.slt
Original file line number Diff line number Diff line change
Expand Up @@ -395,3 +395,28 @@ order by t1.k, t2.v;
----
1 1 1
10000000 10000000 10000000

# Regression test for https://github.com/apache/datafusion/issues/17512

query I
COPY (
SELECT arrow_cast('2025-01-01T00:00:00Z'::timestamptz, 'Timestamp(Microsecond, Some("UTC"))') AS start_timestamp
)
TO 'test_files/scratch/push_down_filter/17512.parquet'
STORED AS PARQUET;
----
1

statement ok
CREATE EXTERNAL TABLE records STORED AS PARQUET LOCATION 'test_files/scratch/push_down_filter/17512.parquet';

query I
SELECT 1
FROM (
SELECT start_timestamp
FROM records
WHERE start_timestamp <= '2025-01-01T00:00:00Z'::timestamptz
) AS t
WHERE t.start_timestamp::time < '00:00:01'::time;
Comment on lines +418 to +420
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like one predicate is on start_timestamp and the other is on start_timestamp::time.
from predicate pushdown perspective, the latter is useless.
from find_most_restrictive_predicate perspective, a predicate c < x1 and f(c) < x2 are incomparable. They need to take the f into account. The optimizer that does that is called "unwrap cast in comparison" AFAICT. The find_most_restrictive_predicate should operate only on predicates comparing column c directly, ignoring those which compare f(c).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They need to take the f into account. The optimizer that does that is called "unwrap cast in comparison" AFAICT.

Yes I agree (assuming f is a cast expression)

The find_most_restrictive_predicate should operate only on predicates comparing column c directly, ignoring those which compare f(c).

That is my understanding of what this PR does. I am not sure if you are just confirming this change or if you are proposing / suggesting something more

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The find_most_restrictive_predicate should operate only on predicates comparing column c directly, ignoring those which compare f(c).

That is my understanding of what this PR does. I am not sure if you are just confirming this change or if you are proposing / suggesting something more

At the time of writing it was a proposal.
Now that the proposed change has been applied, it can be read as a confirmation.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

makes sense -- thank you

----
1