Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
408cee1
#17801 Improve nullability reporting of case expressions
pepijnve Sep 28, 2025
045fc9c
#17801 Clarify logical expression test cases
pepijnve Sep 29, 2025
de8b780
#17801 Attempt to clarify const evaluation logic
pepijnve Sep 30, 2025
bbd2949
#17801 Extend predicate const evaluation
pepijnve Sep 30, 2025
2075f4b
#17801 Correctly report nullability of implicit casts in predicates
pepijnve Oct 1, 2025
8c87937
#17801 Code formatting
pepijnve Oct 6, 2025
e155d41
Merge branch 'main' into issue_17801
alamb Oct 8, 2025
5cfe8b6
Merge branch 'main' into issue_17801
alamb Oct 8, 2025
ac4267c
Add comment explaining why the logical plan optimizer is triggered
pepijnve Oct 9, 2025
101db28
Simplify predicate eval code
pepijnve Oct 9, 2025
f4c8579
Code formatting
pepijnve Oct 9, 2025
81b6ec1
Add license header
pepijnve Oct 9, 2025
b6ebd13
Merge branch 'main' into issue_17801
alamb Oct 15, 2025
ebc2d38
Merge branch 'refs/heads/main' into issue_17801
pepijnve Nov 6, 2025
3131899
Try to align logical and physical implementations as much as possible
pepijnve Nov 6, 2025
3da92e5
Allow optimizations to change fields from nullable to not-nullable
pepijnve Nov 6, 2025
0a6b2e7
Correctly handle case-with-expression nullability analysis
pepijnve Nov 7, 2025
113e899
Add unit tests for predicate_eval
pepijnve Nov 7, 2025
9dee1e8
Another attempt to make the code easier to read
pepijnve Nov 7, 2025
4a22dfc
Rework predicate_eval to use set arithmetic
pepijnve Nov 8, 2025
a1bc263
Rename predicate_eval to predicate_bounds
pepijnve Nov 8, 2025
ac765e9
Add unit tests for NullableInterval::is_certainly_...
pepijnve Nov 8, 2025
51af749
Formatting
pepijnve Nov 8, 2025
4af84a7
Simplify logical and physical case branch filtering logic
pepijnve Nov 9, 2025
427fc30
Further simplification of `is_null`
pepijnve Nov 10, 2025
0223a54
Merge remote-tracking branch 'upstream/HEAD' into issue_17801
pepijnve Nov 10, 2025
c5914d6
Update bitflags version declaration to match arrow-schema
pepijnve Nov 10, 2025
4b879e4
Silence "needless pass by value" lint
pepijnve Nov 10, 2025
5558293
WIP
pepijnve Nov 11, 2025
d8df5d1
Merge remote-tracking branch 'upstream/main' into issue_17801
pepijnve Nov 18, 2025
867da26
Replace TernarySet with NullableInterval
pepijnve Nov 18, 2025
1acb33e
Move GuaranteeRewriter to `expr`
pepijnve Nov 18, 2025
fabe190
Make GuaranteeRewriter implementation private
pepijnve Nov 18, 2025
4f95e6d
Rewrite 'when' expressions with 'null' guarantee before evaluating bo…
pepijnve Nov 18, 2025
97ffdff
Add additional test cases
pepijnve Nov 18, 2025
0bfd2b2
Make null replacement the fallback branch of f_up
pepijnve Nov 18, 2025
981c0f7
Remove unused dependency
pepijnve Nov 18, 2025
94f7f00
Add additional explanation why nullability change is allowed.
pepijnve Nov 18, 2025
262734a
Allow ScalarValue::Null to be combined with non-null scalars in Inter…
pepijnve Nov 18, 2025
398cdb5
Revert adding initial expression functions for between and like; not …
pepijnve Nov 19, 2025
7e4bf89
Restructure GuaranteeRewriter for readability
pepijnve Nov 19, 2025
63de91a
Do not error out when rewriting 'between' expressions with empty valu…
pepijnve Nov 19, 2025
44221c4
Merge remote-tracking branch 'upstream/main' into issue_17801
pepijnve Nov 19, 2025
5115952
Revert changes in Interval::try_new
pepijnve Nov 19, 2025
4b15030
Mimic wikipedia 3VL truth tables as well as possible in ascii art
pepijnve Nov 19, 2025
5d9fc7e
Revert "Disable failing benchmark query (#17809)"
alamb Sep 29, 2025
ff9a41f
Merge remote-tracking branch 'apache/main' into issue_17801
alamb Nov 20, 2025
3e0fd7e
Update comments
pepijnve Nov 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion datafusion/core/tests/tpcds_planning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1052,9 +1052,12 @@ async fn regression_test(query_no: u8, create_physical: bool) -> Result<()> {
for sql in &sql {
let df = ctx.sql(sql).await?;
let (state, plan) = df.into_parts();
let plan = state.optimize(&plan)?;
if create_physical {
let _ = state.create_physical_plan(&plan).await?;
} else {
// Run the logical optimizer even if we are not creating the physical plan
// to ensure it will properly succeed
let _ = state.optimize(&plan)?;
}
}

Expand Down
5 changes: 5 additions & 0 deletions datafusion/expr/src/expr_fn.rs
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,11 @@ pub fn is_null(expr: Expr) -> Expr {
Expr::IsNull(Box::new(expr))
}

/// Builds an `IS NOT NULL` expression over `expr`.
///
/// The operand is boxed and wrapped in [`Expr::IsNotNull`].
pub fn is_not_null(expr: Expr) -> Expr {
    let operand = Box::new(expr);
    Expr::IsNotNull(operand)
}

/// Create is true expression
pub fn is_true(expr: Expr) -> Expr {
Expr::IsTrue(Box::new(expr))
Expand Down
183 changes: 176 additions & 7 deletions datafusion/expr/src/expr_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,13 @@
// specific language governing permissions and limitations
// under the License.

use super::{Between, Expr, Like};
use super::{predicate_eval, Between, Expr, Like};
use crate::expr::{
AggregateFunction, AggregateFunctionParams, Alias, BinaryExpr, Cast, FieldMetadata,
InList, InSubquery, Placeholder, ScalarFunction, TryCast, Unnest, WindowFunction,
WindowFunctionParams,
};
use crate::predicate_eval::TriStateBool;
use crate::type_coercion::functions::{
data_types_with_scalar_udf, fields_with_aggregate_udf, fields_with_window_udf,
};
Expand Down Expand Up @@ -279,13 +280,50 @@ impl ExprSchemable for Expr {
Expr::OuterReferenceColumn(field, _) => Ok(field.is_nullable()),
Expr::Literal(value, _) => Ok(value.is_null()),
Expr::Case(case) => {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While re-reading this I can't help but think the logic is quite non-trivial, and someone trying to figure out if an expression is nullable on a deeply nested function might end up calling this function many times.

Not for this PR, but I think we should consider how to cache or otherwise avoid re-computing the same nullability (and DataType) expressions over and over again.

I'll write up a follow-on ticket.

Copy link
Contributor Author

@pepijnve pepijnve Nov 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's absolutely correct. Performance overhead concerns were the main reason I had initially avoided rewriting the expression and instead tried to do the rewrite indirectly. Rather than rewriting using a NullableInterval::Null guarantee, I was checking this using a callback function.

It's probably feasible, but non-trivial to cache this result. What would you use as storage location?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See https://github.com/apache/datafusion/pull/17813/files#r2545958309. That already mitigates the additional calculations a little bit.

Copy link
Contributor

@alamb alamb Nov 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably feasible, but non-trivial to cache this result. What would you use as storage location?

Yes, I agree it is non trivial. I wrote up some ideas in

Copy link
Contributor Author

@pepijnve pepijnve Nov 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have already started looking a bit at the possible options here. I don't immediately see a simple solution.

// This expression is nullable if any of the input expressions are nullable
let then_nullable = case
// This expression is nullable if any of the then expressions are nullable
let any_nullable_thens = !case
.when_then_expr
.iter()
.map(|(_, t)| t.nullable(input_schema))
.collect::<Result<Vec<_>>>()?;
if then_nullable.contains(&true) {
.filter_map(|(w, t)| {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code is clear and easy to follow. Very nice

match t.nullable(input_schema) {
// Branches with a then expression that is not nullable can be skipped
Ok(false) => None,
// Pass error determining nullability on verbatim
Err(e) => Some(Err(e)),
// For branches with a nullable then expressions try to determine
// using limited const evaluation if the branch will be taken when
// the then expression evaluates to null.
Ok(true) => {
let const_result = predicate_eval::const_eval_predicate(
w,
input_schema,
|expr| {
if expr.eq(t) {
TriStateBool::True
} else {
TriStateBool::Uncertain
}
},
);

match const_result {
// Const evaluation was inconclusive or determined the branch
// would be taken
None | Some(TriStateBool::True) => Some(Ok(())),
// Const evaluation proves the branch will never be taken.
// The most common pattern for this is
// `WHEN x IS NOT NULL THEN x`.
Some(TriStateBool::False)
| Some(TriStateBool::Uncertain) => None,
}
}
}
})
.collect::<Result<Vec<_>>>()?
.is_empty();

if any_nullable_thens {
// There is at least one reachable nullable then
Ok(true)
} else if let Some(e) = &case.else_expr {
e.nullable(input_schema)
Expand Down Expand Up @@ -777,7 +815,7 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result<Subq
#[cfg(test)]
mod tests {
use super::*;
use crate::{col, lit, out_ref_col_with_metadata};
use crate::{and, col, lit, not, or, out_ref_col_with_metadata, when};

use datafusion_common::{internal_err, DFSchema, HashMap, ScalarValue};

Expand Down Expand Up @@ -830,6 +868,137 @@ mod tests {
assert!(expr.nullable(&get_schema(false)).unwrap());
}

/// Asserts that `expr.nullable(schema)` succeeds and equals `expected`.
///
/// Panics through `unwrap` when nullability cannot be determined, and through
/// `assert_eq!` (with the expression rendered in the message) on a mismatch.
fn assert_nullability(expr: &Expr, schema: &dyn ExprSchema, expected: bool) {
    let actual = expr.nullable(schema).unwrap();
    assert_eq!(
        actual,
        expected,
        "Nullability of '{expr}' should be {expected}"
    );
}

/// Asserts that `expr` is reported as NOT nullable when resolved against `schema`.
fn assert_not_nullable(expr: &Expr, schema: &dyn ExprSchema) {
    let expected = false;
    assert_nullability(expr, schema, expected);
}

/// Asserts that `expr` is reported as nullable when resolved against `schema`.
fn assert_nullable(expr: &Expr, schema: &dyn ExprSchema) {
    let expected = true;
    assert_nullability(expr, schema, expected);
}

/// Checks the nullability reported for `CASE WHEN <predicate> THEN x ELSE 0`
/// expressions, for a range of predicates, against two mock schemas that differ
/// only in the flag passed to `with_nullable`.
#[test]
fn test_case_expression_nullability() -> Result<()> {
    let nullable_schema = MockExprSchema::new()
        .with_data_type(DataType::Int32)
        .with_nullable(true);

    let not_nullable_schema = MockExprSchema::new()
        .with_data_type(DataType::Int32)
        .with_nullable(false);

    // Builds `CASE WHEN <predicate> THEN x ELSE 0` and asserts:
    //  * against the nullable schema, nullability equals `nullable_for_nullable_x`;
    //  * against the not-nullable schema, the expression is never nullable.
    let check = |predicate: Expr, nullable_for_nullable_x: bool| -> Result<()> {
        let case_expr = when(predicate, col("x")).otherwise(lit(0))?;
        if nullable_for_nullable_x {
            assert_nullable(&case_expr, &nullable_schema);
        } else {
            assert_not_nullable(&case_expr, &nullable_schema);
        }
        assert_not_nullable(&case_expr, &not_nullable_schema);
        Ok(())
    };

    // WHEN x IS NOT NULL
    check(col("x").is_not_null(), false)?;

    // WHEN NOT x IS NULL
    check(not(col("x").is_null()), false)?;

    // WHEN x = 5
    check(col("x").eq(lit(5)), false)?;

    // WHEN x IS NOT NULL AND x = 5
    check(and(col("x").is_not_null(), col("x").eq(lit(5))), false)?;

    // WHEN x = 5 AND x IS NOT NULL
    check(and(col("x").eq(lit(5)), col("x").is_not_null()), false)?;

    // WHEN x IS NOT NULL OR x = 5
    check(or(col("x").is_not_null(), col("x").eq(lit(5))), false)?;

    // WHEN x = 5 OR x IS NOT NULL
    check(or(col("x").eq(lit(5)), col("x").is_not_null()), false)?;

    // WHEN (x = 5 AND x IS NOT NULL) OR (x = bar AND x IS NOT NULL)
    check(
        or(
            and(col("x").eq(lit(5)), col("x").is_not_null()),
            and(col("x").eq(col("bar")), col("x").is_not_null()),
        ),
        false,
    )?;

    // WHEN x = 5 OR x IS NULL
    check(or(col("x").eq(lit(5)), col("x").is_null()), true)?;

    // WHEN x IS TRUE
    check(col("x").is_true(), false)?;

    // WHEN x IS NOT TRUE
    check(col("x").is_not_true(), true)?;

    // WHEN x IS FALSE
    check(col("x").is_false(), false)?;

    // WHEN x IS NOT FALSE
    check(col("x").is_not_false(), true)?;

    // WHEN x IS UNKNOWN
    check(col("x").is_unknown(), true)?;

    // WHEN x IS NOT UNKNOWN
    check(col("x").is_not_unknown(), false)?;

    // WHEN x LIKE 'x'
    check(col("x").like(lit("x")), false)?;

    // WHEN 0
    check(lit(0), false)?;

    // WHEN 1
    check(lit(1), true)?;

    Ok(())
}

#[test]
fn test_inlist_nullability() {
let get_schema = |nullable| {
Expand Down
1 change: 1 addition & 0 deletions datafusion/expr/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ pub mod async_udf;
pub mod statistics {
pub use datafusion_expr_common::statistics::*;
}
mod predicate_eval;
pub mod ptr_eq;
pub mod test;
pub mod tree_node;
Expand Down
Loading