-
Notifications
You must be signed in to change notification settings - Fork 1.8k
#17801 Improve nullability reporting of case expressions #17813
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 13 commits
408cee1
045fc9c
de8b780
bbd2949
2075f4b
8c87937
e155d41
5cfe8b6
ac4267c
101db28
f4c8579
81b6ec1
b6ebd13
ebc2d38
3131899
3da92e5
0a6b2e7
113e899
9dee1e8
4a22dfc
a1bc263
ac765e9
51af749
4af84a7
427fc30
0223a54
c5914d6
4b879e4
5558293
d8df5d1
867da26
1acb33e
fabe190
4f95e6d
97ffdff
0bfd2b2
981c0f7
94f7f00
262734a
398cdb5
7e4bf89
63de91a
44221c4
5115952
4b15030
5d9fc7e
ff9a41f
3e0fd7e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,12 +15,13 @@ | |
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| use super::{Between, Expr, Like}; | ||
| use super::{predicate_eval, Between, Expr, Like}; | ||
| use crate::expr::{ | ||
| AggregateFunction, AggregateFunctionParams, Alias, BinaryExpr, Cast, FieldMetadata, | ||
| InList, InSubquery, Placeholder, ScalarFunction, TryCast, Unnest, WindowFunction, | ||
| WindowFunctionParams, | ||
| }; | ||
| use crate::predicate_eval::TriStateBool; | ||
| use crate::type_coercion::functions::{ | ||
| data_types_with_scalar_udf, fields_with_aggregate_udf, fields_with_window_udf, | ||
| }; | ||
|
|
@@ -279,13 +280,50 @@ impl ExprSchemable for Expr { | |
| Expr::OuterReferenceColumn(field, _) => Ok(field.is_nullable()), | ||
| Expr::Literal(value, _) => Ok(value.is_null()), | ||
| Expr::Case(case) => { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While re-reading this I can't help but think the logic is quite non trivial - and someone trying to figure out if an expression is nullable on a deeply nested function might end up calling this function many times Not for this PR, but I think we should consider how to cache or otherwise avoid re-computing the same nullabilty (and DataType) expressions over and over again. I'll writeup a follow on ticket
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's absolutely correct. Performance overhead concerns were the main reason I had initially avoided rewriting the expression and instead tried to do the rewrite indirectly. Rather than rewriting using a It's probably feasible, but non-trivial to cache this result. What would you use as storage location?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See https://github.com/apache/datafusion/pull/17813/files#r2545958309. That already mitigates the additional calculations a little bit.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, I agree it is non trivial. I wrote up some ideas in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I started looking at the possible options here already a bit. I don't immediately see a simple solution. |
||
| // This expression is nullable if any of the input expressions are nullable | ||
| let then_nullable = case | ||
| // This expression is nullable if any of the then expressions are nullable | ||
| let any_nullable_thens = !case | ||
| .when_then_expr | ||
| .iter() | ||
| .map(|(_, t)| t.nullable(input_schema)) | ||
| .collect::<Result<Vec<_>>>()?; | ||
| if then_nullable.contains(&true) { | ||
| .filter_map(|(w, t)| { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This code is clear and easy to follow. Very nice |
||
| match t.nullable(input_schema) { | ||
| // Branches with a then expression that is not nullable can be skipped | ||
| Ok(false) => None, | ||
| // Pass error determining nullability on verbatim | ||
| Err(e) => Some(Err(e)), | ||
| // For branches with a nullable then expressions try to determine | ||
| // using limited const evaluation if the branch will be taken when | ||
| // the then expression evaluates to null. | ||
| Ok(true) => { | ||
| let const_result = predicate_eval::const_eval_predicate( | ||
| w, | ||
| input_schema, | ||
| |expr| { | ||
| if expr.eq(t) { | ||
| TriStateBool::True | ||
| } else { | ||
| TriStateBool::Uncertain | ||
| } | ||
| }, | ||
| ); | ||
|
|
||
| match const_result { | ||
| // Const evaluation was inconclusive or determined the branch | ||
| // would be taken | ||
| None | Some(TriStateBool::True) => Some(Ok(())), | ||
| // Const evaluation proves the branch will never be taken. | ||
| // The most common pattern for this is | ||
| // `WHEN x IS NOT NULL THEN x`. | ||
| Some(TriStateBool::False) | ||
| | Some(TriStateBool::Uncertain) => None, | ||
| } | ||
| } | ||
| } | ||
| }) | ||
| .collect::<Result<Vec<_>>>()? | ||
| .is_empty(); | ||
|
|
||
| if any_nullable_thens { | ||
| // There is at least one reachable nullable then | ||
| Ok(true) | ||
| } else if let Some(e) = &case.else_expr { | ||
| e.nullable(input_schema) | ||
|
|
@@ -777,7 +815,7 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result<Subq | |
| #[cfg(test)] | ||
| mod tests { | ||
| use super::*; | ||
| use crate::{col, lit, out_ref_col_with_metadata}; | ||
| use crate::{and, col, lit, not, or, out_ref_col_with_metadata, when}; | ||
|
|
||
| use datafusion_common::{internal_err, DFSchema, HashMap, ScalarValue}; | ||
|
|
||
|
|
@@ -830,6 +868,137 @@ mod tests { | |
| assert!(expr.nullable(&get_schema(false)).unwrap()); | ||
| } | ||
|
|
||
| fn assert_nullability(expr: &Expr, schema: &dyn ExprSchema, expected: bool) { | ||
| assert_eq!( | ||
| expr.nullable(schema).unwrap(), | ||
| expected, | ||
| "Nullability of '{expr}' should be {expected}" | ||
| ); | ||
| } | ||
|
|
||
| fn assert_not_nullable(expr: &Expr, schema: &dyn ExprSchema) { | ||
| assert_nullability(expr, schema, false); | ||
| } | ||
|
|
||
| fn assert_nullable(expr: &Expr, schema: &dyn ExprSchema) { | ||
| assert_nullability(expr, schema, true); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_case_expression_nullability() -> Result<()> { | ||
pepijnve marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| let nullable_schema = MockExprSchema::new() | ||
| .with_data_type(DataType::Int32) | ||
| .with_nullable(true); | ||
|
|
||
| let not_nullable_schema = MockExprSchema::new() | ||
| .with_data_type(DataType::Int32) | ||
| .with_nullable(false); | ||
|
|
||
| // CASE WHEN x IS NOT NULL THEN x ELSE 0 | ||
| let e = when(col("x").is_not_null(), col("x")).otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN NOT x IS NULL THEN x ELSE 0 | ||
| let e = when(not(col("x").is_null()), col("x")).otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN X = 5 THEN x ELSE 0 | ||
| let e = when(col("x").eq(lit(5)), col("x")).otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN x IS NOT NULL AND x = 5 THEN x ELSE 0 | ||
| let e = when(and(col("x").is_not_null(), col("x").eq(lit(5))), col("x")) | ||
| .otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN x = 5 AND x IS NOT NULL THEN x ELSE 0 | ||
| let e = when(and(col("x").eq(lit(5)), col("x").is_not_null()), col("x")) | ||
| .otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN x IS NOT NULL OR x = 5 THEN x ELSE 0 | ||
| let e = when(or(col("x").is_not_null(), col("x").eq(lit(5))), col("x")) | ||
| .otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN x = 5 OR x IS NOT NULL THEN x ELSE 0 | ||
| let e = when(or(col("x").eq(lit(5)), col("x").is_not_null()), col("x")) | ||
| .otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN (x = 5 AND x IS NOT NULL) OR (x = bar AND x IS NOT NULL) THEN x ELSE 0 | ||
| let e = when( | ||
| or( | ||
| and(col("x").eq(lit(5)), col("x").is_not_null()), | ||
| and(col("x").eq(col("bar")), col("x").is_not_null()), | ||
| ), | ||
| col("x"), | ||
| ) | ||
| .otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN x = 5 OR x IS NULL THEN x ELSE 0 | ||
| let e = when(or(col("x").eq(lit(5)), col("x").is_null()), col("x")) | ||
| .otherwise(lit(0))?; | ||
| assert_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN x IS TRUE THEN x ELSE 0 | ||
| let e = when(col("x").is_true(), col("x")).otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN x IS NOT TRUE THEN x ELSE 0 | ||
| let e = when(col("x").is_not_true(), col("x")).otherwise(lit(0))?; | ||
| assert_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN x IS FALSE THEN x ELSE 0 | ||
| let e = when(col("x").is_false(), col("x")).otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN x IS NOT FALSE THEN x ELSE 0 | ||
| let e = when(col("x").is_not_false(), col("x")).otherwise(lit(0))?; | ||
| assert_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN x IS UNKNOWN THEN x ELSE 0 | ||
| let e = when(col("x").is_unknown(), col("x")).otherwise(lit(0))?; | ||
| assert_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN x IS NOT UNKNOWN THEN x ELSE 0 | ||
| let e = when(col("x").is_not_unknown(), col("x")).otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN x LIKE 'x' THEN x ELSE 0 | ||
| let e = when(col("x").like(lit("x")), col("x")).otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN 0 THEN x ELSE 0 | ||
| let e = when(lit(0), col("x")).otherwise(lit(0))?; | ||
| assert_not_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| // CASE WHEN 1 THEN x ELSE 0 | ||
| let e = when(lit(1), col("x")).otherwise(lit(0))?; | ||
| assert_nullable(&e, &nullable_schema); | ||
| assert_not_nullable(&e, ¬_nullable_schema); | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_inlist_nullability() { | ||
| let get_schema = |nullable| { | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.