Skip to content
Merged
Show file tree
Hide file tree
Changes from 47 commits
Commits
Show all changes
48 commits
Select commit Hold shift + click to select a range
408cee1
#17801 Improve nullability reporting of case expressions
pepijnve Sep 28, 2025
045fc9c
#17801 Clarify logical expression test cases
pepijnve Sep 29, 2025
de8b780
#17801 Attempt to clarify const evaluation logic
pepijnve Sep 30, 2025
bbd2949
#17801 Extend predicate const evaluation
pepijnve Sep 30, 2025
2075f4b
#17801 Correctly report nullability of implicit casts in predicates
pepijnve Oct 1, 2025
8c87937
#17801 Code formatting
pepijnve Oct 6, 2025
e155d41
Merge branch 'main' into issue_17801
alamb Oct 8, 2025
5cfe8b6
Merge branch 'main' into issue_17801
alamb Oct 8, 2025
ac4267c
Add comment explaining why the logical plan optimizer is triggered
pepijnve Oct 9, 2025
101db28
Simplify predicate eval code
pepijnve Oct 9, 2025
f4c8579
Code formatting
pepijnve Oct 9, 2025
81b6ec1
Add license header
pepijnve Oct 9, 2025
b6ebd13
Merge branch 'main' into issue_17801
alamb Oct 15, 2025
ebc2d38
Merge branch 'refs/heads/main' into issue_17801
pepijnve Nov 6, 2025
3131899
Try to align logical and physical implementations as much as possible
pepijnve Nov 6, 2025
3da92e5
Allow optimizations to change fields from nullable to not-nullable
pepijnve Nov 6, 2025
0a6b2e7
Correctly handle case-with-expression nullability analysis
pepijnve Nov 7, 2025
113e899
Add unit tests for predicate_eval
pepijnve Nov 7, 2025
9dee1e8
Another attempt to make the code easier to read
pepijnve Nov 7, 2025
4a22dfc
Rework predicate_eval to use set arithmetic
pepijnve Nov 8, 2025
a1bc263
Rename predicate_eval to predicate_bounds
pepijnve Nov 8, 2025
ac765e9
Add unit tests for NullableInterval::is_certainly_...
pepijnve Nov 8, 2025
51af749
Formatting
pepijnve Nov 8, 2025
4af84a7
Simplify logical and physical case branch filtering logic
pepijnve Nov 9, 2025
427fc30
Further simplification of `is_null`
pepijnve Nov 10, 2025
0223a54
Merge remote-tracking branch 'upstream/HEAD' into issue_17801
pepijnve Nov 10, 2025
c5914d6
Update bitflags version declaration to match arrow-schema
pepijnve Nov 10, 2025
4b879e4
Silence "needless pass by value" lint
pepijnve Nov 10, 2025
5558293
WIP
pepijnve Nov 11, 2025
d8df5d1
Merge remote-tracking branch 'upstream/main' into issue_17801
pepijnve Nov 18, 2025
867da26
Replace TernarySet with NullableInterval
pepijnve Nov 18, 2025
1acb33e
Move GuaranteeRewriter to `expr`
pepijnve Nov 18, 2025
fabe190
Make GuaranteeRewriter implementation private
pepijnve Nov 18, 2025
4f95e6d
Rewrite 'when' expressions with 'null' guarantee before evaluating bo…
pepijnve Nov 18, 2025
97ffdff
Add additional test cases
pepijnve Nov 18, 2025
0bfd2b2
Make null replacement the fallback branch of f_up
pepijnve Nov 18, 2025
981c0f7
Remove unused dependency
pepijnve Nov 18, 2025
94f7f00
Add additional explanation why nullability change is allowed.
pepijnve Nov 18, 2025
262734a
Allow ScalarValue::Null to be combined with non-null scalars in Inter…
pepijnve Nov 18, 2025
398cdb5
Revert adding initial expression functions for between and like; not …
pepijnve Nov 19, 2025
7e4bf89
Restructure GuaranteeRewriter for readability
pepijnve Nov 19, 2025
63de91a
Do not error out when rewriting 'between' expressions with empty valu…
pepijnve Nov 19, 2025
44221c4
Merge remote-tracking branch 'upstream/main' into issue_17801
pepijnve Nov 19, 2025
5115952
Revert changes in Interval::try_new
pepijnve Nov 19, 2025
4b15030
Mimic wikipedia 3VL truth tables as well as possible in ascii art
pepijnve Nov 19, 2025
5d9fc7e
Revert "Disable failing benchmark query (#17809)"
alamb Sep 29, 2025
ff9a41f
Merge remote-tracking branch 'apache/main' into issue_17801
alamb Nov 20, 2025
3e0fd7e
Update comments
pepijnve Nov 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions datafusion/core/benches/sql_planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -519,9 +519,6 @@ fn criterion_benchmark(c: &mut Criterion) {
};

let raw_tpcds_sql_queries = (1..100)
// skip query 75 until it is fixed
// https://github.com/apache/datafusion/issues/17801
.filter(|q| *q != 75)
.map(|q| std::fs::read_to_string(format!("{tests_path}tpc-ds/{q}.sql")).unwrap())
.collect::<Vec<_>>();

Expand Down
37 changes: 36 additions & 1 deletion datafusion/core/src/physical_planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ use crate::schema_equivalence::schema_satisfied_by;
use arrow::array::{builder::StringBuilder, RecordBatch};
use arrow::compute::SortOptions;
use arrow::datatypes::Schema;
use arrow_schema::Field;
use datafusion_catalog::ScanArgs;
use datafusion_common::display::ToStringifiedPlan;
use datafusion_common::format::ExplainAnalyzeLevel;
Expand Down Expand Up @@ -2520,7 +2521,9 @@ impl<'a> OptimizationInvariantChecker<'a> {
previous_schema: &Arc<Schema>,
) -> Result<()> {
// if the rule is not permitted to change the schema, confirm that it did not change.
if self.rule.schema_check() && plan.schema() != *previous_schema {
if self.rule.schema_check()
&& !is_allowed_schema_change(previous_schema.as_ref(), plan.schema().as_ref())
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change relaxes the schema check slightly. It now allows individual fields to change from nullable to not-nullable which is ok because it only allow a strict subset of the original schema. schema_check has documentation stating that you should disable the schema check entirely if you want to do this. Seemed better to not have to disable checking entirely.

{
internal_err!("PhysicalOptimizer rule '{}' failed. Schema mismatch. Expected original schema: {:?}, got new schema: {:?}",
self.rule.name(),
previous_schema,
Expand All @@ -2536,6 +2539,38 @@ impl<'a> OptimizationInvariantChecker<'a> {
}
}

/// Checks if the change from `old` schema to `new` is allowed or not.
///
/// The current implementation only allows nullability of individual fields to change
/// from 'nullable' to 'not nullable'. This can happen due to physical expressions knowing
/// more about their null-ness than their logical counterparts.
/// This change is allowed because for any field the non-nullable domain `F` is a strict subset
/// of the nullable domain `F ∪ { NULL }`. A physical schema that guarantees a stricter subset
/// of values will not violate any assumptions made based on the less strict schema.
fn is_allowed_schema_change(old: &Schema, new: &Schema) -> bool {
if new.metadata != old.metadata {
return false;
}

if new.fields.len() != old.fields.len() {
return false;
}

let new_fields = new.fields.iter().map(|f| f.as_ref());
let old_fields = old.fields.iter().map(|f| f.as_ref());
old_fields
.zip(new_fields)
.all(|(old, new)| is_allowed_field_change(old, new))
}

fn is_allowed_field_change(old_field: &Field, new_field: &Field) -> bool {
new_field.name() == old_field.name()
&& new_field.data_type() == old_field.data_type()
&& new_field.metadata() == old_field.metadata()
&& (new_field.is_nullable() == old_field.is_nullable()
|| !new_field.is_nullable())
}

impl<'n> TreeNodeVisitor<'n> for OptimizationInvariantChecker<'_> {
type Node = Arc<dyn ExecutionPlan>;

Expand Down
5 changes: 4 additions & 1 deletion datafusion/core/tests/tpcds_planning.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1052,9 +1052,12 @@ async fn regression_test(query_no: u8, create_physical: bool) -> Result<()> {
for sql in &sql {
let df = ctx.sql(sql).await?;
let (state, plan) = df.into_parts();
let plan = state.optimize(&plan)?;
if create_physical {
let _ = state.create_physical_plan(&plan).await?;
} else {
// Run the logical optimizer even if we are not creating the physical plan
// to ensure it will properly succeed
let _ = state.optimize(&plan)?;
}
}

Expand Down
Loading
Loading