diff --git a/datafusion/pruning/src/lib.rs b/datafusion/pruning/src/lib.rs
index 95dc5888d36b8..f49782153d261 100644
--- a/datafusion/pruning/src/lib.rs
+++ b/datafusion/pruning/src/lib.rs
@@ -15,5116 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! [`PruningPredicate`] to apply filter [`Expr`] to prune "containers"
-//! based on statistics (e.g. Parquet Row Groups)
-//!
-//! [`Expr`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.Expr.html
-use std::collections::HashSet;
-use std::sync::Arc;
+mod pruning_predicate;
 
-use arrow::array::AsArray;
-use arrow::{
-    array::{new_null_array, ArrayRef, BooleanArray},
-    datatypes::{DataType, Field, Schema, SchemaRef},
-    record_batch::{RecordBatch, RecordBatchOptions},
+pub use pruning_predicate::{
+    PredicateRewriter, PruningPredicate, PruningStatistics, RequiredColumns,
+    UnhandledPredicateHook,
 };
-// pub use for backwards compatibility
-pub use datafusion_common::pruning::PruningStatistics;
-use log::{debug, trace};
-
-use datafusion_common::error::{DataFusionError, Result};
-use datafusion_common::tree_node::TransformedResult;
-use datafusion_common::{
-    internal_err, plan_datafusion_err, plan_err,
-    tree_node::{Transformed, TreeNode},
-    ScalarValue,
-};
-use datafusion_common::{Column, DFSchema};
-use datafusion_expr_common::operator::Operator;
-use datafusion_physical_expr::utils::{collect_columns, Guarantee, LiteralGuarantee};
-use datafusion_physical_expr::{expressions as phys_expr, PhysicalExprRef};
-use datafusion_physical_expr_common::physical_expr::snapshot_physical_expr;
-use datafusion_physical_plan::{ColumnarValue, PhysicalExpr};
-
-/// Used to prove that arbitrary predicates (boolean expressions) can not
-/// possibly evaluate to `true` given information about a column provided by
-/// [`PruningStatistics`].
-///
-/// # Introduction
-///
-/// `PruningPredicate` analyzes filter expressions using statistics such as
-/// min/max values and null counts, attempting to prove a "container" (e.g.
-/// Parquet Row Group) can be skipped without reading the actual data,
-/// potentially leading to significant performance improvements.
-///
-/// For example, `PruningPredicate`s are used to prune Parquet Row Groups based
-/// on the min/max values found in the Parquet metadata. If the
-/// `PruningPredicate` can prove that the filter can never evaluate to `true`
-/// for any row in the Row Group, the entire Row Group is skipped during query
-/// execution.
-///
-/// The `PruningPredicate` API is general, and can be used for pruning other
-/// types of containers (e.g. files) based on statistics that may be known from
-/// external catalogs (e.g. Delta Lake) or other sources. How this works is a
-/// subtle topic. See the Background and Implementation section for details.
-///
-/// `PruningPredicate` supports:
-///
-/// 1. Arbitrary expressions (including user defined functions)
-///
-/// 2. Vectorized evaluation (provide more than one set of statistics at a time)
-///    so it is suitable for pruning 1000s of containers.
-///
-/// 3. Any source of information that implements the [`PruningStatistics`] trait
-///    (not just Parquet metadata).
-///
-/// # Example
-///
-/// See the [`pruning.rs` example in the `datafusion-examples`] for a complete
-/// example of how to use `PruningPredicate` to prune files based on min/max
-/// values.
-///
-/// [`pruning.rs` example in the `datafusion-examples`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/pruning.rs
-///
-/// Given an expression like `x = 5` and statistics for 3 containers (Row
-/// Groups, files, etc) `A`, `B`, and `C`:
-///
-/// ```text
-/// A: {x_min = 0, x_max = 4}
-/// B: {x_min = 2, x_max = 10}
-/// C: {x_min = 5, x_max = 8}
-/// ```
-///
-/// `PruningPredicate` will conclude that the rows in container `A` can never
-/// be true (as the maximum value is only `4`), so it can be pruned:
-///
-/// ```text
-/// A: false (no rows could possibly match x = 5)
-/// B: true (rows might match x = 5)
-/// C: true (rows might match x = 5)
-/// ```
-///
-/// See [`PruningPredicate::try_new`] and [`PruningPredicate::prune`] for more information.
-///
-/// # Background
-///
-/// ## Boolean Tri-state logic
-///
-/// To understand the details of the rest of this documentation, it is important
-/// to understand how the tri-state boolean logic in SQL works. As this is
-/// somewhat esoteric, we review it here.
-///
-/// SQL has a notion of `NULL` that represents the value is `“unknown”` and this
-/// uncertainty propagates through expressions. SQL `NULL` behaves very
-/// differently than the `NULL` in most other languages where it is a special,
-/// sentinel value (e.g. `0` in `C/C++`). While representing uncertainty with
-/// `NULL` is powerful and elegant, SQL `NULL`s are often deeply confusing when
-/// first encountered as they behave differently than most programmers may
-/// expect.
-///
-/// In most other programming languages,
-/// * `a == NULL` evaluates to `true` if `a` also has the value `NULL`
-/// * `a == NULL` evaluates to `false` if `a` has any other value
-///
-/// However, in SQL `a = NULL` **always** evaluates to `NULL` (never `true` or
-/// `false`):
-///
-/// Expression    | Result
-/// ------------- | ---------
-/// `1 = NULL`    | `NULL`
-/// `NULL = NULL` | `NULL`
-///
-/// Also important is how `AND` and `OR` work with tri-state boolean logic as
-/// (perhaps counterintuitively) the result is **not** always NULL. While
-/// consistent with the notion of `NULL` representing “unknown”, this is again,
-/// often deeply confusing 🤯 when first encountered.
-///
-/// Expression       | Result  | Intuition
-/// ---------------- | ------- | -----------
-/// `NULL AND true`  | `NULL`  | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change
-/// `NULL AND false` | `false` | If the `NULL` was either `true` or `false` the overall expression is still `false`
-/// `NULL AND NULL`  | `NULL`  |
-///
-/// Expression       | Result  | Intuition
-/// ---------------- | ------- | ----------
-/// `NULL OR true`   | `true`  | If the `NULL` was either `true` or `false` the overall expression is still `true`
-/// `NULL OR false`  | `NULL`  | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change
-/// `NULL OR NULL`   | `NULL`  |
-///
-/// ## SQL Filter Semantics
-///
-/// The SQL `WHERE` clause has a boolean expression, often called a filter or
-/// predicate. The semantics of this predicate are that the query evaluates the
-/// predicate for each row in the input tables and:
-///
-/// * Rows that evaluate to `true` are returned in the query results
-///
-/// * Rows that evaluate to `false` are not returned (“filtered out” or “pruned” or “skipped”).
-///
-/// * Rows that evaluate to `NULL` are **NOT** returned (also “filtered out”).
-///   Note: *this treatment of `NULL` is **DIFFERENT** than how `NULL` is treated
-///   in the rewritten predicate described below.*
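The truth tables above can be exercised directly with arrow's Kleene-logic kernels, which this crate already depends on. A minimal sketch (the array contents are illustrative):

```rust
use arrow::array::BooleanArray;
use arrow::compute::{and_kleene, or_kleene};

fn main() {
    // SQL NULL is encoded as `None`; the Kleene kernels implement the
    // tri-state semantics from the tables above.
    let null = BooleanArray::from(vec![None, None, None]);
    let tfn = BooleanArray::from(vec![Some(true), Some(false), None]);

    // NULL AND true = NULL, NULL AND false = false, NULL AND NULL = NULL
    let and = and_kleene(&null, &tfn).unwrap();
    assert_eq!(and, BooleanArray::from(vec![None, Some(false), None]));

    // NULL OR true = true, NULL OR false = NULL, NULL OR NULL = NULL
    let or = or_kleene(&null, &tfn).unwrap();
    assert_eq!(or, BooleanArray::from(vec![Some(true), None, None]));
}
```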
-///
-/// # `PruningPredicate` Implementation
-///
-/// Armed with the information in the Background section, we can now understand
-/// how the `PruningPredicate` logic works.
-///
-/// ## Interface
-///
-/// **Inputs**
-/// 1. An input schema describing what columns exist
-///
-/// 2. A predicate (expression that evaluates to a boolean)
-///
-/// 3. [`PruningStatistics`] that provides information about columns in that
-///    schema, for multiple “containers”. For each column in each container, it
-///    provides optional information on contained values, min values, max
-///    values, null counts, and row counts.
-///
-/// **Outputs**:
-/// A (non null) boolean value for each container:
-/// * `true`: There MAY be rows that match the predicate
-///
-/// * `false`: There are no rows that could possibly match the predicate (the
-///   predicate can never possibly be true). The container can be pruned (skipped)
-///   entirely.
-///
-/// While `PruningPredicate` will never return a `NULL` value, the
-/// rewritten predicate (as returned by `build_predicate_expression` and used internally
-/// by `PruningPredicate`) may evaluate to `NULL` when some of the min/max values
-/// or null / row counts are not known.
-///
-/// In order to be correct, `PruningPredicate` must return false
-/// **only** if it can determine that for all rows in the container, the
-/// predicate could never evaluate to `true` (always evaluates to either `NULL`
-/// or `false`).
-///
-/// ## Contains Analysis and Min/Max Rewrite
-///
-/// `PruningPredicate` works by first analyzing the predicate to see what
-/// [`LiteralGuarantee`] must hold for the predicate to be true.
-///
-/// Then, the `PruningPredicate` rewrites the original predicate into an
-/// expression that references the min/max values of each column in the original
-/// predicate.
-///
-/// When the min/max values are actually substituted in to this expression and
-/// evaluated, the result means
-///
-/// * `true`: there MAY be rows that pass the predicate, **KEEPS** the container
-///
-/// * `NULL`: there MAY be rows that pass the predicate, **KEEPS** the container.
-///   Note that the rewritten predicate can evaluate to NULL when some of
-///   the min/max values are not known. *Note that this is different than
-///   the SQL filter semantics where `NULL` means the row is filtered
-///   out.*
-///
-/// * `false`: there are no rows that could possibly match the predicate,
-///   **PRUNES** the container
-///
-/// For example, given a column `x`, the `x_min`, `x_max`, `x_null_count`, and
-/// `x_row_count` represent the minimum and maximum values, the null count of
-/// column `x`, and the row count of column `x`, provided by the `PruningStatistics`.
-/// `x_null_count` and `x_row_count` are used to handle the case where the column `x`
-/// is known to be all `NULL`s. Note this is different from knowing nothing about
-/// the column `x`, which confusingly is encoded by returning `NULL` for the min/max
-/// values from [`PruningStatistics::max_values`] and [`PruningStatistics::min_values`].
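To make the inputs side concrete, here is a minimal sketch of a hand-rolled implementation of the [`PruningStatistics`] trait (re-exported above from `datafusion_common`). The `XStats` type and its single `x` column are hypothetical; everything it does not know is reported as `None`, which the rewritten predicate treats conservatively:

```rust
use std::collections::HashSet;
use std::sync::Arc;

use arrow::array::{ArrayRef, BooleanArray, Int32Array};
use datafusion_common::pruning::PruningStatistics;
use datafusion_common::{Column, ScalarValue};

/// Hypothetical min/max metadata for one `x` column across N containers.
struct XStats {
    mins: Vec<Option<i32>>,
    maxes: Vec<Option<i32>>,
}

impl PruningStatistics for XStats {
    fn min_values(&self, column: &Column) -> Option<ArrayRef> {
        (column.name == "x")
            .then(|| Arc::new(Int32Array::from(self.mins.clone())) as ArrayRef)
    }
    fn max_values(&self, column: &Column) -> Option<ArrayRef> {
        (column.name == "x")
            .then(|| Arc::new(Int32Array::from(self.maxes.clone())) as ArrayRef)
    }
    fn num_containers(&self) -> usize {
        self.mins.len()
    }
    // `None` means "unknown": containers are kept rather than pruned.
    fn null_counts(&self, _column: &Column) -> Option<ArrayRef> {
        None
    }
    fn row_counts(&self, _column: &Column) -> Option<ArrayRef> {
        None
    }
    fn contained(
        &self,
        _column: &Column,
        _values: &HashSet<ScalarValue>,
    ) -> Option<BooleanArray> {
        None
    }
}
```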
-///
-/// Here are some examples of the rewritten predicates:
-///
-/// Original Predicate | Rewritten Predicate
-/// ------------------ | --------------------
-/// `x = 5` | `x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max)`
-/// `x < 5` | `x_null_count != x_row_count AND (x_min < 5)`
-/// `x = 5 AND y = 10` | `x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max) AND y_null_count != y_row_count AND (y_min <= 10 AND 10 <= y_max)`
-/// `x IS NULL`  | `x_null_count > 0`
-/// `x IS NOT NULL`  | `x_null_count != row_count`
-/// `CAST(x as int) = 5` | `x_null_count != x_row_count AND (CAST(x_min as int) <= 5 AND 5 <= CAST(x_max as int))`
-///
-/// ## Predicate Evaluation
-/// The `PruningPredicate` works in two passes:
-///
-/// **First pass**: For each `LiteralGuarantee` calls
-/// [`PruningStatistics::contained`] and rules out containers where the
-/// LiteralGuarantees are not satisfied
-///
-/// **Second Pass**: Evaluates the rewritten expression using the
-/// min/max/null_counts/row_counts values for each column for each container. For any
-/// container that this expression evaluates to `false`, it rules out those
-/// containers.
-///
-///
-/// ### Example 1
-///
-/// Given the predicate, `x = 5 AND y = 10`, the rewritten predicate would look like:
-///
-/// ```sql
-/// x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max)
-/// AND
-/// y_null_count != y_row_count AND (y_min <= 10 AND 10 <= y_max)
-/// ```
-///
-/// If we know that for a given container, `x` is between `1 and 100` and we know that
-/// `y` is between `4` and `7`, but we know nothing about the null count and row count of
-/// `x` and `y`, the input statistics might look like:
-///
-/// Column         | Value
-/// -------------- | -----
-/// `x_min`        | `1`
-/// `x_max`        | `100`
-/// `x_null_count` | `null`
-/// `x_row_count`  | `null`
-/// `y_min`        | `4`
-/// `y_max`        | `7`
-/// `y_null_count` | `null`
-/// `y_row_count`  | `null`
-///
-/// When these statistics values are substituted in to the rewritten predicate and
-/// simplified, the result is `false`:
-///
-/// * `null != null AND (1 <= 5 AND 5 <= 100) AND null != null AND (4 <= 10 AND 10 <= 7)`
-/// * `null != null` is `null`, which is not true, so the `AND` moves on to the next clause
-/// * `null AND (1 <= 5 AND 5 <= 100) AND null AND (4 <= 10 AND 10 <= 7)`
-/// * evaluating the clauses further we get:
-/// * `null AND true AND null AND false`
-/// * `null AND false`
-/// * `false`
-///
-/// Returning `false` means the container can be pruned, which matches the
-/// intuition that `x = 5 AND y = 10` can’t be true for any row if all values of `y`
-/// are `7` or less.
-///
-/// Note that if we had ended up with `null AND true AND null AND true` the result
-/// would have been `null`.
-/// `null` is treated the same as `true`, because we can't prove that the predicate is `false`.
-///
-/// If, for some other container, we knew `y` was between the values `4` and
-/// `15`, then the rewritten predicate evaluates to `true` (verifying this is
-/// left as an exercise to the reader -- are you still here?), and the container
-/// **could not** be pruned. The intuition is that there may be rows where the
-/// predicate *might* evaluate to `true`, and the only way to find out is to do
-/// more analysis, for example by actually reading the data and evaluating the
-/// predicate row by row.
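Tying the rewrite and the interface together, the following sketch walks the `x = 5` example from the introduction end to end. It reuses the hypothetical `XStats` type from the sketch above and assumes `PruningPredicate` is consumed via this crate's re-export (named `datafusion_pruning` here for illustration):

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::{Result, ScalarValue};
use datafusion_expr_common::operator::Operator;
use datafusion_physical_expr::expressions as phys_expr;
use datafusion_pruning::PruningPredicate;

fn main() -> Result<()> {
    // Schema with a single nullable Int32 column `x`.
    let schema = Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, true)]));

    // Physical expression for `x = 5`.
    let expr = Arc::new(phys_expr::BinaryExpr::new(
        Arc::new(phys_expr::Column::new("x", 0)),
        Operator::Eq,
        Arc::new(phys_expr::Literal::new(ScalarValue::from(5i32))),
    ));

    let predicate = PruningPredicate::try_new(expr, schema)?;

    // Containers A, B, C from the introduction; `XStats` is the
    // hypothetical PruningStatistics impl sketched earlier.
    let stats = XStats {
        mins: vec![Some(0), Some(2), Some(5)],
        maxes: vec![Some(4), Some(10), Some(8)],
    };

    // A is pruned (its max of 4 rules out x = 5); B and C may match.
    assert_eq!(predicate.prune(&stats)?, vec![false, true, true]);
    Ok(())
}
```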
-///
-/// ### Example 2
-///
-/// Given the same predicate, `x = 5 AND y = 10`, the rewritten predicate would
-/// look the same as in example 1:
-///
-/// ```sql
-/// x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max)
-/// AND
-/// y_null_count != y_row_count AND (y_min <= 10 AND 10 <= y_max)
-/// ```
-///
-/// If we know that for another given container, `x_min` is NULL and `x_max` is
-/// NULL (the min/max values are unknown), `x_null_count` is `100` and `x_row_count`
-/// is `100`; we know that `y` is between `4` and `7`, but we know nothing about
-/// the null count and row count of `y`. The input statistics might look like:
-///
-/// Column         | Value
-/// -------------- | -----
-/// `x_min`        | `null`
-/// `x_max`        | `null`
-/// `x_null_count` | `100`
-/// `x_row_count`  | `100`
-/// `y_min`        | `4`
-/// `y_max`        | `7`
-/// `y_null_count` | `null`
-/// `y_row_count`  | `null`
-///
-/// When these statistics values are substituted in to the rewritten predicate and
-/// simplified, the result is `false`:
-///
-/// * `100 != 100 AND (null <= 5 AND 5 <= null) AND null != null AND (4 <= 10 AND 10 <= 7)`
-/// * `false AND null AND null AND false`
-/// * `false AND false`
-/// * `false`
-///
-/// Returning `false` means the container can be pruned, which matches the
-/// intuition that `x = 5 AND y = 10` can’t be true because all values in `x`
-/// are known to be NULL.
-///
-/// # Related Work
-///
-/// [`PruningPredicate`] implements the type of min/max pruning described in
-/// Section `3.3.3` of the [`Snowflake SIGMOD Paper`]. The technique is
-/// described by various research such as [small materialized aggregates], [zone
-/// maps], and [data skipping].
-///
-/// [`Snowflake SIGMOD Paper`]: https://dl.acm.org/doi/10.1145/2882903.2903741
-/// [small materialized aggregates]: https://www.vldb.org/conf/1998/p476.pdf
-/// [zone maps]: https://dl.acm.org/doi/10.1007/978-3-642-03730-6_10
-/// [data skipping]: https://dl.acm.org/doi/10.1145/2588555.2610515
-#[derive(Debug, Clone)]
-pub struct PruningPredicate {
-    /// The input schema against which the predicate will be evaluated
-    schema: SchemaRef,
-    /// A min/max pruning predicate (rewritten in terms of column min/max
-    /// values, which are supplied by statistics)
-    predicate_expr: Arc<dyn PhysicalExpr>,
-    /// Description of which statistics are required to evaluate `predicate_expr`
-    required_columns: RequiredColumns,
-    /// Original physical predicate from which this predicate expr is derived
-    /// (required for serialization)
-    orig_expr: Arc<dyn PhysicalExpr>,
-    /// [`LiteralGuarantee`]s used to try and prove a predicate can not possibly
-    /// evaluate to `true`.
-    ///
-    /// See [`PruningPredicate::literal_guarantees`] for more details.
-    literal_guarantees: Vec<LiteralGuarantee>,
-}
-
-/// Rewrites predicates that [`PredicateRewriter`] can not handle, e.g. certain
-/// complex expressions or predicates that reference columns that are not in the
-/// schema.
-pub trait UnhandledPredicateHook {
-    /// Called when a predicate can not be rewritten in terms of statistics or
-    /// references a column that is not in the schema.
-    fn handle(&self, expr: &Arc<dyn PhysicalExpr>) -> Arc<dyn PhysicalExpr>;
-}
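Implementing the hook is straightforward. Here is a hypothetical hook that logs what could not be rewritten before falling back to the same conservative `true` (keep the container) that the default hook returns:

```rust
use std::sync::Arc;

use datafusion_common::ScalarValue;
use datafusion_physical_expr::expressions as phys_expr;
use datafusion_physical_plan::PhysicalExpr;
use datafusion_pruning::UnhandledPredicateHook;

/// Hypothetical hook: log, then keep the container (never prune).
struct LoggingHook;

impl UnhandledPredicateHook for LoggingHook {
    fn handle(&self, expr: &Arc<dyn PhysicalExpr>) -> Arc<dyn PhysicalExpr> {
        log::debug!("predicate not rewritable for pruning: {expr}");
        Arc::new(phys_expr::Literal::new(ScalarValue::from(true)))
    }
}
```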
-
-/// The default handling for unhandled predicates is to return a constant `true`
-/// (meaning don't prune the container)
-#[derive(Debug, Clone)]
-struct ConstantUnhandledPredicateHook {
-    default: Arc<dyn PhysicalExpr>,
-}
-
-impl Default for ConstantUnhandledPredicateHook {
-    fn default() -> Self {
-        Self {
-            default: Arc::new(phys_expr::Literal::new(ScalarValue::from(true))),
-        }
-    }
-}
-
-impl UnhandledPredicateHook for ConstantUnhandledPredicateHook {
-    fn handle(&self, _expr: &Arc<dyn PhysicalExpr>) -> Arc<dyn PhysicalExpr> {
-        Arc::clone(&self.default)
-    }
-}
-
-impl PruningPredicate {
-    /// Try to create a new instance of [`PruningPredicate`]
-    ///
-    /// This will translate the provided `expr` filter expression into
-    /// a *pruning predicate*.
-    ///
-    /// A pruning predicate is one that has been rewritten in terms of
-    /// the min and max values of column references and that evaluates
-    /// to FALSE if the filter predicate would evaluate FALSE *for
-    /// every row* whose values fell within the min / max ranges (aka
-    /// could be pruned).
-    ///
-    /// The pruning predicate evaluates to TRUE or NULL
-    /// if the filter predicate *might* evaluate to TRUE for at least
-    /// one row whose values fell within the min/max ranges (in other
-    /// words they might pass the predicate)
-    ///
-    /// For example, the filter expression `(column / 2) = 4` becomes
-    /// the pruning predicate
-    /// `(column_min / 2) <= 4 && 4 <= (column_max / 2)`
-    ///
-    /// See the struct level documentation on [`PruningPredicate`] for more
-    /// details.
-    pub fn try_new(expr: Arc<dyn PhysicalExpr>, schema: SchemaRef) -> Result<Self> {
-        // Get a (simpler) snapshot of the physical expr here to use with `PruningPredicate`
-        // which does not handle dynamic exprs in general
-        let expr = snapshot_physical_expr(expr)?;
-        let unhandled_hook = Arc::new(ConstantUnhandledPredicateHook::default()) as _;
-
-        // build predicate expression once
-        let mut required_columns = RequiredColumns::new();
-        let predicate_expr = build_predicate_expression(
-            &expr,
-            schema.as_ref(),
-            &mut required_columns,
-            &unhandled_hook,
-        );
-
-        let literal_guarantees = LiteralGuarantee::analyze(&expr);
-
-        Ok(Self {
-            schema,
-            predicate_expr,
-            required_columns,
-            orig_expr: expr,
-            literal_guarantees,
-        })
-    }
-
-    /// For each set of statistics, evaluates the pruning predicate
-    /// and returns a `bool` with the following meaning for
-    /// all rows whose values match the statistics:
-    ///
-    /// `true`: There MAY be rows that match the predicate
-    ///
-    /// `false`: There are no rows that could possibly match the predicate
-    ///
-    /// Note: the predicate passed to `prune` should already be simplified as
-    /// much as possible (e.g. this pass doesn't handle some
-    /// expressions like `b = false`, but it does handle the
-    /// simplified version `b`). See [`ExprSimplifier`] to simplify expressions.
-    ///
-    /// [`ExprSimplifier`]: https://docs.rs/datafusion/latest/datafusion/optimizer/simplify_expressions/struct.ExprSimplifier.html
-    pub fn prune<S: PruningStatistics>(
-        &self,
-        statistics: &S,
-    ) -> Result<Vec<bool>> {
-        let mut builder = BoolVecBuilder::new(statistics.num_containers());
-
-        // Try to prove the predicate can't be true for the containers based on
-        // literal guarantees
-        for literal_guarantee in &self.literal_guarantees {
-            let LiteralGuarantee {
-                column,
-                guarantee,
-                literals,
-            } = literal_guarantee;
-            if let Some(results) = statistics.contained(column, literals) {
-                match guarantee {
-                    // `In` means the values in the column must be one of the
-                    // values in the set for the predicate to evaluate to true.
-                    // If `contained` returns false, that means the column is
-                    // not any of the values so we can prune the container
-                    Guarantee::In => builder.combine_array(&results),
-                    // `NotIn` means the values in the column must not be
-                    // any of the values in the set for the predicate to
-                    // evaluate to true. If `contained` returns true, it means
-                    // every value in the column is in the set, so we can prune
-                    // the container
-                    Guarantee::NotIn => {
-                        builder.combine_array(&arrow::compute::not(&results)?)
-                    }
-                }
-                // if all containers are pruned (have rows that DEFINITELY DO NOT pass the predicate)
-                // we can return early without evaluating the rest of the predicates.
-                if builder.check_all_pruned() {
-                    return Ok(builder.build());
-                }
-            }
-        }
-
-        // Next, try to prove the predicate can't be true for the containers based
-        // on min/max values
-
-        // build a RecordBatch that contains the min/max values in the
-        // appropriate statistics columns for the min/max predicate
-        let statistics_batch =
-            build_statistics_record_batch(statistics, &self.required_columns)?;
-
-        // Evaluate the pruning predicate on that record batch and append any results to the builder
-        builder.combine_value(self.predicate_expr.evaluate(&statistics_batch)?);
-
-        Ok(builder.build())
-    }
-
-    /// Return a reference to the input schema
-    pub fn schema(&self) -> &SchemaRef {
-        &self.schema
-    }
-
-    /// Returns a reference to the physical expr used to construct this pruning predicate
-    pub fn orig_expr(&self) -> &Arc<dyn PhysicalExpr> {
-        &self.orig_expr
-    }
-
-    /// Returns a reference to the predicate expr
-    pub fn predicate_expr(&self) -> &Arc<dyn PhysicalExpr> {
-        &self.predicate_expr
-    }
-
-    /// Returns a reference to the literal guarantees
-    ///
-    /// Note that **All** `LiteralGuarantee`s must be satisfied for the
-    /// expression to possibly be `true`. If any is not satisfied, the
-    /// expression is guaranteed to be `null` or `false`.
-    pub fn literal_guarantees(&self) -> &[LiteralGuarantee] {
-        &self.literal_guarantees
-    }
-
-    /// Returns true if this pruning predicate can not prune anything.
-    ///
-    /// This happens if the predicate is a literal `true` and
-    /// literal_guarantees is empty.
-    ///
-    /// This can happen when a predicate is simplified to a constant `true`
-    pub fn always_true(&self) -> bool {
-        is_always_true(&self.predicate_expr) && self.literal_guarantees.is_empty()
-    }
-
-    // this is only used by the `parquet` feature right now
-    #[allow(dead_code)]
-    pub fn required_columns(&self) -> &RequiredColumns {
-        &self.required_columns
-    }
-
-    /// Names of the columns that are known to be / not be in a set
-    /// of literals (constants). These are the columns that may be passed to
-    /// [`PruningStatistics::contained`] during pruning.
-    ///
-    /// This is useful to avoid fetching statistics for columns that will not be
-    /// used in the predicate.
-    /// For example, it can be used to avoid reading
-    /// unneeded bloom filters (a non trivial operation).
-    pub fn literal_columns(&self) -> Vec<String> {
-        let mut seen = HashSet::new();
-        self.literal_guarantees
-            .iter()
-            .map(|e| &e.column.name)
-            // avoid duplicates
-            .filter(|name| seen.insert(*name))
-            .map(|s| s.to_string())
-            .collect()
-    }
-}
-
-/// Builds the return `Vec<bool>` for [`PruningPredicate::prune`].
-#[derive(Debug)]
-struct BoolVecBuilder {
-    /// One element per container. Each element is
-    /// * `true`: if the container has rows that may pass the predicate
-    /// * `false`: if the container has rows that DEFINITELY DO NOT pass the predicate
-    inner: Vec<bool>,
-}
-
-impl BoolVecBuilder {
-    /// Create a new `BoolVecBuilder` with `num_containers` elements
-    fn new(num_containers: usize) -> Self {
-        Self {
-            // assume by default all containers may pass the predicate
-            inner: vec![true; num_containers],
-        }
-    }
-
-    /// Combines result `array` for a conjunct (e.g. `AND` clause) of a
-    /// predicate into the currently in progress array.
-    ///
-    /// Each `array` element is:
-    /// * `true`: container has rows that may pass the predicate
-    /// * `false`: all container rows DEFINITELY DO NOT pass the predicate
-    /// * `null`: container may or may not have rows that pass the predicate
-    fn combine_array(&mut self, array: &BooleanArray) {
-        assert_eq!(array.len(), self.inner.len());
-        for (cur, new) in self.inner.iter_mut().zip(array.iter()) {
-            // `false` for this conjunct means we know for sure no rows could
-            // pass the predicate and thus we set the corresponding container
-            // location to false.
-            if let Some(false) = new {
-                *cur = false;
-            }
-        }
-    }
-
-    /// Combines the results in the [`ColumnarValue`] to the currently in
-    /// progress array, following the same rules as [`Self::combine_array`].
-    ///
-    /// # Panics
-    /// If `value` is not boolean
-    fn combine_value(&mut self, value: ColumnarValue) {
-        match value {
-            ColumnarValue::Array(array) => {
-                self.combine_array(array.as_boolean());
-            }
-            ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))) => {
-                // False means all containers can not pass the predicate
-                self.inner = vec![false; self.inner.len()];
-            }
-            _ => {
-                // Null or true means the rows in container may pass this
-                // conjunct so we can't prune any containers based on that
-            }
-        }
-    }
-
-    /// Convert this builder into a `Vec` of bools
-    fn build(self) -> Vec<bool> {
-        self.inner
-    }
-
-    /// Checks whether all containers have rows that DEFINITELY DO NOT pass the predicate
-    fn check_all_pruned(&self) -> bool {
-        self.inner.iter().all(|&x| !x)
-    }
-}
-
-fn is_always_true(expr: &Arc<dyn PhysicalExpr>) -> bool {
-    expr.as_any()
-        .downcast_ref::<phys_expr::Literal>()
-        .map(|l| matches!(l.value(), ScalarValue::Boolean(Some(true))))
-        .unwrap_or_default()
-}
-
-fn is_always_false(expr: &Arc<dyn PhysicalExpr>) -> bool {
-    expr.as_any()
-        .downcast_ref::<phys_expr::Literal>()
-        .map(|l| matches!(l.value(), ScalarValue::Boolean(Some(false))))
-        .unwrap_or_default()
-}
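For intuition on how the first pass of `prune` consumes [`PruningStatistics::contained`] results, the sketch below mirrors the `Guarantee::NotIn` inversion using arrow's `not` kernel (the per-container values are illustrative):

```rust
use arrow::array::BooleanArray;
use arrow::compute::not;

fn main() {
    // `contained` result per container for the set {5}:
    // false => the container holds none of the values in the set
    // true  => the container holds only values from the set
    // null  => unknown
    let contained = BooleanArray::from(vec![Some(false), Some(true), None]);

    // Guarantee::In prunes where `contained` is false. Guarantee::NotIn
    // inverts first, so a container holding only set values is pruned;
    // `not` preserves nulls, keeping unknown containers.
    let not_in = not(&contained).unwrap();
    assert_eq!(not_in, BooleanArray::from(vec![Some(true), Some(false), None]));
}
```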
-
-/// Describes which column statistics are necessary to evaluate a
-/// [`PruningPredicate`].
-///
-/// This structure permits reading and creating the minimum number of statistics,
-/// which is important since statistics may be non trivial to read (e.g. large
-/// strings or when there are 1000s of columns).
-///
-/// Handles creating references to the min/max statistics
-/// for columns as well as recording which statistics are needed
-#[derive(Debug, Default, Clone)]
-pub struct RequiredColumns {
-    /// The statistics required to evaluate this predicate:
-    /// * The unqualified column in the input schema
-    /// * Statistics type (e.g. Min or Max or Null_Count)
-    /// * The field the statistics value should be placed in for
-    ///   pruning predicate evaluation (e.g. `min_value` or `max_value`)
-    columns: Vec<(phys_expr::Column, StatisticsType, Field)>,
-}
-
-impl RequiredColumns {
-    fn new() -> Self {
-        Self::default()
-    }
-
-    /// Returns Some(column) if this is a single column predicate.
-    ///
-    /// Returns None if this is a multi-column predicate.
-    ///
-    /// Examples:
-    /// * `a > 5 OR a < 10` returns `Some(a)`
-    /// * `a > 5 OR b < 10` returns `None`
-    /// * `true` returns `None`
-    #[allow(dead_code)]
-    // this fn is only used by the `parquet` feature right now, thus the `allow(dead_code)`
-    pub fn single_column(&self) -> Option<&phys_expr::Column> {
-        if self.columns.windows(2).all(|w| {
-            // check if all columns are the same (ignoring statistics and field)
-            let c1 = &w[0].0;
-            let c2 = &w[1].0;
-            c1 == c2
-        }) {
-            self.columns.first().map(|r| &r.0)
-        } else {
-            None
-        }
-    }
-
-    /// Returns an iterator over items in columns (see doc on
-    /// `self.columns` for details)
-    pub(crate) fn iter(
-        &self,
-    ) -> impl Iterator<Item = &(phys_expr::Column, StatisticsType, Field)> {
-        self.columns.iter()
-    }
-
-    fn find_stat_column(
-        &self,
-        column: &phys_expr::Column,
-        statistics_type: StatisticsType,
-    ) -> Option<usize> {
-        match statistics_type {
-            StatisticsType::RowCount => {
-                // Use the first row count we find, if any
-                self.columns
-                    .iter()
-                    .enumerate()
-                    .find(|(_i, (_c, t, _f))| t == &statistics_type)
-                    .map(|(i, (_c, _t, _f))| i)
-            }
-            _ => self
-                .columns
-                .iter()
-                .enumerate()
-                .find(|(_i, (c, t, _f))| c == column && t == &statistics_type)
-                .map(|(i, (_c, _t, _f))| i),
-        }
-    }
-
-    /// Rewrites column_expr so that all appearances of column
-    /// are replaced with a reference to either the min or max
-    /// statistics column, while keeping track that a reference to the statistics
-    /// column is required
-    ///
-    /// For example, an expression like `col("foo") > 5`, when called
-    /// with Max would result in an expression like `col("foo_max") >
-    /// 5` with the appropriate entry noted in self.columns
-    fn stat_column_expr(
-        &mut self,
-        column: &phys_expr::Column,
-        column_expr: &Arc<dyn PhysicalExpr>,
-        field: &Field,
-        stat_type: StatisticsType,
-    ) -> Result<Arc<dyn PhysicalExpr>> {
-        let (idx, need_to_insert) = match self.find_stat_column(column, stat_type) {
-            Some(idx) => (idx, false),
-            None => (self.columns.len(), true),
-        };
-
-        let column_name = column.name();
-        let stat_column_name = match stat_type {
-            StatisticsType::Min => format!("{column_name}_min"),
-            StatisticsType::Max => format!("{column_name}_max"),
-            StatisticsType::NullCount => format!("{column_name}_null_count"),
-            StatisticsType::RowCount => "row_count".to_string(),
-        };
-
-        let stat_column = phys_expr::Column::new(&stat_column_name, idx);
-
-        // only add statistics column if not previously added
-        if need_to_insert {
-            // may be null if statistics are not present
-            let nullable = true;
-            let stat_field =
-                Field::new(stat_column.name(), field.data_type().clone(), nullable);
-            self.columns.push((column.clone(), stat_type, stat_field));
-        }
-        rewrite_column_expr(Arc::clone(column_expr), column, &stat_column)
-    }
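The naming scheme `stat_column_expr` applies is easy to state on its own. This standalone sketch (a hypothetical helper, not part of the API) mirrors it, including the detail that the row count column is shared across all columns and therefore not prefixed:

```rust
// Mirrors the match in stat_column_expr above.
fn stat_column_name(column_name: &str, stat: &str) -> String {
    match stat {
        "row_count" => "row_count".to_string(), // shared, unprefixed
        other => format!("{column_name}_{other}"),
    }
}

fn main() {
    assert_eq!(stat_column_name("foo", "min"), "foo_min");
    assert_eq!(stat_column_name("foo", "max"), "foo_max");
    assert_eq!(stat_column_name("foo", "null_count"), "foo_null_count");
    assert_eq!(stat_column_name("foo", "row_count"), "row_count");
}
```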
-
-    /// rewrite col --> col_min
-    fn min_column_expr(
-        &mut self,
-        column: &phys_expr::Column,
-        column_expr: &Arc<dyn PhysicalExpr>,
-        field: &Field,
-    ) -> Result<Arc<dyn PhysicalExpr>> {
-        self.stat_column_expr(column, column_expr, field, StatisticsType::Min)
-    }
-
-    /// rewrite col --> col_max
-    fn max_column_expr(
-        &mut self,
-        column: &phys_expr::Column,
-        column_expr: &Arc<dyn PhysicalExpr>,
-        field: &Field,
-    ) -> Result<Arc<dyn PhysicalExpr>> {
-        self.stat_column_expr(column, column_expr, field, StatisticsType::Max)
-    }
-
-    /// rewrite col --> col_null_count
-    fn null_count_column_expr(
-        &mut self,
-        column: &phys_expr::Column,
-        column_expr: &Arc<dyn PhysicalExpr>,
-        field: &Field,
-    ) -> Result<Arc<dyn PhysicalExpr>> {
-        self.stat_column_expr(column, column_expr, field, StatisticsType::NullCount)
-    }
-
-    /// rewrite col --> col_row_count
-    fn row_count_column_expr(
-        &mut self,
-        column: &phys_expr::Column,
-        column_expr: &Arc<dyn PhysicalExpr>,
-        field: &Field,
-    ) -> Result<Arc<dyn PhysicalExpr>> {
-        self.stat_column_expr(column, column_expr, field, StatisticsType::RowCount)
-    }
-}
-
-impl From<Vec<(phys_expr::Column, StatisticsType, Field)>> for RequiredColumns {
-    fn from(columns: Vec<(phys_expr::Column, StatisticsType, Field)>) -> Self {
-        Self { columns }
-    }
-}
-
-/// Build a RecordBatch from a list of statistics, creating arrays,
-/// with one row for each container and columns specified in
-/// the `required_columns` parameter.
-///
-/// For example, if the requested columns are
-/// ```text
-/// ("s1", Min, Field:s1_min)
-/// ("s2", Max, field:s2_max)
-///```
-///
-/// And the input statistics had
-/// ```text
-/// S1(Min: 5, Max: 10)
-/// S2(Min: 99, Max: 1000)
-/// S3(Min: 1, Max: 2)
-/// ```
-///
-/// Then this function would build a record batch with 2 columns and
-/// one row s1_min and s2_max as follows (s3 is not requested):
-///
-/// ```text
-/// s1_min | s2_max
-/// -------+--------
-/// 5      | 1000
-/// ```
-fn build_statistics_record_batch<S: PruningStatistics>(
-    statistics: &S,
-    required_columns: &RequiredColumns,
-) -> Result<RecordBatch> {
-    let mut fields = Vec::<Field>::new();
-    let mut arrays = Vec::<ArrayRef>::new();
-    // For each needed statistics column:
-    for (column, statistics_type, stat_field) in required_columns.iter() {
-        let column = Column::from_name(column.name());
-        let data_type = stat_field.data_type();
-
-        let num_containers = statistics.num_containers();
-
-        let array = match statistics_type {
-            StatisticsType::Min => statistics.min_values(&column),
-            StatisticsType::Max => statistics.max_values(&column),
-            StatisticsType::NullCount => statistics.null_counts(&column),
-            StatisticsType::RowCount => statistics.row_counts(&column),
-        };
-        let array = array.unwrap_or_else(|| new_null_array(data_type, num_containers));
-
-        if num_containers != array.len() {
-            return internal_err!(
-                "mismatched statistics length. Expected {}, got {}",
-                num_containers,
-                array.len()
-            );
-        }
-
-        // cast statistics array to required data type (e.g. parquet
-        // provides timestamp statistics as "Int64")
-        let array = arrow::compute::cast(&array, data_type)?;
-
-        fields.push(stat_field.clone());
-        arrays.push(array);
-    }
-
-    let schema = Arc::new(Schema::new(fields));
-    // provide the count in case there were no needed statistics
-    let mut options = RecordBatchOptions::default();
-    options.row_count = Some(statistics.num_containers());
-
-    trace!("Creating statistics batch for {required_columns:#?} with {arrays:#?}");
-
-    RecordBatch::try_new_with_options(schema, arrays, &options).map_err(|err| {
-        plan_datafusion_err!("Can not create statistics record batch: {err}")
-    })
-}
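The batch produced for the `("s1", Min)` / `("s2", Max)` example above can be built by hand, which also shows why the explicit row count matters when no statistics columns are required at all. A sketch using only arrow APIs:

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::{RecordBatch, RecordBatchOptions};

fn main() -> Result<(), arrow::error::ArrowError> {
    // One row per container; one column per required statistic.
    let schema = Arc::new(Schema::new(vec![
        Field::new("s1_min", DataType::Int32, true),
        Field::new("s2_max", DataType::Int32, true),
    ]));
    let arrays: Vec<ArrayRef> = vec![
        Arc::new(Int32Array::from(vec![Some(5)])),
        Arc::new(Int32Array::from(vec![Some(1000)])),
    ];
    // The explicit row count keeps the batch valid even with zero columns.
    let options = RecordBatchOptions::default().with_row_count(Some(1));
    let batch = RecordBatch::try_new_with_options(schema, arrays, &options)?;
    assert_eq!(batch.num_rows(), 1);
    Ok(())
}
```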
-
-struct PruningExpressionBuilder<'a> {
-    column: phys_expr::Column,
-    column_expr: Arc<dyn PhysicalExpr>,
-    op: Operator,
-    scalar_expr: Arc<dyn PhysicalExpr>,
-    field: &'a Field,
-    required_columns: &'a mut RequiredColumns,
-}
-
-impl<'a> PruningExpressionBuilder<'a> {
-    fn try_new(
-        left: &'a Arc<dyn PhysicalExpr>,
-        right: &'a Arc<dyn PhysicalExpr>,
-        op: Operator,
-        schema: &'a Schema,
-        required_columns: &'a mut RequiredColumns,
-    ) -> Result<Self> {
-        // find column name; input could be a more complicated expression
-        let left_columns = collect_columns(left);
-        let right_columns = collect_columns(right);
-        let (column_expr, scalar_expr, columns, correct_operator) =
-            match (left_columns.len(), right_columns.len()) {
-                (1, 0) => (left, right, left_columns, op),
-                (0, 1) => (right, left, right_columns, reverse_operator(op)?),
-                _ => {
-                    // if more than one column used in expression - not supported
-                    return plan_err!(
-                        "Multi-column expressions are not currently supported"
-                    );
-                }
-            };
-
-        let df_schema = DFSchema::try_from(schema.clone())?;
-        let (column_expr, correct_operator, scalar_expr) = rewrite_expr_to_prunable(
-            column_expr,
-            correct_operator,
-            scalar_expr,
-            df_schema,
-        )?;
-        let column = columns.iter().next().unwrap().clone();
-        let field = match schema.column_with_name(column.name()) {
-            Some((_, f)) => f,
-            _ => {
-                return plan_err!("Field not found in schema");
-            }
-        };
-
-        Ok(Self {
-            column,
-            column_expr,
-            op: correct_operator,
-            scalar_expr,
-            field,
-            required_columns,
-        })
-    }
-
-    fn op(&self) -> Operator {
-        self.op
-    }
-
-    fn scalar_expr(&self) -> &Arc<dyn PhysicalExpr> {
-        &self.scalar_expr
-    }
-
-    fn min_column_expr(&mut self) -> Result<Arc<dyn PhysicalExpr>> {
-        self.required_columns
-            .min_column_expr(&self.column, &self.column_expr, self.field)
-    }
-
-    fn max_column_expr(&mut self) -> Result<Arc<dyn PhysicalExpr>> {
-        self.required_columns
-            .max_column_expr(&self.column, &self.column_expr, self.field)
-    }
-
-    /// This function simply returns the `null_count` physical expression, no matter what
-    /// the predicate expression is
-    ///
-    /// i.e., x > 5 => x_null_count,
-    /// cast(x as int) < 10 => x_null_count,
-    /// try_cast(x as float) < 10.0 => x_null_count
-    fn null_count_column_expr(&mut self) -> Result<Arc<dyn PhysicalExpr>> {
-        // Rebind to the plain [`phys_expr::Column`]
-        let column_expr = Arc::new(self.column.clone()) as _;
-
-        // null_count is DataType::UInt64, which is different from the column's data type (i.e. self.field)
-        let null_count_field = &Field::new(self.field.name(), DataType::UInt64, true);
-
-        self.required_columns.null_count_column_expr(
-            &self.column,
-            &column_expr,
-            null_count_field,
-        )
-    }
-
-    /// This function simply returns the `row_count` physical expression, no matter what
-    /// the predicate expression is
-    ///
-    /// i.e., x > 5 => x_row_count,
-    /// cast(x as int) < 10 => x_row_count,
-    /// try_cast(x as float) < 10.0 => x_row_count
-    fn row_count_column_expr(&mut self) -> Result<Arc<dyn PhysicalExpr>> {
-        // Rebind to the plain [`phys_expr::Column`]
-        let column_expr = Arc::new(self.column.clone()) as _;
-
-        // row_count is DataType::UInt64, which is different from the column's data type (i.e. self.field)
-        let row_count_field = &Field::new(self.field.name(), DataType::UInt64, true);
-
-        self.required_columns.row_count_column_expr(
-            &self.column,
-            &column_expr,
-            row_count_field,
-        )
-    }
-}
-
-/// This function is designed to rewrite the column_expr to
-/// ensure the column_expr is monotonically increasing.
-///
-/// For example,
-/// 1. `col > 10`
-/// 2. `-col > 10` should be rewritten to `col < -10`
-/// 3. `!col = true` would be rewritten to `col = !true`
-/// 4. `abs(a - 10) > 0` not supported
-/// 5. `cast(can_prunable_expr) > 10`
-/// 6. `try_cast(can_prunable_expr) > 10`
-///
-/// More rewrite rules are still in progress.
-fn rewrite_expr_to_prunable(
-    column_expr: &PhysicalExprRef,
-    op: Operator,
-    scalar_expr: &PhysicalExprRef,
-    schema: DFSchema,
-) -> Result<(PhysicalExprRef, Operator, PhysicalExprRef)> {
-    if !is_compare_op(op) {
-        return plan_err!("rewrite_expr_to_prunable only supports compare expressions");
-    }
-
-    let column_expr_any = column_expr.as_any();
-
-    if column_expr_any
-        .downcast_ref::<phys_expr::Column>()
-        .is_some()
-    {
-        // `col op lit()`
-        Ok((Arc::clone(column_expr), op, Arc::clone(scalar_expr)))
-    } else if let Some(cast) = column_expr_any.downcast_ref::<phys_expr::CastExpr>() {
-        // `cast(col) op lit()`
-        let arrow_schema: SchemaRef = schema.clone().into();
-        let from_type = cast.expr().data_type(&arrow_schema)?;
-        verify_support_type_for_prune(&from_type, cast.cast_type())?;
-        let (left, op, right) =
-            rewrite_expr_to_prunable(cast.expr(), op, scalar_expr, schema)?;
-        let left = Arc::new(phys_expr::CastExpr::new(
-            left,
-            cast.cast_type().clone(),
-            None,
-        ));
-        Ok((left, op, right))
-    } else if let Some(try_cast) =
-        column_expr_any.downcast_ref::<phys_expr::TryCastExpr>()
-    {
-        // `try_cast(col) op lit()`
-        let arrow_schema: SchemaRef = schema.clone().into();
-        let from_type = try_cast.expr().data_type(&arrow_schema)?;
-        verify_support_type_for_prune(&from_type, try_cast.cast_type())?;
-        let (left, op, right) =
-            rewrite_expr_to_prunable(try_cast.expr(), op, scalar_expr, schema)?;
-        let left = Arc::new(phys_expr::TryCastExpr::new(
-            left,
-            try_cast.cast_type().clone(),
-        ));
-        Ok((left, op, right))
-    } else if let Some(neg) = column_expr_any.downcast_ref::<phys_expr::NegativeExpr>() {
-        // `-col > lit()` --> `col < -lit()`
-        let (left, op, right) =
-            rewrite_expr_to_prunable(neg.arg(), op, scalar_expr, schema)?;
-        let right = Arc::new(phys_expr::NegativeExpr::new(right));
-        Ok((left, reverse_operator(op)?, right))
-    } else if let Some(not) = column_expr_any.downcast_ref::<phys_expr::NotExpr>() {
-        // `!col = true` --> `col = !true`
-        if op != Operator::Eq && op != Operator::NotEq {
-            return plan_err!("Not with operator other than Eq / NotEq is not supported");
-        }
-        if not
-            .arg()
-            .as_any()
-            .downcast_ref::<phys_expr::Column>()
-            .is_some()
-        {
-            let left = Arc::clone(not.arg());
-            let right = Arc::new(phys_expr::NotExpr::new(Arc::clone(scalar_expr)));
-            Ok((left, reverse_operator(op)?, right))
-        } else {
-            plan_err!("Not with complex expression {column_expr:?} is not supported")
-        }
-    } else {
-        plan_err!("column expression {column_expr:?} is not supported")
-    }
-}
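The negation and NOT rewrites above rely on simple logical equivalences, which a quick property check makes evident:

```rust
fn main() {
    // Rule 2 above: `-col > lit` is equivalent to `col < -lit`,
    // so flipping the operator and negating the literal preserves
    // the predicate for every input.
    for col in -100i32..=100 {
        let lit = 10;
        assert_eq!(-col > lit, col < -lit);
    }

    // Rule 3: `!col = true` is equivalent to `col = !true`.
    for col in [false, true] {
        assert_eq!(!col == true, col == !true);
    }
}
```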
-
-fn is_compare_op(op: Operator) -> bool {
-    matches!(
-        op,
-        Operator::Eq
-            | Operator::NotEq
-            | Operator::Lt
-            | Operator::LtEq
-            | Operator::Gt
-            | Operator::GtEq
-            | Operator::LikeMatch
-            | Operator::NotLikeMatch
-    )
-}
-
-fn is_string_type(data_type: &DataType) -> bool {
-    matches!(
-        data_type,
-        DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View
-    )
-}
-
-// The pruning logic is based on comparing the min/max bounds, so we must make
-// sure the two types have a defined ordering. For example, casting from string
-// to number is not correct, because "13" is less than "3" in UTF8 comparison
-// order.
-fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Result<()> {
-    // Dictionary casts are always supported as long as the value types are supported
-    let from_type = match from_type {
-        DataType::Dictionary(_, t) => {
-            return verify_support_type_for_prune(t.as_ref(), to_type)
-        }
-        _ => from_type,
-    };
-    let to_type = match to_type {
-        DataType::Dictionary(_, t) => {
-            return verify_support_type_for_prune(from_type, t.as_ref())
-        }
-        _ => to_type,
-    };
-    // If both types are strings or both are not strings (number, timestamp, etc)
-    // then we can compare them.
-    // PruningPredicate does not support casting of strings to numbers and such.
-    if is_string_type(from_type) == is_string_type(to_type) {
-        Ok(())
-    } else {
-        plan_err!(
-            "Try Cast/Cast with from type {from_type} to type {to_type} is not supported"
-        )
-    }
-}
-
-/// replaces a column with an old name with a new name in an expression
-fn rewrite_column_expr(
-    e: Arc<dyn PhysicalExpr>,
-    column_old: &phys_expr::Column,
-    column_new: &phys_expr::Column,
-) -> Result<Arc<dyn PhysicalExpr>> {
-    e.transform(|expr| {
-        if let Some(column) = expr.as_any().downcast_ref::<phys_expr::Column>() {
-            if column == column_old {
-                return Ok(Transformed::yes(Arc::new(column_new.clone())));
-            }
-        }
-
-        Ok(Transformed::no(expr))
-    })
-    .data()
-}
-
-fn reverse_operator(op: Operator) -> Result<Operator> {
-    op.swap().ok_or_else(|| {
-        DataFusionError::Internal(format!(
-            "Could not reverse operator {op} while building pruning predicate"
-        ))
-    })
-}
-
-/// Given a column reference to `column`, returns a pruning
-/// expression in terms of the min and max that will evaluate to true
-/// if the column may contain values, and false if it definitely does not
-/// contain values
-fn build_single_column_expr(
-    column: &phys_expr::Column,
-    schema: &Schema,
-    required_columns: &mut RequiredColumns,
-    is_not: bool, // if true, treat as !col
-) -> Option<Arc<dyn PhysicalExpr>> {
-    let field = schema.field_with_name(column.name()).ok()?;
-
-    if matches!(field.data_type(), &DataType::Boolean) {
-        let col_ref = Arc::new(column.clone()) as _;
-
-        let min = required_columns
-            .min_column_expr(column, &col_ref, field)
-            .ok()?;
-        let max = required_columns
-            .max_column_expr(column, &col_ref, field)
-            .ok()?;
-
-        // remember -- we want an expression that is:
-        // TRUE: if there may be rows that match
-        // FALSE: if there are no rows that match
-        if is_not {
-            // The only way we know a column couldn't match is if both the min and max are true
-            // !(min && max)
-            Some(Arc::new(phys_expr::NotExpr::new(Arc::new(
-                phys_expr::BinaryExpr::new(min, Operator::And, max),
-            ))))
-        } else {
-            // the only way we know a column couldn't match is if both the min and max are false
-            // !(!min && !max) --> min || max
-            Some(Arc::new(phys_expr::BinaryExpr::new(min, Operator::Or, max)))
-        }
-    } else {
-        None
-    }
-}
-
-/// Given an expression reference to `expr`, if `expr` is a column expression,
-/// returns a pruning expression in terms of IsNull that will evaluate to true
-/// if the column may contain null, and false if it definitely does not
-/// contain null.
-/// If `with_not` is true, build a pruning expression for `col IS NOT NULL`:
-/// `col_row_count != col_null_count`.
-/// The pruning expression evaluates to true ONLY if the column definitely CONTAINS
-/// at least one NULL value. In this case we know that `IS NOT NULL` can not be true and
-/// thus can prune the row group / value
-fn build_is_null_column_expr(
-    expr: &Arc<dyn PhysicalExpr>,
-    schema: &Schema,
-    required_columns: &mut RequiredColumns,
-    with_not: bool,
-) -> Option<Arc<dyn PhysicalExpr>> {
-    if let Some(col) = expr.as_any().downcast_ref::<phys_expr::Column>() {
-        let field = schema.field_with_name(col.name()).ok()?;
-
-        let null_count_field = &Field::new(field.name(), DataType::UInt64, true);
-        if with_not {
-            if let Ok(row_count_expr) =
-                required_columns.row_count_column_expr(col, expr, null_count_field)
-            {
-                required_columns
-                    .null_count_column_expr(col, expr, null_count_field)
-                    .map(|null_count_column_expr| {
-                        // IsNotNull(column) => null_count != row_count
-                        Arc::new(phys_expr::BinaryExpr::new(
-                            null_count_column_expr,
-                            Operator::NotEq,
-                            row_count_expr,
-                        )) as _
-                    })
-                    .ok()
-            } else {
-                None
-            }
-        } else {
-            required_columns
-                .null_count_column_expr(col, expr, null_count_field)
-                .map(|null_count_column_expr| {
-                    // IsNull(column) => null_count > 0
-                    Arc::new(phys_expr::BinaryExpr::new(
-                        null_count_column_expr,
-                        Operator::Gt,
-                        Arc::new(phys_expr::Literal::new(ScalarValue::UInt64(Some(0)))),
-                    )) as _
-                })
-                .ok()
-        }
-    } else {
-        None
-    }
-}
-
-/// The maximum number of entries in an `InList` that might be rewritten into
-/// an OR chain
-const MAX_LIST_VALUE_SIZE_REWRITE: usize = 20;
-
-/// Rewrite a predicate expression in terms of statistics (min/max/null_counts)
-/// for use as a [`PruningPredicate`].
-pub struct PredicateRewriter {
-    unhandled_hook: Arc<dyn UnhandledPredicateHook>,
-}
-
-impl Default for PredicateRewriter {
-    fn default() -> Self {
-        Self {
-            unhandled_hook: Arc::new(ConstantUnhandledPredicateHook::default()),
-        }
-    }
-}
-
-impl PredicateRewriter {
-    /// Create a new `PredicateRewriter`
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Set the unhandled hook to be used when a predicate can not be rewritten
-    pub fn with_unhandled_hook(
-        self,
-        unhandled_hook: Arc<dyn UnhandledPredicateHook>,
-    ) -> Self {
-        Self { unhandled_hook }
-    }
-
-    /// Translate logical filter expression into pruning predicate
-    /// expression that will evaluate to FALSE if it can be determined no
-    /// rows between the min/max values could pass the predicates.
-    ///
-    /// Any predicates that can not be translated will be passed to `unhandled_hook`.
-    ///
-    /// Returns the pruning predicate as a [`PhysicalExpr`]
-    ///
-    /// Notice: Does not handle [`phys_expr::InListExpr`] with more than 20 values,
-    /// which instead fall back to calling `unhandled_hook`
-    pub fn rewrite_predicate_to_statistics_predicate(
-        &self,
-        expr: &Arc<dyn PhysicalExpr>,
-        schema: &Schema,
-    ) -> Arc<dyn PhysicalExpr> {
-        let mut required_columns = RequiredColumns::new();
-        build_predicate_expression(
-            expr,
-            schema,
-            &mut required_columns,
-            &self.unhandled_hook,
-        )
-    }
-}
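Typical usage of `PredicateRewriter` looks like the following sketch. It again assumes this crate is consumed as `datafusion_pruning`, and the printed shape of the rewritten expression is approximate:

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::ScalarValue;
use datafusion_expr_common::operator::Operator;
use datafusion_physical_expr::expressions as phys_expr;
use datafusion_physical_plan::PhysicalExpr;
use datafusion_pruning::PredicateRewriter;

fn main() {
    let schema = Schema::new(vec![Field::new("x", DataType::Int32, true)]);

    // Physical expression for `x < 5`.
    let expr: Arc<dyn PhysicalExpr> = Arc::new(phys_expr::BinaryExpr::new(
        Arc::new(phys_expr::Column::new("x", 0)),
        Operator::Lt,
        Arc::new(phys_expr::Literal::new(ScalarValue::from(5i32))),
    ));

    let rewritten = PredicateRewriter::new()
        .rewrite_predicate_to_statistics_predicate(&expr, &schema);

    // Prints something like: x_null_count != row_count AND x_min < 5
    println!("{rewritten}");
}
```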
-
-/// Translate logical filter expression into pruning predicate
-/// expression that will evaluate to FALSE if it can be determined no
-/// rows between the min/max values could pass the predicates.
-///
-/// Any predicates that can not be translated will be passed to `unhandled_hook`.
-///
-/// Returns the pruning predicate as a [`PhysicalExpr`]
-///
-/// Notice: Does not handle [`phys_expr::InListExpr`] with more than 20 values,
-/// which instead fall back to calling `unhandled_hook`
-fn build_predicate_expression(
-    expr: &Arc<dyn PhysicalExpr>,
-    schema: &Schema,
-    required_columns: &mut RequiredColumns,
-    unhandled_hook: &Arc<dyn UnhandledPredicateHook>,
-) -> Arc<dyn PhysicalExpr> {
-    if is_always_false(expr) {
-        // Shouldn't return `unhandled_hook.handle(expr)` here,
-        // because that would turn `false` into `true`.
-        return Arc::clone(expr);
-    }
-    // predicate expression can only be a binary expression
-    let expr_any = expr.as_any();
-    if let Some(is_null) = expr_any.downcast_ref::<phys_expr::IsNullExpr>() {
-        return build_is_null_column_expr(is_null.arg(), schema, required_columns, false)
-            .unwrap_or_else(|| unhandled_hook.handle(expr));
-    }
-    if let Some(is_not_null) = expr_any.downcast_ref::<phys_expr::IsNotNullExpr>() {
-        return build_is_null_column_expr(
-            is_not_null.arg(),
-            schema,
-            required_columns,
-            true,
-        )
-        .unwrap_or_else(|| unhandled_hook.handle(expr));
-    }
-    if let Some(col) = expr_any.downcast_ref::<phys_expr::Column>() {
-        return build_single_column_expr(col, schema, required_columns, false)
-            .unwrap_or_else(|| unhandled_hook.handle(expr));
-    }
-    if let Some(not) = expr_any.downcast_ref::<phys_expr::NotExpr>() {
-        // match !col (don't do so recursively)
-        if let Some(col) = not.arg().as_any().downcast_ref::<phys_expr::Column>() {
-            return build_single_column_expr(col, schema, required_columns, true)
-                .unwrap_or_else(|| unhandled_hook.handle(expr));
-        } else {
-            return unhandled_hook.handle(expr);
-        }
-    }
-    if let Some(in_list) = expr_any.downcast_ref::<phys_expr::InListExpr>() {
-        if !in_list.list().is_empty()
-            && in_list.list().len() <= MAX_LIST_VALUE_SIZE_REWRITE
-        {
-            let eq_op = if in_list.negated() {
-                Operator::NotEq
-            } else {
-                Operator::Eq
-            };
-            let re_op = if in_list.negated() {
-                Operator::And
-            } else {
-                Operator::Or
-            };
-            let change_expr = in_list
-                .list()
-                .iter()
-                .map(|e| {
-                    Arc::new(phys_expr::BinaryExpr::new(
-                        Arc::clone(in_list.expr()),
-                        eq_op,
-                        Arc::clone(e),
-                    )) as _
-                })
-                .reduce(|a, b| Arc::new(phys_expr::BinaryExpr::new(a, re_op, b)) as _)
-                .unwrap();
-            return build_predicate_expression(
-                &change_expr,
-                schema,
-                required_columns,
-                unhandled_hook,
-            );
-        } else {
-            return unhandled_hook.handle(expr);
-        }
-    }
-
-    let (left, op, right) = {
-        if let Some(bin_expr) = expr_any.downcast_ref::<phys_expr::BinaryExpr>() {
-            (
-                Arc::clone(bin_expr.left()),
-                *bin_expr.op(),
-                Arc::clone(bin_expr.right()),
-            )
-        } else if let Some(like_expr) = expr_any.downcast_ref::<phys_expr::LikeExpr>() {
-            if like_expr.case_insensitive() {
-                return unhandled_hook.handle(expr);
-            }
-            let op = match (like_expr.negated(), like_expr.case_insensitive()) {
-                (false, false) => Operator::LikeMatch,
-                (true, false) => Operator::NotLikeMatch,
-                (false, true) => Operator::ILikeMatch,
-                (true, true) => Operator::NotILikeMatch,
-            };
-            (
-                Arc::clone(like_expr.expr()),
-                op,
-                Arc::clone(like_expr.pattern()),
-            )
-        } else {
-            return unhandled_hook.handle(expr);
-        }
-    };
-
-    if op == Operator::And || op == Operator::Or {
-        let left_expr =
-            build_predicate_expression(&left, schema, required_columns, unhandled_hook);
-        let right_expr =
-            build_predicate_expression(&right, schema, required_columns, unhandled_hook);
-        // simplify boolean expression if applicable
-        let expr = match (&left_expr, op, &right_expr) {
-            (left, Operator::And, right)
-                if is_always_false(left) || is_always_false(right) =>
-            {
-                Arc::new(phys_expr::Literal::new(ScalarValue::Boolean(Some(false))))
-            }
-            (left, Operator::And, _) if is_always_true(left) => right_expr,
-            (_, Operator::And, right) if is_always_true(right) => left_expr,
-            (left, Operator::Or, right)
-                if is_always_true(left) || is_always_true(right) =>
-            {
-                Arc::new(phys_expr::Literal::new(ScalarValue::Boolean(Some(true))))
-            }
-            (left, Operator::Or, _) if is_always_false(left) => right_expr,
-            (_, Operator::Or, right) if is_always_false(right) => left_expr,
-
-            _ => Arc::new(phys_expr::BinaryExpr::new(left_expr, op, right_expr)),
-        };
-        return expr;
-    }
-
-    let expr_builder =
-        PruningExpressionBuilder::try_new(&left, &right, op, schema, required_columns);
-    let mut expr_builder = match expr_builder {
-        Ok(builder) => builder,
-        // allow partial failure in predicate expression generation
-        // this can still produce a useful predicate when multiple conditions are joined using AND
-        Err(e) => {
-            debug!("Error building pruning expression: {e}");
-            return unhandled_hook.handle(expr);
-        }
-    };
-
-    build_statistics_expr(&mut expr_builder)
-        .unwrap_or_else(|_| unhandled_hook.handle(expr))
-}
-
-fn build_statistics_expr(
-    expr_builder: &mut PruningExpressionBuilder,
-) -> Result<Arc<dyn PhysicalExpr>> {
-    let statistics_expr: Arc<dyn PhysicalExpr> = match expr_builder.op() {
-        Operator::NotEq => {
-            // column != literal can only be false for every row when
-            // min == literal == max, so keep the container iff
-            // min != literal || literal != max
-            let min_column_expr = expr_builder.min_column_expr()?;
-            let max_column_expr = expr_builder.max_column_expr()?;
-            Arc::new(phys_expr::BinaryExpr::new(
-                Arc::new(phys_expr::BinaryExpr::new(
-                    min_column_expr,
-                    Operator::NotEq,
-                    Arc::clone(expr_builder.scalar_expr()),
-                )),
-                Operator::Or,
-                Arc::new(phys_expr::BinaryExpr::new(
-                    Arc::clone(expr_builder.scalar_expr()),
-                    Operator::NotEq,
-                    max_column_expr,
-                )),
-            ))
-        }
-        Operator::Eq => {
-            // column = literal => (min, max) = literal => min <= literal && literal <= max
-            // (column / 2) = 4 => (column_min / 2) <= 4 && 4 <= (column_max / 2)
-            let min_column_expr = expr_builder.min_column_expr()?;
-            let max_column_expr = expr_builder.max_column_expr()?;
-            Arc::new(phys_expr::BinaryExpr::new(
-                Arc::new(phys_expr::BinaryExpr::new(
-                    min_column_expr,
-                    Operator::LtEq,
-                    Arc::clone(expr_builder.scalar_expr()),
-                )),
-                Operator::And,
-                Arc::new(phys_expr::BinaryExpr::new(
-                    Arc::clone(expr_builder.scalar_expr()),
-                    Operator::LtEq,
-                    max_column_expr,
-                )),
-            ))
-        }
-        Operator::NotLikeMatch => build_not_like_match(expr_builder)?,
-        Operator::LikeMatch => build_like_match(expr_builder).ok_or_else(|| {
-            plan_datafusion_err!(
-                "LIKE expression with wildcard at the beginning is not supported"
-            )
-        })?,
-        Operator::Gt => {
-            // column > literal => (min, max) > literal => max > literal
-            Arc::new(phys_expr::BinaryExpr::new(
-                expr_builder.max_column_expr()?,
-                Operator::Gt,
-                Arc::clone(expr_builder.scalar_expr()),
-            ))
-        }
-        Operator::GtEq => {
-            // column >= literal => (min, max) >= literal => max >= literal
-            Arc::new(phys_expr::BinaryExpr::new(
-                expr_builder.max_column_expr()?,
-                Operator::GtEq,
-                Arc::clone(expr_builder.scalar_expr()),
-            ))
-        }
-        Operator::Lt => {
-            // column < literal => (min, max) < literal => min < literal
-            Arc::new(phys_expr::BinaryExpr::new(
-                expr_builder.min_column_expr()?,
-                Operator::Lt,
-                Arc::clone(expr_builder.scalar_expr()),
-            ))
-        }
-        Operator::LtEq => {
-            // column <= literal => (min, max) <= literal => min <= literal
-            Arc::new(phys_expr::BinaryExpr::new(
-                expr_builder.min_column_expr()?,
-                Operator::LtEq,
-                Arc::clone(expr_builder.scalar_expr()),
-            ))
-        }
-        // other expressions are not supported
-        _ => {
-            return plan_err!(
-                "expressions other than (neq, eq, gt, gteq, lt, lteq) are not supported"
-            );
-        }
-    };
-    let statistics_expr = wrap_null_count_check_expr(statistics_expr, expr_builder)?;
-    Ok(statistics_expr)
-}
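The `NotEq` arm's logic is worth spelling out: a container can be ruled out for `x != 5` only when its statistics prove every row equals 5. A tiny check of that condition:

```rust
fn main() {
    // Keep the container iff min != 5 || 5 != max; prune otherwise.
    let prunable = |min: i32, max: i32| !(min != 5 || 5 != max);

    assert!(prunable(5, 5)); // every row is exactly 5 -> x != 5 never true
    assert!(!prunable(1, 5)); // some row may differ    -> keep
    assert!(!prunable(5, 9)); // some row may differ    -> keep
}
```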
-
-/// returns the string literal of the scalar value if it is a string
-fn unpack_string(s: &ScalarValue) -> Option<&str> {
-    s.try_as_str().flatten()
-}
-
-fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&str> {
-    if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
-        let s = unpack_string(lit.value())?;
-        return Some(s);
-    }
-    None
-}
-
-/// Convert `column LIKE literal` where P is a constant prefix of the literal
-/// to a range check on the column: `P <= column && column < P'`, where P' is the
-/// lowest string after all P* strings.
-fn build_like_match(
-    expr_builder: &mut PruningExpressionBuilder,
-) -> Option<Arc<dyn PhysicalExpr>> {
-    // column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max
-    // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
-    // column LIKE '%foo' => min <= '' && '' <= max => true
-    // column LIKE '%foo%' => min <= '' && '' <= max => true
-    // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
-
-    // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
-    // this may involve building the physical expressions that call lower() and upper()
-    let min_column_expr = expr_builder.min_column_expr().ok()?;
-    let max_column_expr = expr_builder.max_column_expr().ok()?;
-    let scalar_expr = expr_builder.scalar_expr();
-    // check that the scalar is a string literal
-    let s = extract_string_literal(scalar_expr)?;
-    // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
-    let first_wildcard_index = s.find(['%', '_']);
-    if first_wildcard_index == Some(0) {
-        // there's no filtering we could possibly do, return None and have this
-        // be handled by the unhandled hook
-        return None;
-    }
-    let (lower_bound, upper_bound) = if let Some(wildcard_index) = first_wildcard_index {
-        let prefix = &s[..wildcard_index];
-        let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
-            prefix.to_string(),
-        ))));
-        let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
-            increment_utf8(prefix)?,
-        ))));
-        (lower_bound_lit, upper_bound_lit)
-    } else {
-        // the like expression is a literal and can be converted into a comparison
-        let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
-            s.to_string(),
-        ))));
-        (Arc::clone(&bound), bound)
-    };
-    let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
-        lower_bound,
-        Operator::LtEq,
-        Arc::clone(&max_column_expr),
-    ));
-    let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
-        Arc::clone(&min_column_expr),
-        Operator::LtEq,
-        upper_bound,
-    ));
-    let combined = Arc::new(phys_expr::BinaryExpr::new(
-        upper_bound_expr,
-        Operator::And,
-        lower_bound_expr,
-    ));
-    Some(combined)
-}
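The bounds produced by `build_like_match` can be sanity-checked with plain string comparisons. In this sketch the container statistics are hypothetical, and "fop" stands in for `increment_utf8("foo")`:

```rust
fn main() {
    // For `col LIKE 'foo%'` the rewrite keeps a container when its
    // [min, max] range can overlap strings starting with "foo":
    // min <= "fop" AND "foo" <= max
    let (prefix, upper) = ("foo", "fop");

    let (min, max) = ("food", "forest"); // may contain matches -> keep
    assert!(min <= upper && prefix <= max);

    let (min, max) = ("bar", "baz"); // every value sorts before "foo" -> prune
    assert!(!(min <= upper && prefix <= max));
}
```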
-
-// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`.
-//
-// The intuition is that if both `col_min` and `col_max` begin with `const_prefix` that means
-// **all** data in this row group begins with `const_prefix` as well (and therefore the predicate
-// looking for rows that don't begin with `const_prefix` can never be true)
-fn build_not_like_match(
-    expr_builder: &mut PruningExpressionBuilder<'_>,
-) -> Result<Arc<dyn PhysicalExpr>> {
-    // col NOT LIKE 'const_prefix%' -> !(col_min LIKE 'const_prefix%' && col_max LIKE 'const_prefix%') -> (col_min NOT LIKE 'const_prefix%' || col_max NOT LIKE 'const_prefix%')
-
-    let min_column_expr = expr_builder.min_column_expr()?;
-    let max_column_expr = expr_builder.max_column_expr()?;
-
-    let scalar_expr = expr_builder.scalar_expr();
-
-    let pattern = extract_string_literal(scalar_expr).ok_or_else(|| {
-        plan_datafusion_err!("cannot extract literal from NOT LIKE expression")
-    })?;
-
-    let (const_prefix, remaining) = split_constant_prefix(pattern);
-    if const_prefix.is_empty() || remaining != "%" {
-        // we can not handle `%` at the beginning or in the middle of the pattern
-        // Example: For pattern "foo%bar", the row group might include values like
-        // ["foobar", "food", "foodbar"], making it unsafe to prune.
-        // Even if the min/max values in the group (e.g., "foobar" and "foodbar")
-        // match the pattern, intermediate values like "food" may not
-        // match the full pattern "foo%bar", making pruning unsafe.
-        // (truncating foo%bar to foo% has the same problem)
-
-        // we can not handle patterns containing `_`
-        // Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
-        // which means not every row is guaranteed to match the pattern.
-        return Err(plan_datafusion_err!(
-            "NOT LIKE expressions only support constant_prefix+wildcard`%`"
-        ));
-    }
-
-    let min_col_not_like_expr = Arc::new(phys_expr::LikeExpr::new(
-        true,
-        false,
-        Arc::clone(&min_column_expr),
-        Arc::clone(scalar_expr),
-    ));
-
-    let max_col_not_like_expr = Arc::new(phys_expr::LikeExpr::new(
-        true,
-        false,
-        Arc::clone(&max_column_expr),
-        Arc::clone(scalar_expr),
-    ));
-
-    Ok(Arc::new(phys_expr::BinaryExpr::new(
-        min_col_not_like_expr,
-        Operator::Or,
-        max_col_not_like_expr,
-    )))
-}
-
-/// Returns unescaped constant prefix of a LIKE pattern (possibly empty) and the remaining pattern (possibly empty)
-fn split_constant_prefix(pattern: &str) -> (&str, &str) {
-    let char_indices = pattern.char_indices().collect::<Vec<_>>();
-    for i in 0..char_indices.len() {
-        let (idx, char) = char_indices[i];
-        if char == '%' || char == '_' {
-            if i != 0 && char_indices[i - 1].1 == '\\' {
-                // escaped by `\`
-                continue;
-            }
-            return (&pattern[..idx], &pattern[idx..]);
-        }
-    }
-    (pattern, "")
-}
-
-/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
-/// This makes it so that the returned string will always compare greater than the input string
-/// or any other string with the same prefix.
-/// This is necessary since the statistics may have been truncated: if we have a min statistic
-/// of "fo" that may have originally been "foz" or anything else with the prefix "fo".
-/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
-/// This makes it so that the returned string will always compare greater than the input string,
-/// or any other string with the same prefix.
-/// This is necessary since the statistics may have been truncated: if we have a min statistic
-/// of "fo", it may have originally been "foz" or anything else with the prefix "fo".
-/// E.g. `increment_utf8("foo") > "foo"` and `increment_utf8("foo") > "fooz"`.
-/// In this example `increment_utf8("foo") == "fop"`.
-fn increment_utf8(data: &str) -> Option<String> {
-    // Helper function to check if a character is valid to use
-    fn is_valid_unicode(c: char) -> bool {
-        let cp = c as u32;
-
-        // Filter out non-characters (https://www.unicode.org/versions/corrigendum9.html)
-        if [0xFFFE, 0xFFFF].contains(&cp) || (0xFDD0..=0xFDEF).contains(&cp) {
-            return false;
-        }
-
-        // Filter out code points beyond the valid Unicode range
-        // (defensive: `char::from_u32` already rejects these)
-        if cp >= 0x110000 {
-            return false;
-        }
-
-        true
-    }
-
-    // Convert string to vector of code points
-    let mut code_points: Vec<char> = data.chars().collect();
-
-    // Work backwards through code points
-    for idx in (0..code_points.len()).rev() {
-        let original = code_points[idx] as u32;
-
-        // Try incrementing the code point
-        if let Some(next_char) = char::from_u32(original + 1) {
-            if is_valid_unicode(next_char) {
-                code_points[idx] = next_char;
-                // truncate the string to the current index
-                code_points.truncate(idx + 1);
-                return Some(code_points.into_iter().collect());
-            }
-        }
-    }
-
-    None
-}
-
-/// Wrap the statistics expression in a check that skips the expression if the column is all nulls.
-///
-/// This is important not only as an optimization but also because statistics may not be
-/// accurate for columns that are all nulls.
-/// For example, for an `int` column `x` with all nulls, the min/max/null_count statistics
-/// might be set to 0 and evaluating `x = 0` would incorrectly include the column.
-///
-/// For example:
-///
-/// `x_min <= 10 AND 10 <= x_max`
-///
-/// will become
-///
-/// ```sql
-/// x_null_count != x_row_count AND (x_min <= 10 AND 10 <= x_max)
-/// ```
-///
-/// If the column is known to be all nulls, then `x_null_count = x_row_count`
-/// holds, so the check `x_null_count != x_row_count` is false and the overall
-/// expression evaluates to false: the container is pruned.
-fn wrap_null_count_check_expr(
-    statistics_expr: Arc<dyn PhysicalExpr>,
-    expr_builder: &mut PruningExpressionBuilder,
-) -> Result<Arc<dyn PhysicalExpr>> {
-    // x_null_count != x_row_count
-    let not_when_null_count_eq_row_count = Arc::new(phys_expr::BinaryExpr::new(
-        expr_builder.null_count_column_expr()?,
-        Operator::NotEq,
-        expr_builder.row_count_column_expr()?,
-    ));
-
-    // (x_null_count != x_row_count) AND (<statistics_expr>)
-    Ok(Arc::new(phys_expr::BinaryExpr::new(
-        not_when_null_count_eq_row_count,
-        Operator::And,
-        statistics_expr,
-    )))
-}
-
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
-pub(crate) enum StatisticsType {
-    Min,
-    Max,
-    NullCount,
-    RowCount,
-}
-
-#[cfg(test)]
-mod tests {
-    use std::collections::HashMap;
-    use std::ops::{Not, Rem};
-
-    use super::*;
-    use datafusion_common::test_util::batches_to_string;
-    use datafusion_expr::{and, col, lit, or};
-    use insta::assert_snapshot;
-
-    use arrow::array::Decimal128Array;
-    use arrow::{
-        array::{BinaryArray, Int32Array, Int64Array, StringArray, UInt64Array},
-        datatypes::TimeUnit,
-    };
-    use datafusion_expr::expr::InList;
-    use datafusion_expr::{cast, is_null, try_cast, Expr};
-    use datafusion_functions_nested::expr_fn::{array_has, make_array};
-    use datafusion_physical_expr::expressions as phys_expr;
-    use datafusion_physical_expr::planner::logical2physical;
-
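    // NOTE (illustrative test, not part of the original file): the tri-state
    // logic that `wrap_null_count_check_expr` relies on. Under SQL (Kleene)
    // semantics `false AND NULL` is `false`, so once the null-count guard is
    // false the container is pruned no matter what the min/max comparison on
    // all-null statistics yields.
    #[test]
    fn null_count_guard_kleene_intuition() {
        use arrow::compute::and_kleene;

        // the guard `x_null_count != row_count` is false for an all-null container
        let guard = BooleanArray::from(vec![Some(false)]);
        // the min/max comparison on all-null statistics evaluates to NULL
        let stats_cmp = BooleanArray::from(vec![None]);

        let combined = and_kleene(&guard, &stats_cmp).unwrap();
        assert_eq!(combined, BooleanArray::from(vec![Some(false)]));
    }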
-    #[derive(Debug, Default)]
-    /// Mock statistic provider for tests
-    ///
-    /// Each row represents the statistics for a "container" (which
-    /// might represent an entire parquet file, or directory of files,
-    /// or some other collection of data for which we had statistics)
-    ///
-    /// Note: all `ArrayRef`s must be the same size.
-    struct ContainerStats {
-        min: Option<ArrayRef>,
-        max: Option<ArrayRef>,
-        /// Optional values
-        null_counts: Option<ArrayRef>,
-        row_counts: Option<ArrayRef>,
-        /// Optional known values (e.g. mimic a bloom filter)
-        /// (value, contained)
-        /// If present, all BooleanArrays must be the same size as min/max
-        contained: Vec<(HashSet<ScalarValue>, BooleanArray)>,
-    }
-
-    impl ContainerStats {
-        fn new() -> Self {
-            Default::default()
-        }
-
-        fn new_decimal128(
-            min: impl IntoIterator<Item = Option<i128>>,
-            max: impl IntoIterator<Item = Option<i128>>,
-            precision: u8,
-            scale: i8,
-        ) -> Self {
-            Self::new()
-                .with_min(Arc::new(
-                    min.into_iter()
-                        .collect::<Decimal128Array>()
-                        .with_precision_and_scale(precision, scale)
-                        .unwrap(),
-                ))
-                .with_max(Arc::new(
-                    max.into_iter()
-                        .collect::<Decimal128Array>()
-                        .with_precision_and_scale(precision, scale)
-                        .unwrap(),
-                ))
-        }
-
-        fn new_i64(
-            min: impl IntoIterator<Item = Option<i64>>,
-            max: impl IntoIterator<Item = Option<i64>>,
-        ) -> Self {
-            Self::new()
-                .with_min(Arc::new(min.into_iter().collect::<Int64Array>()))
-                .with_max(Arc::new(max.into_iter().collect::<Int64Array>()))
-        }
-
-        fn new_i32(
-            min: impl IntoIterator<Item = Option<i32>>,
-            max: impl IntoIterator<Item = Option<i32>>,
-        ) -> Self {
-            Self::new()
-                .with_min(Arc::new(min.into_iter().collect::<Int32Array>()))
-                .with_max(Arc::new(max.into_iter().collect::<Int32Array>()))
-        }
-
-        fn new_utf8<'a>(
-            min: impl IntoIterator<Item = Option<&'a str>>,
-            max: impl IntoIterator<Item = Option<&'a str>>,
-        ) -> Self {
-            Self::new()
-                .with_min(Arc::new(min.into_iter().collect::<StringArray>()))
-                .with_max(Arc::new(max.into_iter().collect::<StringArray>()))
-        }
-
-        fn new_bool(
-            min: impl IntoIterator<Item = Option<bool>>,
-            max: impl IntoIterator<Item = Option<bool>>,
-        ) -> Self {
-            Self::new()
-                .with_min(Arc::new(min.into_iter().collect::<BooleanArray>()))
-                .with_max(Arc::new(max.into_iter().collect::<BooleanArray>()))
-        }
-
-        fn min(&self) -> Option<ArrayRef> {
-            self.min.clone()
-        }
-
-        fn max(&self) -> Option<ArrayRef> {
-            self.max.clone()
-        }
-
-        fn null_counts(&self) -> Option<ArrayRef> {
-            self.null_counts.clone()
-        }
-
-        fn row_counts(&self) -> Option<ArrayRef> {
-            self.row_counts.clone()
-        }
-
-        /// return all arrays in these statistics
-        fn arrays(&self) -> Vec<ArrayRef> {
-            let contained_arrays = self
-                .contained
-                .iter()
-                .map(|(_values, contained)| Arc::new(contained.clone()) as ArrayRef);
-
-            [
-                self.min.as_ref().cloned(),
-                self.max.as_ref().cloned(),
-                self.null_counts.as_ref().cloned(),
-                self.row_counts.as_ref().cloned(),
-            ]
-            .into_iter()
-            .flatten()
-            .chain(contained_arrays)
-            .collect()
-        }
-
-        /// Returns the number of containers represented by these statistics. This
-        /// picks the length of the first array as all arrays must have the same
-        /// length (which is verified by `assert_invariants`).
-        fn len(&self) -> usize {
-            // pick the length of the first array, if any
-            self.arrays().iter().map(|a| a.len()).next().unwrap_or(0)
-        }
-
-        /// Ensure that the lengths of all arrays are consistent
-        fn assert_invariants(&self) {
-            let mut prev_len = None;
-
-            for len in self.arrays().iter().map(|a| a.len()) {
-                // Get a length, if we don't already have one
-                match prev_len {
-                    None => {
-                        prev_len = Some(len);
-                    }
-                    Some(prev_len) => {
-                        assert_eq!(prev_len, len);
-                    }
-                }
-            }
-        }
-
-        /// Add min values
-        fn with_min(mut self, min: ArrayRef) -> Self {
-            self.min = Some(min);
-            self
-        }
-
-        /// Add max values
-        fn with_max(mut self, max: ArrayRef) -> Self {
-            self.max = Some(max);
-            self
-        }
-
-        /// Add null counts. There must be the same number of null counts as
-        /// there are containers
-        fn with_null_counts(
-            mut self,
-            counts: impl IntoIterator<Item = Option<u64>>,
-        ) -> Self {
-            let null_counts: ArrayRef =
-                Arc::new(counts.into_iter().collect::<UInt64Array>());
-
-            self.assert_invariants();
-            self.null_counts = Some(null_counts);
-            self
-        }
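        // NOTE (illustrative sketch, not part of the original file): typical
        // use of this builder in the tests below, one entry per container,
        // with null counts layered on via `with_null_counts`:
        //
        // ```rust
        // let stats = ContainerStats::new_i32(
        //     vec![Some(1), None, Some(5)], // per-container min
        //     vec![Some(9), None, Some(5)], // per-container max
        // )
        // .with_null_counts(vec![Some(0), Some(3), Some(0)]);
        // ```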
-
-        /// Add row counts. There must be the same number of row counts as
-        /// there are containers
-        fn with_row_counts(
-            mut self,
-            counts: impl IntoIterator<Item = Option<u64>>,
-        ) -> Self {
-            let row_counts: ArrayRef =
-                Arc::new(counts.into_iter().collect::<UInt64Array>());
-
-            self.assert_invariants();
-            self.row_counts = Some(row_counts);
-            self
-        }
-
-        /// Add contained information.
-        pub fn with_contained(
-            mut self,
-            values: impl IntoIterator<Item = ScalarValue>,
-            contained: impl IntoIterator<Item = Option<bool>>,
-        ) -> Self {
-            let contained: BooleanArray = contained.into_iter().collect();
-            let values: HashSet<_> = values.into_iter().collect();
-
-            self.contained.push((values, contained));
-            self.assert_invariants();
-            self
-        }
-
-        /// get any contained information for the specified values
-        fn contained(&self, find_values: &HashSet<ScalarValue>) -> Option<BooleanArray> {
-            // find the one with the matching values
-            self.contained
-                .iter()
-                .find(|(values, _contained)| values == find_values)
-                .map(|(_values, contained)| contained.clone())
-        }
-    }
-
-    #[derive(Debug, Default)]
-    struct TestStatistics {
-        // key: column name
-        stats: HashMap<Column, ContainerStats>,
-    }
-
-    impl TestStatistics {
-        fn new() -> Self {
-            Self::default()
-        }
-
-        fn with(
-            mut self,
-            name: impl Into<String>,
-            container_stats: ContainerStats,
-        ) -> Self {
-            let col = Column::from_name(name.into());
-            self.stats.insert(col, container_stats);
-            self
-        }
-
-        /// Add null counts for the specified column.
-        /// There must be the same number of null counts as
-        /// there are containers
-        fn with_null_counts(
-            mut self,
-            name: impl Into<String>,
-            counts: impl IntoIterator<Item = Option<u64>>,
-        ) -> Self {
-            let col = Column::from_name(name.into());
-
-            // take stats out and update them
-            let container_stats = self
-                .stats
-                .remove(&col)
-                .unwrap_or_default()
-                .with_null_counts(counts);
-
-            // put stats back in
-            self.stats.insert(col, container_stats);
-            self
-        }
-
-        /// Add row counts for the specified column.
-        /// There must be the same number of row counts as
-        /// there are containers
-        fn with_row_counts(
-            mut self,
-            name: impl Into<String>,
-            counts: impl IntoIterator<Item = Option<u64>>,
-        ) -> Self {
-            let col = Column::from_name(name.into());
-
-            // take stats out and update them
-            let container_stats = self
-                .stats
-                .remove(&col)
-                .unwrap_or_default()
-                .with_row_counts(counts);
-
-            // put stats back in
-            self.stats.insert(col, container_stats);
-            self
-        }
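        // NOTE (illustrative sketch, not part of the original file): how the
        // `contained` plumbing below is exercised. It mimics an exact-match
        // structure such as a bloom filter, recording one answer per container
        // for a given set of probe values:
        //
        // ```rust
        // let stats = TestStatistics::new().with_contained(
        //     "s1",
        //     [ScalarValue::from("foo")],          // probe value(s)
        //     vec![Some(true), Some(false), None], // present / absent / unknown
        // );
        // ```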
-
-        /// Add contained information for the specified column.
-        fn with_contained(
-            mut self,
-            name: impl Into<String>,
-            values: impl IntoIterator<Item = ScalarValue>,
-            contained: impl IntoIterator<Item = Option<bool>>,
-        ) -> Self {
-            let col = Column::from_name(name.into());
-
-            // take stats out and update them
-            let container_stats = self
-                .stats
-                .remove(&col)
-                .unwrap_or_default()
-                .with_contained(values, contained);
-
-            // put stats back in
-            self.stats.insert(col, container_stats);
-            self
-        }
-    }
-
-    impl PruningStatistics for TestStatistics {
-        fn min_values(&self, column: &Column) -> Option<ArrayRef> {
-            self.stats
-                .get(column)
-                .map(|container_stats| container_stats.min())
-                .unwrap_or(None)
-        }
-
-        fn max_values(&self, column: &Column) -> Option<ArrayRef> {
-            self.stats
-                .get(column)
-                .map(|container_stats| container_stats.max())
-                .unwrap_or(None)
-        }
-
-        fn num_containers(&self) -> usize {
-            self.stats
-                .values()
-                .next()
-                .map(|container_stats| container_stats.len())
-                .unwrap_or(0)
-        }
-
-        fn null_counts(&self, column: &Column) -> Option<ArrayRef> {
-            self.stats
-                .get(column)
-                .map(|container_stats| container_stats.null_counts())
-                .unwrap_or(None)
-        }
-
-        fn row_counts(&self, column: &Column) -> Option<ArrayRef> {
-            self.stats
-                .get(column)
-                .map(|container_stats| container_stats.row_counts())
-                .unwrap_or(None)
-        }
-
-        fn contained(
-            &self,
-            column: &Column,
-            values: &HashSet<ScalarValue>,
-        ) -> Option<BooleanArray> {
-            self.stats
-                .get(column)
-                .and_then(|container_stats| container_stats.contained(values))
-        }
-    }
-
-    /// Returns the specified min/max container values
-    struct OneContainerStats {
-        min_values: Option<ArrayRef>,
-        max_values: Option<ArrayRef>,
-        num_containers: usize,
-    }
-
-    impl PruningStatistics for OneContainerStats {
-        fn min_values(&self, _column: &Column) -> Option<ArrayRef> {
-            self.min_values.clone()
-        }
-
-        fn max_values(&self, _column: &Column) -> Option<ArrayRef> {
-            self.max_values.clone()
-        }
-
-        fn num_containers(&self) -> usize {
-            self.num_containers
-        }
-
-        fn null_counts(&self, _column: &Column) -> Option<ArrayRef> {
-            None
-        }
-
-        fn row_counts(&self, _column: &Column) -> Option<ArrayRef> {
-            None
-        }
-
-        fn contained(
-            &self,
-            _column: &Column,
-            _values: &HashSet<ScalarValue>,
-        ) -> Option<BooleanArray> {
-            None
-        }
-    }
-
-    /// Row count should only be referenced once in the pruning expression, even if we need the row count
-    /// for multiple columns.
-    #[test]
-    fn test_unique_row_count_field_and_column() {
-        // c1 = 100 AND c2 = 200
-        let schema: SchemaRef = Arc::new(Schema::new(vec![
-            Field::new("c1", DataType::Int32, true),
-            Field::new("c2", DataType::Int32, true),
-        ]));
-        let expr = col("c1").eq(lit(100)).and(col("c2").eq(lit(200)));
-        let expr = logical2physical(&expr, &schema);
-        let p = PruningPredicate::try_new(expr, Arc::clone(&schema)).unwrap();
-        // note the pruning expression refers to row_count@3 twice, but the
-        // underlying row_count field is created only once
-        assert_eq!(
-            "c1_null_count@2 != row_count@3 AND c1_min@0 <= 100 AND 100 <= c1_max@1 AND c2_null_count@6 != row_count@3 AND c2_min@4 <= 200 AND 200 <= c2_max@5",
-            p.predicate_expr.to_string()
-        );
-
-        // Fields in required schema should be unique, otherwise when creating batches
-        // it will fail because of duplicate field names
-        let mut fields = HashSet::new();
-        for (_col, _ty, field) in p.required_columns().iter() {
-            let was_new = fields.insert(field);
-            if !was_new {
-                panic!(
-                    "Duplicate field in required schema: {field:?}. 
Previous fields:\n{fields:#?}" - ); - } - } - } - - #[test] - fn prune_all_rows_null_counts() { - // if null_count = row_count then we should prune the container for i = 0 - // regardless of the statistics - let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); - let statistics = TestStatistics::new().with( - "i", - ContainerStats::new_i32( - vec![Some(0)], // min - vec![Some(0)], // max - ) - .with_null_counts(vec![Some(1)]) - .with_row_counts(vec![Some(1)]), - ); - let expected_ret = &[false]; - prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); - - // this should be true even if the container stats are missing - let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); - let container_stats = ContainerStats { - min: Some(Arc::new(Int32Array::from(vec![None]))), - max: Some(Arc::new(Int32Array::from(vec![None]))), - null_counts: Some(Arc::new(UInt64Array::from(vec![Some(1)]))), - row_counts: Some(Arc::new(UInt64Array::from(vec![Some(1)]))), - ..ContainerStats::default() - }; - let statistics = TestStatistics::new().with("i", container_stats); - let expected_ret = &[false]; - prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); - - // If the null counts themselves are missing we should be able to fall back to the stats - let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); - let container_stats = ContainerStats { - min: Some(Arc::new(Int32Array::from(vec![Some(0)]))), - max: Some(Arc::new(Int32Array::from(vec![Some(0)]))), - null_counts: Some(Arc::new(UInt64Array::from(vec![None]))), - row_counts: Some(Arc::new(UInt64Array::from(vec![Some(1)]))), - ..ContainerStats::default() - }; - let statistics = TestStatistics::new().with("i", container_stats); - let expected_ret = &[true]; - prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); - let expected_ret = &[false]; - prune_with_expr(col("i").gt(lit(0)), &schema, &statistics, expected_ret); - - // Same for the row counts - let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); - let container_stats = ContainerStats { - min: Some(Arc::new(Int32Array::from(vec![Some(0)]))), - max: Some(Arc::new(Int32Array::from(vec![Some(0)]))), - null_counts: Some(Arc::new(UInt64Array::from(vec![Some(1)]))), - row_counts: Some(Arc::new(UInt64Array::from(vec![None]))), - ..ContainerStats::default() - }; - let statistics = TestStatistics::new().with("i", container_stats); - let expected_ret = &[true]; - prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); - let expected_ret = &[false]; - prune_with_expr(col("i").gt(lit(0)), &schema, &statistics, expected_ret); - } - - #[test] - fn prune_missing_statistics() { - // If the min or max stats are missing we should not prune - // (unless we know all rows are null, see `prune_all_rows_null_counts`) - let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); - let container_stats = ContainerStats { - min: Some(Arc::new(Int32Array::from(vec![None, Some(0)]))), - max: Some(Arc::new(Int32Array::from(vec![Some(0), None]))), - null_counts: Some(Arc::new(UInt64Array::from(vec![Some(0), Some(0)]))), - row_counts: Some(Arc::new(UInt64Array::from(vec![Some(1), Some(1)]))), - ..ContainerStats::default() - }; - let statistics = TestStatistics::new().with("i", container_stats); - let expected_ret = &[true, true]; - prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); - let expected_ret = &[false, 
true]; - prune_with_expr(col("i").gt(lit(0)), &schema, &statistics, expected_ret); - let expected_ret = &[true, false]; - prune_with_expr(col("i").lt(lit(0)), &schema, &statistics, expected_ret); - } - - #[test] - fn prune_null_stats() { - // if null_count = row_count then we should prune the container for i = 0 - // regardless of the statistics - let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); - - let statistics = TestStatistics::new().with( - "i", - ContainerStats::new_i32( - vec![Some(0)], // min - vec![Some(0)], // max - ) - .with_null_counts(vec![Some(1)]) - .with_row_counts(vec![Some(1)]), - ); - - let expected_ret = &[false]; - - // i = 0 - prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); - } - - #[test] - fn test_build_statistics_record_batch() { - // Request a record batch with of s1_min, s2_max, s3_max, s3_min - let required_columns = RequiredColumns::from(vec![ - // min of original column s1, named s1_min - ( - phys_expr::Column::new("s1", 1), - StatisticsType::Min, - Field::new("s1_min", DataType::Int32, true), - ), - // max of original column s2, named s2_max - ( - phys_expr::Column::new("s2", 2), - StatisticsType::Max, - Field::new("s2_max", DataType::Int32, true), - ), - // max of original column s3, named s3_max - ( - phys_expr::Column::new("s3", 3), - StatisticsType::Max, - Field::new("s3_max", DataType::Utf8, true), - ), - // min of original column s3, named s3_min - ( - phys_expr::Column::new("s3", 3), - StatisticsType::Min, - Field::new("s3_min", DataType::Utf8, true), - ), - ]); - - let statistics = TestStatistics::new() - .with( - "s1", - ContainerStats::new_i32( - vec![None, None, Some(9), None], // min - vec![Some(10), None, None, None], // max - ), - ) - .with( - "s2", - ContainerStats::new_i32( - vec![Some(2), None, None, None], // min - vec![Some(20), None, None, None], // max - ), - ) - .with( - "s3", - ContainerStats::new_utf8( - vec![Some("a"), None, None, None], // min - vec![Some("q"), None, Some("r"), None], // max - ), - ); - - let batch = - build_statistics_record_batch(&statistics, &required_columns).unwrap(); - assert_snapshot!(batches_to_string(&[batch]), @r" - +--------+--------+--------+--------+ - | s1_min | s2_max | s3_max | s3_min | - +--------+--------+--------+--------+ - | | 20 | q | a | - | | | | | - | 9 | | r | | - | | | | | - +--------+--------+--------+--------+ - "); - } - - #[test] - fn test_build_statistics_casting() { - // Test requesting a Timestamp column, but getting statistics as Int64 - // which is what Parquet does - - // Request a record batch with of s1_min as a timestamp - let required_columns = RequiredColumns::from(vec![( - phys_expr::Column::new("s3", 3), - StatisticsType::Min, - Field::new( - "s1_min", - DataType::Timestamp(TimeUnit::Nanosecond, None), - true, - ), - )]); - - // Note the statistics pass back i64 (not timestamp) - let statistics = OneContainerStats { - min_values: Some(Arc::new(Int64Array::from(vec![Some(10)]))), - max_values: Some(Arc::new(Int64Array::from(vec![Some(20)]))), - num_containers: 1, - }; - - let batch = - build_statistics_record_batch(&statistics, &required_columns).unwrap(); - - assert_snapshot!(batches_to_string(&[batch]), @r" - +-------------------------------+ - | s1_min | - +-------------------------------+ - | 1970-01-01T00:00:00.000000010 | - +-------------------------------+ - "); - } - - #[test] - fn test_build_statistics_no_required_stats() { - let required_columns = RequiredColumns::new(); - - let statistics = 
OneContainerStats { - min_values: Some(Arc::new(Int64Array::from(vec![Some(10)]))), - max_values: Some(Arc::new(Int64Array::from(vec![Some(20)]))), - num_containers: 1, - }; - - let batch = - build_statistics_record_batch(&statistics, &required_columns).unwrap(); - assert_eq!(batch.num_rows(), 1); // had 1 container - } - - #[test] - fn test_build_statistics_inconsistent_types() { - // Test requesting a Utf8 column when the stats return some other type - - // Request a record batch with of s1_min as a timestamp - let required_columns = RequiredColumns::from(vec![( - phys_expr::Column::new("s3", 3), - StatisticsType::Min, - Field::new("s1_min", DataType::Utf8, true), - )]); - - // Note the statistics return an invalid UTF-8 sequence which will be converted to null - let statistics = OneContainerStats { - min_values: Some(Arc::new(BinaryArray::from(vec![&[255u8] as &[u8]]))), - max_values: None, - num_containers: 1, - }; - - let batch = - build_statistics_record_batch(&statistics, &required_columns).unwrap(); - assert_snapshot!(batches_to_string(&[batch]), @r" - +--------+ - | s1_min | - +--------+ - | | - +--------+ - "); - } - - #[test] - fn test_build_statistics_inconsistent_length() { - // return an inconsistent length to the actual statistics arrays - let required_columns = RequiredColumns::from(vec![( - phys_expr::Column::new("s1", 3), - StatisticsType::Min, - Field::new("s1_min", DataType::Int64, true), - )]); - - // Note the statistics pass back i64 (not timestamp) - let statistics = OneContainerStats { - min_values: Some(Arc::new(Int64Array::from(vec![Some(10)]))), - max_values: Some(Arc::new(Int64Array::from(vec![Some(20)]))), - num_containers: 3, - }; - - let result = - build_statistics_record_batch(&statistics, &required_columns).unwrap_err(); - assert!( - result - .to_string() - .contains("mismatched statistics length. 
Expected 3, got 1"), - "{}", - result - ); - } - - #[test] - fn row_group_predicate_eq() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = - "c1_null_count@2 != row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1"; - - // test column on the left - let expr = col("c1").eq(lit(1)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = lit(1).eq(col("c1")); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_not_eq() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = - "c1_null_count@2 != row_count@3 AND (c1_min@0 != 1 OR 1 != c1_max@1)"; - - // test column on the left - let expr = col("c1").not_eq(lit(1)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = lit(1).not_eq(col("c1")); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_gt() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "c1_null_count@1 != row_count@2 AND c1_max@0 > 1"; - - // test column on the left - let expr = col("c1").gt(lit(1)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = lit(1).lt(col("c1")); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_gt_eq() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "c1_null_count@1 != row_count@2 AND c1_max@0 >= 1"; - - // test column on the left - let expr = col("c1").gt_eq(lit(1)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - // test column on the right - let expr = lit(1).lt_eq(col("c1")); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_lt() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "c1_null_count@1 != row_count@2 AND c1_min@0 < 1"; - - // test column on the left - let expr = col("c1").lt(lit(1)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = lit(1).gt(col("c1")); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_lt_eq() -> Result<()> { - let 
schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "c1_null_count@1 != row_count@2 AND c1_min@0 <= 1"; - - // test column on the left - let expr = col("c1").lt_eq(lit(1)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - // test column on the right - let expr = lit(1).gt_eq(col("c1")); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_and() -> Result<()> { - let schema = Schema::new(vec![ - Field::new("c1", DataType::Int32, false), - Field::new("c2", DataType::Int32, false), - Field::new("c3", DataType::Int32, false), - ]); - // test AND operator joining supported c1 < 1 expression and unsupported c2 > c3 expression - let expr = col("c1").lt(lit(1)).and(col("c2").lt(col("c3"))); - let expected_expr = "c1_null_count@1 != row_count@2 AND c1_min@0 < 1"; - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_or() -> Result<()> { - let schema = Schema::new(vec![ - Field::new("c1", DataType::Int32, false), - Field::new("c2", DataType::Int32, false), - ]); - // test OR operator joining supported c1 < 1 expression and unsupported c2 % 2 = 0 expression - let expr = col("c1").lt(lit(1)).or(col("c2").rem(lit(2)).eq(lit(0))); - let expected_expr = "true"; - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_not() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "true"; - - let expr = col("c1").not(); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_not_bool() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Boolean, false)]); - let expected_expr = "NOT c1_min@0 AND c1_max@1"; - - let expr = col("c1").not(); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_bool() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Boolean, false)]); - let expected_expr = "c1_min@0 OR c1_max@1"; - - let expr = col("c1"); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_lt_bool() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Boolean, false)]); - let expected_expr = "c1_null_count@1 != row_count@2 AND c1_min@0 < true"; - - // DF doesn't support arithmetic on boolean columns so - // this predicate will error when evaluated - let expr = col("c1").lt(lit(true)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn 
row_group_predicate_required_columns() -> Result<()> {
-        let schema = Schema::new(vec![
-            Field::new("c1", DataType::Int32, false),
-            Field::new("c2", DataType::Int32, false),
-        ]);
-        let mut required_columns = RequiredColumns::new();
-        // c1 < 1 and (c2 = 2 or c2 = 3)
-        let expr = col("c1")
-            .lt(lit(1))
-            .and(col("c2").eq(lit(2)).or(col("c2").eq(lit(3))));
-        let expected_expr = "c1_null_count@1 != row_count@2 AND c1_min@0 < 1 AND (c2_null_count@5 != row_count@2 AND c2_min@3 <= 2 AND 2 <= c2_max@4 OR c2_null_count@5 != row_count@2 AND c2_min@3 <= 3 AND 3 <= c2_max@4)";
-        let predicate_expr =
-            test_build_predicate_expression(&expr, &schema, &mut required_columns);
-        assert_eq!(predicate_expr.to_string(), expected_expr);
-        println!("required_columns: {required_columns:#?}"); // for debugging assertions below
-        // c1 < 1 should add c1_min
-        let c1_min_field = Field::new("c1_min", DataType::Int32, false);
-        assert_eq!(
-            required_columns.columns[0],
-            (
-                phys_expr::Column::new("c1", 0),
-                StatisticsType::Min,
-                c1_min_field.with_nullable(true) // could be nullable if stats are not present
-            )
-        );
-        // c1 < 1 should add c1_null_count
-        let c1_null_count_field = Field::new("c1_null_count", DataType::UInt64, false);
-        assert_eq!(
-            required_columns.columns[1],
-            (
-                phys_expr::Column::new("c1", 0),
-                StatisticsType::NullCount,
-                c1_null_count_field.with_nullable(true) // could be nullable if stats are not present
-            )
-        );
-        // c1 < 1 should add row_count
-        let row_count_field = Field::new("row_count", DataType::UInt64, false);
-        assert_eq!(
-            required_columns.columns[2],
-            (
-                phys_expr::Column::new("c1", 0),
-                StatisticsType::RowCount,
-                row_count_field.with_nullable(true) // could be nullable if stats are not present
-            )
-        );
-        // c2 = 2 should add c2_min and c2_max
-        let c2_min_field = Field::new("c2_min", DataType::Int32, false);
-        assert_eq!(
-            required_columns.columns[3],
-            (
-                phys_expr::Column::new("c2", 1),
-                StatisticsType::Min,
-                c2_min_field.with_nullable(true) // could be nullable if stats are not present
-            )
-        );
-        let c2_max_field = Field::new("c2_max", DataType::Int32, false);
-        assert_eq!(
-            required_columns.columns[4],
-            (
-                phys_expr::Column::new("c2", 1),
-                StatisticsType::Max,
-                c2_max_field.with_nullable(true) // could be nullable if stats are not present
-            )
-        );
-        // c2 = 2 should add c2_null_count
-        let c2_null_count_field = Field::new("c2_null_count", DataType::UInt64, false);
-        assert_eq!(
-            required_columns.columns[5],
-            (
-                phys_expr::Column::new("c2", 1),
-                StatisticsType::NullCount,
-                c2_null_count_field.with_nullable(true) // could be nullable if stats are not present
-            )
-        );
-        // c2 = 2 should reuse the row_count entry added for c1 rather than
-        // adding a second one
-        let row_count_field = Field::new("row_count", DataType::UInt64, false);
-        assert_eq!(
-            required_columns.columns[2],
-            (
-                phys_expr::Column::new("c1", 0),
-                StatisticsType::RowCount,
-                row_count_field.with_nullable(true) // could be nullable if stats are not present
-            )
-        );
-        // c2 = 3 shouldn't add any new statistics fields
-        assert_eq!(required_columns.columns.len(), 6);
-
-        Ok(())
-    }
-
-    #[test]
-    fn row_group_predicate_in_list() -> Result<()> {
-        let schema = Schema::new(vec![
-            Field::new("c1", DataType::Int32, false),
-            Field::new("c2", DataType::Int32, false),
-        ]);
-        // test c1 in(1, 2, 3)
-        let expr = Expr::InList(InList::new(
-            Box::new(col("c1")),
-            vec![lit(1), lit(2), lit(3)],
-            false,
-        ));
-        let expected_expr = "c1_null_count@2 != row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1 OR c1_null_count@2 != row_count@3 AND c1_min@0 <= 2 AND 2 <= c1_max@1 OR c1_null_count@2 != row_count@3 AND c1_min@0 <= 3 AND 3 <= c1_max@1";
-        let predicate_expr =
-            test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
-        assert_eq!(predicate_expr.to_string(), expected_expr);
-
-        Ok(())
-    }
-
-    #[test]
-    fn row_group_predicate_in_list_empty() -> Result<()> {
-        let schema = Schema::new(vec![
-            Field::new("c1", DataType::Int32, false),
-            Field::new("c2", DataType::Int32, false),
-        ]);
-        // test c1 in()
-        let expr = Expr::InList(InList::new(Box::new(col("c1")), vec![], false));
-        let expected_expr = "true";
-        let predicate_expr =
-            test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
-        assert_eq!(predicate_expr.to_string(), expected_expr);
-
-        Ok(())
-    }
-
-    #[test]
-    fn row_group_predicate_in_list_negated() -> Result<()> {
-        let schema = Schema::new(vec![
-            Field::new("c1", DataType::Int32, false),
-            Field::new("c2", DataType::Int32, false),
-        ]);
-        // test c1 not in(1, 2, 3)
-        let expr = Expr::InList(InList::new(
-            Box::new(col("c1")),
-            vec![lit(1), lit(2), lit(3)],
-            true,
-        ));
-        let expected_expr = "c1_null_count@2 != row_count@3 AND (c1_min@0 != 1 OR 1 != c1_max@1) AND c1_null_count@2 != row_count@3 AND (c1_min@0 != 2 OR 2 != c1_max@1) AND c1_null_count@2 != row_count@3 AND (c1_min@0 != 3 OR 3 != c1_max@1)";
-        let predicate_expr =
-            test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
-        assert_eq!(predicate_expr.to_string(), expected_expr);
-
-        Ok(())
-    }
-
-    #[test]
-    fn row_group_predicate_between() -> Result<()> {
-        let schema = Schema::new(vec![
-            Field::new("c1", DataType::Int32, false),
-            Field::new("c2", DataType::Int32, false),
-        ]);
-
-        // test c1 BETWEEN 1 AND 5
-        let expr1 = col("c1").between(lit(1), lit(5));
-
-        // test 1 <= c1 <= 5
-        let expr2 = col("c1").gt_eq(lit(1)).and(col("c1").lt_eq(lit(5)));
-
-        let predicate_expr1 =
-            test_build_predicate_expression(&expr1, &schema, &mut RequiredColumns::new());
-
-        let predicate_expr2 =
-            test_build_predicate_expression(&expr2, &schema, &mut RequiredColumns::new());
-        assert_eq!(predicate_expr1.to_string(), predicate_expr2.to_string());
-
-        Ok(())
-    }
-
-    #[test]
-    fn row_group_predicate_between_with_in_list() -> Result<()> {
-        let schema = Schema::new(vec![
-            Field::new("c1", DataType::Int32, false),
-            Field::new("c2", DataType::Int32, false),
-        ]);
-        // test c1 in(1, 2)
-        let expr1 = col("c1").in_list(vec![lit(1), lit(2)], false);
-
-        // test c2 BETWEEN 4 AND 5
-        let expr2 = col("c2").between(lit(4), lit(5));
-
-        // test c1 in(1, 2) and c2 BETWEEN 4 AND 5
-        let expr3 = expr1.and(expr2);
-
-        let expected_expr = "(c1_null_count@2 != row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1 OR c1_null_count@2 != row_count@3 AND c1_min@0 <= 2 AND 2 <= c1_max@1) AND c2_null_count@5 != row_count@3 AND c2_max@4 >= 4 AND c2_null_count@5 != row_count@3 AND c2_min@6 <= 5";
-        let predicate_expr =
-            test_build_predicate_expression(&expr3, &schema, &mut RequiredColumns::new());
-        assert_eq!(predicate_expr.to_string(), expected_expr);
-
-        Ok(())
-    }
-
-    #[test]
-    fn row_group_predicate_in_list_to_many_values() -> Result<()> {
-        let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
-        // test c1 in(1..=21)
-        // pruning.rs has MAX_LIST_VALUE_SIZE_REWRITE = 20; in-lists with more
-        // values than this are rewritten to the always-true literal instead of
-        // being expanded
-        let expr = col("c1").in_list((1..=21).map(lit).collect(), false);
-
-        let expected_expr = "true";
-        let predicate_expr =
-            test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
-
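        // NOTE (illustrative, not part of the original file): with 21 literals
        // the list exceeds the 20-value limit mentioned above, so the whole
        // predicate collapses to `true` (nothing can be pruned). With 20 or
        // fewer values it would instead expand into the OR-of-ranges form seen
        // in `row_group_predicate_in_list`, e.g.:
        //
        // ```rust
        // let expr = col("c1").in_list((1..=20).map(lit).collect(), false); // still expands
        // ```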
assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_cast_int_int() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Int64) <= 1 AND 1 <= CAST(c1_max@1 AS Int64)"; - - // test cast(c1 as int64) = 1 - // test column on the left - let expr = cast(col("c1"), DataType::Int64).eq(lit(ScalarValue::Int64(Some(1)))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = lit(ScalarValue::Int64(Some(1))).eq(cast(col("c1"), DataType::Int64)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - let expected_expr = - "c1_null_count@1 != row_count@2 AND TRY_CAST(c1_max@0 AS Int64) > 1"; - - // test column on the left - let expr = - try_cast(col("c1"), DataType::Int64).gt(lit(ScalarValue::Int64(Some(1)))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = - lit(ScalarValue::Int64(Some(1))).lt(try_cast(col("c1"), DataType::Int64)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_cast_string_string() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]); - let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Utf8) <= 1 AND 1 <= CAST(c1_max@1 AS Utf8)"; - - // test column on the left - let expr = cast(col("c1"), DataType::Utf8) - .eq(lit(ScalarValue::Utf8(Some("1".to_string())))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = lit(ScalarValue::Utf8(Some("1".to_string()))) - .eq(cast(col("c1"), DataType::Utf8)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_cast_string_int() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]); - let expected_expr = "true"; - - // test column on the left - let expr = cast(col("c1"), DataType::Int32).eq(lit(ScalarValue::Int32(Some(1)))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = lit(ScalarValue::Int32(Some(1))).eq(cast(col("c1"), DataType::Int32)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_cast_int_string() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "true"; - - // test column on the left - let expr = cast(col("c1"), DataType::Utf8) - .eq(lit(ScalarValue::Utf8(Some("1".to_string())))); - let predicate_expr = - 
test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = lit(ScalarValue::Utf8(Some("1".to_string()))) - .eq(cast(col("c1"), DataType::Utf8)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_date_date() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Date32, false)]); - let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Date64) <= 1970-01-01 AND 1970-01-01 <= CAST(c1_max@1 AS Date64)"; - - // test column on the left - let expr = - cast(col("c1"), DataType::Date64).eq(lit(ScalarValue::Date64(Some(123)))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = - lit(ScalarValue::Date64(Some(123))).eq(cast(col("c1"), DataType::Date64)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_dict_string_date() -> Result<()> { - // Test with Dictionary for the literal - let schema = Schema::new(vec![Field::new("c1", DataType::Date32, false)]); - let expected_expr = "true"; - - // test column on the left - let expr = cast( - col("c1"), - DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), - ) - .eq(lit(ScalarValue::Utf8(Some("2024-01-01".to_string())))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = lit(ScalarValue::Utf8(Some("2024-01-01".to_string()))).eq(cast( - col("c1"), - DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), - )); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_date_dict_string() -> Result<()> { - // Test with Dictionary for the column - let schema = Schema::new(vec![Field::new( - "c1", - DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), - false, - )]); - let expected_expr = "true"; - - // test column on the left - let expr = - cast(col("c1"), DataType::Date32).eq(lit(ScalarValue::Date32(Some(123)))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = - lit(ScalarValue::Date32(Some(123))).eq(cast(col("c1"), DataType::Date32)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_dict_dict_same_value_type() -> Result<()> { - // Test with Dictionary types that have the same value type but different key types - let schema = Schema::new(vec![Field::new( - "c1", - DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), - false, - )]); - - // Direct comparison with no cast - let expr = 
col("c1").eq(lit(ScalarValue::Utf8(Some("test".to_string())))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - let expected_expr = - "c1_null_count@2 != row_count@3 AND c1_min@0 <= test AND test <= c1_max@1"; - assert_eq!(predicate_expr.to_string(), expected_expr); - - // Test with column cast to a dictionary with different key type - let expr = cast( - col("c1"), - DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)), - ) - .eq(lit(ScalarValue::Utf8(Some("test".to_string())))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Dictionary(UInt16, Utf8)) <= test AND test <= CAST(c1_max@1 AS Dictionary(UInt16, Utf8))"; - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_dict_dict_different_value_type() -> Result<()> { - // Test with Dictionary types that have different value types - let schema = Schema::new(vec![Field::new( - "c1", - DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Int32)), - false, - )]); - let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Int64) <= 123 AND 123 <= CAST(c1_max@1 AS Int64)"; - - // Test with literal of a different type - let expr = - cast(col("c1"), DataType::Int64).eq(lit(ScalarValue::Int64(Some(123)))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_nested_dict() -> Result<()> { - // Test with nested Dictionary types - let schema = Schema::new(vec![Field::new( - "c1", - DataType::Dictionary( - Box::new(DataType::UInt8), - Box::new(DataType::Dictionary( - Box::new(DataType::UInt16), - Box::new(DataType::Utf8), - )), - ), - false, - )]); - let expected_expr = - "c1_null_count@2 != row_count@3 AND c1_min@0 <= test AND test <= c1_max@1"; - - // Test with a simple literal - let expr = col("c1").eq(lit(ScalarValue::Utf8(Some("test".to_string())))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_dict_date_dict_date() -> Result<()> { - // Test with dictionary-wrapped date types for both sides - let schema = Schema::new(vec![Field::new( - "c1", - DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Date32)), - false, - )]); - let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Dictionary(UInt16, Date64)) <= 1970-01-01 AND 1970-01-01 <= CAST(c1_max@1 AS Dictionary(UInt16, Date64))"; - - // Test with a cast to a different date type - let expr = cast( - col("c1"), - DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Date64)), - ) - .eq(lit(ScalarValue::Date64(Some(123)))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_date_string() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, false)]); - let expected_expr = "true"; - - // test column on the left - let expr = - cast(col("c1"), DataType::Date32).eq(lit(ScalarValue::Date32(Some(123)))); - let predicate_expr = - test_build_predicate_expression(&expr, 
&schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = - lit(ScalarValue::Date32(Some(123))).eq(cast(col("c1"), DataType::Date32)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_string_date() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Date32, false)]); - let expected_expr = "true"; - - // test column on the left - let expr = cast(col("c1"), DataType::Utf8) - .eq(lit(ScalarValue::Utf8(Some("2024-01-01".to_string())))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = lit(ScalarValue::Utf8(Some("2024-01-01".to_string()))) - .eq(cast(col("c1"), DataType::Utf8)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_cast_list() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - // test cast(c1 as int64) in int64(1, 2, 3) - let expr = Expr::InList(InList::new( - Box::new(cast(col("c1"), DataType::Int64)), - vec![ - lit(ScalarValue::Int64(Some(1))), - lit(ScalarValue::Int64(Some(2))), - lit(ScalarValue::Int64(Some(3))), - ], - false, - )); - let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Int64) <= 1 AND 1 <= CAST(c1_max@1 AS Int64) OR c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Int64) <= 2 AND 2 <= CAST(c1_max@1 AS Int64) OR c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Int64) <= 3 AND 3 <= CAST(c1_max@1 AS Int64)"; - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - let expr = Expr::InList(InList::new( - Box::new(cast(col("c1"), DataType::Int64)), - vec![ - lit(ScalarValue::Int64(Some(1))), - lit(ScalarValue::Int64(Some(2))), - lit(ScalarValue::Int64(Some(3))), - ], - true, - )); - let expected_expr = "c1_null_count@2 != row_count@3 AND (CAST(c1_min@0 AS Int64) != 1 OR 1 != CAST(c1_max@1 AS Int64)) AND c1_null_count@2 != row_count@3 AND (CAST(c1_min@0 AS Int64) != 2 OR 2 != CAST(c1_max@1 AS Int64)) AND c1_null_count@2 != row_count@3 AND (CAST(c1_min@0 AS Int64) != 3 OR 3 != CAST(c1_max@1 AS Int64))"; - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn prune_decimal_data() { - // decimal(9,2) - let schema = Arc::new(Schema::new(vec![Field::new( - "s1", - DataType::Decimal128(9, 2), - true, - )])); - - prune_with_expr( - // s1 > 5 - col("s1").gt(lit(ScalarValue::Decimal128(Some(500), 9, 2))), - &schema, - // If the data is written by spark, the physical data type is INT32 in the parquet - // So we use the INT32 type of statistic. 
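            // NOTE (illustrative, not part of the original file): a
            // Decimal128(9, 2) literal such as 5.00 is represented by its
            // unscaled integer 500 (`ScalarValue::Decimal128(Some(500), 9, 2)`),
            // so with INT32 physical storage the min/max statistics below are
            // plain i32 unscaled values, e.g.:
            //
            // ```rust
            // // 5.00 at scale 2 => unscaled 500
            // let five = ScalarValue::Decimal128(Some(500), 9, 2);
            // ```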
-            &TestStatistics::new().with(
-                "s1",
-                ContainerStats::new_i32(
-                    vec![Some(0), Some(4), None, Some(3)], // min
-                    vec![Some(5), Some(6), Some(4), None], // max
-                ),
-            ),
-            &[false, true, false, true],
-        );
-
-        prune_with_expr(
-            // with cast column to other type
-            cast(col("s1"), DataType::Decimal128(14, 3))
-                .gt(lit(ScalarValue::Decimal128(Some(5000), 14, 3))),
-            &schema,
-            &TestStatistics::new().with(
-                "s1",
-                ContainerStats::new_i32(
-                    vec![Some(0), Some(4), None, Some(3)], // min
-                    vec![Some(5), Some(6), Some(4), None], // max
-                ),
-            ),
-            &[false, true, false, true],
-        );
-
-        prune_with_expr(
-            // with try cast column to other type
-            try_cast(col("s1"), DataType::Decimal128(14, 3))
-                .gt(lit(ScalarValue::Decimal128(Some(5000), 14, 3))),
-            &schema,
-            &TestStatistics::new().with(
-                "s1",
-                ContainerStats::new_i32(
-                    vec![Some(0), Some(4), None, Some(3)], // min
-                    vec![Some(5), Some(6), Some(4), None], // max
-                ),
-            ),
-            &[false, true, false, true],
-        );
-
-        // decimal(18,2)
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "s1",
-            DataType::Decimal128(18, 2),
-            true,
-        )]));
-        prune_with_expr(
-            // s1 > 5
-            col("s1").gt(lit(ScalarValue::Decimal128(Some(500), 18, 2))),
-            &schema,
-            // If the data is written by spark, the physical data type is INT64 in the parquet
-            // So we use the INT64 type of statistic.
-            &TestStatistics::new().with(
-                "s1",
-                ContainerStats::new_i64(
-                    vec![Some(0), Some(4), None, Some(3)], // min
-                    vec![Some(5), Some(6), Some(4), None], // max
-                ),
-            ),
-            &[false, true, false, true],
-        );
-
-        // decimal(23,2)
-        let schema = Arc::new(Schema::new(vec![Field::new(
-            "s1",
-            DataType::Decimal128(23, 2),
-            true,
-        )]));
-
-        prune_with_expr(
-            // s1 > 5
-            col("s1").gt(lit(ScalarValue::Decimal128(Some(500), 23, 2))),
-            &schema,
-            &TestStatistics::new().with(
-                "s1",
-                ContainerStats::new_decimal128(
-                    vec![Some(0), Some(400), None, Some(300)],   // min
-                    vec![Some(500), Some(600), Some(400), None], // max
-                    23,
-                    2,
-                ),
-            ),
-            &[false, true, false, true],
-        );
-    }
-
-    #[test]
-    fn prune_api() {
-        let schema = Arc::new(Schema::new(vec![
-            Field::new("s1", DataType::Utf8, true),
-            Field::new("s2", DataType::Int32, true),
-        ]));
-
-        let statistics = TestStatistics::new().with(
-            "s2",
-            ContainerStats::new_i32(
-                vec![Some(0), Some(4), None, Some(3)], // min
-                vec![Some(5), Some(6), None, None],    // max
-            ),
-        );
-        prune_with_expr(
-            // Prune using s2 > 5
-            col("s2").gt(lit(5)),
-            &schema,
-            &statistics,
-            // s2 [0, 5] ==> no rows should pass
-            // s2 [4, 6] ==> some rows could pass
-            // No stats for s2 ==> some rows could pass
-            // s2 [3, None] (null max) ==> some rows could pass
-            &[false, true, true, true],
-        );
-
-        prune_with_expr(
-            // filter with cast
-            cast(col("s2"), DataType::Int64).gt(lit(ScalarValue::Int64(Some(5)))),
-            &schema,
-            &statistics,
-            &[false, true, true, true],
-        );
-    }
-
-    #[test]
-    fn prune_not_eq_data() {
-        let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)]));
-
-        prune_with_expr(
-            // Prune using s1 != 'M'
-            col("s1").not_eq(lit("M")),
-            &schema,
-            &TestStatistics::new().with(
-                "s1",
-                ContainerStats::new_utf8(
-                    vec![Some("A"), Some("A"), Some("N"), Some("M"), None, Some("A")], // min
-                    vec![Some("Z"), Some("L"), Some("Z"), Some("M"), None, None],      // max
-                ),
-            ),
-            // s1 [A, Z] ==> might have values that pass predicate
-            // s1 [A, L] ==> all rows pass the predicate
-            // s1 [N, Z] ==> all rows pass the predicate
-            // s1 [M, M] ==> all rows do not pass the predicate
-            // No stats for s1 ==> some rows could pass
-            // s1 [A, None] (null max) ==> some rows could pass
-            &[true, true, true, false, true, true],
-        );
-    }
-
-    /// Creates setup for boolean chunk pruning
-    ///
-    /// For predicate "b1" (boolean expr)
-    /// b1 [false, false] ==> no rows can pass (not keep)
-    /// b1 [false, true] ==> some rows could pass (must keep)
-    /// b1 [true, true] ==> all rows must pass (must keep)
-    /// b1 [NULL, NULL] ==> unknown (must keep)
-    /// b1 [false, NULL] ==> unknown (must keep)
-    ///
-    /// For predicate "!b1" (boolean expr)
-    /// b1 [false, false] ==> all rows pass (must keep)
-    /// b1 [false, true] ==> some rows could pass (must keep)
-    /// b1 [true, true] ==> no rows can pass (not keep)
-    /// b1 [NULL, NULL] ==> unknown (must keep)
-    /// b1 [false, NULL] ==> unknown (must keep)
-    fn bool_setup() -> (SchemaRef, TestStatistics, Vec<bool>, Vec<bool>) {
-        let schema =
-            Arc::new(Schema::new(vec![Field::new("b1", DataType::Boolean, true)]));
-
-        let statistics = TestStatistics::new().with(
-            "b1",
-            ContainerStats::new_bool(
-                vec![Some(false), Some(false), Some(true), None, Some(false)], // min
-                vec![Some(false), Some(true), Some(true), None, None],         // max
-            ),
-        );
-        let expected_true = vec![false, true, true, true, true];
-        let expected_false = vec![true, true, false, true, true];
-
-        (schema, statistics, expected_true, expected_false)
-    }
-
-    #[test]
-    fn prune_bool_const_expr() {
-        let (schema, statistics, _, _) = bool_setup();
-
-        prune_with_expr(
-            // true
-            lit(true),
-            &schema,
-            &statistics,
-            &[true, true, true, true, true],
-        );
-
-        prune_with_expr(
-            // false
-            lit(false),
-            &schema,
-            &statistics,
-            &[false, false, false, false, false],
-        );
-    }
-
-    #[test]
-    fn prune_bool_column() {
-        let (schema, statistics, expected_true, _) = bool_setup();
-
-        prune_with_expr(
-            // b1
-            col("b1"),
-            &schema,
-            &statistics,
-            &expected_true,
-        );
-    }
-
-    #[test]
-    fn prune_bool_not_column() {
-        let (schema, statistics, _, expected_false) = bool_setup();
-
-        prune_with_expr(
-            // !b1
-            col("b1").not(),
-            &schema,
-            &statistics,
-            &expected_false,
-        );
-    }
-
-    #[test]
-    fn prune_bool_column_eq_true() {
-        let (schema, statistics, expected_true, _) = bool_setup();
-
-        prune_with_expr(
-            // b1 = true
-            col("b1").eq(lit(true)),
-            &schema,
-            &statistics,
-            &expected_true,
-        );
-    }
-
-    #[test]
-    fn prune_bool_not_column_eq_true() {
-        let (schema, statistics, _, expected_false) = bool_setup();
-
-        prune_with_expr(
-            // !b1 = true
-            col("b1").not().eq(lit(true)),
-            &schema,
-            &statistics,
-            &expected_false,
-        );
-    }
-
-    /// Creates a setup for chunk pruning, modeling an int32 column "i"
-    /// with 5 different containers (e.g. RowGroups). They have [min, max]:
They have [min, - /// max]: - /// - /// i [-5, 5] - /// i [1, 11] - /// i [-11, -1] - /// i [NULL, NULL] - /// i [1, NULL] - fn int32_setup() -> (SchemaRef, TestStatistics) { - let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); - - let statistics = TestStatistics::new().with( - "i", - ContainerStats::new_i32( - vec![Some(-5), Some(1), Some(-11), None, Some(1)], // min - vec![Some(5), Some(11), Some(-1), None, None], // max - ), - ); - (schema, statistics) - } - - #[test] - fn prune_int32_col_gt_zero() { - let (schema, statistics) = int32_setup(); - - // Expression "i > 0" and "-i < 0" - // i [-5, 5] ==> some rows could pass (must keep) - // i [1, 11] ==> all rows must pass (must keep) - // i [-11, -1] ==> no rows can pass (not keep) - // i [NULL, NULL] ==> unknown (must keep) - // i [1, NULL] ==> unknown (must keep) - let expected_ret = &[true, true, false, true, true]; - - // i > 0 - prune_with_expr(col("i").gt(lit(0)), &schema, &statistics, expected_ret); - - // -i < 0 - prune_with_expr( - Expr::Negative(Box::new(col("i"))).lt(lit(0)), - &schema, - &statistics, - expected_ret, - ); - } - - #[test] - fn prune_int32_col_lte_zero() { - let (schema, statistics) = int32_setup(); - - // Expression "i <= 0" and "-i >= 0" - // i [-5, 5] ==> some rows could pass (must keep) - // i [1, 11] ==> no rows can pass (not keep) - // i [-11, -1] ==> all rows must pass (must keep) - // i [NULL, NULL] ==> unknown (must keep) - // i [1, NULL] ==> no rows can pass (not keep) - let expected_ret = &[true, false, true, true, false]; - - prune_with_expr( - // i <= 0 - col("i").lt_eq(lit(0)), - &schema, - &statistics, - expected_ret, - ); - - prune_with_expr( - // -i >= 0 - Expr::Negative(Box::new(col("i"))).gt_eq(lit(0)), - &schema, - &statistics, - expected_ret, - ); - } - - #[test] - fn prune_int32_col_lte_zero_cast() { - let (schema, statistics) = int32_setup(); - - // Expression "cast(i as utf8) <= '0'" - // i [-5, 5] ==> some rows could pass (must keep) - // i [1, 11] ==> no rows can pass in theory, -0.22 (conservatively keep) - // i [-11, -1] ==> no rows could pass in theory (conservatively keep) - // i [NULL, NULL] ==> unknown (must keep) - // i [1, NULL] ==> no rows can pass (conservatively keep) - let expected_ret = &[true, true, true, true, true]; - - prune_with_expr( - // cast(i as utf8) <= 0 - cast(col("i"), DataType::Utf8).lt_eq(lit("0")), - &schema, - &statistics, - expected_ret, - ); - - prune_with_expr( - // try_cast(i as utf8) <= 0 - try_cast(col("i"), DataType::Utf8).lt_eq(lit("0")), - &schema, - &statistics, - expected_ret, - ); - - prune_with_expr( - // cast(-i as utf8) >= 0 - cast(Expr::Negative(Box::new(col("i"))), DataType::Utf8).gt_eq(lit("0")), - &schema, - &statistics, - expected_ret, - ); - - prune_with_expr( - // try_cast(-i as utf8) >= 0 - try_cast(Expr::Negative(Box::new(col("i"))), DataType::Utf8).gt_eq(lit("0")), - &schema, - &statistics, - expected_ret, - ); - } - - #[test] - fn prune_int32_col_eq_zero() { - let (schema, statistics) = int32_setup(); - - // Expression "i = 0" - // i [-5, 5] ==> some rows could pass (must keep) - // i [1, 11] ==> no rows can pass (not keep) - // i [-11, -1] ==> no rows can pass (not keep) - // i [NULL, NULL] ==> unknown (must keep) - // i [1, NULL] ==> no rows can pass (not keep) - let expected_ret = &[true, false, false, true, false]; - - prune_with_expr( - // i = 0 - col("i").eq(lit(0)), - &schema, - &statistics, - expected_ret, - ); - } - - #[test] - fn prune_int32_col_eq_zero_cast() { - let (schema, statistics) = 
int32_setup(); - - // Expression "cast(i as int64) = 0" - // i [-5, 5] ==> some rows could pass (must keep) - // i [1, 11] ==> no rows can pass (not keep) - // i [-11, -1] ==> no rows can pass (not keep) - // i [NULL, NULL] ==> unknown (must keep) - // i [1, NULL] ==> no rows can pass (not keep) - let expected_ret = &[true, false, false, true, false]; - - prune_with_expr( - cast(col("i"), DataType::Int64).eq(lit(0i64)), - &schema, - &statistics, - expected_ret, - ); - - prune_with_expr( - try_cast(col("i"), DataType::Int64).eq(lit(0i64)), - &schema, - &statistics, - expected_ret, - ); - } - - #[test] - fn prune_int32_col_eq_zero_cast_as_str() { - let (schema, statistics) = int32_setup(); - - // Note the cast is to a string where sorting properties are - // not the same as integers - // - // Expression "cast(i as utf8) = '0'" - // i [-5, 5] ==> some rows could pass (keep) - // i [1, 11] ==> no rows can pass (could keep) - // i [-11, -1] ==> no rows can pass (could keep) - // i [NULL, NULL] ==> unknown (keep) - // i [1, NULL] ==> no rows can pass (could keep) - let expected_ret = &[true, true, true, true, true]; - - prune_with_expr( - cast(col("i"), DataType::Utf8).eq(lit("0")), - &schema, - &statistics, - expected_ret, - ); - } - - #[test] - fn prune_int32_col_lt_neg_one() { - let (schema, statistics) = int32_setup(); - - // Expression "i > -1" and "-i < 1" - // i [-5, 5] ==> some rows could pass (must keep) - // i [1, 11] ==> all rows must pass (must keep) - // i [-11, -1] ==> no rows can pass (not keep) - // i [NULL, NULL] ==> unknown (must keep) - // i [1, NULL] ==> all rows must pass (must keep) - let expected_ret = &[true, true, false, true, true]; - - prune_with_expr( - // i > -1 - col("i").gt(lit(-1)), - &schema, - &statistics, - expected_ret, - ); - - prune_with_expr( - // -i < 1 - Expr::Negative(Box::new(col("i"))).lt(lit(1)), - &schema, - &statistics, - expected_ret, - ); - } - - #[test] - fn prune_int32_is_null() { - let (schema, statistics) = int32_setup(); - - // Expression "i IS NULL" when there are no null statistics, - // should all be kept - let expected_ret = &[true, true, true, true, true]; - - prune_with_expr( - // i IS NULL, no null statistics - col("i").is_null(), - &schema, - &statistics, - expected_ret, - ); - - // provide null counts for each column - let statistics = statistics.with_null_counts( - "i", - vec![ - Some(0), // no nulls (don't keep) - Some(1), // 1 null - None, // unknown nulls - None, // unknown nulls (min/max are both null too, like no stats at all) - Some(0), // 0 nulls (max=null too which means no known max) (don't keep) - ], - ); - - let expected_ret = &[false, true, true, true, false]; - - prune_with_expr( - // i IS NULL, with actual null statistics - col("i").is_null(), - &schema, - &statistics, - expected_ret, - ); - } - - #[test] - fn prune_int32_column_is_known_all_null() { - let (schema, statistics) = int32_setup(); - - // Expression "i < 0" - // i [-5, 5] ==> some rows could pass (must keep) - // i [1, 11] ==> no rows can pass (not keep) - // i [-11, -1] ==> all rows must pass (must keep) - // i [NULL, NULL] ==> unknown (must keep) - // i [1, NULL] ==> no rows can pass (not keep) - let expected_ret = &[true, false, true, true, false]; - - prune_with_expr( - // i < 0 - col("i").lt(lit(0)), - &schema, - &statistics, - expected_ret, - ); - - // provide row counts for each column - let statistics = statistics.with_row_counts( - "i", - vec![ - Some(10), // 10 rows of data - Some(9), // 9 rows of data - None, // unknown row counts - Some(4), - 
Some(10), - ], - ); - - // pruning result is still the same if we only know row counts - prune_with_expr( - // i < 0, with only row counts statistics - col("i").lt(lit(0)), - &schema, - &statistics, - expected_ret, - ); - - // provide null counts for each column - let statistics = statistics.with_null_counts( - "i", - vec![ - Some(0), // no nulls - Some(1), // 1 null - None, // unknown nulls - Some(4), // 4 nulls, which is the same as the row counts, i.e. this column is all null (don't keep) - Some(0), // 0 nulls (max=null too which means no known max) - ], - ); - - // Expression "i < 0" with actual null and row counts statistics - // col | min, max | row counts | null counts | - // ----+--------------+------------+-------------+ - // i | [-5, 5] | 10 | 0 | ==> Some rows could pass (must keep) - // i | [1, 11] | 9 | 1 | ==> No rows can pass (not keep) - // i | [-11,-1] | Unknown | Unknown | ==> All rows must pass (must keep) - // i | [NULL, NULL] | 4 | 4 | ==> The column is all null (not keep) - // i | [1, NULL] | 10 | 0 | ==> No rows can pass (not keep) - let expected_ret = &[true, false, true, false, false]; - - prune_with_expr( - // i < 0, with actual null and row counts statistics - col("i").lt(lit(0)), - &schema, - &statistics, - expected_ret, - ); - } - - #[test] - fn prune_cast_column_scalar() { - // The data type of column i is INT32 - let (schema, statistics) = int32_setup(); - let expected_ret = &[true, true, false, true, true]; - - prune_with_expr( - // i > int64(0) - col("i").gt(cast(lit(ScalarValue::Int64(Some(0))), DataType::Int32)), - &schema, - &statistics, - expected_ret, - ); - - prune_with_expr( - // cast(i as int64) > int64(0) - cast(col("i"), DataType::Int64).gt(lit(ScalarValue::Int64(Some(0)))), - &schema, - &statistics, - expected_ret, - ); - - prune_with_expr( - // try_cast(i as int64) > int64(0) - try_cast(col("i"), DataType::Int64).gt(lit(ScalarValue::Int64(Some(0)))), - &schema, - &statistics, - expected_ret, - ); - - prune_with_expr( - // `-cast(i as int64) < 0` convert to `cast(i as int64) > -0` - Expr::Negative(Box::new(cast(col("i"), DataType::Int64))) - .lt(lit(ScalarValue::Int64(Some(0)))), - &schema, - &statistics, - expected_ret, - ); - } - - #[test] - fn test_increment_utf8() { - // Basic ASCII - assert_eq!(increment_utf8("abc").unwrap(), "abd"); - assert_eq!(increment_utf8("abz").unwrap(), "ab{"); - - // Test around ASCII 127 (DEL) - assert_eq!(increment_utf8("~").unwrap(), "\u{7f}"); // 126 -> 127 - assert_eq!(increment_utf8("\u{7f}").unwrap(), "\u{80}"); // 127 -> 128 - - // Test 2-byte UTF-8 sequences - assert_eq!(increment_utf8("ß").unwrap(), "à"); // U+00DF -> U+00E0 - - // Test 3-byte UTF-8 sequences - assert_eq!(increment_utf8("℣").unwrap(), "ℤ"); // U+2123 -> U+2124 - - // Test at UTF-8 boundaries - assert_eq!(increment_utf8("\u{7FF}").unwrap(), "\u{800}"); // 2-byte to 3-byte boundary - assert_eq!(increment_utf8("\u{FFFF}").unwrap(), "\u{10000}"); // 3-byte to 4-byte boundary - - // Test that if we can't increment we return None - assert!(increment_utf8("").is_none()); - assert!(increment_utf8("\u{10FFFF}").is_none()); // U+10FFFF is the max code point - - // Test that if we can't increment the last character we do the previous one and truncate - assert_eq!(increment_utf8("a\u{10FFFF}").unwrap(), "b"); - - // Test surrogate pair range (0xD800..=0xDFFF) - assert_eq!(increment_utf8("a\u{D7FF}").unwrap(), "b"); - assert!(increment_utf8("\u{D7FF}").is_none()); - - // Test non-characters range (0xFDD0..=0xFDEF) - 
assert_eq!(increment_utf8("a\u{FDCF}").unwrap(), "b"); - assert!(increment_utf8("\u{FDCF}").is_none()); - - // Test past the maximum valid code point (>= 0x110000) - assert_eq!(increment_utf8("a\u{10FFFF}").unwrap(), "b"); - assert!(increment_utf8("\u{10FFFF}").is_none()); // Can't increment past max valid codepoint - } - - /// Creates a setup for chunk pruning, modeling a utf8 column "s1" - /// with 10 different containers (e.g. RowGroups). They have [min, - /// max]: - /// s1 ["A", "Z"] - /// s1 ["A", "L"] - /// s1 ["N", "Z"] - /// s1 ["M", "M"] - /// s1 [NULL, NULL] - /// s1 ["A", NULL] - /// s1 ["", "A"] - /// s1 ["", ""] - /// s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] - /// s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] - fn utf8_setup() -> (SchemaRef, TestStatistics) { - let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)])); - - let statistics = TestStatistics::new().with( - "s1", - ContainerStats::new_utf8( - vec![ - Some("A"), - Some("A"), - Some("N"), - Some("M"), - None, - Some("A"), - Some(""), - Some(""), - Some("AB"), - Some("A\u{10ffff}\u{10ffff}"), - ], // min - vec![ - Some("Z"), - Some("L"), - Some("Z"), - Some("M"), - None, - None, - Some("A"), - Some(""), - Some("A\u{10ffff}\u{10ffff}\u{10ffff}"), - Some("A\u{10ffff}\u{10ffff}"), - ], // max - ), - ); - (schema, statistics) - } - - #[test] - fn prune_utf8_eq() { - let (schema, statistics) = utf8_setup(); - - let expr = col("s1").eq(lit("A")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["A", "L"] ==> some rows could pass (must keep) - true, - // s1 ["N", "Z"] ==> no rows can pass (not keep) - false, - // s1 ["M", "M"] ==> no rows can pass (not keep) - false, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> unknown (must keep) - true, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> no rows can pass (not keep) - false, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) - false, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) - false, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - - let expr = col("s1").eq(lit("")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> no rows can pass (not keep) - false, - // s1 ["A", "L"] ==> no rows can pass (not keep) - false, - // s1 ["N", "Z"] ==> no rows can pass (not keep) - false, - // s1 ["M", "M"] ==> no rows can pass (not keep) - false, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> no rows can pass (not keep) - false, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> all rows must pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) - false, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) - false, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - } - - #[test] - fn prune_utf8_not_eq() { - let (schema, statistics) = utf8_setup(); - - let expr = col("s1").not_eq(lit("A")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["A", "L"] ==> some rows could pass (must keep) - true, - // s1 ["N", "Z"] ==> all rows must pass (must keep) - true, - // s1 ["M", "M"] ==> all rows must pass (must keep) - true, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> unknown (must keep) - true, - //
s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> all rows must pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep) - true, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - - let expr = col("s1").not_eq(lit("")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> all rows must pass (must keep) - true, - // s1 ["A", "L"] ==> all rows must pass (must keep) - true, - // s1 ["N", "Z"] ==> all rows must pass (must keep) - true, - // s1 ["M", "M"] ==> all rows must pass (must keep) - true, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> unknown (must keep) - true, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> no rows can pass (not keep) - false, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep) - true, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - } - - #[test] - fn prune_utf8_like_one() { - let (schema, statistics) = utf8_setup(); - - let expr = col("s1").like(lit("A_")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["A", "L"] ==> some rows could pass (must keep) - true, - // s1 ["N", "Z"] ==> no rows can pass (not keep) - false, - // s1 ["M", "M"] ==> no rows can pass (not keep) - false, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> unknown (must keep) - true, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> no rows can pass (not keep) - false, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - - let expr = col("s1").like(lit("_A_")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["A", "L"] ==> some rows could pass (must keep) - true, - // s1 ["N", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["M", "M"] ==> some rows could pass (must keep) - true, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> unknown (must keep) - true, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> some rows could pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - - let expr = col("s1").like(lit("_")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> all rows must pass (must keep) - true, - // s1 ["A", "L"] ==> all rows must pass (must keep) - true, - // s1 ["N", "Z"] ==> all rows must pass (must keep) - true, - // s1 ["M", "M"] ==> all rows must pass (must keep) - true, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> unknown (must keep) - true, - // s1 ["", "A"] ==> all rows must pass (must keep) - true, - // s1 ["", ""] ==> all rows must pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all 
rows must pass (must keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep) - true, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - - let expr = col("s1").like(lit("")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> no rows can pass (not keep) - false, - // s1 ["A", "L"] ==> no rows can pass (not keep) - false, - // s1 ["N", "Z"] ==> no rows can pass (not keep) - false, - // s1 ["M", "M"] ==> no rows can pass (not keep) - false, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> no rows can pass (not keep) - false, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> all rows must pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) - false, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) - false, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - } - - #[test] - fn prune_utf8_like_many() { - let (schema, statistics) = utf8_setup(); - - let expr = col("s1").like(lit("A%")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["A", "L"] ==> some rows could pass (must keep) - true, - // s1 ["N", "Z"] ==> no rows can pass (not keep) - false, - // s1 ["M", "M"] ==> no rows can pass (not keep) - false, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> unknown (must keep) - true, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> no rows can pass (not keep) - false, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - - let expr = col("s1").like(lit("%A%")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["A", "L"] ==> some rows could pass (must keep) - true, - // s1 ["N", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["M", "M"] ==> some rows could pass (must keep) - true, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> unknown (must keep) - true, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> some rows could pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - - let expr = col("s1").like(lit("%")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> all rows must pass (must keep) - true, - // s1 ["A", "L"] ==> all rows must pass (must keep) - true, - // s1 ["N", "Z"] ==> all rows must pass (must keep) - true, - // s1 ["M", "M"] ==> all rows must pass (must keep) - true, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> unknown (must keep) - true, - // s1 ["", "A"] ==> all rows must pass (must keep) - true, - // s1 ["", ""] ==> all rows must pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep) - true, - ]; - prune_with_expr(expr, 
&schema, &statistics, expected_ret); - - let expr = col("s1").like(lit("")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> no rows can pass (not keep) - false, - // s1 ["A", "L"] ==> no rows can pass (not keep) - false, - // s1 ["N", "Z"] ==> no rows can pass (not keep) - false, - // s1 ["M", "M"] ==> no rows can pass (not keep) - false, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> no rows can pass (not keep) - false, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> all rows must pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) - false, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) - false, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - } - - #[test] - fn prune_utf8_not_like_one() { - let (schema, statistics) = utf8_setup(); - - let expr = col("s1").not_like(lit("A\u{10ffff}_")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["A", "L"] ==> some rows could pass (must keep) - true, - // s1 ["N", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["M", "M"] ==> some rows could pass (must keep) - true, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> some rows could pass (must keep) - true, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> some rows could pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows match, but the stored (min, max) may be truncated; - // the original (min, max) may have been ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}"), so conservatively keep - true, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - } - - #[test] - fn prune_utf8_not_like_many() { - let (schema, statistics) = utf8_setup(); - - let expr = col("s1").not_like(lit("A\u{10ffff}%")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["A", "L"] ==> some rows could pass (must keep) - true, - // s1 ["N", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["M", "M"] ==> some rows could pass (must keep) - true, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> some rows could pass (must keep) - true, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> some rows could pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match - false, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - - let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["A", "L"] ==> some rows could pass (must keep) - true, - // s1 ["N", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["M", "M"] ==> some rows could pass (must keep) - true, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> some rows could pass (must keep) - true, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> some rows could pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must
keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - - let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}_")); - #[rustfmt::skip] - let expected_ret = &[ - // s1 ["A", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["A", "L"] ==> some rows could pass (must keep) - true, - // s1 ["N", "Z"] ==> some rows could pass (must keep) - true, - // s1 ["M", "M"] ==> some rows could pass (must keep) - true, - // s1 [NULL, NULL] ==> unknown (must keep) - true, - // s1 ["A", NULL] ==> some rows could pass (must keep) - true, - // s1 ["", "A"] ==> some rows could pass (must keep) - true, - // s1 ["", ""] ==> some rows could pass (must keep) - true, - // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) - true, - ]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - - let expr = col("s1").not_like(lit("A\\%%")); - let statistics = TestStatistics::new().with( - "s1", - ContainerStats::new_utf8( - vec![Some("A%a"), Some("A")], - vec![Some("A%c"), Some("A")], - ), - ); - let expected_ret = &[false, true]; - prune_with_expr(expr, &schema, &statistics, expected_ret); - } - - #[test] - fn test_rewrite_expr_to_prunable() { - let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); - let df_schema = DFSchema::try_from(schema.clone()).unwrap(); - - // column op lit - let left_input = col("a"); - let left_input = logical2physical(&left_input, &schema); - let right_input = lit(ScalarValue::Int32(Some(12))); - let right_input = logical2physical(&right_input, &schema); - let (result_left, _, result_right) = rewrite_expr_to_prunable( - &left_input, - Operator::Eq, - &right_input, - df_schema.clone(), - ) - .unwrap(); - assert_eq!(result_left.to_string(), left_input.to_string()); - assert_eq!(result_right.to_string(), right_input.to_string()); - - // cast op lit - let left_input = cast(col("a"), DataType::Decimal128(20, 3)); - let left_input = logical2physical(&left_input, &schema); - let right_input = lit(ScalarValue::Decimal128(Some(12), 20, 3)); - let right_input = logical2physical(&right_input, &schema); - let (result_left, _, result_right) = rewrite_expr_to_prunable( - &left_input, - Operator::Gt, - &right_input, - df_schema.clone(), - ) - .unwrap(); - assert_eq!(result_left.to_string(), left_input.to_string()); - assert_eq!(result_right.to_string(), right_input.to_string()); - - // try_cast op lit - let left_input = try_cast(col("a"), DataType::Int64); - let left_input = logical2physical(&left_input, &schema); - let right_input = lit(ScalarValue::Int64(Some(12))); - let right_input = logical2physical(&right_input, &schema); - let (result_left, _, result_right) = - rewrite_expr_to_prunable(&left_input, Operator::Gt, &right_input, df_schema) - .unwrap(); - assert_eq!(result_left.to_string(), left_input.to_string()); - assert_eq!(result_right.to_string(), right_input.to_string()); - - // TODO: add test for other case and op - } - - #[test] - fn test_rewrite_expr_to_prunable_custom_unhandled_hook() { - struct CustomUnhandledHook; - - impl UnhandledPredicateHook for CustomUnhandledHook { - /// This handles an arbitrary case of a column that doesn't exist in the schema - /// by renaming it to yet another column that doesn't exist in the schema - /// (the transformation is arbitrary, the point is that it can do 
whatever it wants) - fn handle(&self, _expr: &Arc<dyn PhysicalExpr>) -> Arc<dyn PhysicalExpr> { - Arc::new(phys_expr::Literal::new(ScalarValue::Int32(Some(42)))) - } - } - - let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); - let schema_with_b = Schema::new(vec![ - Field::new("a", DataType::Int32, true), - Field::new("b", DataType::Int32, true), - ]); - - let rewriter = PredicateRewriter::new() - .with_unhandled_hook(Arc::new(CustomUnhandledHook {})); - - let transform_expr = |expr| { - let expr = logical2physical(&expr, &schema_with_b); - rewriter.rewrite_predicate_to_statistics_predicate(&expr, &schema) - }; - - // transform an arbitrary valid expression that we know is handled - let known_expression = col("a").eq(lit(12)); - let known_expression_transformed = PredicateRewriter::new() - .rewrite_predicate_to_statistics_predicate( - &logical2physical(&known_expression, &schema), - &schema, - ); - - // an expression referencing an unknown column (that is not in the schema) gets passed to the hook - let input = col("b").eq(lit(12)); - let expected = logical2physical(&lit(42), &schema); - let transformed = transform_expr(input.clone()); - assert_eq!(transformed.to_string(), expected.to_string()); - - // more complex case with unknown column - let input = known_expression.clone().and(input.clone()); - let expected = phys_expr::BinaryExpr::new( - Arc::<dyn PhysicalExpr>::clone(&known_expression_transformed), - Operator::And, - logical2physical(&lit(42), &schema), - ); - let transformed = transform_expr(input.clone()); - assert_eq!(transformed.to_string(), expected.to_string()); - - // an unknown expression gets passed to the hook - let input = array_has(make_array(vec![lit(1)]), col("a")); - let expected = logical2physical(&lit(42), &schema); - let transformed = transform_expr(input.clone()); - assert_eq!(transformed.to_string(), expected.to_string()); - - // more complex case with unknown expression - let input = known_expression.and(input); - let expected = phys_expr::BinaryExpr::new( - Arc::<dyn PhysicalExpr>::clone(&known_expression_transformed), - Operator::And, - logical2physical(&lit(42), &schema), - ); - let transformed = transform_expr(input.clone()); - assert_eq!(transformed.to_string(), expected.to_string()); - } - - #[test] - fn test_rewrite_expr_to_prunable_error() { - // cast string value to numeric value - // this cast is not supported - let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]); - let df_schema = DFSchema::try_from(schema.clone()).unwrap(); - let left_input = cast(col("a"), DataType::Int64); - let left_input = logical2physical(&left_input, &schema); - let right_input = lit(ScalarValue::Int64(Some(12))); - let right_input = logical2physical(&right_input, &schema); - let result = rewrite_expr_to_prunable( - &left_input, - Operator::Gt, - &right_input, - df_schema.clone(), - ); - assert!(result.is_err()); - - // other expr - let left_input = is_null(col("a")); - let left_input = logical2physical(&left_input, &schema); - let right_input = lit(ScalarValue::Int64(Some(12))); - let right_input = logical2physical(&right_input, &schema); - let result = - rewrite_expr_to_prunable(&left_input, Operator::Gt, &right_input, df_schema); - assert!(result.is_err()); - // TODO: add other negative tests for other cases and ops - } - - #[test] - fn prune_with_contained_one_column() { - let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)])); - - // Model having information like a bloom filter for s1 - let statistics = TestStatistics::new() - .with_contained( - "s1", - [ScalarValue::from("foo")], -
[ - // container 0 known to only contain "foo", - Some(true), - // container 1 known to not contain "foo" - Some(false), - // container 2 unknown about "foo" - None, - // container 3 known to only contain "foo" - Some(true), - // container 4 known to not contain "foo" - Some(false), - // container 5 unknown about "foo" - None, - // container 6 known to only contain "foo" - Some(true), - // container 7 known to not contain "foo" - Some(false), - // container 8 unknown about "foo" - None, - ], - ) - .with_contained( - "s1", - [ScalarValue::from("bar")], - [ - // containers 0,1,2 known to only contain "bar" - Some(true), - Some(true), - Some(true), - // container 3,4,5 known to not contain "bar" - Some(false), - Some(false), - Some(false), - // container 6,7,8 unknown about "bar" - None, - None, - None, - ], - ) - .with_contained( - // the way the tests are setup, this data is - // consulted if the "foo" and "bar" are being checked at the same time - "s1", - [ScalarValue::from("foo"), ScalarValue::from("bar")], - [ - // container 0,1,2 unknown about ("foo", "bar") - None, - None, - None, - // container 3,4,5 known to contain only either "foo" or "bar" - Some(true), - Some(true), - Some(true), - // container 6,7,8 known to contain neither "foo" nor "bar" - Some(false), - Some(false), - Some(false), - ], - ); - - // s1 = 'foo' - prune_with_expr( - col("s1").eq(lit("foo")), - &schema, - &statistics, - // rule out containers (false) where we know foo is not present - &[true, false, true, true, false, true, true, false, true], - ); - - // s1 = 'bar' - prune_with_expr( - col("s1").eq(lit("bar")), - &schema, - &statistics, - // rule out containers where we know bar is not present - &[true, true, true, false, false, false, true, true, true], - ); - - // s1 = 'baz' (unknown value) - prune_with_expr( - col("s1").eq(lit("baz")), - &schema, - &statistics, - // can't rule out anything - &[true, true, true, true, true, true, true, true, true], - ); - - // s1 = 'foo' AND s1 = 'bar' - prune_with_expr( - col("s1").eq(lit("foo")).and(col("s1").eq(lit("bar"))), - &schema, - &statistics, - // logically this predicate can't possibly be true (the column can't - // take on both values) but we could rule it out if the stats tell - // us that both values are not present - &[true, true, true, true, true, true, true, true, true], - ); - - // s1 = 'foo' OR s1 = 'bar' - prune_with_expr( - col("s1").eq(lit("foo")).or(col("s1").eq(lit("bar"))), - &schema, - &statistics, - // can rule out containers that we know contain neither foo nor bar - &[true, true, true, true, true, true, false, false, false], - ); - - // s1 = 'foo' OR s1 = 'baz' - prune_with_expr( - col("s1").eq(lit("foo")).or(col("s1").eq(lit("baz"))), - &schema, - &statistics, - // can't rule out any container - &[true, true, true, true, true, true, true, true, true], - ); - - // s1 = 'foo' OR s1 = 'bar' OR s1 = 'baz' - prune_with_expr( - col("s1") - .eq(lit("foo")) - .or(col("s1").eq(lit("bar"))) - .or(col("s1").eq(lit("baz"))), - &schema, - &statistics, - // can not rule out any containers based on knowledge of s1 and `foo`, - // `bar` and (`foo`, `bar`) - &[true, true, true, true, true, true, true, true, true], - ); - - // s1 != foo - prune_with_expr( - col("s1").not_eq(lit("foo")), - &schema, - &statistics, - // rule out containers we know for sure only contain foo - &[false, true, true, false, true, true, false, true, true], - ); - - // s1 != bar - prune_with_expr( - col("s1").not_eq(lit("bar")), - &schema, - &statistics, - // rule out when we know for
sure s1 only contains the value bar - &[false, false, false, true, true, true, true, true, true], - ); - - // s1 != foo AND s1 != bar - prune_with_expr( - col("s1") - .not_eq(lit("foo")) - .and(col("s1").not_eq(lit("bar"))), - &schema, - &statistics, - // can rule out any container where we know s1 only contains either 'foo' or 'bar' - &[true, true, true, false, false, false, true, true, true], - ); - - // s1 != foo AND s1 != bar AND s1 != baz - prune_with_expr( - col("s1") - .not_eq(lit("foo")) - .and(col("s1").not_eq(lit("bar"))) - .and(col("s1").not_eq(lit("baz"))), - &schema, - &statistics, - // can't rule out any container based on knowledge of s1 - &[true, true, true, true, true, true, true, true, true], - ); - - // s1 != foo OR s1 != bar - prune_with_expr( - col("s1") - .not_eq(lit("foo")) - .or(col("s1").not_eq(lit("bar"))), - &schema, - &statistics, - // can't rule out anything based on contains information - &[true, true, true, true, true, true, true, true, true], - ); - - // s1 != foo OR s1 != bar OR s1 != baz - prune_with_expr( - col("s1") - .not_eq(lit("foo")) - .or(col("s1").not_eq(lit("bar"))) - .or(col("s1").not_eq(lit("baz"))), - &schema, - &statistics, - // can't rule out anything based on contains information - &[true, true, true, true, true, true, true, true, true], - ); - } - - #[test] - fn prune_with_contained_two_columns() { - let schema = Arc::new(Schema::new(vec![ - Field::new("s1", DataType::Utf8, true), - Field::new("s2", DataType::Utf8, true), - ])); - - // Model having information like bloom filters for s1 and s2 - let statistics = TestStatistics::new() - .with_contained( - "s1", - [ScalarValue::from("foo")], - [ - // container 0, s1 known to only contain "foo", - Some(true), - // container 1, s1 known to not contain "foo" - Some(false), - // container 2, s1 unknown about "foo" - None, - // container 3, s1 known to only contain "foo" - Some(true), - // container 4, s1 known to not contain "foo" - Some(false), - // container 5, s1 unknown about "foo" - None, - // container 6, s1 known to only contain "foo" - Some(true), - // container 7, s1 known to not contain "foo" - Some(false), - // container 8, s1 unknown about "foo" - None, - ], - ) - .with_contained( - "s2", // for column s2 - [ScalarValue::from("bar")], - [ - // containers 0,1,2 s2 known to only contain "bar" - Some(true), - Some(true), - Some(true), - // container 3,4,5 s2 known to not contain "bar" - Some(false), - Some(false), - Some(false), - // container 6,7,8 s2 unknown about "bar" - None, - None, - None, - ], - ); - - // s1 = 'foo' - prune_with_expr( - col("s1").eq(lit("foo")), - &schema, - &statistics, - // rule out containers where we know 'foo' is not present in s1 - &[true, false, true, true, false, true, true, false, true], - ); - - // s1 = 'foo' OR s2 = 'bar' - let expr = col("s1").eq(lit("foo")).or(col("s2").eq(lit("bar"))); - prune_with_expr( - expr, - &schema, - &statistics, - // can't rule out any container (would need to prove that s1 != foo AND s2 != bar) - &[true, true, true, true, true, true, true, true, true], - ); - - // s1 = 'foo' AND s2 != 'bar' - prune_with_expr( - col("s1").eq(lit("foo")).and(col("s2").not_eq(lit("bar"))), - &schema, - &statistics, - // can only rule out containers where we know either: - // 1. s1 doesn't have the value 'foo' or - // 2.
s2 only has the value 'bar' - &[false, false, false, true, false, true, true, false, true], - ); - - // s1 != 'foo' AND s2 != 'bar' - prune_with_expr( - col("s1") - .not_eq(lit("foo")) - .and(col("s2").not_eq(lit("bar"))), - &schema, - &statistics, - // Can rule out any container where we know either - // 1. s1 has only the value 'foo' - // 2. s2 has only the value 'bar' - &[false, false, false, false, true, true, false, true, true], - ); - - // s1 != 'foo' AND (s2 = 'bar' OR s2 = 'baz') - prune_with_expr( - col("s1") - .not_eq(lit("foo")) - .and(col("s2").eq(lit("bar")).or(col("s2").eq(lit("baz")))), - &schema, - &statistics, - // Can rule out any container where we know s1 has only the value - // 'foo'. Can't use knowledge of s2 and bar to rule out anything - &[false, true, true, false, true, true, false, true, true], - ); - - // s1 like 'foo%bar%' - prune_with_expr( - col("s1").like(lit("foo%bar%")), - &schema, - &statistics, - // can't rule out anything with the information we know - &[true, true, true, true, true, true, true, true, true], - ); - - // s1 like 'foo%bar%' AND s2 = 'bar' - prune_with_expr( - col("s1") - .like(lit("foo%bar%")) - .and(col("s2").eq(lit("bar"))), - &schema, - &statistics, - // can rule out any container where we know s2 does not have the value 'bar' - &[true, true, true, false, false, false, true, true, true], - ); - - // s1 like 'foo%bar%' OR s2 = 'bar' - prune_with_expr( - col("s1").like(lit("foo%bar%")).or(col("s2").eq(lit("bar"))), - &schema, - &statistics, - // can't rule out anything (we would have to prove that both the - // like and the equality must be false) - &[true, true, true, true, true, true, true, true, true], - ); - } - - #[test] - fn prune_with_range_and_contained() { - // Setup mimics range information for i, a bloom filter for s - let schema = Arc::new(Schema::new(vec![ - Field::new("i", DataType::Int32, true), - Field::new("s", DataType::Utf8, true), - ])); - - let statistics = TestStatistics::new() - .with( - "i", - ContainerStats::new_i32( - // Container 0, 3, 6: [-5 to 5] - // Container 1, 4, 7: [10 to 20] - // Container 2, 5, 8: unknown - vec![ - Some(-5), - Some(10), - None, - Some(-5), - Some(10), - None, - Some(-5), - Some(10), - None, - ], // min - vec![ - Some(5), - Some(20), - None, - Some(5), - Some(20), - None, - Some(5), - Some(20), - None, - ], // max - ), - ) - // Add contained information about s and "foo" - .with_contained( - "s", - [ScalarValue::from("foo")], - [ - // container 0,1,2 known to only contain "foo" - Some(true), - Some(true), - Some(true), - // container 3,4,5 known to not contain "foo" - Some(false), - Some(false), - Some(false), - // container 6,7,8 unknown about "foo" - None, - None, - None, - ], - ); - - // i = 0 and s = 'foo' - prune_with_expr( - col("i").eq(lit(0)).and(col("s").eq(lit("foo"))), - &schema, - &statistics, - // Can rule out containers where we know that either: - // 1. 0 is outside the min/max range of i - // 2. s does not contain foo - // (range is false, and contained is false) - &[true, false, true, false, false, false, true, false, true], - ); - - // i = 0 and s != 'foo' - prune_with_expr( - col("i").eq(lit(0)).and(col("s").not_eq(lit("foo"))), - &schema, - &statistics, - // Can rule out containers where either: - // 1. 0 is outside the min/max range of i - // 2.
s only contains foo - &[false, false, false, true, false, true, true, false, true], - ); - - // i = 0 OR s = 'foo' - prune_with_expr( - col("i").eq(lit(0)).or(col("s").eq(lit("foo"))), - &schema, - &statistics, - // in theory could rule out containers if we had min/max values for - // s as well. But in this case we don't so we can't rule out anything - &[true, true, true, true, true, true, true, true, true], - ); - } - - /// Prunes the specified expr with the specified schema and statistics, and - /// ensures it returns expected. - /// - /// `expected` is a vector of bools, where true means the row group should - /// be kept, and false means it should be pruned. - /// - // TODO refactor other tests to use this to reduce boilerplate - fn prune_with_expr( - expr: Expr, - schema: &SchemaRef, - statistics: &TestStatistics, - expected: &[bool], - ) { - println!("Pruning with expr: {expr}"); - let expr = logical2physical(&expr, schema); - let p = PruningPredicate::try_new(expr, Arc::<Schema>::clone(schema)).unwrap(); - let result = p.prune(statistics).unwrap(); - assert_eq!(result, expected); - } - - fn test_build_predicate_expression( - expr: &Expr, - schema: &Schema, - required_columns: &mut RequiredColumns, - ) -> Arc<dyn PhysicalExpr> { - let expr = logical2physical(expr, schema); - let unhandled_hook = Arc::new(ConstantUnhandledPredicateHook::default()) as _; - build_predicate_expression(&expr, schema, required_columns, &unhandled_hook) - } - - #[test] - fn test_build_predicate_expression_with_false() { - let expr = lit(ScalarValue::Boolean(Some(false))); - let schema = Schema::empty(); - let res = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - let expected = logical2physical(&expr, &schema); - assert_eq!(&res, &expected); - } - - #[test] - fn test_build_predicate_expression_with_and_false() { - let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]); - let expr = and( - col("c1").eq(lit("a")), - lit(ScalarValue::Boolean(Some(false))), - ); - let res = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - let expected = logical2physical(&lit(ScalarValue::Boolean(Some(false))), &schema); - assert_eq!(&res, &expected); - } - - #[test] - fn test_build_predicate_expression_with_or_false() { - let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]); - let left_expr = col("c1").eq(lit("a")); - let right_expr = lit(ScalarValue::Boolean(Some(false))); - let res = test_build_predicate_expression( - &or(left_expr.clone(), right_expr.clone()), - &schema, - &mut RequiredColumns::new(), - ); - let expected = - "c1_null_count@2 != row_count@3 AND c1_min@0 <= a AND a <= c1_max@1"; - assert_eq!(res.to_string(), expected); - } -} diff --git a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs new file mode 100644 index 0000000000000..95dc5888d36b8 --- /dev/null +++ b/datafusion/pruning/src/pruning_predicate.rs @@ -0,0 +1,5130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`PruningPredicate`] to apply filter [`Expr`] to prune "containers" +//! based on statistics (e.g. Parquet Row Groups) +//! +//! [`Expr`]: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.Expr.html +use std::collections::HashSet; +use std::sync::Arc; + +use arrow::array::AsArray; +use arrow::{ + array::{new_null_array, ArrayRef, BooleanArray}, + datatypes::{DataType, Field, Schema, SchemaRef}, + record_batch::{RecordBatch, RecordBatchOptions}, +}; +// pub use for backwards compatibility +pub use datafusion_common::pruning::PruningStatistics; +use log::{debug, trace}; + +use datafusion_common::error::{DataFusionError, Result}; +use datafusion_common::tree_node::TransformedResult; +use datafusion_common::{ + internal_err, plan_datafusion_err, plan_err, + tree_node::{Transformed, TreeNode}, + ScalarValue, +}; +use datafusion_common::{Column, DFSchema}; +use datafusion_expr_common::operator::Operator; +use datafusion_physical_expr::utils::{collect_columns, Guarantee, LiteralGuarantee}; +use datafusion_physical_expr::{expressions as phys_expr, PhysicalExprRef}; +use datafusion_physical_expr_common::physical_expr::snapshot_physical_expr; +use datafusion_physical_plan::{ColumnarValue, PhysicalExpr}; + +/// Used to prove that arbitrary predicates (boolean expression) can not +/// possibly evaluate to `true` given information about a column provided by +/// [`PruningStatistics`]. +/// +/// # Introduction +/// +/// `PruningPredicate` analyzes filter expressions using statistics such as +/// min/max values and null counts, attempting to prove a "container" (e.g. +/// Parquet Row Group) can be skipped without reading the actual data, +/// potentially leading to significant performance improvements. +/// +/// For example, `PruningPredicate`s are used to prune Parquet Row Groups based +/// on the min/max values found in the Parquet metadata. If the +/// `PruningPredicate` can prove that the filter can never evaluate to `true` +/// for any row in the Row Group, the entire Row Group is skipped during query +/// execution. +/// +/// The `PruningPredicate` API is general, and can be used for pruning other +/// types of containers (e.g. files) based on statistics that may be known from +/// external catalogs (e.g. Delta Lake) or other sources. How this works is a +/// subtle topic. See the Background and Implementation section for details. +/// +/// `PruningPredicate` supports: +/// +/// 1. Arbitrary expressions (including user defined functions) +/// +/// 2. Vectorized evaluation (provide more than one set of statistics at a time) +/// so it is suitable for pruning 1000s of containers. +/// +/// 3. Any source of information that implements the [`PruningStatistics`] trait +/// (not just Parquet metadata). +/// +/// # Example +/// +/// See the [`pruning.rs` example in the `datafusion-examples`] for a complete +/// example of how to use `PruningPredicate` to prune files based on min/max +/// values. 
+/// +/// [`pruning.rs` example in the `datafusion-examples`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/pruning.rs +/// +/// Given an expression like `x = 5` and statistics for 3 containers (Row +/// Groups, files, etc) `A`, `B`, and `C`: +/// +/// ```text +/// A: {x_min = 0, x_max = 4} +/// B: {x_min = 2, x_max = 10} +/// C: {x_min = 5, x_max = 8} +/// ``` +/// +/// `PruningPredicate` will conclude that the rows in container `A` can never +/// be true (as the maximum value is only `4`), so it can be pruned: +/// +/// ```text +/// A: false (no rows could possibly match x = 5) +/// B: true (rows might match x = 5) +/// C: true (rows might match x = 5) +/// ``` +/// +/// See [`PruningPredicate::try_new`] and [`PruningPredicate::prune`] for more information. +/// +/// # Background +/// +/// ## Boolean Tri-state logic +/// +/// To understand the details of the rest of this documentation, it is important +/// to understand how the tri-state boolean logic in SQL works. As this is +/// somewhat esoteric, we review it here. +/// +/// SQL has a notion of `NULL` that represents the value is `“unknown”` and this +/// uncertainty propagates through expressions. SQL `NULL` behaves very +/// differently than the `NULL` in most other languages where it is a special, +/// sentinel value (e.g. `0` in `C/C++`). While representing uncertainty with +/// `NULL` is powerful and elegant, SQL `NULL`s are often deeply confusing when +/// first encountered as they behave differently than most programmers may +/// expect. +/// +/// In most other programming languages, +/// * `a == NULL` evaluates to `true` if `a` also had the value `NULL` +/// * `a == NULL` evaluates to `false` if `a` has any other value +/// +/// However, in SQL `a = NULL` **always** evaluates to `NULL` (never `true` or +/// `false`): +/// +/// Expression | Result +/// ------------- | --------- +/// `1 = NULL` | `NULL` +/// `NULL = NULL` | `NULL` +/// +/// Also important is how `AND` and `OR` works with tri-state boolean logic as +/// (perhaps counterintuitively) the result is **not** always NULL. While +/// consistent with the notion of `NULL` representing “unknown”, this is again, +/// often deeply confusing 🤯 when first encountered. +/// +/// Expression | Result | Intuition +/// --------------- | --------- | ----------- +/// `NULL AND true` | `NULL` | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change +/// `NULL AND false` | `false` | If the `NULL` was either `true` or `false` the overall expression is still `false` +/// `NULL AND NULL` | `NULL` | +/// +/// Expression | Result | Intuition +/// --------------- | --------- | ---------- +/// `NULL OR true` | `true` | If the `NULL` was either `true` or `false` the overall expression is still `true` +/// `NULL OR false` | `NULL` | The `NULL` stands for “unknown” and if it were `true` or `false` the overall expression value could change +/// `NULL OR NULL` | `NULL` | +/// +/// ## SQL Filter Semantics +/// +/// The SQL `WHERE` clause has a boolean expression, often called a filter or +/// predicate. The semantics of this predicate are that the query evaluates the +/// predicate for each row in the input tables and: +/// +/// * Rows that evaluate to `true` are returned in the query results +/// +/// * Rows that evaluate to `false` are not returned (“filtered out” or “pruned” or “skipped”). +/// +/// * Rows that evaluate to `NULL` are **NOT** returned (also “filtered out”). 
+/// Note: *this treatment of `NULL` is **DIFFERENT** than how `NULL` is treated +/// in the rewritten predicate described below.* +/// +/// # `PruningPredicate` Implementation +/// +/// Armed with the information in the Background section, we can now understand +/// how the `PruningPredicate` logic works. +/// +/// ## Interface +/// +/// **Inputs** +/// 1. An input schema describing what columns exist +/// +/// 2. A predicate (expression that evaluates to a boolean) +/// +/// 3. [`PruningStatistics`] that provides information about columns in that +/// schema, for multiple “containers”. For each column in each container, it +/// provides optional information on contained values, min_values, max_values, +/// null_counts, and row_counts. +/// +/// **Outputs**: +/// A (non null) boolean value for each container: +/// * `true`: There MAY be rows that match the predicate +/// +/// * `false`: There are no rows that could possibly match the predicate (the +/// predicate can never possibly be true). The container can be pruned (skipped) +/// entirely. +/// +/// While `PruningPredicate` will never return a `NULL` value, the +/// rewritten predicate (as returned by `build_predicate_expression` and used internally +/// by `PruningPredicate`) may evaluate to `NULL` when some of the min/max values +/// or null / row counts are not known. +/// +/// In order to be correct, `PruningPredicate` must return false +/// **only** if it can determine that for all rows in the container, the +/// predicate could never evaluate to `true` (always evaluates to either `NULL` +/// or `false`). +/// +/// ## Contains Analysis and Min/Max Rewrite +/// +/// `PruningPredicate` works by first analyzing the predicate to see what +/// [`LiteralGuarantee`]s must hold for the predicate to be true. +/// +/// Then, the `PruningPredicate` rewrites the original predicate into an +/// expression that references the min/max values of each column in the original +/// predicate. +/// +/// When the min/max values are actually substituted into this expression and +/// evaluated, the result means: +/// +/// * `true`: there MAY be rows that pass the predicate, **KEEPS** the container +/// +/// * `NULL`: there MAY be rows that pass the predicate, **KEEPS** the container +/// Note that the rewritten predicate can evaluate to NULL when some of +/// the min/max values are not known. *Note that this is different than +/// the SQL filter semantics where `NULL` means the row is filtered +/// out.* +/// +/// * `false`: there are no rows that could possibly match the predicate, +/// **PRUNES** the container +/// +/// For example, given a column `x`, the `x_min`, `x_max`, `x_null_count`, and +/// `x_row_count` represent the minimum and maximum values, the null count of +/// column `x`, and the row count of column `x`, provided by the `PruningStatistics`. +/// `x_null_count` and `x_row_count` are used to handle the case where the column `x` +/// is known to be all `NULL`s. Note this is different from knowing nothing about +/// the column `x`, which confusingly is encoded by returning `NULL` for the min/max +/// values from [`PruningStatistics::max_values`] and [`PruningStatistics::min_values`].
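To make this encoding concrete, here is a small illustrative sketch (all container values are invented, and the variable names are not part of any API) of the per-container arrays a `PruningStatistics` implementation might hand back for a column `x`, using the standard Arrow array constructors:

```rust
use std::sync::Arc;
use arrow::array::{ArrayRef, Int32Array, UInt64Array};

fn main() {
    // One entry per container. A null min/max entry means "unknown",
    // NOT "all values are NULL" -- the all-NULL case is signaled by
    // null_count == row_count instead.
    let x_min: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(5)]));
    let x_max: ArrayRef = Arc::new(Int32Array::from(vec![Some(100), None, Some(8)]));
    // Container 1 has unknown min/max, but its counts show it is entirely
    // NULL (10 nulls out of 10 rows), so `x = 5` is provably false there.
    let x_null_count: ArrayRef = Arc::new(UInt64Array::from(vec![Some(0), Some(10), Some(0)]));
    let x_row_count: ArrayRef = Arc::new(UInt64Array::from(vec![Some(20), Some(10), Some(12)]));

    // Every statistics array must cover the same number of containers
    assert_eq!(x_min.len(), 3);
    assert_eq!((x_max.len(), x_null_count.len(), x_row_count.len()), (3, 3, 3));
}
```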
+/// +/// Here are some examples of the rewritten predicates: +/// +/// Original Predicate | Rewritten Predicate +/// ------------------ | -------------------- +/// `x = 5` | `x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max)` +/// `x < 5` | `x_null_count != x_row_count AND (x_min < 5)` +/// `x = 5 AND y = 10` | `x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max) AND y_null_count != y_row_count AND (y_min <= 10 AND 10 <= y_max)` +/// `x IS NULL` | `x_null_count > 0` +/// `x IS NOT NULL` | `x_null_count != x_row_count` +/// `CAST(x as int) = 5` | `x_null_count != x_row_count AND (CAST(x_min as int) <= 5 AND 5 <= CAST(x_max as int))` +/// +/// ## Predicate Evaluation +/// The PruningPredicate works in two passes +/// +/// **First pass**: For each `LiteralGuarantee` calls +/// [`PruningStatistics::contained`] and rules out containers where the +/// LiteralGuarantees are not satisfied +/// +/// **Second Pass**: Evaluates the rewritten expression using the +/// min/max/null_counts/row_counts values for each column for each container. For any +/// container that this expression evaluates to `false`, it rules out those +/// containers. +/// +/// +/// ### Example 1 +/// +/// Given the predicate, `x = 5 AND y = 10`, the rewritten predicate would look like: +/// +/// ```sql +/// x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max) +/// AND +/// y_null_count != y_row_count AND (y_min <= 10 AND 10 <= y_max) +/// ``` +/// +/// If we know that for a given container `x` is between `1` and `100`, that `y` is +/// between `4` and `7`, and nothing about the null counts and row counts of +/// `x` and `y`, the input statistics might look like: +/// +/// Column | Value +/// -------- | ----- +/// `x_min` | `1` +/// `x_max` | `100` +/// `x_null_count` | `null` +/// `x_row_count` | `null` +/// `y_min` | `4` +/// `y_max` | `7` +/// `y_null_count` | `null` +/// `y_row_count` | `null` +/// +/// When these statistics values are substituted into the rewritten predicate and +/// simplified, the result is `false`: +/// +/// * `null != null AND (1 <= 5 AND 5 <= 100) AND null != null AND (4 <= 10 AND 10 <= 7)` +/// * `null != null` is `null`, which is not `false`, so the AND moves on to the next clause +/// * `null and (1 <= 5 AND 5 <= 100) AND null AND (4 <= 10 AND 10 <= 7)` +/// * evaluating the clauses further we get: +/// * `null and true and null and false` +/// * `null and false` +/// * `false` +/// +/// Returning `false` means the container can be pruned, which matches the +/// intuition that `x = 5 AND y = 10` can’t be true for any row if all values of `y` +/// are `7` or less. +/// +/// Note that if we had ended up with `null AND true AND null AND true` the result +/// would have been `null`. +/// `null` is treated the same as `true`, because we can't prove that the predicate is `false`. +/// +/// If, for some other container, we knew `y` was between the values `4` and +/// `15`, then the rewritten predicate evaluates to `true` (verifying this is +/// left as an exercise to the reader -- are you still here?), and the container +/// **could not** be pruned. The intuition is that there may be rows where the +/// predicate *might* evaluate to `true`, and the only way to find out is to do +/// more analysis, for example by actually reading the data and evaluating the +/// predicate row by row.
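The tri-state arithmetic in the walk-through above can be sketched in plain Rust, with `Option<bool>` standing in for SQL booleans (`None` playing the role of SQL `NULL`). This is only an illustration of the logic, not the actual implementation, which evaluates the rewritten expression on Arrow arrays:

```rust
/// SQL-style three-valued AND: `None` represents SQL NULL.
fn and_kleene(a: Option<bool>, b: Option<bool>) -> Option<bool> {
    match (a, b) {
        // `false` dominates: the conjunction is false no matter what NULL was
        (Some(false), _) | (_, Some(false)) => Some(false),
        (Some(true), Some(true)) => Some(true),
        // any remaining NULL input keeps the result unknown
        _ => None,
    }
}

fn main() {
    // Example 1: x in [1, 100], y in [4, 7], null/row counts unknown
    let x_counts_ok = None;        // null != null        => NULL
    let x_range_ok = Some(true);   // 1 <= 5 AND 5 <= 100 => true
    let y_counts_ok = None;        // null != null        => NULL
    let y_range_ok = Some(false);  // 10 <= 7             => false
    let keep = [x_counts_ok, x_range_ok, y_counts_ok, y_range_ok]
        .into_iter()
        .fold(Some(true), and_kleene);
    // `false` proves no row can match `x = 5 AND y = 10`: prune the container
    assert_eq!(keep, Some(false));
}
```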
+/// +/// ### Example 2 +/// +/// Given the same predicate, `x = 5 AND y = 10`, the rewritten predicate would +/// look the same as in Example 1: +/// +/// ```sql +/// x_null_count != x_row_count AND (x_min <= 5 AND 5 <= x_max) +/// AND +/// y_null_count != y_row_count AND (y_min <= 10 AND 10 <= y_max) +/// ``` +/// +/// If we know that for another given container, `x_min` is NULL and `x_max` is +/// NULL (the min/max values are unknown), `x_null_count` is `100` and `x_row_count` +/// is `100`; we know that `y` is between `4` and `7`, but we know nothing about +/// the null count and row count of `y`. The input statistics might look like: +/// +/// Column | Value +/// -------- | ----- +/// `x_min` | `null` +/// `x_max` | `null` +/// `x_null_count` | `100` +/// `x_row_count` | `100` +/// `y_min` | `4` +/// `y_max` | `7` +/// `y_null_count` | `null` +/// `y_row_count` | `null` +/// +/// When these statistics values are substituted into the rewritten predicate and +/// simplified, the result is `false`: +/// +/// * `100 != 100 AND (null <= 5 AND 5 <= null) AND null != null AND (4 <= 10 AND 10 <= 7)` +/// * `false AND null AND null AND false` +/// * `false AND false` +/// * `false` +/// +/// Returning `false` means the container can be pruned, which matches the +/// intuition that `x = 5 AND y = 10` can’t be true because all values in `x` +/// are known to be NULL. +/// +/// # Related Work +/// +/// [`PruningPredicate`] implements the type of min/max pruning described in +/// Section `3.3.3` of the [`Snowflake SIGMOD Paper`]. The technique is +/// described by various research such as [small materialized aggregates], [zone +/// maps], and [data skipping]. +/// +/// [`Snowflake SIGMOD Paper`]: https://dl.acm.org/doi/10.1145/2882903.2903741 +/// [small materialized aggregates]: https://www.vldb.org/conf/1998/p476.pdf +/// [zone maps]: https://dl.acm.org/doi/10.1007/978-3-642-03730-6_10 +/// [data skipping]: https://dl.acm.org/doi/10.1145/2588555.2610515 +#[derive(Debug, Clone)] +pub struct PruningPredicate { + /// The input schema against which the predicate will be evaluated + schema: SchemaRef, + /// A min/max pruning predicate (rewritten in terms of column min/max + /// values, which are supplied by statistics) + predicate_expr: Arc<dyn PhysicalExpr>, + /// Description of which statistics are required to evaluate `predicate_expr` + required_columns: RequiredColumns, + /// Original physical predicate from which this predicate expr is derived + /// (required for serialization) + orig_expr: Arc<dyn PhysicalExpr>, + /// [`LiteralGuarantee`]s used to try and prove a predicate can not possibly + /// evaluate to `true`. + /// + /// See [`PruningPredicate::literal_guarantees`] for more details. + literal_guarantees: Vec<LiteralGuarantee>, +} + +/// Rewrites predicates that [`PredicateRewriter`] can not handle, e.g. certain +/// complex expressions or predicates that reference columns that are not in the +/// schema. +pub trait UnhandledPredicateHook { + /// Called when a predicate can not be rewritten in terms of statistics or + /// references a column that is not in the schema.
+    fn handle(&self, expr: &Arc<dyn PhysicalExpr>) -> Arc<dyn PhysicalExpr>;
+}
+
+/// The default handling for unhandled predicates is to return a constant `true`
+/// (meaning don't prune the container)
+#[derive(Debug, Clone)]
+struct ConstantUnhandledPredicateHook {
+    default: Arc<dyn PhysicalExpr>,
+}
+
+impl Default for ConstantUnhandledPredicateHook {
+    fn default() -> Self {
+        Self {
+            default: Arc::new(phys_expr::Literal::new(ScalarValue::from(true))),
+        }
+    }
+}
+
+impl UnhandledPredicateHook for ConstantUnhandledPredicateHook {
+    fn handle(&self, _expr: &Arc<dyn PhysicalExpr>) -> Arc<dyn PhysicalExpr> {
+        Arc::clone(&self.default)
+    }
+}
+
+impl PruningPredicate {
+    /// Try to create a new instance of [`PruningPredicate`]
+    ///
+    /// This will translate the provided `expr` filter expression into
+    /// a *pruning predicate*.
+    ///
+    /// A pruning predicate is one that has been rewritten in terms of
+    /// the min and max values of column references and that evaluates
+    /// to FALSE if the filter predicate would evaluate FALSE *for
+    /// every row* whose values fell within the min / max ranges (aka
+    /// could be pruned).
+    ///
+    /// The pruning predicate evaluates to TRUE or NULL
+    /// if the filter predicate *might* evaluate to TRUE for at least
+    /// one row whose values fell within the min/max ranges (in other
+    /// words they might pass the predicate)
+    ///
+    /// For example, the filter expression `(column / 2) = 4` becomes
+    /// the pruning predicate
+    /// `(column_min / 2) <= 4 && 4 <= (column_max / 2)`
+    ///
+    /// See the struct level documentation on [`PruningPredicate`] for more
+    /// details.
+    pub fn try_new(expr: Arc<dyn PhysicalExpr>, schema: SchemaRef) -> Result<Self> {
+        // Get a (simpler) snapshot of the physical expr here to use with `PruningPredicate`
+        // which does not handle dynamic exprs in general
+        let expr = snapshot_physical_expr(expr)?;
+        let unhandled_hook = Arc::new(ConstantUnhandledPredicateHook::default()) as _;
+
+        // build predicate expression once
+        let mut required_columns = RequiredColumns::new();
+        let predicate_expr = build_predicate_expression(
+            &expr,
+            schema.as_ref(),
+            &mut required_columns,
+            &unhandled_hook,
+        );
+
+        let literal_guarantees = LiteralGuarantee::analyze(&expr);
+
+        Ok(Self {
+            schema,
+            predicate_expr,
+            required_columns,
+            orig_expr: expr,
+            literal_guarantees,
+        })
+    }
+
+    /// For each set of statistics, evaluates the pruning predicate
+    /// and returns a `bool` with the following meaning for
+    /// all rows whose values match the statistics:
+    ///
+    /// `true`: There MAY be rows that match the predicate
+    ///
+    /// `false`: There are no rows that could possibly match the predicate
+    ///
+    /// Note: the predicate passed to `prune` should already be simplified as
+    /// much as possible (e.g. this pass doesn't handle some
+    /// expressions like `b = false`, but it does handle the
+    /// simplified version `b`). See [`ExprSimplifier`] to simplify expressions.
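+    ///
+    /// A minimal usage sketch (assuming a `stats` value implementing
+    /// [`PruningStatistics`], e.g. built from Parquet row group metadata):
+    ///
+    /// ```rust,ignore
+    /// // one bool per container: false means the container can be skipped
+    /// let keep: Vec<bool> = pruning_predicate.prune(&stats)?;
+    /// let selected_containers: Vec<usize> = keep
+    ///     .iter()
+    ///     .enumerate()
+    ///     .filter_map(|(idx, keep)| keep.then_some(idx))
+    ///     .collect();
+    /// ```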
+    ///
+    /// [`ExprSimplifier`]: https://docs.rs/datafusion/latest/datafusion/optimizer/simplify_expressions/struct.ExprSimplifier.html
+    pub fn prune<S: PruningStatistics>(
+        &self,
+        statistics: &S,
+    ) -> Result<Vec<bool>> {
+        let mut builder = BoolVecBuilder::new(statistics.num_containers());
+
+        // Try to prove the predicate can't be true for the containers based on
+        // literal guarantees
+        for literal_guarantee in &self.literal_guarantees {
+            let LiteralGuarantee {
+                column,
+                guarantee,
+                literals,
+            } = literal_guarantee;
+            if let Some(results) = statistics.contained(column, literals) {
+                match guarantee {
+                    // `In` means the values in the column must be one of the
+                    // values in the set for the predicate to evaluate to true.
+                    // If `contained` returns false, that means the column is
+                    // not any of the values so we can prune the container
+                    Guarantee::In => builder.combine_array(&results),
+                    // `NotIn` means the values in the column must not be
+                    // any of the values in the set for the predicate to
+                    // evaluate to true. If `contained` returns true, it means the
+                    // column is only in the set of values so we can prune the
+                    // container
+                    Guarantee::NotIn => {
+                        builder.combine_array(&arrow::compute::not(&results)?)
+                    }
+                }
+                // If all containers are pruned (have rows that DEFINITELY DO NOT pass the predicate)
+                // we can return early without evaluating the rest of the predicates.
+                if builder.check_all_pruned() {
+                    return Ok(builder.build());
+                }
+            }
+        }
+
+        // Next, try to prove the predicate can't be true for the containers based
+        // on min/max values
+
+        // build a RecordBatch that contains the min/max values in the
+        // appropriate statistics columns for the min/max predicate
+        let statistics_batch =
+            build_statistics_record_batch(statistics, &self.required_columns)?;
+
+        // Evaluate the pruning predicate on that record batch and append any results to the builder
+        builder.combine_value(self.predicate_expr.evaluate(&statistics_batch)?);
+
+        Ok(builder.build())
+    }
+
+    /// Return a reference to the input schema
+    pub fn schema(&self) -> &SchemaRef {
+        &self.schema
+    }
+
+    /// Returns a reference to the physical expr used to construct this pruning predicate
+    pub fn orig_expr(&self) -> &Arc<dyn PhysicalExpr> {
+        &self.orig_expr
+    }
+
+    /// Returns a reference to the predicate expr
+    pub fn predicate_expr(&self) -> &Arc<dyn PhysicalExpr> {
+        &self.predicate_expr
+    }
+
+    /// Returns a reference to the literal guarantees
+    ///
+    /// Note that **all** `LiteralGuarantee`s must be satisfied for the
+    /// expression to possibly be `true`. If any is not satisfied, the
+    /// expression is guaranteed to be `null` or `false`.
+    pub fn literal_guarantees(&self) -> &[LiteralGuarantee] {
+        &self.literal_guarantees
+    }
+
+    /// Returns true if this pruning predicate can not prune anything.
+    ///
+    /// This happens if the predicate is a literal `true` and
+    /// literal_guarantees is empty.
+    ///
+    /// This can happen when a predicate is simplified to a constant `true`
+    pub fn always_true(&self) -> bool {
+        is_always_true(&self.predicate_expr) && self.literal_guarantees.is_empty()
+    }
+
+    // this is only used by the `parquet` feature right now
+    #[allow(dead_code)]
+    pub fn required_columns(&self) -> &RequiredColumns {
+        &self.required_columns
+    }
+
+    /// Names of the columns that are known to be / not be in a set
+    /// of literals (constants). These are the columns that may be passed to
+    /// [`PruningStatistics::contained`] during pruning.
+    ///
+    /// This is useful to avoid fetching statistics for columns that will not be
+    /// used in the predicate. For example, it can be used to avoid reading
+    /// unneeded bloom filters (a non trivial operation).
+    pub fn literal_columns(&self) -> Vec<String> {
+        let mut seen = HashSet::new();
+        self.literal_guarantees
+            .iter()
+            .map(|e| &e.column.name)
+            // avoid duplicates
+            .filter(|name| seen.insert(*name))
+            .map(|s| s.to_string())
+            .collect()
+    }
+}
+
+/// Builds the return `Vec<bool>` for [`PruningPredicate::prune`].
+#[derive(Debug)]
+struct BoolVecBuilder {
+    /// One element per container. Each element is
+    /// * `true`: if the container has rows that may pass the predicate
+    /// * `false`: if the container has rows that DEFINITELY DO NOT pass the predicate
+    inner: Vec<bool>,
+}
+
+impl BoolVecBuilder {
+    /// Create a new `BoolVecBuilder` with `num_containers` elements
+    fn new(num_containers: usize) -> Self {
+        Self {
+            // assume by default all containers may pass the predicate
+            inner: vec![true; num_containers],
+        }
+    }
+
+    /// Combines the result `array` for a conjunct (e.g. `AND` clause) of a
+    /// predicate into the currently in progress array.
+    ///
+    /// Each `array` element is:
+    /// * `true`: container has rows that may pass the predicate
+    /// * `false`: all container rows DEFINITELY DO NOT pass the predicate
+    /// * `null`: container may or may not have rows that pass the predicate
+    fn combine_array(&mut self, array: &BooleanArray) {
+        assert_eq!(array.len(), self.inner.len());
+        for (cur, new) in self.inner.iter_mut().zip(array.iter()) {
+            // `false` for this conjunct means we know for sure no rows could
+            // pass the predicate and thus we set the corresponding container
+            // location to false.
+            if let Some(false) = new {
+                *cur = false;
+            }
+        }
+    }
+
+    /// Combines the results in the [`ColumnarValue`] to the currently in
+    /// progress array, following the same rules as [`Self::combine_array`].
+    ///
+    /// # Panics
+    /// If `value` is not boolean
+    fn combine_value(&mut self, value: ColumnarValue) {
+        match value {
+            ColumnarValue::Array(array) => {
+                self.combine_array(array.as_boolean());
+            }
+            ColumnarValue::Scalar(ScalarValue::Boolean(Some(false))) => {
+                // False means no containers can pass the predicate
+                self.inner = vec![false; self.inner.len()];
+            }
+            _ => {
+                // Null or true means the rows in the container may pass this
+                // conjunct so we can't prune any containers based on that
+            }
+        }
+    }
+
+    /// Convert this builder into a Vec of bools
+    fn build(self) -> Vec<bool> {
+        self.inner
+    }
+
+    /// Check whether all containers have rows that DEFINITELY DO NOT pass the predicate
+    fn check_all_pruned(&self) -> bool {
+        self.inner.iter().all(|&x| !x)
+    }
+}
+
+fn is_always_true(expr: &Arc<dyn PhysicalExpr>) -> bool {
+    expr.as_any()
+        .downcast_ref::<phys_expr::Literal>()
+        .map(|l| matches!(l.value(), ScalarValue::Boolean(Some(true))))
+        .unwrap_or_default()
+}
+
+fn is_always_false(expr: &Arc<dyn PhysicalExpr>) -> bool {
+    expr.as_any()
+        .downcast_ref::<phys_expr::Literal>()
+        .map(|l| matches!(l.value(), ScalarValue::Boolean(Some(false))))
+        .unwrap_or_default()
+}
+
+/// Describes which columns' statistics are necessary to evaluate a
+/// [`PruningPredicate`].
+///
+/// This structure permits reading and creating the minimum number of statistics,
+/// which is important since statistics may be non trivial to read (e.g. large
+/// strings or when there are 1000s of columns).
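+///
+/// For example (illustrative), pruning `x = 5` records one entry per required
+/// statistic, roughly:
+///
+/// ```text
+/// (x, Min,       Field(x_min))
+/// (x, Max,       Field(x_max))
+/// (x, NullCount, Field(x_null_count))
+/// (x, RowCount,  Field(row_count))
+/// ```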
+/// +/// Handles creating references to the min/max statistics +/// for columns as well as recording which statistics are needed +#[derive(Debug, Default, Clone)] +pub struct RequiredColumns { + /// The statistics required to evaluate this predicate: + /// * The unqualified column in the input schema + /// * Statistics type (e.g. Min or Max or Null_Count) + /// * The field the statistics value should be placed in for + /// pruning predicate evaluation (e.g. `min_value` or `max_value`) + columns: Vec<(phys_expr::Column, StatisticsType, Field)>, +} + +impl RequiredColumns { + fn new() -> Self { + Self::default() + } + + /// Returns Some(column) if this is a single column predicate. + /// + /// Returns None if this is a multi-column predicate. + /// + /// Examples: + /// * `a > 5 OR a < 10` returns `Some(a)` + /// * `a > 5 OR b < 10` returns `None` + /// * `true` returns None + #[allow(dead_code)] + // this fn is only used by `parquet` feature right now, thus the `allow(dead_code)` + pub fn single_column(&self) -> Option<&phys_expr::Column> { + if self.columns.windows(2).all(|w| { + // check if all columns are the same (ignoring statistics and field) + let c1 = &w[0].0; + let c2 = &w[1].0; + c1 == c2 + }) { + self.columns.first().map(|r| &r.0) + } else { + None + } + } + + /// Returns an iterator over items in columns (see doc on + /// `self.columns` for details) + pub(crate) fn iter( + &self, + ) -> impl Iterator { + self.columns.iter() + } + + fn find_stat_column( + &self, + column: &phys_expr::Column, + statistics_type: StatisticsType, + ) -> Option { + match statistics_type { + StatisticsType::RowCount => { + // Use the first row count we find, if any + self.columns + .iter() + .enumerate() + .find(|(_i, (_c, t, _f))| t == &statistics_type) + .map(|(i, (_c, _t, _f))| i) + } + _ => self + .columns + .iter() + .enumerate() + .find(|(_i, (c, t, _f))| c == column && t == &statistics_type) + .map(|(i, (_c, _t, _f))| i), + } + } + + /// Rewrites column_expr so that all appearances of column + /// are replaced with a reference to either the min or max + /// statistics column, while keeping track that a reference to the statistics + /// column is required + /// + /// for example, an expression like `col("foo") > 5`, when called + /// with Max would result in an expression like `col("foo_max") > + /// 5` with the appropriate entry noted in self.columns + fn stat_column_expr( + &mut self, + column: &phys_expr::Column, + column_expr: &Arc, + field: &Field, + stat_type: StatisticsType, + ) -> Result> { + let (idx, need_to_insert) = match self.find_stat_column(column, stat_type) { + Some(idx) => (idx, false), + None => (self.columns.len(), true), + }; + + let column_name = column.name(); + let stat_column_name = match stat_type { + StatisticsType::Min => format!("{column_name}_min"), + StatisticsType::Max => format!("{column_name}_max"), + StatisticsType::NullCount => format!("{column_name}_null_count"), + StatisticsType::RowCount => "row_count".to_string(), + }; + + let stat_column = phys_expr::Column::new(&stat_column_name, idx); + + // only add statistics column if not previously added + if need_to_insert { + // may be null if statistics are not present + let nullable = true; + let stat_field = + Field::new(stat_column.name(), field.data_type().clone(), nullable); + self.columns.push((column.clone(), stat_type, stat_field)); + } + rewrite_column_expr(Arc::clone(column_expr), column, &stat_column) + } + + /// rewrite col --> col_min + fn min_column_expr( + &mut self, + column: &phys_expr::Column, + 
column_expr: &Arc, + field: &Field, + ) -> Result> { + self.stat_column_expr(column, column_expr, field, StatisticsType::Min) + } + + /// rewrite col --> col_max + fn max_column_expr( + &mut self, + column: &phys_expr::Column, + column_expr: &Arc, + field: &Field, + ) -> Result> { + self.stat_column_expr(column, column_expr, field, StatisticsType::Max) + } + + /// rewrite col --> col_null_count + fn null_count_column_expr( + &mut self, + column: &phys_expr::Column, + column_expr: &Arc, + field: &Field, + ) -> Result> { + self.stat_column_expr(column, column_expr, field, StatisticsType::NullCount) + } + + /// rewrite col --> col_row_count + fn row_count_column_expr( + &mut self, + column: &phys_expr::Column, + column_expr: &Arc, + field: &Field, + ) -> Result> { + self.stat_column_expr(column, column_expr, field, StatisticsType::RowCount) + } +} + +impl From> for RequiredColumns { + fn from(columns: Vec<(phys_expr::Column, StatisticsType, Field)>) -> Self { + Self { columns } + } +} + +/// Build a RecordBatch from a list of statistics, creating arrays, +/// with one row for each PruningStatistics and columns specified in +/// in the required_columns parameter. +/// +/// For example, if the requested columns are +/// ```text +/// ("s1", Min, Field:s1_min) +/// ("s2", Max, field:s2_max) +///``` +/// +/// And the input statistics had +/// ```text +/// S1(Min: 5, Max: 10) +/// S2(Min: 99, Max: 1000) +/// S3(Min: 1, Max: 2) +/// ``` +/// +/// Then this function would build a record batch with 2 columns and +/// one row s1_min and s2_max as follows (s3 is not requested): +/// +/// ```text +/// s1_min | s2_max +/// -------+-------- +/// 5 | 1000 +/// ``` +fn build_statistics_record_batch( + statistics: &S, + required_columns: &RequiredColumns, +) -> Result { + let mut fields = Vec::::new(); + let mut arrays = Vec::::new(); + // For each needed statistics column: + for (column, statistics_type, stat_field) in required_columns.iter() { + let column = Column::from_name(column.name()); + let data_type = stat_field.data_type(); + + let num_containers = statistics.num_containers(); + + let array = match statistics_type { + StatisticsType::Min => statistics.min_values(&column), + StatisticsType::Max => statistics.max_values(&column), + StatisticsType::NullCount => statistics.null_counts(&column), + StatisticsType::RowCount => statistics.row_counts(&column), + }; + let array = array.unwrap_or_else(|| new_null_array(data_type, num_containers)); + + if num_containers != array.len() { + return internal_err!( + "mismatched statistics length. Expected {}, got {}", + num_containers, + array.len() + ); + } + + // cast statistics array to required data type (e.g. 
parquet + // provides timestamp statistics as "Int64") + let array = arrow::compute::cast(&array, data_type)?; + + fields.push(stat_field.clone()); + arrays.push(array); + } + + let schema = Arc::new(Schema::new(fields)); + // provide the count in case there were no needed statistics + let mut options = RecordBatchOptions::default(); + options.row_count = Some(statistics.num_containers()); + + trace!("Creating statistics batch for {required_columns:#?} with {arrays:#?}"); + + RecordBatch::try_new_with_options(schema, arrays, &options).map_err(|err| { + plan_datafusion_err!("Can not create statistics record batch: {err}") + }) +} + +struct PruningExpressionBuilder<'a> { + column: phys_expr::Column, + column_expr: Arc, + op: Operator, + scalar_expr: Arc, + field: &'a Field, + required_columns: &'a mut RequiredColumns, +} + +impl<'a> PruningExpressionBuilder<'a> { + fn try_new( + left: &'a Arc, + right: &'a Arc, + op: Operator, + schema: &'a Schema, + required_columns: &'a mut RequiredColumns, + ) -> Result { + // find column name; input could be a more complicated expression + let left_columns = collect_columns(left); + let right_columns = collect_columns(right); + let (column_expr, scalar_expr, columns, correct_operator) = + match (left_columns.len(), right_columns.len()) { + (1, 0) => (left, right, left_columns, op), + (0, 1) => (right, left, right_columns, reverse_operator(op)?), + _ => { + // if more than one column used in expression - not supported + return plan_err!( + "Multi-column expressions are not currently supported" + ); + } + }; + + let df_schema = DFSchema::try_from(schema.clone())?; + let (column_expr, correct_operator, scalar_expr) = rewrite_expr_to_prunable( + column_expr, + correct_operator, + scalar_expr, + df_schema, + )?; + let column = columns.iter().next().unwrap().clone(); + let field = match schema.column_with_name(column.name()) { + Some((_, f)) => f, + _ => { + return plan_err!("Field not found in schema"); + } + }; + + Ok(Self { + column, + column_expr, + op: correct_operator, + scalar_expr, + field, + required_columns, + }) + } + + fn op(&self) -> Operator { + self.op + } + + fn scalar_expr(&self) -> &Arc { + &self.scalar_expr + } + + fn min_column_expr(&mut self) -> Result> { + self.required_columns + .min_column_expr(&self.column, &self.column_expr, self.field) + } + + fn max_column_expr(&mut self) -> Result> { + self.required_columns + .max_column_expr(&self.column, &self.column_expr, self.field) + } + + /// This function is to simply retune the `null_count` physical expression no matter what the + /// predicate expression is + /// + /// i.e., x > 5 => x_null_count, + /// cast(x as int) < 10 => x_null_count, + /// try_cast(x as float) < 10.0 => x_null_count + fn null_count_column_expr(&mut self) -> Result> { + // Retune to [`phys_expr::Column`] + let column_expr = Arc::new(self.column.clone()) as _; + + // null_count is DataType::UInt64, which is different from the column's data type (i.e. 
self.field) + let null_count_field = &Field::new(self.field.name(), DataType::UInt64, true); + + self.required_columns.null_count_column_expr( + &self.column, + &column_expr, + null_count_field, + ) + } + + /// This function is to simply retune the `row_count` physical expression no matter what the + /// predicate expression is + /// + /// i.e., x > 5 => x_row_count, + /// cast(x as int) < 10 => x_row_count, + /// try_cast(x as float) < 10.0 => x_row_count + fn row_count_column_expr(&mut self) -> Result> { + // Retune to [`phys_expr::Column`] + let column_expr = Arc::new(self.column.clone()) as _; + + // row_count is DataType::UInt64, which is different from the column's data type (i.e. self.field) + let row_count_field = &Field::new(self.field.name(), DataType::UInt64, true); + + self.required_columns.row_count_column_expr( + &self.column, + &column_expr, + row_count_field, + ) + } +} + +/// This function is designed to rewrite the column_expr to +/// ensure the column_expr is monotonically increasing. +/// +/// For example, +/// 1. `col > 10` +/// 2. `-col > 10` should be rewritten to `col < -10` +/// 3. `!col = true` would be rewritten to `col = !true` +/// 4. `abs(a - 10) > 0` not supported +/// 5. `cast(can_prunable_expr) > 10` +/// 6. `try_cast(can_prunable_expr) > 10` +/// +/// More rewrite rules are still in progress. +fn rewrite_expr_to_prunable( + column_expr: &PhysicalExprRef, + op: Operator, + scalar_expr: &PhysicalExprRef, + schema: DFSchema, +) -> Result<(PhysicalExprRef, Operator, PhysicalExprRef)> { + if !is_compare_op(op) { + return plan_err!("rewrite_expr_to_prunable only support compare expression"); + } + + let column_expr_any = column_expr.as_any(); + + if column_expr_any + .downcast_ref::() + .is_some() + { + // `col op lit()` + Ok((Arc::clone(column_expr), op, Arc::clone(scalar_expr))) + } else if let Some(cast) = column_expr_any.downcast_ref::() { + // `cast(col) op lit()` + let arrow_schema: SchemaRef = schema.clone().into(); + let from_type = cast.expr().data_type(&arrow_schema)?; + verify_support_type_for_prune(&from_type, cast.cast_type())?; + let (left, op, right) = + rewrite_expr_to_prunable(cast.expr(), op, scalar_expr, schema)?; + let left = Arc::new(phys_expr::CastExpr::new( + left, + cast.cast_type().clone(), + None, + )); + Ok((left, op, right)) + } else if let Some(try_cast) = + column_expr_any.downcast_ref::() + { + // `try_cast(col) op lit()` + let arrow_schema: SchemaRef = schema.clone().into(); + let from_type = try_cast.expr().data_type(&arrow_schema)?; + verify_support_type_for_prune(&from_type, try_cast.cast_type())?; + let (left, op, right) = + rewrite_expr_to_prunable(try_cast.expr(), op, scalar_expr, schema)?; + let left = Arc::new(phys_expr::TryCastExpr::new( + left, + try_cast.cast_type().clone(), + )); + Ok((left, op, right)) + } else if let Some(neg) = column_expr_any.downcast_ref::() { + // `-col > lit()` --> `col < -lit()` + let (left, op, right) = + rewrite_expr_to_prunable(neg.arg(), op, scalar_expr, schema)?; + let right = Arc::new(phys_expr::NegativeExpr::new(right)); + Ok((left, reverse_operator(op)?, right)) + } else if let Some(not) = column_expr_any.downcast_ref::() { + // `!col = true` --> `col = !true` + if op != Operator::Eq && op != Operator::NotEq { + return plan_err!("Not with operator other than Eq / NotEq is not supported"); + } + if not + .arg() + .as_any() + .downcast_ref::() + .is_some() + { + let left = Arc::clone(not.arg()); + let right = Arc::new(phys_expr::NotExpr::new(Arc::clone(scalar_expr))); + Ok((left, 
reverse_operator(op)?, right)) + } else { + plan_err!("Not with complex expression {column_expr:?} is not supported") + } + } else { + plan_err!("column expression {column_expr:?} is not supported") + } +} + +fn is_compare_op(op: Operator) -> bool { + matches!( + op, + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq + | Operator::LikeMatch + | Operator::NotLikeMatch + ) +} + +fn is_string_type(data_type: &DataType) -> bool { + matches!( + data_type, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + ) +} + +// The pruning logic is based on the comparing the min/max bounds. +// Must make sure the two type has order. +// For example, casts from string to numbers is not correct. +// Because the "13" is less than "3" with UTF8 comparison order. +fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Result<()> { + // Dictionary casts are always supported as long as the value types are supported + let from_type = match from_type { + DataType::Dictionary(_, t) => { + return verify_support_type_for_prune(t.as_ref(), to_type) + } + _ => from_type, + }; + let to_type = match to_type { + DataType::Dictionary(_, t) => { + return verify_support_type_for_prune(from_type, t.as_ref()) + } + _ => to_type, + }; + // If both types are strings or both are not strings (number, timestamp, etc) + // then we can compare them. + // PruningPredicate does not support casting of strings to numbers and such. + if is_string_type(from_type) == is_string_type(to_type) { + Ok(()) + } else { + plan_err!( + "Try Cast/Cast with from type {from_type} to type {to_type} is not supported" + ) + } +} + +/// replaces a column with an old name with a new name in an expression +fn rewrite_column_expr( + e: Arc, + column_old: &phys_expr::Column, + column_new: &phys_expr::Column, +) -> Result> { + e.transform(|expr| { + if let Some(column) = expr.as_any().downcast_ref::() { + if column == column_old { + return Ok(Transformed::yes(Arc::new(column_new.clone()))); + } + } + + Ok(Transformed::no(expr)) + }) + .data() +} + +fn reverse_operator(op: Operator) -> Result { + op.swap().ok_or_else(|| { + DataFusionError::Internal(format!( + "Could not reverse operator {op} while building pruning predicate" + )) + }) +} + +/// Given a column reference to `column`, returns a pruning +/// expression in terms of the min and max that will evaluate to true +/// if the column may contain values, and false if definitely does not +/// contain values +fn build_single_column_expr( + column: &phys_expr::Column, + schema: &Schema, + required_columns: &mut RequiredColumns, + is_not: bool, // if true, treat as !col +) -> Option> { + let field = schema.field_with_name(column.name()).ok()?; + + if matches!(field.data_type(), &DataType::Boolean) { + let col_ref = Arc::new(column.clone()) as _; + + let min = required_columns + .min_column_expr(column, &col_ref, field) + .ok()?; + let max = required_columns + .max_column_expr(column, &col_ref, field) + .ok()?; + + // remember -- we want an expression that is: + // TRUE: if there may be rows that match + // FALSE: if there are no rows that match + if is_not { + // The only way we know a column couldn't match is if both the min and max are true + // !(min && max) + Some(Arc::new(phys_expr::NotExpr::new(Arc::new( + phys_expr::BinaryExpr::new(min, Operator::And, max), + )))) + } else { + // the only way we know a column couldn't match is if both the min and max are false + // !(!min && !max) --> min || max + 
Some(Arc::new(phys_expr::BinaryExpr::new(min, Operator::Or, max))) + } + } else { + None + } +} + +/// Given an expression reference to `expr`, if `expr` is a column expression, +/// returns a pruning expression in terms of IsNull that will evaluate to true +/// if the column may contain null, and false if definitely does not +/// contain null. +/// If `with_not` is true, build a pruning expression for `col IS NOT NULL`: `col_count != col_null_count` +/// The pruning expression evaluates to true ONLY if the column definitely CONTAINS +/// at least one NULL value. In this case we can know that `IS NOT NULL` can not be true and +/// thus can prune the row group / value +fn build_is_null_column_expr( + expr: &Arc, + schema: &Schema, + required_columns: &mut RequiredColumns, + with_not: bool, +) -> Option> { + if let Some(col) = expr.as_any().downcast_ref::() { + let field = schema.field_with_name(col.name()).ok()?; + + let null_count_field = &Field::new(field.name(), DataType::UInt64, true); + if with_not { + if let Ok(row_count_expr) = + required_columns.row_count_column_expr(col, expr, null_count_field) + { + required_columns + .null_count_column_expr(col, expr, null_count_field) + .map(|null_count_column_expr| { + // IsNotNull(column) => null_count != row_count + Arc::new(phys_expr::BinaryExpr::new( + null_count_column_expr, + Operator::NotEq, + row_count_expr, + )) as _ + }) + .ok() + } else { + None + } + } else { + required_columns + .null_count_column_expr(col, expr, null_count_field) + .map(|null_count_column_expr| { + // IsNull(column) => null_count > 0 + Arc::new(phys_expr::BinaryExpr::new( + null_count_column_expr, + Operator::Gt, + Arc::new(phys_expr::Literal::new(ScalarValue::UInt64(Some(0)))), + )) as _ + }) + .ok() + } + } else { + None + } +} + +/// The maximum number of entries in an `InList` that might be rewritten into +/// an OR chain +const MAX_LIST_VALUE_SIZE_REWRITE: usize = 20; + +/// Rewrite a predicate expression in terms of statistics (min/max/null_counts) +/// for use as a [`PruningPredicate`]. +pub struct PredicateRewriter { + unhandled_hook: Arc, +} + +impl Default for PredicateRewriter { + fn default() -> Self { + Self { + unhandled_hook: Arc::new(ConstantUnhandledPredicateHook::default()), + } + } +} + +impl PredicateRewriter { + /// Create a new `PredicateRewriter` + pub fn new() -> Self { + Self::default() + } + + /// Set the unhandled hook to be used when a predicate can not be rewritten + pub fn with_unhandled_hook( + self, + unhandled_hook: Arc, + ) -> Self { + Self { unhandled_hook } + } + + /// Translate logical filter expression into pruning predicate + /// expression that will evaluate to FALSE if it can be determined no + /// rows between the min/max values could pass the predicates. + /// + /// Any predicates that can not be translated will be passed to `unhandled_hook`. + /// + /// Returns the pruning predicate as an [`PhysicalExpr`] + /// + /// Notice: Does not handle [`phys_expr::InListExpr`] greater than 20, which will fall back to calling `unhandled_hook` + pub fn rewrite_predicate_to_statistics_predicate( + &self, + expr: &Arc, + schema: &Schema, + ) -> Arc { + let mut required_columns = RequiredColumns::new(); + build_predicate_expression( + expr, + schema, + &mut required_columns, + &self.unhandled_hook, + ) + } +} + +/// Translate logical filter expression into pruning predicate +/// expression that will evaluate to FALSE if it can be determined no +/// rows between the min/max values could pass the predicates. 
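+///
+/// For example, following the rules in `build_statistics_expr` and the
+/// null-count guard added by `wrap_null_count_check_expr`, `x > 7` becomes:
+///
+/// ```text
+/// x_null_count != x_row_count AND x_max > 7
+/// ```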
+/// +/// Any predicates that can not be translated will be passed to `unhandled_hook`. +/// +/// Returns the pruning predicate as an [`PhysicalExpr`] +/// +/// Notice: Does not handle [`phys_expr::InListExpr`] greater than 20, which will fall back to calling `unhandled_hook` +fn build_predicate_expression( + expr: &Arc, + schema: &Schema, + required_columns: &mut RequiredColumns, + unhandled_hook: &Arc, +) -> Arc { + if is_always_false(expr) { + // Shouldn't return `unhandled_hook.handle(expr)` + // Because it will transfer false to true. + return Arc::clone(expr); + } + // predicate expression can only be a binary expression + let expr_any = expr.as_any(); + if let Some(is_null) = expr_any.downcast_ref::() { + return build_is_null_column_expr(is_null.arg(), schema, required_columns, false) + .unwrap_or_else(|| unhandled_hook.handle(expr)); + } + if let Some(is_not_null) = expr_any.downcast_ref::() { + return build_is_null_column_expr( + is_not_null.arg(), + schema, + required_columns, + true, + ) + .unwrap_or_else(|| unhandled_hook.handle(expr)); + } + if let Some(col) = expr_any.downcast_ref::() { + return build_single_column_expr(col, schema, required_columns, false) + .unwrap_or_else(|| unhandled_hook.handle(expr)); + } + if let Some(not) = expr_any.downcast_ref::() { + // match !col (don't do so recursively) + if let Some(col) = not.arg().as_any().downcast_ref::() { + return build_single_column_expr(col, schema, required_columns, true) + .unwrap_or_else(|| unhandled_hook.handle(expr)); + } else { + return unhandled_hook.handle(expr); + } + } + if let Some(in_list) = expr_any.downcast_ref::() { + if !in_list.list().is_empty() + && in_list.list().len() <= MAX_LIST_VALUE_SIZE_REWRITE + { + let eq_op = if in_list.negated() { + Operator::NotEq + } else { + Operator::Eq + }; + let re_op = if in_list.negated() { + Operator::And + } else { + Operator::Or + }; + let change_expr = in_list + .list() + .iter() + .map(|e| { + Arc::new(phys_expr::BinaryExpr::new( + Arc::clone(in_list.expr()), + eq_op, + Arc::clone(e), + )) as _ + }) + .reduce(|a, b| Arc::new(phys_expr::BinaryExpr::new(a, re_op, b)) as _) + .unwrap(); + return build_predicate_expression( + &change_expr, + schema, + required_columns, + unhandled_hook, + ); + } else { + return unhandled_hook.handle(expr); + } + } + + let (left, op, right) = { + if let Some(bin_expr) = expr_any.downcast_ref::() { + ( + Arc::clone(bin_expr.left()), + *bin_expr.op(), + Arc::clone(bin_expr.right()), + ) + } else if let Some(like_expr) = expr_any.downcast_ref::() { + if like_expr.case_insensitive() { + return unhandled_hook.handle(expr); + } + let op = match (like_expr.negated(), like_expr.case_insensitive()) { + (false, false) => Operator::LikeMatch, + (true, false) => Operator::NotLikeMatch, + (false, true) => Operator::ILikeMatch, + (true, true) => Operator::NotILikeMatch, + }; + ( + Arc::clone(like_expr.expr()), + op, + Arc::clone(like_expr.pattern()), + ) + } else { + return unhandled_hook.handle(expr); + } + }; + + if op == Operator::And || op == Operator::Or { + let left_expr = + build_predicate_expression(&left, schema, required_columns, unhandled_hook); + let right_expr = + build_predicate_expression(&right, schema, required_columns, unhandled_hook); + // simplify boolean expression if applicable + let expr = match (&left_expr, op, &right_expr) { + (left, Operator::And, right) + if is_always_false(left) || is_always_false(right) => + { + Arc::new(phys_expr::Literal::new(ScalarValue::Boolean(Some(false)))) + } + (left, Operator::And, _) if 
is_always_true(left) => right_expr, + (_, Operator::And, right) if is_always_true(right) => left_expr, + (left, Operator::Or, right) + if is_always_true(left) || is_always_true(right) => + { + Arc::new(phys_expr::Literal::new(ScalarValue::Boolean(Some(true)))) + } + (left, Operator::Or, _) if is_always_false(left) => right_expr, + (_, Operator::Or, right) if is_always_false(right) => left_expr, + + _ => Arc::new(phys_expr::BinaryExpr::new(left_expr, op, right_expr)), + }; + return expr; + } + + let expr_builder = + PruningExpressionBuilder::try_new(&left, &right, op, schema, required_columns); + let mut expr_builder = match expr_builder { + Ok(builder) => builder, + // allow partial failure in predicate expression generation + // this can still produce a useful predicate when multiple conditions are joined using AND + Err(e) => { + debug!("Error building pruning expression: {e}"); + return unhandled_hook.handle(expr); + } + }; + + build_statistics_expr(&mut expr_builder) + .unwrap_or_else(|_| unhandled_hook.handle(expr)) +} + +fn build_statistics_expr( + expr_builder: &mut PruningExpressionBuilder, +) -> Result> { + let statistics_expr: Arc = match expr_builder.op() { + Operator::NotEq => { + // column != literal => (min, max) = literal => + // !(min != literal && max != literal) ==> + // min != literal || literal != max + let min_column_expr = expr_builder.min_column_expr()?; + let max_column_expr = expr_builder.max_column_expr()?; + Arc::new(phys_expr::BinaryExpr::new( + Arc::new(phys_expr::BinaryExpr::new( + min_column_expr, + Operator::NotEq, + Arc::clone(expr_builder.scalar_expr()), + )), + Operator::Or, + Arc::new(phys_expr::BinaryExpr::new( + Arc::clone(expr_builder.scalar_expr()), + Operator::NotEq, + max_column_expr, + )), + )) + } + Operator::Eq => { + // column = literal => (min, max) = literal => min <= literal && literal <= max + // (column / 2) = 4 => (column_min / 2) <= 4 && 4 <= (column_max / 2) + let min_column_expr = expr_builder.min_column_expr()?; + let max_column_expr = expr_builder.max_column_expr()?; + Arc::new(phys_expr::BinaryExpr::new( + Arc::new(phys_expr::BinaryExpr::new( + min_column_expr, + Operator::LtEq, + Arc::clone(expr_builder.scalar_expr()), + )), + Operator::And, + Arc::new(phys_expr::BinaryExpr::new( + Arc::clone(expr_builder.scalar_expr()), + Operator::LtEq, + max_column_expr, + )), + )) + } + Operator::NotLikeMatch => build_not_like_match(expr_builder)?, + Operator::LikeMatch => build_like_match(expr_builder).ok_or_else(|| { + plan_datafusion_err!( + "LIKE expression with wildcard at the beginning is not supported" + ) + })?, + Operator::Gt => { + // column > literal => (min, max) > literal => max > literal + Arc::new(phys_expr::BinaryExpr::new( + expr_builder.max_column_expr()?, + Operator::Gt, + Arc::clone(expr_builder.scalar_expr()), + )) + } + Operator::GtEq => { + // column >= literal => (min, max) >= literal => max >= literal + Arc::new(phys_expr::BinaryExpr::new( + expr_builder.max_column_expr()?, + Operator::GtEq, + Arc::clone(expr_builder.scalar_expr()), + )) + } + Operator::Lt => { + // column < literal => (min, max) < literal => min < literal + Arc::new(phys_expr::BinaryExpr::new( + expr_builder.min_column_expr()?, + Operator::Lt, + Arc::clone(expr_builder.scalar_expr()), + )) + } + Operator::LtEq => { + // column <= literal => (min, max) <= literal => min <= literal + Arc::new(phys_expr::BinaryExpr::new( + expr_builder.min_column_expr()?, + Operator::LtEq, + Arc::clone(expr_builder.scalar_expr()), + )) + } + // other expressions are not 
supported + _ => { + return plan_err!( + "expressions other than (neq, eq, gt, gteq, lt, lteq) are not supported" + ); + } + }; + let statistics_expr = wrap_null_count_check_expr(statistics_expr, expr_builder)?; + Ok(statistics_expr) +} + +/// returns the string literal of the scalar value if it is a string +fn unpack_string(s: &ScalarValue) -> Option<&str> { + s.try_as_str().flatten() +} + +fn extract_string_literal(expr: &Arc) -> Option<&str> { + if let Some(lit) = expr.as_any().downcast_ref::() { + let s = unpack_string(lit.value())?; + return Some(s); + } + None +} + +/// Convert `column LIKE literal` where P is a constant prefix of the literal +/// to a range check on the column: `P <= column && column < P'`, where P' is the +/// lowest string after all P* strings. +fn build_like_match( + expr_builder: &mut PruningExpressionBuilder, +) -> Option> { + // column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max + // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max + // column LIKE '%foo' => min <= '' && '' <= max => true + // column LIKE '%foo%' => min <= '' && '' <= max => true + // column LIKE 'foo' => min <= 'foo' && 'foo' <= max + + // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase + // this may involve building the physical expressions that call lower() and upper() + let min_column_expr = expr_builder.min_column_expr().ok()?; + let max_column_expr = expr_builder.max_column_expr().ok()?; + let scalar_expr = expr_builder.scalar_expr(); + // check that the scalar is a string literal + let s = extract_string_literal(scalar_expr)?; + // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character. + let first_wildcard_index = s.find(['%', '_']); + if first_wildcard_index == Some(0) { + // there's no filtering we could possibly do, return an error and have this be handled by the unhandled hook + return None; + } + let (lower_bound, upper_bound) = if let Some(wildcard_index) = first_wildcard_index { + let prefix = &s[..wildcard_index]; + let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some( + prefix.to_string(), + )))); + let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some( + increment_utf8(prefix)?, + )))); + (lower_bound_lit, upper_bound_lit) + } else { + // the like expression is a literal and can be converted into a comparison + let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some( + s.to_string(), + )))); + (Arc::clone(&bound), bound) + }; + let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new( + lower_bound, + Operator::LtEq, + Arc::clone(&max_column_expr), + )); + let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new( + Arc::clone(&min_column_expr), + Operator::LtEq, + upper_bound, + )); + let combined = Arc::new(phys_expr::BinaryExpr::new( + upper_bound_expr, + Operator::And, + lower_bound_expr, + )); + Some(combined) +} + +// For predicate `col NOT LIKE 'const_prefix%'`, we rewrite it as `(col_min NOT LIKE 'const_prefix%' OR col_max NOT LIKE 'const_prefix%')`. 
+//
+// The intuition is that if both `col_min` and `col_max` begin with `const_prefix` that means
+// **all** data in this row group begins with `const_prefix` as well (and therefore the predicate
+// looking for rows that don't begin with `const_prefix` can never be true)
+fn build_not_like_match(
+    expr_builder: &mut PruningExpressionBuilder<'_>,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    // col NOT LIKE 'const_prefix%' -> !(col_min LIKE 'const_prefix%' && col_max LIKE 'const_prefix%') -> (col_min NOT LIKE 'const_prefix%' || col_max NOT LIKE 'const_prefix%')
+
+    let min_column_expr = expr_builder.min_column_expr()?;
+    let max_column_expr = expr_builder.max_column_expr()?;
+
+    let scalar_expr = expr_builder.scalar_expr();
+
+    let pattern = extract_string_literal(scalar_expr).ok_or_else(|| {
+        plan_datafusion_err!("cannot extract literal from NOT LIKE expression")
+    })?;
+
+    let (const_prefix, remaining) = split_constant_prefix(pattern);
+    if const_prefix.is_empty() || remaining != "%" {
+        // We can not handle `%` at the beginning or in the middle of the pattern.
+        // Example: For pattern "foo%bar", the row group might include values like
+        // ["foobar", "food", "foodbar"], making it unsafe to prune.
+        // Even if the min/max values in the group (e.g., "foobar" and "foodbar")
+        // match the pattern, intermediate values like "food" may not
+        // match the full pattern "foo%bar", making pruning unsafe.
+        // (truncating "foo%bar" to "foo%" has the same problem)
+
+        // We can not handle patterns containing `_`.
+        // Example: For pattern "foo_", row groups might contain ["fooa", "fooaa", "foob"],
+        // which means not every row is guaranteed to match the pattern.
+        return Err(plan_datafusion_err!(
+            "NOT LIKE expressions only support constant_prefix+wildcard`%`"
+        ));
+    }
+
+    let min_col_not_like_expr = Arc::new(phys_expr::LikeExpr::new(
+        true,
+        false,
+        Arc::clone(&min_column_expr),
+        Arc::clone(scalar_expr),
+    ));
+
+    let max_col_not_like_expr = Arc::new(phys_expr::LikeExpr::new(
+        true,
+        false,
+        Arc::clone(&max_column_expr),
+        Arc::clone(scalar_expr),
+    ));
+
+    Ok(Arc::new(phys_expr::BinaryExpr::new(
+        min_col_not_like_expr,
+        Operator::Or,
+        max_col_not_like_expr,
+    )))
+}
+
+/// Returns the unescaped constant prefix of a LIKE pattern (possibly empty) and the remaining pattern (possibly empty)
+fn split_constant_prefix(pattern: &str) -> (&str, &str) {
+    let char_indices = pattern.char_indices().collect::<Vec<_>>();
+    for i in 0..char_indices.len() {
+        let (idx, char) = char_indices[i];
+        if char == '%' || char == '_' {
+            if i != 0 && char_indices[i - 1].1 == '\\' {
+                // escaped by `\`
+                continue;
+            }
+            return (&pattern[..idx], &pattern[idx..]);
+        }
+    }
+    (pattern, "")
+}
+
+/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
+/// This makes it so that the returned string will always compare greater than the input string
+/// or any other string with the same prefix.
+/// This is necessary since the statistics may have been truncated: if we have a min statistic
+/// of "fo" that may have originally been "foz" or anything else with the prefix "fo".
+/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`.
+/// In this example `increment_utf8("foo") == "fop"`.
+fn increment_utf8(data: &str) -> Option<String> {
+    // Helper function to check if a character is valid to use
+    fn is_valid_unicode(c: char) -> bool {
+        let cp = c as u32;
+
+        // Filter out non-characters (https://www.unicode.org/versions/corrigendum9.html)
+        if [0xFFFE, 0xFFFF].contains(&cp) || (0xFDD0..=0xFDEF).contains(&cp) {
+            return false;
+        }
+
+        // Filter out code points beyond the valid Unicode range
+        if cp >= 0x110000 {
+            return false;
+        }
+
+        true
+    }
+
+    // Convert string to vector of code points
+    let mut code_points: Vec<char> = data.chars().collect();
+
+    // Work backwards through code points
+    for idx in (0..code_points.len()).rev() {
+        let original = code_points[idx] as u32;
+
+        // Try incrementing the code point
+        if let Some(next_char) = char::from_u32(original + 1) {
+            if is_valid_unicode(next_char) {
+                code_points[idx] = next_char;
+                // truncate the string to the current index
+                code_points.truncate(idx + 1);
+                return Some(code_points.into_iter().collect());
+            }
+        }
+    }
+
+    None
+}
+
+/// Wrap the statistics expression in a check that skips the expression if the column is all nulls.
+///
+/// This is important not only as an optimization but also because statistics may not be
+/// accurate for columns that are all nulls.
+/// For example, for an `int` column `x` with all nulls, the min/max/null_count statistics
+/// might be set to 0 and evaluating `x = 0` would incorrectly include the column.
+///
+/// For example:
+///
+/// `x_min <= 10 AND 10 <= x_max`
+///
+/// will become
+///
+/// ```sql
+/// x_null_count != x_row_count AND (x_min <= 10 AND 10 <= x_max)
+/// ```
+///
+/// If the column is known to be all nulls, then the expression
+/// `x_null_count = x_row_count` will be true, which will cause the
+/// boolean expression to return false. Therefore, prune out the container.
+fn wrap_null_count_check_expr(
+    statistics_expr: Arc<dyn PhysicalExpr>,
+    expr_builder: &mut PruningExpressionBuilder,
+) -> Result<Arc<dyn PhysicalExpr>> {
+    // x_null_count != x_row_count
+    let not_when_null_count_eq_row_count = Arc::new(phys_expr::BinaryExpr::new(
+        expr_builder.null_count_column_expr()?,
+        Operator::NotEq,
+        expr_builder.row_count_column_expr()?,
+    ));
+
+    // (x_null_count != x_row_count) AND (<statistics_expr>)
+    Ok(Arc::new(phys_expr::BinaryExpr::new(
+        not_when_null_count_eq_row_count,
+        Operator::And,
+        statistics_expr,
+    )))
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub(crate) enum StatisticsType {
+    Min,
+    Max,
+    NullCount,
+    RowCount,
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::HashMap;
+    use std::ops::{Not, Rem};
+
+    use super::*;
+    use datafusion_common::test_util::batches_to_string;
+    use datafusion_expr::{and, col, lit, or};
+    use insta::assert_snapshot;
+
+    use arrow::array::Decimal128Array;
+    use arrow::{
+        array::{BinaryArray, Int32Array, Int64Array, StringArray, UInt64Array},
+        datatypes::TimeUnit,
+    };
+    use datafusion_expr::expr::InList;
+    use datafusion_expr::{cast, is_null, try_cast, Expr};
+    use datafusion_functions_nested::expr_fn::{array_has, make_array};
+    use datafusion_physical_expr::expressions as phys_expr;
+    use datafusion_physical_expr::planner::logical2physical;
+
+    #[derive(Debug, Default)]
+    /// Mock statistics provider for tests
+    ///
+    /// Each row represents the statistics for a "container" (which
+    /// might represent an entire parquet file, or directory of files,
+    /// or some other collection of data for which we had statistics)
+    ///
+    /// Note: All `ArrayRef`s must be the same size.
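+    ///
+    /// For example (illustrative), `ContainerStats::new_i32(vec![Some(1), None], vec![Some(10), None])`
+    /// describes two containers: one with min `1` and max `10`, and one whose
+    /// min/max are unknown.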
+ struct ContainerStats { + min: Option, + max: Option, + /// Optional values + null_counts: Option, + row_counts: Option, + /// Optional known values (e.g. mimic a bloom filter) + /// (value, contained) + /// If present, all BooleanArrays must be the same size as min/max + contained: Vec<(HashSet, BooleanArray)>, + } + + impl ContainerStats { + fn new() -> Self { + Default::default() + } + fn new_decimal128( + min: impl IntoIterator>, + max: impl IntoIterator>, + precision: u8, + scale: i8, + ) -> Self { + Self::new() + .with_min(Arc::new( + min.into_iter() + .collect::() + .with_precision_and_scale(precision, scale) + .unwrap(), + )) + .with_max(Arc::new( + max.into_iter() + .collect::() + .with_precision_and_scale(precision, scale) + .unwrap(), + )) + } + + fn new_i64( + min: impl IntoIterator>, + max: impl IntoIterator>, + ) -> Self { + Self::new() + .with_min(Arc::new(min.into_iter().collect::())) + .with_max(Arc::new(max.into_iter().collect::())) + } + + fn new_i32( + min: impl IntoIterator>, + max: impl IntoIterator>, + ) -> Self { + Self::new() + .with_min(Arc::new(min.into_iter().collect::())) + .with_max(Arc::new(max.into_iter().collect::())) + } + + fn new_utf8<'a>( + min: impl IntoIterator>, + max: impl IntoIterator>, + ) -> Self { + Self::new() + .with_min(Arc::new(min.into_iter().collect::())) + .with_max(Arc::new(max.into_iter().collect::())) + } + + fn new_bool( + min: impl IntoIterator>, + max: impl IntoIterator>, + ) -> Self { + Self::new() + .with_min(Arc::new(min.into_iter().collect::())) + .with_max(Arc::new(max.into_iter().collect::())) + } + + fn min(&self) -> Option { + self.min.clone() + } + + fn max(&self) -> Option { + self.max.clone() + } + + fn null_counts(&self) -> Option { + self.null_counts.clone() + } + + fn row_counts(&self) -> Option { + self.row_counts.clone() + } + + /// return an iterator over all arrays in this statistics + fn arrays(&self) -> Vec { + let contained_arrays = self + .contained + .iter() + .map(|(_values, contained)| Arc::new(contained.clone()) as ArrayRef); + + [ + self.min.as_ref().cloned(), + self.max.as_ref().cloned(), + self.null_counts.as_ref().cloned(), + self.row_counts.as_ref().cloned(), + ] + .into_iter() + .flatten() + .chain(contained_arrays) + .collect() + } + + /// Returns the number of containers represented by this statistics This + /// picks the length of the first array as all arrays must have the same + /// length (which is verified by `assert_invariants`). + fn len(&self) -> usize { + // pick the first non zero length + self.arrays().iter().map(|a| a.len()).next().unwrap_or(0) + } + + /// Ensure that the lengths of all arrays are consistent + fn assert_invariants(&self) { + let mut prev_len = None; + + for len in self.arrays().iter().map(|a| a.len()) { + // Get a length, if we don't already have one + match prev_len { + None => { + prev_len = Some(len); + } + Some(prev_len) => { + assert_eq!(prev_len, len); + } + } + } + } + + /// Add min values + fn with_min(mut self, min: ArrayRef) -> Self { + self.min = Some(min); + self + } + + /// Add max values + fn with_max(mut self, max: ArrayRef) -> Self { + self.max = Some(max); + self + } + + /// Add null counts. There must be the same number of null counts as + /// there are containers + fn with_null_counts( + mut self, + counts: impl IntoIterator>, + ) -> Self { + let null_counts: ArrayRef = + Arc::new(counts.into_iter().collect::()); + + self.assert_invariants(); + self.null_counts = Some(null_counts); + self + } + + /// Add row counts. 
There must be the same number of row counts as + /// there are containers + fn with_row_counts( + mut self, + counts: impl IntoIterator>, + ) -> Self { + let row_counts: ArrayRef = + Arc::new(counts.into_iter().collect::()); + + self.assert_invariants(); + self.row_counts = Some(row_counts); + self + } + + /// Add contained information. + pub fn with_contained( + mut self, + values: impl IntoIterator, + contained: impl IntoIterator>, + ) -> Self { + let contained: BooleanArray = contained.into_iter().collect(); + let values: HashSet<_> = values.into_iter().collect(); + + self.contained.push((values, contained)); + self.assert_invariants(); + self + } + + /// get any contained information for the specified values + fn contained(&self, find_values: &HashSet) -> Option { + // find the one with the matching values + self.contained + .iter() + .find(|(values, _contained)| values == find_values) + .map(|(_values, contained)| contained.clone()) + } + } + + #[derive(Debug, Default)] + struct TestStatistics { + // key: column name + stats: HashMap, + } + + impl TestStatistics { + fn new() -> Self { + Self::default() + } + + fn with( + mut self, + name: impl Into, + container_stats: ContainerStats, + ) -> Self { + let col = Column::from_name(name.into()); + self.stats.insert(col, container_stats); + self + } + + /// Add null counts for the specified column. + /// There must be the same number of null counts as + /// there are containers + fn with_null_counts( + mut self, + name: impl Into, + counts: impl IntoIterator>, + ) -> Self { + let col = Column::from_name(name.into()); + + // take stats out and update them + let container_stats = self + .stats + .remove(&col) + .unwrap_or_default() + .with_null_counts(counts); + + // put stats back in + self.stats.insert(col, container_stats); + self + } + + /// Add row counts for the specified column. + /// There must be the same number of row counts as + /// there are containers + fn with_row_counts( + mut self, + name: impl Into, + counts: impl IntoIterator>, + ) -> Self { + let col = Column::from_name(name.into()); + + // take stats out and update them + let container_stats = self + .stats + .remove(&col) + .unwrap_or_default() + .with_row_counts(counts); + + // put stats back in + self.stats.insert(col, container_stats); + self + } + + /// Add contained information for the specified column. 
+ fn with_contained( + mut self, + name: impl Into, + values: impl IntoIterator, + contained: impl IntoIterator>, + ) -> Self { + let col = Column::from_name(name.into()); + + // take stats out and update them + let container_stats = self + .stats + .remove(&col) + .unwrap_or_default() + .with_contained(values, contained); + + // put stats back in + self.stats.insert(col, container_stats); + self + } + } + + impl PruningStatistics for TestStatistics { + fn min_values(&self, column: &Column) -> Option { + self.stats + .get(column) + .map(|container_stats| container_stats.min()) + .unwrap_or(None) + } + + fn max_values(&self, column: &Column) -> Option { + self.stats + .get(column) + .map(|container_stats| container_stats.max()) + .unwrap_or(None) + } + + fn num_containers(&self) -> usize { + self.stats + .values() + .next() + .map(|container_stats| container_stats.len()) + .unwrap_or(0) + } + + fn null_counts(&self, column: &Column) -> Option { + self.stats + .get(column) + .map(|container_stats| container_stats.null_counts()) + .unwrap_or(None) + } + + fn row_counts(&self, column: &Column) -> Option { + self.stats + .get(column) + .map(|container_stats| container_stats.row_counts()) + .unwrap_or(None) + } + + fn contained( + &self, + column: &Column, + values: &HashSet, + ) -> Option { + self.stats + .get(column) + .and_then(|container_stats| container_stats.contained(values)) + } + } + + /// Returns the specified min/max container values + struct OneContainerStats { + min_values: Option, + max_values: Option, + num_containers: usize, + } + + impl PruningStatistics for OneContainerStats { + fn min_values(&self, _column: &Column) -> Option { + self.min_values.clone() + } + + fn max_values(&self, _column: &Column) -> Option { + self.max_values.clone() + } + + fn num_containers(&self) -> usize { + self.num_containers + } + + fn null_counts(&self, _column: &Column) -> Option { + None + } + + fn row_counts(&self, _column: &Column) -> Option { + None + } + + fn contained( + &self, + _column: &Column, + _values: &HashSet, + ) -> Option { + None + } + } + + /// Row count should only be referenced once in the pruning expression, even if we need the row count + /// for multiple columns. + #[test] + fn test_unique_row_count_field_and_column() { + // c1 = 100 AND c2 = 200 + let schema: SchemaRef = Arc::new(Schema::new(vec![ + Field::new("c1", DataType::Int32, true), + Field::new("c2", DataType::Int32, true), + ])); + let expr = col("c1").eq(lit(100)).and(col("c2").eq(lit(200))); + let expr = logical2physical(&expr, &schema); + let p = PruningPredicate::try_new(expr, Arc::clone(&schema)).unwrap(); + // note pruning expression refers to row_count twice + assert_eq!( + "c1_null_count@2 != row_count@3 AND c1_min@0 <= 100 AND 100 <= c1_max@1 AND c2_null_count@6 != row_count@3 AND c2_min@4 <= 200 AND 200 <= c2_max@5", + p.predicate_expr.to_string() + ); + + // Fields in required schema should be unique, otherwise when creating batches + // it will fail because of duplicate field names + let mut fields = HashSet::new(); + for (_col, _ty, field) in p.required_columns().iter() { + let was_new = fields.insert(field); + if !was_new { + panic!( + "Duplicate field in required schema: {field:?}. 
Previous fields:\n{fields:#?}" + ); + } + } + } + + #[test] + fn prune_all_rows_null_counts() { + // if null_count = row_count then we should prune the container for i = 0 + // regardless of the statistics + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let statistics = TestStatistics::new().with( + "i", + ContainerStats::new_i32( + vec![Some(0)], // min + vec![Some(0)], // max + ) + .with_null_counts(vec![Some(1)]) + .with_row_counts(vec![Some(1)]), + ); + let expected_ret = &[false]; + prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); + + // this should be true even if the container stats are missing + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let container_stats = ContainerStats { + min: Some(Arc::new(Int32Array::from(vec![None]))), + max: Some(Arc::new(Int32Array::from(vec![None]))), + null_counts: Some(Arc::new(UInt64Array::from(vec![Some(1)]))), + row_counts: Some(Arc::new(UInt64Array::from(vec![Some(1)]))), + ..ContainerStats::default() + }; + let statistics = TestStatistics::new().with("i", container_stats); + let expected_ret = &[false]; + prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); + + // If the null counts themselves are missing we should be able to fall back to the stats + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let container_stats = ContainerStats { + min: Some(Arc::new(Int32Array::from(vec![Some(0)]))), + max: Some(Arc::new(Int32Array::from(vec![Some(0)]))), + null_counts: Some(Arc::new(UInt64Array::from(vec![None]))), + row_counts: Some(Arc::new(UInt64Array::from(vec![Some(1)]))), + ..ContainerStats::default() + }; + let statistics = TestStatistics::new().with("i", container_stats); + let expected_ret = &[true]; + prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); + let expected_ret = &[false]; + prune_with_expr(col("i").gt(lit(0)), &schema, &statistics, expected_ret); + + // Same for the row counts + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let container_stats = ContainerStats { + min: Some(Arc::new(Int32Array::from(vec![Some(0)]))), + max: Some(Arc::new(Int32Array::from(vec![Some(0)]))), + null_counts: Some(Arc::new(UInt64Array::from(vec![Some(1)]))), + row_counts: Some(Arc::new(UInt64Array::from(vec![None]))), + ..ContainerStats::default() + }; + let statistics = TestStatistics::new().with("i", container_stats); + let expected_ret = &[true]; + prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); + let expected_ret = &[false]; + prune_with_expr(col("i").gt(lit(0)), &schema, &statistics, expected_ret); + } + + #[test] + fn prune_missing_statistics() { + // If the min or max stats are missing we should not prune + // (unless we know all rows are null, see `prune_all_rows_null_counts`) + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + let container_stats = ContainerStats { + min: Some(Arc::new(Int32Array::from(vec![None, Some(0)]))), + max: Some(Arc::new(Int32Array::from(vec![Some(0), None]))), + null_counts: Some(Arc::new(UInt64Array::from(vec![Some(0), Some(0)]))), + row_counts: Some(Arc::new(UInt64Array::from(vec![Some(1), Some(1)]))), + ..ContainerStats::default() + }; + let statistics = TestStatistics::new().with("i", container_stats); + let expected_ret = &[true, true]; + prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); + let expected_ret = &[false, 
true]; + prune_with_expr(col("i").gt(lit(0)), &schema, &statistics, expected_ret); + let expected_ret = &[true, false]; + prune_with_expr(col("i").lt(lit(0)), &schema, &statistics, expected_ret); + } + + #[test] + fn prune_null_stats() { + // if null_count = row_count then we should prune the container for i = 0 + // regardless of the statistics + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + + let statistics = TestStatistics::new().with( + "i", + ContainerStats::new_i32( + vec![Some(0)], // min + vec![Some(0)], // max + ) + .with_null_counts(vec![Some(1)]) + .with_row_counts(vec![Some(1)]), + ); + + let expected_ret = &[false]; + + // i = 0 + prune_with_expr(col("i").eq(lit(0)), &schema, &statistics, expected_ret); + } + + #[test] + fn test_build_statistics_record_batch() { + // Request a record batch with s1_min, s2_max, s3_max, and s3_min + let required_columns = RequiredColumns::from(vec![ + // min of original column s1, named s1_min + ( + phys_expr::Column::new("s1", 1), + StatisticsType::Min, + Field::new("s1_min", DataType::Int32, true), + ), + // max of original column s2, named s2_max + ( + phys_expr::Column::new("s2", 2), + StatisticsType::Max, + Field::new("s2_max", DataType::Int32, true), + ), + // max of original column s3, named s3_max + ( + phys_expr::Column::new("s3", 3), + StatisticsType::Max, + Field::new("s3_max", DataType::Utf8, true), + ), + // min of original column s3, named s3_min + ( + phys_expr::Column::new("s3", 3), + StatisticsType::Min, + Field::new("s3_min", DataType::Utf8, true), + ), + ]); + + let statistics = TestStatistics::new() + .with( + "s1", + ContainerStats::new_i32( + vec![None, None, Some(9), None], // min + vec![Some(10), None, None, None], // max + ), + ) + .with( + "s2", + ContainerStats::new_i32( + vec![Some(2), None, None, None], // min + vec![Some(20), None, None, None], // max + ), + ) + .with( + "s3", + ContainerStats::new_utf8( + vec![Some("a"), None, None, None], // min + vec![Some("q"), None, Some("r"), None], // max + ), + ); + + let batch = + build_statistics_record_batch(&statistics, &required_columns).unwrap(); + assert_snapshot!(batches_to_string(&[batch]), @r" + +--------+--------+--------+--------+ + | s1_min | s2_max | s3_max | s3_min | + +--------+--------+--------+--------+ + | | 20 | q | a | + | | | | | + | 9 | | r | | + | | | | | + +--------+--------+--------+--------+ + "); + } + + #[test] + fn test_build_statistics_casting() { + // Test requesting a Timestamp column, but getting statistics as Int64 + // which is what Parquet does + + // Request a record batch with s1_min as a timestamp + let required_columns = RequiredColumns::from(vec![( + phys_expr::Column::new("s3", 3), + StatisticsType::Min, + Field::new( + "s1_min", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ), + )]); + + // Note the statistics pass back i64 (not timestamp) + let statistics = OneContainerStats { + min_values: Some(Arc::new(Int64Array::from(vec![Some(10)]))), + max_values: Some(Arc::new(Int64Array::from(vec![Some(20)]))), + num_containers: 1, + }; + + let batch = + build_statistics_record_batch(&statistics, &required_columns).unwrap(); + + assert_snapshot!(batches_to_string(&[batch]), @r" + +-------------------------------+ + | s1_min | + +-------------------------------+ + | 1970-01-01T00:00:00.000000010 | + +-------------------------------+ + "); + } + + #[test] + fn test_build_statistics_no_required_stats() { + let required_columns = RequiredColumns::new(); + + let statistics = 
OneContainerStats { + min_values: Some(Arc::new(Int64Array::from(vec![Some(10)]))), + max_values: Some(Arc::new(Int64Array::from(vec![Some(20)]))), + num_containers: 1, + }; + + let batch = + build_statistics_record_batch(&statistics, &required_columns).unwrap(); + assert_eq!(batch.num_rows(), 1); // had 1 container + } + + #[test] + fn test_build_statistics_inconsistent_types() { + // Test requesting a Utf8 column when the stats return some other type + + // Request a record batch with s1_min as a Utf8 column + let required_columns = RequiredColumns::from(vec![( + phys_expr::Column::new("s3", 3), + StatisticsType::Min, + Field::new("s1_min", DataType::Utf8, true), + )]); + + // Note the statistics return an invalid UTF-8 sequence which will be converted to null + let statistics = OneContainerStats { + min_values: Some(Arc::new(BinaryArray::from(vec![&[255u8] as &[u8]]))), + max_values: None, + num_containers: 1, + }; + + let batch = + build_statistics_record_batch(&statistics, &required_columns).unwrap(); + assert_snapshot!(batches_to_string(&[batch]), @r" + +--------+ + | s1_min | + +--------+ + | | + +--------+ + "); + } + + #[test] + fn test_build_statistics_inconsistent_length() { + // report a num_containers that is inconsistent with the actual length of the statistics arrays + let required_columns = RequiredColumns::from(vec![( + phys_expr::Column::new("s1", 3), + StatisticsType::Min, + Field::new("s1_min", DataType::Int64, true), + )]); + + // Note the statistics arrays have length 1, but num_containers claims 3 + let statistics = OneContainerStats { + min_values: Some(Arc::new(Int64Array::from(vec![Some(10)]))), + max_values: Some(Arc::new(Int64Array::from(vec![Some(20)]))), + num_containers: 3, + }; + + let result = + build_statistics_record_batch(&statistics, &required_columns).unwrap_err(); + assert!( + result + .to_string() + .contains("mismatched statistics length. 
Expected 3, got 1"), + "{}", + result + ); + } + + #[test] + fn row_group_predicate_eq() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = + "c1_null_count@2 != row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1"; + + // test column on the left + let expr = col("c1").eq(lit(1)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(1).eq(col("c1")); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_not_eq() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = + "c1_null_count@2 != row_count@3 AND (c1_min@0 != 1 OR 1 != c1_max@1)"; + + // test column on the left + let expr = col("c1").not_eq(lit(1)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(1).not_eq(col("c1")); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_gt() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "c1_null_count@1 != row_count@2 AND c1_max@0 > 1"; + + // test column on the left + let expr = col("c1").gt(lit(1)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(1).lt(col("c1")); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_gt_eq() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "c1_null_count@1 != row_count@2 AND c1_max@0 >= 1"; + + // test column on the left + let expr = col("c1").gt_eq(lit(1)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + // test column on the right + let expr = lit(1).lt_eq(col("c1")); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_lt() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "c1_null_count@1 != row_count@2 AND c1_min@0 < 1"; + + // test column on the left + let expr = col("c1").lt(lit(1)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(1).gt(col("c1")); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_lt_eq() -> Result<()> { + let 
schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "c1_null_count@1 != row_count@2 AND c1_min@0 <= 1"; + + // test column on the left + let expr = col("c1").lt_eq(lit(1)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + // test column on the right + let expr = lit(1).gt_eq(col("c1")); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_and() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + Field::new("c3", DataType::Int32, false), + ]); + // test AND operator joining supported c1 < 1 expression and unsupported c2 > c3 expression + let expr = col("c1").lt(lit(1)).and(col("c2").lt(col("c3"))); + let expected_expr = "c1_null_count@1 != row_count@2 AND c1_min@0 < 1"; + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_or() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ]); + // test OR operator joining supported c1 < 1 expression and unsupported c2 % 2 = 0 expression + let expr = col("c1").lt(lit(1)).or(col("c2").rem(lit(2)).eq(lit(0))); + let expected_expr = "true"; + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_not() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "true"; + + let expr = col("c1").not(); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_not_bool() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Boolean, false)]); + let expected_expr = "NOT c1_min@0 AND c1_max@1"; + + let expr = col("c1").not(); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_bool() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Boolean, false)]); + let expected_expr = "c1_min@0 OR c1_max@1"; + + let expr = col("c1"); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_lt_bool() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Boolean, false)]); + let expected_expr = "c1_null_count@1 != row_count@2 AND c1_min@0 < true"; + + // DF doesn't support arithmetic on boolean columns so + // this predicate will error when evaluated + let expr = col("c1").lt(lit(true)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn 
row_group_predicate_required_columns() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ]); + let mut required_columns = RequiredColumns::new(); + // c1 < 1 and (c2 = 2 or c2 = 3) + let expr = col("c1") + .lt(lit(1)) + .and(col("c2").eq(lit(2)).or(col("c2").eq(lit(3)))); + let expected_expr = "c1_null_count@1 != row_count@2 AND c1_min@0 < 1 AND (c2_null_count@5 != row_count@2 AND c2_min@3 <= 2 AND 2 <= c2_max@4 OR c2_null_count@5 != row_count@2 AND c2_min@3 <= 3 AND 3 <= c2_max@4)"; + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut required_columns); + assert_eq!(predicate_expr.to_string(), expected_expr); + println!("required_columns: {required_columns:#?}"); // for debugging assertions below + // c1 < 1 should add c1_min + let c1_min_field = Field::new("c1_min", DataType::Int32, false); + assert_eq!( + required_columns.columns[0], + ( + phys_expr::Column::new("c1", 0), + StatisticsType::Min, + c1_min_field.with_nullable(true) // could be nullable if stats are not present + ) + ); + // c1 < 1 should add c1_null_count + let c1_null_count_field = Field::new("c1_null_count", DataType::UInt64, false); + assert_eq!( + required_columns.columns[1], + ( + phys_expr::Column::new("c1", 0), + StatisticsType::NullCount, + c1_null_count_field.with_nullable(true) // could be nullable if stats are not present + ) + ); + // c1 < 1 should add row_count + let row_count_field = Field::new("row_count", DataType::UInt64, false); + assert_eq!( + required_columns.columns[2], + ( + phys_expr::Column::new("c1", 0), + StatisticsType::RowCount, + row_count_field.with_nullable(true) // could be nullable if stats are not present + ) + ); + // c2 = 2 should add c2_min and c2_max + let c2_min_field = Field::new("c2_min", DataType::Int32, false); + assert_eq!( + required_columns.columns[3], + ( + phys_expr::Column::new("c2", 1), + StatisticsType::Min, + c2_min_field.with_nullable(true) // could be nullable if stats are not present + ) + ); + let c2_max_field = Field::new("c2_max", DataType::Int32, false); + assert_eq!( + required_columns.columns[4], + ( + phys_expr::Column::new("c2", 1), + StatisticsType::Max, + c2_max_field.with_nullable(true) // could be nullable if stats are not present + ) + ); + // c2 = 2 should add c2_null_count + let c2_null_count_field = Field::new("c2_null_count", DataType::UInt64, false); + assert_eq!( + required_columns.columns[5], + ( + phys_expr::Column::new("c2", 1), + StatisticsType::NullCount, + c2_null_count_field.with_nullable(true) // could be nullable if stats are not present + ) + ); + // c2 = 2 should reuse the existing row_count entry rather than adding a new one + let row_count_field = Field::new("row_count", DataType::UInt64, false); + assert_eq!( + required_columns.columns[2], + ( + phys_expr::Column::new("c1", 0), + StatisticsType::RowCount, + row_count_field.with_nullable(true) // could be nullable if stats are not present + ) + ); + // c2 = 3 shouldn't add any new statistics fields + assert_eq!(required_columns.columns.len(), 6); + + Ok(()) + } + + #[test] + fn row_group_predicate_in_list() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ]); + // test c1 in(1, 2, 3) + let expr = Expr::InList(InList::new( + Box::new(col("c1")), + vec![lit(1), lit(2), lit(3)], + false, + )); + let expected_expr = "c1_null_count@2 != row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1 OR c1_null_count@2 != row_count@3 AND c1_min@0 <= 2 AND 2 
<= c1_max@1 OR c1_null_count@2 != row_count@3 AND c1_min@0 <= 3 AND 3 <= c1_max@1"; + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_in_list_empty() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ]); + // test c1 in() + let expr = Expr::InList(InList::new(Box::new(col("c1")), vec![], false)); + let expected_expr = "true"; + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_in_list_negated() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ]); + // test c1 not in(1, 2, 3) + let expr = Expr::InList(InList::new( + Box::new(col("c1")), + vec![lit(1), lit(2), lit(3)], + true, + )); + let expected_expr = "c1_null_count@2 != row_count@3 AND (c1_min@0 != 1 OR 1 != c1_max@1) AND c1_null_count@2 != row_count@3 AND (c1_min@0 != 2 OR 2 != c1_max@1) AND c1_null_count@2 != row_count@3 AND (c1_min@0 != 3 OR 3 != c1_max@1)"; + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_between() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ]); + + // test c1 BETWEEN 1 AND 5 + let expr1 = col("c1").between(lit(1), lit(5)); + + // test 1 <= c1 <= 5 + let expr2 = col("c1").gt_eq(lit(1)).and(col("c1").lt_eq(lit(5))); + + let predicate_expr1 = + test_build_predicate_expression(&expr1, &schema, &mut RequiredColumns::new()); + + let predicate_expr2 = + test_build_predicate_expression(&expr2, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr1.to_string(), predicate_expr2.to_string()); + + Ok(()) + } + + #[test] + fn row_group_predicate_between_with_in_list() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int32, false), + ]); + // test c1 in(1, 2) + let expr1 = col("c1").in_list(vec![lit(1), lit(2)], false); + + // test c2 BETWEEN 4 AND 5 + let expr2 = col("c2").between(lit(4), lit(5)); + + // test c1 in(1, 2) and c2 BETWEEN 4 AND 5 + let expr3 = expr1.and(expr2); + + let expected_expr = "(c1_null_count@2 != row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1 OR c1_null_count@2 != row_count@3 AND c1_min@0 <= 2 AND 2 <= c1_max@1) AND c2_null_count@5 != row_count@3 AND c2_max@4 >= 4 AND c2_null_count@5 != row_count@3 AND c2_min@6 <= 5"; + let predicate_expr = + test_build_predicate_expression(&expr3, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_in_list_to_many_values() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + // test c1 in(1..=21), which has 21 values + // pruning.rs defines MAX_LIST_VALUE_SIZE_REWRITE = 20; in-lists with more values than this are rewritten to + // always true + let expr = col("c1").in_list((1..=21).map(lit).collect(), false); + + let expected_expr = "true"; + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + 
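// a predicate of "true" cannot prune anything: every container is kept + 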
assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_cast_int_int() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Int64) <= 1 AND 1 <= CAST(c1_max@1 AS Int64)"; + + // test cast(c1 as int64) = 1 + // test column on the left + let expr = cast(col("c1"), DataType::Int64).eq(lit(ScalarValue::Int64(Some(1)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Int64(Some(1))).eq(cast(col("c1"), DataType::Int64)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + let expected_expr = + "c1_null_count@1 != row_count@2 AND TRY_CAST(c1_max@0 AS Int64) > 1"; + + // test column on the left + let expr = + try_cast(col("c1"), DataType::Int64).gt(lit(ScalarValue::Int64(Some(1)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = + lit(ScalarValue::Int64(Some(1))).lt(try_cast(col("c1"), DataType::Int64)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_cast_string_string() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Utf8) <= 1 AND 1 <= CAST(c1_max@1 AS Utf8)"; + + // test column on the left + let expr = cast(col("c1"), DataType::Utf8) + .eq(lit(ScalarValue::Utf8(Some("1".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Utf8(Some("1".to_string()))) + .eq(cast(col("c1"), DataType::Utf8)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_cast_string_int() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]); + let expected_expr = "true"; + + // test column on the left + let expr = cast(col("c1"), DataType::Int32).eq(lit(ScalarValue::Int32(Some(1)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Int32(Some(1))).eq(cast(col("c1"), DataType::Int32)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_cast_int_string() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "true"; + + // test column on the left + let expr = cast(col("c1"), DataType::Utf8) + .eq(lit(ScalarValue::Utf8(Some("1".to_string())))); + let predicate_expr = + 
test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Utf8(Some("1".to_string()))) + .eq(cast(col("c1"), DataType::Utf8)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_date_date() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Date32, false)]); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Date64) <= 1970-01-01 AND 1970-01-01 <= CAST(c1_max@1 AS Date64)"; + + // test column on the left + let expr = + cast(col("c1"), DataType::Date64).eq(lit(ScalarValue::Date64(Some(123)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = + lit(ScalarValue::Date64(Some(123))).eq(cast(col("c1"), DataType::Date64)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_dict_string_date() -> Result<()> { + // Test with Dictionary for the literal + let schema = Schema::new(vec![Field::new("c1", DataType::Date32, false)]); + let expected_expr = "true"; + + // test column on the left + let expr = cast( + col("c1"), + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), + ) + .eq(lit(ScalarValue::Utf8(Some("2024-01-01".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Utf8(Some("2024-01-01".to_string()))).eq(cast( + col("c1"), + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), + )); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_date_dict_string() -> Result<()> { + // Test with Dictionary for the column + let schema = Schema::new(vec![Field::new( + "c1", + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), + false, + )]); + let expected_expr = "true"; + + // test column on the left + let expr = + cast(col("c1"), DataType::Date32).eq(lit(ScalarValue::Date32(Some(123)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = + lit(ScalarValue::Date32(Some(123))).eq(cast(col("c1"), DataType::Date32)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_dict_dict_same_value_type() -> Result<()> { + // Test with Dictionary types that have the same value type but different key types + let schema = Schema::new(vec![Field::new( + "c1", + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), + false, + )]); + + // Direct comparison with no cast + let expr = 
col("c1").eq(lit(ScalarValue::Utf8(Some("test".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + let expected_expr = + "c1_null_count@2 != row_count@3 AND c1_min@0 <= test AND test <= c1_max@1"; + assert_eq!(predicate_expr.to_string(), expected_expr); + + // Test with column cast to a dictionary with different key type + let expr = cast( + col("c1"), + DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)), + ) + .eq(lit(ScalarValue::Utf8(Some("test".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Dictionary(UInt16, Utf8)) <= test AND test <= CAST(c1_max@1 AS Dictionary(UInt16, Utf8))"; + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_dict_dict_different_value_type() -> Result<()> { + // Test with Dictionary types that have different value types + let schema = Schema::new(vec![Field::new( + "c1", + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Int32)), + false, + )]); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Int64) <= 123 AND 123 <= CAST(c1_max@1 AS Int64)"; + + // Test with literal of a different type + let expr = + cast(col("c1"), DataType::Int64).eq(lit(ScalarValue::Int64(Some(123)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_nested_dict() -> Result<()> { + // Test with nested Dictionary types + let schema = Schema::new(vec![Field::new( + "c1", + DataType::Dictionary( + Box::new(DataType::UInt8), + Box::new(DataType::Dictionary( + Box::new(DataType::UInt16), + Box::new(DataType::Utf8), + )), + ), + false, + )]); + let expected_expr = + "c1_null_count@2 != row_count@3 AND c1_min@0 <= test AND test <= c1_max@1"; + + // Test with a simple literal + let expr = col("c1").eq(lit(ScalarValue::Utf8(Some("test".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_dict_date_dict_date() -> Result<()> { + // Test with dictionary-wrapped date types for both sides + let schema = Schema::new(vec![Field::new( + "c1", + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Date32)), + false, + )]); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Dictionary(UInt16, Date64)) <= 1970-01-01 AND 1970-01-01 <= CAST(c1_max@1 AS Dictionary(UInt16, Date64))"; + + // Test with a cast to a different date type + let expr = cast( + col("c1"), + DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Date64)), + ) + .eq(lit(ScalarValue::Date64(Some(123)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_date_string() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, false)]); + let expected_expr = "true"; + + // test column on the left + let expr = + cast(col("c1"), DataType::Date32).eq(lit(ScalarValue::Date32(Some(123)))); + let predicate_expr = + test_build_predicate_expression(&expr, 
&schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = + lit(ScalarValue::Date32(Some(123))).eq(cast(col("c1"), DataType::Date32)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_string_date() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Date32, false)]); + let expected_expr = "true"; + + // test column on the left + let expr = cast(col("c1"), DataType::Utf8) + .eq(lit(ScalarValue::Utf8(Some("2024-01-01".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Utf8(Some("2024-01-01".to_string()))) + .eq(cast(col("c1"), DataType::Utf8)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_cast_list() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + // test cast(c1 as int64) in int64(1, 2, 3) + let expr = Expr::InList(InList::new( + Box::new(cast(col("c1"), DataType::Int64)), + vec![ + lit(ScalarValue::Int64(Some(1))), + lit(ScalarValue::Int64(Some(2))), + lit(ScalarValue::Int64(Some(3))), + ], + false, + )); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Int64) <= 1 AND 1 <= CAST(c1_max@1 AS Int64) OR c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Int64) <= 2 AND 2 <= CAST(c1_max@1 AS Int64) OR c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Int64) <= 3 AND 3 <= CAST(c1_max@1 AS Int64)"; + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + let expr = Expr::InList(InList::new( + Box::new(cast(col("c1"), DataType::Int64)), + vec![ + lit(ScalarValue::Int64(Some(1))), + lit(ScalarValue::Int64(Some(2))), + lit(ScalarValue::Int64(Some(3))), + ], + true, + )); + let expected_expr = "c1_null_count@2 != row_count@3 AND (CAST(c1_min@0 AS Int64) != 1 OR 1 != CAST(c1_max@1 AS Int64)) AND c1_null_count@2 != row_count@3 AND (CAST(c1_min@0 AS Int64) != 2 OR 2 != CAST(c1_max@1 AS Int64)) AND c1_null_count@2 != row_count@3 AND (CAST(c1_min@0 AS Int64) != 3 OR 3 != CAST(c1_max@1 AS Int64))"; + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn prune_decimal_data() { + // decimal(9,2) + let schema = Arc::new(Schema::new(vec![Field::new( + "s1", + DataType::Decimal128(9, 2), + true, + )])); + + prune_with_expr( + // s1 > 5 + col("s1").gt(lit(ScalarValue::Decimal128(Some(500), 9, 2))), + &schema, + // If the data is written by spark, the physical data type is INT32 in the parquet + // So we use the INT32 type of statistic. 
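+ // (the literal Decimal128(Some(500), 9, 2) is the decimal value 5.00)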
+ &TestStatistics::new().with( + "s1", + ContainerStats::new_i32( + vec![Some(0), Some(4), None, Some(3)], // min + vec![Some(5), Some(6), Some(4), None], // max + ), + ), + &[false, true, false, true], + ); + + prune_with_expr( + // with cast column to other type + cast(col("s1"), DataType::Decimal128(14, 3)) + .gt(lit(ScalarValue::Decimal128(Some(5000), 14, 3))), + &schema, + &TestStatistics::new().with( + "s1", + ContainerStats::new_i32( + vec![Some(0), Some(4), None, Some(3)], // min + vec![Some(5), Some(6), Some(4), None], // max + ), + ), + &[false, true, false, true], + ); + + prune_with_expr( + // with try cast column to other type + try_cast(col("s1"), DataType::Decimal128(14, 3)) + .gt(lit(ScalarValue::Decimal128(Some(5000), 14, 3))), + &schema, + &TestStatistics::new().with( + "s1", + ContainerStats::new_i32( + vec![Some(0), Some(4), None, Some(3)], // min + vec![Some(5), Some(6), Some(4), None], // max + ), + ), + &[false, true, false, true], + ); + + // decimal(18,2) + let schema = Arc::new(Schema::new(vec![Field::new( + "s1", + DataType::Decimal128(18, 2), + true, + )])); + prune_with_expr( + // s1 > 5 + col("s1").gt(lit(ScalarValue::Decimal128(Some(500), 18, 2))), + &schema, + // If the data is written by spark, the physical data type is INT64 in the parquet + // So we use the INT64 type of statistic. + &TestStatistics::new().with( + "s1", + ContainerStats::new_i64( + vec![Some(0), Some(4), None, Some(3)], // min + vec![Some(5), Some(6), Some(4), None], // max + ), + ), + &[false, true, false, true], + ); + + // decimal(23,2) + let schema = Arc::new(Schema::new(vec![Field::new( + "s1", + DataType::Decimal128(23, 2), + true, + )])); + + prune_with_expr( + // s1 > 5 + col("s1").gt(lit(ScalarValue::Decimal128(Some(500), 23, 2))), + &schema, + &TestStatistics::new().with( + "s1", + ContainerStats::new_decimal128( + vec![Some(0), Some(400), None, Some(300)], // min + vec![Some(500), Some(600), Some(400), None], // max + 23, + 2, + ), + ), + &[false, true, false, true], + ); + } + + #[test] + fn prune_api() { + let schema = Arc::new(Schema::new(vec![ + Field::new("s1", DataType::Utf8, true), + Field::new("s2", DataType::Int32, true), + ])); + + let statistics = TestStatistics::new().with( + "s2", + ContainerStats::new_i32( + vec![Some(0), Some(4), None, Some(3)], // min + vec![Some(5), Some(6), None, None], // max + ), + ); + prune_with_expr( + // Prune using s2 > 5 + col("s2").gt(lit(5)), + &schema, + &statistics, + // s2 [0, 5] ==> no rows should pass + // s2 [4, 6] ==> some rows could pass + // No stats for s2 ==> some rows could pass + // s2 [3, None] (null max) ==> some rows could pass + &[false, true, true, true], + ); + + prune_with_expr( + // filter with cast + cast(col("s2"), DataType::Int64).gt(lit(ScalarValue::Int64(Some(5)))), + &schema, + &statistics, + &[false, true, true, true], + ); + } + + #[test] + fn prune_not_eq_data() { + let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)])); + + prune_with_expr( + // Prune using s1 != 'M' + col("s1").not_eq(lit("M")), + &schema, + &TestStatistics::new().with( + "s1", + ContainerStats::new_utf8( + vec![Some("A"), Some("A"), Some("N"), Some("M"), None, Some("A")], // min + vec![Some("Z"), Some("L"), Some("Z"), Some("M"), None, None], // max + ), + ), + // s1 [A, Z] ==> might have values that pass predicate + // s1 [A, L] ==> all rows pass the predicate + // s1 [N, Z] ==> all rows pass the predicate + // s1 [M, M] ==> all rows do not pass the predicate + // No stats for s1 ==> some rows could pass + // 
s1 [A, NULL] (null max) ==> some rows could pass + &[true, true, true, false, true, true], + ); + } + + /// Creates a setup for boolean chunk pruning + /// + /// For predicate "b1" (boolean expr) + /// b1 [false, false] ==> no rows can pass (not keep) + /// b1 [false, true] ==> some rows could pass (must keep) + /// b1 [true, true] ==> all rows must pass (must keep) + /// b1 [NULL, NULL] ==> unknown (must keep) + /// b1 [false, NULL] ==> unknown (must keep) + /// + /// For predicate "!b1" (boolean expr) + /// b1 [false, false] ==> all rows pass (must keep) + /// b1 [false, true] ==> some rows could pass (must keep) + /// b1 [true, true] ==> no rows can pass (not keep) + /// b1 [NULL, NULL] ==> unknown (must keep) + /// b1 [false, NULL] ==> unknown (must keep) + fn bool_setup() -> (SchemaRef, TestStatistics, Vec<bool>, Vec<bool>) { + let schema = + Arc::new(Schema::new(vec![Field::new("b1", DataType::Boolean, true)])); + + let statistics = TestStatistics::new().with( + "b1", + ContainerStats::new_bool( + vec![Some(false), Some(false), Some(true), None, Some(false)], // min + vec![Some(false), Some(true), Some(true), None, None], // max + ), + ); + let expected_true = vec![false, true, true, true, true]; + let expected_false = vec![true, true, false, true, true]; + + (schema, statistics, expected_true, expected_false) + } + + #[test] + fn prune_bool_const_expr() { + let (schema, statistics, _, _) = bool_setup(); + + prune_with_expr( + // true + lit(true), + &schema, + &statistics, + &[true, true, true, true, true], + ); + + prune_with_expr( + // false + lit(false), + &schema, + &statistics, + &[false, false, false, false, false], + ); + } + + #[test] + fn prune_bool_column() { + let (schema, statistics, expected_true, _) = bool_setup(); + + prune_with_expr( + // b1 + col("b1"), + &schema, + &statistics, + &expected_true, + ); + } + + #[test] + fn prune_bool_not_column() { + let (schema, statistics, _, expected_false) = bool_setup(); + + prune_with_expr( + // !b1 + col("b1").not(), + &schema, + &statistics, + &expected_false, + ); + } + + #[test] + fn prune_bool_column_eq_true() { + let (schema, statistics, expected_true, _) = bool_setup(); + + prune_with_expr( + // b1 = true + col("b1").eq(lit(true)), + &schema, + &statistics, + &expected_true, + ); + } + + #[test] + fn prune_bool_not_column_eq_true() { + let (schema, statistics, _, expected_false) = bool_setup(); + + prune_with_expr( + // !b1 = true + col("b1").not().eq(lit(true)), + &schema, + &statistics, + &expected_false, + ); + } + + /// Creates a setup for chunk pruning, modeling an int32 column "i" + /// with 5 different containers (e.g. RowGroups). 
They have [min, + /// max]: + /// + /// i [-5, 5] + /// i [1, 11] + /// i [-11, -1] + /// i [NULL, NULL] + /// i [1, NULL] + fn int32_setup() -> (SchemaRef, TestStatistics) { + let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, true)])); + + let statistics = TestStatistics::new().with( + "i", + ContainerStats::new_i32( + vec![Some(-5), Some(1), Some(-11), None, Some(1)], // min + vec![Some(5), Some(11), Some(-1), None, None], // max + ), + ); + (schema, statistics) + } + + #[test] + fn prune_int32_col_gt_zero() { + let (schema, statistics) = int32_setup(); + + // Expression "i > 0" and "-i < 0" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> all rows must pass (must keep) + // i [-11, -1] ==> no rows can pass (not keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> unknown (must keep) + let expected_ret = &[true, true, false, true, true]; + + // i > 0 + prune_with_expr(col("i").gt(lit(0)), &schema, &statistics, expected_ret); + + // -i < 0 + prune_with_expr( + Expr::Negative(Box::new(col("i"))).lt(lit(0)), + &schema, + &statistics, + expected_ret, + ); + } + + #[test] + fn prune_int32_col_lte_zero() { + let (schema, statistics) = int32_setup(); + + // Expression "i <= 0" and "-i >= 0" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> no rows can pass (not keep) + // i [-11, -1] ==> all rows must pass (must keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> no rows can pass (not keep) + let expected_ret = &[true, false, true, true, false]; + + prune_with_expr( + // i <= 0 + col("i").lt_eq(lit(0)), + &schema, + &statistics, + expected_ret, + ); + + prune_with_expr( + // -i >= 0 + Expr::Negative(Box::new(col("i"))).gt_eq(lit(0)), + &schema, + &statistics, + expected_ret, + ); + } + + #[test] + fn prune_int32_col_lte_zero_cast() { + let (schema, statistics) = int32_setup(); + + // Expression "cast(i as utf8) <= '0'" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> no rows can pass numerically, but the utf8 cast is not order preserving (conservatively keep) + // i [-11, -1] ==> all rows pass numerically, but likewise this cannot be proven through the cast (conservatively keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> no rows can pass (conservatively keep) + let expected_ret = &[true, true, true, true, true]; + + prune_with_expr( + // cast(i as utf8) <= 0 + cast(col("i"), DataType::Utf8).lt_eq(lit("0")), + &schema, + &statistics, + expected_ret, + ); + + prune_with_expr( + // try_cast(i as utf8) <= 0 + try_cast(col("i"), DataType::Utf8).lt_eq(lit("0")), + &schema, + &statistics, + expected_ret, + ); + + prune_with_expr( + // cast(-i as utf8) >= 0 + cast(Expr::Negative(Box::new(col("i"))), DataType::Utf8).gt_eq(lit("0")), + &schema, + &statistics, + expected_ret, + ); + + prune_with_expr( + // try_cast(-i as utf8) >= 0 + try_cast(Expr::Negative(Box::new(col("i"))), DataType::Utf8).gt_eq(lit("0")), + &schema, + &statistics, + expected_ret, + ); + } + + #[test] + fn prune_int32_col_eq_zero() { + let (schema, statistics) = int32_setup(); + + // Expression "i = 0" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> no rows can pass (not keep) + // i [-11, -1] ==> no rows can pass (not keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> no rows can pass (not keep) + let expected_ret = &[true, false, false, true, false]; + + prune_with_expr( + // i = 0 + col("i").eq(lit(0)), + &schema, + &statistics, + expected_ret, + ); + } + + #[test] + fn prune_int32_col_eq_zero_cast() { + let (schema, statistics) = 
int32_setup(); + + // Expression "cast(i as int64) = 0" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> no rows can pass (not keep) + // i [-11, -1] ==> no rows can pass (not keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> no rows can pass (not keep) + let expected_ret = &[true, false, false, true, false]; + + prune_with_expr( + cast(col("i"), DataType::Int64).eq(lit(0i64)), + &schema, + &statistics, + expected_ret, + ); + + prune_with_expr( + try_cast(col("i"), DataType::Int64).eq(lit(0i64)), + &schema, + &statistics, + expected_ret, + ); + } + + #[test] + fn prune_int32_col_eq_zero_cast_as_str() { + let (schema, statistics) = int32_setup(); + + // Note the cast is to a string where sorting properties are + // not the same as integers + // + // Expression "cast(i as utf8) = '0'" + // i [-5, 5] ==> some rows could pass (keep) + // i [1, 11] ==> no rows can pass (could keep) + // i [-11, -1] ==> no rows can pass (could keep) + // i [NULL, NULL] ==> unknown (keep) + // i [1, NULL] ==> no rows can pass (could keep) + let expected_ret = &[true, true, true, true, true]; + + prune_with_expr( + cast(col("i"), DataType::Utf8).eq(lit("0")), + &schema, + &statistics, + expected_ret, + ); + } + + #[test] + fn prune_int32_col_lt_neg_one() { + let (schema, statistics) = int32_setup(); + + // Expression "i > -1" and "-i < 1" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> all rows must pass (must keep) + // i [-11, -1] ==> no rows can pass (not keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> all rows must pass (must keep) + let expected_ret = &[true, true, false, true, true]; + + prune_with_expr( + // i > -1 + col("i").gt(lit(-1)), + &schema, + &statistics, + expected_ret, + ); + + prune_with_expr( + // -i < 1 + Expr::Negative(Box::new(col("i"))).lt(lit(1)), + &schema, + &statistics, + expected_ret, + ); + } + + #[test] + fn prune_int32_is_null() { + let (schema, statistics) = int32_setup(); + + // Expression "i IS NULL" when there are no null statistics, + // should all be kept + let expected_ret = &[true, true, true, true, true]; + + prune_with_expr( + // i IS NULL, no null statistics + col("i").is_null(), + &schema, + &statistics, + expected_ret, + ); + + // provide null counts for each column + let statistics = statistics.with_null_counts( + "i", + vec![ + Some(0), // no nulls (don't keep) + Some(1), // 1 null + None, // unknown nulls + None, // unknown nulls (min/max are both null too, like no stats at all) + Some(0), // 0 nulls (max=null too which means no known max) (don't keep) + ], + ); + + let expected_ret = &[false, true, true, true, false]; + + prune_with_expr( + // i IS NULL, with actual null statistics + col("i").is_null(), + &schema, + &statistics, + expected_ret, + ); + } + + #[test] + fn prune_int32_column_is_known_all_null() { + let (schema, statistics) = int32_setup(); + + // Expression "i < 0" + // i [-5, 5] ==> some rows could pass (must keep) + // i [1, 11] ==> no rows can pass (not keep) + // i [-11, -1] ==> all rows must pass (must keep) + // i [NULL, NULL] ==> unknown (must keep) + // i [1, NULL] ==> no rows can pass (not keep) + let expected_ret = &[true, false, true, true, false]; + + prune_with_expr( + // i < 0 + col("i").lt(lit(0)), + &schema, + &statistics, + expected_ret, + ); + + // provide row counts for each column + let statistics = statistics.with_row_counts( + "i", + vec![ + Some(10), // 10 rows of data + Some(9), // 9 rows of data + None, // unknown row counts + Some(4), + 
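// (4 rows for the all-null [NULL, NULL] container above; 10 rows for the [1, NULL] container below) + 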
Some(10), + ], + ); + + // pruning result is still the same if we only know row counts + prune_with_expr( + // i < 0, with only row counts statistics + col("i").lt(lit(0)), + &schema, + &statistics, + expected_ret, + ); + + // provide null counts for each column + let statistics = statistics.with_null_counts( + "i", + vec![ + Some(0), // no nulls + Some(1), // 1 null + None, // unknown nulls + Some(4), // 4 nulls, which is the same as the row counts, i.e. this column is all null (don't keep) + Some(0), // 0 nulls (max=null too which means no known max) + ], + ); + + // Expression "i < 0" with actual null and row counts statistics + // col | min, max | row counts | null counts | + // ----+--------------+------------+-------------+ + // i | [-5, 5] | 10 | 0 | ==> Some rows could pass (must keep) + // i | [1, 11] | 9 | 1 | ==> No rows can pass (not keep) + // i | [-11,-1] | Unknown | Unknown | ==> All rows must pass (must keep) + // i | [NULL, NULL] | 4 | 4 | ==> The column is all null (not keep) + // i | [1, NULL] | 10 | 0 | ==> No rows can pass (not keep) + let expected_ret = &[true, false, true, false, false]; + + prune_with_expr( + // i < 0, with actual null and row counts statistics + col("i").lt(lit(0)), + &schema, + &statistics, + expected_ret, + ); + } + + #[test] + fn prune_cast_column_scalar() { + // The data type of column i is INT32 + let (schema, statistics) = int32_setup(); + let expected_ret = &[true, true, false, true, true]; + + prune_with_expr( + // i > int64(0) + col("i").gt(cast(lit(ScalarValue::Int64(Some(0))), DataType::Int32)), + &schema, + &statistics, + expected_ret, + ); + + prune_with_expr( + // cast(i as int64) > int64(0) + cast(col("i"), DataType::Int64).gt(lit(ScalarValue::Int64(Some(0)))), + &schema, + &statistics, + expected_ret, + ); + + prune_with_expr( + // try_cast(i as int64) > int64(0) + try_cast(col("i"), DataType::Int64).gt(lit(ScalarValue::Int64(Some(0)))), + &schema, + &statistics, + expected_ret, + ); + + prune_with_expr( + // `-cast(i as int64) < 0` is converted to `cast(i as int64) > 0` + Expr::Negative(Box::new(cast(col("i"), DataType::Int64))) + .lt(lit(ScalarValue::Int64(Some(0)))), + &schema, + &statistics, + expected_ret, + ); + } + + #[test] + fn test_increment_utf8() { + // Basic ASCII + assert_eq!(increment_utf8("abc").unwrap(), "abd"); + assert_eq!(increment_utf8("abz").unwrap(), "ab{"); + + // Test around ASCII 127 (DEL) + assert_eq!(increment_utf8("~").unwrap(), "\u{7f}"); // 126 -> 127 + assert_eq!(increment_utf8("\u{7f}").unwrap(), "\u{80}"); // 127 -> 128 + + // Test 2-byte UTF-8 sequences + assert_eq!(increment_utf8("ß").unwrap(), "à"); // U+00DF -> U+00E0 + + // Test 3-byte UTF-8 sequences + assert_eq!(increment_utf8("℣").unwrap(), "ℤ"); // U+2123 -> U+2124 + + // Test at UTF-8 boundaries + assert_eq!(increment_utf8("\u{7FF}").unwrap(), "\u{800}"); // 2-byte to 3-byte boundary + assert_eq!(increment_utf8("\u{FFFF}").unwrap(), "\u{10000}"); // 3-byte to 4-byte boundary + + // Test that if we can't increment we return None + assert!(increment_utf8("").is_none()); + assert!(increment_utf8("\u{10FFFF}").is_none()); // U+10FFFF is the max code point + + // Test that if we can't increment the last character we increment the previous one and truncate + assert_eq!(increment_utf8("a\u{10FFFF}").unwrap(), "b"); + + // Test surrogate pair range (0xD800..=0xDFFF) + assert_eq!(increment_utf8("a\u{D7FF}").unwrap(), "b"); + assert!(increment_utf8("\u{D7FF}").is_none()); + + // Test non-characters range (0xFDD0..=0xFDEF) + 
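// (one past \u{FDCF} is the non-character \u{FDD0}, so the last char is dropped and 'a' is incremented instead) + 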
assert_eq!(increment_utf8("a\u{FDCF}").unwrap(), "b"); + assert!(increment_utf8("\u{FDCF}").is_none()); + + // Test private use area limit (>= 0x110000) + assert_eq!(increment_utf8("a\u{10FFFF}").unwrap(), "b"); + assert!(increment_utf8("\u{10FFFF}").is_none()); // Can't increment past max valid codepoint + } + + /// Creates a setup for chunk pruning, modeling a utf8 column "s1" + /// with 5 different containers (e.g. RowGroups). They have [min, + /// max]: + /// s1 ["A", "Z"] + /// s1 ["A", "L"] + /// s1 ["N", "Z"] + /// s1 [NULL, NULL] + /// s1 ["A", NULL] + /// s1 ["", "A"] + /// s1 ["", ""] + /// s1 ["AB", "A\u{10ffff}"] + /// s1 ["A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] + fn utf8_setup() -> (SchemaRef, TestStatistics) { + let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)])); + + let statistics = TestStatistics::new().with( + "s1", + ContainerStats::new_utf8( + vec![ + Some("A"), + Some("A"), + Some("N"), + Some("M"), + None, + Some("A"), + Some(""), + Some(""), + Some("AB"), + Some("A\u{10ffff}\u{10ffff}"), + ], // min + vec![ + Some("Z"), + Some("L"), + Some("Z"), + Some("M"), + None, + None, + Some("A"), + Some(""), + Some("A\u{10ffff}\u{10ffff}\u{10ffff}"), + Some("A\u{10ffff}\u{10ffff}"), + ], // max + ), + ); + (schema, statistics) + } + + #[test] + fn prune_utf8_eq() { + let (schema, statistics) = utf8_setup(); + + let expr = col("s1").eq(lit("A")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["A", "L"] ==> some rows could pass (must keep) + true, + // s1 ["N", "Z"] ==> no rows can pass (not keep) + false, + // s1 ["M", "M"] ==> no rows can pass (not keep) + false, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> unknown (must keep) + true, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> no rows can pass (not keep) + false, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) + false, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) + false, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + + let expr = col("s1").eq(lit("")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> no rows can pass (not keep) + false, + // s1 ["A", "L"] ==> no rows can pass (not keep) + false, + // s1 ["N", "Z"] ==> no rows can pass (not keep) + false, + // s1 ["M", "M"] ==> no rows can pass (not keep) + false, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> no rows can pass (not keep) + false, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> all rows must pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) + false, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) + false, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + } + + #[test] + fn prune_utf8_not_eq() { + let (schema, statistics) = utf8_setup(); + + let expr = col("s1").not_eq(lit("A")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["A", "L"] ==> some rows could pass (must keep) + true, + // s1 ["N", "Z"] ==> all rows must pass (must keep) + true, + // s1 ["M", "M"] ==> all rows must pass (must keep) + true, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> unknown (must keep) + true, + // 
s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> all rows must pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep) + true, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep) + true, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + + let expr = col("s1").not_eq(lit("")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> all rows must pass (must keep) + true, + // s1 ["A", "L"] ==> all rows must pass (must keep) + true, + // s1 ["N", "Z"] ==> all rows must pass (must keep) + true, + // s1 ["M", "M"] ==> all rows must pass (must keep) + true, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> unknown (must keep) + true, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> no rows can pass (not keep) + false, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep) + true, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep) + true, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + } + + #[test] + fn prune_utf8_like_one() { + let (schema, statistics) = utf8_setup(); + + let expr = col("s1").like(lit("A_")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["A", "L"] ==> some rows could pass (must keep) + true, + // s1 ["N", "Z"] ==> no rows can pass (not keep) + false, + // s1 ["M", "M"] ==> no rows can pass (not keep) + false, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> unknown (must keep) + true, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> no rows can pass (not keep) + false, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + + let expr = col("s1").like(lit("_A_")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["A", "L"] ==> some rows could pass (must keep) + true, + // s1 ["N", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["M", "M"] ==> some rows could pass (must keep) + true, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> unknown (must keep) + true, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> some rows could pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + + let expr = col("s1").like(lit("_")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> all rows must pass (must keep) + true, + // s1 ["A", "L"] ==> all rows must pass (must keep) + true, + // s1 ["N", "Z"] ==> all rows must pass (must keep) + true, + // s1 ["M", "M"] ==> all rows must pass (must keep) + true, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> unknown (must keep) + true, + // s1 ["", "A"] ==> all rows must pass (must keep) + true, + // s1 ["", ""] ==> all rows must pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all 
+
+    #[test]
+    fn prune_utf8_like_many() {
+        let (schema, statistics) = utf8_setup();
+
+        let expr = col("s1").like(lit("A%"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["M", "M"] ==> no rows can pass (not keep)
+            false,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> unknown (must keep)
+            true,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> no rows can pass (not keep)
+            false,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").like(lit("%A%"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> some rows could pass (must keep)
+            true,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> unknown (must keep)
+            true,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> some rows could pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").like(lit("%"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> all rows must pass (must keep)
+            true,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> unknown (must keep)
+            true,
+            // s1 ["", "A"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["", ""] ==> all rows must pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> all rows must pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").like(lit(""));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["A", "L"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["N", "Z"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["M", "M"] ==> no rows can pass (not keep)
+            false,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> no rows can pass (not keep)
+            false,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> all rows must pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep)
+            false,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep)
+            false,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+    }
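+
+    // Note on the cases above: a pattern that begins with a wildcard ("%A%",
+    // "%", "_A_") has an empty literal prefix, so min/max statistics give the
+    // pruner nothing to compare against and every container must be kept. The
+    // empty pattern `''` is the exception: it matches only the empty string,
+    // so any container whose min is greater than "" can be pruned.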
&schema, &statistics, expected_ret); + + let expr = col("s1").like(lit("")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> no rows can pass (not keep) + false, + // s1 ["A", "L"] ==> no rows can pass (not keep) + false, + // s1 ["N", "Z"] ==> no rows can pass (not keep) + false, + // s1 ["M", "M"] ==> no rows can pass (not keep) + false, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> no rows can pass (not keep) + false, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> all rows must pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) + false, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows can pass (not keep) + false, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + } + + #[test] + fn prune_utf8_not_like_one() { + let (schema, statistics) = utf8_setup(); + + let expr = col("s1").not_like(lit("A\u{10ffff}_")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["A", "L"] ==> some rows could pass (must keep) + true, + // s1 ["N", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["M", "M"] ==> some rows could pass (must keep) + true, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> some rows could pass (must keep) + true, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> some rows could pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match. (min, max) maybe truncate + // orignal (min, max) maybe ("A\u{10ffff}\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}\u{10ffff}\u{10ffff}") + true, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + } + + #[test] + fn prune_utf8_not_like_many() { + let (schema, statistics) = utf8_setup(); + + let expr = col("s1").not_like(lit("A\u{10ffff}%")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["A", "L"] ==> some rows could pass (must keep) + true, + // s1 ["N", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["M", "M"] ==> some rows could pass (must keep) + true, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> some rows could pass (must keep) + true, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> some rows could pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep) + true, + // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no row match + false, + ]; + prune_with_expr(expr, &schema, &statistics, expected_ret); + + let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}")); + #[rustfmt::skip] + let expected_ret = &[ + // s1 ["A", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["A", "L"] ==> some rows could pass (must keep) + true, + // s1 ["N", "Z"] ==> some rows could pass (must keep) + true, + // s1 ["M", "M"] ==> some rows could pass (must keep) + true, + // s1 [NULL, NULL] ==> unknown (must keep) + true, + // s1 ["A", NULL] ==> some rows could pass (must keep) + true, + // s1 ["", "A"] ==> some rows could pass (must keep) + true, + // s1 ["", ""] ==> some rows could pass (must keep) + true, + // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must 
+
+    #[test]
+    fn prune_utf8_not_like_many() {
+        let (schema, statistics) = utf8_setup();
+
+        let expr = col("s1").not_like(lit("A\u{10ffff}%"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> some rows could pass (must keep)
+            true,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> some rows could pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> no rows match (not keep)
+            false,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> some rows could pass (must keep)
+            true,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> some rows could pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        let expr = col("s1").not_like(lit("A\u{10ffff}%\u{10ffff}_"));
+        #[rustfmt::skip]
+        let expected_ret = &[
+            // s1 ["A", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A", "L"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["N", "Z"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["M", "M"] ==> some rows could pass (must keep)
+            true,
+            // s1 [NULL, NULL] ==> unknown (must keep)
+            true,
+            // s1 ["A", NULL] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", "A"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["", ""] ==> some rows could pass (must keep)
+            true,
+            // s1 ["AB", "A\u{10ffff}\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+            // s1 ["A\u{10ffff}\u{10ffff}", "A\u{10ffff}\u{10ffff}"] ==> some rows could pass (must keep)
+            true,
+        ];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+
+        // `A\%%` is the literal prefix "A%" followed by a multi-character
+        // wildcard: every value in container 0 (["A%a", "A%c"]) must start
+        // with "A%" and so matches the pattern, letting `NOT LIKE` prune it;
+        // container 1 (["A", "A"]) cannot match the pattern, so it is kept
+        let expr = col("s1").not_like(lit("A\\%%"));
+        let statistics = TestStatistics::new().with(
+            "s1",
+            ContainerStats::new_utf8(
+                vec![Some("A%a"), Some("A")],
+                vec![Some("A%c"), Some("A")],
+            ),
+        );
+        let expected_ret = &[false, true];
+        prune_with_expr(expr, &schema, &statistics, expected_ret);
+    }
+
+    #[test]
+    fn test_rewrite_expr_to_prunable() {
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let df_schema = DFSchema::try_from(schema.clone()).unwrap();
+
+        // column op lit
+        let left_input = col("a");
+        let left_input = logical2physical(&left_input, &schema);
+        let right_input = lit(ScalarValue::Int32(Some(12)));
+        let right_input = logical2physical(&right_input, &schema);
+        let (result_left, _, result_right) = rewrite_expr_to_prunable(
+            &left_input,
+            Operator::Eq,
+            &right_input,
+            df_schema.clone(),
+        )
+        .unwrap();
+        assert_eq!(result_left.to_string(), left_input.to_string());
+        assert_eq!(result_right.to_string(), right_input.to_string());
+
+        // cast op lit
+        let left_input = cast(col("a"), DataType::Decimal128(20, 3));
+        let left_input = logical2physical(&left_input, &schema);
+        let right_input = lit(ScalarValue::Decimal128(Some(12), 20, 3));
+        let right_input = logical2physical(&right_input, &schema);
+        let (result_left, _, result_right) = rewrite_expr_to_prunable(
+            &left_input,
+            Operator::Gt,
+            &right_input,
+            df_schema.clone(),
+        )
+        .unwrap();
+        assert_eq!(result_left.to_string(), left_input.to_string());
+        assert_eq!(result_right.to_string(), right_input.to_string());
+
+        // try_cast op lit
+        let left_input = try_cast(col("a"), DataType::Int64);
+        let left_input = logical2physical(&left_input, &schema);
+        let right_input = lit(ScalarValue::Int64(Some(12)));
+        let right_input = logical2physical(&right_input, &schema);
+        let (result_left, _, result_right) =
+            rewrite_expr_to_prunable(&left_input, Operator::Gt, &right_input, df_schema)
+                .unwrap();
+        assert_eq!(result_left.to_string(), left_input.to_string());
+        assert_eq!(result_right.to_string(), right_input.to_string());
+
+        // TODO: add tests for other cases and operators
+    }
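+
+    // Why casts are allowed through the rewrite above (our reading, hedged):
+    // for an order-preserving cast, min/max survive the cast, i.e.
+    // `min(cast(a)) == cast(min(a))`, so a predicate like `cast(a) > 12` can
+    // be checked against `cast(a_max)`. A cast such as Utf8 -> Int64 does not
+    // preserve ordering, which is why `test_rewrite_expr_to_prunable_error`
+    // below rejects it.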
+
+    #[test]
+    fn test_rewrite_expr_to_prunable_custom_unhandled_hook() {
+        struct CustomUnhandledHook;
+
+        impl UnhandledPredicateHook for CustomUnhandledHook {
+            /// Rewrites any expression the rewriter could not handle to the
+            /// literal `42` (the particular replacement is arbitrary; the
+            /// point is that the hook can return whatever it wants)
+            fn handle(&self, _expr: &Arc<dyn PhysicalExpr>) -> Arc<dyn PhysicalExpr> {
+                Arc::new(phys_expr::Literal::new(ScalarValue::Int32(Some(42))))
+            }
+        }
+
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let schema_with_b = Schema::new(vec![
+            Field::new("a", DataType::Int32, true),
+            Field::new("b", DataType::Int32, true),
+        ]);
+
+        let rewriter = PredicateRewriter::new()
+            .with_unhandled_hook(Arc::new(CustomUnhandledHook {}));
+
+        let transform_expr = |expr| {
+            let expr = logical2physical(&expr, &schema_with_b);
+            rewriter.rewrite_predicate_to_statistics_predicate(&expr, &schema)
+        };
+
+        // transform an arbitrary valid expression that we know is handled
+        let known_expression = col("a").eq(lit(12));
+        let known_expression_transformed = PredicateRewriter::new()
+            .rewrite_predicate_to_statistics_predicate(
+                &logical2physical(&known_expression, &schema),
+                &schema,
+            );
+
+        // an expression referencing an unknown column (one that is not in the
+        // schema) gets passed to the hook
+        let input = col("b").eq(lit(12));
+        let expected = logical2physical(&lit(42), &schema);
+        let transformed = transform_expr(input.clone());
+        assert_eq!(transformed.to_string(), expected.to_string());
+
+        // more complex case with an unknown column
+        let input = known_expression.clone().and(input.clone());
+        let expected = phys_expr::BinaryExpr::new(
+            Arc::<dyn PhysicalExpr>::clone(&known_expression_transformed),
+            Operator::And,
+            logical2physical(&lit(42), &schema),
+        );
+        let transformed = transform_expr(input.clone());
+        assert_eq!(transformed.to_string(), expected.to_string());
+
+        // an unknown expression gets passed to the hook
+        let input = array_has(make_array(vec![lit(1)]), col("a"));
+        let expected = logical2physical(&lit(42), &schema);
+        let transformed = transform_expr(input.clone());
+        assert_eq!(transformed.to_string(), expected.to_string());
+
+        // more complex case with an unknown expression
+        let input = known_expression.and(input);
+        let expected = phys_expr::BinaryExpr::new(
+            Arc::<dyn PhysicalExpr>::clone(&known_expression_transformed),
+            Operator::And,
+            logical2physical(&lit(42), &schema),
+        );
+        let transformed = transform_expr(input.clone());
+        assert_eq!(transformed.to_string(), expected.to_string());
+    }
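+
+    // A hedged sketch of a more realistic hook (illustrative only, not used by
+    // any test here): a conservative implementation would rewrite every
+    // unhandled predicate to the literal `true`, so containers referenced by
+    // predicates the pruner cannot analyze are always kept rather than
+    // wrongly pruned.
+    #[allow(dead_code)]
+    struct KeepContainersHook;
+
+    impl UnhandledPredicateHook for KeepContainersHook {
+        fn handle(&self, _expr: &Arc<dyn PhysicalExpr>) -> Arc<dyn PhysicalExpr> {
+            // `true` means "this container might match"; it never prunes
+            Arc::new(phys_expr::Literal::new(ScalarValue::Boolean(Some(true))))
+        }
+    }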
+
+    #[test]
+    fn test_rewrite_expr_to_prunable_error() {
+        // cast a string value to a numeric value: this cast is not supported
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
+        let df_schema = DFSchema::try_from(schema.clone()).unwrap();
+        let left_input = cast(col("a"), DataType::Int64);
+        let left_input = logical2physical(&left_input, &schema);
+        let right_input = lit(ScalarValue::Int64(Some(12)));
+        let right_input = logical2physical(&right_input, &schema);
+        let result = rewrite_expr_to_prunable(
+            &left_input,
+            Operator::Gt,
+            &right_input,
+            df_schema.clone(),
+        );
+        assert!(result.is_err());
+
+        // other expr
+        let left_input = is_null(col("a"));
+        let left_input = logical2physical(&left_input, &schema);
+        let right_input = lit(ScalarValue::Int64(Some(12)));
+        let right_input = logical2physical(&right_input, &schema);
+        let result =
+            rewrite_expr_to_prunable(&left_input, Operator::Gt, &right_input, df_schema);
+        assert!(result.is_err());
+        // TODO: add other negative tests for other cases and operators
+    }
+
+    #[test]
+    fn prune_with_contained_one_column() {
+        let schema = Arc::new(Schema::new(vec![Field::new("s1", DataType::Utf8, true)]));
+
+        // Model having information like a bloom filter for s1
+        let statistics = TestStatistics::new()
+            .with_contained(
+                "s1",
+                [ScalarValue::from("foo")],
+                [
+                    // container 0 known to only contain "foo"
+                    Some(true),
+                    // container 1 known to not contain "foo"
+                    Some(false),
+                    // container 2 unknown about "foo"
+                    None,
+                    // container 3 known to only contain "foo"
+                    Some(true),
+                    // container 4 known to not contain "foo"
+                    Some(false),
+                    // container 5 unknown about "foo"
+                    None,
+                    // container 6 known to only contain "foo"
+                    Some(true),
+                    // container 7 known to not contain "foo"
+                    Some(false),
+                    // container 8 unknown about "foo"
+                    None,
+                ],
+            )
+            .with_contained(
+                "s1",
+                [ScalarValue::from("bar")],
+                [
+                    // containers 0, 1, 2 known to only contain "bar"
+                    Some(true),
+                    Some(true),
+                    Some(true),
+                    // containers 3, 4, 5 known to not contain "bar"
+                    Some(false),
+                    Some(false),
+                    Some(false),
+                    // containers 6, 7, 8 unknown about "bar"
+                    None,
+                    None,
+                    None,
+                ],
+            )
+            .with_contained(
+                // the way the tests are set up, this data is consulted when
+                // "foo" and "bar" are checked at the same time
+                "s1",
+                [ScalarValue::from("foo"), ScalarValue::from("bar")],
+                [
+                    // containers 0, 1, 2 unknown about ("foo", "bar")
+                    None,
+                    None,
+                    None,
+                    // containers 3, 4, 5 known to contain only "foo" or "bar"
+                    Some(true),
+                    Some(true),
+                    Some(true),
+                    // containers 6, 7, 8 known to contain neither "foo" nor "bar"
+                    Some(false),
+                    Some(false),
+                    Some(false),
+                ],
+            );
+
+        // s1 = 'foo'
+        prune_with_expr(
+            col("s1").eq(lit("foo")),
+            &schema,
+            &statistics,
+            // rule out containers (false) where we know "foo" is not present
+            &[true, false, true, true, false, true, true, false, true],
+        );
+
+        // s1 = 'bar'
+        prune_with_expr(
+            col("s1").eq(lit("bar")),
+            &schema,
+            &statistics,
+            // rule out containers where we know "bar" is not present
+            &[true, true, true, false, false, false, true, true, true],
+        );
+
+        // s1 = 'baz' (unknown value)
+        prune_with_expr(
+            col("s1").eq(lit("baz")),
+            &schema,
+            &statistics,
+            // can't rule out anything
+            &[true, true, true, true, true, true, true, true, true],
+        );
+
+        // s1 = 'foo' AND s1 = 'bar'
+        prune_with_expr(
+            col("s1").eq(lit("foo")).and(col("s1").eq(lit("bar"))),
+            &schema,
+            &statistics,
+            // logically this predicate can't possibly be true (the column can't
+            // take on both values) but we could only rule it out if the stats
+            // told us that both values are not present
+            &[true, true, true, true, true, true, true, true, true],
+        );
+
+        // s1 = 'foo' OR s1 = 'bar'
+        prune_with_expr(
+            col("s1").eq(lit("foo")).or(col("s1").eq(lit("bar"))),
+            &schema,
+            &statistics,
+            // can rule out containers that we know contain neither "foo" nor "bar"
+            &[true, true, true, true, true, true, false, false, false],
+        );
+
+        // s1 = 'foo' OR s1 = 'baz'
+        prune_with_expr(
+            col("s1").eq(lit("foo")).or(col("s1").eq(lit("baz"))),
+            &schema,
+            &statistics,
+            // can't rule out any container
+            &[true, true, true, true, true, true, true, true, true],
+        );
+
+        // s1 = 'foo' OR s1 = 'bar' OR s1 = 'baz'
+        prune_with_expr(
+            col("s1")
+                .eq(lit("foo"))
+                .or(col("s1").eq(lit("bar")))
+                .or(col("s1").eq(lit("baz"))),
+            &schema,
+            &statistics,
+            // can't rule out any container, even with knowledge of s1 vs `foo`,
+            // `bar` and (`foo`, `bar`), because `baz` is unknown
+            &[true, true, true, true, true, true, true, true, true],
+        );
+
+        // s1 != foo
+        prune_with_expr(
+            col("s1").not_eq(lit("foo")),
+            &schema,
+            &statistics,
+            // rule out containers we know for sure only contain "foo"
+            &[false, true, true, false, true, true, false, true, true],
+        );
+
+        // s1 != bar
+        prune_with_expr(
+            col("s1").not_eq(lit("bar")),
+            &schema,
+            &statistics,
+            // rule out containers where we know for sure s1 only has the value "bar"
+            &[false, false, false, true, true, true, true, true, true],
+        );
+
+        // s1 != foo AND s1 != bar
+        prune_with_expr(
+            col("s1")
+                .not_eq(lit("foo"))
+                .and(col("s1").not_eq(lit("bar"))),
+            &schema,
+            &statistics,
+            // can rule out any container where we know s1 contains only the
+            // values 'foo' and/or 'bar'
+            &[true, true, true, false, false, false, true, true, true],
+        );
+
+        // s1 != foo AND s1 != bar AND s1 != baz
+        prune_with_expr(
+            col("s1")
+                .not_eq(lit("foo"))
+                .and(col("s1").not_eq(lit("bar")))
+                .and(col("s1").not_eq(lit("baz"))),
+            &schema,
+            &statistics,
+            // can't rule out any container based on our knowledge of s1
+            &[true, true, true, true, true, true, true, true, true],
+        );
+
+        // s1 != foo OR s1 != bar
+        prune_with_expr(
+            col("s1")
+                .not_eq(lit("foo"))
+                .or(col("s1").not_eq(lit("bar"))),
+            &schema,
+            &statistics,
+            // can't rule out anything based on the contained information
+            &[true, true, true, true, true, true, true, true, true],
+        );
+
+        // s1 != foo OR s1 != bar OR s1 != baz
+        prune_with_expr(
+            col("s1")
+                .not_eq(lit("foo"))
+                .or(col("s1").not_eq(lit("bar")))
+                .or(col("s1").not_eq(lit("baz"))),
+            &schema,
+            &statistics,
+            // can't rule out anything based on the contained information
+            &[true, true, true, true, true, true, true, true, true],
+        );
+    }
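+
+    // A hedged sketch (illustrative only; the trait is re-exported from
+    // `datafusion_common::pruning`) of how a bloom-filter-like source plugs in
+    // via `PruningStatistics::contained`. Backed by exact per-container value
+    // sets, it would return `Some(true)` when a container holds *only* the
+    // probed values, `Some(false)` when it holds none of them, and `None`
+    // when unknown:
+    //
+    //     struct ExactSets {
+    //         per_container: Vec<Option<HashSet<ScalarValue>>>,
+    //     }
+    //
+    //     impl ExactSets {
+    //         fn contained_for_column(
+    //             &self,
+    //             values: &HashSet<ScalarValue>,
+    //         ) -> Option<BooleanArray> {
+    //             Some(
+    //                 self.per_container
+    //                     .iter()
+    //                     .map(|set| {
+    //                         let set = set.as_ref()?;
+    //                         if set.is_subset(values) {
+    //                             Some(true) // only the probed values
+    //                         } else if set.is_disjoint(values) {
+    //                             Some(false) // none of the probed values
+    //                         } else {
+    //                             None // unknown
+    //                         }
+    //                     })
+    //                     .collect(),
+    //             )
+    //         }
+    //     }
+    //
+    // The min/max, null-count and row-count accessors of the trait would
+    // return `None` for such a source.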
+
+    #[test]
+    fn prune_with_contained_two_columns() {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("s1", DataType::Utf8, true),
+            Field::new("s2", DataType::Utf8, true),
+        ]));
+
+        // Model having information like bloom filters for s1 and s2
+        let statistics = TestStatistics::new()
+            .with_contained(
+                "s1",
+                [ScalarValue::from("foo")],
+                [
+                    // container 0: s1 known to only contain "foo"
+                    Some(true),
+                    // container 1: s1 known to not contain "foo"
+                    Some(false),
+                    // container 2: s1 unknown about "foo"
+                    None,
+                    // container 3: s1 known to only contain "foo"
+                    Some(true),
+                    // container 4: s1 known to not contain "foo"
+                    Some(false),
+                    // container 5: s1 unknown about "foo"
+                    None,
+                    // container 6: s1 known to only contain "foo"
+                    Some(true),
+                    // container 7: s1 known to not contain "foo"
+                    Some(false),
+                    // container 8: s1 unknown about "foo"
+                    None,
+                ],
+            )
+            .with_contained(
+                "s2", // for column s2
+                [ScalarValue::from("bar")],
+                [
+                    // containers 0, 1, 2: s2 known to only contain "bar"
+                    Some(true),
+                    Some(true),
+                    Some(true),
+                    // containers 3, 4, 5: s2 known to not contain "bar"
+                    Some(false),
+                    Some(false),
+                    Some(false),
+                    // containers 6, 7, 8: s2 unknown about "bar"
+                    None,
+                    None,
+                    None,
+                ],
+            );
+
+        // s1 = 'foo'
+        prune_with_expr(
+            col("s1").eq(lit("foo")),
+            &schema,
+            &statistics,
+            // rule out containers where we know "foo" is not present in s1
+            &[true, false, true, true, false, true, true, false, true],
+        );
+
+        // s1 = 'foo' OR s2 = 'bar'
+        let expr = col("s1").eq(lit("foo")).or(col("s2").eq(lit("bar")));
+        prune_with_expr(
+            expr,
+            &schema,
+            &statistics,
+            // can't rule out any container (we would need to prove that both
+            // s1 != foo AND s2 != bar)
+            &[true, true, true, true, true, true, true, true, true],
+        );
+
+        // s1 = 'foo' AND s2 != 'bar'
+        prune_with_expr(
+            col("s1").eq(lit("foo")).and(col("s2").not_eq(lit("bar"))),
+            &schema,
+            &statistics,
+            // can only rule out containers where we know either:
+            // 1. s1 doesn't have the value 'foo', or
+            // 2. s2 only has the value 'bar'
+            &[false, false, false, true, false, true, true, false, true],
+        );
+
+        // s1 != 'foo' AND s2 != 'bar'
+        prune_with_expr(
+            col("s1")
+                .not_eq(lit("foo"))
+                .and(col("s2").not_eq(lit("bar"))),
+            &schema,
+            &statistics,
+            // can rule out any container where we know either:
+            // 1. s1 only has the value 'foo', or
+            // 2. s2 only has the value 'bar'
+            &[false, false, false, false, true, true, false, true, true],
+        );
+
+        // s1 != 'foo' AND (s2 = 'bar' OR s2 = 'baz')
+        prune_with_expr(
+            col("s1")
+                .not_eq(lit("foo"))
+                .and(col("s2").eq(lit("bar")).or(col("s2").eq(lit("baz")))),
+            &schema,
+            &statistics,
+            // can rule out any container where we know s1 only has the value
+            // 'foo'; can't use knowledge of s2 and 'bar' to rule out anything
+            &[false, true, true, false, true, true, false, true, true],
+        );
+
+        // s1 like 'foo%bar%'
+        prune_with_expr(
+            col("s1").like(lit("foo%bar%")),
+            &schema,
+            &statistics,
+            // can't rule out anything with the information we have
+            &[true, true, true, true, true, true, true, true, true],
+        );
+
+        // s1 like 'foo%bar%' AND s2 = 'bar'
+        prune_with_expr(
+            col("s1")
+                .like(lit("foo%bar%"))
+                .and(col("s2").eq(lit("bar"))),
+            &schema,
+            &statistics,
+            // can rule out any container where we know s2 does not have the value 'bar'
+            &[true, true, true, false, false, false, true, true, true],
+        );
+
+        // s1 like 'foo%bar%' OR s2 = 'bar'
+        prune_with_expr(
+            col("s1").like(lit("foo%bar%")).or(col("s2").eq(lit("bar"))),
+            &schema,
+            &statistics,
+            // can't rule out anything (we would have to prove that both the
+            // like and the equality must be false)
+            &[true, true, true, true, true, true, true, true, true],
+        );
+    }
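+
+    // The two-column expectations above follow from how the pruner combines
+    // sub-results: for `a AND b` a container can be pruned when *either*
+    // conjunct is provably false for all rows, while for `a OR b` it can be
+    // pruned only when *both* disjuncts are provably false.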
+
+    #[test]
+    fn prune_with_range_and_contained() {
+        // Setup mimics range information for i, a bloom filter for s
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("i", DataType::Int32, true),
+            Field::new("s", DataType::Utf8, true),
+        ]));
+
+        let statistics = TestStatistics::new()
+            .with(
+                "i",
+                ContainerStats::new_i32(
+                    // Containers 0, 3, 6: [-5 to 5]
+                    // Containers 1, 4, 7: [10 to 20]
+                    // Containers 2, 5, 8: unknown
+                    vec![
+                        Some(-5),
+                        Some(10),
+                        None,
+                        Some(-5),
+                        Some(10),
+                        None,
+                        Some(-5),
+                        Some(10),
+                        None,
+                    ], // min
+                    vec![
+                        Some(5),
+                        Some(20),
+                        None,
+                        Some(5),
+                        Some(20),
+                        None,
+                        Some(5),
+                        Some(20),
+                        None,
+                    ], // max
+                ),
+            )
+            // Add contained information about s and "foo"
+            .with_contained(
+                "s",
+                [ScalarValue::from("foo")],
+                [
+                    // containers 0, 1, 2 known to only contain "foo"
+                    Some(true),
+                    Some(true),
+                    Some(true),
+                    // containers 3, 4, 5 known to not contain "foo"
+                    Some(false),
+                    Some(false),
+                    Some(false),
+                    // containers 6, 7, 8 unknown about "foo"
+                    None,
+                    None,
+                    None,
+                ],
+            );
+
+        // i = 0 and s = 'foo'
+        prune_with_expr(
+            col("i").eq(lit(0)).and(col("s").eq(lit("foo"))),
+            &schema,
+            &statistics,
+            // can rule out any container where we know that either:
+            // 1. 0 is outside the min/max range of i, or
+            // 2. s does not contain "foo"
+            &[true, false, true, false, false, false, true, false, true],
+        );
+
+        // i = 0 and s != 'foo'
+        prune_with_expr(
+            col("i").eq(lit(0)).and(col("s").not_eq(lit("foo"))),
+            &schema,
+            &statistics,
+            // can rule out any container where we know that either:
+            // 1. 0 is outside the min/max range of i, or
+            // 2. s only contains "foo"
+            &[false, false, false, true, false, true, true, false, true],
+        );
+
+        // i = 0 OR s = 'foo'
+        prune_with_expr(
+            col("i").eq(lit(0)).or(col("s").eq(lit("foo"))),
+            &schema,
+            &statistics,
+            // in theory we could rule out containers if we had min/max values
+            // for s as well, but in this case we don't, so we can't rule out
+            // anything
+            &[true, true, true, true, true, true, true, true, true],
+        );
+    }
+
+    /// Prunes the specified expr with the specified schema and statistics, and
+    /// ensures it returns the expected result.
+    ///
+    /// `expected` is a vector of bools, where true means the container should
+    /// be kept, and false means it should be pruned.
+    ///
+    // TODO refactor other tests to use this to reduce boilerplate
+    fn prune_with_expr(
+        expr: Expr,
+        schema: &SchemaRef,
+        statistics: &TestStatistics,
+        expected: &[bool],
+    ) {
+        println!("Pruning with expr: {expr}");
+        let expr = logical2physical(&expr, schema);
+        let p = PruningPredicate::try_new(expr, Arc::<Schema>::clone(schema)).unwrap();
+        let result = p.prune(statistics).unwrap();
+        assert_eq!(result, expected);
+    }
+
+    fn test_build_predicate_expression(
+        expr: &Expr,
+        schema: &Schema,
+        required_columns: &mut RequiredColumns,
+    ) -> Arc<dyn PhysicalExpr> {
+        let expr = logical2physical(expr, schema);
+        let unhandled_hook = Arc::new(ConstantUnhandledPredicateHook::default()) as _;
+        build_predicate_expression(&expr, schema, required_columns, &unhandled_hook)
+    }
+
+    #[test]
+    fn test_build_predicate_expression_with_false() {
+        let expr = lit(ScalarValue::Boolean(Some(false)));
+        let schema = Schema::empty();
+        let res =
+            test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
+        let expected = logical2physical(&expr, &schema);
+        assert_eq!(&res, &expected);
+    }
+
+    #[test]
+    fn test_build_predicate_expression_with_and_false() {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]);
+        let expr = and(
+            col("c1").eq(lit("a")),
+            lit(ScalarValue::Boolean(Some(false))),
+        );
+        let res =
+            test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
+        // `AND false` short-circuits: the whole predicate becomes `false`
+        let expected = logical2physical(&lit(ScalarValue::Boolean(Some(false))), &schema);
+        assert_eq!(&res, &expected);
+    }
+
+    #[test]
+    fn test_build_predicate_expression_with_or_false() {
+        let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]);
+        let left_expr = col("c1").eq(lit("a"));
+        let right_expr = lit(ScalarValue::Boolean(Some(false)));
+        let res = test_build_predicate_expression(
+            &or(left_expr.clone(), right_expr.clone()),
+            &schema,
+            &mut RequiredColumns::new(),
+        );
+        // `OR false` is simplified away; only the rewritten left operand remains
+        let expected =
+            "c1_null_count@2 != row_count@3 AND c1_min@0 <= a AND a <= c1_max@1";
+        assert_eq!(res.to_string(), expected);
+    }
+}