-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Map file-level column statistics to the table-level #15865
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,7 @@ use crate::execution::context::SessionState; | |
| use datafusion_catalog::TableProvider; | ||
| use datafusion_common::{config_err, DataFusionError, Result}; | ||
| use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; | ||
| use datafusion_datasource::schema_adapter::DefaultSchemaAdapterFactory; | ||
| use datafusion_expr::dml::InsertOp; | ||
| use datafusion_expr::{utils::conjunction, Expr, TableProviderFilterPushDown}; | ||
| use datafusion_expr::{SortExpr, TableType}; | ||
|
|
@@ -1129,7 +1130,17 @@ impl ListingTable { | |
| let (file_group, inexact_stats) = | ||
| get_files_with_limit(files, limit, self.options.collect_stat).await?; | ||
|
|
||
| let file_groups = file_group.split_files(self.options.target_partitions); | ||
| let mut file_groups = file_group.split_files(self.options.target_partitions); | ||
| let (schema_mapper, _) = DefaultSchemaAdapterFactory::from_schema(self.schema()) | ||
|
Comment on lines
+1133
to
+1134
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @alamb While I was working on #15852, I found that, in fact, the listing table doesn't have the issue described in #15689 — all files here have the same schema, because when the table is created, all fetched files already use the table schema. What we should fix is letting the file schema match the listing table schema: usually, if users specify a partition column, the table schema will have the extra partition column info, so I moved the mapper down. |
||
| .map_schema(self.file_schema.as_ref())?; | ||
| // Use schema_mapper to map each file-level column statistics to table-level column statistics | ||
| file_groups.iter_mut().try_for_each(|file_group| { | ||
| if let Some(stat) = file_group.statistics_mut() { | ||
| stat.column_statistics = | ||
| schema_mapper.map_column_statistics(&stat.column_statistics)?; | ||
| } | ||
| Ok::<_, DataFusionError>(()) | ||
| })?; | ||
| compute_all_files_statistics( | ||
| file_groups, | ||
| self.schema(), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,7 +24,7 @@ | |
| use arrow::array::{new_null_array, RecordBatch, RecordBatchOptions}; | ||
| use arrow::compute::{can_cast_types, cast}; | ||
| use arrow::datatypes::{Schema, SchemaRef}; | ||
| use datafusion_common::plan_err; | ||
| use datafusion_common::{plan_err, ColumnStatistics}; | ||
| use std::fmt::Debug; | ||
| use std::sync::Arc; | ||
|
|
||
|
|
@@ -96,6 +96,12 @@ pub trait SchemaAdapter: Send + Sync { | |
| pub trait SchemaMapper: Debug + Send + Sync { | ||
| /// Adapts a `RecordBatch` to match the `table_schema` | ||
| fn map_batch(&self, batch: RecordBatch) -> datafusion_common::Result<RecordBatch>; | ||
|
|
||
| /// Adapts file-level column `Statistics` to match the `table_schema` | ||
| fn map_column_statistics( | ||
| &self, | ||
| file_col_statistics: &[ColumnStatistics], | ||
| ) -> datafusion_common::Result<Vec<ColumnStatistics>>; | ||
| } | ||
|
|
||
| /// Default [`SchemaAdapterFactory`] for mapping schemas. | ||
|
|
@@ -334,4 +340,126 @@ impl SchemaMapper for SchemaMapping { | |
| let record_batch = RecordBatch::try_new_with_options(schema, cols, &options)?; | ||
| Ok(record_batch) | ||
| } | ||
|
|
||
| /// Adapts file-level column `Statistics` to match the `table_schema` | ||
| fn map_column_statistics( | ||
| &self, | ||
| file_col_statistics: &[ColumnStatistics], | ||
| ) -> datafusion_common::Result<Vec<ColumnStatistics>> { | ||
| let mut table_col_statistics = vec![]; | ||
|
|
||
| // Map the statistics for each field in the file schema to the corresponding field in the | ||
| // table schema, if a field is not present in the file schema, we need to fill it with `ColumnStatistics::new_unknown` | ||
| for (_, file_col_idx) in self | ||
| .projected_table_schema | ||
| .fields() | ||
| .iter() | ||
| .zip(&self.field_mappings) | ||
| { | ||
| if let Some(file_col_idx) = file_col_idx { | ||
| table_col_statistics.push( | ||
| file_col_statistics | ||
| .get(*file_col_idx) | ||
| .cloned() | ||
| .unwrap_or_default(), | ||
| ); | ||
| } else { | ||
| table_col_statistics.push(ColumnStatistics::new_unknown()); | ||
| } | ||
| } | ||
|
Comment on lines
+351
to
+369
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What do you mean by "all nulls"? From the code, I think it means
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What I mean is that I think we have enough information to make the stats |
||
|
|
||
| Ok(table_col_statistics) | ||
| } | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
| use arrow::datatypes::{DataType, Field}; | ||
| use datafusion_common::{stats::Precision, Statistics}; | ||
|
|
||
| use super::*; | ||
|
|
||
| #[test] | ||
| fn test_schema_mapping_map_statistics_basic() { | ||
| // Create table schema (a, b, c) | ||
| let table_schema = Arc::new(Schema::new(vec![ | ||
| Field::new("a", DataType::Int32, true), | ||
| Field::new("b", DataType::Utf8, true), | ||
| Field::new("c", DataType::Float64, true), | ||
| ])); | ||
|
|
||
| // Create file schema (b, a) - different order, missing c | ||
| let file_schema = Schema::new(vec![ | ||
| Field::new("b", DataType::Utf8, true), | ||
| Field::new("a", DataType::Int32, true), | ||
| ]); | ||
|
|
||
| // Create SchemaAdapter | ||
| let adapter = DefaultSchemaAdapter { | ||
| projected_table_schema: Arc::clone(&table_schema), | ||
| }; | ||
|
|
||
| // Get mapper and projection | ||
| let (mapper, projection) = adapter.map_schema(&file_schema).unwrap(); | ||
|
|
||
| // Should project columns 0,1 from file | ||
| assert_eq!(projection, vec![0, 1]); | ||
|
|
||
| // Create file statistics | ||
| let mut file_stats = Statistics::default(); | ||
|
|
||
| // Statistics for column b (index 0 in file) | ||
| let b_stats = ColumnStatistics { | ||
| null_count: Precision::Exact(5), | ||
| ..Default::default() | ||
| }; | ||
|
|
||
| // Statistics for column a (index 1 in file) | ||
| let a_stats = ColumnStatistics { | ||
| null_count: Precision::Exact(10), | ||
| ..Default::default() | ||
| }; | ||
|
|
||
| file_stats.column_statistics = vec![b_stats, a_stats]; | ||
|
|
||
| // Map statistics | ||
| let table_col_stats = mapper | ||
| .map_column_statistics(&file_stats.column_statistics) | ||
| .unwrap(); | ||
|
|
||
| // Verify stats | ||
| assert_eq!(table_col_stats.len(), 3); | ||
| assert_eq!(table_col_stats[0].null_count, Precision::Exact(10)); // a from file idx 1 | ||
| assert_eq!(table_col_stats[1].null_count, Precision::Exact(5)); // b from file idx 0 | ||
| assert_eq!(table_col_stats[2].null_count, Precision::Absent); // c (unknown) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_schema_mapping_map_statistics_empty() { | ||
| // Create schemas | ||
| let table_schema = Arc::new(Schema::new(vec![ | ||
| Field::new("a", DataType::Int32, true), | ||
| Field::new("b", DataType::Utf8, true), | ||
| ])); | ||
| let file_schema = Schema::new(vec![ | ||
| Field::new("a", DataType::Int32, true), | ||
| Field::new("b", DataType::Utf8, true), | ||
| ]); | ||
|
|
||
| let adapter = DefaultSchemaAdapter { | ||
| projected_table_schema: Arc::clone(&table_schema), | ||
| }; | ||
| let (mapper, _) = adapter.map_schema(&file_schema).unwrap(); | ||
|
|
||
| // Empty file statistics | ||
| let file_stats = Statistics::default(); | ||
| let table_col_stats = mapper | ||
| .map_column_statistics(&file_stats.column_statistics) | ||
| .unwrap(); | ||
|
|
||
| // All stats should be unknown | ||
| assert_eq!(table_col_stats.len(), 2); | ||
| assert_eq!(table_col_stats[0], ColumnStatistics::new_unknown(),); | ||
| assert_eq!(table_col_stats[1], ColumnStatistics::new_unknown(),); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think using the default schema mapper makes sense for now / in this PR, but in general I think it would make sense to allow the user to provide their own schema mapping rules here (so a default value that is not
NULL can be used, for example) via their own mapper. However, we would have to add a schema mapper factory to
ListingOptions: https://github.com/apache/datafusion/blob/f1bbb1d636650c7f28f52dc507f36e64d71e1aa8/datafusion/core/src/datasource/listing/table.rs#L256-L255
(this is not a change needed for this PR, I just noticed it while reviewing this PR)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Make sense, I filed an issue to track: #15889