diff --git a/Cargo.lock b/Cargo.lock index f371a1822b50..4e2827580ea5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2463,7 +2463,6 @@ dependencies = [ "datafusion-functions", "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", - "half", "hashbrown 0.14.5", "indexmap 2.12.1", "insta", diff --git a/datafusion-examples/examples/custom_data_source/csv_json_opener.rs b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs index 6fad8f4d5824..48b885839f13 100644 --- a/datafusion-examples/examples/custom_data_source/csv_json_opener.rs +++ b/datafusion-examples/examples/custom_data_source/csv_json_opener.rs @@ -64,22 +64,22 @@ async fn csv_opener() -> Result<()> { ..Default::default() }; - let scan_config = FileScanConfigBuilder::new( - ObjectStoreUrl::local_filesystem(), - Arc::new(CsvSource::new(Arc::clone(&schema)).with_csv_options(options.clone())), - ) - .with_projection_indices(Some(vec![12, 0])) - .with_limit(Some(5)) - .with_file(PartitionedFile::new(path.display().to_string(), 10)) - .build(); - - let config = CsvSource::new(Arc::clone(&schema)) + let source = CsvSource::new(Arc::clone(&schema)) .with_csv_options(options) .with_comment(Some(b'#')) - .with_batch_size(8192) - .with_projection(&scan_config); + .with_batch_size(8192); + + let scan_config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) + .with_projection_indices(Some(vec![12, 0]))? + .with_limit(Some(5)) + .with_file(PartitionedFile::new(path.display().to_string(), 10)) + .build(); - let opener = config.create_file_opener(object_store, &scan_config, 0); + let opener = + scan_config + .file_source() + .create_file_opener(object_store, &scan_config, 0)?; let mut result = vec![]; let mut stream = @@ -133,7 +133,7 @@ async fn json_opener() -> Result<()> { ObjectStoreUrl::local_filesystem(), Arc::new(JsonSource::new(schema)), ) - .with_projection_indices(Some(vec![1, 0])) + .with_projection_indices(Some(vec![1, 0]))? .with_limit(Some(5)) .with_file(PartitionedFile::new(path.to_string(), 10)) .build(); diff --git a/datafusion-examples/examples/custom_data_source/default_column_values.rs b/datafusion-examples/examples/custom_data_source/default_column_values.rs index 19d00e0a0d6f..9fe816502565 100644 --- a/datafusion-examples/examples/custom_data_source/default_column_values.rs +++ b/datafusion-examples/examples/custom_data_source/default_column_values.rs @@ -258,7 +258,7 @@ impl TableProvider for DefaultValueTableProvider { ObjectStoreUrl::parse("memory://")?, Arc::new(parquet_source), ) - .with_projection_indices(projection.cloned()) + .with_projection_indices(projection.cloned())? .with_limit(limit) .with_file_group(file_group) .with_expr_adapter(Some(Arc::new(DefaultValuePhysicalExprAdapterFactory) as _)); diff --git a/datafusion-examples/examples/data_io/parquet_advanced_index.rs b/datafusion-examples/examples/data_io/parquet_advanced_index.rs index 304e490bd63b..caa3be2111e3 100644 --- a/datafusion-examples/examples/data_io/parquet_advanced_index.rs +++ b/datafusion-examples/examples/data_io/parquet_advanced_index.rs @@ -502,7 +502,7 @@ impl TableProvider for IndexTableProvider { ); let file_scan_config = FileScanConfigBuilder::new(object_store_url, file_source) .with_limit(limit) - .with_projection_indices(projection.cloned()) + .with_projection_indices(projection.cloned())? 
.with_file(partitioned_file) .build(); diff --git a/datafusion-examples/examples/data_io/parquet_index.rs b/datafusion-examples/examples/data_io/parquet_index.rs index 7c708046f8a8..5f6fe3e2975a 100644 --- a/datafusion-examples/examples/data_io/parquet_index.rs +++ b/datafusion-examples/examples/data_io/parquet_index.rs @@ -247,7 +247,7 @@ impl TableProvider for IndexTableProvider { Arc::new(ParquetSource::new(self.schema()).with_predicate(predicate)); let mut file_scan_config_builder = FileScanConfigBuilder::new(object_store_url, source) - .with_projection_indices(projection.cloned()) + .with_projection_indices(projection.cloned())? .with_limit(limit); // Transform to the format needed to pass to DataSourceExec diff --git a/datafusion/catalog-listing/src/table.rs b/datafusion/catalog-listing/src/table.rs index 33d5c86bf88d..cbe538ae35cc 100644 --- a/datafusion/catalog-listing/src/table.rs +++ b/datafusion/catalog-listing/src/table.rs @@ -504,7 +504,7 @@ impl TableProvider for ListingTable { .with_file_groups(partitioned_file_lists) .with_constraints(self.constraints.clone()) .with_statistics(statistics) - .with_projection_indices(projection) + .with_projection_indices(projection)? .with_limit(limit) .with_output_ordering(output_ordering) .with_expr_adapter(self.expr_adapter_factory.clone()) diff --git a/datafusion/core/src/dataframe/parquet.rs b/datafusion/core/src/dataframe/parquet.rs index cb8a6cf29541..6edf628e2d6d 100644 --- a/datafusion/core/src/dataframe/parquet.rs +++ b/datafusion/core/src/dataframe/parquet.rs @@ -150,7 +150,7 @@ mod tests { let plan = df.explain(false, false)?.collect().await?; // Filters all the way to Parquet let formatted = pretty::pretty_format_batches(&plan)?.to_string(); - assert!(formatted.contains("FilterExec: id@0 = 1")); + assert!(formatted.contains("FilterExec: id@0 = 1"), "{formatted}"); Ok(()) } diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index 7c55d452c4e1..2756ea21cd00 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -92,7 +92,7 @@ pub(crate) mod test_util { ) .with_file_groups(file_groups) .with_statistics(statistics) - .with_projection_indices(projection) + .with_projection_indices(projection)? .with_limit(limit) .build(), ) diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 1cf8c573acd9..b97ab0e9cacf 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -84,7 +84,7 @@ mod tests { let source = Arc::new(AvroSource::new(Arc::clone(&file_schema))); let conf = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) .with_file(meta.into()) - .with_projection_indices(Some(vec![0, 1, 2])) + .with_projection_indices(Some(vec![0, 1, 2]))? .build(); let source_exec = DataSourceExec::from_data_source(conf); @@ -156,7 +156,7 @@ mod tests { let source = Arc::new(AvroSource::new(Arc::clone(&file_schema))); let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file(meta.into()) - .with_projection_indices(projection) + .with_projection_indices(projection)? 
.build(); let source_exec = DataSourceExec::from_data_source(conf); @@ -231,7 +231,7 @@ mod tests { let conf = FileScanConfigBuilder::new(object_store_url, source) // select specific columns of the files as well as the partitioning // column which is supposed to be the last column in the table schema. - .with_projection_indices(projection) + .with_projection_indices(projection)? .with_file(partitioned_file) .build(); diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 1af6b330fd11..660be4faffbc 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -126,10 +126,10 @@ mod tests { let source = Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); let config = - FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) .with_file_compression_type(file_compression_type) .with_newlines_in_values(false) - .with_projection_indices(Some(vec![0, 2, 4])) + .with_projection_indices(Some(vec![0, 2, 4]))? .build(); assert_eq!(13, config.file_schema().fields().len()); @@ -199,10 +199,10 @@ mod tests { let source = Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); let config = - FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) .with_newlines_in_values(false) .with_file_compression_type(file_compression_type.to_owned()) - .with_projection_indices(Some(vec![4, 0, 2])) + .with_projection_indices(Some(vec![4, 0, 2]))? .build(); assert_eq!(13, config.file_schema().fields().len()); let csv = DataSourceExec::from_data_source(config); @@ -271,7 +271,7 @@ mod tests { let source = Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); let config = - FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) .with_newlines_in_values(false) .with_file_compression_type(file_compression_type.to_owned()) .with_limit(Some(5)) @@ -342,7 +342,7 @@ mod tests { let source = Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); let config = - FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) .with_newlines_in_values(false) .with_file_compression_type(file_compression_type.to_owned()) .with_limit(Some(5)) @@ -411,12 +411,12 @@ mod tests { let source = Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); let config = - FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) .with_newlines_in_values(false) .with_file_compression_type(file_compression_type.to_owned()) // We should be able to project on the partition column // Which is supposed to be after the file fields - .with_projection_indices(Some(vec![0, num_file_schema_fields])) + .with_projection_indices(Some(vec![0, num_file_schema_fields]))? 
.build(); // we don't have `/date=xx/` in the path but that is ok because @@ -517,7 +517,7 @@ mod tests { let source = Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); let config = - FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) .with_newlines_in_values(false) .with_file_compression_type(file_compression_type.to_owned()) .build(); diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index f36708901a1d..3efea0330258 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -302,6 +302,7 @@ mod tests { let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) .with_projection_indices(Some(vec![0, 2])) + .unwrap() .with_file_compression_type(file_compression_type.to_owned()) .build(); let exec = DataSourceExec::from_data_source(conf); @@ -349,7 +350,7 @@ mod tests { let source = Arc::new(JsonSource::new(Arc::clone(&file_schema))); let conf = FileScanConfigBuilder::new(object_store_url, source) .with_file_groups(file_groups) - .with_projection_indices(Some(vec![3, 0, 2])) + .with_projection_indices(Some(vec![3, 0, 2]))? .with_file_compression_type(file_compression_type.to_owned()) .build(); let exec = DataSourceExec::from_data_source(conf); diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index b27dcf56e33c..90953e3f5df9 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -198,6 +198,7 @@ mod tests { FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source) .with_file_group(file_group) .with_projection_indices(self.projection.clone()) + .unwrap() .build(); DataSourceExec::from_data_source(base_config) } @@ -1664,6 +1665,7 @@ mod tests { .with_file(partitioned_file) // file has 10 cols so index 12 should be month and 13 should be day .with_projection_indices(Some(vec![0, 1, 2, 12, 13])) + .unwrap() .build(); let parquet_exec = DataSourceExec::from_data_source(config); diff --git a/datafusion/core/src/datasource/view_test.rs b/datafusion/core/src/datasource/view_test.rs index 85ad9ff664ad..7227294ba1f1 100644 --- a/datafusion/core/src/datasource/view_test.rs +++ b/datafusion/core/src/datasource/view_test.rs @@ -358,7 +358,10 @@ mod tests { .to_string(); assert!(formatted.contains("DataSourceExec: ")); assert!(formatted.contains("file_type=parquet")); - assert!(formatted.contains("projection=[bool_col, int_col], limit=10")); + assert!( + formatted.contains("projection=[bool_col, int_col], limit=10"), + "{formatted}" + ); Ok(()) } diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index 2bf9c89576e9..b0ff3eb3ae41 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -105,9 +105,10 @@ pub fn scan_partitioned_csv( }; let table_schema = TableSchema::from_file_schema(schema); let source = Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options)); - let config = FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)) - .with_file_compression_type(FileCompressionType::UNCOMPRESSED) - .build(); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?) 
+ .with_file_compression_type(FileCompressionType::UNCOMPRESSED) + .build(); Ok(DataSourceExec::from_data_source(config)) } diff --git a/datafusion/core/tests/parquet/schema_coercion.rs b/datafusion/core/tests/parquet/schema_coercion.rs index 51e5242cbafd..e8666f07595e 100644 --- a/datafusion/core/tests/parquet/schema_coercion.rs +++ b/datafusion/core/tests/parquet/schema_coercion.rs @@ -122,6 +122,7 @@ async fn multi_parquet_coercion_projection() { ) .with_file_group(file_group) .with_projection_indices(Some(vec![1, 0, 2])) + .unwrap() .build(); let parquet_exec = DataSourceExec::from_data_source(config); diff --git a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs index 30fd86440566..c32f7b2d0ba9 100644 --- a/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs +++ b/datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs @@ -141,14 +141,14 @@ impl FileSource for TestSource { _object_store: Arc, _base_config: &FileScanConfig, _partition: usize, - ) -> Arc { - Arc::new(TestOpener { + ) -> Result> { + Ok(Arc::new(TestOpener { batches: self.batches.clone(), batch_size: self.batch_size, schema: Arc::clone(&self.schema), projection: self.projection.clone(), predicate: self.predicate.clone(), - }) + })) } fn filter(&self) -> Option> { @@ -166,13 +166,6 @@ impl FileSource for TestSource { }) } - fn with_projection(&self, config: &FileScanConfig) -> Arc { - Arc::new(TestSource { - projection: config.projection_exprs.as_ref().map(|p| p.column_indices()), - ..self.clone() - }) - } - fn metrics(&self) -> &ExecutionPlanMetricsSet { &self.metrics } diff --git a/datafusion/core/tests/physical_optimizer/partition_statistics.rs b/datafusion/core/tests/physical_optimizer/partition_statistics.rs index 7045cb8ea133..12c31b39452e 100644 --- a/datafusion/core/tests/physical_optimizer/partition_statistics.rs +++ b/datafusion/core/tests/physical_optimizer/partition_statistics.rs @@ -620,7 +620,7 @@ mod test { let plan_string = get_plan_string(&aggregate_exec_partial).swap_remove(0); assert_snapshot!( plan_string, - @"AggregateExec: mode=Partial, gby=[id@0 as id, 1 + id@0 as expr], aggr=[COUNT(c)]" + @"AggregateExec: mode=Partial, gby=[id@0 as id, 1 + id@0 as expr], aggr=[COUNT(c)], ordering_mode=Sorted" ); let p0_statistics = aggregate_exec_partial.partition_statistics(Some(0))?; diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 9d39a80fb9df..80f4fbc3051c 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -397,6 +397,7 @@ fn create_simple_csv_exec() -> Arc { }) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_projection_indices(Some(vec![0, 1, 2, 3, 4])) + .unwrap() .build(); DataSourceExec::from_data_source(config) @@ -421,6 +422,7 @@ fn create_projecting_csv_exec() -> Arc { }) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_projection_indices(Some(vec![3, 2, 1])) + .unwrap() .build(); DataSourceExec::from_data_source(config) @@ -703,10 +705,7 @@ fn test_projection_after_projection() -> Result<()> { assert_snapshot!( actual, - @r" - ProjectionExec: expr=[b@1 as new_b, c@2 + e@4 as binary, b@1 as newest_b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false - " + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[b@1 as 
new_b, c@2 + e@4 as binary, b@1 as newest_b], file_type=csv, has_header=false" ); Ok(()) @@ -773,8 +772,7 @@ fn test_output_req_after_projection() -> Result<()> { actual, @r" OutputRequirementExec: order_by=[(b@2, asc), (c@0 + new_a@1, asc)], dist_by=HashPartitioned[[new_a@1, b@2]]) - ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false " ); @@ -864,8 +862,7 @@ fn test_coalesce_partitions_after_projection() -> Result<()> { actual, @r" CoalescePartitionsExec - ProjectionExec: expr=[b@1 as b, a@0 as a_new, d@3 as d] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[b, a@0 as a_new, d], file_type=csv, has_header=false " ); @@ -922,8 +919,7 @@ fn test_filter_after_projection() -> Result<()> { actual, @r" FilterExec: b@1 - a_new@0 > d@2 - a_new@0 - ProjectionExec: expr=[a@0 as a_new, b@1 as b, d@3 as d] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a@0 as a_new, b, d], file_type=csv, has_header=false " ); @@ -1025,10 +1021,8 @@ fn test_join_after_projection() -> Result<()> { actual, @r" SymmetricHashJoinExec: mode=SinglePartition, join_type=Inner, on=[(b_from_left@1, c_from_right@1)], filter=b_left_inter@0 - 1 + a_right_inter@1 <= a_right_inter@1 + c_left_inter@2 - ProjectionExec: expr=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false - ProjectionExec: expr=[a@0 as a_from_right, c@2 as c_from_right] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[c@2 as c_from_left, b@1 as b_from_left, a@0 as a_from_left], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[a@0 as a_from_right, c@2 as c_from_right], file_type=csv, has_header=false " ); @@ -1410,8 +1404,7 @@ fn test_repartition_after_projection() -> Result<()> { actual, @r" RepartitionExec: partitioning=Hash([a@1, b_new@0, d_new@2], 6), input_partitions=1 - ProjectionExec: expr=[b@1 as b_new, a@0 as a, d@3 as d_new] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[b@1 as b_new, a, d@3 as d_new], file_type=csv, has_header=false " ); @@ -1481,8 +1474,7 @@ fn test_sort_after_projection() -> Result<()> { actual, @r" SortExec: expr=[b@2 ASC, c@0 + new_a@1 ASC], preserve_partitioning=[false] - ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false " ); @@ -1535,8 +1527,7 @@ fn test_sort_preserving_after_projection() -> Result<()> { actual, @r" SortPreservingMergeExec: [b@2 ASC, c@0 + new_a@1 ASC] - ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: 
file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false " ); @@ -1580,12 +1571,9 @@ fn test_union_after_projection() -> Result<()> { actual, @r" UnionExec - ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false - ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false - ProjectionExec: expr=[c@2 as c, a@0 as new_a, b@1 as b] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[a, b, c, d, e], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false + DataSourceExec: file_groups={1 group: [[x]]}, projection=[c, a@0 as new_a, b], file_type=csv, has_header=false " ); @@ -1616,6 +1604,7 @@ fn partitioned_data_source() -> Arc { ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_projection_indices(Some(vec![0, 1, 2])) + .unwrap() .build(); DataSourceExec::from_data_source(config) @@ -1653,10 +1642,7 @@ fn test_partition_col_projection_pushdown() -> Result<()> { let actual = after_optimize_string.trim(); assert_snapshot!( actual, - @r" - ProjectionExec: expr=[string_col@1 as string_col, partition_col@2 as partition_col, int_col@0 as int_col] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[int_col, string_col, partition_col], file_type=csv, has_header=false - " + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[string_col, partition_col, int_col], file_type=csv, has_header=false" ); Ok(()) @@ -1699,10 +1685,7 @@ fn test_partition_col_projection_pushdown_expr() -> Result<()> { let actual = after_optimize_string.trim(); assert_snapshot!( actual, - @r" - ProjectionExec: expr=[string_col@1 as string_col, CAST(partition_col@2 AS Utf8View) as partition_col, int_col@0 as int_col] - DataSourceExec: file_groups={1 group: [[x]]}, projection=[int_col, string_col, partition_col], file_type=csv, has_header=false - " + @"DataSourceExec: file_groups={1 group: [[x]]}, projection=[string_col, CAST(partition_col@2 AS Utf8View) as partition_col, int_col], file_type=csv, has_header=false" ); Ok(()) diff --git a/datafusion/core/tests/sql/explain_analyze.rs b/datafusion/core/tests/sql/explain_analyze.rs index 4a67046e933d..1656bdf66f2c 100644 --- a/datafusion/core/tests/sql/explain_analyze.rs +++ b/datafusion/core/tests/sql/explain_analyze.rs @@ -819,8 +819,7 @@ async fn test_physical_plan_display_indent_multi_children() { DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true CoalesceBatchesExec: target_batch_size=4096 RepartitionExec: partitioning=Hash([c2@0], 9000), input_partitions=1 - ProjectionExec: expr=[c1@0 as c2] - DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1], file_type=csv, has_header=true + DataSourceExec: file_groups={1 group: [[ARROW_TEST_DATA/csv/aggregate_test_100.csv]]}, projection=[c1@0 as c2], file_type=csv, has_header=true " ); } diff --git a/datafusion/datasource-arrow/src/file_format.rs b/datafusion/datasource-arrow/src/file_format.rs index ef478e268890..7754748fbf86 100644 --- a/datafusion/datasource-arrow/src/file_format.rs +++ 
b/datafusion/datasource-arrow/src/file_format.rs @@ -208,7 +208,7 @@ impl FileFormat for ArrowFormat { conf.table_partition_cols().clone(), ); - let source: Arc = + let mut source: Arc = match is_object_in_arrow_ipc_file_format(object_store, object_location).await { Ok(true) => Arc::new(ArrowSource::new_file_source(table_schema)), @@ -216,6 +216,13 @@ impl FileFormat for ArrowFormat { Err(e) => Err(e)?, }; + // Preserve projection from the original file source + if let Some(projection) = conf.file_source.projection() { + if let Some(new_source) = source.try_pushdown_projection(projection)? { + source = new_source; + } + } + let config = FileScanConfigBuilder::from(conf) .with_source(source) .build(); diff --git a/datafusion/datasource-arrow/src/source.rs b/datafusion/datasource-arrow/src/source.rs index 3132d8a10d5c..070e4fae6df6 100644 --- a/datafusion/datasource-arrow/src/source.rs +++ b/datafusion/datasource-arrow/src/source.rs @@ -34,22 +34,27 @@ use std::sync::Arc; use std::{any::Any, io::Cursor}; -use datafusion_datasource::schema_adapter::SchemaAdapterFactory; +use datafusion_datasource::schema_adapter::{ + DefaultSchemaAdapterFactory, SchemaAdapterFactory, +}; use datafusion_datasource::{as_file_source, TableSchema}; use arrow::buffer::Buffer; +use arrow::datatypes::SchemaRef; use arrow::ipc::reader::{FileDecoder, FileReader, StreamReader}; use datafusion_common::error::Result; use datafusion_common::exec_datafusion_err; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::projection::{ProjectionOpener, SplitProjection}; use datafusion_datasource::PartitionedFile; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; +use datafusion_physical_plan::projection::ProjectionExprs; use datafusion_datasource::file_stream::FileOpenFuture; use datafusion_datasource::file_stream::FileOpener; -use futures::StreamExt; +use futures::{StreamExt, TryStreamExt}; use itertools::Itertools; use object_store::{GetOptions, GetRange, GetResultPayload, ObjectStore}; @@ -66,6 +71,8 @@ enum ArrowFormat { pub(crate) struct ArrowStreamFileOpener { object_store: Arc, projection: Option>, + projected_schema: Option, + schema_adapter_factory: Option>, } impl FileOpener for ArrowStreamFileOpener { @@ -77,27 +84,52 @@ impl FileOpener for ArrowStreamFileOpener { } let object_store = Arc::clone(&self.object_store); let projection = self.projection.clone(); + let projected_schema = self.projected_schema.clone(); + let schema_adapter_factory = self.schema_adapter_factory.clone(); + Ok(Box::pin(async move { let r = object_store .get(&partitioned_file.object_meta.location) .await?; - match r.payload { + + let stream = match r.payload { #[cfg(not(target_arch = "wasm32"))] - GetResultPayload::File(file, _) => Ok(futures::stream::iter( + GetResultPayload::File(file, _) => futures::stream::iter( StreamReader::try_new(file.try_clone()?, projection.clone())?, ) .map(|r| r.map_err(Into::into)) - .boxed()), + .boxed(), GetResultPayload::Stream(_) => { let bytes = r.bytes().await?; let cursor = Cursor::new(bytes); - Ok(futures::stream::iter(StreamReader::try_new( + futures::stream::iter(StreamReader::try_new( cursor, projection.clone(), )?) 
.map(|r| r.map_err(Into::into)) - .boxed()) + .boxed() } + }; + + // If we have a schema adapter factory and projected schema, use them to normalize the schema + if let (Some(factory), Some(proj_schema)) = + (schema_adapter_factory, projected_schema) + { + Ok(stream + .and_then(move |batch| { + let factory = Arc::clone(&factory); + let proj_schema = Arc::clone(&proj_schema); + async move { + let schema_adapter = + factory.create_with_projected_schema(proj_schema); + let (schema_mapper, _) = + schema_adapter.map_schema(batch.schema().as_ref())?; + schema_mapper.map_batch(batch) + } + }) + .boxed()) + } else { + Ok(stream) } })) } @@ -107,12 +139,17 @@ impl FileOpener for ArrowStreamFileOpener { pub(crate) struct ArrowFileOpener { object_store: Arc, projection: Option>, + projected_schema: Option, + schema_adapter_factory: Option>, } impl FileOpener for ArrowFileOpener { fn open(&self, partitioned_file: PartitionedFile) -> Result { let object_store = Arc::clone(&self.object_store); let projection = self.projection.clone(); + let projected_schema = self.projected_schema.clone(); + let schema_adapter_factory = self.schema_adapter_factory.clone(); + Ok(Box::pin(async move { let range = partitioned_file.range.clone(); match range { @@ -120,23 +157,44 @@ impl FileOpener for ArrowFileOpener { let r = object_store .get(&partitioned_file.object_meta.location) .await?; - match r.payload { + let stream = match r.payload { #[cfg(not(target_arch = "wasm32"))] - GetResultPayload::File(file, _) => Ok(futures::stream::iter( + GetResultPayload::File(file, _) => futures::stream::iter( FileReader::try_new(file.try_clone()?, projection.clone())?, ) .map(|r| r.map_err(Into::into)) - .boxed()), + .boxed(), GetResultPayload::Stream(_) => { let bytes = r.bytes().await?; let cursor = Cursor::new(bytes); - Ok(futures::stream::iter(FileReader::try_new( + futures::stream::iter(FileReader::try_new( cursor, projection.clone(), )?) 
.map(|r| r.map_err(Into::into)) - .boxed()) + .boxed() } + }; + + // Apply schema adaptation if available + if let (Some(factory), Some(proj_schema)) = + (schema_adapter_factory, projected_schema) + { + Ok(stream + .and_then(move |batch| { + let factory = Arc::clone(&factory); + let proj_schema = Arc::clone(&proj_schema); + async move { + let schema_adapter = + factory.create_with_projected_schema(proj_schema); + let (schema_mapper, _) = schema_adapter + .map_schema(batch.schema().as_ref())?; + schema_mapper.map_batch(batch) + } + }) + .boxed()) + } else { + Ok(stream) } } Some(range) => { @@ -226,7 +284,7 @@ impl FileOpener for ArrowFileOpener { ) .await?; - Ok(futures::stream::iter( + let stream = futures::stream::iter( recordbatches .into_iter() .zip(recordbatch_results) @@ -237,7 +295,29 @@ impl FileOpener for ArrowFileOpener { }), ) .map(|r| r.map_err(Into::into)) - .boxed()) + .boxed(); + + // Apply schema adaptation if available + if let (Some(factory), Some(proj_schema)) = + (schema_adapter_factory, projected_schema) + { + Ok(stream + .and_then(move |batch| { + let factory = Arc::clone(&factory); + let proj_schema = Arc::clone(&proj_schema); + async move { + let schema_adapter = + factory.create_with_projected_schema(proj_schema); + let (schema_mapper, projection) = schema_adapter + .map_schema(batch.schema().as_ref())?; + let batch = batch.project(&projection)?; + schema_mapper.map_batch(batch) + } + }) + .boxed()) + } else { + Ok(stream) + } } } })) @@ -248,29 +328,34 @@ impl FileOpener for ArrowFileOpener { #[derive(Clone)] pub struct ArrowSource { format: ArrowFormat, - table_schema: TableSchema, metrics: ExecutionPlanMetricsSet, schema_adapter_factory: Option>, + projection: SplitProjection, + table_schema: TableSchema, } impl ArrowSource { /// Creates an [`ArrowSource`] for file format pub fn new_file_source(table_schema: impl Into) -> Self { + let table_schema = table_schema.into(); Self { format: ArrowFormat::File, - table_schema: table_schema.into(), metrics: ExecutionPlanMetricsSet::new(), schema_adapter_factory: None, + projection: SplitProjection::unprojected(&table_schema), + table_schema, } } /// Creates an [`ArrowSource`] for stream format pub fn new_stream_file_source(table_schema: impl Into) -> Self { + let table_schema = table_schema.into(); Self { format: ArrowFormat::Stream, - table_schema: table_schema.into(), metrics: ExecutionPlanMetricsSet::new(), schema_adapter_factory: None, + projection: SplitProjection::unprojected(&table_schema), + table_schema, } } } @@ -279,19 +364,43 @@ impl FileSource for ArrowSource { fn create_file_opener( &self, object_store: Arc, - base_config: &FileScanConfig, + _base_config: &FileScanConfig, _partition: usize, - ) -> Arc { - match self.format { + ) -> Result> { + let split_projection = self.projection.clone(); + // For schema adaptation, we only use the file schema (not partition columns) + let projected_file_schema = SchemaRef::from( + self.table_schema + .file_schema() + .project(&split_projection.file_indices)?, + ); + + // Use provided schema adapter factory, or default to DefaultSchemaAdapterFactory + // This ensures schema normalization (removing metadata differences) happens during execution + let schema_adapter_factory = self + .schema_adapter_factory + .clone() + .unwrap_or_else(|| Arc::new(DefaultSchemaAdapterFactory)); + + let opener: Arc = match self.format { ArrowFormat::File => Arc::new(ArrowFileOpener { object_store, - projection: base_config.file_column_projection_indices(), + projection: 
Some(split_projection.file_indices.clone()), + projected_schema: Some(Arc::clone(&projected_file_schema)), + schema_adapter_factory: Some(schema_adapter_factory), }), ArrowFormat::Stream => Arc::new(ArrowStreamFileOpener { object_store, - projection: base_config.file_column_projection_indices(), + projection: Some(split_projection.file_indices.clone()), + projected_schema: Some(projected_file_schema), + schema_adapter_factory: Some(schema_adapter_factory), }), - } + }; + ProjectionOpener::try_new( + split_projection, + opener, + self.table_schema.file_schema(), + ) } fn as_any(&self) -> &dyn Any { @@ -302,10 +411,6 @@ impl FileSource for ArrowSource { Arc::new(Self { ..self.clone() }) } - fn with_projection(&self, _config: &FileScanConfig) -> Arc { - Arc::new(Self { ..self.clone() }) - } - fn metrics(&self) -> &ExecutionPlanMetricsSet { &self.metrics } @@ -381,6 +486,22 @@ impl FileSource for ArrowSource { fn table_schema(&self) -> &TableSchema { &self.table_schema } + + fn try_pushdown_projection( + &self, + projection: &ProjectionExprs, + ) -> Result>> { + let mut source = self.clone(); + source.projection = SplitProjection::new( + self.table_schema().file_schema(), + &source.projection.source.try_merge(projection)?, + ); + Ok(Some(Arc::new(source))) + } + + fn projection(&self) -> Option<&ProjectionExprs> { + Some(&self.projection.source) + } } /// `FileOpener` wrapper for both Arrow IPC file and stream formats @@ -408,6 +529,8 @@ impl ArrowOpener { inner: Arc::new(ArrowFileOpener { object_store, projection, + projected_schema: None, + schema_adapter_factory: None, }), } } @@ -420,6 +543,8 @@ impl ArrowOpener { inner: Arc::new(ArrowStreamFileOpener { object_store, projection, + projected_schema: None, + schema_adapter_factory: None, }), } } @@ -479,7 +604,7 @@ mod tests { ) .build(); - let file_opener = source.create_file_opener(object_store, &scan_config, 0); + let file_opener = source.create_file_opener(object_store, &scan_config, 0)?; let mut stream = file_opener.open(partitioned_file)?.await?; assert!(stream.next().await.is_some()); @@ -521,7 +646,7 @@ mod tests { ) .build(); - let file_opener = source.create_file_opener(object_store, &scan_config, 0); + let file_opener = source.create_file_opener(object_store, &scan_config, 0)?; let mut stream = file_opener.open(partitioned_file)?.await?; assert!(stream.next().await.is_some()); @@ -562,7 +687,7 @@ mod tests { ) .build(); - let file_opener = source.create_file_opener(object_store, &scan_config, 0); + let file_opener = source.create_file_opener(object_store, &scan_config, 0)?; let result = file_opener.open(partitioned_file); assert!(result.is_err()); @@ -615,6 +740,8 @@ mod tests { let opener = ArrowStreamFileOpener { object_store, projection: Some(vec![0]), // just the first column + projected_schema: None, + schema_adapter_factory: None, }; let mut stream = opener.open(partitioned_file)?.await?; diff --git a/datafusion/datasource-avro/src/file_format.rs b/datafusion/datasource-avro/src/file_format.rs index 50aecf97b299..6df26a79f0e6 100644 --- a/datafusion/datasource-avro/src/file_format.rs +++ b/datafusion/datasource-avro/src/file_format.rs @@ -35,7 +35,7 @@ use datafusion_common::{Result, Statistics}; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_compression_type::FileCompressionType; use datafusion_datasource::file_format::{FileFormat, FileFormatFactory}; -use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; +use 
datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::source::DataSourceExec; use datafusion_physical_plan::ExecutionPlan; use datafusion_session::Session; @@ -154,11 +154,7 @@ impl FileFormat for AvroFormat { _state: &dyn Session, conf: FileScanConfig, ) -> Result> { - let file_schema = Arc::clone(conf.file_schema()); - let config = FileScanConfigBuilder::from(conf) - .with_source(Arc::new(AvroSource::new(file_schema))) - .build(); - Ok(DataSourceExec::from_data_source(config)) + Ok(DataSourceExec::from_data_source(conf)) } fn file_source( diff --git a/datafusion/datasource-avro/src/source.rs b/datafusion/datasource-avro/src/source.rs index e83113f40ea0..1ba3ad435041 100644 --- a/datafusion/datasource-avro/src/source.rs +++ b/datafusion/datasource-avro/src/source.rs @@ -26,10 +26,12 @@ use datafusion_common::error::Result; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::FileScanConfig; use datafusion_datasource::file_stream::FileOpener; +use datafusion_datasource::projection::{ProjectionOpener, SplitProjection}; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; use datafusion_datasource::TableSchema; use datafusion_physical_expr_common::sort_expr::LexOrdering; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; +use datafusion_physical_plan::projection::ProjectionExprs; use object_store::ObjectStore; @@ -38,7 +40,7 @@ use object_store::ObjectStore; pub struct AvroSource { table_schema: TableSchema, batch_size: Option, - projection: Option>, + projection: SplitProjection, metrics: ExecutionPlanMetricsSet, schema_adapter_factory: Option>, } @@ -46,21 +48,30 @@ pub struct AvroSource { impl AvroSource { /// Initialize an AvroSource with the provided schema pub fn new(table_schema: impl Into) -> Self { + let table_schema = table_schema.into(); Self { - table_schema: table_schema.into(), + projection: SplitProjection::unprojected(&table_schema), + table_schema, batch_size: None, - projection: None, metrics: ExecutionPlanMetricsSet::new(), schema_adapter_factory: None, } } fn open(&self, reader: R) -> Result> { + let file_schema = self.table_schema.file_schema(); + let projection = Some( + self.projection + .file_indices + .iter() + .map(|&idx| file_schema.field(idx).name().clone()) + .collect::>(), + ); AvroReader::try_new( reader, &Arc::clone(self.table_schema.file_schema()), self.batch_size.expect("Batch size must set before open"), - self.projection.clone().as_ref(), + projection.as_ref(), ) } } @@ -71,11 +82,17 @@ impl FileSource for AvroSource { object_store: Arc, _base_config: &FileScanConfig, _partition: usize, - ) -> Arc { - Arc::new(private::AvroOpener { + ) -> Result> { + let mut opener = Arc::new(private::AvroOpener { config: Arc::new(self.clone()), object_store, - }) + }) as Arc; + opener = ProjectionOpener::try_new( + self.projection.clone(), + Arc::clone(&opener), + self.table_schema.file_schema(), + )?; + Ok(opener) } fn as_any(&self) -> &dyn Any { @@ -92,10 +109,20 @@ impl FileSource for AvroSource { Arc::new(conf) } - fn with_projection(&self, config: &FileScanConfig) -> Arc { - let mut conf = self.clone(); - conf.projection = config.projected_file_column_names(); - Arc::new(conf) + fn try_pushdown_projection( + &self, + projection: &ProjectionExprs, + ) -> Result>> { + let mut source = self.clone(); + let new_projection = self.projection.source.try_merge(projection)?; + let split_projection = + SplitProjection::new(self.table_schema.file_schema(), &new_projection); + 
source.projection = split_projection; + Ok(Some(Arc::new(source))) + } + + fn projection(&self) -> Option<&ProjectionExprs> { + Some(&self.projection.source) } fn metrics(&self) -> &ExecutionPlanMetricsSet { diff --git a/datafusion/datasource-csv/src/mod.rs b/datafusion/datasource-csv/src/mod.rs index 9af57c43103c..3038bfb7ef42 100644 --- a/datafusion/datasource-csv/src/mod.rs +++ b/datafusion/datasource-csv/src/mod.rs @@ -27,6 +27,7 @@ pub mod source; use std::sync::Arc; +use datafusion_common::Result; use datafusion_datasource::file_groups::FileGroup; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::{file::FileSource, file_scan_config::FileScanConfig}; @@ -37,8 +38,10 @@ pub use file_format::*; pub fn partitioned_csv_config( file_groups: Vec, file_source: Arc, -) -> FileScanConfig { - FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) - .with_file_groups(file_groups) - .build() +) -> Result { + Ok( + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) + .with_file_groups(file_groups) + .build(), + ) } diff --git a/datafusion/datasource-csv/src/source.rs b/datafusion/datasource-csv/src/source.rs index b68c55c79783..95f369962733 100644 --- a/datafusion/datasource-csv/src/source.rs +++ b/datafusion/datasource-csv/src/source.rs @@ -17,7 +17,9 @@ //! Execution plan for reading CSV files +use datafusion_datasource::projection::{ProjectionOpener, SplitProjection}; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; +use datafusion_physical_plan::projection::ProjectionExprs; use std::any::Any; use std::fmt; use std::io::{Read, Seek, SeekFrom}; @@ -88,7 +90,7 @@ pub struct CsvSource { options: CsvOptions, batch_size: Option, table_schema: TableSchema, - file_projection: Option>, + projection: SplitProjection, metrics: ExecutionPlanMetricsSet, schema_adapter_factory: Option>, } @@ -96,11 +98,12 @@ pub struct CsvSource { impl CsvSource { /// Returns a [`CsvSource`] pub fn new(table_schema: impl Into) -> Self { + let table_schema = table_schema.into(); Self { options: CsvOptions::default(), - table_schema: table_schema.into(), + projection: SplitProjection::unprojected(&table_schema), + table_schema, batch_size: None, - file_projection: None, metrics: ExecutionPlanMetricsSet::new(), schema_adapter_factory: None, } @@ -194,9 +197,7 @@ impl CsvSource { if let Some(terminator) = self.terminator() { builder = builder.with_terminator(terminator); } - if let Some(proj) = &self.file_projection { - builder = builder.with_projection(proj.clone()); - } + builder = builder.with_projection(self.projection.file_indices.clone()); if let Some(escape) = self.escape() { builder = builder.with_escape(escape) } @@ -243,14 +244,20 @@ impl FileSource for CsvSource { &self, object_store: Arc, base_config: &FileScanConfig, - partition: usize, - ) -> Arc { - Arc::new(CsvOpener { + partition_index: usize, + ) -> Result> { + let mut opener = Arc::new(CsvOpener { config: Arc::new(self.clone()), file_compression_type: base_config.file_compression_type, object_store, - partition_index: partition, - }) + partition_index, + }) as Arc; + opener = ProjectionOpener::try_new( + self.projection.clone(), + Arc::clone(&opener), + self.table_schema.file_schema(), + )?; + Ok(opener) } fn as_any(&self) -> &dyn Any { @@ -267,10 +274,20 @@ impl FileSource for CsvSource { Arc::new(conf) } - fn with_projection(&self, config: &FileScanConfig) -> Arc { - let mut conf = self.clone(); - conf.file_projection = 
config.file_column_projection_indices(); - Arc::new(conf) + fn try_pushdown_projection( + &self, + projection: &ProjectionExprs, + ) -> Result>> { + let mut source = self.clone(); + let new_projection = self.projection.source.try_merge(projection)?; + let split_projection = + SplitProjection::new(self.table_schema.file_schema(), &new_projection); + source.projection = split_projection; + Ok(Some(Arc::new(source))) + } + + fn projection(&self) -> Option<&ProjectionExprs> { + Some(&self.projection.source) } fn metrics(&self) -> &ExecutionPlanMetricsSet { diff --git a/datafusion/datasource-json/src/file_format.rs b/datafusion/datasource-json/src/file_format.rs index afb12e526271..27d1c6d960b9 100644 --- a/datafusion/datasource-json/src/file_format.rs +++ b/datafusion/datasource-json/src/file_format.rs @@ -254,16 +254,10 @@ impl FileFormat for JsonFormat { _state: &dyn Session, conf: FileScanConfig, ) -> Result> { - let table_schema = TableSchema::new( - Arc::clone(conf.file_schema()), - conf.table_partition_cols().clone(), - ); - let source = Arc::new(JsonSource::new(table_schema)); let conf = FileScanConfigBuilder::from(conf) .with_file_compression_type(FileCompressionType::from( self.options.compression, )) - .with_source(source) .build(); Ok(DataSourceExec::from_data_source(conf)) } diff --git a/datafusion/datasource-json/src/source.rs b/datafusion/datasource-json/src/source.rs index fd382efc75d3..db070d2033f4 100644 --- a/datafusion/datasource-json/src/source.rs +++ b/datafusion/datasource-json/src/source.rs @@ -29,10 +29,12 @@ use datafusion_common_runtime::JoinSet; use datafusion_datasource::decoder::{deserialize_stream, DecoderDeserializer}; use datafusion_datasource::file_compression_type::FileCompressionType; use datafusion_datasource::file_stream::{FileOpenFuture, FileOpener}; +use datafusion_datasource::projection::{ProjectionOpener, SplitProjection}; use datafusion_datasource::schema_adapter::SchemaAdapterFactory; use datafusion_datasource::{ as_file_source, calculate_range, ListingTableUrl, PartitionedFile, RangeCalculation, }; +use datafusion_physical_plan::projection::ProjectionExprs; use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use arrow::json::ReaderBuilder; @@ -79,13 +81,16 @@ pub struct JsonSource { batch_size: Option, metrics: ExecutionPlanMetricsSet, schema_adapter_factory: Option>, + projection: SplitProjection, } impl JsonSource { /// Initialize a JsonSource with the provided schema pub fn new(table_schema: impl Into) -> Self { + let table_schema = table_schema.into(); Self { - table_schema: table_schema.into(), + projection: SplitProjection::unprojected(&table_schema), + table_schema, batch_size: None, metrics: ExecutionPlanMetricsSet::new(), schema_adapter_factory: None, @@ -105,15 +110,29 @@ impl FileSource for JsonSource { object_store: Arc, base_config: &FileScanConfig, _partition: usize, - ) -> Arc { - Arc::new(JsonOpener { + ) -> Result> { + // Get the projected file schema for JsonOpener + let file_schema = self.table_schema.file_schema(); + let projected_schema = + Arc::new(file_schema.project(&self.projection.file_indices)?); + + let mut opener = Arc::new(JsonOpener { batch_size: self .batch_size .expect("Batch size must set before creating opener"), - projected_schema: base_config.projected_file_schema(), + projected_schema, file_compression_type: base_config.file_compression_type, object_store, - }) + }) as Arc; + + // Wrap with ProjectionOpener + opener = ProjectionOpener::try_new( + self.projection.clone(), + Arc::clone(&opener), 
+ self.table_schema.file_schema(), + )?; + + Ok(opener) } fn as_any(&self) -> &dyn Any { @@ -130,8 +149,20 @@ impl FileSource for JsonSource { Arc::new(conf) } - fn with_projection(&self, _config: &FileScanConfig) -> Arc { - Arc::new(Self { ..self.clone() }) + fn try_pushdown_projection( + &self, + projection: &ProjectionExprs, + ) -> Result>> { + let mut source = self.clone(); + let new_projection = self.projection.source.try_merge(projection)?; + let split_projection = + SplitProjection::new(self.table_schema.file_schema(), &new_projection); + source.projection = split_projection; + Ok(Some(Arc::new(source))) + } + + fn projection(&self) -> Option<&ProjectionExprs> { + Some(&self.projection.source) } fn metrics(&self) -> &ExecutionPlanMetricsSet { diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 385bfb5472a5..a2ce16cd530d 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -460,12 +460,13 @@ impl FileFormat for ParquetFormat { metadata_size_hint = Some(metadata); } - let table_schema = TableSchema::new( - Arc::clone(conf.file_schema()), - conf.table_partition_cols().clone(), - ); - let mut source = ParquetSource::new(table_schema) - .with_table_parquet_options(self.options.clone()); + let mut source = conf + .file_source() + .as_any() + .downcast_ref::() + .cloned() + .ok_or_else(|| internal_datafusion_err!("Expected ParquetSource"))?; + source = source.with_table_parquet_options(self.options.clone()); // Use the CachedParquetFileReaderFactory let metadata_cache = state.runtime_env().cache_manager.get_file_metadata_cache(); diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index ad7474af80c2..5ed74ecfd98f 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -31,6 +31,7 @@ use datafusion_common::config::ConfigOptions; use datafusion_common::config::EncryptionFactoryOptions; use datafusion_datasource::as_file_source; use datafusion_datasource::file_stream::FileOpener; +use datafusion_datasource::projection::{ProjectionOpener, SplitProjection}; use datafusion_datasource::schema_adapter::{ DefaultSchemaAdapterFactory, SchemaAdapterFactory, }; @@ -51,6 +52,7 @@ use datafusion_physical_plan::filter_pushdown::{ }; use datafusion_physical_plan::metrics::Count; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; +use datafusion_physical_plan::projection::ProjectionExprs; use datafusion_physical_plan::DisplayFormatType; #[cfg(feature = "parquet_encryption")] @@ -286,6 +288,8 @@ pub struct ParquetSource { pub(crate) batch_size: Option, /// Optional hint for the size of the parquet metadata pub(crate) metadata_size_hint: Option, + /// Projection information for column pushdown + pub(crate) projection: SplitProjection, #[cfg(feature = "parquet_encryption")] pub(crate) encryption_factory: Option>, } @@ -297,8 +301,10 @@ impl ParquetSource { /// Uses default `TableParquetOptions`. /// To set custom options, use [ParquetSource::with_table_parquet_options`]. 
pub fn new(table_schema: impl Into) -> Self { + let table_schema = table_schema.into(); Self { - table_schema: table_schema.into(), + projection: SplitProjection::unprojected(&table_schema), + table_schema, table_parquet_options: TableParquetOptions::default(), metrics: ExecutionPlanMetricsSet::new(), predicate: None, @@ -512,10 +518,8 @@ impl FileSource for ParquetSource { object_store: Arc, base_config: &FileScanConfig, partition: usize, - ) -> Arc { - let projection = base_config - .file_column_projection_indices() - .unwrap_or_else(|| (0..base_config.file_schema().fields().len()).collect()); + ) -> datafusion_common::Result> { + let split_projection = self.projection.clone(); let (expr_adapter_factory, schema_adapter_factory) = match ( base_config.expr_adapter_factory.as_ref(), @@ -576,9 +580,9 @@ impl FileSource for ParquetSource { .as_ref() .map(|time_unit| parse_coerce_int96_string(time_unit.as_str()).unwrap()); - Arc::new(ParquetOpener { + let mut opener = Arc::new(ParquetOpener { partition_index: partition, - projection: Arc::from(projection), + projection: Arc::from(split_projection.file_indices.clone()), batch_size: self .batch_size .expect("Batch size must set before creating ParquetOpener"), @@ -602,7 +606,13 @@ impl FileSource for ParquetSource { #[cfg(feature = "parquet_encryption")] encryption_factory: self.get_encryption_factory_with_config(), max_predicate_cache_size: self.max_predicate_cache_size(), - }) + }) as Arc; + opener = ProjectionOpener::try_new( + split_projection.clone(), + Arc::clone(&opener), + self.table_schema.file_schema(), + )?; + Ok(opener) } fn as_any(&self) -> &dyn Any { @@ -623,8 +633,20 @@ impl FileSource for ParquetSource { Arc::new(conf) } - fn with_projection(&self, _config: &FileScanConfig) -> Arc { - Arc::new(Self { ..self.clone() }) + fn try_pushdown_projection( + &self, + projection: &ProjectionExprs, + ) -> datafusion_common::Result>> { + let mut source = self.clone(); + let new_projection = self.projection.source.try_merge(projection)?; + let split_projection = + SplitProjection::new(self.table_schema.file_schema(), &new_projection); + source.projection = split_projection; + Ok(Some(Arc::new(source))) + } + + fn projection(&self) -> Option<&ProjectionExprs> { + Some(&self.projection.source) } fn metrics(&self) -> &ExecutionPlanMetricsSet { diff --git a/datafusion/datasource/src/file.rs b/datafusion/datasource/src/file.rs index 9ec34b5dda0c..3668e0e4a77e 100644 --- a/datafusion/datasource/src/file.rs +++ b/datafusion/datasource/src/file.rs @@ -28,6 +28,7 @@ use crate::file_stream::FileOpener; use crate::schema_adapter::SchemaAdapterFactory; use datafusion_common::config::ConfigOptions; use datafusion_common::{not_impl_err, Result}; +use datafusion_physical_expr::projection::ProjectionExprs; use datafusion_physical_expr::{LexOrdering, PhysicalExpr}; use datafusion_physical_plan::filter_pushdown::{FilterPushdownPropagation, PushedDown}; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; @@ -57,7 +58,7 @@ pub trait FileSource: Send + Sync { object_store: Arc, base_config: &FileScanConfig, partition: usize, - ) -> Arc; + ) -> Result>; /// Any fn as_any(&self) -> &dyn Any; /// Returns the table schema for this file source. 
@@ -66,12 +67,14 @@ pub trait FileSource: Send + Sync { fn table_schema(&self) -> &crate::table_schema::TableSchema; /// Initialize new type with batch size configuration fn with_batch_size(&self, batch_size: usize) -> Arc; - /// Initialize new instance with projection information - fn with_projection(&self, config: &FileScanConfig) -> Arc; /// Returns the filter expression that will be applied during the file scan. fn filter(&self) -> Option> { None } + /// Return the projection that will be applied to the output stream on top of the table schema. + fn projection(&self) -> Option<&ProjectionExprs> { + None + } /// Return execution plan metrics fn metrics(&self) -> &ExecutionPlanMetricsSet; /// String representation of file source such as "csv", "json", "parquet" @@ -126,6 +129,35 @@ pub trait FileSource: Send + Sync { )) } + /// Try to push down a projection into a this FileSource. + /// + /// `FileSource` implementations that support projection pushdown should + /// override this method and return a new `FileSource` instance with the + /// projection incorporated. + /// + /// If a `FileSource` does accept a projection it is expected to handle + /// the projection in it's entirety, including partition columns. + /// For example, the `FileSource` may translate that projection into a + /// file format specific projection (e.g. Parquet can push down struct field access, + /// some other file formats like Vortex can push down computed expressions into un-decoded data) + /// and also need to handle partition column projection (generally done by replacing partition column + /// references with literal values derived from each files partition values). + /// + /// Not all FileSource's can handle complex expression pushdowns. For example, + /// a CSV file source may only support simple column selections. In such cases, + /// the `FileSource` can use [`SplitProjection`] and [`ProjectionOpener`] + /// to split the projection into a pushdownable part and a non-pushdownable part. + /// These helpers also handle partition column projection. + /// + /// [`SplitProjection`]: crate::projection::SplitProjection + /// [`ProjectionOpener`]: crate::projection::ProjectionOpener + fn try_pushdown_projection( + &self, + _projection: &ProjectionExprs, + ) -> Result>> { + Ok(None) + } + /// Set optional schema adapter factory. 
/// /// [`SchemaAdapterFactory`] allows user to specify how fields from the diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 4387996a2981..12654ee5b6af 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -27,43 +27,30 @@ use crate::{ source::DataSource, statistics::MinMaxStatistics, PartitionedFile, }; use arrow::datatypes::FieldRef; -use arrow::{ - array::{ - ArrayData, ArrayRef, BufferBuilder, DictionaryArray, RecordBatch, - RecordBatchOptions, - }, - buffer::Buffer, - datatypes::{ArrowNativeType, DataType, Schema, SchemaRef, UInt16Type}, -}; +use arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion_common::config::ConfigOptions; use datafusion_common::{ - exec_datafusion_err, exec_err, internal_datafusion_err, ColumnStatistics, - Constraints, Result, ScalarValue, Statistics, + internal_datafusion_err, internal_err, ColumnStatistics, Constraints, Result, + ScalarValue, Statistics, }; use datafusion_execution::{ object_store::ObjectStoreUrl, SendableRecordBatchStream, TaskContext, }; use datafusion_expr::Operator; -use datafusion_physical_expr::expressions::{BinaryExpr, Column}; +use datafusion_physical_expr::expressions::BinaryExpr; use datafusion_physical_expr::projection::ProjectionExprs; use datafusion_physical_expr::utils::reassign_expr_columns; use datafusion_physical_expr::{split_conjunction, EquivalenceProperties, Partitioning}; use datafusion_physical_expr_adapter::PhysicalExprAdapterFactory; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::LexOrdering; -use datafusion_physical_plan::projection::{ - all_alias_free_columns, new_projections_for_columns, ProjectionExpr, -}; use datafusion_physical_plan::{ display::{display_orderings, ProjectSchemaDisplay}, filter_pushdown::FilterPushdownPropagation, metrics::ExecutionPlanMetricsSet, DisplayAs, DisplayFormatType, }; -use std::{ - any::Any, borrow::Cow, collections::HashMap, fmt::Debug, fmt::Formatter, - fmt::Result as FmtResult, marker::PhantomData, sync::Arc, -}; +use std::{any::Any, fmt::Debug, fmt::Formatter, fmt::Result as FmtResult, sync::Arc}; use datafusion_physical_expr::equivalence::project_orderings; use datafusion_physical_plan::coop::cooperative; @@ -81,7 +68,6 @@ use log::{debug, warn}; /// # use std::sync::Arc; /// # use arrow::datatypes::{Field, Fields, DataType, Schema, SchemaRef}; /// # use object_store::ObjectStore; -/// # use datafusion_common::Statistics; /// # use datafusion_common::Result; /// # use datafusion_datasource::file::FileSource; /// # use datafusion_datasource::file_groups::FileGroup; @@ -91,6 +77,7 @@ use log::{debug, warn}; /// # use datafusion_datasource::source::DataSourceExec; /// # use datafusion_datasource::table_schema::TableSchema; /// # use datafusion_execution::object_store::ObjectStoreUrl; +/// # use datafusion_physical_expr::projection::ProjectionExprs; /// # use datafusion_physical_plan::ExecutionPlan; /// # use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; /// # use datafusion_datasource::schema_adapter::SchemaAdapterFactory; @@ -107,15 +94,16 @@ use log::{debug, warn}; /// # schema_adapter_factory: Option> /// # }; /// # impl FileSource for ParquetSource { -/// # fn create_file_opener(&self, _: Arc, _: &FileScanConfig, _: usize) -> Arc { unimplemented!() } +/// # fn create_file_opener(&self, _: Arc, _: &FileScanConfig, _: usize) -> Result> { unimplemented!() } /// # fn 
as_any(&self) -> &dyn Any { self } /// # fn table_schema(&self) -> &TableSchema { &self.table_schema } /// # fn with_batch_size(&self, _: usize) -> Arc { unimplemented!() } -/// # fn with_projection(&self, _: &FileScanConfig) -> Arc { unimplemented!() } /// # fn metrics(&self) -> &ExecutionPlanMetricsSet { unimplemented!() } /// # fn file_type(&self) -> &str { "parquet" } /// # fn with_schema_adapter_factory(&self, factory: Arc) -> Result> { Ok(Arc::new(Self {table_schema: self.table_schema.clone(), schema_adapter_factory: Some(factory)} )) } /// # fn schema_adapter_factory(&self) -> Option> { self.schema_adapter_factory.clone() } +/// # // Note that this implementation drops the projection on the floor, it is not complete! +/// # fn try_pushdown_projection(&self, projection: &ProjectionExprs) -> Result>> { Ok(Some(Arc::new(self.clone()) as Arc)) } /// # } /// # impl ParquetSource { /// # fn new(table_schema: impl Into) -> Self { Self {table_schema: table_schema.into(), schema_adapter_factory: None} } @@ -126,6 +114,7 @@ use log::{debug, warn}; /// let config = FileScanConfigBuilder::new(object_store_url, file_source) /// .with_limit(Some(1000)) // read only the first 1000 records /// .with_projection_indices(Some(vec![2, 3])) // project columns 2 and 3 +/// .expect("Failed to push down projection") /// // Read /tmp/file1.parquet with known size of 1234 bytes in a single group /// .with_file(PartitionedFile::new("file1.parquet", 1234)) /// // Read /tmp/file2.parquet 56 bytes and /tmp/file3.parquet 78 bytes @@ -166,12 +155,6 @@ pub struct FileScanConfig { pub file_groups: Vec, /// Table constraints pub constraints: Constraints, - /// Physical expressions defining the projection to apply when reading data. - /// - /// Each expression in the projection can reference columns from both the file - /// schema and table partition columns. If `None`, all columns from the table - /// schema are projected. - pub projection_exprs: Option, /// The maximum number of records to read from this plan. If `None`, /// all records after filtering are returned. pub limit: Option, @@ -239,6 +222,7 @@ pub struct FileScanConfig { /// .with_limit(Some(1000)) /// // Project only the first column /// .with_projection_indices(Some(vec![0])) +/// .expect("Failed to push down projection") /// // Add a file group with two files /// .with_file_group(FileGroup::new(vec![ /// PartitionedFile::new("data/date=2024-01-01/file1.parquet", 1024), @@ -256,7 +240,6 @@ pub struct FileScanConfigBuilder { object_store_url: ObjectStoreUrl, file_source: Arc, limit: Option, - projection_indices: Option>, constraints: Option, file_groups: Vec, statistics: Option, @@ -287,7 +270,6 @@ impl FileScanConfigBuilder { file_compression_type: None, new_lines_in_values: None, limit: None, - projection_indices: None, constraints: None, batch_size: None, expr_adapter_factory: None, @@ -321,15 +303,48 @@ impl FileScanConfigBuilder { /// Use [`Self::with_projection_indices`] instead. This method will be removed in a future release. #[deprecated(since = "51.0.0", note = "Use with_projection_indices instead")] pub fn with_projection(self, indices: Option>) -> Self { - self.with_projection_indices(indices) + match self.clone().with_projection_indices(indices) { + Ok(builder) => builder, + Err(e) => { + warn!("Failed to push down projection in FileScanConfigBuilder::with_projection: {e}"); + self + } + } } /// Set the columns on which to project the data using column indices. 
/// /// Indexes that are higher than the number of columns of `file_schema` refer to `table_partition_cols`. - pub fn with_projection_indices(mut self, indices: Option>) -> Self { - self.projection_indices = indices; - self + pub fn with_projection_indices( + mut self, + indices: Option>, + ) -> Result { + let projection_exprs = indices.map(|indices| { + ProjectionExprs::from_indices( + &indices, + self.file_source.table_schema().table_schema(), + ) + }); + let Some(projection_exprs) = projection_exprs else { + return Ok(self); + }; + let new_source = self + .file_source + .try_pushdown_projection(&projection_exprs) + .map_err(|e| { + internal_datafusion_err!( + "Failed to push down projection in FileScanConfigBuilder::with_projection_indices: {e}" + ) + })?; + if let Some(new_source) = new_source { + self.file_source = new_source; + } else { + internal_err!( + "FileSource {} does not support projection pushdown", + self.file_source.file_type() + )?; + } + Ok(self) } /// Set the table constraints @@ -423,12 +438,14 @@ impl FileScanConfigBuilder { /// /// This method takes ownership of the builder and returns the constructed `FileScanConfig`. /// Any unset optional fields will use their default values. + /// + /// Note that projection pushdown errors are reported by + /// [`Self::with_projection_indices`]; `build` itself does not return an error. pub fn build(self) -> FileScanConfig { let Self { object_store_url, file_source, limit, - projection_indices, constraints, file_groups, statistics, @@ -443,25 +460,14 @@ impl FileScanConfigBuilder { let statistics = statistics.unwrap_or_else(|| { Statistics::new_unknown(file_source.table_schema().table_schema()) }); - let file_compression_type = file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED); let new_lines_in_values = new_lines_in_values.unwrap_or(false); - // Convert projection indices to ProjectionExprs using the final table schema - // (which now includes partition columns if they were added) - let projection_exprs = projection_indices.map(|indices| { - ProjectionExprs::from_indices( - &indices, - file_source.table_schema().table_schema(), - ) - }); - FileScanConfig { object_store_url, file_source, limit, - projection_exprs, constraints, file_groups, output_ordering, @@ -485,9 +491,6 @@ impl From for FileScanConfigBuilder { file_compression_type: Some(config.file_compression_type), new_lines_in_values: Some(config.new_lines_in_values), limit: config.limit, - projection_indices: config - .projection_exprs - .map(|p| p.ordered_column_indices()), constraints: Some(config.constraints), batch_size: config.batch_size, expr_adapter_factory: config.expr_adapter_factory, @@ -506,12 +509,9 @@ impl DataSource for FileScanConfig { .batch_size .unwrap_or_else(|| context.session_config().batch_size()); - let source = self - .file_source - .with_batch_size(batch_size) - .with_projection(self); + let source = self.file_source.with_batch_size(batch_size); - let opener = source.create_file_opener(object_store, self, partition); + let opener = source.create_file_opener(object_store, self, partition)?; let stream = FileStream::new(self, partition, opener, source.metrics())?; Ok(Box::pin(cooperative(stream))) @@ -524,14 +524,35 @@ impl DataSource for FileScanConfig { fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { match t { DisplayFormatType::Default | DisplayFormatType::Verbose => { - let schema = self.projected_schema(); + let schema = self.projected_schema().map_err(|_| std::fmt::Error {})?; let orderings = get_projected_output_ordering(self, &schema); write!(f, "file_groups=")?;
FileGroupsDisplay(&self.file_groups).fmt_as(t, f)?; if !schema.fields().is_empty() { - write!(f, ", projection={}", ProjectSchemaDisplay(&schema))?; + if let Some(projection) = self.file_source.projection() { + // This matches what ProjectionExec does. + // TODO: can we put this into ProjectionExprs so that it's shared code? + let expr: Vec = projection + .as_ref() + .iter() + .map(|proj_expr| { + if let Some(column) = proj_expr.expr.as_any().downcast_ref::() { + if column.name() == proj_expr.alias { + column.name().to_string() + } else { + format!("{} as {}", proj_expr.expr, proj_expr.alias) + } + } else { + format!("{} as {}", proj_expr.expr, proj_expr.alias) + } + }) + .collect(); + write!(f, ", projection=[{}]", expr.join(", "))?; + } else { + write!(f, ", projection={}", ProjectSchemaDisplay(&schema))?; + } } if let Some(limit) = self.limit { @@ -578,15 +599,17 @@ impl DataSource for FileScanConfig { } fn eq_properties(&self) -> EquivalenceProperties { - let (schema, constraints, _, orderings) = self.project(); - let mut eq_properties = - EquivalenceProperties::new_with_orderings(Arc::clone(&schema), orderings) - .with_constraints(constraints); + let schema = self.file_source.table_schema().table_schema(); + let mut eq_properties = EquivalenceProperties::new_with_orderings( + Arc::clone(schema), + self.output_ordering.clone(), + ) + .with_constraints(self.constraints.clone()); + if let Some(filter) = self.file_source.filter() { // We need to remap column indexes to match the projected schema since that's what the equivalence properties deal with. // Note that this will *ignore* any non-projected columns: these don't factor into ordering / equivalence. - match Self::add_filter_equivalence_info(&filter, &mut eq_properties, &schema) - { + match Self::add_filter_equivalence_info(&filter, &mut eq_properties, schema) { Ok(()) => {} Err(e) => { warn!("Failed to add filter equivalence info: {e}"); @@ -595,6 +618,24 @@ impl DataSource for FileScanConfig { } } } + + if let Some(projection) = self.file_source.projection() { + match ( + projection.project_schema(schema), + projection.projection_mapping(schema), + ) { + (Ok(output_schema), Ok(mapping)) => { + eq_properties = + eq_properties.project(&mapping, Arc::new(output_schema)); + } + (Err(e), _) | (_, Err(e)) => { + warn!("Failed to project equivalence properties: {e}"); + #[cfg(debug_assertions)] + panic!("Failed to project equivalence properties: {e}"); + } + } + } + eq_properties } @@ -630,7 +671,7 @@ impl DataSource for FileScanConfig { } } // If no statistics available for this partition, return unknown - Ok(Statistics::new_unknown(&self.projected_schema())) + Ok(Statistics::new_unknown(self.projected_schema()?.as_ref())) } else { // Return aggregate statistics across all partitions Ok(self.projected_stats()) @@ -654,43 +695,16 @@ impl DataSource for FileScanConfig { fn try_swapping_with_projection( &self, - projection: &[ProjectionExpr], + projection: &ProjectionExprs, ) -> Result>> { - // This process can be moved into CsvExec, but it would be an overlap of their responsibility. - - // Must be all column references, with no table partition columns (which can not be projected) - let partitioned_columns_in_proj = projection.iter().any(|proj_expr| { - proj_expr - .expr - .as_any() - .downcast_ref::() - .map(|expr| expr.index() >= self.file_schema().fields().len()) - .unwrap_or(false) - }); - - // If there is any non-column or alias-carrier expression, Projection should not be removed. 
- let no_aliases = all_alias_free_columns(projection); - - Ok((no_aliases && !partitioned_columns_in_proj).then(|| { - let file_scan = self.clone(); - let source = Arc::clone(&file_scan.file_source); - let new_projections = new_projections_for_columns( - projection, - &file_scan - .projection_exprs - .as_ref() - .map(|p| p.ordered_column_indices()) - .unwrap_or_else(|| (0..self.file_schema().fields().len()).collect()), - ); - - Arc::new( - FileScanConfigBuilder::from(file_scan) - // Assign projected statistics to source - .with_projection_indices(Some(new_projections)) - .with_source(source) - .build(), - ) as _ - })) + match self.file_source.try_pushdown_projection(projection)? { + Some(new_source) => { + let mut new_file_scan_config = self.clone(); + new_file_scan_config.file_source = new_source; + Ok(Some(Arc::new(new_file_scan_config) as Arc)) + } + None => Ok(None), + } } fn try_pushdown_filters( @@ -698,15 +712,49 @@ impl DataSource for FileScanConfig { filters: Vec>, config: &ConfigOptions, ) -> Result>> { - let result = self.file_source.try_pushdown_filters(filters, config)?; + // Remap filter Column indices to match the table schema (file + partition columns). + // This is necessary because filters may have been created against a different schema + // (e.g., after projection pushdown) and need to be remapped to the table schema + // before being passed to the file source and ultimately serialized. + // For example, suppose the filter being pushed down is `c1_c2 > 5` and it was created + // against the output schema of this `DataSource`, which has the projection `c1 + c2 as c1_c2`. + // Thus we need to rewrite the filter back to `c1 + c2 > 5` before passing it to the file source. + let table_schema = self.file_source.table_schema().table_schema(); + // If there's a projection with aliases, first map the filters back through + // the projection expressions before remapping to the table schema. + let filters_to_remap = if let Some(projection) = self.file_source.projection() { + use datafusion_physical_plan::projection::update_expr; + filters + .into_iter() + .map(|filter| { + update_expr(&filter, projection.as_ref(), true)?.ok_or_else(|| { + internal_datafusion_err!( + "Failed to map filter expression through projection: {}", + filter + ) + }) + }) + .collect::>>()? + } else { + filters + }; + // Now remap column indices to match the table schema.
+ let remapped_filters: Result> = filters_to_remap + .into_iter() + .map(|filter| reassign_expr_columns(filter, table_schema.as_ref())) + .collect(); + let remapped_filters = remapped_filters?; + + let result = self + .file_source + .try_pushdown_filters(remapped_filters, config)?; match result.updated_node { Some(new_file_source) => { - let file_scan_config = FileScanConfigBuilder::from(self.clone()) - .with_source(new_file_source) - .build(); + let mut new_file_scan_config = self.clone(); + new_file_scan_config.file_source = new_file_source; Ok(FilterPushdownPropagation { filters: result.filters, - updated_node: Some(Arc::new(file_scan_config) as _), + updated_node: Some(Arc::new(new_file_scan_config) as _), }) } None => { @@ -732,7 +780,7 @@ impl FileScanConfig { } fn projection_indices(&self) -> Vec { - match &self.projection_exprs { + match self.file_source.projection() { Some(proj) => proj.ordered_column_indices(), None => (0..self.file_schema().fields().len() + self.table_partition_cols().len()) @@ -777,26 +825,12 @@ impl FileScanConfig { } } - pub fn projected_schema(&self) -> Arc { - let table_fields: Vec<_> = self - .projection_indices() - .into_iter() - .map(|idx| { - if idx < self.file_schema().fields().len() { - self.file_schema().field(idx).clone() - } else { - let partition_idx = idx - self.file_schema().fields().len(); - Arc::unwrap_or_clone(Arc::clone( - &self.table_partition_cols()[partition_idx], - )) - } - }) - .collect(); - - Arc::new(Schema::new_with_metadata( - table_fields, - self.file_schema().metadata().clone(), - )) + pub fn projected_schema(&self) -> Result> { + let schema = self.file_source.table_schema().table_schema(); + match self.file_source.projection() { + Some(proj) => Ok(Arc::new(proj.project_schema(schema)?)), + None => Ok(Arc::clone(schema)), + } } fn add_filter_equivalence_info( @@ -841,64 +875,8 @@ impl FileScanConfig { self.new_lines_in_values } - /// Project the schema, constraints, and the statistics on the given column indices - pub fn project(&self) -> (SchemaRef, Constraints, Statistics, Vec) { - if self.projection_exprs.is_none() && self.table_partition_cols().is_empty() { - return ( - Arc::clone(self.file_schema()), - self.constraints.clone(), - self.statistics().clone(), - self.output_ordering.clone(), - ); - } - - let schema = self.projected_schema(); - let constraints = self.projected_constraints(); - let stats = self.projected_stats(); - - let output_ordering = get_projected_output_ordering(self, &schema); - - (schema, constraints, stats, output_ordering) - } - - pub fn projected_file_column_names(&self) -> Option> { - let fields = self.file_schema().fields(); - - self.projection_exprs.as_ref().map(|p| { - let column_indices = p.ordered_column_indices(); - - column_indices - .iter() - .filter(|&&col_i| col_i < fields.len()) - .map(|&col_i| self.file_schema().field(col_i).name()) - .cloned() - .collect::>() - }) - } - - /// Projects only file schema, ignoring partition columns - pub fn projected_file_schema(&self) -> SchemaRef { - let fields = self.file_column_projection_indices().map(|indices| { - indices - .iter() - .map(|col_idx| self.file_schema().field(*col_idx)) - .cloned() - .collect::>() - }); - - fields.map_or_else( - || Arc::clone(self.file_schema()), - |f| { - Arc::new(Schema::new_with_metadata( - f, - self.file_schema().metadata.clone(), - )) - }, - ) - } - pub fn file_column_projection_indices(&self) -> Option> { - self.projection_exprs.as_ref().map(|p| { + self.file_source.projection().as_ref().map(|p| { 
p.ordered_column_indices() .into_iter() .filter(|&i| i < self.file_schema().fields().len()) @@ -1090,7 +1068,7 @@ impl Debug for FileScanConfig { impl DisplayAs for FileScanConfig { fn fmt_as(&self, t: DisplayFormatType, f: &mut Formatter) -> FmtResult { - let schema = self.projected_schema(); + let schema = self.projected_schema().map_err(|_| std::fmt::Error {})?; let orderings = get_projected_output_ordering(self, &schema); write!(f, "file_groups=")?; @@ -1114,252 +1092,6 @@ impl DisplayAs for FileScanConfig { } } -/// A helper that projects partition columns into the file record batches. -/// -/// One interesting trick is the usage of a cache for the key buffers of the partition column -/// dictionaries. Indeed, the partition columns are constant, so the dictionaries that represent them -/// have all their keys equal to 0. This enables us to re-use the same "all-zero" buffer across batches, -/// which makes the space consumption of the partition columns O(batch_size) instead of O(record_count). -pub struct PartitionColumnProjector { - /// An Arrow buffer initialized to zeros that represents the key array of all partition - /// columns (partition columns are materialized by dictionary arrays with only one - /// value in the dictionary, thus all the keys are equal to zero). - key_buffer_cache: ZeroBufferGenerators, - /// Mapping between the indexes in the list of partition columns and the target - /// schema. Sorted by index in the target schema so that we can iterate on it to - /// insert the partition columns in the target record batch. - projected_partition_indexes: Vec<(usize, usize)>, - /// The schema of the table once the projection was applied. - projected_schema: SchemaRef, -} - -impl PartitionColumnProjector { - // Create a projector to insert the partitioning columns into batches read from files - // - `projected_schema`: the target schema with both file and partitioning columns - // - `table_partition_cols`: all the partitioning column names - pub fn new(projected_schema: SchemaRef, table_partition_cols: &[String]) -> Self { - let mut idx_map = HashMap::new(); - for (partition_idx, partition_name) in table_partition_cols.iter().enumerate() { - if let Ok(schema_idx) = projected_schema.index_of(partition_name) { - idx_map.insert(partition_idx, schema_idx); - } - } - - let mut projected_partition_indexes: Vec<_> = idx_map.into_iter().collect(); - projected_partition_indexes.sort_by(|(_, a), (_, b)| a.cmp(b)); - - Self { - projected_partition_indexes, - key_buffer_cache: Default::default(), - projected_schema, - } - } - - // Transform the batch read from the file by inserting the partitioning columns - // to the right positions as deduced from `projected_schema` - // - `file_batch`: batch read from the file, with internal projection applied - // - `partition_values`: the list of partition values, one for each partition column - #[expect(clippy::needless_pass_by_value)] - pub fn project( - &mut self, - file_batch: RecordBatch, - partition_values: &[ScalarValue], - ) -> Result { - let expected_cols = - self.projected_schema.fields().len() - self.projected_partition_indexes.len(); - - if file_batch.columns().len() != expected_cols { - return exec_err!( - "Unexpected batch schema from file, expected {} cols but got {}", - expected_cols, - file_batch.columns().len() - ); - } - - let mut cols = file_batch.columns().to_vec(); - for &(pidx, sidx) in &self.projected_partition_indexes { - let p_value = partition_values.get(pidx).ok_or_else(|| { - exec_datafusion_err!("Invalid partitioning 
found on disk") - })?; - - let mut partition_value = Cow::Borrowed(p_value); - - // check if user forgot to dict-encode the partition value - let field = self.projected_schema.field(sidx); - let expected_data_type = field.data_type(); - let actual_data_type = partition_value.data_type(); - if let DataType::Dictionary(key_type, _) = expected_data_type { - if !matches!(actual_data_type, DataType::Dictionary(_, _)) { - warn!("Partition value for column {} was not dictionary-encoded, applied auto-fix.", field.name()); - partition_value = Cow::Owned(ScalarValue::Dictionary( - key_type.clone(), - Box::new(partition_value.as_ref().clone()), - )); - } - } - - cols.insert( - sidx, - create_output_array( - &mut self.key_buffer_cache, - partition_value.as_ref(), - file_batch.num_rows(), - )?, - ) - } - - RecordBatch::try_new_with_options( - Arc::clone(&self.projected_schema), - cols, - &RecordBatchOptions::new().with_row_count(Some(file_batch.num_rows())), - ) - .map_err(Into::into) - } -} - -#[derive(Debug, Default)] -struct ZeroBufferGenerators { - gen_i8: ZeroBufferGenerator, - gen_i16: ZeroBufferGenerator, - gen_i32: ZeroBufferGenerator, - gen_i64: ZeroBufferGenerator, - gen_u8: ZeroBufferGenerator, - gen_u16: ZeroBufferGenerator, - gen_u32: ZeroBufferGenerator, - gen_u64: ZeroBufferGenerator, -} - -/// Generate a arrow [`Buffer`] that contains zero values. -#[derive(Debug, Default)] -struct ZeroBufferGenerator -where - T: ArrowNativeType, -{ - cache: Option, - _t: PhantomData, -} - -impl ZeroBufferGenerator -where - T: ArrowNativeType, -{ - const SIZE: usize = size_of::(); - - fn get_buffer(&mut self, n_vals: usize) -> Buffer { - match &mut self.cache { - Some(buf) if buf.len() >= n_vals * Self::SIZE => { - buf.slice_with_length(0, n_vals * Self::SIZE) - } - _ => { - let mut key_buffer_builder = BufferBuilder::::new(n_vals); - key_buffer_builder.advance(n_vals); // keys are all 0 - self.cache.insert(key_buffer_builder.finish()).clone() - } - } - } -} - -fn create_dict_array( - buffer_gen: &mut ZeroBufferGenerator, - dict_val: &ScalarValue, - len: usize, - data_type: DataType, -) -> Result -where - T: ArrowNativeType, -{ - let dict_vals = dict_val.to_array()?; - - let sliced_key_buffer = buffer_gen.get_buffer(len); - - // assemble pieces together - let mut builder = ArrayData::builder(data_type) - .len(len) - .add_buffer(sliced_key_buffer); - builder = builder.add_child_data(dict_vals.to_data()); - Ok(Arc::new(DictionaryArray::::from( - builder.build().unwrap(), - ))) -} - -fn create_output_array( - key_buffer_cache: &mut ZeroBufferGenerators, - val: &ScalarValue, - len: usize, -) -> Result { - if let ScalarValue::Dictionary(key_type, dict_val) = &val { - match key_type.as_ref() { - DataType::Int8 => { - return create_dict_array( - &mut key_buffer_cache.gen_i8, - dict_val, - len, - val.data_type(), - ); - } - DataType::Int16 => { - return create_dict_array( - &mut key_buffer_cache.gen_i16, - dict_val, - len, - val.data_type(), - ); - } - DataType::Int32 => { - return create_dict_array( - &mut key_buffer_cache.gen_i32, - dict_val, - len, - val.data_type(), - ); - } - DataType::Int64 => { - return create_dict_array( - &mut key_buffer_cache.gen_i64, - dict_val, - len, - val.data_type(), - ); - } - DataType::UInt8 => { - return create_dict_array( - &mut key_buffer_cache.gen_u8, - dict_val, - len, - val.data_type(), - ); - } - DataType::UInt16 => { - return create_dict_array( - &mut key_buffer_cache.gen_u16, - dict_val, - len, - val.data_type(), - ); - } - DataType::UInt32 => { - return 
create_dict_array( - &mut key_buffer_cache.gen_u32, - dict_val, - len, - val.data_type(), - ); - } - DataType::UInt64 => { - return create_dict_array( - &mut key_buffer_cache.gen_u64, - dict_val, - len, - val.data_type(), - ); - } - _ => {} - } - } - - val.to_array_of_size(len) -} - /// The various listing tables does not attempt to read all files /// concurrently, instead they will read files in sequence within a /// partition. This is an important property as it allows plans to @@ -1436,7 +1168,8 @@ fn get_projected_output_ordering( } let indices = base_config - .projection_exprs + .file_source + .projection() .as_ref() .map(|p| p.ordered_column_indices()); @@ -1492,6 +1225,8 @@ pub fn wrap_partition_value_in_dict(val: ScalarValue) -> ScalarValue { #[cfg(test)] mod tests { + use std::collections::HashMap; + use super::*; use crate::test_util::col; use crate::TableSchema; @@ -1500,53 +1235,15 @@ mod tests { verify_sort_integrity, }; - use arrow::array::{Int32Array, RecordBatch}; use arrow::datatypes::Field; + use datafusion_common::internal_err; use datafusion_common::stats::Precision; - use datafusion_common::{assert_batches_eq, internal_err}; use datafusion_expr::{Operator, SortExpr}; use datafusion_physical_expr::create_physical_sort_expr; use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal}; + use datafusion_physical_expr::projection::ProjectionExpr; use datafusion_physical_expr_common::sort_expr::PhysicalSortExpr; - /// Returns the column names on the schema - pub fn columns(schema: &Schema) -> Vec { - schema.fields().iter().map(|f| f.name().clone()).collect() - } - - #[test] - fn physical_plan_config_no_projection() { - let file_schema = aggr_test_schema(); - let conf = config_for_projection( - Arc::clone(&file_schema), - None, - Statistics::new_unknown(&file_schema), - to_partition_cols(vec![( - "date".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - )]), - ); - - let (proj_schema, _, proj_statistics, _) = conf.project(); - assert_eq!(proj_schema.fields().len(), file_schema.fields().len() + 1); - assert_eq!( - proj_schema.field(file_schema.fields().len()).name(), - "date", - "partition columns are the last columns" - ); - assert_eq!( - proj_statistics.column_statistics.len(), - file_schema.fields().len() + 1 - ); - // TODO implement tests for partition column statistics once implemented - - let col_names = conf.projected_file_column_names(); - assert_eq!(col_names, None); - - let col_indices = conf.file_column_projection_indices(); - assert_eq!(col_indices, None); - } - #[test] fn physical_plan_config_no_projection_tab_cols_as_field() { let file_schema = aggr_test_schema(); @@ -1567,7 +1264,7 @@ mod tests { ); // verify the proj_schema includes the last column and exactly the same the field it is defined - let proj_schema = conf.projected_schema(); + let proj_schema = conf.projected_schema().unwrap(); assert_eq!(proj_schema.fields().len(), file_schema.fields().len() + 1); assert_eq!( *proj_schema.field(file_schema.fields().len()), @@ -1576,223 +1273,6 @@ mod tests { ); } - #[test] - fn physical_plan_config_with_projection() { - let file_schema = aggr_test_schema(); - let conf = config_for_projection( - Arc::clone(&file_schema), - Some(vec![file_schema.fields().len(), 0]), - Statistics { - num_rows: Precision::Inexact(10), - // assign the column index to distinct_count to help assert - // the source statistic after the projection - column_statistics: (0..file_schema.fields().len()) - .map(|i| ColumnStatistics { - distinct_count: Precision::Inexact(i), 
- ..Default::default() - }) - .collect(), - total_byte_size: Precision::Absent, - }, - to_partition_cols(vec![( - "date".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - )]), - ); - - let (proj_schema, _, proj_statistics, _) = conf.project(); - assert_eq!( - columns(&proj_schema), - vec!["date".to_owned(), "c1".to_owned()] - ); - let proj_stat_cols = proj_statistics.column_statistics; - assert_eq!(proj_stat_cols.len(), 2); - // TODO implement tests for proj_stat_cols[0] once partition column - // statistics are implemented - assert_eq!(proj_stat_cols[1].distinct_count, Precision::Inexact(0)); - - let col_names = conf.projected_file_column_names(); - assert_eq!(col_names, Some(vec!["c1".to_owned()])); - - let col_indices = conf.file_column_projection_indices(); - assert_eq!(col_indices, Some(vec![0])); - } - - #[test] - fn partition_column_projector() { - let file_batch = build_table_i32( - ("a", &vec![0, 1, 2]), - ("b", &vec![-2, -1, 0]), - ("c", &vec![10, 11, 12]), - ); - let partition_cols = vec![ - ( - "year".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - ), - ( - "month".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - ), - ( - "day".to_owned(), - wrap_partition_type_in_dict(DataType::Utf8), - ), - ]; - // create a projected schema - let statistics = Statistics { - num_rows: Precision::Inexact(3), - total_byte_size: Precision::Absent, - column_statistics: Statistics::unknown_column(&file_batch.schema()), - }; - - let conf = config_for_projection( - file_batch.schema(), - // keep all cols from file and 2 from partitioning - Some(vec![ - 0, - 1, - 2, - file_batch.schema().fields().len(), - file_batch.schema().fields().len() + 2, - ]), - statistics.clone(), - to_partition_cols(partition_cols.clone()), - ); - - let source_statistics = conf.statistics(); - let conf_stats = conf.partition_statistics(None).unwrap(); - - // projection should be reflected in the file source statistics - assert_eq!(conf_stats.num_rows, Precision::Inexact(3)); - - // 3 original statistics + 2 partition statistics - assert_eq!(conf_stats.column_statistics.len(), 5); - - // file statics should not be modified - assert_eq!(source_statistics, statistics); - assert_eq!(source_statistics.column_statistics.len(), 3); - - let proj_schema = conf.projected_schema(); - // created a projector for that projected schema - let mut proj = PartitionColumnProjector::new( - proj_schema, - &partition_cols - .iter() - .map(|x| x.0.clone()) - .collect::>(), - ); - - // project first batch - let projected_batch = proj - .project( - // file_batch is ok here because we kept all the file cols in the projection - file_batch, - &[ - wrap_partition_value_in_dict(ScalarValue::from("2021")), - wrap_partition_value_in_dict(ScalarValue::from("10")), - wrap_partition_value_in_dict(ScalarValue::from("26")), - ], - ) - .expect("Projection of partition columns into record batch failed"); - let expected = [ - "+---+----+----+------+-----+", - "| a | b | c | year | day |", - "+---+----+----+------+-----+", - "| 0 | -2 | 10 | 2021 | 26 |", - "| 1 | -1 | 11 | 2021 | 26 |", - "| 2 | 0 | 12 | 2021 | 26 |", - "+---+----+----+------+-----+", - ]; - assert_batches_eq!(expected, &[projected_batch]); - - // project another batch that is larger than the previous one - let file_batch = build_table_i32( - ("a", &vec![5, 6, 7, 8, 9]), - ("b", &vec![-10, -9, -8, -7, -6]), - ("c", &vec![12, 13, 14, 15, 16]), - ); - let projected_batch = proj - .project( - // file_batch is ok here because we kept all the file cols in the 
projection - file_batch, - &[ - wrap_partition_value_in_dict(ScalarValue::from("2021")), - wrap_partition_value_in_dict(ScalarValue::from("10")), - wrap_partition_value_in_dict(ScalarValue::from("27")), - ], - ) - .expect("Projection of partition columns into record batch failed"); - let expected = [ - "+---+-----+----+------+-----+", - "| a | b | c | year | day |", - "+---+-----+----+------+-----+", - "| 5 | -10 | 12 | 2021 | 27 |", - "| 6 | -9 | 13 | 2021 | 27 |", - "| 7 | -8 | 14 | 2021 | 27 |", - "| 8 | -7 | 15 | 2021 | 27 |", - "| 9 | -6 | 16 | 2021 | 27 |", - "+---+-----+----+------+-----+", - ]; - assert_batches_eq!(expected, &[projected_batch]); - - // project another batch that is smaller than the previous one - let file_batch = build_table_i32( - ("a", &vec![0, 1, 3]), - ("b", &vec![2, 3, 4]), - ("c", &vec![4, 5, 6]), - ); - let projected_batch = proj - .project( - // file_batch is ok here because we kept all the file cols in the projection - file_batch, - &[ - wrap_partition_value_in_dict(ScalarValue::from("2021")), - wrap_partition_value_in_dict(ScalarValue::from("10")), - wrap_partition_value_in_dict(ScalarValue::from("28")), - ], - ) - .expect("Projection of partition columns into record batch failed"); - let expected = [ - "+---+---+---+------+-----+", - "| a | b | c | year | day |", - "+---+---+---+------+-----+", - "| 0 | 2 | 4 | 2021 | 28 |", - "| 1 | 3 | 5 | 2021 | 28 |", - "| 3 | 4 | 6 | 2021 | 28 |", - "+---+---+---+------+-----+", - ]; - assert_batches_eq!(expected, &[projected_batch]); - - // forgot to dictionary-wrap the scalar value - let file_batch = build_table_i32( - ("a", &vec![0, 1, 2]), - ("b", &vec![-2, -1, 0]), - ("c", &vec![10, 11, 12]), - ); - let projected_batch = proj - .project( - // file_batch is ok here because we kept all the file cols in the projection - file_batch, - &[ - ScalarValue::from("2021"), - ScalarValue::from("10"), - ScalarValue::from("26"), - ], - ) - .expect("Projection of partition columns into record batch failed"); - let expected = [ - "+---+----+----+------+-----+", - "| a | b | c | year | day |", - "+---+----+----+------+-----+", - "| 0 | -2 | 10 | 2021 | 26 |", - "| 1 | -1 | 11 | 2021 | 26 |", - "| 2 | 0 | 12 | 2021 | 26 |", - "+---+----+----+------+-----+", - ]; - assert_batches_eq!(expected, &[projected_batch]); - } - #[test] fn test_projected_file_schema_with_partition_col() { let schema = aggr_test_schema(); @@ -1808,13 +1288,13 @@ mod tests { ]; // Projected file schema for config with projection including partition column - let projection = config_for_projection( + let config = config_for_projection( schema.clone(), Some(vec![0, 3, 5, schema.fields().len()]), Statistics::new_unknown(&schema), to_partition_cols(partition_cols), - ) - .projected_file_schema(); + ); + let projection = projected_file_schema(&config); // Assert partition column filtered out in projected file schema let expected_columns = vec!["c1", "c4", "c6"]; @@ -1826,6 +1306,16 @@ mod tests { assert_eq!(expected_columns, actual_columns); } + /// Projects only file schema, ignoring partition columns + fn projected_file_schema(config: &FileScanConfig) -> SchemaRef { + let file_schema = config.file_source.table_schema().file_schema(); + if let Some(file_indices) = config.file_column_projection_indices() { + Arc::new(file_schema.project(&file_indices).unwrap()) + } else { + Arc::clone(file_schema) + } + } + #[test] fn test_projected_file_schema_without_projection() { let schema = aggr_test_schema(); @@ -1841,13 +1331,13 @@ mod tests { ]; // Projected file schema 
for config without projection - let projection = config_for_projection( + let config = config_for_projection( schema.clone(), None, Statistics::new_unknown(&schema), to_partition_cols(partition_cols), - ) - .projected_file_schema(); + ); + let projection = projected_file_schema(&config); // Assert projected file schema is equal to file schema assert_eq!(projection.fields(), schema.fields()); @@ -2174,6 +1664,7 @@ mod tests { Arc::new(MockSource::new(table_schema.clone())), ) .with_projection_indices(projection) + .unwrap() .with_statistics(statistics) .build() } @@ -2186,29 +1677,6 @@ mod tests { .collect::>() } - /// returns record batch with 3 columns of i32 in memory - pub fn build_table_i32( - a: (&str, &Vec), - b: (&str, &Vec), - c: (&str, &Vec), - ) -> RecordBatch { - let schema = Schema::new(vec![ - Field::new(a.0, DataType::Int32, false), - Field::new(b.0, DataType::Int32, false), - Field::new(c.0, DataType::Int32, false), - ]); - - RecordBatch::try_new( - Arc::new(schema), - vec![ - Arc::new(Int32Array::from(a.1.clone())), - Arc::new(Int32Array::from(b.1.clone())), - Arc::new(Int32Array::from(c.1.clone())), - ], - ) - .unwrap() - } - #[test] fn test_file_scan_config_builder() { let file_schema = aggr_test_schema(); @@ -2236,6 +1704,7 @@ mod tests { let config = builder .with_limit(Some(1000)) .with_projection_indices(Some(vec![0, 1])) + .unwrap() .with_statistics(Statistics::new_unknown(&file_schema)) .with_file_groups(vec![FileGroup::new(vec![PartitionedFile::new( "test.parquet".to_string(), @@ -2254,7 +1723,11 @@ mod tests { assert_eq!(*config.file_schema(), file_schema); assert_eq!(config.limit, Some(1000)); assert_eq!( - config.projection_exprs.as_ref().map(|p| p.column_indices()), + config + .file_source + .projection() + .as_ref() + .map(|p| p.column_indices()), Some(vec![0, 1]) ); assert_eq!(config.table_partition_cols().len(), 1); @@ -2294,15 +1767,17 @@ mod tests { Arc::clone(&file_source), ) .with_projection_indices(Some(vec![0, 1, 2])) + .unwrap() .build(); // Simulate projection being updated. Since the filter has already been pushed down, // the new projection won't include the filtered column. 
+ let exprs = ProjectionExprs::new(vec![ProjectionExpr::new( + col("c1", &file_schema).unwrap(), + "c1".to_string(), + )]); let data_source = config - .try_swapping_with_projection(&[ProjectionExpr::new( - col("c3", &file_schema).unwrap(), - "c3".to_string(), - )]) + .try_swapping_with_projection(&exprs) .unwrap() .unwrap(); @@ -2345,9 +1820,16 @@ mod tests { assert_eq!(config.object_store_url, object_store_url); assert_eq!(*config.file_schema(), file_schema); assert_eq!(config.limit, None); + // When no projection is specified, the file source should have an unprojected projection + // (i.e., all columns) + let expected_projection: Vec = (0..file_schema.fields().len()).collect(); assert_eq!( - config.projection_exprs.as_ref().map(|p| p.column_indices()), - None + config + .file_source + .projection() + .as_ref() + .map(|p| p.column_indices()), + Some(expected_projection) ); assert!(config.table_partition_cols().is_empty()); assert!(config.file_groups.is_empty()); @@ -2399,6 +1881,7 @@ mod tests { Arc::clone(&file_source), ) .with_projection_indices(Some(vec![0, 2])) + .unwrap() .with_limit(Some(10)) .with_file(file.clone()) .with_constraints(Constraints::default()) @@ -2417,7 +1900,8 @@ mod tests { assert_eq!(*new_config.file_schema(), schema); assert_eq!( new_config - .projection_exprs + .file_source + .projection() .as_ref() .map(|p| p.column_indices()), Some(vec![0, 2]) @@ -2641,7 +2125,8 @@ mod tests { ObjectStoreUrl::parse("test:///").unwrap(), Arc::new(MockSource::new(table_schema.clone())), ) - .with_projection_indices(Some(vec![0, 2])) // Only project columns 0 and 2 + .with_projection_indices(Some(vec![0, 2])) + .unwrap() // Only project columns 0 and 2 .with_file_groups(vec![file_group]) .build(); diff --git a/datafusion/datasource/src/file_stream.rs b/datafusion/datasource/src/file_stream.rs index 0568b4cc4e5f..7b24b71348bf 100644 --- a/datafusion/datasource/src/file_stream.rs +++ b/datafusion/datasource/src/file_stream.rs @@ -27,7 +27,7 @@ use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; -use crate::file_scan_config::{FileScanConfig, PartitionColumnProjector}; +use crate::file_scan_config::FileScanConfig; use crate::PartitionedFile; use arrow::datatypes::SchemaRef; use datafusion_common::error::Result; @@ -38,7 +38,6 @@ use datafusion_physical_plan::metrics::{ use arrow::record_batch::RecordBatch; use datafusion_common::instant::Instant; -use datafusion_common::ScalarValue; use futures::future::BoxFuture; use futures::stream::BoxStream; @@ -56,8 +55,6 @@ pub struct FileStream { /// A dynamic [`FileOpener`]. Calling `open()` returns a [`FileOpenFuture`], /// which can be resolved to a stream of `RecordBatch`. 
file_opener: Arc, - /// The partition column projector - pc_projector: PartitionColumnProjector, /// The stream state state: FileStreamState, /// File stream specific metrics @@ -76,15 +73,7 @@ impl FileStream { file_opener: Arc, metrics: &ExecutionPlanMetricsSet, ) -> Result { - let projected_schema = config.projected_schema(); - let pc_projector = PartitionColumnProjector::new( - Arc::clone(&projected_schema), - &config - .table_partition_cols() - .iter() - .map(|x| x.name().clone()) - .collect::>(), - ); + let projected_schema = config.projected_schema()?; let file_group = config.file_groups[partition].clone(); @@ -93,7 +82,6 @@ impl FileStream { projected_schema, remain: config.limit, file_opener, - pc_projector, state: FileStreamState::Idle, file_stream_metrics: FileStreamMetrics::new(metrics, partition), baseline_metrics: BaselineMetrics::new(metrics, partition), @@ -114,15 +102,9 @@ impl FileStream { /// /// Since file opening is mostly IO (and may involve a /// bunch of sequential IO), it can be parallelized with decoding. - fn start_next_file(&mut self) -> Option)>> { + fn start_next_file(&mut self) -> Option> { let part_file = self.file_iter.pop_front()?; - - let partition_values = part_file.partition_values.clone(); - Some( - self.file_opener - .open(part_file) - .map(|future| (future, partition_values)), - ) + Some(self.file_opener.open(part_file)) } fn poll_inner(&mut self, cx: &mut Context<'_>) -> Poll>> { @@ -132,12 +114,7 @@ impl FileStream { self.file_stream_metrics.time_opening.start(); match self.start_next_file().transpose() { - Ok(Some((future, partition_values))) => { - self.state = FileStreamState::Open { - future, - partition_values, - } - } + Ok(Some(future)) => self.state = FileStreamState::Open { future }, Ok(None) => return Poll::Ready(None), Err(e) => { self.state = FileStreamState::Error; @@ -145,13 +122,8 @@ impl FileStream { } } } - FileStreamState::Open { - future, - partition_values, - } => match ready!(future.poll_unpin(cx)) { + FileStreamState::Open { future } => match ready!(future.poll_unpin(cx)) { Ok(reader) => { - let partition_values = mem::take(partition_values); - // include time needed to start opening in `start_next_file` self.file_stream_metrics.time_opening.stop(); let next = self.start_next_file().transpose(); @@ -159,22 +131,14 @@ impl FileStream { self.file_stream_metrics.time_scanning_total.start(); match next { - Ok(Some((next_future, next_partition_values))) => { + Ok(Some(next_future)) => { self.state = FileStreamState::Scan { - partition_values, reader, - next: Some(( - NextOpen::Pending(next_future), - next_partition_values, - )), + next: Some(NextOpen::Pending(next_future)), }; } Ok(None) => { - self.state = FileStreamState::Scan { - reader, - partition_values, - next: None, - }; + self.state = FileStreamState::Scan { reader, next: None }; } Err(e) => { self.state = FileStreamState::Error; @@ -196,13 +160,9 @@ impl FileStream { } } }, - FileStreamState::Scan { - reader, - partition_values, - next, - } => { + FileStreamState::Scan { reader, next } => { // We need to poll the next `FileOpenFuture` here to drive it forward - if let Some((next_open_future, _)) = next { + if let Some(next_open_future) = next { if let NextOpen::Pending(f) = next_open_future { if let Poll::Ready(reader) = f.as_mut().poll(cx) { *next_open_future = NextOpen::Ready(reader); @@ -213,31 +173,22 @@ impl FileStream { Some(Ok(batch)) => { self.file_stream_metrics.time_scanning_until_data.stop(); self.file_stream_metrics.time_scanning_total.stop(); - let result = 
self - .pc_projector - .project(batch, partition_values) - .map(|batch| match &mut self.remain { - Some(remain) => { - if *remain > batch.num_rows() { - *remain -= batch.num_rows(); - batch - } else { - let batch = batch.slice(0, *remain); - self.state = FileStreamState::Limit; - *remain = 0; - batch - } + let batch = match &mut self.remain { + Some(remain) => { + if *remain > batch.num_rows() { + *remain -= batch.num_rows(); + batch + } else { + let batch = batch.slice(0, *remain); + self.state = FileStreamState::Limit; + *remain = 0; + batch } - None => batch, - }); - - if result.is_err() { - // If the partition value projection fails, this is not governed by - // the `OnError` behavior - self.state = FileStreamState::Error - } + } + None => batch, + }; self.file_stream_metrics.time_scanning_total.start(); - return Poll::Ready(Some(result)); + return Poll::Ready(Some(Ok(batch))); } Some(Err(err)) => { self.file_stream_metrics.file_scan_errors.add(1); @@ -247,22 +198,19 @@ impl FileStream { match self.on_error { // If `OnError::Skip` we skip the file as soon as we hit the first error OnError::Skip => match mem::take(next) { - Some((future, partition_values)) => { + Some(future) => { self.file_stream_metrics.time_opening.start(); match future { NextOpen::Pending(future) => { - self.state = FileStreamState::Open { - future, - partition_values, - } + self.state = + FileStreamState::Open { future } } NextOpen::Ready(reader) => { self.state = FileStreamState::Open { future: Box::pin(std::future::ready( reader, )), - partition_values, } } } @@ -280,22 +228,18 @@ impl FileStream { self.file_stream_metrics.time_scanning_total.stop(); match mem::take(next) { - Some((future, partition_values)) => { + Some(future) => { self.file_stream_metrics.time_opening.start(); match future { NextOpen::Pending(future) => { - self.state = FileStreamState::Open { - future, - partition_values, - } + self.state = FileStreamState::Open { future } } NextOpen::Ready(reader) => { self.state = FileStreamState::Open { future: Box::pin(std::future::ready( reader, )), - partition_values, } } } @@ -373,21 +317,16 @@ pub enum FileStreamState { Open { /// A [`FileOpenFuture`] returned by [`FileOpener::open`] future: FileOpenFuture, - /// The partition values for this file - partition_values: Vec, }, /// Scanning the [`BoxStream`] returned by the completion of a [`FileOpenFuture`] /// returned by [`FileOpener::open`] Scan { - /// Partitioning column values for the current batch_iter - partition_values: Vec, /// The reader instance reader: BoxStream<'static, Result>, - /// A [`FileOpenFuture`] for the next file to be processed, - /// and its corresponding partition column values, if any. + /// A [`FileOpenFuture`] for the next file to be processed. /// This allows the next file to be opened in parallel while the /// current file is read. 
- next: Option<(NextOpen, Vec)>, + next: Option, }, /// Encountered an error Error, diff --git a/datafusion/datasource/src/memory.rs b/datafusion/datasource/src/memory.rs index 036574ddd3c3..731a7f5d8669 100644 --- a/datafusion/datasource/src/memory.rs +++ b/datafusion/datasource/src/memory.rs @@ -35,11 +35,12 @@ use datafusion_common::{ }; use datafusion_execution::TaskContext; use datafusion_physical_expr::equivalence::project_orderings; +use datafusion_physical_expr::projection::ProjectionExprs; use datafusion_physical_expr::utils::collect_columns; use datafusion_physical_expr::{EquivalenceProperties, LexOrdering}; use datafusion_physical_plan::memory::MemoryStream; use datafusion_physical_plan::projection::{ - all_alias_free_columns, new_projections_for_columns, ProjectionExpr, + all_alias_free_columns, new_projections_for_columns, }; use datafusion_physical_plan::{ common, ColumnarValue, DisplayAs, DisplayFormatType, Partitioning, PhysicalExpr, @@ -230,15 +231,16 @@ impl DataSource for MemorySourceConfig { fn try_swapping_with_projection( &self, - projection: &[ProjectionExpr], + projection: &ProjectionExprs, ) -> Result>> { // If there is any non-column or alias-carrier expression, Projection should not be removed. // This process can be moved into MemoryExec, but it would be an overlap of their responsibility. - all_alias_free_columns(projection) + let exprs = projection.iter().cloned().collect_vec(); + all_alias_free_columns(exprs.as_slice()) .then(|| { let all_projections = (0..self.schema.fields().len()).collect(); let new_projections = new_projections_for_columns( - projection, + &exprs, self.projection().as_ref().unwrap_or(&all_projections), ); diff --git a/datafusion/datasource/src/mod.rs b/datafusion/datasource/src/mod.rs index 2c7d40d2fb3b..a749204ee1cc 100644 --- a/datafusion/datasource/src/mod.rs +++ b/datafusion/datasource/src/mod.rs @@ -41,6 +41,7 @@ pub mod file_scan_config; pub mod file_sink_config; pub mod file_stream; pub mod memory; +pub mod projection; pub mod schema_adapter; pub mod sink; pub mod source; diff --git a/datafusion/datasource/src/projection.rs b/datafusion/datasource/src/projection.rs new file mode 100644 index 000000000000..9e3139f4fbd3 --- /dev/null +++ b/datafusion/datasource/src/projection.rs @@ -0,0 +1,631 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
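As a rough usage sketch of how the helpers in this new module are meant to be combined by a source that can only push down plain column selection (illustrative only; `MySimpleSource`, its fields, and `make_inner_opener` are hypothetical, and generic parameters are spelled out):

impl MySimpleSource {
    fn create_file_opener(
        &self,
        object_store: Arc<dyn object_store::ObjectStore>,
        base_config: &FileScanConfig,
        partition: usize,
    ) -> Result<Arc<dyn FileOpener>> {
        // Split the pushed-down projection into plain file column indices plus a
        // remainder projection (the remainder also covers partition columns).
        let split = match &self.projection {
            Some(projection) => {
                SplitProjection::new(self.table_schema.file_schema(), projection)
            }
            None => SplitProjection::unprojected(&self.table_schema),
        };
        // The inner opener only needs to decode `split.file_indices` from each file...
        let inner = self.make_inner_opener(
            object_store,
            base_config,
            partition,
            &split.file_indices,
        )?;
        // ...and ProjectionOpener applies the remainder projection (expressions,
        // aliases, partition-value substitution) on top of the decoded batches.
        let opener: Arc<dyn FileOpener> =
            ProjectionOpener::try_new(split, inner, self.table_schema.file_schema())?;
        Ok(opener)
    }
}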
+ +use std::sync::Arc; + +use arrow::datatypes::{Schema, SchemaRef}; +use datafusion_common::{ + tree_node::{Transformed, TransformedResult, TreeNode}, + Result, ScalarValue, +}; +use datafusion_physical_expr::{ + expressions::{Column, Literal}, + projection::{ProjectionExpr, ProjectionExprs}, +}; +use futures::{FutureExt, StreamExt}; +use itertools::Itertools; + +use crate::{ + file_stream::{FileOpenFuture, FileOpener}, + PartitionedFile, TableSchema, +}; + +/// A file opener that handles applying a projection on top of an inner opener. +/// +/// This includes handling partition columns. +/// +/// Any projection pushed down will be split up into: +/// - Simple column indices / column selection +/// - A remainder projection that this opener applies on top of it +/// +/// This is meant to simplify projection pushdown for sources like CSV +/// that can only handle "simple" column selection. +pub struct ProjectionOpener { + inner: Arc, + projection: ProjectionExprs, + input_schema: SchemaRef, + partition_columns: Vec, +} + +impl ProjectionOpener { + pub fn try_new( + projection: SplitProjection, + inner: Arc, + file_schema: &Schema, + ) -> Result> { + Ok(Arc::new(ProjectionOpener { + inner, + projection: projection.remapped_projection, + input_schema: Arc::new(file_schema.project(&projection.file_indices)?), + partition_columns: projection.partition_columns, + })) + } +} + +impl FileOpener for ProjectionOpener { + fn open(&self, partitioned_file: PartitionedFile) -> Result { + let partition_values = partitioned_file.partition_values.clone(); + // Modify any references to partition columns in the projection expressions + // and substitute them with literal values from PartitionedFile.partition_values + let projection = if self.partition_columns.is_empty() { + self.projection.clone() + } else { + inject_partition_columns_into_projection( + &self.projection, + &self.partition_columns, + partition_values, + ) + }; + let projector = projection.make_projector(&self.input_schema)?; + + let inner = self.inner.open(partitioned_file)?; + + Ok(async move { + let stream = inner.await?; + let stream = stream.map(move |batch| { + let batch = batch?; + let batch = projector.project_batch(&batch)?; + Ok(batch) + }); + Ok(stream.boxed()) + } + .boxed()) + } +} + +#[derive(Debug, Clone, Copy)] +pub struct PartitionColumnIndex { + /// The index of this partition column in the remainder projection (>= num_file_columns) + pub in_remainder_projection: usize, + /// The index of this partition column in the partition_values array + pub in_partition_values: usize, +} + +fn inject_partition_columns_into_projection( + projection: &ProjectionExprs, + partition_columns: &[PartitionColumnIndex], + partition_values: Vec, +) -> ProjectionExprs { + // Pre-create all literals for partition columns to avoid cloning ScalarValues multiple times. 
+ let partition_literals: Vec> = partition_values + .into_iter() + .map(|value| Arc::new(Literal::new(value))) + .collect(); + + let projections = projection + .iter() + .map(|projection| { + let expr = Arc::clone(&projection.expr) + .transform(|expr| { + let original_expr = Arc::clone(&expr); + if let Some(column) = expr.as_any().downcast_ref::() { + // Check if this column index corresponds to a partition column + if let Some(pci) = partition_columns + .iter() + .find(|pci| pci.in_remainder_projection == column.index()) + { + let literal = + Arc::clone(&partition_literals[pci.in_partition_values]); + return Ok(Transformed::yes(literal)); + } + } + Ok(Transformed::no(original_expr)) + }) + .data() + .expect("infallible transform"); + ProjectionExpr::new(expr, projection.alias.clone()) + }) + .collect_vec(); + ProjectionExprs::new(projections) +} + +/// At a high level the goal of SplitProjection is to take a ProjectionExprs meant to be applied to the table schema +/// and split that into: +/// - The projection indices into the file schema (`file_indices`) +/// - The pre-computed partition column mappings (`partition_columns`), which record, for each partition column, both its index in the remapped remainder projection +/// and its index into the partition values array +/// - A remapped projection that can be applied after the file projection is applied +/// This remapped projection has the following properties: +/// - Column indices referring to file columns are remapped to [0..file_indices.len()) +/// - Column indices referring to partition columns are remapped to [file_indices.len()..) +/// +/// This allows the ProjectionOpener to easily identify which columns in the remapped projection +/// refer to partition columns and substitute them with literals from the partition values. +#[derive(Debug, Clone)] +pub struct SplitProjection { + /// The original projection this [`SplitProjection`] was derived from + pub source: ProjectionExprs, + /// Column indices to read from file (public for file sources) + pub file_indices: Vec, + /// Pre-computed partition column mappings (internal, used by ProjectionOpener) + pub(crate) partition_columns: Vec, + /// The remapped projection (internal, used by ProjectionOpener) + pub(crate) remapped_projection: ProjectionExprs, +} + +impl SplitProjection { + pub fn unprojected(table_schema: &TableSchema) -> Self { + let projection = ProjectionExprs::from_indices( + &(0..table_schema.table_schema().fields().len()).collect_vec(), + table_schema.table_schema(), + ); + Self::new(table_schema.file_schema(), &projection) + } + + /// Creates a new [`SplitProjection`] by splitting a projection into + /// simple file column indices and a remainder projection that is applied after reading the file. + /// + /// In other words: we get a `Vec` projection that is meant to be applied on top of `file_schema` + /// and a remainder projection that is applied to the result of that first projection. + /// + /// Here `file_schema` is expected to be the *logical* schema of the file, that is the + /// table schema minus any partition columns. + /// Partition columns are always expected to be at the end of the table schema. + /// Note that `file_schema` is *not* the physical schema of the file.
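A small worked example of the split performed by `new` below (illustrative; the column names are hypothetical):

// logical file schema: [a, b, c]; table schema: [a, b, c, p] where `p` is a partition column
// projection over the table schema: [c@2, p@3]
//
//   file_indices        == [2]         (only column `c` is read from the file)
//   remapped_projection == [c@0, p@1]  (`c` is remapped to 0, `p` to file_indices.len() + 0 == 1)
//   partition_columns   == [PartitionColumnIndex { in_remainder_projection: 1, in_partition_values: 0 }]
//
// ProjectionOpener later substitutes `p@1` with a Literal built from each file's partition values.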
+    pub fn new(logical_file_schema: &Schema, projection: &ProjectionExprs) -> Self {
+        let num_file_schema_columns = logical_file_schema.fields().len();
+
+        // Collect all unique columns and classify as file or partition
+        let mut file_columns = Vec::new();
+        let mut partition_columns = Vec::new();
+        let mut all_columns = std::collections::HashMap::new();
+
+        // Extract all unique column references (index -> name)
+        for proj_expr in projection {
+            proj_expr
+                .expr
+                .apply(|expr| {
+                    if let Some(column) = expr.as_any().downcast_ref::<Column>() {
+                        all_columns
+                            .entry(column.index())
+                            .or_insert_with(|| column.name().to_string());
+                    }
+                    Ok(datafusion_common::tree_node::TreeNodeRecursion::Continue)
+                })
+                .expect("infallible apply");
+        }
+
+        // Sort by index and classify into file vs partition columns
+        let mut sorted_columns: Vec<_> = all_columns
+            .into_iter()
+            .map(|(idx, name)| (name, idx))
+            .collect();
+        sorted_columns.sort_by_key(|(_, idx)| *idx);
+
+        // Separate file and partition columns, assigning final indices
+        // Pre-create all remapped columns to avoid duplicate Arc'd expressions
+        let mut column_mapping = std::collections::HashMap::new();
+        let mut file_idx = 0;
+        let mut partition_idx = 0;
+
+        for (name, original_index) in sorted_columns {
+            let new_index = if original_index < num_file_schema_columns {
+                // File column: gets index [0..num_file_columns)
+                file_columns.push(original_index);
+                let idx = file_idx;
+                file_idx += 1;
+                idx
+            } else {
+                // Partition column: gets index [num_file_columns..)
+                partition_columns.push(original_index);
+                let idx = file_idx + partition_idx;
+                partition_idx += 1;
+                idx
+            };
+
+            // Pre-create the remapped column so all references can share the same Arc
+            let new_column: Arc<dyn PhysicalExpr> =
+                Arc::new(Column::new(&name, new_index));
+            column_mapping.insert(original_index, new_column);
+        }
+
+        // Single tree transformation: remap all column references using pre-created columns
+        let remapped_projection = projection
+            .iter()
+            .map(|proj_expr| {
+                let expr = Arc::clone(&proj_expr.expr)
+                    .transform(|expr| {
+                        let original_expr = Arc::clone(&expr);
+                        if let Some(column) = expr.as_any().downcast_ref::<Column>() {
+                            if let Some(new_column) = column_mapping.get(&column.index())
+                            {
+                                return Ok(Transformed::yes(Arc::clone(new_column)));
+                            }
+                        }
+                        Ok(Transformed::no(original_expr))
+                    })
+                    .data()
+                    .expect("infallible transform");
+                ProjectionExpr::new(expr, proj_expr.alias.clone())
+            })
+            .collect_vec();
+
+        // Pre-compute partition column mappings for ProjectionOpener
+        let num_file_columns = file_columns.len();
+        let partition_column_mappings = partition_columns
+            .iter()
+            .enumerate()
+            .map(|(partition_idx, &table_index)| PartitionColumnIndex {
+                in_remainder_projection: num_file_columns + partition_idx,
+                in_partition_values: table_index - num_file_schema_columns,
+            })
+            .collect_vec();
+
+        Self {
+            source: projection.clone(),
+            file_indices: file_columns,
+            partition_columns: partition_column_mappings,
+            remapped_projection: ProjectionExprs::from(remapped_projection),
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::sync::Arc;
+
+    use arrow::array::AsArray;
+    use arrow::datatypes::{DataType, SchemaRef};
+    use datafusion_common::{record_batch, DFSchema, ScalarValue};
+    use datafusion_expr::{col, execution_props::ExecutionProps, Expr};
+    use datafusion_physical_expr::{create_physical_exprs, projection::ProjectionExpr};
+    use itertools::Itertools;
+
+    use super::*;
+
+    fn create_projection_exprs<'a>(
+        exprs: impl IntoIterator<Item = &'a Expr>,
+        schema: &SchemaRef,
+    ) -> ProjectionExprs
+    {
+        let df_schema = DFSchema::try_from(Arc::clone(schema)).unwrap();
+        let physical_exprs =
+            create_physical_exprs(exprs, &df_schema, &ExecutionProps::default()).unwrap();
+        let projection_exprs = physical_exprs
+            .into_iter()
+            .enumerate()
+            .map(|(i, e)| ProjectionExpr::new(Arc::clone(&e), format!("col{i}")))
+            .collect_vec();
+        ProjectionExprs::from(projection_exprs)
+    }
+
+    #[test]
+    fn test_split_projection_with_partition_columns() {
+        use arrow::array::AsArray;
+        use arrow::datatypes::Field;
+        // Simulate the avro_exec_with_partition test scenario:
+        // file_schema has 3 fields
+        let file_schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("bool_col", DataType::Boolean, false),
+            Field::new("tinyint_col", DataType::Int8, false),
+        ]));
+
+        // table_schema has 4 fields (3 file + 1 partition)
+        let table_schema = Arc::new(Schema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("bool_col", DataType::Boolean, false),
+            Field::new("tinyint_col", DataType::Int8, false),
+            Field::new("date", DataType::Utf8, false), // partition column at index 3
+        ]));
+
+        // projection indices: [0, 1, 3, 2]
+        // This should select: id (0), bool_col (1), date (3-partition), tinyint_col (2)
+        let projection_indices = vec![0, 1, 3, 2];
+
+        // Create projection expressions from indices using the table schema
+        let projection =
+            ProjectionExprs::from_indices(&projection_indices, &table_schema);
+
+        // Call SplitProjection to separate file and partition columns
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        // The file_indices should be [0, 1, 2] (all file columns needed)
+        assert_eq!(split.file_indices, vec![0, 1, 2]);
+
+        // Should have 1 partition column at in_partition_values index 0
+        assert_eq!(split.partition_columns.len(), 1);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+
+        // Now create a batch with only the file columns
+        let file_batch = record_batch!(
+            ("id", Int32, vec![4]),
+            ("bool_col", Boolean, vec![true]),
+            ("tinyint_col", Int8, vec![0])
+        )
+        .unwrap();
+
+        // After the fix, the remainder projection should have remapped indices:
+        // - File columns: [0, 1, 2] (unchanged since they're already in order)
+        // - Partition column: [3] (stays at index 3, which is >= num_file_columns)
+        // So the remainder expects input columns [0, 1, 2] and references column [3] for partition
+
+        // Verify that we can inject partition columns and apply the projection
+        let partition_values = vec![ScalarValue::from("2021-10-26")];
+
+        // Create partition column mapping
+        let partition_columns = vec![PartitionColumnIndex {
+            in_remainder_projection: 3, // partition column is at index 3 in remainder
+            in_partition_values: 0,     // first partition value
+        }];
+
+        // Inject partition columns (replaces Column(3) with Literal)
+        let injected_projection = inject_partition_columns_into_projection(
+            &split.remapped_projection,
+            &partition_columns,
+            partition_values,
+        );
+
+        // Now the projection should work on the file batch
+        let projector = injected_projection
+            .make_projector(&file_batch.schema())
+            .unwrap();
+        let result = projector.project_batch(&file_batch).unwrap();
+
+        // Verify the output has the correct column order: id, bool_col, date, tinyint_col
+        assert_eq!(result.num_columns(), 4);
+        assert_eq!(
+            result
+                .column(0)
+                .as_primitive::<arrow::datatypes::Int32Type>()
+                .value(0),
+            4
+        );
+        assert!(result.column(1).as_boolean().value(0));
+        assert_eq!(result.column(2).as_string::<i32>().value(0), "2021-10-26");
+        assert_eq!(
+            result
+                .column(3)
+                .as_primitive::<arrow::datatypes::Int8Type>()
+                .value(0),
+            0
+        );
+    }
+
+    // ========================================================================
+    // Comprehensive Test Suite for SplitProjection
+    // ========================================================================
+
+    // Helper to create test schemas with file and partition columns
+    fn create_test_schemas(
+        file_cols: usize,
+        partition_cols: usize,
+    ) -> (SchemaRef, SchemaRef) {
+        use arrow::datatypes::Field;
+
+        let file_fields: Vec<_> = (0..file_cols)
+            .map(|i| Field::new(format!("col_{i}"), DataType::Int32, false))
+            .collect();
+
+        let mut table_fields = file_fields.clone();
+        table_fields.extend(
+            (0..partition_cols)
+                .map(|i| Field::new(format!("part_{i}"), DataType::Utf8, false)),
+        );
+
+        (
+            Arc::new(Schema::new(file_fields)),
+            Arc::new(Schema::new(table_fields)),
+        )
+    }
+
+    // ========================================================================
+    // Partition Column Handling Tests
+    // ========================================================================
+
+    #[test]
+    fn test_split_projection_only_file_columns() {
+        let (file_schema, table_schema) = create_test_schemas(3, 2);
+        // Select only file columns [0, 1, 2]
+        let projection = ProjectionExprs::from_indices(&[0, 1, 2], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, vec![0, 1, 2]);
+        assert_eq!(split.partition_columns.len(), 0);
+    }
+
+    #[test]
+    fn test_split_projection_only_partition_columns() {
+        let (file_schema, table_schema) = create_test_schemas(3, 2);
+        // Select only partition columns [3, 4]
+        let projection = ProjectionExprs::from_indices(&[3, 4], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, Vec::<usize>::new());
+        assert_eq!(split.partition_columns.len(), 2);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+        assert_eq!(split.partition_columns[1].in_partition_values, 1);
+    }
+
+    #[test]
+    fn test_split_projection_multiple_partition_columns() {
+        let (file_schema, table_schema) = create_test_schemas(2, 3);
+        // File cols: 0, 1; Partition cols: 2, 3, 4
+        // Select: [0, 2, 4, 1, 3] (mixed file and partition)
+        let projection = ProjectionExprs::from_indices(&[0, 2, 4, 1, 3], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, vec![0, 1]);
+        assert_eq!(split.partition_columns.len(), 3);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+        assert_eq!(split.partition_columns[1].in_partition_values, 1);
+        assert_eq!(split.partition_columns[2].in_partition_values, 2);
+
+        // Verify remapped projection has correct indices
+        // File columns should be at [0, 1], partition columns at [2, 3, 4]
+        assert_eq!(split.remapped_projection.iter().count(), 5);
+    }
+
+    #[test]
+    fn test_split_projection_partition_columns_reverse_order() {
+        let (file_schema, table_schema) = create_test_schemas(2, 2);
+        // File cols: 0, 1; Partition cols: 2, 3
+        // Select: [3, 2] (partitions in reverse)
+        let projection = ProjectionExprs::from_indices(&[3, 2], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, Vec::<usize>::new());
+        assert_eq!(split.partition_columns.len(), 2);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+        assert_eq!(split.partition_columns[1].in_partition_values, 1);
+    }
+
+    #[test]
+    fn test_split_projection_interleaved_file_and_partition() {
+        let (file_schema, table_schema) = create_test_schemas(3, 3);
+        // File cols: 0, 1, 2; Partition cols: 3, 4, 5
+        // Select: [0, 3, 1, 4, 2, 5] (alternating)
+        let projection =
+            ProjectionExprs::from_indices(&[0, 3, 1, 4, 2, 5], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, vec![0, 1, 2]);
+        assert_eq!(split.partition_columns.len(), 3);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+        assert_eq!(split.partition_columns[1].in_partition_values, 1);
+        assert_eq!(split.partition_columns[2].in_partition_values, 2);
+    }
+
+    #[test]
+    fn test_split_projection_expression_with_file_and_partition_columns() {
+        use arrow::datatypes::Field;
+
+        // Create schemas: 2 file columns, 1 partition column
+        let file_schema = Arc::new(Schema::new(vec![
+            Field::new("file_a", DataType::Int32, false),
+            Field::new("file_b", DataType::Int32, false),
+        ]));
+        let table_schema = Arc::new(Schema::new(vec![
+            Field::new("file_a", DataType::Int32, false),
+            Field::new("file_b", DataType::Int32, false),
+            Field::new("part_c", DataType::Int32, false),
+        ]));
+
+        // Create expression: file_a + part_c
+        let exprs = [col("file_a") + col("part_c")];
+        let projection = create_projection_exprs(exprs.iter(), &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        // Should extract both columns
+        assert_eq!(split.file_indices, vec![0]);
+        assert_eq!(split.partition_columns.len(), 1);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+    }
+
+    // ========================================================================
+    // Category 4: Boundary Conditions
+    // ========================================================================
+
+    #[test]
+    fn test_split_projection_boundary_last_file_column() {
+        let (file_schema, table_schema) = create_test_schemas(3, 2);
+        // Last file column is index 2
+        let projection = ProjectionExprs::from_indices(&[2], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, vec![2]);
+        assert_eq!(split.partition_columns.len(), 0);
+    }
+
+    #[test]
+    fn test_split_projection_boundary_first_partition_column() {
+        let (file_schema, table_schema) = create_test_schemas(3, 2);
+        // First partition column is index 3
+        let projection = ProjectionExprs::from_indices(&[3], &table_schema);
+
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        assert_eq!(split.file_indices, Vec::<usize>::new());
+        assert_eq!(split.partition_columns.len(), 1);
+        assert_eq!(split.partition_columns[0].in_partition_values, 0);
+    }
+
+    // ========================================================================
+    // Category 6: Integration Tests
+    // ========================================================================
+
+    #[test]
+    fn test_inject_partition_columns_multiple_partitions() {
+        let data =
+            record_batch!(("col_0", Int32, vec![1]), ("col_1", Int32, vec![2])).unwrap();
+
+        // Create projection that references file columns and partition columns
+        let (file_schema, table_schema) = create_test_schemas(2, 2);
+        // Projection: [0, 2, 1, 3] = [file_0, part_0, file_1, part_1]
+        let projection = ProjectionExprs::from_indices(&[0, 2, 1, 3], &table_schema);
+        let split = SplitProjection::new(&file_schema, &projection);
+
+        // Create partition column mappings
+        let partition_columns = vec![
+            PartitionColumnIndex {
+                in_remainder_projection: 2, // First partition column at index 2
+                in_partition_values: 0,
+            },
+            PartitionColumnIndex {
+                in_remainder_projection: 3, // Second partition column at index 3
+                in_partition_values: 1,
+            },
+        ];
+
+        let partition_values =
+            vec![ScalarValue::from("part_a"), ScalarValue::from("part_b")];
+
+        let injected = inject_partition_columns_into_projection(
+            &split.remapped_projection,
+            &partition_columns,
+            partition_values,
+        );
+
+        // Apply projection
+        let projector = injected.make_projector(&data.schema()).unwrap();
+        let result = projector.project_batch(&data).unwrap();
+
+        assert_eq!(result.num_columns(), 4);
+        assert_eq!(
+            result
+                .column(0)
+                .as_primitive::<arrow::datatypes::Int32Type>()
+                .value(0),
+            1
+        );
+        assert_eq!(result.column(1).as_string::<i32>().value(0), "part_a");
+        assert_eq!(
+            result
+                .column(2)
+                .as_primitive::<arrow::datatypes::Int32Type>()
+                .value(0),
+            2
+        );
+        assert_eq!(result.column(3).as_string::<i32>().value(0), "part_b");
+    }
+}
diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs
index de79512a4101..781083c0f14d 100644
--- a/datafusion/datasource/src/source.rs
+++ b/datafusion/datasource/src/source.rs
@@ -22,12 +22,13 @@ use std::fmt;
 use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 
+use datafusion_physical_expr::projection::ProjectionExprs;
 use datafusion_physical_plan::execution_plan::{
     Boundedness, EmissionType, SchedulingType,
 };
 use datafusion_physical_plan::metrics::SplitMetrics;
 use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet};
-use datafusion_physical_plan::projection::{ProjectionExec, ProjectionExpr};
+use datafusion_physical_plan::projection::ProjectionExec;
 use datafusion_physical_plan::stream::BatchSplitStream;
 use datafusion_physical_plan::{
     DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties,
@@ -174,7 +175,7 @@ pub trait DataSource: Send + Sync + Debug {
     }
     fn try_swapping_with_projection(
         &self,
-        _projection: &[ProjectionExpr],
+        _projection: &ProjectionExprs,
     ) -> Result<Option<Arc<dyn DataSource>>>;
     /// Try to push down filters into this DataSource.
     /// See [`ExecutionPlan::handle_child_pushdown_result`] for more details.
@@ -319,7 +320,7 @@ impl ExecutionPlan for DataSourceExec {
     ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
         match self
             .data_source
-            .try_swapping_with_projection(projection.expr())?
+            .try_swapping_with_projection(projection.projection_expr())?
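// --- Illustrative sketch (editorial, not part of the patch) ---
// A minimal, std-only model of the index arithmetic that `SplitProjection::new`
// performs above: referenced table columns are split into file columns (kept in
// sorted order and re-indexed from 0) and partition columns (appended after the
// file columns and resolved later from each file's partition values). The name
// `split_column_indices` and its tuple return type are illustrative assumptions,
// not DataFusion APIs.
use std::collections::HashMap;

fn split_column_indices(
    num_file_schema_columns: usize,
    mut referenced: Vec<usize>, // table-schema column indices used by the projection
) -> (Vec<usize>, Vec<(usize, usize)>, HashMap<usize, usize>) {
    referenced.sort_unstable();
    referenced.dedup();

    let mut file_indices = Vec::new(); // columns to actually read from the file
    let mut partition_columns = Vec::new(); // table indices of referenced partition columns
    let mut remap = HashMap::new(); // old table index -> index in the remapped input

    for &table_index in &referenced {
        if table_index < num_file_schema_columns {
            // File column: takes the next slot in [0, num_file_columns)
            remap.insert(table_index, file_indices.len());
            file_indices.push(table_index);
        } else {
            partition_columns.push(table_index);
        }
    }

    // Partition columns are appended after all file columns in the remapped input.
    let num_file_columns = file_indices.len();
    let partition_mappings: Vec<(usize, usize)> = partition_columns
        .iter()
        .enumerate()
        .map(|(i, &table_index)| {
            remap.insert(table_index, num_file_columns + i);
            // (index in the remainder projection, index into the partition values)
            (num_file_columns + i, table_index - num_file_schema_columns)
        })
        .collect();

    (file_indices, partition_mappings, remap)
}

fn main() {
    // Mirrors `test_split_projection_with_partition_columns` above: 3 file columns,
    // projection over table columns [0, 1, 3, 2] where index 3 is the partition column.
    let (files, parts, remap) = split_column_indices(3, vec![0, 1, 3, 2]);
    assert_eq!(files, vec![0, 1, 2]);
    assert_eq!(parts, vec![(3, 0)]);
    assert_eq!(remap[&3], 3);
}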
         {
             Some(new_data_source) => {
                 Ok(Some(Arc::new(DataSourceExec::new(new_data_source))))
@@ -342,7 +343,7 @@ impl ExecutionPlan for DataSourceExec {
             .collect_vec();
         let res = self
             .data_source
-            .try_pushdown_filters(parent_filters.clone(), config)?;
+            .try_pushdown_filters(parent_filters, config)?;
         match res.updated_node {
             Some(data_source) => {
                 let mut new_node = self.clone();
diff --git a/datafusion/datasource/src/test_util.rs b/datafusion/datasource/src/test_util.rs
index 5d5b277dcf04..6806cd73996f 100644
--- a/datafusion/datasource/src/test_util.rs
+++ b/datafusion/datasource/src/test_util.rs
@@ -35,29 +35,32 @@ pub(crate) struct MockSource {
     schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
     filter: Option<Arc<dyn PhysicalExpr>>,
     table_schema: crate::table_schema::TableSchema,
+    projection: crate::projection::SplitProjection,
 }
 
 impl Default for MockSource {
     fn default() -> Self {
+        let table_schema =
+            crate::table_schema::TableSchema::new(Arc::new(Schema::empty()), vec![]);
         Self {
             metrics: ExecutionPlanMetricsSet::new(),
             schema_adapter_factory: None,
             filter: None,
-            table_schema: crate::table_schema::TableSchema::new(
-                Arc::new(Schema::empty()),
-                vec![],
-            ),
+            projection: crate::projection::SplitProjection::unprojected(&table_schema),
+            table_schema,
         }
     }
 }
 
 impl MockSource {
     pub fn new(table_schema: impl Into<crate::table_schema::TableSchema>) -> Self {
+        let table_schema = table_schema.into();
         Self {
             metrics: ExecutionPlanMetricsSet::new(),
             schema_adapter_factory: None,
             filter: None,
-            table_schema: table_schema.into(),
+            projection: crate::projection::SplitProjection::unprojected(&table_schema),
+            table_schema,
         }
     }
 
@@ -73,7 +76,7 @@ impl FileSource for MockSource {
         _object_store: Arc<dyn ObjectStore>,
         _base_config: &FileScanConfig,
         _partition: usize,
-    ) -> Arc<dyn FileOpener> {
+    ) -> Result<Arc<dyn FileOpener>> {
         unimplemented!()
     }
 
@@ -89,10 +92,6 @@ impl FileSource for MockSource {
         Arc::new(Self { ..self.clone() })
     }
 
-    fn with_projection(&self, _config: &FileScanConfig) -> Arc<dyn FileSource> {
-        Arc::new(Self { ..self.clone() })
-    }
-
     fn metrics(&self) -> &ExecutionPlanMetricsSet {
         &self.metrics
     }
@@ -118,6 +117,26 @@ impl FileSource for MockSource {
     fn table_schema(&self) -> &crate::table_schema::TableSchema {
         &self.table_schema
     }
+
+    fn try_pushdown_projection(
+        &self,
+        projection: &datafusion_physical_plan::projection::ProjectionExprs,
+    ) -> Result<Option<Arc<dyn FileSource>>> {
+        let mut source = self.clone();
+        let new_projection = self.projection.source.try_merge(projection)?;
+        let split_projection = crate::projection::SplitProjection::new(
+            self.table_schema.file_schema(),
+            &new_projection,
+        );
+        source.projection = split_projection;
+        Ok(Some(Arc::new(source)))
+    }
+
+    fn projection(
+        &self,
+    ) -> Option<&datafusion_physical_plan::projection::ProjectionExprs> {
+        Some(&self.projection.source)
+    }
 }
 
 /// Create a column expression
diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml
index 9374297ecba3..11e0cc5c604f 100644
--- a/datafusion/physical-expr/Cargo.toml
+++ b/datafusion/physical-expr/Cargo.toml
@@ -48,7 +48,6 @@ datafusion-expr = { workspace = true }
 datafusion-expr-common = { workspace = true }
 datafusion-functions-aggregate-common = { workspace = true }
 datafusion-physical-expr-common = { workspace = true }
-half = { workspace = true }
 hashbrown = { workspace = true }
 indexmap = { workspace = true }
 itertools = { workspace = true, features = ["use_std"] }
diff --git a/datafusion/physical-plan/src/projection.rs b/datafusion/physical-plan/src/projection.rs
index 0b8c4ee5fbec..8e8baef3e71b 100644
--- a/datafusion/physical-plan/src/projection.rs
+++ 
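// --- Illustrative sketch (editorial, not part of the patch) ---
// `MockSource::try_pushdown_projection` above merges a newly pushed projection into
// the one already stored on the source via `ProjectionExprs::try_merge`, then rebuilds
// the `SplitProjection`. For the special case where both projections are plain column
// selections, merging amounts to index composition; the general case composes
// expressions. `merge_index_projections` is an illustrative name, not a DataFusion API.
fn merge_index_projections(existing: &[usize], pushed: &[usize]) -> Vec<usize> {
    // `pushed[i]` selects column i of the output of `existing`, so the merged
    // projection selects `existing[pushed[i]]` from the original input.
    pushed.iter().map(|&i| existing[i]).collect()
}

fn main() {
    // An existing scan projection keeps columns [2, 0, 1]; a later pass pushes a
    // projection selecting its outputs [1, 2], i.e. original columns [0, 1].
    assert_eq!(merge_index_projections(&[2, 0, 1], &[1, 2]), vec![0, 1]);
}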
b/datafusion/physical-plan/src/projection.rs @@ -159,6 +159,11 @@ impl ProjectionExec { self.projector.projection().as_ref() } + /// The projection expressions as a [`ProjectionExprs`]. + pub fn projection_expr(&self) -> &ProjectionExprs { + self.projector.projection() + } + /// The input plan pub fn input(&self) -> &Arc { &self.input diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index f9400d14a59c..789176862bf0 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -1009,6 +1009,15 @@ message PhysicalSortExprNodeCollection { repeated PhysicalSortExprNode physical_sort_expr_nodes = 1; } +message ProjectionExpr { + string alias = 1; + PhysicalExprNode expr = 2; +} + +message ProjectionExprs { + repeated ProjectionExpr projections = 1; +} + message FileScanExecConf { repeated FileGroup file_groups = 1; datafusion_common.Schema schema = 2; @@ -1024,6 +1033,8 @@ message FileScanExecConf { datafusion_common.Constraints constraints = 11; optional uint64 batch_size = 12; + + optional ProjectionExprs projection_exprs = 13; } message ParquetScanExecNode { diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 4cf834d0601e..230bfa495a4b 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -5949,6 +5949,9 @@ impl serde::Serialize for FileScanExecConf { if self.batch_size.is_some() { len += 1; } + if self.projection_exprs.is_some() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion.FileScanExecConf", len)?; if !self.file_groups.is_empty() { struct_ser.serialize_field("fileGroups", &self.file_groups)?; @@ -5982,6 +5985,9 @@ impl serde::Serialize for FileScanExecConf { #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("batchSize", ToString::to_string(&v).as_str())?; } + if let Some(v) = self.projection_exprs.as_ref() { + struct_ser.serialize_field("projectionExprs", v)?; + } struct_ser.end() } } @@ -6007,6 +6013,8 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf { "constraints", "batch_size", "batchSize", + "projection_exprs", + "projectionExprs", ]; #[allow(clippy::enum_variant_names)] @@ -6021,6 +6029,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf { OutputOrdering, Constraints, BatchSize, + ProjectionExprs, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -6052,6 +6061,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf { "outputOrdering" | "output_ordering" => Ok(GeneratedField::OutputOrdering), "constraints" => Ok(GeneratedField::Constraints), "batchSize" | "batch_size" => Ok(GeneratedField::BatchSize), + "projectionExprs" | "projection_exprs" => Ok(GeneratedField::ProjectionExprs), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -6081,6 +6091,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf { let mut output_ordering__ = None; let mut constraints__ = None; let mut batch_size__ = None; + let mut projection_exprs__ = None; while let Some(k) = map_.next_key()? 
{ match k { GeneratedField::FileGroups => { @@ -6148,6 +6159,12 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf { map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| x.0) ; } + GeneratedField::ProjectionExprs => { + if projection_exprs__.is_some() { + return Err(serde::de::Error::duplicate_field("projectionExprs")); + } + projection_exprs__ = map_.next_value()?; + } } } Ok(FileScanExecConf { @@ -6161,6 +6178,7 @@ impl<'de> serde::Deserialize<'de> for FileScanExecConf { output_ordering: output_ordering__.unwrap_or_default(), constraints: constraints__, batch_size: batch_size__, + projection_exprs: projection_exprs__, }) } } @@ -19268,6 +19286,205 @@ impl<'de> serde::Deserialize<'de> for ProjectionExecNode { deserializer.deserialize_struct("datafusion.ProjectionExecNode", FIELDS, GeneratedVisitor) } } +impl serde::Serialize for ProjectionExpr { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if !self.alias.is_empty() { + len += 1; + } + if self.expr.is_some() { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.ProjectionExpr", len)?; + if !self.alias.is_empty() { + struct_ser.serialize_field("alias", &self.alias)?; + } + if let Some(v) = self.expr.as_ref() { + struct_ser.serialize_field("expr", v)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for ProjectionExpr { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "alias", + "expr", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Alias, + Expr, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "alias" => Ok(GeneratedField::Alias), + "expr" => Ok(GeneratedField::Expr), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = ProjectionExpr; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.ProjectionExpr") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut alias__ = None; + let mut expr__ = None; + while let Some(k) = map_.next_key()? 
{ + match k { + GeneratedField::Alias => { + if alias__.is_some() { + return Err(serde::de::Error::duplicate_field("alias")); + } + alias__ = Some(map_.next_value()?); + } + GeneratedField::Expr => { + if expr__.is_some() { + return Err(serde::de::Error::duplicate_field("expr")); + } + expr__ = map_.next_value()?; + } + } + } + Ok(ProjectionExpr { + alias: alias__.unwrap_or_default(), + expr: expr__, + }) + } + } + deserializer.deserialize_struct("datafusion.ProjectionExpr", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for ProjectionExprs { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if !self.projections.is_empty() { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.ProjectionExprs", len)?; + if !self.projections.is_empty() { + struct_ser.serialize_field("projections", &self.projections)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for ProjectionExprs { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "projections", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Projections, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "projections" => Ok(GeneratedField::Projections), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = ProjectionExprs; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.ProjectionExprs") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut projections__ = None; + while let Some(k) = map_.next_key()? 
{ + match k { + GeneratedField::Projections => { + if projections__.is_some() { + return Err(serde::de::Error::duplicate_field("projections")); + } + projections__ = Some(map_.next_value()?); + } + } + } + Ok(ProjectionExprs { + projections: projections__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion.ProjectionExprs", FIELDS, GeneratedVisitor) + } +} impl serde::Serialize for ProjectionNode { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 12b417627411..b2d0bc7751f9 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -1542,6 +1542,18 @@ pub struct PhysicalSortExprNodeCollection { pub physical_sort_expr_nodes: ::prost::alloc::vec::Vec, } #[derive(Clone, PartialEq, ::prost::Message)] +pub struct ProjectionExpr { + #[prost(string, tag = "1")] + pub alias: ::prost::alloc::string::String, + #[prost(message, optional, tag = "2")] + pub expr: ::core::option::Option, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct ProjectionExprs { + #[prost(message, repeated, tag = "1")] + pub projections: ::prost::alloc::vec::Vec, +} +#[derive(Clone, PartialEq, ::prost::Message)] pub struct FileScanExecConf { #[prost(message, repeated, tag = "1")] pub file_groups: ::prost::alloc::vec::Vec, @@ -1563,6 +1575,8 @@ pub struct FileScanExecConf { pub constraints: ::core::option::Option, #[prost(uint64, optional, tag = "12")] pub batch_size: ::core::option::Option, + #[prost(message, optional, tag = "13")] + pub projection_exprs: ::core::option::Option, } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetScanExecNode { diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index f1a9abe6ea7b..d40d835f793c 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -42,6 +42,7 @@ use datafusion_datasource_parquet::file_format::ParquetSink; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_execution::{FunctionRegistry, TaskContext}; use datafusion_expr::WindowFunctionDefinition; +use datafusion_physical_expr::projection::{ProjectionExpr, ProjectionExprs}; use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr, ScalarFunctionExpr}; use datafusion_physical_plan::expressions::{ in_list, BinaryExpr, CaseExpr, CastExpr, Column, IsNotNullExpr, IsNullExpr, LikeExpr, @@ -519,11 +520,6 @@ pub fn parse_protobuf_file_scan_config( file_source: Arc, ) -> Result { let schema: Arc = parse_protobuf_file_scan_schema(proto)?; - let projection = proto - .projection - .iter() - .map(|i| *i as usize) - .collect::>(); let constraints = convert_required!(proto.constraints)?; let statistics = convert_required!(proto.statistics)?; @@ -550,11 +546,38 @@ pub fn parse_protobuf_file_scan_config( output_ordering.extend(LexOrdering::new(sort_exprs)); } + // Parse projection expressions if present and apply to file source + let file_source = if let Some(proto_projection_exprs) = &proto.projection_exprs { + let projection_exprs: Vec = proto_projection_exprs + .projections + .iter() + .map(|proto_expr| { + let expr = parse_physical_expr( + proto_expr.expr.as_ref().ok_or_else(|| { + internal_datafusion_err!("ProjectionExpr missing expr field") + })?, + ctx, + &schema, + codec, + )?; + Ok(ProjectionExpr::new(expr, proto_expr.alias.clone())) + }) + .collect::>>()?; + + let 
projection_exprs = ProjectionExprs::new(projection_exprs); + + // Apply projection to file source + file_source + .try_pushdown_projection(&projection_exprs)? + .unwrap_or(file_source) + } else { + file_source + }; + let config = FileScanConfigBuilder::new(object_store_url, file_source) .with_file_groups(file_groups) .with_constraints(constraints) .with_statistics(statistics) - .with_projection_indices(Some(projection)) .with_limit(proto.limit.as_ref().map(|sl| sl.limit as usize)) .with_output_ordering(output_ordering) .with_batch_size(proto.batch_size.map(|s| s as usize)) diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index fc7818fe461a..89dd0b50650b 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -666,7 +666,6 @@ impl protobuf::PhysicalPlanNode { &self, scan: &protobuf::ParquetScanExecNode, ctx: &TaskContext, - extension_codec: &dyn PhysicalExtensionCodec, ) -> Result> { #[cfg(feature = "parquet")] diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 1ae85618b92a..146e9258111a 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -528,18 +528,31 @@ pub fn serialize_file_scan_config( .with_metadata(conf.file_schema().metadata.clone()), ); + let projection_exprs = conf + .file_source + .projection() + .as_ref() + .map(|projection_exprs| { + let projections = projection_exprs.iter().cloned().collect::>(); + Ok::<_, DataFusionError>(protobuf::ProjectionExprs { + projections: projections + .into_iter() + .map(|expr| { + Ok(protobuf::ProjectionExpr { + alias: expr.alias.to_string(), + expr: Some(serialize_physical_expr(&expr.expr, codec)?), + }) + }) + .collect::>>()?, + }) + }) + .transpose()?; + Ok(protobuf::FileScanExecConf { file_groups, statistics: Some((&conf.statistics()).into()), limit: conf.limit.map(|l| protobuf::ScanLimit { limit: l as u32 }), - projection: conf - .projection_exprs - .as_ref() - .map(|p| p.column_indices()) - .unwrap_or((0..schema.fields().len()).collect::>()) - .iter() - .map(|n| *n as u32) - .collect(), + projection: vec![], schema: Some(schema.as_ref().try_into()?), table_partition_cols: conf .table_partition_cols() @@ -555,6 +568,7 @@ pub fn serialize_file_scan_config( .collect::>(), constraints: Some(conf.constraints.clone().into()), batch_size: conf.batch_size.map(|s| s as u64), + projection_exprs, }) } diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index c50f41625c70..0bcdd610c26f 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -928,7 +928,7 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> { let file_source = Arc::new(ParquetSource::new(table_schema.clone())); let scan_config = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_source) - .with_projection_indices(Some(vec![0, 1])) + .with_projection_indices(Some(vec![0, 1]))? .with_file_group(FileGroup::new(vec![file_group])) .with_newlines_in_values(false) .build(); @@ -1813,7 +1813,7 @@ async fn roundtrip_projection_source() -> Result<()> { 1024, )])]) .with_statistics(statistics) - .with_projection_indices(Some(vec![0, 1, 2])) + .with_projection_indices(Some(vec![0, 1, 2]))? 
.build(); let filter = Arc::new( diff --git a/datafusion/sqllogictest/test_files/explain_tree.slt b/datafusion/sqllogictest/test_files/explain_tree.slt index 6ac28997a990..b4462533f60d 100644 --- a/datafusion/sqllogictest/test_files/explain_tree.slt +++ b/datafusion/sqllogictest/test_files/explain_tree.slt @@ -952,23 +952,11 @@ explain SELECT int_col, bigint_col, int_col+bigint_col AS sum_col FROM table2; ---- physical_plan 01)┌───────────────────────────┐ -02)│ ProjectionExec │ +02)│ DataSourceExec │ 03)│ -------------------- │ -04)│ bigint_col: │ -05)│ bigint_col │ -06)│ │ -07)│ int_col: int_col │ -08)│ │ -09)│ sum_col: │ -10)│ CAST(int_col AS Int64) + │ -11)│ bigint_col │ -12)└─────────────┬─────────────┘ -13)┌─────────────┴─────────────┐ -14)│ DataSourceExec │ -15)│ -------------------- │ -16)│ files: 1 │ -17)│ format: parquet │ -18)└───────────────────────────┘ +04)│ files: 1 │ +05)│ format: parquet │ +06)└───────────────────────────┘ # Query with projection on memory query TT diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index f217ba1bd5a0..6b1b2ca17747 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -3501,11 +3501,10 @@ logical_plan 05)----TableScan: annotated_data projection=[a0, a, b, c, d] physical_plan 01)NestedLoopJoinExec: join_type=Inner, filter=example(join_proj_push_down_1@0, join_proj_push_down_2@1) > 3, projection=[a0@0, a@1, b@2, c@3, d@4, a0@6, a@7, b@8, c@9, d@10] -02)--ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, CAST(a@1 AS Float64) as join_proj_push_down_1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true -04)--ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, CAST(a@1 AS Float64) as join_proj_push_down_2] -05)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true -06)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d, CAST(a@1 AS Float64) as join_proj_push_down_1], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true +03)--ProjectionExec: expr=[a0@0 as a0, a@1 as a, b@2 as b, c@3 as c, d@4 as d, CAST(a@1 AS Float64) as join_proj_push_down_2] +04)----RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1, maintains_sort_order=true +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, a, b, c, d], output_ordering=[a@1 ASC, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true #### # Config teardown diff --git a/datafusion/sqllogictest/test_files/math.slt b/datafusion/sqllogictest/test_files/math.slt index edba5354e001..f34e1156a785 100644 --- a/datafusion/sqllogictest/test_files/math.slt +++ b/datafusion/sqllogictest/test_files/math.slt @@ -713,9 +713,7 @@ EXPLAIN SELECT log(NULL, c2) from aggregate_simple; logical_plan 01)Projection: Float64(NULL) AS log(NULL,aggregate_simple.c2) 02)--TableScan: aggregate_simple 
projection=[] -physical_plan -01)ProjectionExec: expr=[NULL as log(NULL,aggregate_simple.c2)] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/aggregate_simple.csv]]}, file_type=csv, has_header=true +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/aggregate_simple.csv]]}, projection=[NULL as log(NULL,aggregate_simple.c2)], file_type=csv, has_header=true # Float 16/32/64 for log query RT diff --git a/datafusion/sqllogictest/test_files/monotonic_projection_test.slt b/datafusion/sqllogictest/test_files/monotonic_projection_test.slt index d94d48d45af9..7feefc169fca 100644 --- a/datafusion/sqllogictest/test_files/monotonic_projection_test.slt +++ b/datafusion/sqllogictest/test_files/monotonic_projection_test.slt @@ -97,9 +97,7 @@ logical_plan 01)Sort: a_big ASC NULLS LAST, multiple_ordered_table.b ASC NULLS LAST 02)--Projection: multiple_ordered_table.a, multiple_ordered_table.a AS a_big, multiple_ordered_table.b 03)----TableScan: multiple_ordered_table projection=[a, b] -physical_plan -01)ProjectionExec: expr=[a@0 as a, a@0 as a_big, b@1 as b] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, a@1 as a_big, b], output_ordering=[a@0 ASC NULLS LAST, b@2 ASC NULLS LAST], file_type=csv, has_header=true query TT EXPLAIN @@ -111,9 +109,7 @@ logical_plan 01)Sort: multiple_ordered_table.a ASC NULLS LAST, multiple_ordered_table.b ASC NULLS LAST 02)--Projection: multiple_ordered_table.a, multiple_ordered_table.a AS a_big, multiple_ordered_table.b 03)----TableScan: multiple_ordered_table projection=[a, b] -physical_plan -01)ProjectionExec: expr=[a@0 as a, a@0 as a_big, b@1 as b] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, a@1 as a_big, b], output_ordering=[a@0 ASC NULLS LAST, b@2 ASC NULLS LAST], file_type=csv, has_header=true # test for cast Utf8 diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt index 8f0cb5e53d76..03ba12ee9875 100644 --- a/datafusion/sqllogictest/test_files/order.slt +++ b/datafusion/sqllogictest/test_files/order.slt @@ -1357,10 +1357,8 @@ physical_plan 01)SortPreservingMergeExec: [d@4 ASC NULLS LAST, c@1 ASC NULLS LAST, a@2 ASC NULLS LAST, a0@3 ASC NULLS LAST, b@0 ASC NULLS LAST], fetch=2 02)--SortExec: TopK(fetch=2), expr=[d@4 ASC NULLS LAST, c@1 ASC NULLS LAST, a@2 ASC NULLS LAST, a0@3 ASC NULLS LAST, b@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----UnionExec -04)------ProjectionExec: expr=[b@1 as b, c@2 as c, a@0 as a, NULL as a0, d@3 as d] -05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[c@2 ASC NULLS LAST], file_type=csv, has_header=true -06)------ProjectionExec: expr=[b@1 as b, c@2 as c, NULL as a, a0@0 as a0, d@3 as d] -07)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a0, b, c, 
d], output_ordering=[c@2 ASC NULLS LAST], file_type=csv, has_header=true +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, a, NULL as a0, d], output_ordering=[c@1 ASC NULLS LAST], file_type=csv, has_header=true +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[b, c, NULL as a, a0, d], output_ordering=[c@1 ASC NULLS LAST], file_type=csv, has_header=true # Test: run the query from above query IIIII diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt index fe909e70ffb0..5a559bdb9483 100644 --- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt @@ -274,5 +274,4 @@ logical_plan 02)--TableScan: test_table projection=[constant_col] physical_plan 01)SortPreservingMergeExec: [constant_col@0 ASC NULLS LAST] -02)--SortExec: expr=[constant_col@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col], file_type=parquet +02)--DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col], file_type=parquet diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 30dbcc978c9b..ab0859fa4ade 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1420,9 +1420,7 @@ logical_plan 01)Sort: annotated_data_finite2.a ASC NULLS LAST 02)--Projection: annotated_data_finite2.a, annotated_data_finite2.b, Int64(2) 03)----TableScan: annotated_data_finite2 projection=[a, b] -physical_plan -01)ProjectionExec: expr=[a@0 as a, b@1 as b, 2 as Int64(2)] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, 2 as Int64(2)], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true # source is ordered by a,b,c # when filter result is constant for column a diff --git a/datafusion/sqllogictest/test_files/topk.slt b/datafusion/sqllogictest/test_files/topk.slt index 6c4d7be5ab8a..7364fccd8e57 100644 --- a/datafusion/sqllogictest/test_files/topk.slt +++ b/datafusion/sqllogictest/test_files/topk.slt @@ -371,8 +371,7 @@ explain select number, letter, age, number as column4, letter as column5 from pa ---- physical_plan 01)SortExec: TopK(fetch=3), expr=[number@0 DESC, letter@1 ASC NULLS 
LAST, age@2 DESC], preserve_partitioning=[false], sort_prefix=[number@0 DESC, letter@1 ASC NULLS LAST] -02)--ProjectionExec: expr=[number@0 as number, letter@1 as letter, age@2 as age, number@0 as column4, letter@1 as column5] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/topk/partial_sorted/1.parquet]]}, projection=[number, letter, age, number@0 as column4, letter@1 as column5], output_ordering=[number@0 DESC, letter@1 ASC NULLS LAST], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify that the sort prefix is correctly computed over normalized, order-maintaining projections (number + 1, number, number + 1, age) query TT diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index c20598239c94..5c4cf3fba71d 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -601,8 +601,7 @@ physical_plan 01)SortPreservingMergeExec: [c1@0 ASC NULLS LAST] 02)--UnionExec 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1], output_ordering=[c1@0 ASC NULLS LAST], file_type=csv, has_header=true -04)----ProjectionExec: expr=[c1a@0 as c1] -05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1a], output_ordering=[c1a@0 ASC NULLS LAST], file_type=csv, has_header=true +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1a@0 as c1], file_type=csv, has_header=true statement ok drop table t1 diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index c4319c665bd0..537b334ecbe4 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -1021,8 +1021,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[__unnest_placeholder(t.column1,depth=1)@0 as UNNEST(t.column1), column2@1 as column2] 02)--UnnestExec -03)----ProjectionExec: expr=[column1@0 as __unnest_placeholder(t.column1), column2@1 as column2] -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_array.parquet]]}, projection=[column1, column2], output_ordering=[column2@1 ASC NULLS LAST], file_type=parquet +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_array.parquet]]}, projection=[column1@0 as __unnest_placeholder(t.column1), column2], output_ordering=[column2@1 ASC NULLS LAST], file_type=parquet # Explain should have a SortExec at the top because we order by the output of the unnest (i.e. 
discarding the ordering) query TT @@ -1038,8 +1037,7 @@ physical_plan 01)SortExec: expr=[unnested@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--ProjectionExec: expr=[__unnest_placeholder(t.column1,depth=1)@0 as unnested, column2@1 as column2] 03)----UnnestExec -04)------ProjectionExec: expr=[column1@0 as __unnest_placeholder(t.column1), column2@1 as column2] -05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_array.parquet]]}, projection=[column1, column2], output_ordering=[column2@1 ASC NULLS LAST], file_type=parquet +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_array.parquet]]}, projection=[column1@0 as __unnest_placeholder(t.column1), column2], output_ordering=[column2@1 ASC NULLS LAST], file_type=parquet # cleanup statement ok @@ -1084,8 +1082,7 @@ logical_plan physical_plan 01)SortExec: expr=[__unnest_placeholder(struct(t.column1,t.column2,t.column3)).c0@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--UnnestExec -03)----ProjectionExec: expr=[struct(column1@0, column2@1, column3@2) as __unnest_placeholder(struct(t.column1,t.column2,t.column3))] -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_tuples.parquet]]}, projection=[column1, column2, column3], output_ordering=[column1@0 ASC NULLS LAST], file_type=parquet +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_tuples.parquet]]}, projection=[struct(column1@0, column2@1, column3@2) as __unnest_placeholder(struct(t.column1,t.column2,t.column3))], file_type=parquet # cleanup statement ok @@ -1136,8 +1133,7 @@ logical_plan 04)------TableScan: t projection=[column1, column2] physical_plan 01)UnnestExec -02)--ProjectionExec: expr=[column1@0 as __unnest_placeholder(t.column1), column2@1 as column2] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_struct.parquet]]}, projection=[column1, column2], output_ordering=[column2@1 ASC NULLS LAST], file_type=parquet +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_struct.parquet]]}, projection=[column1@0 as __unnest_placeholder(t.column1), column2], output_ordering=[column2@1 ASC NULLS LAST], file_type=parquet # cleanup statement ok @@ -1207,8 +1203,7 @@ logical_plan physical_plan 01)ProjectionExec: expr=[__unnest_placeholder(t.column1,depth=2)@0 as UNNEST(UNNEST(t.column1)), __unnest_placeholder(t.column2,depth=1)@1 as UNNEST(t.column2), __unnest_placeholder(t.column3).s1@2 as __unnest_placeholder(t.column3).s1, __unnest_placeholder(t.column3).s2@3 as __unnest_placeholder(t.column3).s2, __unnest_placeholder(t.column3).s3@4 as __unnest_placeholder(t.column3).s3, column4@5 as column4] 02)--UnnestExec -03)----ProjectionExec: expr=[column1@0 as __unnest_placeholder(t.column1), column2@1 as __unnest_placeholder(t.column2), column3@2 as __unnest_placeholder(t.column3), column4@3 as column4] -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_struct_arrays.parquet]]}, projection=[column1, column2, column3, column4], output_ordering=[column4@3 ASC NULLS LAST], file_type=parquet +03)----DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/unnest/ordered_struct_arrays.parquet]]}, projection=[column1@0 as __unnest_placeholder(t.column1), column2@1 as __unnest_placeholder(t.column2), column3@2 as __unnest_placeholder(t.column3), column4], output_ordering=[column4@3 ASC NULLS LAST], file_type=parquet # cleanup statement ok diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index b7ef74e6c167..acb2f07e01b2 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -2267,8 +2267,7 @@ physical_plan 07)------------BoundedWindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable UInt64 }, frame: ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING], mode=[Sorted] 08)--------------WindowAggExec: wdw=[sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "sum(t1.c9) PARTITION BY [t1.c1, t1.c2] ORDER BY [t1.c9 ASC NULLS LAST, t1.c8 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING", data_type: UInt64, nullable: true }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(1)), end_bound: Following(UInt64(NULL)), is_causal: false }] 09)----------------SortExec: expr=[c1@0 ASC NULLS LAST, c2@1 ASC NULLS LAST, c9@3 ASC NULLS LAST, c8@2 ASC NULLS LAST], preserve_partitioning=[false] -10)------------------ProjectionExec: expr=[c1@0 as c1, c2@1 as c2, c8@2 as c8, c9@3 as c9, c1@0 as c1_alias] -11)--------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c8, c9], file_type=csv, has_header=true +10)------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c8, c9, c1@0 as c1_alias], file_type=csv, has_header=true query IIIII SELECT c9, @@ -2683,8 +2682,7 @@ physical_plan 05)--------ProjectionExec: expr=[__common_expr_1@0 as __common_expr_1, inc_col@3 as inc_col, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING@5 as sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING@6 as sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING@7 as sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@8 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@9 as min(annotated_data_finite.desc_col) ORDER BY 
[annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@10 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@11 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@12 as max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@13 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING@14 as count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@15 as count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@16 as sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@17 as sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@18 as sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@19 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@20 as min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@21 as min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING@22 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] 
RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING@23 as max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING@24 as max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING@25 as count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING@26 as count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING] 06)----------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND 1 
FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING": Int64 }, frame: RANGE BETWEEN 4 PRECEDING AND 8 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING], mode=[Sorted] 07)------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 4 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 4 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 1 PRECEDING AND 8 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN 8 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 1 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING: Field { "min(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 10 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 10 FOLLOWING, max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING 
AND 1 FOLLOWING: Field { "max(annotated_data_finite.desc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 5 PRECEDING AND 1 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN 1 PRECEDING AND 5 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 1 PRECEDING AND 10 FOLLOWING": nullable Int32 }, frame: ROWS BETWEEN 10 PRECEDING AND 1 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 6 PRECEDING AND 2 FOLLOWING": Int64 }, frame: RANGE BETWEEN 2 PRECEDING AND 6 FOLLOWING, count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING: Field { "count(Int64(1)) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] ROWS BETWEEN 8 PRECEDING AND 1 FOLLOWING": Int64 }, frame: ROWS BETWEEN 1 PRECEDING AND 8 FOLLOWING], mode=[Sorted] -08)--------------ProjectionExec: expr=[CAST(desc_col@2 AS Int64) as __common_expr_1, CAST(inc_col@1 AS Int64) as __common_expr_2, ts@0 as ts, inc_col@1 as inc_col, desc_col@2 as desc_col] -09)----------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col, desc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true +08)--------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[CAST(desc_col@2 AS Int64) as __common_expr_1, CAST(inc_col@1 AS Int64) as __common_expr_2, ts, inc_col, desc_col], output_ordering=[ts@2 ASC NULLS LAST], file_type=csv, has_header=true query IIIIIIIIIIIIIIIIIIIIIIII SELECT @@ -2839,8 +2837,7 @@ physical_plan 03)----ProjectionExec: expr=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@9 as sum1, sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@4 as sum2, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@10 as min1, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@5 as min2, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@11 as max1, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@6 as max2, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@12 as count1, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@7 as count2, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING@13 as avg1, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING@8 as avg2, inc_col@3 as inc_col] 04)------BoundedWindowAggExec: 
wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING: Field { "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts ASC NULLS LAST] RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 5 FOLLOWING], mode=[Sorted] 05)--------BoundedWindowAggExec: wdw=[sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "sum(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "min(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "max(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Int32 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "count(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": Int64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING, avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING: Field { "avg(annotated_data_finite.inc_col) ORDER BY [annotated_data_finite.ts DESC NULLS FIRST] RANGE 
BETWEEN 3 PRECEDING AND UNBOUNDED FOLLOWING": nullable Float64 }, frame: RANGE BETWEEN UNBOUNDED PRECEDING AND 3 FOLLOWING], mode=[Sorted] -06)----------ProjectionExec: expr=[CAST(inc_col@1 AS Int64) as __common_expr_1, CAST(inc_col@1 AS Float64) as __common_expr_2, ts@0 as ts, inc_col@1 as inc_col] -07)------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[ts, inc_col], output_ordering=[ts@0 ASC NULLS LAST], file_type=csv, has_header=true +06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_1.csv]]}, projection=[CAST(inc_col@1 AS Int64) as __common_expr_1, CAST(inc_col@1 AS Float64) as __common_expr_2, ts, inc_col], output_ordering=[ts@2 ASC NULLS LAST], file_type=csv, has_header=true query IIIIIIIIRR SELECT @@ -3157,8 +3154,7 @@ physical_plan 11)--------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b, annotated_data_finite2.d] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[Sorted] 12)----------------------SortExec: expr=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, d@4 ASC NULLS LAST, c@3 ASC NULLS LAST], preserve_partitioning=[false] 13)------------------------BoundedWindowAggExec: wdw=[sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 2 PRECEDING AND 1 FOLLOWING, sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING: Field { "sum(annotated_data_finite2.c) PARTITION BY [annotated_data_finite2.a, annotated_data_finite2.b] ORDER BY [annotated_data_finite2.c ASC NULLS LAST] ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING": nullable Int64 }, frame: ROWS BETWEEN 5 PRECEDING AND 5 FOLLOWING], mode=[Sorted] -14)--------------------------ProjectionExec: expr=[CAST(c@2 AS Int64) as __common_expr_1, a@0 as a, b@1 as b, c@2 as c, d@3 as d] -15)----------------------------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b, c, d], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST], file_type=csv, has_header=true +14)--------------------------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[CAST(c@3 AS Int64) as __common_expr_1, a, b, c, d], output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST, c@3 ASC NULLS LAST], file_type=csv, has_header=true query IIIIIIIIIIIIIII SELECT a, b, c, @@ -5941,9 +5937,7 @@ physical_plan 03)----BoundedWindowAggExec: wdw=[sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "sum(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "count(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": Int64 }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable List(nullable Int64) }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Field { "array_agg(test.c2) FILTER (WHERE test.c2 >= Int64(2) AND test.c2 < Int64(4) AND test.c1 > Int64(0)) ORDER BY [test.c1 ASC NULLS LAST, test.c2 ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW": nullable List(nullable Int64) }, frame: ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW], mode=[Sorted] 04)------SortPreservingMergeExec: [c1@2 ASC NULLS LAST, c2@3 ASC NULLS LAST], fetch=5 05)--------SortExec: TopK(fetch=5), expr=[c1@2 ASC NULLS LAST, c2@3 ASC NULLS LAST], preserve_partitioning=[true] -06)----------ProjectionExec: expr=[__common_expr_3@0 as __common_expr_1, __common_expr_3@0 AND c2@2 < 4 AND c1@1 > 0 as __common_expr_2, c1@1 as c1, c2@2 as c2] -07)------------ProjectionExec: expr=[c2@1 >= 2 as __common_expr_3, c1@0 as c1, c2@1 as c2] -08)--------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_csv/partition-0.csv], [WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_csv/partition-1.csv], [WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_csv/partition-2.csv], [WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_csv/partition-3.csv]]}, projection=[c1, c2], file_type=csv, has_header=false +06)----------DataSourceExec: file_groups={4 groups: 
[[WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_csv/partition-0.csv], [WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_csv/partition-1.csv], [WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_csv/partition-2.csv], [WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_csv/partition-3.csv]]}, projection=[c2@1 >= 2 as __common_expr_1, c2@1 >= 2 AND c2@1 < 4 AND c1@0 > 0 as __common_expr_2, c1, c2], file_type=csv, has_header=false # FILTER filters out some rows diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs index 8ce71acecca3..1feee6386318 100644 --- a/datafusion/substrait/src/physical_plan/consumer.rs +++ b/datafusion/substrait/src/physical_plan/consumer.rs @@ -152,7 +152,7 @@ pub async fn from_substrait_rel( .map(|item| item.field as usize) .collect(); base_config_builder = base_config_builder - .with_projection_indices(Some(column_indices)); + .with_projection_indices(Some(column_indices))?; } } diff --git a/datafusion/substrait/src/physical_plan/producer.rs b/datafusion/substrait/src/physical_plan/producer.rs index 20d41c2e6112..557e80146b9a 100644 --- a/datafusion/substrait/src/physical_plan/producer.rs +++ b/datafusion/substrait/src/physical_plan/producer.rs @@ -92,7 +92,7 @@ pub fn to_substrait_rel( }; let mut select_struct = None; - if let Some(projection) = file_config.projection_exprs.as_ref() { + if let Some(projection) = file_config.file_source().projection().as_ref() { let struct_items = projection .column_indices() .into_iter() diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 7a827847dde5..25c209c5ebe8 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -202,6 +202,121 @@ Additionally, the FFI structure for Scalar UDF's no longer contains a `return_type` call. This code was not used since the `ForeignScalarUDF` struct implements the `return_field_from_args` instead.
+### Projection handling moved from FileScanConfig to FileSource
+
+Projection handling has been moved from `FileScanConfig` into `FileSource` implementations. This enables format-specific projection pushdown (e.g., Parquet can push down struct field access, Vortex can push down computed expressions into un-decoded data).
+
+**Who is affected:**
+
+- Users who have implemented a custom `FileSource`
+- Users who use `FileScanConfigBuilder::with_projection_indices` directly
+
+**Breaking changes:**
+
+1. **`FileSource::with_projection` replaced with `try_pushdown_projection`:**
+
+   The `with_projection(&self, config: &FileScanConfig) -> Arc<dyn FileSource>` method has been removed and replaced with `try_pushdown_projection(&self, projection: &ProjectionExprs) -> Result<Option<Arc<dyn FileSource>>>`.
+
+2. **`FileScanConfig.projection_exprs` field removed:**
+
+   Projections are now stored in the `FileSource` directly, not in `FileScanConfig`.
+   Various public helper methods that access projection information have been removed from `FileScanConfig`.
+
+3. **`FileScanConfigBuilder::with_projection_indices` now returns `Result`:**
+
+   This method can now fail if the projection pushdown fails.
+
+4. **`FileSource::create_file_opener` now returns `Result<Arc<dyn FileOpener>>`:**
+
+   Previously returned `Arc<dyn FileOpener>` directly.
+   Any `FileSource` implementation that may fail to create a `FileOpener` should now return an appropriate error.
+
+5. **`DataSource::try_swapping_with_projection` signature changed:**
+
+   Parameter changed from `&[ProjectionExpr]` to `&ProjectionExprs` (see the sketch below).
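+
+The change in item 5 is limited to the projection parameter's type; the return type and method body stay as they are in your existing implementation. A minimal sketch of the edit for a hypothetical custom `DataSource` (the `MyDataSource` name is illustrative):
+
+```diff
+ impl DataSource for MyDataSource {
+     fn try_swapping_with_projection(
+         &self,
+-        projection: &[ProjectionExpr],
++        projection: &ProjectionExprs,
+     ) -> /* return type unchanged */ {
+         /* body unchanged */
+     }
+ }
+```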
+
+**Migration guide:**
+
+If you have a custom `FileSource` implementation:
+
+**Before:**
+
+```rust,ignore
+impl FileSource for MyCustomSource {
+    fn with_projection(&self, config: &FileScanConfig) -> Arc<dyn FileSource> {
+        // Apply projection from config
+        Arc::new(Self { /* ... */ })
+    }
+
+    fn create_file_opener(
+        &self,
+        object_store: Arc<dyn ObjectStore>,
+        base_config: &FileScanConfig,
+        partition: usize,
+    ) -> Arc<dyn FileOpener> {
+        Arc::new(MyOpener { /* ... */ })
+    }
+}
+```
+
+**After:**
+
+```rust,ignore
+impl FileSource for MyCustomSource {
+    fn try_pushdown_projection(
+        &self,
+        projection: &ProjectionExprs,
+    ) -> Result<Option<Arc<dyn FileSource>>> {
+        // Return None if projection cannot be pushed down
+        // Return Some(new_source) with projection applied if it can
+        Ok(Some(Arc::new(Self {
+            projection: Some(projection.clone()),
+            /* ... */
+        })))
+    }
+
+    fn projection(&self) -> Option<&ProjectionExprs> {
+        self.projection.as_ref()
+    }
+
+    fn create_file_opener(
+        &self,
+        object_store: Arc<dyn ObjectStore>,
+        base_config: &FileScanConfig,
+        partition: usize,
+    ) -> Result<Arc<dyn FileOpener>> {
+        Ok(Arc::new(MyOpener { /* ... */ }))
+    }
+}
+```
+
+We recommend looking at [#18627](https://github.com/apache/datafusion/pull/18627),
+the PR that introduced these changes, for more examples of how this was handled
+for the various built-in file sources.
+
+**Handling projections in `FileSource`:**
+
+We have added [`SplitProjection`](https://docs.rs/datafusion-datasource/latest/datafusion_datasource/projection/struct.SplitProjection.html) and [`ProjectionOpener`](https://docs.rs/datafusion-datasource/latest/datafusion_datasource/projection/struct.ProjectionOpener.html) helpers to make it easier to handle projections in your `FileSource` implementations.
+
+For file sources that can only handle simple column selections (not computed expressions), use the `SplitProjection` and `ProjectionOpener` helpers to split the projection into pushdownable and non-pushdownable parts:
+
+```rust,ignore
+use datafusion_datasource::projection::{SplitProjection, ProjectionOpener};
+
+// In try_pushdown_projection:
+let split = SplitProjection::new(projection, self.table_schema())?;
+// Use split.file_projection() for what to push down to the file format
+// The ProjectionOpener wrapper will handle the rest
+```
+
+**For `FileScanConfigBuilder` users:**
+
+```diff
+let config = FileScanConfigBuilder::new(url, source)
+-    .with_projection_indices(Some(vec![0, 2, 3]))
++    .with_projection_indices(Some(vec![0, 2, 3]))?
+     .build();
+```
+
 ## DataFusion `51.0.0`

 ### `arrow` / `parquet` updated to 57.0.0