
Commit 8833c30

add docs
1 parent 4ae5269 commit 8833c30

File tree

2 files changed: +11 -14


datafusion/datasource-parquet/src/file_format.rs

Lines changed: 1 addition & 1 deletion

```diff
@@ -465,7 +465,7 @@ impl FileFormat for ParquetFormat {
             .as_any()
             .downcast_ref::<ParquetSource>()
             .cloned()
-            .expect("should be a parquet source");
+            .ok_or_else(|| internal_datafusion_err!("Expected ParquetSource"))?;
         source = source.with_table_parquet_options(self.options.clone());

         // Use the CachedParquetFileReaderFactory
```
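This change swaps a panicking `expect` for error propagation when the downcast fails. Below is a minimal, self-contained sketch of the same `Option` to `Result` downcast pattern in plain Rust, using a stand-in `ParquetSource` type and a `String` error in place of DataFusion's `internal_datafusion_err!` macro; the names here are illustrative, not the actual DataFusion API:

```rust
use std::any::Any;

#[derive(Clone, Debug)]
struct ParquetSource;

// Stand-in for the downcast in `ParquetFormat`: try to view a type-erased
// source as a `ParquetSource`, returning a recoverable error instead of
// panicking when the downcast does not apply.
fn as_parquet_source(source: &dyn Any) -> Result<ParquetSource, String> {
    source
        .downcast_ref::<ParquetSource>()
        .cloned()
        // Mirrors the `expect` -> `ok_or_else(...)?` change above.
        .ok_or_else(|| "Expected ParquetSource".to_string())
}

fn main() {
    let src = ParquetSource;
    assert!(as_parquet_source(&src).is_ok());
    assert!(as_parquet_source(&42_i32).is_err());
}
```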

datafusion/datasource/src/projection.rs

Lines changed: 10 additions & 13 deletions

```diff
@@ -176,21 +176,18 @@ impl SplitProjection {
         Self::new(table_schema.file_schema(), &projection)
     }
 
-    /// Creates a new `SplitProjection` by splitting columns into file and partition columns.
+    /// Creates a new [`SplitProjection`] by splitting a projection into
+    /// simple file column indices and a remainder projection that is applied after reading the file.
     ///
-    /// # Algorithm
-    /// Single-pass approach that combines extraction, classification, and remapping:
-    /// 1. Extract all unique column references from projection expressions
-    /// 2. Sort columns by original table index
-    /// 3. Classify each column as either file or partition based on file_schema length
-    /// 4. Assign final indices: file columns → [0..n), partition columns → [n..)
-    /// 5. Transform expressions once to remap all column references
+    /// In other words: we get a `Vec<usize>` projection that is meant to be applied on top of `file_schema`
+    /// and a remainder projection that is applied to the result of that first projection.
     ///
-    /// This replaces the previous three-pass approach:
-    /// - Old: extract → sort → remap → split → remap again (3 transformations)
-    /// - New: extract → classify → remap (1 transformation)
-    pub fn new(file_schema: &Schema, projection: &ProjectionExprs) -> Self {
-        let num_file_schema_columns = file_schema.fields().len();
+    /// Here `file_schema` is expected to be the *logical* schema of the file, that is the
+    /// table schema minus any partition columns.
+    /// Partition columns are always expected to be at the end of the table schema.
+    /// Note that `file_schema` is *not* the physical schema of the file.
+    pub fn new(logical_file_schema: &Schema, projection: &ProjectionExprs) -> Self {
+        let num_file_schema_columns = logical_file_schema.fields().len();
 
         // Collect all unique columns and classify as file or partition
         let mut file_columns = Vec::new();
```
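The new docstring describes a two-stage projection: a `Vec<usize>` applied on top of the logical file schema, then a remainder projection applied to the result. Below is a minimal sketch of that split using plain index lists instead of DataFusion's `ProjectionExprs`, assuming (as the docs state) that partition columns trail the file columns in the table schema; the helper and its signature are hypothetical, not DataFusion's actual `SplitProjection` API:

```rust
/// Split a table-schema projection into (file projection, remainder projection).
/// Names and signature are illustrative only.
fn split_projection(num_file_columns: usize, projection: &[usize]) -> (Vec<usize>, Vec<usize>) {
    // Columns with table index < num_file_columns live in the file;
    // partition columns come after them in the table schema.
    let file_projection: Vec<usize> = projection
        .iter()
        .copied()
        .filter(|&i| i < num_file_columns)
        .collect();
    // Remainder projection, applied to the intermediate batch that holds the
    // projected file columns first, with all partition columns appended after.
    let remainder: Vec<usize> = projection
        .iter()
        .map(|&i| {
            if i < num_file_columns {
                // Where this column landed in the file projection.
                file_projection.iter().position(|&f| f == i).unwrap()
            } else {
                // Partition column, appended after the projected file columns.
                file_projection.len() + (i - num_file_columns)
            }
        })
        .collect();
    (file_projection, remainder)
}

fn main() {
    // Table schema: [a, b, c | p0, p1]; the logical file schema has 3 columns.
    // The query projects [p0, a, c], i.e. table indices [3, 0, 2].
    let (file_proj, remainder) = split_projection(3, &[3, 0, 2]);
    assert_eq!(file_proj, vec![0, 2]); // read only `a` and `c` from the file
    assert_eq!(remainder, vec![2, 0, 1]); // reorder once partitions are appended
}
```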
