1 change: 0 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default.

28 changes: 14 additions & 14 deletions datafusion-examples/examples/custom_data_source/csv_json_opener.rs
@@ -64,22 +64,22 @@ async fn csv_opener() -> Result<()> {
..Default::default()
};

let scan_config = FileScanConfigBuilder::new(
ObjectStoreUrl::local_filesystem(),
Arc::new(CsvSource::new(Arc::clone(&schema)).with_csv_options(options.clone())),
)
.with_projection_indices(Some(vec![12, 0]))
.with_limit(Some(5))
.with_file(PartitionedFile::new(path.display().to_string(), 10))
.build();

let config = CsvSource::new(Arc::clone(&schema))
let source = CsvSource::new(Arc::clone(&schema))
.with_csv_options(options)
.with_comment(Some(b'#'))
.with_batch_size(8192)
.with_projection(&scan_config);
.with_batch_size(8192);

let scan_config =
FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source)
Comment on lines +72 to +73
Contributor: ❤️

Contributor: Yeah I agree it is really nice to not have this strange circular dependency between the source and the config.

.with_projection_indices(Some(vec![12, 0]))?
.with_limit(Some(5))
.with_file(PartitionedFile::new(path.display().to_string(), 10))
.build();

let opener = config.create_file_opener(object_store, &scan_config, 0);
let opener =
scan_config
.file_source()
Contributor: This one is still kind of weird -- looks like we might be able to replace it entirely with

fn open(
    &self,
    partition: usize,
    context: Arc<TaskContext>,
) -> Result<SendableRecordBatchStream> {
    let object_store = context.runtime_env().object_store(&self.object_store_url)?;
    let batch_size = self
        .batch_size
        .unwrap_or_else(|| context.session_config().batch_size());
    let source = self.file_source.with_batch_size(batch_size);
    let opener = source.create_file_opener(object_store, self, partition)?;
    let stream = FileStream::new(self, partition, opener, source.metrics())?;
    Ok(Box::pin(cooperative(stream)))
}

But then we would have to change the example to set up the runtime env.

Contributor (Author): Yeah I'll leave it untouched for now.
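A minimal sketch of the runtime env setup that change would need, assuming the open() method quoted above; the helper name, imports, and URL string are illustrative and not part of this PR:

use std::sync::Arc;

use datafusion::error::Result;
use datafusion::execution::runtime_env::RuntimeEnvBuilder;
use datafusion::execution::TaskContext;
use datafusion::prelude::{SessionConfig, SessionContext};
use object_store::local::LocalFileSystem;
use url::Url;

// Build a TaskContext whose RuntimeEnv can resolve the example's
// ObjectStoreUrl::local_filesystem() ("file://") to a local object store.
fn local_task_ctx() -> Result<Arc<TaskContext>> {
    let runtime_env = Arc::new(RuntimeEnvBuilder::new().build()?);
    let url = Url::parse("file://").expect("valid object store url");
    runtime_env.register_object_store(&url, Arc::new(LocalFileSystem::new()));
    let ctx = SessionContext::new_with_config_rt(SessionConfig::new(), runtime_env);
    Ok(ctx.task_ctx())
}

With that in place the example could then call something like scan_config.open(0, local_task_ctx()?) instead of wiring up the opener and stream by hand.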

.create_file_opener(object_store, &scan_config, 0)?;

let mut result = vec![];
let mut stream =
@@ -133,7 +133,7 @@ async fn json_opener() -> Result<()> {
ObjectStoreUrl::local_filesystem(),
Arc::new(JsonSource::new(schema)),
)
.with_projection_indices(Some(vec![1, 0]))
.with_projection_indices(Some(vec![1, 0]))?
.with_limit(Some(5))
.with_file(PartitionedFile::new(path.to_string(), 10))
.build();
@@ -258,7 +258,7 @@ impl TableProvider for DefaultValueTableProvider {
ObjectStoreUrl::parse("memory://")?,
Arc::new(parquet_source),
)
.with_projection_indices(projection.cloned())
.with_projection_indices(projection.cloned())?
.with_limit(limit)
.with_file_group(file_group)
.with_expr_adapter(Some(Arc::new(DefaultValuePhysicalExprAdapterFactory) as _));
@@ -502,7 +502,7 @@ impl TableProvider for IndexTableProvider {
);
let file_scan_config = FileScanConfigBuilder::new(object_store_url, file_source)
.with_limit(limit)
.with_projection_indices(projection.cloned())
.with_projection_indices(projection.cloned())?
.with_file(partitioned_file)
.build();

2 changes: 1 addition & 1 deletion datafusion-examples/examples/data_io/parquet_index.rs
@@ -247,7 +247,7 @@ impl TableProvider for IndexTableProvider {
Arc::new(ParquetSource::new(self.schema()).with_predicate(predicate));
let mut file_scan_config_builder =
FileScanConfigBuilder::new(object_store_url, source)
.with_projection_indices(projection.cloned())
.with_projection_indices(projection.cloned())?
.with_limit(limit);

// Transform to the format needed to pass to DataSourceExec
2 changes: 1 addition & 1 deletion datafusion/catalog-listing/src/table.rs
@@ -504,7 +504,7 @@ impl TableProvider for ListingTable {
.with_file_groups(partitioned_file_lists)
.with_constraints(self.constraints.clone())
.with_statistics(statistics)
.with_projection_indices(projection)
.with_projection_indices(projection)?
.with_limit(limit)
.with_output_ordering(output_ordering)
.with_expr_adapter(self.expr_adapter_factory.clone())
2 changes: 1 addition & 1 deletion datafusion/core/src/dataframe/parquet.rs
@@ -150,7 +150,7 @@ mod tests {
let plan = df.explain(false, false)?.collect().await?;
// Filters all the way to Parquet
let formatted = pretty::pretty_format_batches(&plan)?.to_string();
assert!(formatted.contains("FilterExec: id@0 = 1"));
assert!(formatted.contains("FilterExec: id@0 = 1"), "{formatted}");

Ok(())
}
2 changes: 1 addition & 1 deletion datafusion/core/src/datasource/file_format/mod.rs
@@ -92,7 +92,7 @@ pub(crate) mod test_util {
)
.with_file_groups(file_groups)
.with_statistics(statistics)
.with_projection_indices(projection)
.with_projection_indices(projection)?
.with_limit(limit)
.build(),
)
6 changes: 3 additions & 3 deletions datafusion/core/src/datasource/physical_plan/avro.rs
@@ -84,7 +84,7 @@ mod tests {
let source = Arc::new(AvroSource::new(Arc::clone(&file_schema)));
let conf = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source)
.with_file(meta.into())
.with_projection_indices(Some(vec![0, 1, 2]))
.with_projection_indices(Some(vec![0, 1, 2]))?
.build();

let source_exec = DataSourceExec::from_data_source(conf);
@@ -156,7 +156,7 @@ mod tests {
let source = Arc::new(AvroSource::new(Arc::clone(&file_schema)));
let conf = FileScanConfigBuilder::new(object_store_url, source)
.with_file(meta.into())
.with_projection_indices(projection)
.with_projection_indices(projection)?
.build();

let source_exec = DataSourceExec::from_data_source(conf);
@@ -231,7 +231,7 @@ mod tests {
let conf = FileScanConfigBuilder::new(object_store_url, source)
// select specific columns of the files as well as the partitioning
// column which is supposed to be the last column in the table schema.
.with_projection_indices(projection)
.with_projection_indices(projection)?
.with_file(partitioned_file)
.build();

18 changes: 9 additions & 9 deletions datafusion/core/src/datasource/physical_plan/csv.rs
@@ -126,10 +126,10 @@ mod tests {
let source =
Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
let config =
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source))
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
.with_file_compression_type(file_compression_type)
.with_newlines_in_values(false)
.with_projection_indices(Some(vec![0, 2, 4]))
.with_projection_indices(Some(vec![0, 2, 4]))?
.build();

assert_eq!(13, config.file_schema().fields().len());
@@ -199,10 +199,10 @@ mod tests {
let source =
Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
let config =
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source))
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
.with_newlines_in_values(false)
.with_file_compression_type(file_compression_type.to_owned())
.with_projection_indices(Some(vec![4, 0, 2]))
.with_projection_indices(Some(vec![4, 0, 2]))?
.build();
assert_eq!(13, config.file_schema().fields().len());
let csv = DataSourceExec::from_data_source(config);
@@ -271,7 +271,7 @@ mod tests {
let source =
Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
let config =
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source))
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
.with_newlines_in_values(false)
.with_file_compression_type(file_compression_type.to_owned())
.with_limit(Some(5))
@@ -342,7 +342,7 @@ mod tests {
let source =
Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
let config =
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source))
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
.with_newlines_in_values(false)
.with_file_compression_type(file_compression_type.to_owned())
.with_limit(Some(5))
@@ -411,12 +411,12 @@ mod tests {
let source =
Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
let config =
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source))
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
.with_newlines_in_values(false)
.with_file_compression_type(file_compression_type.to_owned())
// We should be able to project on the partition column
// Which is supposed to be after the file fields
.with_projection_indices(Some(vec![0, num_file_schema_fields]))
.with_projection_indices(Some(vec![0, num_file_schema_fields]))?
.build();

// we don't have `/date=xx/` in the path but that is ok because
@@ -517,7 +517,7 @@ mod tests {
let source =
Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
let config =
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source))
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
.with_newlines_in_values(false)
.with_file_compression_type(file_compression_type.to_owned())
.build();
3 changes: 2 additions & 1 deletion datafusion/core/src/datasource/physical_plan/json.rs
@@ -302,6 +302,7 @@ mod tests {
let conf = FileScanConfigBuilder::new(object_store_url, source)
.with_file_groups(file_groups)
.with_projection_indices(Some(vec![0, 2]))
.unwrap()
.with_file_compression_type(file_compression_type.to_owned())
.build();
let exec = DataSourceExec::from_data_source(conf);
@@ -349,7 +350,7 @@ mod tests {
let source = Arc::new(JsonSource::new(Arc::clone(&file_schema)));
let conf = FileScanConfigBuilder::new(object_store_url, source)
.with_file_groups(file_groups)
.with_projection_indices(Some(vec![3, 0, 2]))
.with_projection_indices(Some(vec![3, 0, 2]))?
.with_file_compression_type(file_compression_type.to_owned())
.build();
let exec = DataSourceExec::from_data_source(conf);
2 changes: 2 additions & 0 deletions datafusion/core/src/datasource/physical_plan/parquet.rs
@@ -198,6 +198,7 @@ mod tests {
FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), source)
.with_file_group(file_group)
.with_projection_indices(self.projection.clone())
.unwrap()
.build();
DataSourceExec::from_data_source(base_config)
}
@@ -1664,6 +1665,7 @@ mod tests {
.with_file(partitioned_file)
// file has 10 cols so index 12 should be month and 13 should be day
.with_projection_indices(Some(vec![0, 1, 2, 12, 13]))
.unwrap()
.build();

let parquet_exec = DataSourceExec::from_data_source(config);
5 changes: 4 additions & 1 deletion datafusion/core/src/datasource/view_test.rs
@@ -358,7 +358,10 @@ mod tests {
.to_string();
assert!(formatted.contains("DataSourceExec: "));
assert!(formatted.contains("file_type=parquet"));
assert!(formatted.contains("projection=[bool_col, int_col], limit=10"));
assert!(
formatted.contains("projection=[bool_col, int_col], limit=10"),
"{formatted}"
);
Ok(())
}

7 changes: 4 additions & 3 deletions datafusion/core/src/test/mod.rs
@@ -105,9 +105,10 @@ pub fn scan_partitioned_csv(
};
let table_schema = TableSchema::from_file_schema(schema);
let source = Arc::new(CsvSource::new(table_schema.clone()).with_csv_options(options));
let config = FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source))
.with_file_compression_type(FileCompressionType::UNCOMPRESSED)
.build();
let config =
FileScanConfigBuilder::from(partitioned_csv_config(file_groups, source)?)
.with_file_compression_type(FileCompressionType::UNCOMPRESSED)
.build();
Ok(DataSourceExec::from_data_source(config))
}

1 change: 1 addition & 0 deletions datafusion/core/tests/parquet/schema_coercion.rs
@@ -122,6 +122,7 @@ async fn multi_parquet_coercion_projection() {
)
.with_file_group(file_group)
.with_projection_indices(Some(vec![1, 0, 2]))
.unwrap()
.build();

let parquet_exec = DataSourceExec::from_data_source(config);
13 changes: 3 additions & 10 deletions datafusion/core/tests/physical_optimizer/filter_pushdown/util.rs
@@ -141,14 +141,14 @@ impl FileSource for TestSource {
_object_store: Arc<dyn ObjectStore>,
_base_config: &FileScanConfig,
_partition: usize,
) -> Arc<dyn FileOpener> {
Arc::new(TestOpener {
) -> Result<Arc<dyn FileOpener>> {
Ok(Arc::new(TestOpener {
batches: self.batches.clone(),
batch_size: self.batch_size,
schema: Arc::clone(&self.schema),
projection: self.projection.clone(),
predicate: self.predicate.clone(),
})
}))
}

fn filter(&self) -> Option<Arc<dyn PhysicalExpr>> {
@@ -166,13 +166,6 @@ impl FileSource for TestSource {
})
}

fn with_projection(&self, config: &FileScanConfig) -> Arc<dyn FileSource> {
Arc::new(TestSource {
projection: config.projection_exprs.as_ref().map(|p| p.column_indices()),
..self.clone()
})
}

fn metrics(&self) -> &ExecutionPlanMetricsSet {
&self.metrics
}
@@ -620,7 +620,7 @@ mod test {
let plan_string = get_plan_string(&aggregate_exec_partial).swap_remove(0);
assert_snapshot!(
plan_string,
@"AggregateExec: mode=Partial, gby=[id@0 as id, 1 + id@0 as expr], aggr=[COUNT(c)]"
@"AggregateExec: mode=Partial, gby=[id@0 as id, 1 + id@0 as expr], aggr=[COUNT(c)], ordering_mode=Sorted"
Contributor (Author): I'm actually not sure where this is coming from... we need to double check it's correct / an improvement.

Contributor: The table is created with ORDER BY id so I think this plan is correct:
https://github.com/apache/datafusion/blob/3c21b546a9acf9922229220d3ceca91a945cbf46/datafusion/core/tests/physical_optimizer/partition_statistics.rs#L89-L88

(I don't really know why it started appearing either)
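A hedged sketch of why the Sorted mode can appear, not the actual partition_statistics.rs setup; the table name, schema, and location below are made up, and the relevant piece is the WITH ORDER clause on the group-by column:

use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // Declare the file ordering up front, as the linked test does with ORDER BY id.
    ctx.sql(
        "CREATE EXTERNAL TABLE t (id INT, c INT) \
         STORED AS CSV WITH ORDER (id ASC) LOCATION './t.csv'",
    )
    .await?;
    // Both group keys are functions of the ordered column `id`, so the
    // partial aggregate can be planned with ordering_mode=Sorted.
    ctx.sql("EXPLAIN SELECT id, 1 + id, COUNT(c) FROM t GROUP BY id, 1 + id")
        .await?
        .show()
        .await?;
    Ok(())
}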

);

let p0_statistics = aggregate_exec_partial.partition_statistics(Some(0))?;