Skip to content

Commit 2eb5c10

Browse files
authored
Revert "Fix omitted predicate in parquet reading (#49)" (#53)
This reverts commit 12a02c3.
1 parent 60ca421 commit 2eb5c10

2 files changed

Lines changed: 8 additions & 89 deletions

File tree

datafusion/core/src/datasource/physical_plan/parquet/mod.rs

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -2091,33 +2091,6 @@ mod tests {
20912091
Ok(())
20922092
}
20932093

2094-
#[tokio::test]
2095-
async fn test_predicate_no_pushdown_parquet() -> Result<()> {
2096-
let tmp_dir = TempDir::new()?;
2097-
let path = tmp_dir.path().to_str().unwrap().to_string() + "/test.parquet";
2098-
write_file(&path);
2099-
let ctx = SessionContext::new();
2100-
let opt = ListingOptions::new(Arc::new(ParquetFormat::default()));
2101-
ctx.register_listing_table("base_table", path, opt, None, None)
2102-
.await
2103-
.unwrap();
2104-
let sql = "
2105-
with tmp as (
2106-
select *, 's' flag from base_table)
2107-
select * from tmp where flag = 'w';
2108-
";
2109-
let batch = ctx.sql(sql).await.unwrap().collect().await.unwrap();
2110-
assert_eq!(batch.len(), 1);
2111-
let expected = [
2112-
"+--------+----+------+------+",
2113-
"| struct | id | name | flag |",
2114-
"+--------+----+------+------+",
2115-
"+--------+----+------+------+",
2116-
];
2117-
crate::assert_batches_eq!(expected, &batch);
2118-
Ok(())
2119-
}
2120-
21212094
#[tokio::test]
21222095
async fn test_struct_filter_parquet_with_view_types() -> Result<()> {
21232096
let tmp_dir = TempDir::new().unwrap();

datafusion/core/src/datasource/physical_plan/parquet/opener.rs

Lines changed: 8 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,9 @@ use crate::datasource::physical_plan::{
2828
};
2929
use crate::datasource::schema_adapter::SchemaAdapterFactory;
3030
use crate::physical_optimizer::pruning::PruningPredicate;
31-
use arrow_schema::{ArrowError, Schema, SchemaRef};
31+
use arrow_schema::{ArrowError, SchemaRef};
3232
use datafusion_common::{exec_err, Result};
33-
use datafusion_physical_expr::expressions::Column;
3433
use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
35-
use datafusion_physical_plan::filter::batch_filter;
3634
use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
3735
use futures::{StreamExt, TryStreamExt};
3836
use log::debug;
@@ -127,9 +125,7 @@ impl FileOpener for ParquetOpener {
127125
);
128126

129127
// Filter pushdown: evaluate predicates during scan
130-
if let Some(predicate) =
131-
pushdown_filters.then_some(predicate.clone()).flatten()
132-
{
128+
if let Some(predicate) = pushdown_filters.then_some(predicate).flatten() {
133129
let row_filter = row_filter::build_row_filter(
134130
&predicate,
135131
&file_schema,
@@ -157,7 +153,7 @@ impl FileOpener for ParquetOpener {
157153
// Determine which row groups to actually read. The idea is to skip
158154
// as many row groups as possible based on the metadata and query
159155
let file_metadata = builder.metadata().clone();
160-
let pruning_predicate = pruning_predicate.as_ref().map(|p| p.as_ref());
156+
let predicate = pruning_predicate.as_ref().map(|p| p.as_ref());
161157
let rg_metadata = file_metadata.row_groups();
162158
// track which row groups to actually read
163159
let access_plan =
@@ -168,12 +164,12 @@ impl FileOpener for ParquetOpener {
168164
row_groups.prune_by_range(rg_metadata, range);
169165
}
170166
// If there is a predicate that can be evaluated against the metadata
171-
if let Some(pruning_predicate) = pruning_predicate.as_ref() {
167+
if let Some(predicate) = predicate.as_ref() {
172168
row_groups.prune_by_statistics(
173169
&file_schema,
174170
builder.parquet_schema(),
175171
rg_metadata,
176-
pruning_predicate,
172+
predicate,
177173
&file_metrics,
178174
);
179175

@@ -182,7 +178,7 @@ impl FileOpener for ParquetOpener {
182178
.prune_by_bloom_filters(
183179
&file_schema,
184180
&mut builder,
185-
pruning_predicate,
181+
predicate,
186182
&file_metrics,
187183
)
188184
.await;
@@ -226,21 +222,8 @@ impl FileOpener for ParquetOpener {
226222
let adapted = stream
227223
.map_err(|e| ArrowError::ExternalError(Box::new(e)))
228224
.map(move |maybe_batch| {
229-
maybe_batch.and_then(|b| {
230-
if !pushdown_filters {
231-
if let Some(predicate) = predicate.clone() {
232-
let updated_predicate = update_predicate_index(
233-
Arc::clone(&predicate),
234-
b.schema_ref(),
235-
)?;
236-
237-
let b = batch_filter(&b, &updated_predicate)?;
238-
239-
return schema_mapping.map_batch(b).map_err(Into::into);
240-
}
241-
}
242-
schema_mapping.map_batch(b).map_err(Into::into)
243-
})
225+
maybe_batch
226+
.and_then(|b| schema_mapping.map_batch(b).map_err(Into::into))
244227
});
245228

246229
Ok(adapted.boxed())
@@ -280,40 +263,3 @@ fn create_initial_plan(
280263
// default to scanning all row groups
281264
Ok(ParquetAccessPlan::new_all(row_group_count))
282265
}
283-
284-
/// Return the predicate with updated column indexes
285-
///
286-
/// The indexes of Column PhysicalExpr in predicate are based on
287-
/// file_schema / table_schema, which might be different from the
288-
/// RecordBatch schema returned by Parquet Reader
289-
///
290-
/// Recursively find all Column PhysicalExpr
291-
/// and update with indexes of RecordBatch Schema
292-
///
293-
/// Returns an error if failed to update Column index
294-
fn update_predicate_index(
295-
predicate: Arc<dyn PhysicalExpr>,
296-
schema: &Arc<Schema>,
297-
) -> std::result::Result<Arc<dyn PhysicalExpr>, ArrowError> {
298-
let children = predicate.children();
299-
300-
if children.len() == 0 {
301-
if let Some(column) = predicate.as_any().downcast_ref::<Column>() {
302-
let name = column.name();
303-
let new_index = schema.index_of(name)?;
304-
let new_column = Column::new(name, new_index);
305-
return Ok(Arc::new(new_column));
306-
}
307-
return Ok(Arc::clone(&predicate));
308-
}
309-
310-
let mut new_children: Vec<Arc<dyn PhysicalExpr>> = Vec::new();
311-
for child in children {
312-
let updated_child = update_predicate_index(Arc::clone(child), schema)?;
313-
new_children.push(updated_child);
314-
}
315-
316-
predicate
317-
.with_new_children(new_children)
318-
.map_err(Into::into)
319-
}

0 commit comments

Comments
 (0)