Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions datafusion/catalog-listing/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ pub fn split_files(
chunks
}

#[derive(Debug)]
pub struct Partition {
/// The path to the partition, including the table prefix
path: Path,
Expand Down Expand Up @@ -247,7 +248,12 @@ async fn prune_partitions(
partition_cols: &[(String, DataType)],
) -> Result<Vec<Partition>> {
if filters.is_empty() {
return Ok(partitions);
// prune partitions which don't contain the partition columns
return Ok(partitions.into_iter().filter(|p| {
let cols = partition_cols.iter().map(|x| x.0.as_str());
!parse_partitions_for_path(table_path, &p.path, cols)
.unwrap_or_default().is_empty()
}).collect());
}

let mut builders: Vec<_> = (0..partition_cols.len())
Expand Down Expand Up @@ -458,6 +464,7 @@ pub async fn pruned_partition_list<'a>(
}

let partition_prefix = evaluate_partition_prefix(partition_cols, filters);

let partitions =
list_partitions(store, table_path, partition_cols.len(), partition_prefix)
.await?;
Expand Down Expand Up @@ -528,12 +535,12 @@ where
let subpath = table_path.strip_prefix(file_path)?;

let mut part_values = vec![];
for (part, pn) in subpath.zip(table_partition_cols) {
for (part, expected_partition) in subpath.zip(table_partition_cols) {
match part.split_once('=') {
Some((name, val)) if name == pn => part_values.push(val),
Some((name, val)) if name == expected_partition => part_values.push(val),
_ => {
debug!(
"Ignoring file: file_path='{file_path}', table_path='{table_path}', part='{part}', partition_col='{pn}'",
"Ignoring file: file_path='{file_path}', table_path='{table_path}', part='{part}', partition_col='{expected_partition}'",
);
return None;
}
Expand Down Expand Up @@ -649,6 +656,8 @@ mod tests {
("tablepath/mypartition=val1/notparquetfile", 100),
("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0),
("tablepath/file.parquet", 100),
("tablepath/notapartition/file.parquet", 100),
("tablepath/notmypartition=val1/file.parquet", 100),
]);
let filter = Expr::eq(col("mypartition"), lit("val1"));
let pruned = pruned_partition_list(
Expand All @@ -674,6 +683,8 @@ mod tests {
("tablepath/mypartition=val2/file.parquet", 100),
("tablepath/mypartition=val1/ignoresemptyfile.parquet", 0),
("tablepath/mypartition=val1/other=val3/file.parquet", 100),
("tablepath/notapartition/file.parquet", 100),
("tablepath/notmypartition=val1/file.parquet", 100),
]);
let filter = Expr::eq(col("mypartition"), lit("val1"));
let pruned = pruned_partition_list(
Expand Down
43 changes: 43 additions & 0 deletions datafusion/core/src/datasource/listing/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2801,6 +2801,48 @@ mod tests {
Ok(())
}

/// A listing table declared with the partition column `pid` must only list
/// files that live under a `pid=<value>` directory; a file stored under any
/// other directory of the table root must be left out of the scan.
#[tokio::test]
async fn test_listing_table_prunes_extra_files_in_hive() -> Result<()> {
    // `file5` is the odd one out: `other/` is not a `pid=...` directory.
    let store_paths = vec![
        "bucket/test/pid=1/file1",
        "bucket/test/pid=1/file2",
        "bucket/test/pid=2/file3",
        "bucket/test/pid=2/file4",
        "bucket/test/other/file5",
    ];

    // Register every path in the test object store with a size of 10.
    let ctx = SessionContext::new();
    let sized_paths: Vec<_> = store_paths.iter().map(|path| (*path, 10)).collect();
    register_test_store(&ctx, &sized_paths);

    // Empty file extension so the extension-less test files are accepted.
    let partition_cols = vec![("pid".to_string(), DataType::Int32)];
    let listing_options = ListingOptions::new(Arc::new(JsonFormat::default()))
        .with_file_extension_opt(Some(""))
        .with_table_partition_cols(partition_cols);

    let file_schema =
        Arc::new(Schema::new(vec![Field::new("a", DataType::Boolean, false)]));
    let table_url = ListingTableUrl::parse("test:///bucket/test/").unwrap();
    let table = ListingTable::try_new(
        ListingTableConfig::new(table_url)
            .with_listing_options(listing_options)
            .with_schema(file_schema),
    )?;

    // No filters and no limit: pruning has to happen on path layout alone.
    let (file_groups, _) = table
        .list_files_for_scan(&ctx.state(), &[], &[], None)
        .await?;
    assert_eq!(file_groups.len(), 1);

    let listed_paths: Vec<String> = file_groups[0]
        .iter()
        .map(|file| file.path().to_string())
        .collect();
    let expected_paths = vec![
        "bucket/test/pid=1/file1",
        "bucket/test/pid=1/file2",
        "bucket/test/pid=2/file3",
        "bucket/test/pid=2/file4",
    ];
    assert_eq!(listed_paths, expected_paths);

    Ok(())
}

#[cfg(feature = "parquet")]
#[tokio::test]
async fn test_table_stats_behaviors() -> Result<()> {
Expand All @@ -2819,6 +2861,7 @@ mod tests {
let config_default = ListingTableConfig::new(table_path.clone())
.with_listing_options(opt_default)
.with_schema(schema_default);

let table_default = ListingTable::try_new(config_default)?;

let exec_default = table_default.scan(&state, None, &[], None).await?;
Expand Down