5 changes: 5 additions & 0 deletions datafusion/catalog-listing/src/table.rs
@@ -578,6 +578,11 @@ impl TableProvider for ListingTable {
let keep_partition_by_columns =
state.config_options().execution.keep_partition_by_columns;

// Invalidate cache entries for this table if they exist
if let Some(lfc) = state.runtime_env().cache_manager.get_list_files_cache() {
let _ = lfc.remove(table_path.prefix());
}
Comment on lines +581 to +584
Contributor Author

For the initial implementation I chose the simple option of invalidating a table's cache entries on INSERT. Since the default TTL is infinite, any DataSink that only supports writing new files (as opposed to appending) would never pick up inserts without some action being taken here.

It would be very nice to handle this more elegantly in the future.
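The interaction described above can be sketched with a minimal, std-only stand-in for the list-files cache (names here are illustrative, not DataFusion's API): a query populates an entry on its first LIST, and an INSERT removes that entry so the next query re-lists and observes newly written files.

```rust
use std::collections::HashMap;

// Illustrative stand-in for the list-files cache: maps a table's path
// prefix to its cached file listing.
struct ListCache {
    entries: HashMap<String, Vec<String>>,
}

impl ListCache {
    fn new() -> Self {
        Self { entries: HashMap::new() }
    }

    // A query populates the cache after its first LIST request.
    fn put(&mut self, prefix: &str, files: Vec<String>) {
        self.entries.insert(prefix.to_string(), files);
    }

    fn get(&self, prefix: &str) -> Option<&Vec<String>> {
        self.entries.get(prefix)
    }

    // INSERT invalidates the table's entry; with an infinite default TTL,
    // this is the only way newly written files become visible to later
    // queries in this sketch.
    fn remove(&mut self, prefix: &str) -> Option<Vec<String>> {
        self.entries.remove(prefix)
    }
}
```

With this model, skipping the `remove` on INSERT would leave the stale listing cached forever, which is exactly the failure mode the comment describes.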

Contributor

makes sense. Thank you


// Sink related option, apart from format
let config = FileSinkConfig {
original_url: String::default(),
53 changes: 53 additions & 0 deletions datafusion/core/tests/datasource/object_store_access.rs
@@ -98,6 +98,59 @@ async fn create_multi_file_csv_file() {
);
}

#[tokio::test]
async fn multi_query_multi_file_csv_file() {
let test = Test::new().with_multi_file_csv().await;
assert_snapshot!(
test.query("select * from csv_table").await,
@r"
------- Query Output (6 rows) -------
+---------+-------+-------+
| c1 | c2 | c3 |
+---------+-------+-------+
| 0.0 | 0.0 | true |
| 0.00003 | 5e-12 | false |
| 0.00001 | 1e-12 | true |
| 0.00003 | 5e-12 | false |
| 0.00002 | 2e-12 | true |
| 0.00003 | 5e-12 | false |
+---------+-------+-------+
------- Object Store Request Summary -------
RequestCountingObjectStore()
Total Requests: 4
- LIST prefix=data
- GET (opts) path=data/file_0.csv
- GET (opts) path=data/file_1.csv
- GET (opts) path=data/file_2.csv
"
);

// the second query should re-use the cached LIST results and should not reissue LIST
assert_snapshot!(
test.query("select * from csv_table").await,
@r"
------- Query Output (6 rows) -------
+---------+-------+-------+
| c1 | c2 | c3 |
+---------+-------+-------+
| 0.0 | 0.0 | true |
| 0.00003 | 5e-12 | false |
| 0.00001 | 1e-12 | true |
| 0.00003 | 5e-12 | false |
| 0.00002 | 2e-12 | true |
| 0.00003 | 5e-12 | false |
+---------+-------+-------+
------- Object Store Request Summary -------
RequestCountingObjectStore()
Total Requests: 4
Contributor

@BlakeOrth this seems wrong to me -- now that we have a ListFilesCache, I would expect this query to NOT need to do a LIST command 🤔

Contributor Author

With the current state of this PR I think this result is actually expected. I chose to remove the code that enables the cache by default, as well as this test (both were in the POC for illustration purposes). Since the cache currently cannot be configured, and specifically cannot be disabled, I thought it best to leave it disabled by default to avoid breaking users.

Additionally, quite a few of the sqllogictest suites rely on the COPY functionality and the existing expectation that DataFusion will pick up changes on query. The easiest way to handle those tests would be to disable this cache for the duration of the tests, but that again relies on the cache having some configuration.

All that being said, I'm more than happy to move forward on this effort in whatever way you think is best! I can add a very simple configuration option (and docs etc.) to enable/disable this cache so we can keep this test you've written and have it show a reduction in LIST operations, or we can leave that as a portion of the "configuration issue" associated with this cache.

Contributor

I think it makes sense to do additional work in a follow-on PR -- I see there is already a ticket tracking the work to turn it on by default.

- LIST prefix=data
- GET (opts) path=data/file_0.csv
- GET (opts) path=data/file_1.csv
- GET (opts) path=data/file_2.csv
"
);
}

#[tokio::test]
async fn query_multi_csv_file() {
let test = Test::new().with_multi_file_csv().await;
5 changes: 2 additions & 3 deletions datafusion/core/tests/parquet/file_statistics.rs
@@ -30,9 +30,8 @@ use datafusion::prelude::SessionContext;
use datafusion_common::stats::Precision;
use datafusion_common::DFSchema;
use datafusion_execution::cache::cache_manager::CacheManagerConfig;
use datafusion_execution::cache::cache_unit::{
DefaultFileStatisticsCache, DefaultListFilesCache,
};
use datafusion_execution::cache::cache_unit::DefaultFileStatisticsCache;
use datafusion_execution::cache::DefaultListFilesCache;
use datafusion_execution::config::SessionConfig;
use datafusion_execution::runtime_env::RuntimeEnvBuilder;
use datafusion_expr::{col, lit, Expr};
87 changes: 70 additions & 17 deletions datafusion/execution/src/cache/cache_manager.rs
@@ -24,6 +24,9 @@ use std::any::Any;
use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::sync::Arc;
use std::time::Duration;

use super::list_files_cache::DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT;

/// A cache for [`Statistics`].
///
@@ -41,9 +44,19 @@ pub type FileStatisticsCache =
/// command on the local filesystem. This operation can be expensive,
/// especially when done over remote object stores.
///
/// See [`crate::runtime_env::RuntimeEnv`] for more details
pub type ListFilesCache =
Arc<dyn CacheAccessor<Path, Arc<Vec<ObjectMeta>>, Extra = ObjectMeta>>;
/// See [`crate::runtime_env::RuntimeEnv`] for more details.
pub trait ListFilesCache:
Contributor

I think one of the things I would like to do in the CacheManager caches is to segregate the cache eviction policy. Personally, I think the user should be given an option for the eviction behaviour they want. wdyt @alamb @BlakeOrth? I can work on getting a draft out this weekend.

Contributor Author

I think this is a reasonable suggestion, and it looks like there may be some ongoing discussion around this topic here:

While that work would undoubtedly impact this (and other) default cache implementations, I think it should probably be discussed separately to this effort.

Contributor

I am personally in favor of keeping what is in DataFusion as simple to use / understand as possible (which I think this and the metadata cache do)

In terms of customizable eviction strategy, as we have mentioned elsewhere that is already possible today, but it requires effectively copying/forking the entire cache implementation which adds to the maintenance burden of downstream projects

However, adding more APIs to DataFusion increases the maintenance burden on the core project

So I see customizable eviction strategies as a tradeoff. If there are multiple users who are likely to use a customizable eviction strategy, I agree it makes sense to put it in the core repo. If there are not that many, I think it would be better to keep DataFusion simpler and move the burden downstream for those users who need it.

Contributor Author

@alamb After reading your thoughts on this I'm wondering if a more complex cache infrastructure would be a good project to start in https://github.com/datafusion-contrib and if it gains enough traction perhaps it could be promoted to the core repo?

Contributor

@alamb alamb Nov 21, 2025


Yes, that sounds like a great idea to me -- i think there is a wide wonderful world of complex caching strategies that people might want

Contributor

A bit of a side topic, @alamb: I was looking into the other caches and didn't understand why DefaultFileStatisticsCache isn't memory-bound yet. Do we want to do that incrementally?

Contributor

I am not sure what you are asking -- do you mean metadata_cache_limit on https://datafusion.apache.org/user-guide/configs.html#runtime-configuration-settings ?

Contributor

A little bit side topic. @alamb was looking into other caches, didn't understand why DefaultFileStatisticsCache isn't memory bound yet. Do we want to do it incrementally?

I have filed a ticket for this item:

CacheAccessor<Path, Arc<Vec<ObjectMeta>>, Extra = ObjectMeta>
{
/// Returns the cache's memory limit in bytes.
fn cache_limit(&self) -> usize;

/// Returns the TTL (time-to-live) for cache entries, if configured.
fn cache_ttl(&self) -> Option<Duration>;

/// Updates the cache with a new memory limit in bytes.
fn update_cache_limit(&self, limit: usize);
}
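Read together, `cache_limit` and `cache_ttl` bound the cache by total memory and by entry age. A minimal std-only sketch of those semantics (illustrative only, with assumed names; the real implementation lives in the new `list_files_cache` module and must also satisfy `CacheAccessor`):

```rust
use std::collections::HashMap;
use std::time::{Duration, Instant};

// Illustrative TTL- and memory-bounded cache mirroring the trait's
// semantics: entries expire `ttl` after insertion (None = infinite),
// and inserts are rejected once the byte budget is exhausted.
struct BoundedCache {
    limit: usize,
    ttl: Option<Duration>,
    used: usize,
    entries: HashMap<String, (Instant, Vec<u8>)>,
}

impl BoundedCache {
    fn new(limit: usize, ttl: Option<Duration>) -> Self {
        Self { limit, ttl, used: 0, entries: HashMap::new() }
    }

    // Insert only while the value fits in the remaining byte budget.
    // (Replacing an existing key is not handled in this sketch.)
    fn put(&mut self, key: &str, value: Vec<u8>) -> bool {
        if self.used + value.len() > self.limit {
            return false;
        }
        self.used += value.len();
        self.entries.insert(key.to_string(), (Instant::now(), value));
        true
    }

    // A hit is returned only while the entry is within its TTL.
    fn get(&self, key: &str) -> Option<&Vec<u8>> {
        let (inserted, value) = self.entries.get(key)?;
        match self.ttl {
            Some(ttl) if inserted.elapsed() > ttl => None, // expired
            _ => Some(value),
        }
    }
}
```

This lazy-expiry style (checking the TTL on read rather than evicting in the background) is one plausible design; the PR's actual eviction behaviour is the subject of the discussion above.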

/// Generic file-embedded metadata used with [`FileMetadataCache`].
///
@@ -109,7 +122,7 @@ impl Debug for dyn CacheAccessor<Path, Arc<Statistics>, Extra = ObjectMeta> {
}
}

impl Debug for dyn CacheAccessor<Path, Arc<Vec<ObjectMeta>>, Extra = ObjectMeta> {
impl Debug for dyn ListFilesCache {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(f, "Cache name: {} with length: {}", self.name(), self.len())
}
@@ -131,7 +144,7 @@ impl Debug for dyn FileMetadataCache {
#[derive(Debug)]
pub struct CacheManager {
file_statistic_cache: Option<FileStatisticsCache>,
list_files_cache: Option<ListFilesCache>,
list_files_cache: Option<Arc<dyn ListFilesCache>>,
file_metadata_cache: Arc<dyn FileMetadataCache>,
}

@@ -166,10 +179,22 @@ impl CacheManager {
}

/// Get the cache for storing the result of listing [`ObjectMeta`]s under the same path.
pub fn get_list_files_cache(&self) -> Option<ListFilesCache> {
pub fn get_list_files_cache(&self) -> Option<Arc<dyn ListFilesCache>> {
self.list_files_cache.clone()
}

/// Get the memory limit of the list files cache.
pub fn get_list_files_cache_limit(&self) -> usize {
self.list_files_cache
.as_ref()
.map_or(DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT, |c| c.cache_limit())
}

/// Get the TTL (time-to-live) of the list files cache.
pub fn get_list_files_cache_ttl(&self) -> Option<Duration> {
self.list_files_cache.as_ref().and_then(|c| c.cache_ttl())
}

/// Get the file embedded metadata cache.
pub fn get_file_metadata_cache(&self) -> Arc<dyn FileMetadataCache> {
Arc::clone(&self.file_metadata_cache)
@@ -185,17 +210,24 @@ pub const DEFAULT_METADATA_CACHE_LIMIT: usize = 50 * 1024 * 1024; // 50M

#[derive(Clone)]
pub struct CacheManagerConfig {
/// Enable cache of files statistics when listing files.
/// Avoid get same file statistics repeatedly in same datafusion session.
/// Default is disable. Fow now only supports Parquet files.
/// Enable caching of file statistics when listing files.
/// Enabling the cache avoids repeatedly reading file statistics in a DataFusion session.
/// Default is disabled. Currently only Parquet files are supported.
pub table_files_statistics_cache: Option<FileStatisticsCache>,
/// Enable cache of file metadata when listing files.
/// This setting avoids listing file meta of the same path repeatedly
/// in same session, which may be expensive in certain situations (e.g. remote object storage).
/// Enable caching of file metadata when listing files.
/// Enabling the cache avoids repeat list and object metadata fetch operations, which may be
/// expensive in certain situations (e.g. remote object storage), for objects under paths that
/// are cached.
/// Note that if this option is enabled, DataFusion will not see any updates to the underlying
/// location.
/// Default is disable.
pub list_files_cache: Option<ListFilesCache>,
/// storage for at least `list_files_cache_ttl` duration.
/// Default is disabled.
pub list_files_cache: Option<Arc<dyn ListFilesCache>>,
/// Limit of the `list_files_cache`, in bytes. Default: 1MiB.
pub list_files_cache_limit: usize,
Contributor

I found it a little strange that the cache size is set on the cache manager itself rather than an options struct -- though now I see it is consistent with how metadata_cache_limit works

Contributor Author

Yes, it does seem like this API could be easier to use. It seems to me that this would be something that we might want to do as future work and fix up both the metadata and list cache at the same time since this PR has already grown to a pretty sizeable amount of code.

Contributor

I see this setup is used to support runtime configuration of the cache. See for example

"metadata_cache_limit" => {
let limit = Self::parse_memory_limit(value)?;
builder.with_metadata_cache_limit(limit)
}

I think what we should do (as a follow-on PR) is add runtime configuration settings for the max cache size and its TTL in https://datafusion.apache.org/user-guide/configs.html#runtime-configuration-settings

I filed a ticket to do so:
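If such a runtime setting were added, the string value (e.g. "5m") would need parsing into a `Duration`, analogous to how `parse_memory_limit` handles `metadata_cache_limit` in the snippet above. A hypothetical, std-only parser sketch — the setting name and the suffix grammar are assumptions for illustration, not an existing DataFusion API:

```rust
use std::time::Duration;

// Hypothetical parser for a TTL setting such as "30s", "5m", or "1h".
// The accepted suffixes are an assumption, not DataFusion's grammar.
fn parse_ttl(value: &str) -> Result<Duration, String> {
    let value = value.trim();
    if value.len() < 2 {
        return Err(format!("invalid TTL: {value}"));
    }
    // Split the trailing unit character from the numeric prefix.
    let (num, unit) = value.split_at(value.len() - 1);
    let n: u64 = num
        .parse()
        .map_err(|_| format!("invalid TTL number: {value}"))?;
    match unit {
        "s" => Ok(Duration::from_secs(n)),
        "m" => Ok(Duration::from_secs(n * 60)),
        "h" => Ok(Duration::from_secs(n * 3600)),
        _ => Err(format!("unknown TTL unit: {value}")),
    }
}
```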

/// The duration the list files cache will consider an entry valid after insertion. Note that
/// changes to the underlying storage system, such as adding or removing data, will not be
/// visible until an entry expires. Default: None (infinite).
pub list_files_cache_ttl: Option<Duration>,
/// Cache of file-embedded metadata, used to avoid reading it multiple times when processing a
/// data file (e.g., Parquet footer and page metadata).
/// If not provided, the [`CacheManager`] will create a [`DefaultFilesMetadataCache`].
@@ -209,6 +241,8 @@ impl Default for CacheManagerConfig {
Self {
table_files_statistics_cache: Default::default(),
list_files_cache: Default::default(),
list_files_cache_limit: DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT,
list_files_cache_ttl: None,
file_metadata_cache: Default::default(),
metadata_cache_limit: DEFAULT_METADATA_CACHE_LIMIT,
}
@@ -228,13 +262,32 @@ impl CacheManagerConfig {
}

/// Set the cache for listing files.
///
///
/// Default is `None` (disabled).
pub fn with_list_files_cache(mut self, cache: Option<ListFilesCache>) -> Self {
pub fn with_list_files_cache(
mut self,
cache: Option<Arc<dyn ListFilesCache>>,
) -> Self {
self.list_files_cache = cache;
self
}

/// Sets the limit of the list files cache, in bytes.
///
/// Default: 1MiB (1,048,576 bytes).
pub fn with_list_files_cache_limit(mut self, limit: usize) -> Self {
self.list_files_cache_limit = limit;
self
}

/// Sets the TTL (time-to-live) for entries in the list files cache.
///
/// Default: None (infinite).
pub fn with_list_files_cache_ttl(mut self, ttl: Duration) -> Self {
self.list_files_cache_ttl = Some(ttl);
self
}
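Taken together, the builder methods above would be chained roughly as follows. This is a sketch against the API introduced in this diff only: the import path for `CacheManagerConfig` is assumed, and the snippet is not verified against a released DataFusion version.

```rust
use std::time::Duration;

use datafusion_execution::cache::cache_manager::CacheManagerConfig;

// Sketch: a 10 MiB list-files cache whose entries expire after five
// minutes, so changes in object storage become visible once an entry's
// TTL elapses.
fn build_cache_config() -> CacheManagerConfig {
    CacheManagerConfig::default()
        .with_list_files_cache_limit(10 * 1024 * 1024)
        .with_list_files_cache_ttl(Duration::from_secs(300))
}
```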

/// Sets the cache for file-embedded metadata.
///
/// Default is a [`DefaultFilesMetadataCache`].
89 changes: 1 addition & 88 deletions datafusion/execution/src/cache/cache_unit.rs
@@ -107,71 +107,6 @@ impl CacheAccessor<Path, Arc<Statistics>> for DefaultFileStatisticsCache {
}
}

/// Default implementation of [`ListFilesCache`]
///
/// Collected files metadata for listing files.
///
/// Cache is not invalided until user calls [`Self::remove`] or [`Self::clear`].
///
/// [`ListFilesCache`]: crate::cache::cache_manager::ListFilesCache
#[derive(Default)]
pub struct DefaultListFilesCache {
statistics: DashMap<Path, Arc<Vec<ObjectMeta>>>,
}

impl CacheAccessor<Path, Arc<Vec<ObjectMeta>>> for DefaultListFilesCache {
type Extra = ObjectMeta;

fn get(&self, k: &Path) -> Option<Arc<Vec<ObjectMeta>>> {
self.statistics.get(k).map(|x| Arc::clone(x.value()))
}

fn get_with_extra(
&self,
_k: &Path,
_e: &Self::Extra,
) -> Option<Arc<Vec<ObjectMeta>>> {
panic!("Not supported DefaultListFilesCache get_with_extra")
Contributor

I can't remember what the rationale for this panic was. It seems to have come in via

Maybe @Ted-Jiang or @suremarc have some thoughts here

}

fn put(
&self,
key: &Path,
value: Arc<Vec<ObjectMeta>>,
) -> Option<Arc<Vec<ObjectMeta>>> {
self.statistics.insert(key.clone(), value)
}

fn put_with_extra(
&self,
_key: &Path,
_value: Arc<Vec<ObjectMeta>>,
_e: &Self::Extra,
) -> Option<Arc<Vec<ObjectMeta>>> {
panic!("Not supported DefaultListFilesCache put_with_extra")
}

fn remove(&self, k: &Path) -> Option<Arc<Vec<ObjectMeta>>> {
self.statistics.remove(k).map(|x| x.1)
}

fn contains_key(&self, k: &Path) -> bool {
self.statistics.contains_key(k)
}

fn len(&self) -> usize {
self.statistics.len()
}

fn clear(&self) {
self.statistics.clear()
}

fn name(&self) -> String {
"DefaultListFilesCache".to_string()
}
}

/// Handles the inner state of the [`DefaultFilesMetadataCache`] struct.
struct DefaultFilesMetadataCacheState {
lru_queue: LruQueue<Path, (ObjectMeta, Arc<dyn FileMetadata>)>,
@@ -433,7 +368,7 @@ mod tests {
FileMetadata, FileMetadataCache, FileMetadataCacheEntry,
};
use crate::cache::cache_unit::{
DefaultFileStatisticsCache, DefaultFilesMetadataCache, DefaultListFilesCache,
DefaultFileStatisticsCache, DefaultFilesMetadataCache,
};
use crate::cache::CacheAccessor;
use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
@@ -486,28 +421,6 @@
assert!(cache.get_with_extra(&meta2.location, &meta2).is_none());
}

#[test]
fn test_list_file_cache() {
let meta = ObjectMeta {
location: Path::from("test"),
last_modified: DateTime::parse_from_rfc3339("2022-09-27T22:36:00+02:00")
.unwrap()
.into(),
size: 1024,
e_tag: None,
version: None,
};

let cache = DefaultListFilesCache::default();
assert!(cache.get(&meta.location).is_none());

cache.put(&meta.location, vec![meta.clone()].into());
assert_eq!(
cache.get(&meta.location).unwrap().first().unwrap().clone(),
meta.clone()
);
}

pub struct TestFileMetadata {
metadata: String,
}