@@ -24,6 +24,11 @@ use std::any::Any;
2424use std:: collections:: HashMap ;
2525use std:: fmt:: { Debug , Formatter } ;
2626use std:: sync:: Arc ;
27+ use std:: time:: Duration ;
28+
29+ use super :: cache_unit:: {
30+ DefaultListFilesCache , DEFAULT_LIST_FILES_CACHE_LIMIT , DEFAULT_LIST_FILES_CACHE_TTL ,
31+ } ;
2732
2833/// A cache for [`Statistics`].
2934///
@@ -42,8 +47,18 @@ pub type FileStatisticsCache =
4247/// especially when done over remote object stores.
4348///
4449/// See [`crate::runtime_env::RuntimeEnv`] for more details
45- pub type ListFilesCache =
46- Arc < dyn CacheAccessor < Path , Arc < Vec < ObjectMeta > > , Extra = ObjectMeta > > ;
50+ pub trait ListFilesCache :
51+ CacheAccessor < Path , Arc < Vec < ObjectMeta > > , Extra = ObjectMeta >
52+ {
53+ // Returns the cache's object limit.
54+ fn cache_limit ( & self ) -> usize ;
55+
56+ // Returns the cache's object ttl.
57+ fn cache_ttl ( & self ) -> Duration ;
58+
59+ // Updates the cache with a new object limit.
60+ fn update_cache_limit ( & self , limit : usize ) ;
61+ }
4762
4863/// Generic file-embedded metadata used with [`FileMetadataCache`].
4964///
@@ -109,7 +124,7 @@ impl Debug for dyn CacheAccessor<Path, Arc<Statistics>, Extra = ObjectMeta> {
109124 }
110125}
111126
112- impl Debug for dyn CacheAccessor < Path , Arc < Vec < ObjectMeta > > , Extra = ObjectMeta > {
127+ impl Debug for dyn ListFilesCache {
113128 fn fmt ( & self , f : & mut Formatter < ' _ > ) -> std:: fmt:: Result {
114129 write ! ( f, "Cache name: {} with length: {}" , self . name( ) , self . len( ) )
115130 }
@@ -131,7 +146,7 @@ impl Debug for dyn FileMetadataCache {
131146#[ derive( Debug ) ]
132147pub struct CacheManager {
133148 file_statistic_cache : Option < FileStatisticsCache > ,
134- list_files_cache : Option < ListFilesCache > ,
149+ list_files_cache : Option < Arc < dyn ListFilesCache > > ,
135150 file_metadata_cache : Arc < dyn FileMetadataCache > ,
136151}
137152
@@ -140,7 +155,17 @@ impl CacheManager {
140155 let file_statistic_cache =
141156 config. table_files_statistics_cache . as_ref ( ) . map ( Arc :: clone) ;
142157
143- let list_files_cache = config. list_files_cache . as_ref ( ) . map ( Arc :: clone) ;
158+ let list_files_cache = config
159+ . list_files_cache
160+ . as_ref ( )
161+ . map ( Arc :: clone)
162+ . unwrap_or_else ( || {
163+ Arc :: new ( DefaultListFilesCache :: new (
164+ // TODO: config
165+ 512 * 1024 ,
166+ Duration :: new ( 600 , 0 ) ,
167+ ) )
168+ } ) ;
144169
145170 let file_metadata_cache = config
146171 . file_metadata_cache
@@ -155,7 +180,7 @@ impl CacheManager {
155180
156181 Ok ( Arc :: new ( CacheManager {
157182 file_statistic_cache,
158- list_files_cache,
183+ list_files_cache : Some ( list_files_cache ) , // TODO: reinstate optionality
159184 file_metadata_cache,
160185 } ) )
161186 }
@@ -166,10 +191,24 @@ impl CacheManager {
166191 }
167192
168193 /// Get the cache for storing the result of listing [`ObjectMeta`]s under the same path.
169- pub fn get_list_files_cache ( & self ) -> Option < ListFilesCache > {
194+ pub fn get_list_files_cache ( & self ) -> Option < Arc < dyn ListFilesCache > > {
170195 self . list_files_cache . clone ( )
171196 }
172197
198+ /// Get the object limit of the list files cache.
199+ pub fn get_list_files_cache_limit ( & self ) -> usize {
200+ self . list_files_cache
201+ . as_ref ( )
202+ . map_or ( DEFAULT_LIST_FILES_CACHE_LIMIT , |c| c. cache_limit ( ) )
203+ }
204+
205+ /// Get the TTL (time-to-live) of entries in the list files cache.
206+ pub fn get_list_files_cache_ttl ( & self ) -> Duration {
207+ self . list_files_cache
208+ . as_ref ( )
209+ . map_or ( DEFAULT_LIST_FILES_CACHE_TTL , |c| c. cache_ttl ( ) )
210+ }
211+
173212 /// Get the file embedded metadata cache.
174213 pub fn get_file_metadata_cache ( & self ) -> Arc < dyn FileMetadataCache > {
175214 Arc :: clone ( & self . file_metadata_cache )
@@ -189,13 +228,20 @@ pub struct CacheManagerConfig {
189228 /// Avoid getting the same file statistics repeatedly in the same DataFusion session.
190229 /// Default is disabled. For now only supports Parquet files.
191230 pub table_files_statistics_cache : Option < FileStatisticsCache > ,
192- /// Enable cache of file metadata when listing files.
193- /// This setting avoids listing file meta of the same path repeatedly
194- /// in same session, which may be expensive in certain situations (e.g. remote object storage).
231+ /// Enable caching of file metadata when listing files.
232+ /// Enabling the cache avoids repeated list and metadata fetch operations, which may be expensive
233+ /// in certain situations (e.g. remote object storage), for objects under paths that are
234+ /// cached.
195235 /// Note that if this option is enabled, DataFusion will not see any updates to the underlying
196- /// location.
197- /// Default is disable.
198- pub list_files_cache : Option < ListFilesCache > ,
236+ /// storage for at least `list_files_cache_ttl` duration.
237+ /// Default is disabled.
238+ pub list_files_cache : Option < Arc < dyn ListFilesCache > > ,
239+ /// Limit the number of objects to keep in the `list_files_cache`. Default: ~125k objects
240+ pub list_files_cache_limit : usize ,
241+ /// The duration the list files cache will consider an entry valid after insertion. Note that
242+ /// changes to the underlying storage system, such as adding or removing data, will not be
243+ /// visible until an entry expires. Default: 10 minutes.
244+ pub list_files_cache_ttl : Duration ,
199245 /// Cache of file-embedded metadata, used to avoid reading it multiple times when processing a
200246 /// data file (e.g., Parquet footer and page metadata).
201247 /// If not provided, the [`CacheManager`] will create a [`DefaultFilesMetadataCache`].
@@ -209,6 +255,8 @@ impl Default for CacheManagerConfig {
209255 Self {
210256 table_files_statistics_cache : Default :: default ( ) ,
211257 list_files_cache : Default :: default ( ) ,
258+ list_files_cache_limit : DEFAULT_LIST_FILES_CACHE_LIMIT ,
259+ list_files_cache_ttl : DEFAULT_LIST_FILES_CACHE_TTL ,
212260 file_metadata_cache : Default :: default ( ) ,
213261 metadata_cache_limit : DEFAULT_METADATA_CACHE_LIMIT ,
214262 }
@@ -228,13 +276,30 @@ impl CacheManagerConfig {
228276 }
229277
230278 /// Set the cache for listing files.
231- ///
279+ ///
232280 /// Default is `None` (disabled).
233- pub fn with_list_files_cache ( mut self , cache : Option < ListFilesCache > ) -> Self {
281+ pub fn with_list_files_cache (
282+ mut self ,
283+ cache : Option < Arc < dyn ListFilesCache > > ,
284+ ) -> Self {
234285 self . list_files_cache = cache;
235286 self
236287 }
237288
289+ pub fn with_list_files_cache_limit ( mut self , limit : usize ) -> Self {
290+ self . list_files_cache_limit = limit;
291+ self
292+ }
293+
294+ pub fn with_list_files_cache_ttl ( mut self , ttl : Duration ) -> Self {
295+ self . list_files_cache_ttl = ttl;
296+ if ttl. is_zero ( ) {
297+ self . list_files_cache = None
298+ }
299+
300+ self
301+ }
302+
238303 /// Sets the cache for file-embedded metadata.
239304 ///
240305 /// Default is a [`DefaultFilesMetadataCache`].
0 commit comments