From 51002cb0325772adaf46d6f3ce01d41c01b51079 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Nov 2023 14:16:57 +0100 Subject: [PATCH 01/37] lazy data files resolution --- src/datasets/builder.py | 19 ++++++-- src/datasets/data_files.py | 98 ++++++++++++++++++++++++++++++++++++++ src/datasets/load.py | 23 ++++----- 3 files changed, 122 insertions(+), 18 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index a078cb4c2c8..e8de242829f 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -46,7 +46,7 @@ ReadInstruction, ) from .arrow_writer import ArrowWriter, BeamWriter, ParquetWriter, SchemaInferenceError -from .data_files import DataFilesDict, sanitize_patterns +from .data_files import DataFilesDict, DataFilesPatternsDict, sanitize_patterns from .dataset_dict import DatasetDict, IterableDatasetDict from .download.download_config import DownloadConfig from .download.download_manager import DownloadManager, DownloadMode @@ -128,7 +128,7 @@ class BuilderConfig: name: str = "default" version: Optional[Union[utils.Version, str]] = utils.Version("0.0.0") data_dir: Optional[str] = None - data_files: Optional[DataFilesDict] = None + data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None description: Optional[str] = None def __post_init__(self): @@ -139,7 +139,7 @@ def __post_init__(self): f"Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{self.name}'. " f"They could create issues when creating a directory for this config on Windows filesystem." ) - if self.data_files is not None and not isinstance(self.data_files, DataFilesDict): + if self.data_files is not None and not isinstance(self.data_files, (DataFilesDict, DataFilesPatternsDict)): raise ValueError(f"Expected a DataFilesDict in data_files but got {self.data_files}") def __eq__(self, o): @@ -213,6 +213,11 @@ def create_config_id( else: return self.name + def _resolve_data_files(self, base_path: str, download_config: DownloadConfig) -> None: + if self.data_files is not None and isinstance(self.data_files, DataFilesPatternsDict): + base_path = base_path + "/" + self.data_dir if self.data_dir else base_path + self.data_files = self.data_files.resolve(base_path, download_config) + class DatasetBuilder: """Abstract base class for all datasets. 
@@ -555,7 +560,7 @@ def _create_builder_config( # otherwise use the config_kwargs to overwrite the attributes else: - builder_config = copy.deepcopy(builder_config) + builder_config = copy.deepcopy(builder_config) if config_kwargs else builder_config for key, value in config_kwargs.items(): if value is not None: if not hasattr(builder_config, key): @@ -565,6 +570,12 @@ def _create_builder_config( if not builder_config.name: raise ValueError(f"BuilderConfig must have a name, got {builder_config.name}") + # resolve data files if needed + builder_config._resolve_data_files( + base_path=self.base_path, + download_config=DownloadConfig(token=self.token, storage_options=self.storage_options), + ) + # compute the config id that is going to be used for caching config_id = builder_config.create_config_id( config_kwargs, diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index f318e893738..02aa594ce42 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -340,6 +340,15 @@ def resolve_pattern( base_path = os.path.splitdrive(pattern)[0] + os.sep else: base_path = "" + if pattern.startswith(config.HF_ENDPOINT) and "/resolve/" in pattern: + pattern = "hf://" + pattern[len(config.HF_ENDPOINT) + 1 :].replace("/resolve/", "@", 1) + pattern = ( + pattern.split("@", 1)[0] + + "@" + + pattern.split("@", 1)[1].split("/", 1)[0].replace("%2F", "/") + + "/" + + pattern.split("@", 1)[1].split("/", 1)[1] + ) pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config) fs, _, _ = get_fs_token_paths(pattern, storage_options=storage_options) fs_base_path = base_path.split("::")[0].split("://")[-1] or fs.root_marker @@ -698,3 +707,92 @@ def filter_extensions(self, extensions: List[str]) -> "DataFilesDict": for key, data_files_list in self.items(): out[key] = data_files_list.filter_extensions(extensions) return out + + +class DataFilesPatternsList(List[str]): + """ + List of data files patterns (absolute local paths or URLs). + """ + + def __init__( + self, + patterns: List[str], + allowed_extensions: List[Optional[List[str]]], + ): + super().__init__(patterns) + self.allowed_extensions = allowed_extensions + + def __add__(self, other): + return DataFilesList([*self, *other], self.allowed_extensions + other.allowed_extensions) + + @classmethod + def from_patterns( + cls, patterns: List[str], allowed_extensions: Optional[List[str]] = None + ) -> "DataFilesPatternsDict": + return cls(patterns, [allowed_extensions] * len(patterns)) + + def resolve( + self, + base_path: str, + download_config: Optional[DownloadConfig] = None, + ) -> "DataFilesList": + base_path = base_path if base_path is not None else Path().resolve().as_posix() + data_files = [] + for pattern, allowed_extensions in zip(self, self.allowed_extensions): + try: + data_files.extend( + resolve_pattern( + pattern, + base_path=base_path, + allowed_extensions=allowed_extensions, + download_config=download_config, + ) + ) + except FileNotFoundError: + if not has_magic(pattern): + raise + origin_metadata = _get_origin_metadata(data_files, download_config=download_config) + return DataFilesList(data_files, origin_metadata) + + def filter_extensions(self, extensions: List[str]) -> "DataFilesList": + return DataFilesPatternsList( + self, [allowed_extensions + extensions for allowed_extensions in self.allowed_extensions] + ) + + +class DataFilesPatternsDict(Dict[str, DataFilesPatternsList]): + """ + Dict of split_name -> list of data files patterns (absolute local paths or URLs). 
+ """ + + @classmethod + def from_patterns( + cls, patterns: Dict[str, List[str]], allowed_extensions: Optional[List[str]] = None + ) -> "DataFilesPatternsDict": + out = cls() + for key, patterns_for_key in patterns.items(): + out[key] = ( + DataFilesPatternsList.from_patterns( + patterns_for_key, + allowed_extensions=allowed_extensions, + ) + if not isinstance(patterns_for_key, DataFilesPatternsList) + else patterns_for_key + ) + return out + + def resolve( + self, + base_path: str, + download_config: Optional[DownloadConfig] = None, + ) -> "DataFilesDict": + out = DataFilesDict() + for key, data_files_patterns_list in self.items(): + out[key] = data_files_patterns_list.resolve(base_path, download_config) + return out + + def filter_extensions(self, extensions: List[str]) -> "DataFilesPatternsDict": + out = type(self)() + for key, data_files_patterns_list in self.items(): + out[key] = data_files_patterns_list.filter_extensions(extensions) + return out diff --git a/src/datasets/load.py b/src/datasets/load.py index 6893e94242a..f0f3572cf5b 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -40,6 +40,8 @@ DEFAULT_PATTERNS_ALL, DataFilesDict, DataFilesList, + DataFilesPatternsDict, + DataFilesPatternsList, EmptyDatasetError, get_data_patterns, get_metadata_patterns, @@ -606,7 +608,6 @@ def create_builder_configs_from_metadata_configs( supports_metadata: bool, base_path: Optional[str] = None, default_builder_kwargs: Dict[str, Any] = None, - download_config: Optional[DownloadConfig] = None, ) -> Tuple[List[BuilderConfig], str]: builder_cls = import_main_class(module_path) builder_config_cls = builder_cls.BUILDER_CONFIG_CLASS @@ -624,11 +625,9 @@ def create_builder_configs_from_metadata_configs( if config_data_files is not None else get_data_patterns(config_base_path) ) - config_data_files_dict = DataFilesDict.from_patterns( + config_data_files_dict = DataFilesPatternsDict.from_patterns( config_patterns, - base_path=config_base_path, allowed_extensions=ALL_ALLOWED_EXTENSIONS, - download_config=download_config, ) except EmptyDatasetError as e: raise EmptyDatasetError( @@ -641,16 +640,13 @@ def create_builder_configs_from_metadata_configs( except FileNotFoundError: config_metadata_patterns = None if config_metadata_patterns is not None: - config_metadata_data_files_list = DataFilesList.from_patterns( - config_metadata_patterns, base_path=base_path + config_metadata_data_files_list = DataFilesPatternsList.from_patterns(config_metadata_patterns) + config_data_files_dict = DataFilesPatternsDict( + { + split: data_files_list + config_metadata_data_files_list + for split, data_files_list in config_data_files_dict.items() + } ) - if config_metadata_data_files_list: - config_data_files_dict = DataFilesDict( - { - split: data_files_list + config_metadata_data_files_list - for split, data_files_list in config_data_files_dict.items() - } - ) ignored_params = [ param for param in config_params if not hasattr(builder_config_cls, param) and param != "default" ] @@ -1255,7 +1251,6 @@ def get_module(self) -> DatasetModule: base_path=base_path, supports_metadata=supports_metadata, default_builder_kwargs=default_builder_kwargs, - download_config=self.download_config, ) else: builder_configs, default_config_name = None, None From 5a5bb38bcc71ea21f2d7304aab374fdb81ded463 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Nov 2023 16:22:42 +0100 Subject: [PATCH 02/37] fix tests --- src/datasets/builder.py | 4 ++-- src/datasets/data_files.py | 2 +- src/datasets/load.py | 2 +- tests/test_load.py | 
14 +++++++++++++- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index e8de242829f..e24baf611a3 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -51,7 +51,7 @@ from .download.download_config import DownloadConfig from .download.download_manager import DownloadManager, DownloadMode from .download.mock_download_manager import MockDownloadManager -from .download.streaming_download_manager import StreamingDownloadManager, xopen +from .download.streaming_download_manager import StreamingDownloadManager, xjoin, xopen from .features import Features from .filesystems import ( is_remote_filesystem, @@ -215,7 +215,7 @@ def create_config_id( def _resolve_data_files(self, base_path: str, download_config: DownloadConfig) -> None: if self.data_files is not None and isinstance(self.data_files, DataFilesPatternsDict): - base_path = base_path + "/" + self.data_dir if self.data_dir else base_path + base_path = xjoin(base_path, self.data_dir) if self.data_dir else base_path self.data_files = self.data_files.resolve(base_path, download_config) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 02aa594ce42..b7143a2ade5 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -358,7 +358,7 @@ def resolve_pattern( protocol_prefix = protocol + "://" if protocol != "file" else "" matched_paths = [ filepath if filepath.startswith(protocol_prefix) else protocol_prefix + filepath - for filepath, info in fs.glob(pattern, detail=True).items() + for filepath, info in fs.glob(pattern, detail=True, expand_info=False).items() if info["type"] == "file" and (xbasename(filepath) not in files_to_ignore) and not _is_inside_unrequested_special_dir( diff --git a/src/datasets/load.py b/src/datasets/load.py index f0f3572cf5b..2e307d113c2 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -618,7 +618,7 @@ def create_builder_configs_from_metadata_configs( for config_name, config_params in metadata_configs.items(): config_data_files = config_params.get("data_files") config_data_dir = config_params.get("data_dir") - config_base_path = base_path + "/" + config_data_dir if config_data_dir else base_path + config_base_path = xjoin(base_path, config_data_dir) if config_data_dir else base_path try: config_patterns = ( sanitize_patterns(config_data_files) diff --git a/tests/test_load.py b/tests/test_load.py index 48ed31e0476..11fd99b09b1 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -21,7 +21,7 @@ from datasets.arrow_writer import ArrowWriter from datasets.builder import DatasetBuilder from datasets.config import METADATA_CONFIGS_FIELD -from datasets.data_files import DataFilesDict +from datasets.data_files import DataFilesDict, DataFilesPatternsDict from datasets.dataset_dict import DatasetDict, IterableDatasetDict from datasets.download.download_config import DownloadConfig from datasets.exceptions import DatasetNotFoundError @@ -515,6 +515,8 @@ def test_LocalDatasetModuleFactoryWithoutScript_with_single_config_in_metadata(s assert isinstance(module_builder_configs[0], ImageFolderConfig) assert module_builder_configs[0].name == "custom" assert module_builder_configs[0].data_files is not None + assert isinstance(module_builder_configs[0].data_files, DataFilesPatternsDict) + module_builder_configs[0]._resolve_data_files(self._data_dir_with_single_config_in_metadata, DownloadConfig()) assert isinstance(module_builder_configs[0].data_files, DataFilesDict) assert 
len(module_builder_configs[0].data_files) == 1 # one train split assert len(module_builder_configs[0].data_files["train"]) == 2 # two files @@ -550,6 +552,10 @@ def test_LocalDatasetModuleFactoryWithoutScript_with_two_configs_in_metadata(sel assert module_builder_config_v2.name == "v2" assert isinstance(module_builder_config_v1, ImageFolderConfig) assert isinstance(module_builder_config_v2, ImageFolderConfig) + assert isinstance(module_builder_config_v1.data_files, DataFilesPatternsDict) + assert isinstance(module_builder_config_v2.data_files, DataFilesPatternsDict) + module_builder_config_v1._resolve_data_files(self._data_dir_with_two_config_in_metadata, DownloadConfig()) + module_builder_config_v2._resolve_data_files(self._data_dir_with_two_config_in_metadata, DownloadConfig()) assert isinstance(module_builder_config_v1.data_files, DataFilesDict) assert isinstance(module_builder_config_v2.data_files, DataFilesDict) assert sorted(module_builder_config_v1.data_files) == ["train"] @@ -701,6 +707,8 @@ def test_HubDatasetModuleFactoryWithoutScript_with_one_default_config_in_metadat assert isinstance(module_builder_configs[0], AudioFolderConfig) assert module_builder_configs[0].name == "custom" assert module_builder_configs[0].data_files is not None + assert isinstance(module_builder_configs[0].data_files, DataFilesPatternsDict) + module_builder_configs[0]._resolve_data_files() assert isinstance(module_builder_configs[0].data_files, DataFilesDict) assert sorted(module_builder_configs[0].data_files) == ["test", "train"] assert len(module_builder_configs[0].data_files["train"]) == 3 @@ -738,6 +746,10 @@ def test_HubDatasetModuleFactoryWithoutScript_with_two_configs_in_metadata(self) assert module_builder_config_v2.name == "v2" assert isinstance(module_builder_config_v1, AudioFolderConfig) assert isinstance(module_builder_config_v2, AudioFolderConfig) + assert isinstance(module_builder_config_v1.data_files, DataFilesPatternsDict) + assert isinstance(module_builder_config_v2.data_files, DataFilesPatternsDict) + module_builder_config_v1._resolve_data_files() + module_builder_config_v2._resolve_data_files() assert isinstance(module_builder_config_v1.data_files, DataFilesDict) assert isinstance(module_builder_config_v2.data_files, DataFilesDict) assert sorted(module_builder_config_v1.data_files) == ["test", "train"] From 214a3e6dcb66e9c1a8ff586553e8eee0f1c70710 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Nov 2023 16:24:51 +0100 Subject: [PATCH 03/37] minor --- src/datasets/load.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 2e307d113c2..ed7867f93ed 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1256,7 +1256,7 @@ def get_module(self) -> DatasetModule: builder_configs, default_config_name = None, None builder_kwargs = { "hash": hash, - "base_path": hf_hub_url(self.name, "", revision=self.revision), + "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"), "repo_id": self.name, "dataset_name": camelcase_to_snakecase(Path(self.name).name), } @@ -1424,7 +1424,7 @@ def get_module(self) -> DatasetModule: importlib.invalidate_caches() builder_kwargs = { "hash": hash, - "base_path": hf_hub_url(self.name, "", revision=self.revision), + "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"), "repo_id": self.name, } return DatasetModule(module_path, hash, builder_kwargs) From b7a9674e17156ff10124632ba705125288de7442 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest 
Date: Wed, 29 Nov 2023 16:29:39 +0100 Subject: [PATCH 04/37] don't use expand_info=False yet --- src/datasets/data_files.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index b7143a2ade5..02aa594ce42 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -358,7 +358,7 @@ def resolve_pattern( protocol_prefix = protocol + "://" if protocol != "file" else "" matched_paths = [ filepath if filepath.startswith(protocol_prefix) else protocol_prefix + filepath - for filepath, info in fs.glob(pattern, detail=True, expand_info=False).items() + for filepath, info in fs.glob(pattern, detail=True).items() if info["type"] == "file" and (xbasename(filepath) not in files_to_ignore) and not _is_inside_unrequested_special_dir( From 32e0960ea165a9481b1ff6eed31771475120cb38 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Nov 2023 16:48:38 +0100 Subject: [PATCH 05/37] fix --- tests/test_load.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_load.py b/tests/test_load.py index 11fd99b09b1..c165762b261 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -708,7 +708,7 @@ def test_HubDatasetModuleFactoryWithoutScript_with_one_default_config_in_metadat assert module_builder_configs[0].name == "custom" assert module_builder_configs[0].data_files is not None assert isinstance(module_builder_configs[0].data_files, DataFilesPatternsDict) - module_builder_configs[0]._resolve_data_files() + module_builder_configs[0]._resolve_data_files(module_factory_result.builder_kwargs["base_path"], DownloadConfig()) assert isinstance(module_builder_configs[0].data_files, DataFilesDict) assert sorted(module_builder_configs[0].data_files) == ["test", "train"] assert len(module_builder_configs[0].data_files["train"]) == 3 @@ -748,8 +748,8 @@ def test_HubDatasetModuleFactoryWithoutScript_with_two_configs_in_metadata(self) assert isinstance(module_builder_config_v2, AudioFolderConfig) assert isinstance(module_builder_config_v1.data_files, DataFilesPatternsDict) assert isinstance(module_builder_config_v2.data_files, DataFilesPatternsDict) - module_builder_config_v1._resolve_data_files() - module_builder_config_v2._resolve_data_files() + module_builder_config_v1._resolve_data_files(module_factory_result.builder_kwargs["base_path"], DownloadConfig()) + module_builder_config_v2._resolve_data_files(module_factory_result.builder_kwargs["base_path"], DownloadConfig()) assert isinstance(module_builder_config_v1.data_files, DataFilesDict) assert isinstance(module_builder_config_v2.data_files, DataFilesDict) assert sorted(module_builder_config_v1.data_files) == ["test", "train"] From d3c0694d0c47a64a3cab5d468b4d9575ad7b1d96 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Nov 2023 17:54:41 +0100 Subject: [PATCH 06/37] retrieve cached datasets that were pushed to hub --- src/datasets/load.py | 105 +++++++++++++------ src/datasets/packaged_modules/cache/cache.py | 9 ++ 2 files changed, 84 insertions(+), 30 deletions(-) create mode 100644 src/datasets/packaged_modules/cache/cache.py diff --git a/src/datasets/load.py b/src/datasets/load.py index 6893e94242a..80dcf633d02 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -15,6 +15,7 @@ # Lint as: python3 """Access datasets.""" import filecmp +import glob import importlib import inspect import json @@ -1444,9 +1445,11 @@ class CachedDatasetModuleFactory(_DatasetModuleFactory): def __init__( self, name: str, + cache_dir: 
Optional[str] = None, dynamic_modules_path: Optional[str] = None, ): self.name = name + self.cache_dir = cache_dir self.dynamic_modules_path = dynamic_modules_path assert self.name.count("/") <= 1 @@ -1458,38 +1461,79 @@ def get_module(self) -> DatasetModule: if os.path.isdir(importable_directory_path) else None ) - if not hashes: - raise FileNotFoundError(f"Dataset {self.name} is not cached in {dynamic_modules_path}") - # get most recent - - def _get_modification_time(module_hash): - return (Path(importable_directory_path) / module_hash / (self.name.split("/")[-1] + ".py")).stat().st_mtime + if hashes: + # get most recent + def _get_modification_time(module_hash): + return ( + (Path(importable_directory_path) / module_hash / (self.name.split("/")[-1] + ".py")) + .stat() + .st_mtime + ) - hash = sorted(hashes, key=_get_modification_time)[-1] - warning_msg = ( - f"Using the latest cached version of the module from {os.path.join(importable_directory_path, hash)} " - f"(last modified on {time.ctime(_get_modification_time(hash))}) since it " - f"couldn't be found locally at {self.name}." - ) - if not config.HF_DATASETS_OFFLINE: - warning_msg += ", or remotely on the Hugging Face Hub." - logger.warning(warning_msg) - # make the new module to be noticed by the import system - module_path = ".".join( - [ - os.path.basename(dynamic_modules_path), - "datasets", - self.name.replace("/", "--"), - hash, - self.name.split("/")[-1], + hash = sorted(hashes, key=_get_modification_time)[-1] + warning_msg = ( + f"Using the latest cached version of the module from {os.path.join(importable_directory_path, hash)} " + f"(last modified on {time.ctime(_get_modification_time(hash))}) since it " + f"couldn't be found locally at {self.name}." + ) + if not config.HF_DATASETS_OFFLINE: + warning_msg += ", or remotely on the Hugging Face Hub." + logger.warning(warning_msg) + # make the new module to be noticed by the import system + module_path = ".".join( + [ + os.path.basename(dynamic_modules_path), + "datasets", + self.name.replace("/", "--"), + hash, + self.name.split("/")[-1], + ] + ) + importlib.invalidate_caches() + builder_kwargs = { + "hash": hash, + "repo_id": self.name, + } + return DatasetModule(module_path, hash, builder_kwargs) + cache_dir = os.path.expanduser(str(self.cache_dir or config.HF_DATASETS_CACHE)) + cached_datasets_directory_path_root = os.path.join(cache_dir, self.name.replace("/", "___")) + cached_directory_paths = [ + cached_directory_path + for cached_directory_path in glob.glob(os.path.join(cached_datasets_directory_path_root, "*", "*", "*")) + if os.path.isdir(cached_directory_path) + ] + if cached_directory_paths: + # get most recent + def _get_modification_time(cached_directory_path): + return (Path(cached_directory_path)).stat().st_mtime + + hash = Path(sorted(cached_directory_paths, key=_get_modification_time)[-1]).name + cached_directory_paths = [ + cached_directory_path + for cached_directory_path in cached_directory_paths + if Path(cached_directory_path).name == hash ] - ) - importlib.invalidate_caches() - builder_kwargs = { - "hash": hash, - "repo_id": self.name, - } - return DatasetModule(module_path, hash, builder_kwargs) + warning_msg = ( + f"Using the latest cached version of the dataset from {cached_datasets_directory_path_root}/*/*/{hash}" + f"(last modified on {time.ctime(_get_modification_time(cached_directory_paths[0]))}) since it " + f"couldn't be found locally at {self.name}." 
+ ) + if not config.HF_DATASETS_OFFLINE: + warning_msg += ", or remotely on the Hugging Face Hub." + logger.warning(warning_msg) + module_path = "datasets.packaged_modules.cache.cache" + builder_kwargs = { + "hash": hash, + "repo_id": self.name, + "dataset_name": self.name.split("/")[-1], + } + dataset_infos = DatasetInfosDict() + for cached_directory_path in cached_directory_paths: + config_name = Path(cached_directory_path).parts[-3] + dataset_infos[config_name] = DatasetInfo.from_directory(cached_directory_path) + # builder_configs_parameters = BuilderConfigsParameters(builder_configs=[BuilderConfig(Path(cached_directory_path).parts[-3]) for cached_directory_path in cached_directory_paths]) + return DatasetModule(module_path, hash, builder_kwargs, dataset_infos=dataset_infos) + raise FileNotFoundError(f"Dataset {self.name} is not cached in {self.cache_dir}") class CachedMetricModuleFactory(_MetricModuleFactory): @@ -1723,6 +1767,7 @@ def dataset_module_factory( try: return CachedDatasetModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module() except Exception: + raise # If it's not in the cache, then it doesn't exist. if isinstance(e1, OfflineModeIsEnabled): raise ConnectionError(f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}") from None diff --git a/src/datasets/packaged_modules/cache/cache.py b/src/datasets/packaged_modules/cache/cache.py new file mode 100644 index 00000000000..b46e40d8c3e --- /dev/null +++ b/src/datasets/packaged_modules/cache/cache.py @@ -0,0 +1,9 @@ +from datasets import DatasetInfo, GeneratorBasedBuilder, SplitGenerator + + +class Cache(GeneratorBasedBuilder): + def _info(self) -> DatasetInfo: + return DatasetInfo() + + def _split_generators(self, dl_manager): + return [SplitGenerator(name=name, gen_kwargs={}) for name in self.info.splits] From 8214ff2a9f706427669a6c2a01ccabffa5bf0d2b Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Nov 2023 17:55:24 +0100 Subject: [PATCH 07/37] minor --- src/datasets/load.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 80dcf633d02..96b0b306805 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1767,7 +1767,6 @@ def dataset_module_factory( try: return CachedDatasetModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module() except Exception: - raise # If it's not in the cache, then it doesn't exist. 
if isinstance(e1, OfflineModeIsEnabled): raise ConnectionError(f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}") from None From 68099ca55294bfc12a34781835dd73c533a764bd Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Nov 2023 18:03:44 +0100 Subject: [PATCH 08/37] style --- tests/test_load.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_load.py b/tests/test_load.py index c165762b261..1c33998aeac 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -708,7 +708,9 @@ def test_HubDatasetModuleFactoryWithoutScript_with_one_default_config_in_metadat assert module_builder_configs[0].name == "custom" assert module_builder_configs[0].data_files is not None assert isinstance(module_builder_configs[0].data_files, DataFilesPatternsDict) - module_builder_configs[0]._resolve_data_files(module_factory_result.builder_kwargs["base_path"], DownloadConfig()) + module_builder_configs[0]._resolve_data_files( + module_factory_result.builder_kwargs["base_path"], DownloadConfig() + ) assert isinstance(module_builder_configs[0].data_files, DataFilesDict) assert sorted(module_builder_configs[0].data_files) == ["test", "train"] assert len(module_builder_configs[0].data_files["train"]) == 3 @@ -748,8 +750,12 @@ def test_HubDatasetModuleFactoryWithoutScript_with_two_configs_in_metadata(self) assert isinstance(module_builder_config_v2, AudioFolderConfig) assert isinstance(module_builder_config_v1.data_files, DataFilesPatternsDict) assert isinstance(module_builder_config_v2.data_files, DataFilesPatternsDict) - module_builder_config_v1._resolve_data_files(module_factory_result.builder_kwargs["base_path"], DownloadConfig()) - module_builder_config_v2._resolve_data_files(module_factory_result.builder_kwargs["base_path"], DownloadConfig()) + module_builder_config_v1._resolve_data_files( + module_factory_result.builder_kwargs["base_path"], DownloadConfig() + ) + module_builder_config_v2._resolve_data_files( + module_factory_result.builder_kwargs["base_path"], DownloadConfig() + ) assert isinstance(module_builder_config_v1.data_files, DataFilesDict) assert isinstance(module_builder_config_v2.data_files, DataFilesDict) assert sorted(module_builder_config_v1.data_files) == ["test", "train"] From b3fc42882a2d84d7482c27063f1e19539e99b9d3 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 30 Nov 2023 19:31:18 +0100 Subject: [PATCH 09/37] tests --- src/datasets/builder.py | 2 +- src/datasets/data_files.py | 2 ++ tests/test_data_files.py | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index e24baf611a3..15f4a0b7d63 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -214,7 +214,7 @@ def create_config_id( return self.name def _resolve_data_files(self, base_path: str, download_config: DownloadConfig) -> None: - if self.data_files is not None and isinstance(self.data_files, DataFilesPatternsDict): + if isinstance(self.data_files, DataFilesPatternsDict): base_path = xjoin(base_path, self.data_dir) if self.data_dir else base_path self.data_files = self.data_files.resolve(base_path, download_config) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 02aa594ce42..622890bcccf 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -712,6 +712,8 @@ def filter_extensions(self, extensions: List[str]) -> "DataFilesDict": class DataFilesPatternsList(List[str]): """ List of data files patterns 
(absolute local paths or URLs). + For each pattern there should also be a list of allowed extensions + to keep, or a None ot keep all the files for the pattern. """ def __init__( diff --git a/tests/test_data_files.py b/tests/test_data_files.py index 01b5e4dd15e..468f5909f2f 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -12,6 +12,8 @@ from datasets.data_files import ( DataFilesDict, DataFilesList, + DataFilesPatternsDict, + DataFilesPatternsList, _get_data_files_patterns, _get_metadata_files_patterns, _is_inside_unrequested_special_dir, @@ -487,6 +489,37 @@ def test_DataFilesDict_from_patterns_locally_or_remote_hashing(text_file): assert Hasher.hash(data_files1) != Hasher.hash(data_files2) +def test_DataFilesPatternsList(text_file): + data_files_patterns = DataFilesPatternsList([str(text_file)], allowed_extensions=[None]) + data_files = data_files_patterns.resolve(base_path="") + assert data_files == [str(text_file)] + assert isinstance(data_files, DataFilesList) + data_files_patterns = DataFilesPatternsList([str(text_file)], allowed_extensions=[[".txt"]]) + data_files = data_files_patterns.resolve(base_path="") + assert data_files == [str(text_file)] + assert isinstance(data_files, DataFilesList) + data_files_patterns = DataFilesPatternsList([str(text_file).replace(".txt", ".tx*")], allowed_extensions=[None]) + data_files = data_files_patterns.resolve(base_path="") + assert data_files == [str(text_file)] + assert isinstance(data_files, DataFilesList) + data_files_patterns = DataFilesPatternsList([Path(text_file).name], allowed_extensions=[None]) + data_files = data_files_patterns.resolve(base_path=str(Path(text_file).parent)) + assert data_files == [str(text_file)] + data_files_patterns = DataFilesPatternsList([str(text_file)], allowed_extensions=[[".zip"]]) + with pytest.raises(FileNotFoundError): + data_files_patterns.resolve(base_path="") + + +def test_DataFilesPatternsDict(text_file): + data_files_patterns_dict = DataFilesPatternsDict( + {"train": DataFilesPatternsList([str(text_file)], allowed_extensions=[None])} + ) + data_files_dict = data_files_patterns_dict.resolve(base_path="") + assert data_files_dict == {"train": [str(text_file)]} + assert isinstance(data_files_dict, DataFilesDict) + assert isinstance(data_files_dict["train"], DataFilesList) + + def mock_fs(file_paths: List[str]): """ Set up a mock filesystem for fsspec containing the provided files From 796a47e388a5c5711a95bd649648608c18219ac5 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 30 Nov 2023 19:47:58 +0100 Subject: [PATCH 10/37] fix win test --- tests/test_data_files.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_data_files.py b/tests/test_data_files.py index 468f5909f2f..9ac8945c931 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -492,19 +492,19 @@ def test_DataFilesDict_from_patterns_locally_or_remote_hashing(text_file): def test_DataFilesPatternsList(text_file): data_files_patterns = DataFilesPatternsList([str(text_file)], allowed_extensions=[None]) data_files = data_files_patterns.resolve(base_path="") - assert data_files == [str(text_file)] + assert data_files == [text_file.as_posix()] assert isinstance(data_files, DataFilesList) data_files_patterns = DataFilesPatternsList([str(text_file)], allowed_extensions=[[".txt"]]) data_files = data_files_patterns.resolve(base_path="") - assert data_files == [str(text_file)] + assert data_files == [text_file.as_posix()] assert isinstance(data_files, DataFilesList) 
data_files_patterns = DataFilesPatternsList([str(text_file).replace(".txt", ".tx*")], allowed_extensions=[None]) data_files = data_files_patterns.resolve(base_path="") - assert data_files == [str(text_file)] + assert data_files == [text_file.as_posix()] assert isinstance(data_files, DataFilesList) data_files_patterns = DataFilesPatternsList([Path(text_file).name], allowed_extensions=[None]) data_files = data_files_patterns.resolve(base_path=str(Path(text_file).parent)) - assert data_files == [str(text_file)] + assert data_files == [text_file.as_posix()] data_files_patterns = DataFilesPatternsList([str(text_file)], allowed_extensions=[[".zip"]]) with pytest.raises(FileNotFoundError): data_files_patterns.resolve(base_path="") @@ -515,7 +515,7 @@ def test_DataFilesPatternsDict(text_file): {"train": DataFilesPatternsList([str(text_file)], allowed_extensions=[None])} ) data_files_dict = data_files_patterns_dict.resolve(base_path="") - assert data_files_dict == {"train": [str(text_file)]} + assert data_files_dict == {"train": [text_file.as_posix()]} assert isinstance(data_files_dict, DataFilesDict) assert isinstance(data_files_dict["train"], DataFilesList) From a924e94d4fc5071afdec324f6dd9b81f707d262e Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 1 Dec 2023 18:39:59 +0100 Subject: [PATCH 11/37] fix tests --- src/datasets/load.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 08a04a24661..06e4e296d56 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1256,7 +1256,7 @@ def get_module(self) -> DatasetModule: builder_configs, default_config_name = None, None builder_kwargs = { "hash": hash, - "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"), + "base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"), "repo_id": self.name, "dataset_name": camelcase_to_snakecase(Path(self.name).name), } @@ -1269,7 +1269,7 @@ def get_module(self) -> DatasetModule: try: # this file is deprecated and was created automatically in old versions of push_to_hub dataset_infos_path = cached_path( - hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision), + hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision), download_config=download_config, ) with open(dataset_infos_path, encoding="utf-8") as f: From c9ecfcaa3bf6ace3de327b983503660e5569e2fd Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 4 Dec 2023 12:05:14 +0100 Subject: [PATCH 12/37] fix tests again --- src/datasets/load.py | 1 - tests/test_load.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 87d63e24ce8..19d54834199 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1344,7 +1344,6 @@ def get_module(self) -> DatasetModule: module_path, metadata_configs, supports_metadata=False, - download_config=self.download_config, ) builder_kwargs = { "hash": hash, diff --git a/tests/test_load.py b/tests/test_load.py index 8f0c193a527..72f0b3e5ec5 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -798,6 +798,9 @@ def test_HubDatasetModuleFactoryWithParquetExport(self): assert module_factory_result.module_path == "datasets.packaged_modules.parquet.parquet" assert module_factory_result.builder_configs_parameters.builder_configs assert isinstance(module_factory_result.builder_configs_parameters.builder_configs[0], ParquetConfig) + 
module_factory_result.builder_configs_parameters.builder_configs[0]._resolve_data_files( + base_path="", download_config=self.download_config + ) assert module_factory_result.builder_configs_parameters.builder_configs[0].data_files == { "train": [ "hf://datasets/hf-internal-testing/dataset_with_script@da4ed81df5a1bcd916043c827b75994de8ef7eda/default/train/0000.parquet" From ddb488bd7888a788c625bb6bbe41fdb16bf67a17 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 4 Dec 2023 12:40:22 +0100 Subject: [PATCH 13/37] remove unused code --- src/datasets/data_files.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 622890bcccf..6d1a0e6aa95 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -340,15 +340,6 @@ def resolve_pattern( base_path = os.path.splitdrive(pattern)[0] + os.sep else: base_path = "" - if pattern.startswith(config.HF_ENDPOINT) and "/resolve/" in pattern: - pattern = "hf://" + pattern[len(config.HF_ENDPOINT) + 1 :].replace("/resolve/", "@", 1) - pattern = ( - pattern.split("@", 1)[0] - + "@" - + pattern.split("@", 1)[1].split("/", 1)[0].replace("%2F", "/") - + "/" - + pattern.split("@", 1)[1].split("/", 1)[1] - ) pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config) fs, _, _ = get_fs_token_paths(pattern, storage_options=storage_options) fs_base_path = base_path.split("::")[0].split("://")[-1] or fs.root_marker From a15ea9db34f3497d1c9234a4e5b3a9b10083a826 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 6 Dec 2023 17:55:35 +0100 Subject: [PATCH 14/37] allow load from cache in streaming mode --- src/datasets/packaged_modules/cache/cache.py | 59 ++++++++++++++++++-- 1 file changed, 54 insertions(+), 5 deletions(-) diff --git a/src/datasets/packaged_modules/cache/cache.py b/src/datasets/packaged_modules/cache/cache.py index b46e40d8c3e..42e756e1866 100644 --- a/src/datasets/packaged_modules/cache/cache.py +++ b/src/datasets/packaged_modules/cache/cache.py @@ -1,9 +1,58 @@ -from datasets import DatasetInfo, GeneratorBasedBuilder, SplitGenerator +import os +import shutil +from typing import List, Optional +import pyarrow as pa -class Cache(GeneratorBasedBuilder): - def _info(self) -> DatasetInfo: - return DatasetInfo() +import datasets +from datasets.naming import filenames_for_dataset_split + + +logger = datasets.utils.logging.get_logger(__name__) + + +class Cache(datasets.ArrowBasedBuilder): + def _info(self) -> datasets.DatasetInfo: + return datasets.DatasetInfo() + + def download_and_prepare(self, output_dir: Optional[str] = None, *args, **kwargs): + if not os.path.exists(self.cache_dir): + raise ValueError(f"Cache directory for {self.dataset_name} doesn't exist at {self.cache_dir}") + if output_dir is not None: + shutil.copytree(self.cache_dir, output_dir) def _split_generators(self, dl_manager): - return [SplitGenerator(name=name, gen_kwargs={}) for name in self.info.splits] + if isinstance(self.info.splits, datasets.SplitDict): + split_infos: List[datasets.SplitInfo] = list(self.info.splits.values()) + else: + raise ValueError(f"Missing splits info for {self.dataset_name} in cache directory {self.cache_dir}") + return [ + datasets.SplitGenerator( + name=split_info.name, + gen_kwargs={ + "files": filenames_for_dataset_split( + self.cache_dir, + dataset_name=self.dataset_name, + split=split_info.name, + filetype_suffix="arrow", + shard_lengths=split_info.shard_lengths, + ) + }, + ) + for split_info in split_infos + ] + + def 
_generate_tables(self, files): + # used to stream from cache + for file_idx, file in enumerate(files): + with open(file, "rb") as f: + try: + for batch_idx, record_batch in enumerate(pa.ipc.open_stream(f)): + pa_table = pa.Table.from_batches([record_batch]) + # Uncomment for debugging (will print the Arrow table size and elements) + # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}") + # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows))) + yield f"{file_idx}_{batch_idx}", pa_table + except ValueError as e: + logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}") + raise From 809b5cce4d89b716c08fa79b5bb057dbf8f6483f Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 6 Dec 2023 17:56:42 +0100 Subject: [PATCH 15/37] remove comment --- src/datasets/load.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index d7046b2708a..79ac73445ba 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1602,7 +1602,6 @@ def _get_modification_time(cached_directory_path): for cached_directory_path in cached_directory_paths: config_name = Path(cached_directory_path).parts[-3] dataset_infos[config_name] = DatasetInfo.from_directory(cached_directory_path) - # builder_configs_parameters = BuilderConfigsParameters(builder_configs=[BuilderConfig(Path(cached_directory_path).parts[-3]) for cached_directory_path in cached_directory_paths]) return DatasetModule(module_path, hash, builder_kwargs, dataset_infos=dataset_infos) raise FileNotFoundError(f"Dataset {self.name} is not cached in {self.cache_dir}") From 90f3d34d2bfb3828c785edc272d2d31b3026fdcc Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 7 Dec 2023 17:25:45 +0100 Subject: [PATCH 16/37] more tests --- src/datasets/load.py | 42 ++++++++++---- src/datasets/packaged_modules/cache/cache.py | 1 + tests/fixtures/files.py | 10 ++++ tests/packaged_modules/test_cache.py | 54 ++++++++++++++++++ tests/test_load.py | 60 +++++++++++++++++++- 5 files changed, 156 insertions(+), 11 deletions(-) create mode 100644 tests/packaged_modules/test_cache.py diff --git a/src/datasets/load.py b/src/datasets/load.py index 79ac73445ba..1ee66479f52 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1071,15 +1071,18 @@ def get_module(self) -> DatasetModule: default_builder_kwargs=default_builder_kwargs, ) else: - builder_configs, default_config_name = None, None + builder_configs: List[BuilderConfig] = [ + import_main_class(module_path).BUILDER_CONFIG_CLASS( + data_files=data_files, + **default_builder_kwargs, + ) + ] + default_config_name = None builder_kwargs = { "hash": hash, "base_path": self.path, "dataset_name": camelcase_to_snakecase(Path(self.path).name), } - if self.data_files is not None or not metadata_configs: - builder_kwargs["data_files"] = data_files - builder_kwargs.update(default_builder_kwargs) # from _EXTENSION_TO_MODULE # this file is deprecated and was created automatically in old versions of push_to_hub if os.path.isfile(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME)): with open(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f: @@ -1264,16 +1267,19 @@ def get_module(self) -> DatasetModule: download_config=self.download_config, ) else: - builder_configs, default_config_name = None, None + builder_configs: List[BuilderConfig] = [ + import_main_class(module_path).BUILDER_CONFIG_CLASS( + data_files=data_files, + **default_builder_kwargs, + ) + ] + 
default_config_name = None builder_kwargs = { "hash": hash, "base_path": hf_hub_url(self.name, "", revision=self.revision), "repo_id": self.name, "dataset_name": camelcase_to_snakecase(Path(self.name).name), } - if self.data_files is not None or not metadata_configs: - builder_kwargs["data_files"] = data_files - builder_kwargs.update(default_builder_kwargs) # from _EXTENSION_TO_MODULE download_config = self.download_config.copy() if download_config.download_desc is None: download_config.download_desc = "Downloading metadata" @@ -1599,10 +1605,18 @@ def _get_modification_time(cached_directory_path): "dataset_name": self.name.split("/")[-1], } dataset_infos = DatasetInfosDict() + builder_configs = [] for cached_directory_path in cached_directory_paths: config_name = Path(cached_directory_path).parts[-3] dataset_infos[config_name] = DatasetInfo.from_directory(cached_directory_path) - return DatasetModule(module_path, hash, builder_kwargs, dataset_infos=dataset_infos) + builder_configs.append(import_main_class(module_path).BUILDER_CONFIG_CLASS(name=config_name)) + return DatasetModule( + module_path, + hash, + builder_kwargs, + dataset_infos=dataset_infos, + builder_configs_parameters=BuilderConfigsParameters(builder_configs=builder_configs), + ) raise FileNotFoundError(f"Dataset {self.name} is not cached in {self.cache_dir}") @@ -1663,6 +1677,7 @@ def dataset_module_factory( dynamic_modules_path: Optional[str] = None, data_dir: Optional[str] = None, data_files: Optional[Union[Dict, List, str, DataFilesDict]] = None, + cache_dir: Optional[str] = None, trust_remote_code: Optional[bool] = None, **download_kwargs, ) -> DatasetModule: @@ -1705,6 +1720,10 @@ def dataset_module_factory( data_dir (:obj:`str`, optional): Directory with the data files. Used only if `data_files` is not specified, in which case it's equal to pass `os.path.join(data_dir, "**")` as `data_files`. data_files (:obj:`Union[Dict, List, str]`, optional): Defining the data_files of the dataset configuration. + cache_dir (`str`, *optional*): + Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`. + + trust_remote_code (`bool`, defaults to `True`): Whether or not to allow for datasets defined on the Hub using a dataset script. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will @@ -1852,7 +1871,9 @@ def dataset_module_factory( except Exception as e1: # All the attempts failed, before raising the error we should check if the module is already cached try: - return CachedDatasetModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module() + return CachedDatasetModuleFactory( + path, dynamic_modules_path=dynamic_modules_path, cache_dir=cache_dir + ).get_module() except Exception: # If it's not in the cache, then it doesn't exist. 
if isinstance(e1, OfflineModeIsEnabled): @@ -2219,6 +2240,7 @@ def load_dataset_builder( download_mode=download_mode, data_dir=data_dir, data_files=data_files, + cache_dir=cache_dir, trust_remote_code=trust_remote_code, ) # Get dataset builder class from the processing script diff --git a/src/datasets/packaged_modules/cache/cache.py b/src/datasets/packaged_modules/cache/cache.py index 42e756e1866..876167c4195 100644 --- a/src/datasets/packaged_modules/cache/cache.py +++ b/src/datasets/packaged_modules/cache/cache.py @@ -22,6 +22,7 @@ def download_and_prepare(self, output_dir: Optional[str] = None, *args, **kwargs shutil.copytree(self.cache_dir, output_dir) def _split_generators(self, dl_manager): + # used to stream from cache if isinstance(self.info.splits, datasets.SplitDict): split_infos: List[datasets.SplitInfo] = list(self.info.splits.values()) else: diff --git a/tests/fixtures/files.py b/tests/fixtures/files.py index da8d3efee48..76bf830676d 100644 --- a/tests/fixtures/files.py +++ b/tests/fixtures/files.py @@ -471,6 +471,16 @@ def text2_path(tmp_path_factory): return path +@pytest.fixture(scope="session") +def text_dir(tmp_path_factory): + data = ["0", "1", "2", "3"] + path = tmp_path_factory.mktemp("data_text_dir") / "dataset.txt" + with open(path, "w") as f: + for item in data: + f.write(item + "\n") + return path.parent + + @pytest.fixture(scope="session") def text_dir_with_unsupported_extension(tmp_path_factory): data = ["0", "1", "2", "3"] diff --git a/tests/packaged_modules/test_cache.py b/tests/packaged_modules/test_cache.py new file mode 100644 index 00000000000..9adb75a7846 --- /dev/null +++ b/tests/packaged_modules/test_cache.py @@ -0,0 +1,54 @@ +from pathlib import Path + +import pytest + +from datasets import load_dataset +from datasets.packaged_modules.cache.cache import Cache + + +SAMPLE_DATASET_TWO_CONFIG_IN_METADATA = "hf-internal-testing/audiofolder_two_configs_in_metadata" + + +def test_cache(text_dir: Path): + ds = load_dataset(str(text_dir)) + hash = Path(ds["train"].cache_files[0]["filename"]).parts[-2] + cache = Cache(dataset_name=text_dir.name, hash=hash) + reloaded = cache.as_dataset() + assert list(ds) == list(reloaded) + assert list(ds["train"]) == list(reloaded["train"]) + + +def test_cache_streaming(text_dir: Path): + ds = load_dataset(str(text_dir)) + hash = Path(ds["train"].cache_files[0]["filename"]).parts[-2] + cache = Cache(dataset_name=text_dir.name, hash=hash) + reloaded = cache.as_streaming_dataset() + assert list(ds) == list(reloaded) + assert list(ds["train"]) == list(reloaded["train"]) + + +def test_cache_missing(text_dir: Path): + ds = load_dataset(str(text_dir)) + hash = Path(ds["train"].cache_files[0]["filename"]).parts[-2] + Cache(dataset_name=text_dir.name, hash=hash).download_and_prepare() + with pytest.raises(ValueError): + Cache(dataset_name="missing", hash=hash).download_and_prepare() + with pytest.raises(ValueError): + Cache(dataset_name=text_dir.name, hash="missing").download_and_prepare() + with pytest.raises(ValueError): + Cache(dataset_name=text_dir.name, config_name="missing", hash=hash).download_and_prepare() + + +@pytest.mark.integration +def test_cache_multi_configs(): + repo_id = SAMPLE_DATASET_TWO_CONFIG_IN_METADATA + config_name = "v1" + ds = load_dataset(repo_id, config_name) + hash = Path(ds["train"].cache_files[0]["filename"]).parts[-2] + cache = Cache(dataset_name=repo_id.split("/")[-1], repo_id=repo_id, config_name=config_name, hash=hash) + reloaded = cache.as_dataset() + assert list(ds) == list(reloaded) + assert 
len(ds["train"]) == len(reloaded["train"]) + with pytest.raises(ValueError) as excinfo: + Cache(dataset_name=repo_id.split("/")[-1], repo_id=repo_id, config_name="missing", hash=hash) + assert config_name in str(excinfo.value) diff --git a/tests/test_load.py b/tests/test_load.py index 66321ef8c47..e10b1cdb950 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -805,7 +805,20 @@ def test_HubDatasetModuleFactoryWithParquetExport_errors_on_wrong_sha(self): with self.assertRaises(_datasets_server.DatasetsServerError): factory.get_module() + @pytest.mark.integration def test_CachedDatasetModuleFactory(self): + name = SAMPLE_DATASET_IDENTIFIER2 + load_dataset_builder(name, cache_dir=self.cache_dir).download_and_prepare() + for offline_mode in OfflineSimulationMode: + with offline(offline_mode): + factory = CachedDatasetModuleFactory( + name, + cache_dir=self.cache_dir, + ) + module_factory_result = factory.get_module() + assert importlib.import_module(module_factory_result.module_path) is not None + + def test_CachedDatasetModuleFactory_with_script(self): path = os.path.join(self._dataset_loading_script_dir, f"{DATASET_LOADING_SCRIPT_NAME}.py") factory = LocalDatasetModuleFactoryWithScript( path, download_config=self.download_config, dynamic_modules_path=self.dynamic_modules_path @@ -866,12 +879,14 @@ def inject_fixtures(self, caplog): def setUp(self): self.hf_modules_cache = tempfile.mkdtemp() + self.cache_dir = tempfile.mkdtemp() self.dynamic_modules_path = datasets.load.init_dynamic_modules( name="test_datasets_modules2", hf_modules_cache=self.hf_modules_cache ) def tearDown(self): shutil.rmtree(self.hf_modules_cache) + shutil.rmtree(self.cache_dir) def _dummy_module_dir(self, modules_dir, dummy_module_name, dummy_code): assert dummy_module_name.startswith("__") @@ -913,7 +928,20 @@ def test_dataset_module_factory(self): "__missing_dummy_module_name__", dynamic_modules_path=self.dynamic_modules_path ) + @pytest.mark.integration def test_offline_dataset_module_factory(self): + repo_id = SAMPLE_DATASET_IDENTIFIER2 + builder = load_dataset_builder(repo_id, cache_dir=self.cache_dir) + builder.download_and_prepare() + for offline_simulation_mode in list(OfflineSimulationMode): + with offline(offline_simulation_mode): + self._caplog.clear() + # allow provide the repo id without an explicit path to remote or local actual file + dataset_module = datasets.load.dataset_module_factory(repo_id, cache_dir=self.cache_dir) + self.assertEqual(dataset_module.module_path, "datasets.packaged_modules.cache.cache") + self.assertIn("Using the latest cached version of the dataset", self._caplog.text) + + def test_offline_dataset_module_factory_with_script(self): with tempfile.TemporaryDirectory() as tmp_dir: dummy_code = "MY_DUMMY_VARIABLE = 'hello there'" module_dir = self._dummy_module_dir(tmp_dir, "__dummy_module_name2__", dummy_code) @@ -1138,13 +1166,21 @@ def test_load_dataset_builder_fail(): @pytest.mark.parametrize("keep_in_memory", [False, True]) -def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory, caplog): +def test_load_dataset_local_script(dataset_loading_script_dir, data_dir, keep_in_memory, caplog): with assert_arrow_memory_increases() if keep_in_memory else assert_arrow_memory_doesnt_increase(): dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, keep_in_memory=keep_in_memory) assert isinstance(dataset, DatasetDict) assert all(isinstance(d, Dataset) for d in dataset.values()) assert len(dataset) == 2 assert 
isinstance(next(iter(dataset["train"])), dict) + + +def test_load_dataset_cached_local_script(dataset_loading_script_dir, data_dir, caplog): + dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir) + assert isinstance(dataset, DatasetDict) + assert all(isinstance(d, Dataset) for d in dataset.values()) + assert len(dataset) == 2 + assert isinstance(next(iter(dataset["train"])), dict) for offline_simulation_mode in list(OfflineSimulationMode): with offline(offline_simulation_mode): caplog.clear() @@ -1152,6 +1188,28 @@ def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory dataset = datasets.load_dataset(DATASET_LOADING_SCRIPT_NAME, data_dir=data_dir) assert len(dataset) == 2 assert "Using the latest cached version of the module" in caplog.text + assert isinstance(next(iter(dataset["train"])), dict) + with pytest.raises(DatasetNotFoundError) as exc_info: + datasets.load_dataset(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST) + assert f"Dataset '{SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST}' doesn't exist on the Hub" in str(exc_info.value) + + +@pytest.mark.integration +@pytest.mark.parametrize("stream_from_cache, ", [False, True]) +def test_load_dataset_cached_from_hub(stream_from_cache, caplog): + dataset = load_dataset(SAMPLE_DATASET_IDENTIFIER3) + assert isinstance(dataset, DatasetDict) + assert all(isinstance(d, Dataset) for d in dataset.values()) + assert len(dataset) == 2 + assert isinstance(next(iter(dataset["train"])), dict) + for offline_simulation_mode in list(OfflineSimulationMode): + with offline(offline_simulation_mode): + caplog.clear() + # Load dataset from cache + dataset = datasets.load_dataset(SAMPLE_DATASET_IDENTIFIER3, streaming=stream_from_cache) + assert len(dataset) == 2 + assert "Using the latest cached version of the dataset" in caplog.text + assert isinstance(next(iter(dataset["train"])), dict) with pytest.raises(DatasetNotFoundError) as exc_info: datasets.load_dataset(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST) assert f"Dataset '{SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST}' doesn't exist on the Hub" in str(exc_info.value) From 1e3074a17099870b51b313fc0569bdcdbeab72ec Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 7 Dec 2023 17:54:18 +0100 Subject: [PATCH 17/37] fix tests --- src/datasets/builder.py | 4 +- tests/test_load.py | 106 ++++++++++++++++------------------------ 2 files changed, 44 insertions(+), 66 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index a078cb4c2c8..d3f3e6e013e 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -517,11 +517,11 @@ def _create_builder_config( builder_config = None # try default config - if config_name is None and self.BUILDER_CONFIGS and not config_kwargs: + if config_name is None and self.BUILDER_CONFIGS: if self.DEFAULT_CONFIG_NAME is not None: builder_config = self.builder_configs.get(self.DEFAULT_CONFIG_NAME) logger.info(f"No config specified, defaulting to: {self.dataset_name}/{builder_config.name}") - else: + elif not config_kwargs: if len(self.BUILDER_CONFIGS) > 1: example_of_usage = f"load_dataset('{self.dataset_name}', '{self.BUILDER_CONFIGS[0].name}')" raise ValueError( diff --git a/tests/test_load.py b/tests/test_load.py index e10b1cdb950..c7dd34afaee 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -469,34 +469,29 @@ def test_LocalDatasetModuleFactoryWithoutScript_with_data_dir(self): factory = LocalDatasetModuleFactoryWithoutScript(self._data_dir2, data_dir=self._sub_data_dir) module_factory_result = factory.get_module() assert 
importlib.import_module(module_factory_result.module_path) is not None + builder_config = module_factory_result.builder_configs_parameters.builder_configs[0] assert ( - module_factory_result.builder_kwargs["data_files"] is not None - and len(module_factory_result.builder_kwargs["data_files"]["train"]) == 1 - and len(module_factory_result.builder_kwargs["data_files"]["test"]) == 1 + builder_config.data_files is not None + and len(builder_config.data_files["train"]) == 1 + and len(builder_config.data_files["test"]) == 1 ) assert all( self._sub_data_dir in Path(data_file).parts - for data_file in module_factory_result.builder_kwargs["data_files"]["train"] - + module_factory_result.builder_kwargs["data_files"]["test"] + for data_file in builder_config.data_files["train"] + builder_config.data_files["test"] ) def test_LocalDatasetModuleFactoryWithoutScript_with_metadata(self): factory = LocalDatasetModuleFactoryWithoutScript(self._data_dir_with_metadata) module_factory_result = factory.get_module() assert importlib.import_module(module_factory_result.module_path) is not None + builder_config = module_factory_result.builder_configs_parameters.builder_configs[0] assert ( - module_factory_result.builder_kwargs["data_files"] is not None - and len(module_factory_result.builder_kwargs["data_files"]["train"]) > 0 - and len(module_factory_result.builder_kwargs["data_files"]["test"]) > 0 - ) - assert any( - Path(data_file).name == "metadata.jsonl" - for data_file in module_factory_result.builder_kwargs["data_files"]["train"] - ) - assert any( - Path(data_file).name == "metadata.jsonl" - for data_file in module_factory_result.builder_kwargs["data_files"]["test"] + builder_config.data_files is not None + and len(builder_config.data_files["train"]) > 0 + and len(builder_config.data_files["test"]) > 0 ) + assert any(Path(data_file).name == "metadata.jsonl" for data_file in builder_config.data_files["train"]) + assert any(Path(data_file).name == "metadata.jsonl" for data_file in builder_config.data_files["test"]) def test_LocalDatasetModuleFactoryWithoutScript_with_single_config_in_metadata(self): factory = LocalDatasetModuleFactoryWithoutScript( @@ -580,13 +575,14 @@ def test_PackagedDatasetModuleFactory_with_data_dir(self): factory = PackagedDatasetModuleFactory("json", data_dir=self._data_dir, download_config=self.download_config) module_factory_result = factory.get_module() assert importlib.import_module(module_factory_result.module_path) is not None + builder_config = module_factory_result.builder_configs_parameters.builder_configs[0] assert ( - module_factory_result.builder_kwargs["data_files"] is not None - and len(module_factory_result.builder_kwargs["data_files"]["train"]) > 0 - and len(module_factory_result.builder_kwargs["data_files"]["test"]) > 0 + builder_config.data_files is not None + and len(builder_config.data_files["train"]) > 0 + and len(builder_config.data_files["test"]) > 0 ) - assert Path(module_factory_result.builder_kwargs["data_files"]["train"][0]).parent.samefile(self._data_dir) - assert Path(module_factory_result.builder_kwargs["data_files"]["test"][0]).parent.samefile(self._data_dir) + assert Path(builder_config.data_files["train"][0]).parent.samefile(self._data_dir) + assert Path(builder_config.data_files["test"][0]).parent.samefile(self._data_dir) def test_PackagedDatasetModuleFactory_with_data_dir_and_metadata(self): factory = PackagedDatasetModuleFactory( @@ -594,25 +590,16 @@ def test_PackagedDatasetModuleFactory_with_data_dir_and_metadata(self): ) module_factory_result = 
factory.get_module() assert importlib.import_module(module_factory_result.module_path) is not None + builder_config = module_factory_result.builder_configs_parameters.builder_configs[0] assert ( - module_factory_result.builder_kwargs["data_files"] is not None - and len(module_factory_result.builder_kwargs["data_files"]["train"]) > 0 - and len(module_factory_result.builder_kwargs["data_files"]["test"]) > 0 - ) - assert Path(module_factory_result.builder_kwargs["data_files"]["train"][0]).parent.samefile( - self._data_dir_with_metadata - ) - assert Path(module_factory_result.builder_kwargs["data_files"]["test"][0]).parent.samefile( - self._data_dir_with_metadata - ) - assert any( - Path(data_file).name == "metadata.jsonl" - for data_file in module_factory_result.builder_kwargs["data_files"]["train"] - ) - assert any( - Path(data_file).name == "metadata.jsonl" - for data_file in module_factory_result.builder_kwargs["data_files"]["test"] + builder_config.data_files is not None + and len(builder_config.data_files["train"]) > 0 + and len(builder_config.data_files["test"]) > 0 ) + assert Path(builder_config.data_files["train"][0]).parent.samefile(self._data_dir_with_metadata) + assert Path(builder_config.data_files["test"][0]).parent.samefile(self._data_dir_with_metadata) + assert any(Path(data_file).name == "metadata.jsonl" for data_file in builder_config.data_files["train"]) + assert any(Path(data_file).name == "metadata.jsonl" for data_file in builder_config.data_files["test"]) @pytest.mark.integration def test_HubDatasetModuleFactoryWithoutScript(self): @@ -631,16 +618,16 @@ def test_HubDatasetModuleFactoryWithoutScript_with_data_dir(self): ) module_factory_result = factory.get_module() assert importlib.import_module(module_factory_result.module_path) is not None + builder_config = module_factory_result.builder_configs_parameters.builder_configs[0] assert module_factory_result.builder_kwargs["base_path"].startswith(config.HF_ENDPOINT) assert ( - module_factory_result.builder_kwargs["data_files"] is not None - and len(module_factory_result.builder_kwargs["data_files"]["train"]) == 1 - and len(module_factory_result.builder_kwargs["data_files"]["test"]) == 1 + builder_config.data_files is not None + and len(builder_config.data_files["train"]) == 1 + and len(builder_config.data_files["test"]) == 1 ) assert all( data_dir in Path(data_file).parts - for data_file in module_factory_result.builder_kwargs["data_files"]["train"] - + module_factory_result.builder_kwargs["data_files"]["test"] + for data_file in builder_config.data_files["train"] + builder_config.data_files["test"] ) @pytest.mark.integration @@ -650,20 +637,15 @@ def test_HubDatasetModuleFactoryWithoutScript_with_metadata(self): ) module_factory_result = factory.get_module() assert importlib.import_module(module_factory_result.module_path) is not None + builder_config = module_factory_result.builder_configs_parameters.builder_configs[0] assert module_factory_result.builder_kwargs["base_path"].startswith(config.HF_ENDPOINT) assert ( - module_factory_result.builder_kwargs["data_files"] is not None - and len(module_factory_result.builder_kwargs["data_files"]["train"]) > 0 - and len(module_factory_result.builder_kwargs["data_files"]["test"]) > 0 - ) - assert any( - Path(data_file).name == "metadata.jsonl" - for data_file in module_factory_result.builder_kwargs["data_files"]["train"] - ) - assert any( - Path(data_file).name == "metadata.jsonl" - for data_file in module_factory_result.builder_kwargs["data_files"]["test"] + 
builder_config.data_files is not None + and len(builder_config.data_files["train"]) > 0 + and len(builder_config.data_files["test"]) > 0 ) + assert any(Path(data_file).name == "metadata.jsonl" for data_file in builder_config.data_files["train"]) + assert any(Path(data_file).name == "metadata.jsonl" for data_file in builder_config.data_files["test"]) factory = HubDatasetModuleFactoryWithoutScript( SAMPLE_DATASET_IDENTIFIER5, download_config=self.download_config @@ -672,14 +654,11 @@ def test_HubDatasetModuleFactoryWithoutScript_with_metadata(self): assert importlib.import_module(module_factory_result.module_path) is not None assert module_factory_result.builder_kwargs["base_path"].startswith(config.HF_ENDPOINT) assert ( - module_factory_result.builder_kwargs["data_files"] is not None - and len(module_factory_result.builder_kwargs["data_files"]) == 1 - and len(module_factory_result.builder_kwargs["data_files"]["train"]) > 0 - ) - assert any( - Path(data_file).name == "metadata.jsonl" - for data_file in module_factory_result.builder_kwargs["data_files"]["train"] + builder_config.data_files is not None + and len(builder_config.data_files) == 1 + and len(builder_config.data_files["train"]) > 0 ) + assert any(Path(data_file).name == "metadata.jsonl" for data_file in builder_config.data_files["train"]) @pytest.mark.integration def test_HubDatasetModuleFactoryWithoutScript_with_one_default_config_in_metadata(self): @@ -1014,9 +993,8 @@ def test_load_dataset_builder_with_metadata(): assert builder.config.name == "default" assert builder.config.data_files is not None assert builder.config.drop_metadata is None - builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER4, "non-existing-config") - assert isinstance(builder, ImageFolder) - assert builder.config.name == "non-existing-config" + with pytest.raises(ValueError): + builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER4, "non-existing-config") @pytest.mark.integration From 5cc79dddd390e3d64b7796b618cdc213b05cbffe Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 7 Dec 2023 18:52:02 +0100 Subject: [PATCH 18/37] fix more tests --- src/datasets/load.py | 10 +++++++--- src/datasets/packaged_modules/__init__.py | 1 + src/datasets/packaged_modules/cache/__init__.py | 0 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 src/datasets/packaged_modules/cache/__init__.py diff --git a/src/datasets/load.py b/src/datasets/load.py index 1ee66479f52..031be87b55e 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1162,11 +1162,15 @@ def get_module(self) -> DatasetModule: builder_kwargs = { "hash": hash, - "data_files": data_files, "dataset_name": self.name, } - - return DatasetModule(module_path, hash, builder_kwargs) + builder_configs = [import_main_class(module_path).BUILDER_CONFIG_CLASS(data_files=data_files)] + return DatasetModule( + module_path, + hash, + builder_kwargs, + builder_configs_parameters=BuilderConfigsParameters(builder_configs=builder_configs), + ) class HubDatasetModuleFactoryWithoutScript(_DatasetModuleFactory): diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py index a9fe30d7f5d..91181c4f550 100644 --- a/src/datasets/packaged_modules/__init__.py +++ b/src/datasets/packaged_modules/__init__.py @@ -6,6 +6,7 @@ from .arrow import arrow from .audiofolder import audiofolder +from .cache import cache # noqa F401 from .csv import csv from .imagefolder import imagefolder from .json import json diff --git 
a/src/datasets/packaged_modules/cache/__init__.py b/src/datasets/packaged_modules/cache/__init__.py new file mode 100644 index 00000000000..e69de29bb2d From ba892e1fa13c4202255f5c3d14bb0b31bfd4fadc Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 7 Dec 2023 19:31:20 +0100 Subject: [PATCH 19/37] fix tests --- src/datasets/load.py | 16 ++++++++-------- tests/test_load.py | 28 ++++++++++------------------ 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 031be87b55e..a832582a94d 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1162,15 +1162,11 @@ def get_module(self) -> DatasetModule: builder_kwargs = { "hash": hash, + "data_files": data_files, "dataset_name": self.name, } - builder_configs = [import_main_class(module_path).BUILDER_CONFIG_CLASS(data_files=data_files)] - return DatasetModule( - module_path, - hash, - builder_kwargs, - builder_configs_parameters=BuilderConfigsParameters(builder_configs=builder_configs), - ) + + return DatasetModule(module_path, hash, builder_kwargs) class HubDatasetModuleFactoryWithoutScript(_DatasetModuleFactory): @@ -2265,7 +2261,11 @@ def load_dataset_builder( hash, dataset_module.builder_configs_parameters.metadata_configs[config_name] ) - if path in _PACKAGED_DATASETS_MODULES and data_files is None: + if ( + path in _PACKAGED_DATASETS_MODULES + and data_files is None + and dataset_module.builder_configs_parameters.builder_configs[0].data_files is None + ): error_msg = f"Please specify the data files or data directory to load for the {path} dataset builder." example_extensions = [ extension for extension in _EXTENSION_TO_MODULE if _EXTENSION_TO_MODULE[extension] == path diff --git a/tests/test_load.py b/tests/test_load.py index c7dd34afaee..84444dfbc89 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -575,14 +575,10 @@ def test_PackagedDatasetModuleFactory_with_data_dir(self): factory = PackagedDatasetModuleFactory("json", data_dir=self._data_dir, download_config=self.download_config) module_factory_result = factory.get_module() assert importlib.import_module(module_factory_result.module_path) is not None - builder_config = module_factory_result.builder_configs_parameters.builder_configs[0] - assert ( - builder_config.data_files is not None - and len(builder_config.data_files["train"]) > 0 - and len(builder_config.data_files["test"]) > 0 - ) - assert Path(builder_config.data_files["train"][0]).parent.samefile(self._data_dir) - assert Path(builder_config.data_files["test"][0]).parent.samefile(self._data_dir) + data_files = module_factory_result.builder_kwargs.get("data_files") + assert data_files is not None and len(data_files["train"]) > 0 and len(data_files["test"]) > 0 + assert Path(data_files["train"][0]).parent.samefile(self._data_dir) + assert Path(data_files["test"][0]).parent.samefile(self._data_dir) def test_PackagedDatasetModuleFactory_with_data_dir_and_metadata(self): factory = PackagedDatasetModuleFactory( @@ -590,16 +586,12 @@ def test_PackagedDatasetModuleFactory_with_data_dir_and_metadata(self): ) module_factory_result = factory.get_module() assert importlib.import_module(module_factory_result.module_path) is not None - builder_config = module_factory_result.builder_configs_parameters.builder_configs[0] - assert ( - builder_config.data_files is not None - and len(builder_config.data_files["train"]) > 0 - and len(builder_config.data_files["test"]) > 0 - ) - assert 
Path(builder_config.data_files["train"][0]).parent.samefile(self._data_dir_with_metadata) - assert Path(builder_config.data_files["test"][0]).parent.samefile(self._data_dir_with_metadata) - assert any(Path(data_file).name == "metadata.jsonl" for data_file in builder_config.data_files["train"]) - assert any(Path(data_file).name == "metadata.jsonl" for data_file in builder_config.data_files["test"]) + data_files = module_factory_result.builder_kwargs.get("data_files") + assert data_files is not None and len(data_files["train"]) > 0 and len(data_files["test"]) > 0 + assert Path(data_files["train"][0]).parent.samefile(self._data_dir_with_metadata) + assert Path(data_files["test"][0]).parent.samefile(self._data_dir_with_metadata) + assert any(Path(data_file).name == "metadata.jsonl" for data_file in data_files["train"]) + assert any(Path(data_file).name == "metadata.jsonl" for data_file in data_files["test"]) @pytest.mark.integration def test_HubDatasetModuleFactoryWithoutScript(self): From b9d3b0a008738e54bab8ef781f7fa76bc0fb20d2 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Fri, 8 Dec 2023 17:20:11 +0100 Subject: [PATCH 20/37] fix tests --- src/datasets/builder.py | 2 +- tests/packaged_modules/test_cache.py | 12 ++++++++++-- tests/test_load.py | 1 + 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index d3f3e6e013e..101ccb7feef 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -521,7 +521,7 @@ def _create_builder_config( if self.DEFAULT_CONFIG_NAME is not None: builder_config = self.builder_configs.get(self.DEFAULT_CONFIG_NAME) logger.info(f"No config specified, defaulting to: {self.dataset_name}/{builder_config.name}") - elif not config_kwargs: + else: if len(self.BUILDER_CONFIGS) > 1: example_of_usage = f"load_dataset('{self.dataset_name}', '{self.BUILDER_CONFIGS[0].name}')" raise ValueError( diff --git a/tests/packaged_modules/test_cache.py b/tests/packaged_modules/test_cache.py index 9adb75a7846..82aca51a67b 100644 --- a/tests/packaged_modules/test_cache.py +++ b/tests/packaged_modules/test_cache.py @@ -3,6 +3,7 @@ import pytest from datasets import load_dataset +from datasets.load import configure_builder_class from datasets.packaged_modules.cache.cache import Cache @@ -42,13 +43,20 @@ def test_cache_missing(text_dir: Path): @pytest.mark.integration def test_cache_multi_configs(): repo_id = SAMPLE_DATASET_TWO_CONFIG_IN_METADATA + dataset_name = repo_id.split("/")[-1] config_name = "v1" ds = load_dataset(repo_id, config_name) hash = Path(ds["train"].cache_files[0]["filename"]).parts[-2] - cache = Cache(dataset_name=repo_id.split("/")[-1], repo_id=repo_id, config_name=config_name, hash=hash) + builder_cls = configure_builder_class( + Cache, + builder_configs=[Cache.BUILDER_CONFIG_CLASS(name="v1"), Cache.BUILDER_CONFIG_CLASS(name="v2")], + default_config_name=None, + dataset_name=dataset_name, + ) + cache = builder_cls(dataset_name=dataset_name, repo_id=repo_id, config_name=config_name, hash=hash) reloaded = cache.as_dataset() assert list(ds) == list(reloaded) assert len(ds["train"]) == len(reloaded["train"]) with pytest.raises(ValueError) as excinfo: - Cache(dataset_name=repo_id.split("/")[-1], repo_id=repo_id, config_name="missing", hash=hash) + builder_cls(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", hash=hash) assert config_name in str(excinfo.value) diff --git a/tests/test_load.py b/tests/test_load.py index 84444dfbc89..7f3f0f39e8d 100644 --- a/tests/test_load.py +++ 
b/tests/test_load.py @@ -644,6 +644,7 @@ def test_HubDatasetModuleFactoryWithoutScript_with_metadata(self): ) module_factory_result = factory.get_module() assert importlib.import_module(module_factory_result.module_path) is not None + builder_config = module_factory_result.builder_configs_parameters.builder_configs[0] assert module_factory_result.builder_kwargs["base_path"].startswith(config.HF_ENDPOINT) assert ( builder_config.data_files is not None From 565c294fc12bc547730a023a610ed4f92313d8fb Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 11 Dec 2023 12:38:32 +0100 Subject: [PATCH 21/37] fix cache on config change --- src/datasets/builder.py | 14 +++++++++++++- src/datasets/load.py | 23 ----------------------- tests/test_builder.py | 27 +++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 15f4a0b7d63..369cd72d43b 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -218,6 +218,16 @@ def _resolve_data_files(self, base_path: str, download_config: DownloadConfig) - base_path = xjoin(base_path, self.data_dir) if self.data_dir else base_path self.data_files = self.data_files.resolve(base_path, download_config) + def _update_hash(self, hash: str) -> str: + """ + Used to update hash of packaged modules which is used for creating unique cache directories to reflect + different config parameters which are passed in metadata from readme. + """ + m = Hasher() + m.update(hash) + m.update(self) + return m.hexdigest() + class DatasetBuilder: """Abstract base class for all datasets. @@ -381,6 +391,8 @@ def __init__( custom_features=features, **config_kwargs, ) + if self.config.name in self.builder_configs: + self.hash = self.builder_configs[self.config.name]._update_hash(hash) # prepare info: DatasetInfo are a standardized dataclass across all datasets # Prefill datasetinfo @@ -601,7 +613,7 @@ def _create_builder_config( @classproperty @classmethod @memoize() - def builder_configs(cls): + def builder_configs(cls) -> Dict[str, BuilderConfig]: """Dictionary of pre-defined configurations for this builder class.""" configs = {config.name: config for config in cls.BUILDER_CONFIGS} if len(configs) != len(cls.BUILDER_CONFIGS): diff --git a/src/datasets/load.py b/src/datasets/load.py index 19d54834199..8270a4d23ff 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -53,7 +53,6 @@ from .download.streaming_download_manager import StreamingDownloadManager, xbasename, xglob, xjoin from .exceptions import DataFilesNotFoundError, DatasetNotFoundError from .features import Features -from .fingerprint import Hasher from .info import DatasetInfo, DatasetInfosDict from .iterable_dataset import IterableDataset from .metric import Metric @@ -591,21 +590,6 @@ def infer_module_for_data_files( return module_name, default_builder_kwargs -def update_hash_with_config_parameters(hash: str, config_parameters: dict) -> str: - """ - Used to update hash of packaged modules which is used for creating unique cache directories to reflect - different config parameters which are passed in metadata from readme. 
- """ - params_to_exclude = {"config_name", "version", "description"} - params_to_add_to_hash = { - param: value for param, value in sorted(config_parameters.items()) if param not in params_to_exclude - } - m = Hasher() - m.update(hash) - m.update(params_to_add_to_hash) - return m.hexdigest() - - def create_builder_configs_from_metadata_configs( module_path: str, metadata_configs: MetadataConfigs, @@ -2178,13 +2162,6 @@ def load_dataset_builder( dataset_name = builder_kwargs.pop("dataset_name", None) hash = builder_kwargs.pop("hash") info = dataset_module.dataset_infos.get(config_name) if dataset_module.dataset_infos else None - if ( - dataset_module.builder_configs_parameters.metadata_configs - and config_name in dataset_module.builder_configs_parameters.metadata_configs - ): - hash = update_hash_with_config_parameters( - hash, dataset_module.builder_configs_parameters.metadata_configs[config_name] - ) if path in _PACKAGED_DATASETS_MODULES and data_files is None: error_msg = f"Please specify the data files or data directory to load for the {path} dataset builder." diff --git a/tests/test_builder.py b/tests/test_builder.py index 54bae47ae08..feaa27ce8a6 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -23,6 +23,7 @@ from datasets.features import Features, Value from datasets.info import DatasetInfo, PostProcessedInfo from datasets.iterable_dataset import IterableDataset +from datasets.load import configure_builder_class from datasets.splits import Split, SplitDict, SplitGenerator, SplitInfo from datasets.streaming import xjoin from datasets.utils.file_utils import is_local_path @@ -830,6 +831,32 @@ def test_cache_dir_for_data_dir(self): other_builder = DummyBuilderWithManualDownload(cache_dir=tmp_dir, config_name="a", data_dir=tmp_dir) self.assertNotEqual(builder.cache_dir, other_builder.cache_dir) + def test_cache_dir_for_configured_builder(self): + with tempfile.TemporaryDirectory() as tmp_dir, tempfile.TemporaryDirectory() as data_dir: + builder_cls = configure_builder_class( + DummyBuilderWithManualDownload, + builder_configs=[BuilderConfig(data_dir=data_dir)], + default_config_name=None, + dataset_name="dummy", + ) + other_builder_cls = configure_builder_class( + DummyBuilderWithManualDownload, + builder_configs=[BuilderConfig(data_dir=data_dir)], + default_config_name=None, + dataset_name="dummy", + ) + builder = builder_cls(cache_dir=tmp_dir) + other_builder = other_builder_cls(cache_dir=tmp_dir) + self.assertEqual(builder.cache_dir, other_builder.cache_dir) + other_builder_cls = configure_builder_class( + DummyBuilderWithManualDownload, + builder_configs=[BuilderConfig(data_dir=tmp_dir)], + default_config_name=None, + dataset_name="dummy", + ) + other_builder = other_builder_cls(cache_dir=tmp_dir) + self.assertNotEqual(builder.cache_dir, other_builder.cache_dir) + def test_arrow_based_download_and_prepare(tmp_path): builder = DummyArrowBasedBuilder(cache_dir=tmp_path) From e01938d23b0ed7f0170d815dde4cf75eac34a784 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 11 Dec 2023 12:45:15 +0100 Subject: [PATCH 22/37] simpler --- src/datasets/builder.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 369cd72d43b..118e589eac3 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -391,8 +391,7 @@ def __init__( custom_features=features, **config_kwargs, ) - if self.config.name in self.builder_configs: - self.hash = self.builder_configs[self.config.name]._update_hash(hash) + 
self.hash = self.config._update_hash(hash) # prepare info: DatasetInfo are a standardized dataclass across all datasets # Prefill datasetinfo From 09516db55ec5f6fd33f1eeaac973a02f3f2bf58d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Mon, 11 Dec 2023 13:05:36 +0100 Subject: [PATCH 23/37] fix tests --- src/datasets/builder.py | 11 ++++++++--- tests/test_builder.py | 6 +++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 118e589eac3..9412dadfb2e 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -26,7 +26,7 @@ import time import urllib import warnings -from dataclasses import dataclass +from dataclasses import dataclass, fields from functools import partial from pathlib import Path from typing import Dict, Iterable, Mapping, Optional, Tuple, Union @@ -225,7 +225,11 @@ def _update_hash(self, hash: str) -> str: """ m = Hasher() m.update(hash) - m.update(self) + for field in sorted(field.name for field in fields(self)): + try: + m.update(getattr(self, field)) + except TypeError: + continue return m.hexdigest() @@ -391,7 +395,8 @@ def __init__( custom_features=features, **config_kwargs, ) - self.hash = self.config._update_hash(hash) + if self.hash is not None: + self.hash = self.config._update_hash(hash) # prepare info: DatasetInfo are a standardized dataclass across all datasets # Prefill datasetinfo diff --git a/tests/test_builder.py b/tests/test_builder.py index feaa27ce8a6..5e89235fc3a 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -845,8 +845,8 @@ def test_cache_dir_for_configured_builder(self): default_config_name=None, dataset_name="dummy", ) - builder = builder_cls(cache_dir=tmp_dir) - other_builder = other_builder_cls(cache_dir=tmp_dir) + builder = builder_cls(cache_dir=tmp_dir, hash="abc") + other_builder = other_builder_cls(cache_dir=tmp_dir, hash="abc") self.assertEqual(builder.cache_dir, other_builder.cache_dir) other_builder_cls = configure_builder_class( DummyBuilderWithManualDownload, @@ -854,7 +854,7 @@ def test_cache_dir_for_configured_builder(self): default_config_name=None, dataset_name="dummy", ) - other_builder = other_builder_cls(cache_dir=tmp_dir) + other_builder = other_builder_cls(cache_dir=tmp_dir, hash="abc") self.assertNotEqual(builder.cache_dir, other_builder.cache_dir) From 46ae73de7dc80710f78c64fdce4a319e39fa7ca2 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 12 Dec 2023 18:14:26 +0100 Subject: [PATCH 24/37] make both PRs compatible --- src/datasets/builder.py | 16 ---------------- src/datasets/load.py | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 8513c4d6325..9156208be9f 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -218,20 +218,6 @@ def _resolve_data_files(self, base_path: str, download_config: DownloadConfig) - base_path = xjoin(base_path, self.data_dir) if self.data_dir else base_path self.data_files = self.data_files.resolve(base_path, download_config) - def _update_hash(self, hash: str) -> str: - """ - Used to update hash of packaged modules which is used for creating unique cache directories to reflect - different config parameters which are passed in metadata from readme. 
- """ - m = Hasher() - m.update(hash) - for field in sorted(field.name for field in fields(self)): - try: - m.update(getattr(self, field)) - except TypeError: - continue - return m.hexdigest() - class DatasetBuilder: """Abstract base class for all datasets. @@ -395,8 +381,6 @@ def __init__( custom_features=features, **config_kwargs, ) - if self.hash is not None: - self.hash = self.config._update_hash(hash) # prepare info: DatasetInfo are a standardized dataclass across all datasets # Prefill datasetinfo diff --git a/src/datasets/load.py b/src/datasets/load.py index 1b72b4c873e..1729c21a6fe 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -54,6 +54,7 @@ from .download.streaming_download_manager import StreamingDownloadManager, xbasename, xglob, xjoin from .exceptions import DataFilesNotFoundError, DatasetNotFoundError from .features import Features +from .fingerprint import Hasher from .info import DatasetInfo, DatasetInfosDict from .iterable_dataset import IterableDataset from .metric import Metric @@ -591,6 +592,17 @@ def infer_module_for_data_files( return module_name, default_builder_kwargs +def update_hash_for_cache(hash: str, cache_kwargs: dict) -> str: + """ + Used to update hash of packaged modules which is used for creating unique cache directories to reflect + different config parameters which are passed in metadata from readme. + """ + m = Hasher() + m.update(hash) + m.update(cache_kwargs) + return m.hexdigest() + + def create_builder_configs_from_metadata_configs( module_path: str, metadata_configs: MetadataConfigs, @@ -1058,6 +1070,7 @@ def get_module(self) -> DatasetModule: ) ] default_config_name = None + hash = update_hash_for_cache(hash, {config.name: config.data_files for config in builder_configs}) builder_kwargs = { "hash": hash, "base_path": self.path, @@ -1253,6 +1266,7 @@ def get_module(self) -> DatasetModule: ) ] default_config_name = None + hash = update_hash_for_cache(hash, revision) builder_kwargs = { "hash": hash, "base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"), From 64af455550afed3598cd7100ace29fe3b3d3b44c Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 12 Dec 2023 18:29:57 +0100 Subject: [PATCH 25/37] style --- src/datasets/builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/builder.py b/src/datasets/builder.py index 9156208be9f..22077fd0249 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -26,7 +26,7 @@ import time import urllib import warnings -from dataclasses import dataclass, fields +from dataclasses import dataclass from functools import partial from pathlib import Path from typing import Dict, Iterable, Mapping, Optional, Tuple, Union From 0050b200a6942e7f35fa03e11e3c9b196e10d2f5 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 12 Dec 2023 18:47:22 +0100 Subject: [PATCH 26/37] fix tests --- tests/test_builder.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/tests/test_builder.py b/tests/test_builder.py index 5e89235fc3a..3722a728147 100644 --- a/tests/test_builder.py +++ b/tests/test_builder.py @@ -839,22 +839,10 @@ def test_cache_dir_for_configured_builder(self): default_config_name=None, dataset_name="dummy", ) - other_builder_cls = configure_builder_class( - DummyBuilderWithManualDownload, - builder_configs=[BuilderConfig(data_dir=data_dir)], - default_config_name=None, - dataset_name="dummy", - ) builder = builder_cls(cache_dir=tmp_dir, hash="abc") - other_builder = other_builder_cls(cache_dir=tmp_dir, 
hash="abc") + other_builder = builder_cls(cache_dir=tmp_dir, hash="abc") self.assertEqual(builder.cache_dir, other_builder.cache_dir) - other_builder_cls = configure_builder_class( - DummyBuilderWithManualDownload, - builder_configs=[BuilderConfig(data_dir=tmp_dir)], - default_config_name=None, - dataset_name="dummy", - ) - other_builder = other_builder_cls(cache_dir=tmp_dir, hash="abc") + other_builder = builder_cls(cache_dir=tmp_dir, hash="def") self.assertNotEqual(builder.cache_dir, other_builder.cache_dir) From c141bfae844b6586de419394135b8cb887336adb Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 12 Dec 2023 23:57:16 +0100 Subject: [PATCH 27/37] fix tests --- src/datasets/load.py | 52 ++++-------------- src/datasets/packaged_modules/cache/cache.py | 55 +++++++++++++++++++- 2 files changed, 63 insertions(+), 44 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 1729c21a6fe..e71f2316aeb 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -592,14 +592,15 @@ def infer_module_for_data_files( return module_name, default_builder_kwargs -def update_hash_for_cache(hash: str, cache_kwargs: dict) -> str: +def update_hash_for_cache(hash: str, metadata_configs: MetadataConfigs, dataset_infos: DatasetInfosDict) -> str: """ Used to update hash of packaged modules which is used for creating unique cache directories to reflect different config parameters which are passed in metadata from readme. """ m = Hasher() m.update(hash) - m.update(cache_kwargs) + m.update(metadata_configs) + m.update(dataset_infos) return m.hexdigest() @@ -988,7 +989,7 @@ def get_module(self) -> DatasetModule: # make the new module to be noticed by the import system importlib.invalidate_caches() - builder_kwargs = {"hash": hash, "base_path": str(Path(self.path).parent)} + builder_kwargs = {"base_path": str(Path(self.path).parent)} return DatasetModule(module_path, hash, builder_kwargs) @@ -1070,9 +1071,7 @@ def get_module(self) -> DatasetModule: ) ] default_config_name = None - hash = update_hash_for_cache(hash, {config.name: config.data_files for config in builder_configs}) builder_kwargs = { - "hash": hash, "base_path": self.path, "dataset_name": camelcase_to_snakecase(Path(self.path).name), } @@ -1094,6 +1093,7 @@ def get_module(self) -> DatasetModule: if default_config_name is None and len(dataset_infos) == 1: default_config_name = next(iter(dataset_infos)) + hash = update_hash_for_cache(hash, metadata_configs=metadata_configs, dataset_infos=dataset_infos) return DatasetModule( module_path, hash, @@ -1154,7 +1154,6 @@ def get_module(self) -> DatasetModule: module_path, hash = _PACKAGED_DATASETS_MODULES[self.name] builder_kwargs = { - "hash": hash, "data_files": data_files, "dataset_name": self.name, } @@ -1266,9 +1265,7 @@ def get_module(self) -> DatasetModule: ) ] default_config_name = None - hash = update_hash_for_cache(hash, revision) builder_kwargs = { - "hash": hash, "base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"), "repo_id": self.name, "dataset_name": camelcase_to_snakecase(Path(self.name).name), @@ -1300,6 +1297,7 @@ def get_module(self) -> DatasetModule: if default_config_name is None and len(dataset_infos) == 1: default_config_name = next(iter(dataset_infos)) + hash = update_hash_for_cache(hash, metadata_configs=metadata_configs, dataset_infos=dataset_infos) return DatasetModule( module_path, hash, @@ -1355,7 +1353,6 @@ def get_module(self) -> DatasetModule: supports_metadata=False, ) builder_kwargs = { - "hash": hash, "repo_id": self.name, 
"dataset_name": camelcase_to_snakecase(Path(self.name).name), } @@ -1498,7 +1495,6 @@ def get_module(self) -> DatasetModule: # make the new module to be noticed by the import system importlib.invalidate_caches() builder_kwargs = { - "hash": hash, "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"), "repo_id": self.name, } @@ -1560,7 +1556,6 @@ def _get_modification_time(module_hash): ) importlib.invalidate_caches() builder_kwargs = { - "hash": hash, "repo_id": self.name, } return DatasetModule(module_path, hash, builder_kwargs) @@ -1572,42 +1567,14 @@ def _get_modification_time(module_hash): if os.path.isdir(cached_directory_path) ] if cached_directory_paths: - # get most recent - def _get_modification_time(cached_directory_path): - return (Path(cached_directory_path)).stat().st_mtime - - hash = Path(sorted(cached_directory_paths, key=_get_modification_time)[-1]).name - cached_directory_paths = [ - cached_directory_path - for cached_directory_path in cached_directory_paths - if Path(cached_directory_path).name == hash - ] - warning_msg = ( - f"Using the latest cached version of the dataset from {cached_datasets_directory_path_root}/*/*/{hash}" - f"(last modified on {time.ctime(_get_modification_time(cached_directory_paths[0]))}) since it " - f"couldn't be found locally at {self.name}." - ) - if not config.HF_DATASETS_OFFLINE: - warning_msg += ", or remotely on the Hugging Face Hub." - logger.warning(warning_msg) - module_path = "datasets.packaged_modules.cache.cache" builder_kwargs = { - "hash": hash, "repo_id": self.name, "dataset_name": self.name.split("/")[-1], } - dataset_infos = DatasetInfosDict() - builder_configs = [] - for cached_directory_path in cached_directory_paths: - config_name = Path(cached_directory_path).parts[-3] - dataset_infos[config_name] = DatasetInfo.from_directory(cached_directory_path) - builder_configs.append(import_main_class(module_path).BUILDER_CONFIG_CLASS(name=config_name)) return DatasetModule( - module_path, - hash, + "datasets.packaged_modules.cache.cache", + "auto", builder_kwargs, - dataset_infos=dataset_infos, - builder_configs_parameters=BuilderConfigsParameters(builder_configs=builder_configs), ) raise FileNotFoundError(f"Dataset {self.name} is not cached in {self.cache_dir}") @@ -2243,7 +2210,6 @@ def load_dataset_builder( "config_name", name or dataset_module.builder_configs_parameters.default_config_name ) dataset_name = builder_kwargs.pop("dataset_name", None) - hash = builder_kwargs.pop("hash") info = dataset_module.dataset_infos.get(config_name) if dataset_module.dataset_infos else None if ( @@ -2267,7 +2233,7 @@ def load_dataset_builder( config_name=config_name, data_dir=data_dir, data_files=data_files, - hash=hash, + hash=dataset_module.hash, info=info, features=features, token=token, diff --git a/src/datasets/packaged_modules/cache/cache.py b/src/datasets/packaged_modules/cache/cache.py index 876167c4195..93ba3becdd0 100644 --- a/src/datasets/packaged_modules/cache/cache.py +++ b/src/datasets/packaged_modules/cache/cache.py @@ -1,24 +1,77 @@ +import glob import os import shutil +import time +from pathlib import Path from typing import List, Optional import pyarrow as pa import datasets +import datasets.config from datasets.naming import filenames_for_dataset_split logger = datasets.utils.logging.get_logger(__name__) +def _get_modification_time(cached_directory_path): + return (Path(cached_directory_path)).stat().st_mtime + + +def _find_hash_in_cache(repo_id: str, config_name: Optional[str], cache_dir: Optional[str]) -> 
str: + cache_dir = os.path.expanduser(str(cache_dir or datasets.config.HF_DATASETS_CACHE)) + cached_datasets_directory_path_root = os.path.join(cache_dir, repo_id.replace("/", "___")) + cached_directory_paths = [ + cached_directory_path + for cached_directory_path in glob.glob( + os.path.join(cached_datasets_directory_path_root, config_name or "*", "*", "*") + ) + if os.path.isdir(cached_directory_path) + ] + if not cached_directory_paths: + raise FileNotFoundError( + f"Couldn't find cache for{repo_id}{f' for config {config_name}' if config_name else ''}" + ) + # get most recent + hash = Path(sorted(cached_directory_paths, key=_get_modification_time)[-1]).name + warning_msg = ( + f"Using the latest cached version of the dataset from {cached_datasets_directory_path_root}/*/*/{hash}" + f"(last modified on {time.ctime(_get_modification_time(cached_directory_paths[0]))}) since {repo_id} " + f"couldn't be found on the Hugging Face Hub" + ) + if datasets.config.HF_DATASETS_OFFLINE: + warning_msg += " (offline mode is enabled)." + logger.warning(warning_msg) + return hash + + class Cache(datasets.ArrowBasedBuilder): + def __init__( + self, + cache_dir: Optional[str] = None, + config_name: Optional[str] = None, + hash: Optional[str] = None, + repo_id: Optional[str] = None, + **kwargs, + ): + if repo_id is None: + raise ValueError("repo_id is required for the Cache dataset builder") + if hash == "auto": + hash = _find_hash_in_cache( + repo_id=repo_id, + config_name=config_name, + cache_dir=cache_dir, + ) + super().__init__(cache_dir=cache_dir, config_name=config_name, hash=hash, repo_id=repo_id, **kwargs) + def _info(self) -> datasets.DatasetInfo: return datasets.DatasetInfo() def download_and_prepare(self, output_dir: Optional[str] = None, *args, **kwargs): if not os.path.exists(self.cache_dir): raise ValueError(f"Cache directory for {self.dataset_name} doesn't exist at {self.cache_dir}") - if output_dir is not None: + if output_dir is not None and output_dir != self.cache_dir: shutil.copytree(self.cache_dir, output_dir) def _split_generators(self, dl_manager): From 70494f1828b19c31498925e7b79b289c72a942b4 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 13 Dec 2023 00:27:27 +0100 Subject: [PATCH 28/37] fix tests --- src/datasets/packaged_modules/cache/cache.py | 39 +++++++++++++++----- tests/packaged_modules/test_cache.py | 29 +++++++-------- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/src/datasets/packaged_modules/cache/cache.py b/src/datasets/packaged_modules/cache/cache.py index 93ba3becdd0..d2af93ab3c7 100644 --- a/src/datasets/packaged_modules/cache/cache.py +++ b/src/datasets/packaged_modules/cache/cache.py @@ -19,9 +19,9 @@ def _get_modification_time(cached_directory_path): return (Path(cached_directory_path)).stat().st_mtime -def _find_hash_in_cache(repo_id: str, config_name: Optional[str], cache_dir: Optional[str]) -> str: +def _find_hash_in_cache(dataset_name: str, config_name: Optional[str], cache_dir: Optional[str]) -> str: cache_dir = os.path.expanduser(str(cache_dir or datasets.config.HF_DATASETS_CACHE)) - cached_datasets_directory_path_root = os.path.join(cache_dir, repo_id.replace("/", "___")) + cached_datasets_directory_path_root = os.path.join(cache_dir, dataset_name.replace("/", "___")) cached_directory_paths = [ cached_directory_path for cached_directory_path in glob.glob( @@ -30,14 +30,27 @@ def _find_hash_in_cache(repo_id: str, config_name: Optional[str], cache_dir: Opt if os.path.isdir(cached_directory_path) ] if not cached_directory_paths: - 
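(Aside, for orientation only: a rough standalone sketch of the cache layout this helper scans; the repo id below is a placeholder, not part of the patch. The equivalent glob over the standard `datasets` cache would be roughly:

    import glob
    import os

    import datasets.config

    repo_id = "username/my_dataset"  # placeholder
    cache_root = os.path.expanduser(str(datasets.config.HF_DATASETS_CACHE))
    # one match per prepared (config_name, version, hash) triple, e.g.
    #   .../username___my_dataset/default/0.0.0/<hash>/
    pattern = os.path.join(cache_root, repo_id.replace("/", "___"), "*", "*", "*")
    print(sorted(glob.glob(pattern)))

)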
raise FileNotFoundError( - f"Couldn't find cache for{repo_id}{f' for config {config_name}' if config_name else ''}" + if config_name is not None: + cached_directory_paths = [ + cached_directory_path + for cached_directory_path in glob.glob( + os.path.join(cached_datasets_directory_path_root, "*", "*", "*") + ) + if os.path.isdir(cached_directory_path) + ] + available_configs = sorted( + {Path(cached_directory_path).parts[-3] for cached_directory_path in cached_directory_paths} + ) + raise ValueError( + f"Couldn't find cache for {dataset_name}" + + (f" for config '{config_name}'" if config_name else "") + + (f"\nAvailable configs in the cache: {available_configs}" if available_configs else "") ) # get most recent hash = Path(sorted(cached_directory_paths, key=_get_modification_time)[-1]).name warning_msg = ( f"Using the latest cached version of the dataset from {cached_datasets_directory_path_root}/*/*/{hash}" - f"(last modified on {time.ctime(_get_modification_time(cached_directory_paths[0]))}) since {repo_id} " + f"(last modified on {time.ctime(_get_modification_time(cached_directory_paths[0]))}) since {dataset_name} " f"couldn't be found on the Hugging Face Hub" ) if datasets.config.HF_DATASETS_OFFLINE: @@ -50,20 +63,28 @@ class Cache(datasets.ArrowBasedBuilder): def __init__( self, cache_dir: Optional[str] = None, + dataset_name: Optional[str] = None, config_name: Optional[str] = None, hash: Optional[str] = None, repo_id: Optional[str] = None, **kwargs, ): - if repo_id is None: - raise ValueError("repo_id is required for the Cache dataset builder") + if repo_id is None and dataset_name is None: + raise ValueError("repo_id or dataset_name is required for the Cache dataset builder") if hash == "auto": hash = _find_hash_in_cache( - repo_id=repo_id, + dataset_name=repo_id or dataset_name, config_name=config_name, cache_dir=cache_dir, ) - super().__init__(cache_dir=cache_dir, config_name=config_name, hash=hash, repo_id=repo_id, **kwargs) + super().__init__( + cache_dir=cache_dir, + dataset_name=dataset_name, + config_name=config_name, + hash=hash, + repo_id=repo_id, + **kwargs, + ) def _info(self) -> datasets.DatasetInfo: return datasets.DatasetInfo() diff --git a/tests/packaged_modules/test_cache.py b/tests/packaged_modules/test_cache.py index 82aca51a67b..e43deec2879 100644 --- a/tests/packaged_modules/test_cache.py +++ b/tests/packaged_modules/test_cache.py @@ -3,7 +3,6 @@ import pytest from datasets import load_dataset -from datasets.load import configure_builder_class from datasets.packaged_modules.cache.cache import Cache @@ -28,16 +27,23 @@ def test_cache_streaming(text_dir: Path): assert list(ds["train"]) == list(reloaded["train"]) -def test_cache_missing(text_dir: Path): +def test_cache_auto_hash(text_dir: Path): ds = load_dataset(str(text_dir)) - hash = Path(ds["train"].cache_files[0]["filename"]).parts[-2] - Cache(dataset_name=text_dir.name, hash=hash).download_and_prepare() + cache = Cache(dataset_name=text_dir.name, hash="auto") + reloaded = cache.as_dataset() + assert list(ds) == list(reloaded) + assert list(ds["train"]) == list(reloaded["train"]) + + +def test_cache_missing(text_dir: Path): + load_dataset(str(text_dir)) + Cache(dataset_name=text_dir.name, hash="auto").download_and_prepare() with pytest.raises(ValueError): - Cache(dataset_name="missing", hash=hash).download_and_prepare() + Cache(dataset_name="missing", hash="auto").download_and_prepare() with pytest.raises(ValueError): Cache(dataset_name=text_dir.name, hash="missing").download_and_prepare() with 
pytest.raises(ValueError): - Cache(dataset_name=text_dir.name, config_name="missing", hash=hash).download_and_prepare() + Cache(dataset_name=text_dir.name, config_name="missing", hash="auto").download_and_prepare() @pytest.mark.integration @@ -46,17 +52,10 @@ def test_cache_multi_configs(): dataset_name = repo_id.split("/")[-1] config_name = "v1" ds = load_dataset(repo_id, config_name) - hash = Path(ds["train"].cache_files[0]["filename"]).parts[-2] - builder_cls = configure_builder_class( - Cache, - builder_configs=[Cache.BUILDER_CONFIG_CLASS(name="v1"), Cache.BUILDER_CONFIG_CLASS(name="v2")], - default_config_name=None, - dataset_name=dataset_name, - ) - cache = builder_cls(dataset_name=dataset_name, repo_id=repo_id, config_name=config_name, hash=hash) + cache = Cache(dataset_name=dataset_name, repo_id=repo_id, config_name=config_name, hash="auto") reloaded = cache.as_dataset() assert list(ds) == list(reloaded) assert len(ds["train"]) == len(reloaded["train"]) with pytest.raises(ValueError) as excinfo: - builder_cls(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", hash=hash) + Cache(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", hash="auto") assert config_name in str(excinfo.value) From 81b3ccfc016f6a39837334a0173dac3f59112856 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 13 Dec 2023 11:16:06 +0100 Subject: [PATCH 29/37] fix test --- src/datasets/load.py | 4 ++++ src/datasets/packaged_modules/cache/cache.py | 15 +++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index e71f2316aeb..6208b21f610 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1571,6 +1571,10 @@ def _get_modification_time(module_hash): "repo_id": self.name, "dataset_name": self.name.split("/")[-1], } + warning_msg = f"Using the latest cached version of the dataset since {self.name} couldn't be found on the Hugging Face Hub" + if config.HF_DATASETS_OFFLINE: + warning_msg += " (offline mode is enabled)." + logger.warning(warning_msg) return DatasetModule( "datasets.packaged_modules.cache.cache", "auto", diff --git a/src/datasets/packaged_modules/cache/cache.py b/src/datasets/packaged_modules/cache/cache.py index d2af93ab3c7..0c3c9361952 100644 --- a/src/datasets/packaged_modules/cache/cache.py +++ b/src/datasets/packaged_modules/cache/cache.py @@ -47,15 +47,14 @@ def _find_hash_in_cache(dataset_name: str, config_name: Optional[str], cache_dir + (f"\nAvailable configs in the cache: {available_configs}" if available_configs else "") ) # get most recent - hash = Path(sorted(cached_directory_paths, key=_get_modification_time)[-1]).name - warning_msg = ( - f"Using the latest cached version of the dataset from {cached_datasets_directory_path_root}/*/*/{hash}" - f"(last modified on {time.ctime(_get_modification_time(cached_directory_paths[0]))}) since {dataset_name} " - f"couldn't be found on the Hugging Face Hub" + cached_directory_path = Path(sorted(cached_directory_paths, key=_get_modification_time)[-1]) + config_name = cached_directory_path.parts[-3] + hash = cached_directory_path.name + warnong_msg = ( + f"Found the latest cached dataset configuration '{config_name}' at {cached_directory_path} " + f"(last modified on {time.ctime(_get_modification_time(cached_directory_path))})." ) - if datasets.config.HF_DATASETS_OFFLINE: - warning_msg += " (offline mode is enabled)." 
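(Aside: a minimal sketch of how the `hash="auto"` resolution exercised by these tests might be used directly; the repo, dataset and config names below are placeholders, and the cache is assumed to have been populated by an earlier `load_dataset` call:

    from datasets import load_dataset
    from datasets.packaged_modules.cache.cache import Cache

    # First load (while online) prepares the Arrow files in the local cache.
    ds = load_dataset("username/my_dataset", "v1")

    # Later, the Cache builder can reload the most recently prepared copy:
    # hash="auto" walks <cache_root>/<dataset_name>/<config>/<version>/<hash>
    # and picks the most recently modified directory.
    cache = Cache(
        dataset_name="my_dataset",
        repo_id="username/my_dataset",
        config_name="v1",
        hash="auto",
    )
    cache.download_and_prepare()  # raises if the cache dir is missing; otherwise nothing to download
    reloaded = cache.as_dataset()

)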
- logger.warning(warning_msg) + logger.warning(warnong_msg) return hash From 4608447d34de3011d5f7c158ba7baf6e9859fd1c Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 13 Dec 2023 15:06:29 +0100 Subject: [PATCH 30/37] update hash when loading from parquet export too --- src/datasets/load.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 6208b21f610..8fb1589090e 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1352,6 +1352,13 @@ def get_module(self) -> DatasetModule: metadata_configs, supports_metadata=False, ) + dataset_infos = DatasetInfosDict( + { + config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name]) + for config_name in exported_dataset_infos + } + ) + hash = update_hash_for_cache(hash, metadata_configs=metadata_configs, dataset_infos=dataset_infos) builder_kwargs = { "repo_id": self.name, "dataset_name": camelcase_to_snakecase(Path(self.name).name), @@ -1361,12 +1368,7 @@ def get_module(self) -> DatasetModule: module_path, hash, builder_kwargs, - dataset_infos=DatasetInfosDict( - { - config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name]) - for config_name in exported_dataset_infos - } - ), + dataset_infos=dataset_infos, builder_configs_parameters=BuilderConfigsParameters( metadata_configs=metadata_configs, builder_configs=builder_configs, From c3ddbd0ca2a301ed44bcc06839e6b4e85ee1b3ba Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 13 Dec 2023 16:10:15 +0100 Subject: [PATCH 31/37] fix modify files --- src/datasets/load.py | 18 ++++++++++++++---- tests/test_load.py | 12 ++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 8fb1589090e..6aba18713ee 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -592,15 +592,15 @@ def infer_module_for_data_files( return module_name, default_builder_kwargs -def update_hash_for_cache(hash: str, metadata_configs: MetadataConfigs, dataset_infos: DatasetInfosDict) -> str: +def update_hash_for_cache(hash: str, **kwargs: Dict[str, Union[MetadataConfigs, DataFilesDict, DataFilesDict]]) -> str: """ Used to update hash of packaged modules which is used for creating unique cache directories to reflect different config parameters which are passed in metadata from readme. 
""" m = Hasher() m.update(hash) - m.update(metadata_configs) - m.update(dataset_infos) + for obj in kwargs.values(): + m.update(obj) return m.hexdigest() @@ -2231,7 +2231,17 @@ def load_dataset_builder( error_msg += f'\nFor example `data_files={{"train": "path/to/data/train/*.{example_extensions[0]}"}}`' raise ValueError(error_msg) + hash = dataset_module.hash builder_cls = get_dataset_builder_class(dataset_module, dataset_name=dataset_name) + if len(builder_cls.builder_configs) > 1: + builder_config = builder_cls.builder_configs.get(config_name or builder_cls.DEFAULT_CONFIG_NAME) + elif len(builder_cls.builder_configs) == 1: + builder_config = builder_cls.BUILDER_CONFIGS[0] + else: + builder_config = None + if builder_config and builder_config.data_files is not None: + builder_config._resolve_data_files(base_path=builder_kwargs["base_path"], download_config=download_config) + hash = update_hash_for_cache(hash, data_files=builder_config.data_files) # Instantiate the dataset builder builder_instance: DatasetBuilder = builder_cls( cache_dir=cache_dir, @@ -2239,7 +2249,7 @@ def load_dataset_builder( config_name=config_name, data_dir=data_dir, data_files=data_files, - hash=dataset_module.hash, + hash=hash, info=info, features=features, token=token, diff --git a/tests/test_load.py b/tests/test_load.py index 6f00ed58e6f..acc9a2680a5 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -1503,6 +1503,18 @@ def test_load_dataset_then_move_then_reload(dataset_loading_script_dir, data_dir assert dataset._fingerprint != fingerprint1 +def test_load_dataset_builder_then_edit_then_load_again(tmp_path: Path): + dataset_dir = tmp_path / "test_load_dataset_then_edit_then_load_again" + dataset_dir.mkdir() + with open(dataset_dir / "train.txt", "w") as f: + f.write("Hello there") + dataset_builder = load_dataset_builder(str(dataset_dir)) + with open(dataset_dir / "train.txt", "w") as f: + f.write("General Kenobi !") + edited_dataset_builder = load_dataset_builder(str(dataset_dir)) + assert dataset_builder.cache_dir != edited_dataset_builder.cache_dir + + def test_load_dataset_readonly(dataset_loading_script_dir, dataset_loading_script_dir_readonly, data_dir, tmp_path): cache_dir1 = tmp_path / "cache1" cache_dir2 = tmp_path / "cache2" From 5c9a6a24b185efdd519935576077f12f105b7f99 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 13 Dec 2023 16:24:38 +0100 Subject: [PATCH 32/37] fix base_path --- src/datasets/load.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 6aba18713ee..e3427ac4eba 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -2240,7 +2240,9 @@ def load_dataset_builder( else: builder_config = None if builder_config and builder_config.data_files is not None: - builder_config._resolve_data_files(base_path=builder_kwargs["base_path"], download_config=download_config) + builder_config._resolve_data_files( + base_path=builder_kwargs.get("base_path", os.path.abspath(".")), download_config=download_config + ) hash = update_hash_for_cache(hash, data_files=builder_config.data_files) # Instantiate the dataset builder builder_instance: DatasetBuilder = builder_cls( From 579c785463784c8f674daea95ad27ddf143d30bb Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 13 Dec 2023 20:43:56 +0100 Subject: [PATCH 33/37] just use the commit sha as hash --- src/datasets/load.py | 38 +++++++------------------------------- 1 file changed, 7 insertions(+), 31 deletions(-) diff --git a/src/datasets/load.py 
b/src/datasets/load.py index e3427ac4eba..4849add7a31 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -592,18 +592,6 @@ def infer_module_for_data_files( return module_name, default_builder_kwargs -def update_hash_for_cache(hash: str, **kwargs: Dict[str, Union[MetadataConfigs, DataFilesDict, DataFilesDict]]) -> str: - """ - Used to update hash of packaged modules which is used for creating unique cache directories to reflect - different config parameters which are passed in metadata from readme. - """ - m = Hasher() - m.update(hash) - for obj in kwargs.values(): - m.update(obj) - return m.hexdigest() - - def create_builder_configs_from_metadata_configs( module_path: str, metadata_configs: MetadataConfigs, @@ -1054,7 +1042,7 @@ def get_module(self) -> DatasetModule: } ) - module_path, hash = _PACKAGED_DATASETS_MODULES[module_name] + module_path, _ = _PACKAGED_DATASETS_MODULES[module_name] if metadata_configs: builder_configs, default_config_name = create_builder_configs_from_metadata_configs( module_path, @@ -1093,7 +1081,7 @@ def get_module(self) -> DatasetModule: if default_config_name is None and len(dataset_infos) == 1: default_config_name = next(iter(dataset_infos)) - hash = update_hash_for_cache(hash, metadata_configs=metadata_configs, dataset_infos=dataset_infos) + hash = Hasher.hash({"dataset_infos": dataset_infos, "builder_configs": builder_configs}) return DatasetModule( module_path, hash, @@ -1248,7 +1236,7 @@ def get_module(self) -> DatasetModule: } ) - module_path, hash = _PACKAGED_DATASETS_MODULES[module_name] + module_path, _ = _PACKAGED_DATASETS_MODULES[module_name] if metadata_configs: builder_configs, default_config_name = create_builder_configs_from_metadata_configs( module_path, @@ -1297,7 +1285,7 @@ def get_module(self) -> DatasetModule: if default_config_name is None and len(dataset_infos) == 1: default_config_name = next(iter(dataset_infos)) - hash = update_hash_for_cache(hash, metadata_configs=metadata_configs, dataset_infos=dataset_infos) + hash = revision return DatasetModule( module_path, hash, @@ -1346,7 +1334,7 @@ def get_module(self) -> DatasetModule: metadata_configs = MetadataConfigs._from_exported_parquet_files( revision=revision, exported_parquet_files=exported_parquet_files ) - module_path, hash = _PACKAGED_DATASETS_MODULES["parquet"] + module_path, _ = _PACKAGED_DATASETS_MODULES["parquet"] builder_configs, default_config_name = create_builder_configs_from_metadata_configs( module_path, metadata_configs, @@ -1358,7 +1346,7 @@ def get_module(self) -> DatasetModule: for config_name in exported_dataset_infos } ) - hash = update_hash_for_cache(hash, metadata_configs=metadata_configs, dataset_infos=dataset_infos) + hash = revision builder_kwargs = { "repo_id": self.name, "dataset_name": camelcase_to_snakecase(Path(self.name).name), @@ -2231,19 +2219,7 @@ def load_dataset_builder( error_msg += f'\nFor example `data_files={{"train": "path/to/data/train/*.{example_extensions[0]}"}}`' raise ValueError(error_msg) - hash = dataset_module.hash builder_cls = get_dataset_builder_class(dataset_module, dataset_name=dataset_name) - if len(builder_cls.builder_configs) > 1: - builder_config = builder_cls.builder_configs.get(config_name or builder_cls.DEFAULT_CONFIG_NAME) - elif len(builder_cls.builder_configs) == 1: - builder_config = builder_cls.BUILDER_CONFIGS[0] - else: - builder_config = None - if builder_config and builder_config.data_files is not None: - builder_config._resolve_data_files( - base_path=builder_kwargs.get("base_path", 
os.path.abspath(".")), download_config=download_config - ) - hash = update_hash_for_cache(hash, data_files=builder_config.data_files) # Instantiate the dataset builder builder_instance: DatasetBuilder = builder_cls( cache_dir=cache_dir, @@ -2251,7 +2227,7 @@ def load_dataset_builder( config_name=config_name, data_dir=data_dir, data_files=data_files, - hash=hash, + hash=dataset_module.hash, info=info, features=features, token=token, From 81d161f091bd961c2a610241a697213f500cfda4 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 13 Dec 2023 20:49:29 +0100 Subject: [PATCH 34/37] use commit sha in parquet export dataset cache directories too --- src/datasets/load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 4849add7a31..dd6a1de26d5 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1346,7 +1346,7 @@ def get_module(self) -> DatasetModule: for config_name in exported_dataset_infos } ) - hash = revision + hash = self.revision builder_kwargs = { "repo_id": self.name, "dataset_name": camelcase_to_snakecase(Path(self.name).name), From e776c88e36e0fb485d58f68f19728b13c69b9ab2 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 14 Dec 2023 13:00:50 +0100 Subject: [PATCH 35/37] use version from parquet export dataset info --- src/datasets/load.py | 20 +++++++++----------- src/datasets/utils/metadata.py | 11 ++++++++--- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index dd6a1de26d5..239d1ebb636 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1322,17 +1322,21 @@ def get_module(self) -> DatasetModule: exported_dataset_infos = _datasets_server.get_exported_dataset_infos( dataset=self.name, revision=self.revision, token=self.download_config.token ) + dataset_infos = DatasetInfosDict( + { + config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name]) + for config_name in exported_dataset_infos + } + ) hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info( self.name, revision="refs/convert/parquet", token=self.download_config.token, timeout=100.0, ) - # even if metadata_configs is not None (which means that we will resolve files for each config later) - # we cannot skip resolving all files because we need to infer module name by files extensions revision = hfh_dataset_info.sha # fix the revision in case there are new commits in the meantime - metadata_configs = MetadataConfigs._from_exported_parquet_files( - revision=revision, exported_parquet_files=exported_parquet_files + metadata_configs = MetadataConfigs._from_exported_parquet_files_and_dataset_infos( + revision=revision, exported_parquet_files=exported_parquet_files, dataset_infos=dataset_infos ) module_path, _ = _PACKAGED_DATASETS_MODULES["parquet"] builder_configs, default_config_name = create_builder_configs_from_metadata_configs( @@ -1340,12 +1344,6 @@ def get_module(self) -> DatasetModule: metadata_configs, supports_metadata=False, ) - dataset_infos = DatasetInfosDict( - { - config_name: DatasetInfo.from_dict(exported_dataset_infos[config_name]) - for config_name in exported_dataset_infos - } - ) hash = self.revision builder_kwargs = { "repo_id": self.name, @@ -1529,7 +1527,7 @@ def _get_modification_time(module_hash): warning_msg = ( f"Using the latest cached version of the module from {os.path.join(importable_directory_path, hash)} " f"(last modified on {time.ctime(_get_modification_time(hash))}) since it " - f"couldn't be found locally at {self.name}." 
+ f"couldn't be found locally at {self.name}" ) if not config.HF_DATASETS_OFFLINE: warning_msg += ", or remotely on the Hugging Face Hub." diff --git a/src/datasets/utils/metadata.py b/src/datasets/utils/metadata.py index 44b18d97919..4d2d6b059c7 100644 --- a/src/datasets/utils/metadata.py +++ b/src/datasets/utils/metadata.py @@ -9,6 +9,7 @@ from huggingface_hub import DatasetCardData from ..config import METADATA_CONFIGS_FIELD +from ..info import DatasetInfo, DatasetInfosDict from ..utils.logging import get_logger from .deprecation_utils import deprecated @@ -172,8 +173,11 @@ def _raise_if_data_files_field_not_valid(metadata_config: dict): raise ValueError(yaml_error_message) @classmethod - def _from_exported_parquet_files( - cls, revision: str, exported_parquet_files: List[Dict[str, Any]] + def _from_exported_parquet_files_and_dataset_infos( + cls, + revision: str, + exported_parquet_files: List[Dict[str, Any]], + dataset_infos: DatasetInfosDict, ) -> "MetadataConfigs": return cls( { @@ -189,7 +193,8 @@ def _from_exported_parquet_files( for split_name, parquet_files_for_split in groupby( parquet_files_for_config, itemgetter("split") ) - ] + ], + "version": str(dataset_infos.get(config_name, DatasetInfo()).version or "0.0.0"), } for config_name, parquet_files_for_config in groupby(exported_parquet_files, itemgetter("config")) } From 2b88ad4a8030e3b4fabc354117484525207dbfcc Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 19 Dec 2023 12:18:20 +0100 Subject: [PATCH 36/37] fix cache reload when config name and version are not the default ones --- src/datasets/packaged_modules/cache/cache.py | 37 +++++++++++++++----- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/src/datasets/packaged_modules/cache/cache.py b/src/datasets/packaged_modules/cache/cache.py index 0c3c9361952..89bf895046f 100644 --- a/src/datasets/packaged_modules/cache/cache.py +++ b/src/datasets/packaged_modules/cache/cache.py @@ -3,7 +3,7 @@ import shutil import time from pathlib import Path -from typing import List, Optional +from typing import List, Optional, Tuple import pyarrow as pa @@ -19,13 +19,15 @@ def _get_modification_time(cached_directory_path): return (Path(cached_directory_path)).stat().st_mtime -def _find_hash_in_cache(dataset_name: str, config_name: Optional[str], cache_dir: Optional[str]) -> str: +def _find_hash_in_cache( + dataset_name: str, config_name: Optional[str], version: Optional[str], cache_dir: Optional[str] +) -> Tuple[str, str, str]: cache_dir = os.path.expanduser(str(cache_dir or datasets.config.HF_DATASETS_CACHE)) cached_datasets_directory_path_root = os.path.join(cache_dir, dataset_name.replace("/", "___")) cached_directory_paths = [ cached_directory_path for cached_directory_path in glob.glob( - os.path.join(cached_datasets_directory_path_root, config_name or "*", "*", "*") + os.path.join(cached_datasets_directory_path_root, config_name or "*", version or "*", "*") ) if os.path.isdir(cached_directory_path) ] @@ -41,21 +43,37 @@ def _find_hash_in_cache(dataset_name: str, config_name: Optional[str], cache_dir available_configs = sorted( {Path(cached_directory_path).parts[-3] for cached_directory_path in cached_directory_paths} ) + available_versions = sorted( + {Path(cached_directory_path).parts[-2] for cached_directory_path in cached_directory_paths} + ) raise ValueError( f"Couldn't find cache for {dataset_name}" + (f" for config '{config_name}'" if config_name else "") + + (f" for version '{version}'" if version else "") + (f"\nAvailable configs in the cache: 
From 2b88ad4a8030e3b4fabc354117484525207dbfcc Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Tue, 19 Dec 2023 12:18:20 +0100
Subject: [PATCH 36/37] fix cache reload when config name and version are not the default ones

---
 src/datasets/packaged_modules/cache/cache.py | 37 +++++++++++++++-----
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/src/datasets/packaged_modules/cache/cache.py b/src/datasets/packaged_modules/cache/cache.py
index 0c3c9361952..89bf895046f 100644
--- a/src/datasets/packaged_modules/cache/cache.py
+++ b/src/datasets/packaged_modules/cache/cache.py
@@ -3,7 +3,7 @@
 import shutil
 import time
 from pathlib import Path
-from typing import List, Optional
+from typing import List, Optional, Tuple

 import pyarrow as pa

@@ -19,13 +19,15 @@ def _get_modification_time(cached_directory_path):
     return (Path(cached_directory_path)).stat().st_mtime


-def _find_hash_in_cache(dataset_name: str, config_name: Optional[str], cache_dir: Optional[str]) -> str:
+def _find_hash_in_cache(
+    dataset_name: str, config_name: Optional[str], version: Optional[str], cache_dir: Optional[str]
+) -> Tuple[str, str, str]:
     cache_dir = os.path.expanduser(str(cache_dir or datasets.config.HF_DATASETS_CACHE))
     cached_datasets_directory_path_root = os.path.join(cache_dir, dataset_name.replace("/", "___"))
     cached_directory_paths = [
         cached_directory_path
         for cached_directory_path in glob.glob(
-            os.path.join(cached_datasets_directory_path_root, config_name or "*", "*", "*")
+            os.path.join(cached_datasets_directory_path_root, config_name or "*", version or "*", "*")
         )
         if os.path.isdir(cached_directory_path)
     ]
@@ -41,21 +43,37 @@ def _find_hash_in_cache(dataset_name: str, config_name: Optional[str], cache_dir
         available_configs = sorted(
             {Path(cached_directory_path).parts[-3] for cached_directory_path in cached_directory_paths}
         )
+        available_versions = sorted(
+            {Path(cached_directory_path).parts[-2] for cached_directory_path in cached_directory_paths}
+        )
         raise ValueError(
             f"Couldn't find cache for {dataset_name}"
             + (f" for config '{config_name}'" if config_name else "")
+            + (f" for version '{version}'" if version else "")
             + (f"\nAvailable configs in the cache: {available_configs}" if available_configs else "")
+            + (f"\nAvailable versions in the cache: {available_versions}" if version and available_versions else "")
         )
     # get most recent
     cached_directory_path = Path(sorted(cached_directory_paths, key=_get_modification_time)[-1])
+    version, hash = cached_directory_path.parts[-2:]
+    other_configs = [
+        Path(cached_directory_path).parts[-3]
+        for cached_directory_path in glob.glob(os.path.join(cached_datasets_directory_path_root, "*", version, hash))
+        if os.path.isdir(cached_directory_path)
+    ]
+    if not config_name and len(other_configs) > 1:
+        raise ValueError(
+            f"There are multiple '{dataset_name}' configurations in the cache: {', '.join(other_configs)}"
+            f"\nPlease specify which configuration to reload from the cache, e.g."
+            f"\n\tload_dataset('{dataset_name}', '{other_configs[0]}')"
+        )
     config_name = cached_directory_path.parts[-3]
-    hash = cached_directory_path.name
-    warnong_msg = (
+    warning_msg = (
         f"Found the latest cached dataset configuration '{config_name}' at {cached_directory_path} "
         f"(last modified on {time.ctime(_get_modification_time(cached_directory_path))})."
     )
-    logger.warning(warnong_msg)
-    return hash
+    logger.warning(warning_msg)
+    return config_name, version, hash


 class Cache(datasets.ArrowBasedBuilder):
@@ -64,6 +82,7 @@ def __init__(
         cache_dir: Optional[str] = None,
         dataset_name: Optional[str] = None,
         config_name: Optional[str] = None,
+        version: Optional[str] = None,
         hash: Optional[str] = None,
         repo_id: Optional[str] = None,
         **kwargs,
@@ -71,15 +90,17 @@ def __init__(
         if repo_id is None and dataset_name is None:
             raise ValueError("repo_id or dataset_name is required for the Cache dataset builder")
         if hash == "auto":
-            hash = _find_hash_in_cache(
+            config_name, version, hash = _find_hash_in_cache(
                 dataset_name=repo_id or dataset_name,
                 config_name=config_name,
+                version=version,
                 cache_dir=cache_dir,
             )
         super().__init__(
             cache_dir=cache_dir,
             dataset_name=dataset_name,
             config_name=config_name,
+            version=version,
             hash=hash,
             repo_id=repo_id,
             **kwargs,
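
Aside: the lookup in _find_hash_in_cache above boils down to globbing the per-dataset cache
tree and keeping the most recently modified leaf. A simplified sketch of that strategy (not
the exact code: the real helper also refuses to guess when several configs could match):

    import glob
    import os
    from pathlib import Path


    def sketch_find_latest(cache_root: str, dataset_name: str, config_name: str = None):
        root = os.path.join(cache_root, dataset_name.replace("/", "___"))
        candidates = [
            path
            for path in glob.glob(os.path.join(root, config_name or "*", "*", "*"))
            if os.path.isdir(path)
        ]
        if not candidates:
            raise ValueError(f"Couldn't find cache for {dataset_name}")
        # Most recently modified cache entry wins; its path encodes config, version and hash.
        latest = max(candidates, key=lambda path: Path(path).stat().st_mtime)
        config_name, version, hash = Path(latest).parts[-3:]
        return config_name, version, hash
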
"___")) cached_directory_paths = [ cached_directory_path for cached_directory_path in glob.glob( - os.path.join(cached_datasets_directory_path_root, config_name or "*", version or "*", "*") + os.path.join(cached_datasets_directory_path_root, config_name or "*", "*", "*") ) if os.path.isdir(cached_directory_path) ] @@ -43,15 +43,10 @@ def _find_hash_in_cache( available_configs = sorted( {Path(cached_directory_path).parts[-3] for cached_directory_path in cached_directory_paths} ) - available_versions = sorted( - {Path(cached_directory_path).parts[-2] for cached_directory_path in cached_directory_paths} - ) raise ValueError( f"Couldn't find cache for {dataset_name}" + (f" for config '{config_name}'" if config_name else "") - + (f" for version '{version}'" if version else "") + (f"\nAvailable configs in the cache: {available_configs}" if available_configs else "") - + (f"\nAvailable versions in the cache: {available_versions}" if version and available_versions else "") ) # get most recent cached_directory_path = Path(sorted(cached_directory_paths, key=_get_modification_time)[-1]) @@ -82,20 +77,21 @@ def __init__( cache_dir: Optional[str] = None, dataset_name: Optional[str] = None, config_name: Optional[str] = None, - version: Optional[str] = None, + version: Optional[str] = "0.0.0", hash: Optional[str] = None, repo_id: Optional[str] = None, **kwargs, ): if repo_id is None and dataset_name is None: raise ValueError("repo_id or dataset_name is required for the Cache dataset builder") - if hash == "auto": + if hash == "auto" and version == "auto": config_name, version, hash = _find_hash_in_cache( dataset_name=repo_id or dataset_name, config_name=config_name, - version=version, cache_dir=cache_dir, ) + elif hash == "auto" or version == "auto": + raise NotImplementedError("Pass both hash='auto' and version='auto' instead") super().__init__( cache_dir=cache_dir, dataset_name=dataset_name, diff --git a/tests/packaged_modules/test_cache.py b/tests/packaged_modules/test_cache.py index e43deec2879..8d37fb0a2d9 100644 --- a/tests/packaged_modules/test_cache.py +++ b/tests/packaged_modules/test_cache.py @@ -29,7 +29,7 @@ def test_cache_streaming(text_dir: Path): def test_cache_auto_hash(text_dir: Path): ds = load_dataset(str(text_dir)) - cache = Cache(dataset_name=text_dir.name, hash="auto") + cache = Cache(dataset_name=text_dir.name, version="auto", hash="auto") reloaded = cache.as_dataset() assert list(ds) == list(reloaded) assert list(ds["train"]) == list(reloaded["train"]) @@ -37,13 +37,13 @@ def test_cache_auto_hash(text_dir: Path): def test_cache_missing(text_dir: Path): load_dataset(str(text_dir)) - Cache(dataset_name=text_dir.name, hash="auto").download_and_prepare() + Cache(dataset_name=text_dir.name, version="auto", hash="auto").download_and_prepare() with pytest.raises(ValueError): - Cache(dataset_name="missing", hash="auto").download_and_prepare() + Cache(dataset_name="missing", version="auto", hash="auto").download_and_prepare() with pytest.raises(ValueError): Cache(dataset_name=text_dir.name, hash="missing").download_and_prepare() with pytest.raises(ValueError): - Cache(dataset_name=text_dir.name, config_name="missing", hash="auto").download_and_prepare() + Cache(dataset_name=text_dir.name, config_name="missing", version="auto", hash="auto").download_and_prepare() @pytest.mark.integration @@ -52,10 +52,10 @@ def test_cache_multi_configs(): dataset_name = repo_id.split("/")[-1] config_name = "v1" ds = load_dataset(repo_id, config_name) - cache = Cache(dataset_name=dataset_name, 
diff --git a/tests/packaged_modules/test_cache.py b/tests/packaged_modules/test_cache.py
index e43deec2879..8d37fb0a2d9 100644
--- a/tests/packaged_modules/test_cache.py
+++ b/tests/packaged_modules/test_cache.py
@@ -29,7 +29,7 @@ def test_cache_streaming(text_dir: Path):

 def test_cache_auto_hash(text_dir: Path):
     ds = load_dataset(str(text_dir))
-    cache = Cache(dataset_name=text_dir.name, hash="auto")
+    cache = Cache(dataset_name=text_dir.name, version="auto", hash="auto")
     reloaded = cache.as_dataset()
     assert list(ds) == list(reloaded)
     assert list(ds["train"]) == list(reloaded["train"])
@@ -37,13 +37,13 @@ def test_cache_auto_hash(text_dir: Path):

 def test_cache_missing(text_dir: Path):
     load_dataset(str(text_dir))
-    Cache(dataset_name=text_dir.name, hash="auto").download_and_prepare()
+    Cache(dataset_name=text_dir.name, version="auto", hash="auto").download_and_prepare()
     with pytest.raises(ValueError):
-        Cache(dataset_name="missing", hash="auto").download_and_prepare()
+        Cache(dataset_name="missing", version="auto", hash="auto").download_and_prepare()
     with pytest.raises(ValueError):
         Cache(dataset_name=text_dir.name, hash="missing").download_and_prepare()
     with pytest.raises(ValueError):
-        Cache(dataset_name=text_dir.name, config_name="missing", hash="auto").download_and_prepare()
+        Cache(dataset_name=text_dir.name, config_name="missing", version="auto", hash="auto").download_and_prepare()


 @pytest.mark.integration
@@ -52,10 +52,10 @@ def test_cache_multi_configs():
     dataset_name = repo_id.split("/")[-1]
     config_name = "v1"
     ds = load_dataset(repo_id, config_name)
-    cache = Cache(dataset_name=dataset_name, repo_id=repo_id, config_name=config_name, hash="auto")
+    cache = Cache(dataset_name=dataset_name, repo_id=repo_id, config_name=config_name, version="auto", hash="auto")
     reloaded = cache.as_dataset()
     assert list(ds) == list(reloaded)
     assert len(ds["train"]) == len(reloaded["train"])
     with pytest.raises(ValueError) as excinfo:
-        Cache(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", hash="auto")
+        Cache(dataset_name=dataset_name, repo_id=repo_id, config_name="missing", version="auto", hash="auto")
     assert config_name in str(excinfo.value)
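
Aside: end to end, the updated tests exercise roughly the flow below: prepare a dataset once,
then reload it straight from the cache through the Cache builder. The folder path is
illustrative; it only needs to contain a train.txt file:

    from pathlib import Path

    from datasets import load_dataset
    from datasets.packaged_modules.cache.cache import Cache

    text_dir = Path("path/to/folder_with_train_txt")
    ds = load_dataset(str(text_dir))  # resolves the data files and writes the Arrow cache

    cache = Cache(dataset_name=text_dir.name, version="auto", hash="auto")  # picks the latest cache entry
    reloaded = cache.as_dataset()
    assert list(ds["train"]) == list(reloaded["train"])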