Skip to content

Commit 0b2a4c2

Browse files
authored
Keep hffs cache in workers when streaming (huggingface#7820)
* keep hffs cache in workers when streaming
* bonus: reorder hffs args to improve caching
1 parent 12f5aca commit 0b2a4c2

File tree

3 files changed: +14 −3 lines changed

src/datasets/download/download_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def copy(self) -> "DownloadConfig":
     def __setattr__(self, name, value):
         if name == "token" and getattr(self, "storage_options", None) is not None:
             if "hf" not in self.storage_options:
-                self.storage_options["hf"] = {"token": value, "endpoint": config.HF_ENDPOINT}
+                self.storage_options["hf"] = {"endpoint": config.HF_ENDPOINT, "token": value}
             elif getattr(self.storage_options["hf"], "token", None) is None:
                 self.storage_options["hf"]["token"] = value
         super().__setattr__(name, value)

src/datasets/iterable_dataset.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,15 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
-from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi
+from huggingface_hub import (
+    CommitInfo,
+    CommitOperationAdd,
+    CommitOperationDelete,
+    DatasetCard,
+    DatasetCardData,
+    HfApi,
+    HfFileSystem,
+)
 from huggingface_hub.hf_api import RepoFile
 from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
 from multiprocess import Pool
@@ -2151,6 +2159,7 @@ def __init__(
         self._token_per_repo_id: dict[str, Union[str, bool, None]] = token_per_repo_id or {}
         self._epoch: Union[int, "torch.Tensor"] = _maybe_share_with_torch_persistent_workers(0)
         self._starting_state_dict: Optional[dict] = None
+        self.__hffs_cache = HfFileSystem._cache  # keep the cache on pickling (e.g. for dataloader workers)
         self._prepare_ex_iterable_for_iteration()  # set state_dict
         _maybe_add_torch_iterable_dataset_parent_class(self.__class__)  # subclass of torch IterableDataset

@@ -2299,6 +2308,8 @@ def __setstate__(self, d):
         self.__dict__ = d
         # Re-add torch shared memory, since shared memory is not always kept when pickling
         self._epoch = _maybe_share_with_torch_persistent_workers(self._epoch)
+        # Re-add the cache to keep on pickling (e.g. for dataloader workers)
+        self.__hffs_cache = HfFileSystem._cache
         # Re-add torch iterable dataset as a parent class, since dynamically added parent classes are not kept when pickling
         _maybe_add_torch_iterable_dataset_parent_class(self.__class__)
23042315

src/datasets/utils/file_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -897,8 +897,8 @@ def _prepare_single_hop_path_and_storage_options(
         storage_options["headers"] = {"Accept-Encoding": "identity", **headers}
     elif protocol == "hf":
         storage_options = {
-            "token": token,
             "endpoint": config.HF_ENDPOINT,
+            "token": token,
             **storage_options,
         }
     if storage_options:

0 commit comments

Comments (0)