Skip to content

Commit 9849523

Browse files
lhoestq and mariosasko authored
Faster webdataset streaming (#6578)
* faster webdataset streaming
* fix test
* use block_size
* don't use block_size=0 with older versions of hfh
* Update src/datasets/download/streaming_download_manager.py

Co-authored-by: Mario Šaško <[email protected]>

---------

Co-authored-by: Mario Šaško <[email protected]>
1 parent 237a2a6 commit 9849523

File tree

1 file changed

+9
-1
lines changed

1 file changed

+9
-1
lines changed

src/datasets/download/streaming_download_manager.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
import fsspec
1818
from aiohttp.client_exceptions import ClientError
1919
from huggingface_hub.utils import EntryNotFoundError
20+
from packaging import version
2021

2122
from .. import config
2223
from ..filesystems import COMPRESSION_FILESYSTEMS
@@ -475,6 +476,9 @@ def _prepare_single_hop_path_and_storage_options(
475476
"endpoint": config.HF_ENDPOINT,
476477
**storage_options.get(protocol, {}),
477478
}
479+
# streaming with block_size=0 is only implemented in 0.21 (see https://github.com/huggingface/huggingface_hub/pull/1967)
480+
if config.HF_HUB_VERSION < version.parse("0.21.0"):
481+
storage_options[protocol]["block_size"] = "default"
478482
return urlpath, storage_options
479483

480484

@@ -498,6 +502,8 @@ def xopen(file: str, mode="r", *args, download_config: Optional[DownloadConfig]
498502
file_str = _as_str(file)
499503
main_hop, *rest_hops = file_str.split("::")
500504
if is_local_path(main_hop):
505+
# ignore fsspec-specific kwargs
506+
kwargs.pop("block_size", None)
501507
return open(main_hop, mode, *args, **kwargs)
502508
# add headers and cookies for authentication on the HF Hub and for Google Drive
503509
file, storage_options = _prepare_path_and_storage_options(file_str, download_config=download_config)
@@ -911,7 +917,9 @@ def _iter_from_urlpath(
911917
cls, urlpath: str, download_config: Optional[DownloadConfig] = None
912918
) -> Generator[Tuple, None, None]:
913919
compression = _get_extraction_protocol(urlpath, download_config=download_config)
914-
with xopen(urlpath, "rb", download_config=download_config) as f:
920+
# Set block_size=0 to get faster streaming
921+
# (e.g. for hf:// and https:// it uses streaming Requests file-like instances)
922+
with xopen(urlpath, "rb", download_config=download_config, block_size=0) as f:
915923
if compression == "zip":
916924
yield from cls._iter_zip(f)
917925
else:

0 commit comments

Comments
 (0)