-
Notifications
You must be signed in to change notification settings - Fork 3k
Multithreaded downloads #6794
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Multithreaded downloads #6794
Changes from 9 commits
f72bffb
b05e626
db97134
cd42a37
e4a1d01
a2e921b
bd5a779
0ccb360
6ccc067
8e56b3e
0ecc64f
d8e31fb
89c21fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -54,7 +54,7 @@ jobs: | |||
| if: ${{ matrix.os == 'ubuntu-latest' }} | ||||
| run: echo "installing pinned version of setuptools-scm to fix seqeval installation on 3.7" && pip install "setuptools-scm==6.4.2" | ||||
| - name: Install uv | ||||
| run: pip install --upgrade uv | ||||
| run: pip install uv==0.1.29 | ||||
| - name: Install dependencies | ||||
| run: | | ||||
| uv pip install --system "datasets[tests,metrics-tests] @ ." | ||||
|
|
@@ -89,7 +89,7 @@ jobs: | |||
| - name: Upgrade pip | ||||
| run: python -m pip install --upgrade pip | ||||
| - name: Install uv | ||||
| run: pip install --upgrade uv | ||||
| run: pip install uv==0.1.29 | ||||
|
||||
| run: pip install uv==0.1.29 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -544,8 +544,8 @@ def _get_single_origin_metadata( | |
|
|
||
| def _get_origin_metadata( | ||
| data_files: List[str], | ||
| max_workers=64, | ||
| download_config: Optional[DownloadConfig] = None, | ||
| max_workers: int = 16, | ||
|
||
| ) -> Tuple[str]: | ||
| return thread_map( | ||
| partial(_get_single_origin_metadata, download_config=download_config), | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
|
|
||
| import enum | ||
| import io | ||
| import multiprocessing | ||
| import os | ||
| import posixpath | ||
| import tarfile | ||
|
|
@@ -27,6 +28,10 @@ | |
| from itertools import chain | ||
| from typing import Callable, Dict, Generator, List, Optional, Tuple, Union | ||
|
|
||
| import fsspec | ||
| from fsspec.core import url_to_fs | ||
| from tqdm.contrib.concurrent import thread_map | ||
|
|
||
| from .. import config | ||
| from ..utils import tqdm as hf_tqdm | ||
| from ..utils.deprecation_utils import DeprecatedEnum, deprecated | ||
|
|
@@ -39,7 +44,7 @@ | |
| url_or_path_join, | ||
| ) | ||
| from ..utils.info_utils import get_size_checksum_dict | ||
| from ..utils.logging import get_logger | ||
| from ..utils.logging import get_logger, tqdm | ||
| from ..utils.py_utils import NestedDataStructure, map_nested, size_str | ||
| from ..utils.track import TrackedIterable, tracked_str | ||
| from .download_config import DownloadConfig | ||
|
|
@@ -427,7 +432,7 @@ def download(self, url_or_urls): | |
| if download_config.download_desc is None: | ||
| download_config.download_desc = "Downloading data" | ||
|
|
||
| download_func = partial(self._download, download_config=download_config) | ||
| download_func = partial(self._download_batched, download_config=download_config) | ||
|
|
||
| start_time = datetime.now() | ||
| with stack_multiprocessing_download_progress_bars(): | ||
|
|
@@ -437,6 +442,8 @@ def download(self, url_or_urls): | |
| map_tuple=True, | ||
| num_proc=download_config.num_proc, | ||
| desc="Downloading data files", | ||
| batched=True, | ||
| batch_size=-1, | ||
| ) | ||
| duration = datetime.now() - start_time | ||
| logger.info(f"Downloading took {duration.total_seconds() // 60} min") | ||
|
|
@@ -451,6 +458,43 @@ def download(self, url_or_urls): | |
|
|
||
| return downloaded_path_or_paths.data | ||
|
|
||
| def _download_batched( | ||
| self, | ||
| url_or_filenames: List[str], | ||
| download_config: DownloadConfig, | ||
| ) -> List[str]: | ||
| if len(url_or_filenames) >= 16: | ||
| download_config = download_config.copy() | ||
| download_config.disable_tqdm = True | ||
| download_func = partial(self._download, download_config=download_config) | ||
|
|
||
| fs: fsspec.AbstractFileSystem | ||
| fs, path = url_to_fs(url_or_filenames[0], **download_config.storage_options) | ||
| size = 0 | ||
| try: | ||
| size = fs.info(path).get("size", 0) | ||
| except Exception: | ||
| pass | ||
| max_workers = 16 if size < (20 << 20) else 1 # enable multithreading if files are small | ||
|
|
||
| return thread_map( | ||
| download_func, | ||
| url_or_filenames, | ||
| desc=download_config.download_desc or "Downloading", | ||
| unit="files", | ||
| position=multiprocessing.current_process()._identity[-1] # contains the ranks of subprocesses | ||
| if os.environ.get("HF_DATASETS_STACK_MULTIPROCESSING_DOWNLOAD_PROGRESS_BARS") == "1" | ||
| and multiprocessing.current_process()._identity | ||
| else None, | ||
| max_workers=max_workers, | ||
| tqdm_class=tqdm, | ||
| ) | ||
| else: | ||
| return [ | ||
| self._download(url_or_filename, download_config=download_config) | ||
|
||
| for url_or_filename in url_or_filenames | ||
| ] | ||
|
|
||
| def _download(self, url_or_filename: str, download_config: DownloadConfig) -> str: | ||
| url_or_filename = str(url_or_filename) | ||
| if is_relative_path(url_or_filename): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would remove the pin to be consistent with `huggingface_hub` and `diffusers`
(we don't use `uv`'s advanced/experimental features, so a breaking change here is unlikely).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I had pinned it because 0.1.30 had bugs - I'll see if 0.1.31 has fixed them
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's been fixed in 0.1.31 (issue in `uv`: astral-sh/uv#2941) :)