From 7873621f0121dc4d06f915d32ed992e086e454f2 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Thu, 18 Sep 2025 16:44:48 +0200 Subject: [PATCH 01/32] Let's test like this --- setup.py | 4 +- src/datasets/arrow_dataset.py | 8 ++-- src/datasets/data_files.py | 2 +- src/datasets/dataset_dict.py | 9 ++-- src/datasets/iterable_dataset.py | 7 ++- src/datasets/load.py | 18 +++----- src/datasets/utils/file_utils.py | 28 +++++------- src/datasets/utils/hub.py | 8 ---- tests/fixtures/hub.py | 22 +++------- tests/test_load.py | 8 ++-- tests/test_offline_util.py | 20 ++++----- tests/utils.py | 75 +++++++++++++++++++++++--------- 12 files changed, 104 insertions(+), 105 deletions(-) diff --git a/setup.py b/setup.py index 88197336b3e..c557992fa76 100644 --- a/setup.py +++ b/setup.py @@ -117,7 +117,7 @@ # For performance gains with apache arrow "pandas", # for downloading datasets over HTTPS - "requests>=2.32.2", + "httpx<1.0.0", # progress bars in downloads and data operations "tqdm>=4.66.3", # for fast hashing @@ -128,7 +128,7 @@ # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143 "fsspec[http]>=2023.1.0,<=2025.9.0", # To get datasets from the Datasets Hub on huggingface.co - "huggingface-hub>=0.24.0", + "huggingface-hub==1.0.0.rc0", # Utilities from PyPA to e.g., compare versions "packaging", # To parse YAML metadata from dataset cards diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 36740e458b7..49d384aaa81 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -68,9 +68,9 @@ DatasetCardData, HfApi, ) -from huggingface_hub.hf_api import HfHubHTTPError, RepoFile, RepositoryNotFoundError +from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError +from huggingface_hub.hf_api import RepoFile from multiprocess import Pool -from requests import HTTPError from tqdm.contrib.concurrent import thread_map from . import config @@ -5993,7 +5993,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete], except HfHubHTTPError as err: if ( err.__context__ - and isinstance(err.__context__, HTTPError) + and isinstance(err.__context__, HfHubHTTPError) and err.__context__.response.status_code == 409 ): # 409 is Conflict (another commit is in progress) @@ -6043,7 +6043,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete], except HfHubHTTPError as err: if ( err.__context__ - and isinstance(err.__context__, HTTPError) + and isinstance(err.__context__, HfHubHTTPError) and err.__context__.response.status_code in (412, 409) ): # 412 is Precondition failed (parent_commit isn't satisfied) diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 087e037a186..9fefd4a4c69 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -352,7 +352,7 @@ def resolve_pattern( protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0] protocol_prefix = protocol + "://" if protocol != "file" else "" glob_kwargs = {} - if protocol == "hf" and config.HF_HUB_VERSION >= version.parse("0.20.0"): + if protocol == "hf": # 10 times faster glob with detail=True (ignores costly info like lastCommit) glob_kwargs["expand_info"] = False matched_paths = [ diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 733b96d0069..63a93429c45 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -26,7 +26,6 @@ ) from huggingface_hub.hf_api import RepoFile from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError -from requests import HTTPError from . import config from .arrow_dataset import ( @@ -1917,7 +1916,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete], except HfHubHTTPError as err: if ( err.__context__ - and isinstance(err.__context__, HTTPError) + and isinstance(err.__context__, HfHubHTTPError) and err.__context__.response.status_code == 409 ): # 409 is Conflict (another commit is in progress) @@ -1967,7 +1966,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete], except HfHubHTTPError as err: if ( err.__context__ - and isinstance(err.__context__, HTTPError) + and isinstance(err.__context__, HfHubHTTPError) and err.__context__.response.status_code in (412, 409) ): # 412 is Precondition failed (parent_commit isn't satisfied) @@ -2786,7 +2785,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete], except HfHubHTTPError as err: if ( err.__context__ - and isinstance(err.__context__, HTTPError) + and isinstance(err.__context__, HfHubHTTPError) and err.__context__.response.status_code == 409 ): # 409 is Conflict (another commit is in progress) @@ -2836,7 +2835,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete], except HfHubHTTPError as err: if ( err.__context__ - and isinstance(err.__context__, HTTPError) + and isinstance(err.__context__, HfHubHTTPError) and err.__context__.response.status_code in (412, 409) ): # 412 is Precondition failed (parent_commit isn't satisfied) diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index d5e8c6e0a91..b931342db34 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -27,10 +27,9 @@ import pyarrow as pa import pyarrow.parquet as pq from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi +from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError from huggingface_hub.hf_api import RepoFile -from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError from multiprocess import Pool -from requests import HTTPError from . import config from .arrow_dataset import PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED, Dataset, DatasetInfoMixin @@ -4332,7 +4331,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete], except HfHubHTTPError as err: if ( err.__context__ - and isinstance(err.__context__, HTTPError) + and isinstance(err.__context__, HfHubHTTPError) and err.__context__.response.status_code == 409 ): # 409 is Conflict (another commit is in progress) @@ -4382,7 +4381,7 @@ def get_deletions_and_dataset_card() -> tuple[str, list[CommitOperationDelete], except HfHubHTTPError as err: if ( err.__context__ - and isinstance(err.__context__, HTTPError) + and isinstance(err.__context__, HfHubHTTPError) and err.__context__.response.status_code in (412, 409) ): # 412 is Precondition failed (parent_commit isn't satisfied) diff --git a/src/datasets/load.py b/src/datasets/load.py index bc2b0e679b6..557ce3514f5 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -28,19 +28,19 @@ from typing import Any, Optional, Union import fsspec -import requests +import httpx import yaml from fsspec.core import url_to_fs from huggingface_hub import DatasetCard, DatasetCardData, HfApi -from huggingface_hub.utils import ( +from huggingface_hub.errors import ( EntryNotFoundError, GatedRepoError, LocalEntryNotFoundError, OfflineModeIsEnabled, RepositoryNotFoundError, RevisionNotFoundError, - get_session, ) +from huggingface_hub.utils import get_session from . import __version__, config from .arrow_dataset import Dataset @@ -944,11 +944,7 @@ def dataset_module_factory( except LocalEntryNotFoundError as e: if isinstance( e.__cause__, - ( - OfflineModeIsEnabled, - requests.exceptions.Timeout, - requests.exceptions.ConnectionError, - ), + (OfflineModeIsEnabled, httpx.ConnectError, httpx.TimeoutException), ): raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e else: @@ -959,11 +955,7 @@ def dataset_module_factory( revision=revision, timeout=100.0, ).sha - except ( - OfflineModeIsEnabled, - requests.exceptions.Timeout, - requests.exceptions.ConnectionError, - ) as e: + except (OfflineModeIsEnabled, httpx.ConnectError, httpx.TimeoutException) as e: raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e except GatedRepoError as e: message = f"Dataset '{path}' is a gated dataset on the Hub." diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 81be4f295c4..9c9e33285ad 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -27,12 +27,12 @@ from xml.etree import ElementTree as ET import fsspec +import httpx import huggingface_hub import huggingface_hub.errors -import requests from fsspec.core import strip_protocol, url_to_fs from fsspec.utils import can_be_local -from huggingface_hub.utils import EntryNotFoundError, get_session, insecure_hashlib +from huggingface_hub.utils import get_session, insecure_hashlib from packaging import version from .. import __version__, config @@ -140,7 +140,7 @@ def cached_path( ConnectionError: in case of unreachable url and no cache on disk ValueError: if it couldn't parse the url or filename correctly - requests.exceptions.ConnectionError: in case of internet connection issue + httpx.NetworkError: in case of internet connection issue """ if download_config is None: download_config = DownloadConfig(**download_kwargs) @@ -183,10 +183,10 @@ def cached_path( proxies=download_config.proxies, ) except ( - huggingface_hub.utils.RepositoryNotFoundError, - huggingface_hub.utils.EntryNotFoundError, - huggingface_hub.utils.RevisionNotFoundError, - huggingface_hub.utils.GatedRepoError, + huggingface_hub.errors.RepositoryNotFoundError, + huggingface_hub.errors.EntryNotFoundError, + huggingface_hub.errors.RevisionNotFoundError, + huggingface_hub.errors.GatedRepoError, ) as e: raise FileNotFoundError(str(e)) from e # Download external files @@ -246,7 +246,7 @@ def cached_path( def get_datasets_user_agent(user_agent: Optional[Union[str, dict]] = None) -> str: ua = f"datasets/{__version__}" ua += f"; python/{config.PY_VERSION}" - ua += f"; huggingface_hub/{huggingface_hub.__version__}" + ua += f"; hf_hub/{huggingface_hub.__version__}" ua += f"; pyarrow/{config.PYARROW_VERSION}" if config.TORCH_AVAILABLE: ua += f"; torch/{config.TORCH_VERSION}" @@ -753,7 +753,7 @@ def xgetsize(path, download_config: Optional[DownloadConfig] = None) -> int: fs, *_ = fs, *_ = url_to_fs(path, **storage_options) try: size = fs.size(main_hop) - except EntryNotFoundError: + except huggingface_hub.errors.EntryNotFoundError: raise FileNotFoundError(f"No such file: {path}") if size is None: # use xopen instead of fs.open to make data fetching more robust @@ -812,12 +812,7 @@ def read_with_retries(*args, **kwargs): try: out = read(*args, **kwargs) break - except ( - _AiohttpClientError, - asyncio.TimeoutError, - requests.exceptions.ConnectionError, - requests.exceptions.Timeout, - ) as err: + except httpx.RequestError as err: disconnect_err = err logger.warning( f"Got disconnected from remote data host. Retrying in {config.STREAMING_READ_RETRY_INTERVAL}sec [{retry}/{max_retries}]" @@ -897,9 +892,6 @@ def _prepare_single_hop_path_and_storage_options( "endpoint": config.HF_ENDPOINT, **storage_options, } - # streaming with block_size=0 is only implemented in 0.21 (see https://github.com/huggingface/huggingface_hub/pull/1967) - if config.HF_HUB_VERSION < version.parse("0.21.0"): - storage_options["block_size"] = "default" if storage_options: storage_options = {protocol: storage_options} return urlpath, storage_options diff --git a/src/datasets/utils/hub.py b/src/datasets/utils/hub.py index 555157afd52..6d784333b23 100644 --- a/src/datasets/utils/hub.py +++ b/src/datasets/utils/hub.py @@ -1,14 +1,6 @@ from functools import partial from huggingface_hub import hf_hub_url -from huggingface_hub.utils import get_session, hf_raise_for_status hf_dataset_url = partial(hf_hub_url, repo_type="dataset") - - -def check_auth(hf_api, repo_id, token=None): - headers = hf_api._build_hf_headers(token=token) - path = f"{hf_api.endpoint}/api/datasets/{repo_id}/auth-check" - r = get_session().get(path, headers=headers) - hf_raise_for_status(r) diff --git a/tests/fixtures/hub.py b/tests/fixtures/hub.py index c4baa1d733c..feff5651eae 100644 --- a/tests/fixtures/hub.py +++ b/tests/fixtures/hub.py @@ -5,9 +5,8 @@ from typing import Optional import pytest -import requests -from huggingface_hub.hf_api import HfApi, RepositoryNotFoundError -from huggingface_hub.utils import hf_raise_for_status +from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError +from huggingface_hub.hf_api import HfApi from huggingface_hub.utils._headers import _http_user_agent @@ -107,18 +106,11 @@ def _hf_gated_dataset_repo_txt_data(hf_api: HfApi, hf_token, text_file_content): repo_id=repo_id, repo_type="dataset", ) - path = f"{hf_api.endpoint}/api/datasets/{repo_id}/settings" - repo_settings = {"gated": "auto"} - r = requests.put( - path, - headers={"authorization": f"Bearer {hf_token}"}, - json=repo_settings, - ) - hf_raise_for_status(r) + hf_api.update_repo_settings(repo_id, token=hf_token, repo_type="dataset", gated="auto") yield repo_id try: hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset") - except (requests.exceptions.HTTPError, ValueError): # catch http error and token invalid error + except (HfHubHTTPError, ValueError): # catch http error and token invalid error pass @@ -142,7 +134,7 @@ def hf_private_dataset_repo_txt_data_(hf_api: HfApi, hf_token, text_file_content yield repo_id try: hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset") - except (requests.exceptions.HTTPError, ValueError): # catch http error and token invalid error + except (HfHubHTTPError, ValueError): # catch http error and token invalid error pass @@ -166,7 +158,7 @@ def hf_private_dataset_repo_zipped_txt_data_(hf_api: HfApi, hf_token, zip_csv_wi yield repo_id try: hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset") - except (requests.exceptions.HTTPError, ValueError): # catch http error and token invalid error + except (HfHubHTTPError, ValueError): # catch http error and token invalid error pass @@ -190,7 +182,7 @@ def hf_private_dataset_repo_zipped_img_data_(hf_api: HfApi, hf_token, zip_image_ yield repo_id try: hf_api.delete_repo(repo_id, token=hf_token, repo_type="dataset") - except (requests.exceptions.HTTPError, ValueError): # catch http error and token invalid error + except (HfHubHTTPError, ValueError): # catch http error and token invalid error pass diff --git a/tests/test_load.py b/tests/test_load.py index ed28a506f89..44857867732 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -9,9 +9,9 @@ from unittest.mock import patch import dill +import httpx import pyarrow as pa import pytest -import requests import datasets from datasets import config, load_dataset @@ -1050,17 +1050,17 @@ def test_load_dataset_with_unsupported_extensions(text_dir_with_unsupported_exte @pytest.mark.integration def test_loading_from_the_datasets_hub_with_token(): - true_request = requests.Session().request + true_request = httpx.Client().request def assert_auth(method, url, *args, headers, **kwargs): assert headers["authorization"] == "Bearer foo" return true_request(method, url, *args, headers=headers, **kwargs) - with patch("requests.Session.request") as mock_request: + with patch("httpx.Client.request") as mock_request: mock_request.side_effect = assert_auth with tempfile.TemporaryDirectory() as tmp_dir: with offline(): - with pytest.raises((ConnectionError, requests.exceptions.ConnectionError)): + with pytest.raises(ConnectionError): load_dataset(SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER, cache_dir=tmp_dir, token="foo") mock_request.assert_called() diff --git a/tests/test_offline_util.py b/tests/test_offline_util.py index ed8ff49b815..bb56be9fcdc 100644 --- a/tests/test_offline_util.py +++ b/tests/test_offline_util.py @@ -1,7 +1,7 @@ from tempfile import NamedTemporaryFile +import httpx import pytest -import requests from datasets.utils.file_utils import fsspec_get, fsspec_head @@ -13,10 +13,10 @@ def test_offline_with_timeout(): with offline(OfflineSimulationMode.CONNECTION_TIMES_OUT): with pytest.raises(RequestWouldHangIndefinitelyError): - requests.request("GET", "https://huggingface.co") - with pytest.raises(requests.exceptions.Timeout): - requests.request("GET", "https://huggingface.co", timeout=1.0) - with pytest.raises(requests.exceptions.Timeout), NamedTemporaryFile() as temp_file: + httpx.request("GET", "https://huggingface.co") + with pytest.raises(httpx.ConnectTimeout): + httpx.request("GET", "https://huggingface.co", timeout=1.0) + with pytest.raises(httpx.ConnectTimeout), NamedTemporaryFile() as temp_file: fsspec_get("hf://dummy", temp_file=temp_file) @@ -24,15 +24,15 @@ def test_offline_with_timeout(): @require_not_windows # fsspec get keeps a file handle on windows that raises PermissionError def test_offline_with_connection_error(): with offline(OfflineSimulationMode.CONNECTION_FAILS): - with pytest.raises(requests.exceptions.ConnectionError): - requests.request("GET", "https://huggingface.co") - with pytest.raises(requests.exceptions.ConnectionError), NamedTemporaryFile() as temp_file: + with pytest.raises(httpx.ConnectError): + httpx.request("GET", "https://huggingface.co") + with pytest.raises(httpx.ConnectError), NamedTemporaryFile() as temp_file: fsspec_get("hf://dummy", temp_file=temp_file) def test_offline_with_datasets_offline_mode_enabled(): with offline(OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1): - with pytest.raises(ConnectionError): + with pytest.raises(httpx.ConnectTimeout): fsspec_head("hf://dummy") - with pytest.raises(ConnectionError), NamedTemporaryFile() as temp_file: + with pytest.raises(httpx.ConnectTimeout), NamedTemporaryFile() as temp_file: fsspec_get("hf://dummy", temp_file=temp_file) diff --git a/tests/utils.py b/tests/utils.py index 0e411e8734b..83a99bb4e97 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -2,6 +2,7 @@ import importlib.metadata import os import re +import socket import sys import tempfile import unittest @@ -11,11 +12,11 @@ from enum import Enum from importlib.util import find_spec from pathlib import Path -from unittest.mock import patch +from unittest.mock import Mock, patch +import httpx import pyarrow as pa import pytest -import requests from packaging import version from datasets import config @@ -372,19 +373,20 @@ def offline(mode=OfflineSimulationMode.CONNECTION_FAILS, timeout=1e-16): """ Simulate offline mode. - There are three offline simulatiom modes: + There are three offline simulation modes: CONNECTION_FAILS (default mode): a ConnectionError is raised for each network call. Connection errors are created by mocking socket.socket CONNECTION_TIMES_OUT: the connection hangs until it times out. The default timeout value is low (1e-16) to speed up the tests. - Timeout errors are created by mocking requests.request - HF_HUB_OFFLINE_SET_TO_1: the HF_HUB_OFFLINE environment variable is set to 1. - This makes the http/ftp calls of the library instantly fail and raise an OfflineModeEmabled error. + Timeout errors are created by mocking httpx.request + HF_HUB_OFFLINE_SET_TO_1: the HF_HUB_OFFLINE_SET_TO_1 environment variable is set to 1. + This makes the http/ftp calls of the library instantly fail and raise an OfflineModeEnabled error. """ - online_request = requests.Session().request + # Store the original httpx.request to avoid recursion + original_httpx_request = httpx.request - def timeout_request(session, method, url, **kwargs): + def timeout_request(method, url, **kwargs): # Change the url to an invalid url so that the connection hangs invalid_url = "https://10.255.255.1" if kwargs.get("timeout") is None: @@ -393,25 +395,57 @@ def timeout_request(session, method, url, **kwargs): ) kwargs["timeout"] = timeout try: - return online_request(method, invalid_url, **kwargs) + return original_httpx_request(method, invalid_url, **kwargs) except Exception as e: # The following changes in the error are just here to make the offline timeout error prettier - e.request.url = url - max_retry_error = e.args[0] - max_retry_error.args = (max_retry_error.args[0].replace("10.255.255.1", f"OfflineMock[{url}]"),) - e.args = (max_retry_error,) + if hasattr(e, "request"): + e.request.url = url + if hasattr(e, "args") and e.args: + max_retry_error = e.args[0] + if hasattr(max_retry_error, "args"): + max_retry_error.args = (max_retry_error.args[0].replace("10.255.255.1", f"OfflineMock[{url}]"),) + e.args = (max_retry_error,) raise - def raise_connection_error(session, prepared_request, **kwargs): - raise requests.ConnectionError("Offline mode is enabled.", request=prepared_request) + def offline_socket(*args, **kwargs): + raise socket.error("Offline mode is enabled.") if mode is OfflineSimulationMode.CONNECTION_FAILS: - with patch("requests.Session.send", raise_connection_error): - yield + # inspired from https://stackoverflow.com/a/18601897 + with patch("socket.socket", offline_socket): + with patch("huggingface_hub.utils._http.get_session") as get_session_mock: + mock_client = Mock() + + # Mock the request method to raise connection error + def mock_request(*args, **kwargs): + raise httpx.ConnectError("Connection failed") + + # Mock the stream method to raise connection error + def mock_stream(*args, **kwargs): + raise httpx.ConnectError("Connection failed") + + mock_client.request = mock_request + mock_client.stream = mock_stream + get_session_mock.return_value = mock_client + yield elif mode is OfflineSimulationMode.CONNECTION_TIMES_OUT: # inspired from https://stackoverflow.com/a/904609 - with patch("requests.Session.request", timeout_request): - yield + with patch("httpx.request", timeout_request): + with patch("huggingface_hub.utils._http._GLOBAL_CLIENT_FACTORY") as session_factory_mock: + mock_client = Mock() + mock_client.get = lambda *args, **kwargs: timeout_request("GET", *args, **kwargs) + mock_client.post = lambda *args, **kwargs: timeout_request("POST", *args, **kwargs) + mock_client.put = lambda *args, **kwargs: timeout_request("PUT", *args, **kwargs) + mock_client.delete = lambda *args, **kwargs: timeout_request("DELETE", *args, **kwargs) + mock_client.request = timeout_request + + # Mock the stream method to raise timeout + def mock_stream(*args, **kwargs): + raise httpx.ConnectTimeout("Connection timed out") + + mock_client.stream = mock_stream + session_factory_mock.return_value = mock_client + yield elif mode is OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1: with patch("datasets.config.HF_HUB_OFFLINE", True): yield @@ -456,12 +490,11 @@ def is_rng_equal(rng1, rng2): def xfail_if_500_502_http_error(func): import decorator - from requests.exceptions import HTTPError def _wrapper(func, *args, **kwargs): try: return func(*args, **kwargs) - except HTTPError as err: + except httpx.HTTPError as err: if str(err).startswith("500") or str(err).startswith("502"): pytest.xfail(str(err)) raise err From 07b3a9dbd4a0ae263136133fdedbec36f9a8a686 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Thu, 18 Sep 2025 17:35:08 +0200 Subject: [PATCH 02/32] code quality --- src/datasets/utils/file_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 9c9e33285ad..2ddc09585c2 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -4,7 +4,6 @@ Copyright by the AllenNLP authors. """ -import asyncio import glob import io import json From 532e50799d2dbdd9947b9734df2e94c7db40f2ce Mon Sep 17 00:00:00 2001 From: Wauplin Date: Fri, 19 Sep 2025 14:26:22 +0200 Subject: [PATCH 03/32] add back requests --- setup.py | 1 + src/datasets/utils/file_utils.py | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c557992fa76..5b74fc930b4 100644 --- a/setup.py +++ b/setup.py @@ -117,6 +117,7 @@ # For performance gains with apache arrow "pandas", # for downloading datasets over HTTPS + "requests>=2.32.2", "httpx<1.0.0", # progress bars in downloads and data operations "tqdm>=4.66.3", diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 2ddc09585c2..03d3ffb0dc8 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -4,6 +4,7 @@ Copyright by the AllenNLP authors. """ +import asyncio import glob import io import json @@ -29,6 +30,7 @@ import httpx import huggingface_hub import huggingface_hub.errors +import requests from fsspec.core import strip_protocol, url_to_fs from fsspec.utils import can_be_local from huggingface_hub.utils import get_session, insecure_hashlib @@ -811,7 +813,13 @@ def read_with_retries(*args, **kwargs): try: out = read(*args, **kwargs) break - except httpx.RequestError as err: + except ( + _AiohttpClientError, + asyncio.TimeoutError, + requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + httpx.RequestError, + ) as err: disconnect_err = err logger.warning( f"Got disconnected from remote data host. Retrying in {config.STREAMING_READ_RETRY_INTERVAL}sec [{retry}/{max_retries}]" From fe845fbf93bb824cf823aba824c2e4220d86e485 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Fri, 19 Sep 2025 14:51:53 +0200 Subject: [PATCH 04/32] install transformers from source --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 5b74fc930b4..3b4474cb31c 100644 --- a/setup.py +++ b/setup.py @@ -181,7 +181,7 @@ "tiktoken", "torch>=2.8.0", "torchdata", - "transformers>=4.42.0", # Pins numpy < 2 + "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0", # Pins numpy < 2 "zstandard", "polars[timezone]>=0.20.0", "Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced @@ -200,7 +200,7 @@ DOCS_REQUIRE = [ # Following dependencies are required for the Python reference to be built properly - "transformers", + "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0", "torch", "tensorflow>=2.6.0", ] From 8b5e0f39ba4e5cd0df512f58244e6f2b198dd7c4 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 22 Sep 2025 14:02:44 +0200 Subject: [PATCH 05/32] will it work? --- .github/workflows/ci.yml | 22 ++++++++++++++++------ setup.py | 4 ++-- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7fe210d99fe..fc7d6721666 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,7 +13,6 @@ env: CI_HEADERS: ${{ secrets.CI_HEADERS }} jobs: - check_code_quality: runs-on: ubuntu-latest steps: @@ -26,6 +25,7 @@ jobs: run: | python -m pip install --upgrade pip pip install .[quality] + pip install git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 - name: Check quality run: | ruff check tests src benchmarks utils setup.py # linter @@ -35,7 +35,7 @@ jobs: needs: check_code_quality strategy: matrix: - test: ['unit', 'integration'] + test: ["unit", "integration"] os: [ubuntu-latest, windows-latest] deps_versions: [deps-latest, deps-minimum] continue-on-error: ${{ matrix.test == 'integration' }} @@ -48,7 +48,7 @@ jobs: if: ${{ matrix.os == 'ubuntu-latest' }} run: | sudo apt update - sudo apt install -y ffmpeg + sudo apt install -y ffmpeg - name: Set up Python 3.9 uses: actions/setup-python@v5 with: @@ -73,6 +73,10 @@ jobs: - name: Install dependencies (latest versions) if: ${{ matrix.deps_versions == 'deps-latest' }} run: uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9" + - name: Install transformers from source (latest versions) + if: ${{ matrix.deps_versions == 'deps-latest' }} + run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 + - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} run: uv pip install --system pyarrow==21.0.0 huggingface-hub==0.24.7 transformers dill==0.3.1.1 @@ -84,7 +88,7 @@ jobs: needs: check_code_quality strategy: matrix: - test: ['unit'] + test: ["unit"] os: [ubuntu-latest, windows-latest] deps_versions: [deps-latest] continue-on-error: false @@ -119,6 +123,9 @@ jobs: run: pip install --upgrade uv - name: Install dependencies run: uv pip install --system "datasets[tests] @ ." + - name: Install transformers from source + run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 + - name: Test with pytest run: | python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/ @@ -127,7 +134,7 @@ jobs: needs: check_code_quality strategy: matrix: - test: ['unit'] + test: ["unit"] os: [ubuntu-latest, windows-latest] deps_versions: [deps-latest] continue-on-error: false @@ -140,7 +147,7 @@ jobs: if: ${{ matrix.os == 'ubuntu-latest' }} run: | sudo apt update - sudo apt install -y ffmpeg + sudo apt install -y ffmpeg - name: Set up Python 3.11 uses: actions/setup-python@v5 with: @@ -162,6 +169,9 @@ jobs: run: pip install --upgrade uv - name: Install dependencies run: uv pip install --system "datasets[tests_numpy2] @ ." + - name: Install transformers from source + run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 + - name: Test with pytest run: | python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/ diff --git a/setup.py b/setup.py index a7207eea963..408bf0552f6 100644 --- a/setup.py +++ b/setup.py @@ -181,7 +181,7 @@ "tiktoken", "torch>=2.8.0", "torchdata", - "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0", # Pins numpy < 2 + # "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0", # Pins numpy < 2 "zstandard", "polars[timezone]>=0.20.0", "Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced @@ -200,7 +200,7 @@ DOCS_REQUIRE = [ # Following dependencies are required for the Python reference to be built properly - "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0", + # "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0", "torch", "tensorflow>=2.6.0", ] From 09ac91014e40c1ebee4b1054f38b2224353450cf Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 22 Sep 2025 14:23:09 +0200 Subject: [PATCH 06/32] to remove later: don't fail fast --- .github/workflows/ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fc7d6721666..2f09d97e2ea 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,7 +38,7 @@ jobs: test: ["unit", "integration"] os: [ubuntu-latest, windows-latest] deps_versions: [deps-latest, deps-minimum] - continue-on-error: ${{ matrix.test == 'integration' }} + # continue-on-error: ${{ matrix.test == 'integration' }} runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 @@ -91,7 +91,7 @@ jobs: test: ["unit"] os: [ubuntu-latest, windows-latest] deps_versions: [deps-latest] - continue-on-error: false + # continue-on-error: false runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 @@ -137,7 +137,7 @@ jobs: test: ["unit"] os: [ubuntu-latest, windows-latest] deps_versions: [deps-latest] - continue-on-error: false + # continue-on-error: false runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 From bb921737667f9a796dff8fe601a8fa9e65e67012 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 22 Sep 2025 14:43:58 +0200 Subject: [PATCH 07/32] don't fail fast --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2f09d97e2ea..51e52d2f6bb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,6 +34,7 @@ jobs: test: needs: check_code_quality strategy: + fail-fast: false matrix: test: ["unit", "integration"] os: [ubuntu-latest, windows-latest] @@ -87,6 +88,7 @@ jobs: test_py311: needs: check_code_quality strategy: + fail-fast: false matrix: test: ["unit"] os: [ubuntu-latest, windows-latest] @@ -133,6 +135,7 @@ jobs: test_py311_numpy2: needs: check_code_quality strategy: + fail-fast: false matrix: test: ["unit"] os: [ubuntu-latest, windows-latest] From bd6945c064a387cce21f1d4881e6b7bf030d14e3 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 22 Sep 2025 14:50:52 +0200 Subject: [PATCH 08/32] fix test fixture --- tests/fixtures/hub.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/fixtures/hub.py b/tests/fixtures/hub.py index feff5651eae..fe3c5897f00 100644 --- a/tests/fixtures/hub.py +++ b/tests/fixtures/hub.py @@ -23,9 +23,7 @@ def ci_hub_config(monkeypatch): monkeypatch.setattr("datasets.config.HF_ENDPOINT", CI_HUB_ENDPOINT) monkeypatch.setattr("datasets.config.HUB_DATASETS_URL", CI_HUB_DATASETS_URL) - monkeypatch.setattr( - "huggingface_hub.file_download.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE - ) + monkeypatch.setattr("huggingface_hub.constants.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE) old_environ = dict(os.environ) os.environ["HF_ENDPOINT"] = CI_HUB_ENDPOINT yield From ac365fa5235884b5f9d054955779facc031bfa9c Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 22 Sep 2025 14:52:40 +0200 Subject: [PATCH 09/32] fix OfflineModeIsEnabled test --- tests/test_offline_util.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_offline_util.py b/tests/test_offline_util.py index bb56be9fcdc..1166755e28b 100644 --- a/tests/test_offline_util.py +++ b/tests/test_offline_util.py @@ -2,6 +2,7 @@ import httpx import pytest +from huggingface_hub.errors import OfflineModeIsEnabled from datasets.utils.file_utils import fsspec_get, fsspec_head @@ -32,7 +33,7 @@ def test_offline_with_connection_error(): def test_offline_with_datasets_offline_mode_enabled(): with offline(OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1): - with pytest.raises(httpx.ConnectTimeout): + with pytest.raises(OfflineModeIsEnabled): fsspec_head("hf://dummy") - with pytest.raises(httpx.ConnectTimeout), NamedTemporaryFile() as temp_file: + with pytest.raises(OfflineModeIsEnabled), NamedTemporaryFile() as temp_file: fsspec_get("hf://dummy", temp_file=temp_file) From 248b0523c11394a69db1c2e30aecb4143679a64d Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 22 Sep 2025 15:03:25 +0200 Subject: [PATCH 10/32] huggingface_hub 1.0.0 even if deps latest --- .github/workflows/ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 51e52d2f6bb..7333dc913aa 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -75,12 +75,11 @@ jobs: if: ${{ matrix.deps_versions == 'deps-latest' }} run: uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9" - name: Install transformers from source (latest versions) - if: ${{ matrix.deps_versions == 'deps-latest' }} run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} - run: uv pip install --system pyarrow==21.0.0 huggingface-hub==0.24.7 transformers dill==0.3.1.1 + run: uv pip install --system pyarrow==21.0.0 dill==0.3.1.1 - name: Test with pytest run: | python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/ From 167882cdbdadf2d3ddedac548af3f7c6018ba1bf Mon Sep 17 00:00:00 2001 From: Wauplin Date: Tue, 23 Sep 2025 17:19:06 +0200 Subject: [PATCH 11/32] will be broken but better --- .github/workflows/ci.yml | 17 ++++++++--------- src/datasets/iterable_dataset.py | 2 +- src/datasets/load.py | 21 +++++++++++++++++---- src/datasets/utils/file_utils.py | 12 ++++++------ tests/fixtures/hub.py | 9 ++++++++- 5 files changed, 40 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7333dc913aa..eb815d88e6c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,6 +13,7 @@ env: CI_HEADERS: ${{ secrets.CI_HEADERS }} jobs: + check_code_quality: runs-on: ubuntu-latest steps: @@ -25,7 +26,7 @@ jobs: run: | python -m pip install --upgrade pip pip install .[quality] - pip install git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 + pip install git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 # temporary (only for tests) - name: Check quality run: | ruff check tests src benchmarks utils setup.py # linter @@ -34,12 +35,11 @@ jobs: test: needs: check_code_quality strategy: - fail-fast: false matrix: - test: ["unit", "integration"] + test: ['unit', 'integration'] os: [ubuntu-latest, windows-latest] deps_versions: [deps-latest, deps-minimum] - # continue-on-error: ${{ matrix.test == 'integration' }} + continue-on-error: ${{ matrix.test == 'integration' }} runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 @@ -49,7 +49,7 @@ jobs: if: ${{ matrix.os == 'ubuntu-latest' }} run: | sudo apt update - sudo apt install -y ffmpeg + sudo apt install -y ffmpeg - name: Set up Python 3.9 uses: actions/setup-python@v5 with: @@ -87,12 +87,11 @@ jobs: test_py311: needs: check_code_quality strategy: - fail-fast: false matrix: - test: ["unit"] + test: ['unit'] os: [ubuntu-latest, windows-latest] deps_versions: [deps-latest] - # continue-on-error: false + continue-on-error: false runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 @@ -102,7 +101,7 @@ jobs: if: ${{ matrix.os == 'ubuntu-latest' }} run: | sudo apt update - sudo apt install -y ffmpeg + sudo apt install -y ffmpeg - name: Set up Python 3.11 uses: actions/setup-python@v5 with: diff --git a/src/datasets/iterable_dataset.py b/src/datasets/iterable_dataset.py index b931342db34..2578309bd78 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -27,8 +27,8 @@ import pyarrow as pa import pyarrow.parquet as pq from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi -from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError from huggingface_hub.hf_api import RepoFile +from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError from multiprocess import Pool from . import config diff --git a/src/datasets/load.py b/src/datasets/load.py index 557ce3514f5..ae3b9825970 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -29,18 +29,19 @@ import fsspec import httpx +import requests import yaml from fsspec.core import url_to_fs from huggingface_hub import DatasetCard, DatasetCardData, HfApi -from huggingface_hub.errors import ( +from huggingface_hub.utils import ( EntryNotFoundError, GatedRepoError, LocalEntryNotFoundError, OfflineModeIsEnabled, RepositoryNotFoundError, RevisionNotFoundError, + get_session, ) -from huggingface_hub.utils import get_session from . import __version__, config from .arrow_dataset import Dataset @@ -944,7 +945,13 @@ def dataset_module_factory( except LocalEntryNotFoundError as e: if isinstance( e.__cause__, - (OfflineModeIsEnabled, httpx.ConnectError, httpx.TimeoutException), + ( + OfflineModeIsEnabled, + requests.exceptions.Timeout, + requests.exceptions.ConnectionError, + httpx.ConnectError, + httpx.TimeoutException, + ), ): raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e else: @@ -955,7 +962,13 @@ def dataset_module_factory( revision=revision, timeout=100.0, ).sha - except (OfflineModeIsEnabled, httpx.ConnectError, httpx.TimeoutException) as e: + except ( + OfflineModeIsEnabled, + requests.exceptions.Timeout, + requests.exceptions.ConnectionError, + httpx.ConnectError, + httpx.TimeoutException, + ) as e: raise ConnectionError(f"Couldn't reach '{path}' on the Hub ({e.__class__.__name__})") from e except GatedRepoError as e: message = f"Dataset '{path}' is a gated dataset on the Hub." diff --git a/src/datasets/utils/file_utils.py b/src/datasets/utils/file_utils.py index 03d3ffb0dc8..7a07f8cd267 100644 --- a/src/datasets/utils/file_utils.py +++ b/src/datasets/utils/file_utils.py @@ -141,7 +141,7 @@ def cached_path( ConnectionError: in case of unreachable url and no cache on disk ValueError: if it couldn't parse the url or filename correctly - httpx.NetworkError: in case of internet connection issue + httpx.NetworkError or requests.exceptions.ConnectionError: in case of internet connection issue """ if download_config is None: download_config = DownloadConfig(**download_kwargs) @@ -184,10 +184,10 @@ def cached_path( proxies=download_config.proxies, ) except ( - huggingface_hub.errors.RepositoryNotFoundError, - huggingface_hub.errors.EntryNotFoundError, - huggingface_hub.errors.RevisionNotFoundError, - huggingface_hub.errors.GatedRepoError, + huggingface_hub.utils.RepositoryNotFoundError, + huggingface_hub.utils.EntryNotFoundError, + huggingface_hub.utils.RevisionNotFoundError, + huggingface_hub.utils.GatedRepoError, ) as e: raise FileNotFoundError(str(e)) from e # Download external files @@ -754,7 +754,7 @@ def xgetsize(path, download_config: Optional[DownloadConfig] = None) -> int: fs, *_ = fs, *_ = url_to_fs(path, **storage_options) try: size = fs.size(main_hop) - except huggingface_hub.errors.EntryNotFoundError: + except huggingface_hub.utils.EntryNotFoundError: raise FileNotFoundError(f"No such file: {path}") if size is None: # use xopen instead of fs.open to make data fetching more robust diff --git a/tests/fixtures/hub.py b/tests/fixtures/hub.py index fe3c5897f00..a6ba8472f21 100644 --- a/tests/fixtures/hub.py +++ b/tests/fixtures/hub.py @@ -5,8 +5,8 @@ from typing import Optional import pytest -from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError from huggingface_hub.hf_api import HfApi +from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError from huggingface_hub.utils._headers import _http_user_agent @@ -24,6 +24,13 @@ def ci_hub_config(monkeypatch): monkeypatch.setattr("datasets.config.HF_ENDPOINT", CI_HUB_ENDPOINT) monkeypatch.setattr("datasets.config.HUB_DATASETS_URL", CI_HUB_DATASETS_URL) monkeypatch.setattr("huggingface_hub.constants.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE) + try: + # for backward compatibility with huggingface_hub 0.x + monkeypatch.setattr( + "huggingface_hub.file_download.HUGGINGFACE_CO_URL_TEMPLATE", CI_HFH_HUGGINGFACE_CO_URL_TEMPLATE + ) + except AttributeError: + pass old_environ = dict(os.environ) os.environ["HF_ENDPOINT"] = CI_HUB_ENDPOINT yield From 3c7c555d8a95734977e29a8c9a2942be0fe30244 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Wed, 24 Sep 2025 13:20:20 +0200 Subject: [PATCH 12/32] pip list in CI --- .github/workflows/ci.yml | 15 ++++++++++----- setup.py | 6 +++--- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index eb815d88e6c..da783ee7a4d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ jobs: run: | python -m pip install --upgrade pip pip install .[quality] - pip install git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 # temporary (only for tests) + pip install git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc1 # temporary (only for tests) - name: Check quality run: | ruff check tests src benchmarks utils setup.py # linter @@ -75,11 +75,13 @@ jobs: if: ${{ matrix.deps_versions == 'deps-latest' }} run: uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9" - name: Install transformers from source (latest versions) - run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 + run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc1 # temporary (only for tests) - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} run: uv pip install --system pyarrow==21.0.0 dill==0.3.1.1 + - name: Print dependencies + run: uv pip list - name: Test with pytest run: | python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/ @@ -124,8 +126,9 @@ jobs: - name: Install dependencies run: uv pip install --system "datasets[tests] @ ." - name: Install transformers from source - run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 - + run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc1 # temporary (only for tests) + - name: Print dependencies + run: uv pip list - name: Test with pytest run: | python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/ @@ -171,7 +174,9 @@ jobs: - name: Install dependencies run: uv pip install --system "datasets[tests_numpy2] @ ." - name: Install transformers from source - run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 + run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc1 + - name: Print dependencies + run: pip list - name: Test with pytest run: | diff --git a/setup.py b/setup.py index 408bf0552f6..6019002bc92 100644 --- a/setup.py +++ b/setup.py @@ -129,7 +129,7 @@ # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143 "fsspec[http]>=2023.1.0,<=2025.9.0", # To get datasets from the Datasets Hub on huggingface.co - "huggingface-hub==1.0.0.rc0", + "huggingface-hub==1.0.0.rc1", # Utilities from PyPA to e.g., compare versions "packaging", # To parse YAML metadata from dataset cards @@ -181,7 +181,7 @@ "tiktoken", "torch>=2.8.0", "torchdata", - # "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0", # Pins numpy < 2 + # "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc1", # Pins numpy < 2 "zstandard", "polars[timezone]>=0.20.0", "Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced @@ -200,7 +200,7 @@ DOCS_REQUIRE = [ # Following dependencies are required for the Python reference to be built properly - # "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0", + # "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc1", "torch", "tensorflow>=2.6.0", ] From 67f8d2ac48c25c0e7620618e774988c37a067a33 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Wed, 24 Sep 2025 13:35:46 +0200 Subject: [PATCH 13/32] revert branch --- .github/workflows/ci.yml | 8 ++++---- setup.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index da783ee7a4d..faa3a855e29 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ jobs: run: | python -m pip install --upgrade pip pip install .[quality] - pip install git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc1 # temporary (only for tests) + pip install git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 # temporary (only for tests) - name: Check quality run: | ruff check tests src benchmarks utils setup.py # linter @@ -75,7 +75,7 @@ jobs: if: ${{ matrix.deps_versions == 'deps-latest' }} run: uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9" - name: Install transformers from source (latest versions) - run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc1 # temporary (only for tests) + run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 # temporary (only for tests) - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} @@ -126,7 +126,7 @@ jobs: - name: Install dependencies run: uv pip install --system "datasets[tests] @ ." - name: Install transformers from source - run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc1 # temporary (only for tests) + run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 # temporary (only for tests) - name: Print dependencies run: uv pip list - name: Test with pytest @@ -174,7 +174,7 @@ jobs: - name: Install dependencies run: uv pip install --system "datasets[tests_numpy2] @ ." - name: Install transformers from source - run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc1 + run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 - name: Print dependencies run: pip list diff --git a/setup.py b/setup.py index 6019002bc92..2c19e29b215 100644 --- a/setup.py +++ b/setup.py @@ -181,7 +181,7 @@ "tiktoken", "torch>=2.8.0", "torchdata", - # "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc1", # Pins numpy < 2 + # "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0", # Pins numpy < 2 "zstandard", "polars[timezone]>=0.20.0", "Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced @@ -200,7 +200,7 @@ DOCS_REQUIRE = [ # Following dependencies are required for the Python reference to be built properly - # "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc1", + # "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0", "torch", "tensorflow>=2.6.0", ] From 9a23ff50692453f1d8f52f38d53a3632ee84d837 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Wed, 24 Sep 2025 14:20:13 +0200 Subject: [PATCH 14/32] install latest only in latest tests --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index faa3a855e29..022e88fcfc9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -75,6 +75,7 @@ jobs: if: ${{ matrix.deps_versions == 'deps-latest' }} run: uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9" - name: Install transformers from source (latest versions) + if: ${{ matrix.deps_versions == 'deps-latest' }} run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 # temporary (only for tests) - name: Install dependencies (minimum versions) @@ -126,6 +127,7 @@ jobs: - name: Install dependencies run: uv pip install --system "datasets[tests] @ ." - name: Install transformers from source + if: ${{ matrix.deps_versions == 'deps-latest' }} run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 # temporary (only for tests) - name: Print dependencies run: uv pip list From 9efbb0297cf986ef88b60e3be0a242362c8cf447 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Wed, 24 Sep 2025 14:54:08 +0200 Subject: [PATCH 15/32] offline --- tests/test_offline_util.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/test_offline_util.py b/tests/test_offline_util.py index 1166755e28b..16daa49e7d9 100644 --- a/tests/test_offline_util.py +++ b/tests/test_offline_util.py @@ -2,6 +2,7 @@ import httpx import pytest +import requests from huggingface_hub.errors import OfflineModeIsEnabled from datasets.utils.file_utils import fsspec_get, fsspec_head @@ -13,11 +14,17 @@ @require_not_windows # fsspec get keeps a file handle on windows that raises PermissionError def test_offline_with_timeout(): with offline(OfflineSimulationMode.CONNECTION_TIMES_OUT): + with pytest.raises(RequestWouldHangIndefinitelyError): + requests.request("GET", "https://huggingface.co") with pytest.raises(RequestWouldHangIndefinitelyError): httpx.request("GET", "https://huggingface.co") + + with pytest.raises(requests.exceptions.Timeout): + requests.request("GET", "https://huggingface.co", timeout=1.0) with pytest.raises(httpx.ConnectTimeout): httpx.request("GET", "https://huggingface.co", timeout=1.0) - with pytest.raises(httpx.ConnectTimeout), NamedTemporaryFile() as temp_file: + + with pytest.raises((requests.exceptions.Timeout, httpx.ConnectTimeout)), NamedTemporaryFile() as temp_file: fsspec_get("hf://dummy", temp_file=temp_file) @@ -25,9 +32,12 @@ def test_offline_with_timeout(): @require_not_windows # fsspec get keeps a file handle on windows that raises PermissionError def test_offline_with_connection_error(): with offline(OfflineSimulationMode.CONNECTION_FAILS): + with pytest.raises(requests.exceptions.Timeout): + requests.request("GET", "https://huggingface.co") with pytest.raises(httpx.ConnectError): httpx.request("GET", "https://huggingface.co") - with pytest.raises(httpx.ConnectError), NamedTemporaryFile() as temp_file: + + with pytest.raises((requests.exceptions.Timeout, httpx.ConnectError)), NamedTemporaryFile() as temp_file: fsspec_get("hf://dummy", temp_file=temp_file) From 48e55a4446a1839658a6cac021d65b877920bfee Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 29 Sep 2025 18:05:57 +0200 Subject: [PATCH 16/32] get back to normal --- .github/workflows/ci.yml | 16 +++++----------- setup.py | 6 +++--- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 022e88fcfc9..4490040283d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,6 @@ jobs: run: | python -m pip install --upgrade pip pip install .[quality] - pip install git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 # temporary (only for tests) - name: Check quality run: | ruff check tests src benchmarks utils setup.py # linter @@ -73,11 +72,10 @@ jobs: run: uv pip install --system "datasets[tests] @ ." - name: Install dependencies (latest versions) if: ${{ matrix.deps_versions == 'deps-latest' }} - run: uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9" - - name: Install transformers from source (latest versions) - if: ${{ matrix.deps_versions == 'deps-latest' }} - run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 # temporary (only for tests) - + run: + uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9" + uv pip uninstall -y transformers && uv pip uninstall -y huggingface_hub # temporary (only for tests) + uv pip install --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} run: uv pip install --system pyarrow==21.0.0 dill==0.3.1.1 @@ -126,9 +124,6 @@ jobs: run: pip install --upgrade uv - name: Install dependencies run: uv pip install --system "datasets[tests] @ ." - - name: Install transformers from source - if: ${{ matrix.deps_versions == 'deps-latest' }} - run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 # temporary (only for tests) - name: Print dependencies run: uv pip list - name: Test with pytest @@ -138,12 +133,11 @@ jobs: test_py311_numpy2: needs: check_code_quality strategy: - fail-fast: false matrix: test: ["unit"] os: [ubuntu-latest, windows-latest] deps_versions: [deps-latest] - # continue-on-error: false + continue-on-error: false runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 diff --git a/setup.py b/setup.py index 2c19e29b215..62051ee1165 100644 --- a/setup.py +++ b/setup.py @@ -129,7 +129,7 @@ # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143 "fsspec[http]>=2023.1.0,<=2025.9.0", # To get datasets from the Datasets Hub on huggingface.co - "huggingface-hub==1.0.0.rc1", + "huggingface-hub>=0.24.0,<2.0", # Utilities from PyPA to e.g., compare versions "packaging", # To parse YAML metadata from dataset cards @@ -181,7 +181,7 @@ "tiktoken", "torch>=2.8.0", "torchdata", - # "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0", # Pins numpy < 2 + "transformers>=4.42.0", # Pins numpy < 2 "zstandard", "polars[timezone]>=0.20.0", "Pillow>=9.4.0", # When PIL.Image.ExifTags was introduced @@ -200,7 +200,7 @@ DOCS_REQUIRE = [ # Following dependencies are required for the Python reference to be built properly - # "git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0", + "transformers", "torch", "tensorflow>=2.6.0", ] From ac0366df91b1169f0e44f8bb7a5e48112384d2b4 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 29 Sep 2025 18:09:20 +0200 Subject: [PATCH 17/32] better --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4490040283d..0af33119255 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,7 +74,7 @@ jobs: if: ${{ matrix.deps_versions == 'deps-latest' }} run: uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9" - uv pip uninstall -y transformers && uv pip uninstall -y huggingface_hub # temporary (only for tests) + uv pip uninstall transformers huggingface_hub # temporary (only for tests) uv pip install --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} @@ -102,7 +102,7 @@ jobs: if: ${{ matrix.os == 'ubuntu-latest' }} run: | sudo apt update - sudo apt install -y ffmpeg + sudo apt install -y ffmpeg - name: Set up Python 3.11 uses: actions/setup-python@v5 with: From 01fd0115875de96ea867f66509b7969162855caf Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 29 Sep 2025 18:10:00 +0200 Subject: [PATCH 18/32] ofc --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0af33119255..28bf7e55b47 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -78,7 +78,7 @@ jobs: uv pip install --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} - run: uv pip install --system pyarrow==21.0.0 dill==0.3.1.1 + run: uv pip install --system pyarrow==21.0.0 huggingface-hub==0.24.7 transformers dill==0.3.1.1 - name: Print dependencies run: uv pip list - name: Test with pytest From 6a7151e00142b9acac572874e7212e3ed060cc3e Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 29 Sep 2025 18:10:49 +0200 Subject: [PATCH 19/32] why not --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 28bf7e55b47..89042bd6004 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -134,7 +134,7 @@ jobs: needs: check_code_quality strategy: matrix: - test: ["unit"] + test: ['unit'] os: [ubuntu-latest, windows-latest] deps_versions: [deps-latest] continue-on-error: false From b799352515964e2de6690669f6e4fffd4abea412 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 29 Sep 2025 18:11:54 +0200 Subject: [PATCH 20/32] as before --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 89042bd6004..61021d79874 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -147,7 +147,7 @@ jobs: if: ${{ matrix.os == 'ubuntu-latest' }} run: | sudo apt update - sudo apt install -y ffmpeg + sudo apt install -y ffmpeg - name: Set up Python 3.11 uses: actions/setup-python@v5 with: From 784256c3acbe5e476ba7d4875b147248de7c5191 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 29 Sep 2025 18:13:32 +0200 Subject: [PATCH 21/32] this time is good --- .github/workflows/ci.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 61021d79874..f20b88e224e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -168,9 +168,10 @@ jobs: - name: Install uv run: pip install --upgrade uv - name: Install dependencies - run: uv pip install --system "datasets[tests_numpy2] @ ." - - name: Install transformers from source - run: uv pip install --system git+https://github.com/huggingface/transformers.git@ci-test-huggingface-hub-v1.0.0.rc0 + run: + uv pip install --system "datasets[tests_numpy2] @ ." + uv pip uninstall transformers huggingface_hub # temporary (only for tests) + uv pip install --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) - name: Print dependencies run: pip list From f7dcf49079c335a58c7ff7b67672c19f2d3048b6 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 29 Sep 2025 18:20:07 +0200 Subject: [PATCH 22/32] fix yaml format --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f20b88e224e..e0883a8137a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -72,7 +72,7 @@ jobs: run: uv pip install --system "datasets[tests] @ ." - name: Install dependencies (latest versions) if: ${{ matrix.deps_versions == 'deps-latest' }} - run: + run: | uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9" uv pip uninstall transformers huggingface_hub # temporary (only for tests) uv pip install --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) @@ -168,7 +168,7 @@ jobs: - name: Install uv run: pip install --upgrade uv - name: Install dependencies - run: + run: | uv pip install --system "datasets[tests_numpy2] @ ." uv pip uninstall transformers huggingface_hub # temporary (only for tests) uv pip install --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) From ace01a305b49fe5124d58371647d1b9a85106bd0 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 29 Sep 2025 18:23:39 +0200 Subject: [PATCH 23/32] system --- .github/workflows/ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e0883a8137a..6edbc10c4e1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,8 +74,8 @@ jobs: if: ${{ matrix.deps_versions == 'deps-latest' }} run: | uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9" - uv pip uninstall transformers huggingface_hub # temporary (only for tests) - uv pip install --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) + uv pip uninstall --system transformers huggingface_hub # temporary (only for tests) + uv pip install --system --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} run: uv pip install --system pyarrow==21.0.0 huggingface-hub==0.24.7 transformers dill==0.3.1.1 @@ -170,8 +170,8 @@ jobs: - name: Install dependencies run: | uv pip install --system "datasets[tests_numpy2] @ ." - uv pip uninstall transformers huggingface_hub # temporary (only for tests) - uv pip install --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) + uv pip uninstall --system transformers huggingface_hub # temporary (only for tests) + uv pip install --system --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) - name: Print dependencies run: pip list From a472f80f0cf017daaa08b069bcdd03a924ff081d Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 29 Sep 2025 19:50:34 +0200 Subject: [PATCH 24/32] fix import in o.x --- src/datasets/arrow_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 49d384aaa81..8384030bcb0 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -68,8 +68,9 @@ DatasetCardData, HfApi, ) -from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError +from huggingface_hub.errors import RepositoryNotFoundError from huggingface_hub.hf_api import RepoFile +from huggingface_hub.utils import HfHubHTTPError from multiprocess import Pool from tqdm.contrib.concurrent import thread_map From f4f0f6e71f40cbf3e9eaf56b2d33314d5d73ce76 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Mon, 29 Sep 2025 20:03:38 +0200 Subject: [PATCH 25/32] :/ --- src/datasets/arrow_dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 8384030bcb0..16cf3c72bc5 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -68,9 +68,8 @@ DatasetCardData, HfApi, ) -from huggingface_hub.errors import RepositoryNotFoundError from huggingface_hub.hf_api import RepoFile -from huggingface_hub.utils import HfHubHTTPError +from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError from multiprocess import Pool from tqdm.contrib.concurrent import thread_map From 83e3d2ce752b7e1a58d0422ab80be4d8fe4ee5df Mon Sep 17 00:00:00 2001 From: Wauplin Date: Tue, 30 Sep 2025 11:30:15 +0200 Subject: [PATCH 26/32] Bump minimal version to 0.25.0 --- .github/workflows/ci.yml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6edbc10c4e1..a40e3740ec9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -78,7 +78,7 @@ jobs: uv pip install --system --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} - run: uv pip install --system pyarrow==21.0.0 huggingface-hub==0.24.7 transformers dill==0.3.1.1 + run: uv pip install --system pyarrow==21.0.0 huggingface-hub==0.25.0 transformers dill==0.3.1.1 - name: Print dependencies run: uv pip list - name: Test with pytest diff --git a/setup.py b/setup.py index 62051ee1165..0dee50b1f42 100644 --- a/setup.py +++ b/setup.py @@ -129,7 +129,7 @@ # minimum 2023.1.0 to support protocol=kwargs in fsspec's `open`, `get_fs_token_paths`, etc.: see https://github.com/fsspec/filesystem_spec/pull/1143 "fsspec[http]>=2023.1.0,<=2025.9.0", # To get datasets from the Datasets Hub on huggingface.co - "huggingface-hub>=0.24.0,<2.0", + "huggingface-hub>=0.25.0,<2.0", # Utilities from PyPA to e.g., compare versions "packaging", # To parse YAML metadata from dataset cards From e39a04de1ce724f3e98bbd7c76049fff98eed5bc Mon Sep 17 00:00:00 2001 From: Wauplin Date: Tue, 30 Sep 2025 12:45:25 +0200 Subject: [PATCH 27/32] x-compatible offline helper --- tests/test_load.py | 5 +-- tests/utils.py | 104 ++++++++++++++++----------------------------- 2 files changed, 38 insertions(+), 71 deletions(-) diff --git a/tests/test_load.py b/tests/test_load.py index 44857867732..c8c9ee01e3e 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -767,10 +767,7 @@ def test_load_dataset_from_hub(self): def test_load_dataset_namespace(self): with self.assertRaises(DatasetNotFoundError) as context: datasets.load_dataset("hf-internal-testing/_dummy") - self.assertIn( - "hf-internal-testing/_dummy", - str(context.exception), - ) + self.assertIn("hf-internal-testing/_dummy", str(context.exception)) for offline_simulation_mode in list(OfflineSimulationMode): with offline(offline_simulation_mode): with self.assertRaises(ConnectionError) as context: diff --git a/tests/utils.py b/tests/utils.py index 83a99bb4e97..aa25f46124e 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -17,6 +17,7 @@ import httpx import pyarrow as pa import pytest +import requests from packaging import version from datasets import config @@ -68,6 +69,8 @@ def parse_flag_from_env(key, default=False): reason="test requires numpy < 2.0 on windows", ) +IS_HF_HUB_1_x = config.HF_HUB_VERSION >= version.parse("0.99") # clunky but works with pre-releases + def require_regex(test_case): """ @@ -376,82 +379,49 @@ def offline(mode=OfflineSimulationMode.CONNECTION_FAILS, timeout=1e-16): There are three offline simulation modes: CONNECTION_FAILS (default mode): a ConnectionError is raised for each network call. - Connection errors are created by mocking socket.socket - CONNECTION_TIMES_OUT: the connection hangs until it times out. - The default timeout value is low (1e-16) to speed up the tests. - Timeout errors are created by mocking httpx.request + CONNECTION_TIMES_OUT: a ReadTimeout or ConnectTimeout is raised for each network call. HF_HUB_OFFLINE_SET_TO_1: the HF_HUB_OFFLINE_SET_TO_1 environment variable is set to 1. This makes the http/ftp calls of the library instantly fail and raise an OfflineModeEnabled error. + + The raised exceptions are either from the `requests` library (if `huggingface_hub<1.0.0`) + or from the `httpx` library (if `huggingface_hub>=1.0.0`). """ - # Store the original httpx.request to avoid recursion - original_httpx_request = httpx.request - - def timeout_request(method, url, **kwargs): - # Change the url to an invalid url so that the connection hangs - invalid_url = "https://10.255.255.1" - if kwargs.get("timeout") is None: - raise RequestWouldHangIndefinitelyError( - f"Tried a call to {url} in offline mode with no timeout set. Please set a timeout." - ) - kwargs["timeout"] = timeout - try: - return original_httpx_request(method, invalid_url, **kwargs) - except Exception as e: - # The following changes in the error are just here to make the offline timeout error prettier - if hasattr(e, "request"): - e.request.url = url - if hasattr(e, "args") and e.args: - max_retry_error = e.args[0] - if hasattr(max_retry_error, "args"): - max_retry_error.args = (max_retry_error.args[0].replace("10.255.255.1", f"OfflineMock[{url}]"),) - e.args = (max_retry_error,) - raise - - def offline_socket(*args, **kwargs): - raise socket.error("Offline mode is enabled.") + # Enable offline mode + if mode is OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1: + with patch("datasets.config.HF_HUB_OFFLINE", True): + yield + return + # Determine which exception to raise based on mode if mode is OfflineSimulationMode.CONNECTION_FAILS: - # inspired from https://stackoverflow.com/a/18601897 - with patch("socket.socket", offline_socket): - with patch("huggingface_hub.utils._http.get_session") as get_session_mock: - mock_client = Mock() - - # Mock the request method to raise connection error - def mock_request(*args, **kwargs): - raise httpx.ConnectError("Connection failed") - - # Mock the stream method to raise connection error - def mock_stream(*args, **kwargs): - raise httpx.ConnectError("Connection failed") - - mock_client.request = mock_request - mock_client.stream = mock_stream - get_session_mock.return_value = mock_client - yield + exc = httpx.ConnectError if IS_HF_HUB_1_x else requests.ConnectionError + error_msg = "Connection failed" elif mode is OfflineSimulationMode.CONNECTION_TIMES_OUT: - # inspired from https://stackoverflow.com/a/904609 - with patch("httpx.request", timeout_request): - with patch("huggingface_hub.utils._http._GLOBAL_CLIENT_FACTORY") as session_factory_mock: - mock_client = Mock() - mock_client.get = lambda *args, **kwargs: timeout_request("GET", *args, **kwargs) - mock_client.post = lambda *args, **kwargs: timeout_request("POST", *args, **kwargs) - mock_client.put = lambda *args, **kwargs: timeout_request("PUT", *args, **kwargs) - mock_client.delete = lambda *args, **kwargs: timeout_request("DELETE", *args, **kwargs) - mock_client.request = timeout_request - - # Mock the stream method to raise timeout - def mock_stream(*args, **kwargs): - raise httpx.ConnectTimeout("Connection timed out") - - mock_client.stream = mock_stream - session_factory_mock.return_value = mock_client - yield - elif mode is OfflineSimulationMode.HF_HUB_OFFLINE_SET_TO_1: - with patch("datasets.config.HF_HUB_OFFLINE", True): - yield + exc = httpx.ReadTimeout if IS_HF_HUB_1_x else requests.ConnectTimeout + error_msg = "Connection timed out" else: raise ValueError("Please use a value from the OfflineSimulationMode enum.") + def error_response(*args, **kwargs): + raise exc(error_msg) + + # Patch all client methods to raise the appropriate error + client_mock = Mock() + for method in ["head", "get", "post", "put", "delete", "request", "stream"]: + setattr(client_mock, method, Mock(side_effect=error_response)) + + # Patching is slightly different depending on hfh internals + patch_target = ( + {"target": "huggingface_hub.utils._http._GLOBAL_CLIENT", "new": client_mock} + if IS_HF_HUB_1_x + else { + "target": "huggingface_hub.utils._http._get_session_from_cache", + "return_value": client_mock, + } + ) + with patch(**patch_target): + yield + @contextmanager def set_current_working_directory_to_temp_dir(*args, **kwargs): From d6e6dbc18cdc7125d92949a41c75637a9888f563 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Tue, 30 Sep 2025 12:47:43 +0200 Subject: [PATCH 28/32] code quality --- tests/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/utils.py b/tests/utils.py index aa25f46124e..e7e1820908a 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -2,7 +2,6 @@ import importlib.metadata import os import re -import socket import sys import tempfile import unittest From e94c469349a8962846e1dd127b98fef7b55066ae Mon Sep 17 00:00:00 2001 From: Wauplin Date: Tue, 30 Sep 2025 14:41:30 +0200 Subject: [PATCH 29/32] fix utils tests --- tests/test_offline_util.py | 31 +++++++++++++++++-------------- tests/utils.py | 20 +++++++++++--------- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/tests/test_offline_util.py b/tests/test_offline_util.py index 16daa49e7d9..c51f3b0659f 100644 --- a/tests/test_offline_util.py +++ b/tests/test_offline_util.py @@ -3,41 +3,44 @@ import httpx import pytest import requests +from huggingface_hub import get_session from huggingface_hub.errors import OfflineModeIsEnabled from datasets.utils.file_utils import fsspec_get, fsspec_head -from .utils import OfflineSimulationMode, RequestWouldHangIndefinitelyError, offline, require_not_windows +from .utils import ( + IS_HF_HUB_1_x, + OfflineSimulationMode, + RequestWouldHangIndefinitelyError, + offline, + require_not_windows, +) @pytest.mark.integration @require_not_windows # fsspec get keeps a file handle on windows that raises PermissionError def test_offline_with_timeout(): + expected_exception = httpx.ReadTimeout if IS_HF_HUB_1_x else requests.ConnectTimeout with offline(OfflineSimulationMode.CONNECTION_TIMES_OUT): with pytest.raises(RequestWouldHangIndefinitelyError): - requests.request("GET", "https://huggingface.co") - with pytest.raises(RequestWouldHangIndefinitelyError): - httpx.request("GET", "https://huggingface.co") + get_session().request("GET", "https://huggingface.co") - with pytest.raises(requests.exceptions.Timeout): - requests.request("GET", "https://huggingface.co", timeout=1.0) - with pytest.raises(httpx.ConnectTimeout): - httpx.request("GET", "https://huggingface.co", timeout=1.0) + with pytest.raises(expected_exception): + get_session().request("GET", "https://huggingface.co", timeout=1.0) - with pytest.raises((requests.exceptions.Timeout, httpx.ConnectTimeout)), NamedTemporaryFile() as temp_file: + with pytest.raises(expected_exception), NamedTemporaryFile() as temp_file: fsspec_get("hf://dummy", temp_file=temp_file) @pytest.mark.integration @require_not_windows # fsspec get keeps a file handle on windows that raises PermissionError def test_offline_with_connection_error(): + expected_exception = httpx.ConnectError if IS_HF_HUB_1_x else requests.ConnectionError with offline(OfflineSimulationMode.CONNECTION_FAILS): - with pytest.raises(requests.exceptions.Timeout): - requests.request("GET", "https://huggingface.co") - with pytest.raises(httpx.ConnectError): - httpx.request("GET", "https://huggingface.co") + with pytest.raises(expected_exception): + get_session().request("GET", "https://huggingface.co") - with pytest.raises((requests.exceptions.Timeout, httpx.ConnectError)), NamedTemporaryFile() as temp_file: + with pytest.raises(expected_exception), NamedTemporaryFile() as temp_file: fsspec_get("hf://dummy", temp_file=temp_file) diff --git a/tests/utils.py b/tests/utils.py index e7e1820908a..7777f8d7cd1 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -392,17 +392,19 @@ def offline(mode=OfflineSimulationMode.CONNECTION_FAILS, timeout=1e-16): return # Determine which exception to raise based on mode - if mode is OfflineSimulationMode.CONNECTION_FAILS: - exc = httpx.ConnectError if IS_HF_HUB_1_x else requests.ConnectionError - error_msg = "Connection failed" - elif mode is OfflineSimulationMode.CONNECTION_TIMES_OUT: - exc = httpx.ReadTimeout if IS_HF_HUB_1_x else requests.ConnectTimeout - error_msg = "Connection timed out" - else: - raise ValueError("Please use a value from the OfflineSimulationMode enum.") def error_response(*args, **kwargs): - raise exc(error_msg) + if mode is OfflineSimulationMode.CONNECTION_FAILS: + exc = httpx.ConnectError if IS_HF_HUB_1_x else requests.ConnectionError + elif mode is OfflineSimulationMode.CONNECTION_TIMES_OUT: + if kwargs.get("timeout") is None: + raise RequestWouldHangIndefinitelyError( + "Tried an HTTP call in offline mode with no timeout set. Please set a timeout." + ) + exc = httpx.ReadTimeout if IS_HF_HUB_1_x else requests.ConnectTimeout + else: + raise ValueError("Please use a value from the OfflineSimulationMode enum.") + raise exc(f"Offline mode {mode}") # Patch all client methods to raise the appropriate error client_mock = Mock() From f9f2f001e10b05cd0821572ed4ff033be73f82ff Mon Sep 17 00:00:00 2001 From: Wauplin Date: Tue, 30 Sep 2025 14:54:42 +0200 Subject: [PATCH 30/32] fixing last bits --- tests/test_load.py | 20 ++++++++------------ tests/utils.py | 2 +- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/tests/test_load.py b/tests/test_load.py index c8c9ee01e3e..13c5d5a85c5 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -9,7 +9,6 @@ from unittest.mock import patch import dill -import httpx import pyarrow as pa import pytest @@ -1047,19 +1046,16 @@ def test_load_dataset_with_unsupported_extensions(text_dir_with_unsupported_exte @pytest.mark.integration def test_loading_from_the_datasets_hub_with_token(): - true_request = httpx.Client().request + class CustomException(Exception): + pass - def assert_auth(method, url, *args, headers, **kwargs): - assert headers["authorization"] == "Bearer foo" - return true_request(method, url, *args, headers=headers, **kwargs) - - with patch("httpx.Client.request") as mock_request: - mock_request.side_effect = assert_auth + with patch("huggingface_hub.file_download.http_backoff") as mock_request: + mock_request.side_effect = CustomException() with tempfile.TemporaryDirectory() as tmp_dir: - with offline(): - with pytest.raises(ConnectionError): - load_dataset(SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER, cache_dir=tmp_dir, token="foo") - mock_request.assert_called() + with pytest.raises(CustomException): + load_dataset(SAMPLE_NOT_EXISTING_DATASET_IDENTIFIER, cache_dir=tmp_dir, token="foo") + mock_request.assert_called_once() + assert mock_request.call_args_list[0][1]["headers"]["authorization"] == "Bearer foo" @pytest.mark.integration diff --git a/tests/utils.py b/tests/utils.py index 7777f8d7cd1..ab70dbd374c 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -371,7 +371,7 @@ class OfflineSimulationMode(Enum): @contextmanager -def offline(mode=OfflineSimulationMode.CONNECTION_FAILS, timeout=1e-16): +def offline(mode: OfflineSimulationMode): """ Simulate offline mode. From 50f40c2cc9315b461470c35ecb19bf8d412c6e06 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Tue, 30 Sep 2025 15:32:29 +0200 Subject: [PATCH 31/32] x-version compat --- tests/test_load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_load.py b/tests/test_load.py index 13c5d5a85c5..422e6cd3180 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -1049,7 +1049,7 @@ def test_loading_from_the_datasets_hub_with_token(): class CustomException(Exception): pass - with patch("huggingface_hub.file_download.http_backoff") as mock_request: + with patch("huggingface_hub.file_download._get_metadata_or_catch_error") as mock_request: mock_request.side_effect = CustomException() with tempfile.TemporaryDirectory() as tmp_dir: with pytest.raises(CustomException): From 7903f24b86493ca780b3e7acfb75975ba3f30114 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Tue, 30 Sep 2025 15:51:45 +0200 Subject: [PATCH 32/32] final commit --- .github/workflows/ci.yml | 10 ++++++---- tests/utils.py | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a40e3740ec9..b66b12cecd0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,8 +74,9 @@ jobs: if: ${{ matrix.deps_versions == 'deps-latest' }} run: | uv pip install --system --upgrade pyarrow huggingface-hub "dill<0.3.9" - uv pip uninstall --system transformers huggingface_hub # temporary (only for tests) - uv pip install --system --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) + # TODO: remove once transformers v5 / huggingface_hub v1 are released officially + uv pip uninstall --system transformers huggingface_hub + uv pip install --system --prerelease=allow git+https://github.com/huggingface/transformers.git - name: Install dependencies (minimum versions) if: ${{ matrix.deps_versions != 'deps-latest' }} run: uv pip install --system pyarrow==21.0.0 huggingface-hub==0.25.0 transformers dill==0.3.1.1 @@ -170,8 +171,9 @@ jobs: - name: Install dependencies run: | uv pip install --system "datasets[tests_numpy2] @ ." - uv pip uninstall --system transformers huggingface_hub # temporary (only for tests) - uv pip install --system --prerelease=allow git+https://github.com/huggingface/transformers.git # temporary (only for tests) + # TODO: remove once transformers v5 / huggingface_hub v1 are released officially + uv pip uninstall --system transformers huggingface_hub + uv pip install --system --prerelease=allow git+https://github.com/huggingface/transformers.git - name: Print dependencies run: pip list diff --git a/tests/utils.py b/tests/utils.py index ab70dbd374c..166bd4789c2 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -465,7 +465,7 @@ def xfail_if_500_502_http_error(func): def _wrapper(func, *args, **kwargs): try: return func(*args, **kwargs) - except httpx.HTTPError as err: + except (requests.HTTPError, httpx.HTTPError) as err: if str(err).startswith("500") or str(err).startswith("502"): pytest.xfail(str(err)) raise err