From 1a703e713eacd32a3dc780b34fb7c8d41a8414c1 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Thu, 28 Mar 2024 10:26:24 +0100 Subject: [PATCH 1/5] Remove deprecated list_files_info --- src/datasets/arrow_dataset.py | 19 ++++++++++++++++--- src/datasets/dataset_dict.py | 9 +++++++-- src/datasets/utils/hub.py | 15 --------------- 3 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index f66cfecfe8c..87e2e263c66 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -60,7 +60,16 @@ import pyarrow as pa import pyarrow.compute as pc from fsspec.core import url_to_fs -from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi +from huggingface_hub import ( + CommitInfo, + CommitOperationAdd, + CommitOperationDelete, + DatasetCard, + DatasetCardData, + HfApi, + list_repo_tree, +) +from huggingface_hub.hf_api import RepoFile from multiprocess import Pool from tqdm.contrib.concurrent import thread_map @@ -115,7 +124,7 @@ from .utils import tqdm as hf_tqdm from .utils.deprecation_utils import deprecated from .utils.file_utils import estimate_dataset_size -from .utils.hub import list_files_info, preupload_lfs_files +from .utils.hub import preupload_lfs_files from .utils.info_utils import is_small_dataset from .utils.metadata import MetadataConfigs from .utils.py_utils import ( @@ -5577,7 +5586,11 @@ def push_to_hub( deletions, deleted_size = [], 0 repo_splits = [] # use a list to keep the order of the splits repo_files_to_add = [addition.path_in_repo for addition in additions] - for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token): + for repo_file in list_repo_tree( + api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True + ): + if not isinstance(repo_file, RepoFile): + continue if repo_file.rfilename == config.REPOCARD_FILENAME: repo_with_dataset_card = True elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME: diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 18c99dac1b8..4ad2bffa738 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -20,7 +20,9 @@ DatasetCard, DatasetCardData, HfApi, + list_repo_tree, ) +from huggingface_hub.hf_api import RepoFile from . 
import config from .arrow_dataset import PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED, Dataset @@ -34,7 +36,6 @@ from .utils import logging from .utils.deprecation_utils import deprecated from .utils.doc_utils import is_documented_by -from .utils.hub import list_files_info from .utils.metadata import MetadataConfigs from .utils.py_utils import asdict, glob_pattern_to_regex, string_to_dict from .utils.typing import PathLike @@ -1745,7 +1746,11 @@ def push_to_hub( repo_splits = [] # use a list to keep the order of the splits deletions = [] repo_files_to_add = [addition.path_in_repo for addition in additions] - for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token): + for repo_file in list_repo_tree( + api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True + ): + if not isinstance(repo_file, RepoFile): + continue if repo_file.rfilename == config.REPOCARD_FILENAME: repo_with_dataset_card = True elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME: diff --git a/src/datasets/utils/hub.py b/src/datasets/utils/hub.py index f19181b41b1..4c9ccb15abc 100644 --- a/src/datasets/utils/hub.py +++ b/src/datasets/utils/hub.py @@ -2,7 +2,6 @@ from functools import partial from huggingface_hub import HfApi, hf_hub_url -from huggingface_hub.hf_api import RepoFile from packaging import version from requests import ConnectionError, HTTPError @@ -46,19 +45,5 @@ def preupload_lfs_files(hf_api: HfApi, **kwargs): hf_api.preupload_lfs_files(**kwargs) -# `list_files_info` is deprecated in favor of `list_repo_tree` in `huggingface_hub>=0.20.0` -if config.HF_HUB_VERSION.release < version.parse("0.20.0").release: - - def list_files_info(hf_api: HfApi, **kwargs): - yield from hf_api.list_files_info(**kwargs) -else: - - def list_files_info(hf_api: HfApi, **kwargs): - kwargs = {**kwargs, "recursive": True} - for repo_path in hf_api.list_repo_tree(**kwargs): - if isinstance(repo_path, RepoFile): - yield repo_path - - # bakckward compatibility hf_hub_url = partial(hf_hub_url, repo_type="dataset") From 7ec1dd8a63d985bd9de6908fe62a74a4537e1303 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Thu, 28 Mar 2024 10:51:02 +0100 Subject: [PATCH 2/5] remove legacy preupload_lfs_file + rename hf_hub_url partial --- src/datasets/arrow_dataset.py | 2 +- src/datasets/load.py | 20 +++++----- src/datasets/utils/hub.py | 47 +----------------------- tests/test_hub.py | 6 +-- tests/test_streaming_download_manager.py | 16 ++++---- tests/test_upstream_hub.py | 6 +-- 6 files changed, 27 insertions(+), 70 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 87e2e263c66..4af4b33724e 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -68,6 +68,7 @@ DatasetCardData, HfApi, list_repo_tree, + preupload_lfs_files, ) from huggingface_hub.hf_api import RepoFile from multiprocess import Pool @@ -124,7 +125,6 @@ from .utils import tqdm as hf_tqdm from .utils.deprecation_utils import deprecated from .utils.file_utils import estimate_dataset_size -from .utils.hub import preupload_lfs_files from .utils.info_utils import is_small_dataset from .utils.metadata import MetadataConfigs from .utils.py_utils import ( diff --git a/src/datasets/load.py b/src/datasets/load.py index c08ad274acf..991a3b35651 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -84,7 +84,7 @@ relative_to_absolute_path, url_or_path_join, ) -from .utils.hub import hf_hub_url +from .utils.hub import 
hf_dataset_url from .utils.info_utils import VerificationMode, is_small_dataset from .utils.logging import get_logger from .utils.metadata import MetadataConfigs @@ -1211,7 +1211,7 @@ def get_module(self) -> DatasetModule: download_config.download_desc = "Downloading readme" try: dataset_readme_path = cached_path( - hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=revision), + hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=revision), download_config=download_config, ) dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data @@ -1222,7 +1222,7 @@ def get_module(self) -> DatasetModule: download_config.download_desc = "Downloading standalone yaml" try: standalone_yaml_path = cached_path( - hf_hub_url(self.name, config.REPOYAML_FILENAME, revision=revision), + hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=revision), download_config=download_config, ) with open(standalone_yaml_path, "r", encoding="utf-8") as f: @@ -1308,7 +1308,7 @@ def get_module(self) -> DatasetModule: ] default_config_name = None builder_kwargs = { - "base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"), + "base_path": hf_dataset_url(self.name, "", revision=revision).rstrip("/"), "repo_id": self.name, "dataset_name": camelcase_to_snakecase(Path(self.name).name), } @@ -1320,7 +1320,7 @@ def get_module(self) -> DatasetModule: try: # this file is deprecated and was created automatically in old versions of push_to_hub dataset_infos_path = cached_path( - hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision), + hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision), download_config=download_config, ) with open(dataset_infos_path, encoding="utf-8") as f: @@ -1444,14 +1444,14 @@ def __init__( increase_load_count(name, resource_type="dataset") def download_loading_script(self) -> str: - file_path = hf_hub_url(self.name, self.name.split("/")[-1] + ".py", revision=self.revision) + file_path = hf_dataset_url(self.name, self.name.split("/")[-1] + ".py", revision=self.revision) download_config = self.download_config.copy() if download_config.download_desc is None: download_config.download_desc = "Downloading builder script" return cached_path(file_path, download_config=download_config) def download_dataset_infos_file(self) -> str: - dataset_infos = hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision) + dataset_infos = hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision) # Download the dataset infos file if available download_config = self.download_config.copy() if download_config.download_desc is None: @@ -1465,7 +1465,7 @@ def download_dataset_infos_file(self) -> str: return None def download_dataset_readme_file(self) -> str: - readme_url = hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=self.revision) + readme_url = hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=self.revision) # Download the dataset infos file if available download_config = self.download_config.copy() if download_config.download_desc is None: @@ -1494,7 +1494,7 @@ def get_module(self) -> DatasetModule: imports = get_imports(local_path) local_imports = _download_additional_modules( name=self.name, - base_path=hf_hub_url(self.name, "", revision=self.revision), + base_path=hf_dataset_url(self.name, "", revision=self.revision), imports=imports, download_config=self.download_config, ) @@ -1540,7 +1540,7 @@ def get_module(self) -> DatasetModule: # make the new module to be noticed by 
the import system importlib.invalidate_caches() builder_kwargs = { - "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"), + "base_path": hf_dataset_url(self.name, "", revision=self.revision).rstrip("/"), "repo_id": self.name, } return DatasetModule(module_path, hash, builder_kwargs, importable_file_path=importable_file_path) diff --git a/src/datasets/utils/hub.py b/src/datasets/utils/hub.py index 4c9ccb15abc..6d784333b23 100644 --- a/src/datasets/utils/hub.py +++ b/src/datasets/utils/hub.py @@ -1,49 +1,6 @@ -import time from functools import partial -from huggingface_hub import HfApi, hf_hub_url -from packaging import version -from requests import ConnectionError, HTTPError +from huggingface_hub import hf_hub_url -from .. import config -from . import logging - -logger = logging.get_logger(__name__) - -# Retry `preupload_lfs_files` in `huggingface_hub<0.20.0` on the "500 (Internal Server Error)" and "503 (Service Unavailable)" HTTP errors -if config.HF_HUB_VERSION.release < version.parse("0.20.0").release: - - def preupload_lfs_files(hf_api: HfApi, **kwargs): - max_retries = 5 - base_wait_time = 1 - max_wait_time = 8 - retry = 0 - while True: - try: - hf_api.preupload_lfs_files(**kwargs) - except (RuntimeError, HTTPError, ConnectionError) as err: - if isinstance(err, RuntimeError): - if isinstance(err.__cause__, (HTTPError, ConnectionError)): - err = err.__cause__ - else: - raise err - if retry >= max_retries or err.response and err.response.status_code not in [500, 503]: - raise err - else: - sleep_time = min(max_wait_time, base_wait_time * 2**retry) # Exponential backoff - logger.info( - f"{hf_api.preupload_lfs_files} timed out, retrying in {sleep_time}s... [{retry/max_retries}]" - ) - time.sleep(sleep_time) - retry += 1 - else: - break -else: - - def preupload_lfs_files(hf_api: HfApi, **kwargs): - hf_api.preupload_lfs_files(**kwargs) - - -# bakckward compatibility -hf_hub_url = partial(hf_hub_url, repo_type="dataset") +hf_dataset_url = partial(hf_hub_url, repo_type="dataset") diff --git a/tests/test_hub.py b/tests/test_hub.py index e940d7b8b29..116ed588585 100644 --- a/tests/test_hub.py +++ b/tests/test_hub.py @@ -2,12 +2,12 @@ import pytest -from datasets.utils.hub import hf_hub_url +from datasets.utils.hub import hf_dataset_url @pytest.mark.parametrize("repo_id", ["canonical_dataset_name", "org-name/dataset-name"]) @pytest.mark.parametrize("filename", ["filename.csv", "filename with blanks.csv"]) @pytest.mark.parametrize("revision", [None, "v2"]) -def test_hf_hub_url(repo_id, filename, revision): - url = hf_hub_url(repo_id=repo_id, filename=filename, revision=revision) +def test_dataset_url(repo_id, filename, revision): + url = hf_dataset_url(repo_id=repo_id, filename=filename, revision=revision) assert url == f"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{quote(filename)}" diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py index 44e73eee73c..3e90514b26e 100644 --- a/tests/test_streaming_download_manager.py +++ b/tests/test_streaming_download_manager.py @@ -28,7 +28,7 @@ xwalk, ) from datasets.filesystems import COMPRESSION_FILESYSTEMS -from datasets.utils.hub import hf_hub_url +from datasets.utils.hub import hf_dataset_url from .utils import require_lz4, require_zstandard, slow @@ -236,7 +236,7 @@ def test_xexists(input_path, exists, tmp_path, mock_fsspec): @pytest.mark.integration def test_xexists_private(hf_private_dataset_repo_txt_data, hf_token): - root_url = 
hf_hub_url(hf_private_dataset_repo_txt_data, "") + root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "") download_config = DownloadConfig(token=hf_token) assert xexists(root_url + "data/text_data.txt", download_config=download_config) assert not xexists(root_url + "file_that_doesnt_exist.txt", download_config=download_config) @@ -321,7 +321,7 @@ def test_xlistdir(input_path, expected_paths, tmp_path, mock_fsspec): @pytest.mark.integration def test_xlistdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token): - root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip") + root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip") download_config = DownloadConfig(token=hf_token) assert len(xlistdir("zip://::" + root_url, download_config=download_config)) == 1 assert len(xlistdir("zip://main_dir::" + root_url, download_config=download_config)) == 2 @@ -350,7 +350,7 @@ def test_xisdir(input_path, isdir, tmp_path, mock_fsspec): @pytest.mark.integration def test_xisdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token): - root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip") + root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip") download_config = DownloadConfig(token=hf_token) assert xisdir("zip://::" + root_url, download_config=download_config) is True assert xisdir("zip://main_dir::" + root_url, download_config=download_config) is True @@ -376,7 +376,7 @@ def test_xisfile(input_path, isfile, tmp_path, mock_fsspec): @pytest.mark.integration def test_xisfile_private(hf_private_dataset_repo_txt_data, hf_token): - root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "") + root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "") download_config = DownloadConfig(token=hf_token) assert xisfile(root_url + "data/text_data.txt", download_config=download_config) is True assert xisfile(root_url + "qwertyuiop", download_config=download_config) is False @@ -400,7 +400,7 @@ def test_xgetsize(input_path, size, tmp_path, mock_fsspec): @pytest.mark.integration def test_xgetsize_private(hf_private_dataset_repo_txt_data, hf_token): - root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "") + root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "") download_config = DownloadConfig(token=hf_token) assert xgetsize(root_url + "data/text_data.txt", download_config=download_config) == 39 with pytest.raises(FileNotFoundError): @@ -444,7 +444,7 @@ def test_xglob(input_path, expected_paths, tmp_path, mock_fsspec): @pytest.mark.integration def test_xglob_private(hf_private_dataset_repo_zipped_txt_data, hf_token): - root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip") + root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip") download_config = DownloadConfig(token=hf_token) assert len(xglob("zip://**::" + root_url, download_config=download_config)) == 3 assert len(xglob("zip://qwertyuiop/*::" + root_url, download_config=download_config)) == 0 @@ -483,7 +483,7 @@ def test_xwalk(input_path, expected_outputs, tmp_path, mock_fsspec): @pytest.mark.integration def test_xwalk_private(hf_private_dataset_repo_zipped_txt_data, hf_token): - root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip") + root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip") download_config = DownloadConfig(token=hf_token) assert len(list(xwalk("zip://::" + root_url, download_config=download_config))) == 2 assert len(list(xwalk("zip://main_dir::" + 
root_url, download_config=download_config))) == 1 diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py index 2952cf310bf..daa0841ac98 100644 --- a/tests/test_upstream_hub.py +++ b/tests/test_upstream_hub.py @@ -33,7 +33,7 @@ FolderBasedBuilderConfig, ) from datasets.utils.file_utils import cached_path -from datasets.utils.hub import hf_hub_url +from datasets.utils.hub import hf_dataset_url from tests.fixtures.hub import CI_HUB_ENDPOINT, CI_HUB_USER, CI_HUB_USER_TOKEN from tests.utils import for_all_test_methods, require_pil, require_sndfile, xfail_if_500_502_http_error @@ -608,7 +608,7 @@ def test_push_multiple_dataset_configs_to_hub_readme_metadata_content( ds_config2.push_to_hub(ds_name, "config2", token=self._token) # check that configs args was correctly pushed to README.md - ds_readme_path = cached_path(hf_hub_url(ds_name, "README.md")) + ds_readme_path = cached_path(hf_dataset_url(ds_name, "README.md")) dataset_card_data = DatasetCard.load(ds_readme_path).data assert METADATA_CONFIGS_FIELD in dataset_card_data assert isinstance(dataset_card_data[METADATA_CONFIGS_FIELD], list) @@ -757,7 +757,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_readme_metadata_content( ds_config2.push_to_hub(ds_name, "config2", token=self._token) # check that configs args was correctly pushed to README.md - ds_readme_path = cached_path(hf_hub_url(ds_name, "README.md")) + ds_readme_path = cached_path(hf_dataset_url(ds_name, "README.md")) dataset_card_data = DatasetCard.load(ds_readme_path).data assert METADATA_CONFIGS_FIELD in dataset_card_data assert isinstance(dataset_card_data[METADATA_CONFIGS_FIELD], list) From 31f8bec22f94b43e5a5dd0c1fb2b7237043596d1 Mon Sep 17 00:00:00 2001 From: Wauplin Date: Thu, 28 Mar 2024 11:10:21 +0100 Subject: [PATCH 3/5] fix tests --- src/datasets/arrow_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 4af4b33724e..a1a35a6e06e 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -5398,7 +5398,6 @@ def shards_with_embedded_external_files(shards): uploaded_size += buffer.tell() shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=buffer) preupload_lfs_files( - api, repo_id=repo_id, additions=[shard_addition], token=token, From a0aa69e3a889038d307713ea4eedd3150aa27f1e Mon Sep 17 00:00:00 2001 From: Wauplin Date: Thu, 28 Mar 2024 12:18:40 +0100 Subject: [PATCH 4/5] fix --- src/datasets/arrow_dataset.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index a1a35a6e06e..7efb7beb515 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -68,7 +68,6 @@ DatasetCardData, HfApi, list_repo_tree, - preupload_lfs_files, ) from huggingface_hub.hf_api import RepoFile from multiprocess import Pool @@ -5397,10 +5396,9 @@ def shards_with_embedded_external_files(shards): shard.to_parquet(buffer) uploaded_size += buffer.tell() shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=buffer) - preupload_lfs_files( + api.preupload_lfs_files( repo_id=repo_id, additions=[shard_addition], - token=token, repo_type="dataset", revision=revision, create_pr=create_pr, From 64c9cdadf4be4459a001b08ad70e3787f9a808be Mon Sep 17 00:00:00 2001 From: Wauplin Date: Fri, 29 Mar 2024 12:44:33 +0100 Subject: [PATCH 5/5] fix tests --- src/datasets/arrow_dataset.py | 5 ++--- src/datasets/dataset_dict.py | 5 ++--- 2 files 
changed, 4 insertions(+), 6 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 7efb7beb515..945d9bad873 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -67,7 +67,6 @@ DatasetCard, DatasetCardData, HfApi, - list_repo_tree, ) from huggingface_hub.hf_api import RepoFile from multiprocess import Pool @@ -5583,8 +5582,8 @@ def push_to_hub( deletions, deleted_size = [], 0 repo_splits = [] # use a list to keep the order of the splits repo_files_to_add = [addition.path_in_repo for addition in additions] - for repo_file in list_repo_tree( - api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True + for repo_file in api.list_repo_tree( + repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True ): if not isinstance(repo_file, RepoFile): continue diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index 4ad2bffa738..563479f26a4 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -20,7 +20,6 @@ DatasetCard, DatasetCardData, HfApi, - list_repo_tree, ) from huggingface_hub.hf_api import RepoFile @@ -1746,8 +1745,8 @@ def push_to_hub( repo_splits = [] # use a list to keep the order of the splits deletions = [] repo_files_to_add = [addition.path_in_repo for addition in additions] - for repo_file in list_repo_tree( - api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True + for repo_file in api.list_repo_tree( + repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True ): if not isinstance(repo_file, RepoFile): continue
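
For reference, a minimal sketch of the call patterns this series migrates to: listing repository files with HfApi.list_repo_tree (which yields both RepoFile and RepoFolder entries, so folders are filtered out), pre-uploading LFS shards by calling HfApi.preupload_lfs_files directly (the retry shim removed in patch 2 was only needed for huggingface_hub<0.20.0), and building dataset file URLs with the renamed hf_dataset_url partial. The repo id, file path, and payload below are placeholders for illustration, not values taken from the patches.

    import io
    from functools import partial

    from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_url
    from huggingface_hub.hf_api import RepoFile

    api = HfApi()
    repo_id = "username/my-dataset"  # placeholder repo id, not from the patches

    # 1) Enumerate files in a dataset repo (replacement for list_files_info):
    #    list_repo_tree walks the repo and yields RepoFile/RepoFolder objects,
    #    so keep only the RepoFile entries.
    repo_files = [
        entry.path
        for entry in api.list_repo_tree(repo_id=repo_id, repo_type="dataset", recursive=True)
        if isinstance(entry, RepoFile)
    ]

    # 2) Pre-upload an LFS shard through the HfApi method itself, as push_to_hub
    #    now does instead of going through the datasets.utils.hub wrapper.
    buffer = io.BytesIO(b"placeholder shard bytes")
    shard_addition = CommitOperationAdd(path_in_repo="data/train-00000.parquet", path_or_fileobj=buffer)
    api.preupload_lfs_files(repo_id=repo_id, additions=[shard_addition], repo_type="dataset")

    # 3) hf_dataset_url is simply hf_hub_url pinned to repo_type="dataset".
    hf_dataset_url = partial(hf_hub_url, repo_type="dataset")
    url = hf_dataset_url(repo_id, "README.md", revision="main")
    # -> "https://huggingface.co/datasets/username/my-dataset/resolve/main/README.md"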