21 changes: 15 additions & 6 deletions src/datasets/arrow_dataset.py
@@ -60,7 +60,15 @@
import pyarrow as pa
import pyarrow.compute as pc
from fsspec.core import url_to_fs
from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi
from huggingface_hub import (
CommitInfo,
CommitOperationAdd,
CommitOperationDelete,
DatasetCard,
DatasetCardData,
HfApi,
)
from huggingface_hub.hf_api import RepoFile
from multiprocess import Pool
from tqdm.contrib.concurrent import thread_map

@@ -115,7 +123,6 @@
from .utils import tqdm as hf_tqdm
from .utils.deprecation_utils import deprecated
from .utils.file_utils import estimate_dataset_size
from .utils.hub import list_files_info, preupload_lfs_files
from .utils.info_utils import is_small_dataset
from .utils.metadata import MetadataConfigs
from .utils.py_utils import (
@@ -5388,11 +5395,9 @@ def shards_with_embedded_external_files(shards):
shard.to_parquet(buffer)
uploaded_size += buffer.tell()
shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=buffer)
preupload_lfs_files(
api,
api.preupload_lfs_files(
repo_id=repo_id,
additions=[shard_addition],
token=token,
repo_type="dataset",
revision=revision,
create_pr=create_pr,
@@ -5577,7 +5582,11 @@ def push_to_hub(
deletions, deleted_size = [], 0
repo_splits = [] # use a list to keep the order of the splits
repo_files_to_add = [addition.path_in_repo for addition in additions]
for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
for repo_file in api.list_repo_tree(
repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True
):
if not isinstance(repo_file, RepoFile):
continue
if repo_file.rfilename == config.REPOCARD_FILENAME:
repo_with_dataset_card = True
elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
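
Note on the hunk above: `push_to_hub` now calls `HfApi.preupload_lfs_files` directly instead of going through the retry wrapper in `datasets.utils.hub`. A minimal sketch of that upload flow, assuming a placeholder repo id and an already-authenticated `HfApi` client (neither comes from this PR):

```python
import io

from datasets import Dataset
from huggingface_hub import CommitOperationAdd, HfApi

api = HfApi()  # assumes a token is configured, e.g. via `huggingface-cli login`
repo_id = "username/my-dataset"  # placeholder; the repo is assumed to already exist

shard = Dataset.from_dict({"text": ["hello", "world"]})

# Serialize the shard to an in-memory Parquet buffer, as in the loop above.
buffer = io.BytesIO()
shard.to_parquet(buffer)

# Pre-upload the LFS blob right away; this direct call replaces the
# exponential-backoff wrapper that datasets.utils.hub provided for huggingface_hub < 0.20.0.
addition = CommitOperationAdd(
    path_in_repo="data/train-00000-of-00001.parquet", path_or_fileobj=buffer
)
api.preupload_lfs_files(repo_id=repo_id, additions=[addition], repo_type="dataset")

# A single commit later references every pre-uploaded shard.
api.create_commit(
    repo_id=repo_id,
    repo_type="dataset",
    operations=[addition],
    commit_message="Upload dataset shards",
)
```
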
8 changes: 6 additions & 2 deletions src/datasets/dataset_dict.py
@@ -21,6 +21,7 @@
DatasetCardData,
HfApi,
)
from huggingface_hub.hf_api import RepoFile

from . import config
from .arrow_dataset import PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED, Dataset
@@ -34,7 +35,6 @@
from .utils import logging
from .utils.deprecation_utils import deprecated
from .utils.doc_utils import is_documented_by
from .utils.hub import list_files_info
from .utils.metadata import MetadataConfigs
from .utils.py_utils import asdict, glob_pattern_to_regex, string_to_dict
from .utils.typing import PathLike
@@ -1745,7 +1745,11 @@ def push_to_hub(
repo_splits = [] # use a list to keep the order of the splits
deletions = []
repo_files_to_add = [addition.path_in_repo for addition in additions]
for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
for repo_file in api.list_repo_tree(
repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True
):
if not isinstance(repo_file, RepoFile):
continue
if repo_file.rfilename == config.REPOCARD_FILENAME:
repo_with_dataset_card = True
elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
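
The deprecated `list_files_info` helper yielded files only, while `list_repo_tree` also yields folders, hence the new `isinstance(repo_file, RepoFile)` guard in both `push_to_hub` implementations. A small sketch of the pattern against a placeholder repo:

```python
from huggingface_hub import HfApi
from huggingface_hub.hf_api import RepoFile

api = HfApi()

# list_repo_tree yields RepoFile and RepoFolder entries; folders are skipped to
# reproduce what the deprecated list_files_info used to return.
for entry in api.list_repo_tree(
    repo_id="username/my-dataset",  # placeholder repo id
    repo_type="dataset",
    revision="main",
    recursive=True,
):
    if not isinstance(entry, RepoFile):
        continue
    print(entry.path, entry.size)
```

Recent huggingface_hub versions expose the file path as `.path`; the diff itself keeps reading `repo_file.rfilename`, the older attribute name that is still accepted for backward compatibility.
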
20 changes: 10 additions & 10 deletions src/datasets/load.py
@@ -84,7 +84,7 @@
relative_to_absolute_path,
url_or_path_join,
)
from .utils.hub import hf_hub_url
from .utils.hub import hf_dataset_url
from .utils.info_utils import VerificationMode, is_small_dataset
from .utils.logging import get_logger
from .utils.metadata import MetadataConfigs
@@ -1211,7 +1211,7 @@ def get_module(self) -> DatasetModule:
download_config.download_desc = "Downloading readme"
try:
dataset_readme_path = cached_path(
hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=revision),
hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=revision),
download_config=download_config,
)
dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
@@ -1222,7 +1222,7 @@ def get_module(self) -> DatasetModule:
download_config.download_desc = "Downloading standalone yaml"
try:
standalone_yaml_path = cached_path(
hf_hub_url(self.name, config.REPOYAML_FILENAME, revision=revision),
hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=revision),
download_config=download_config,
)
with open(standalone_yaml_path, "r", encoding="utf-8") as f:
@@ -1308,7 +1308,7 @@ def get_module(self) -> DatasetModule:
]
default_config_name = None
builder_kwargs = {
"base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"),
"base_path": hf_dataset_url(self.name, "", revision=revision).rstrip("/"),
"repo_id": self.name,
"dataset_name": camelcase_to_snakecase(Path(self.name).name),
}
@@ -1320,7 +1320,7 @@ def get_module(self) -> DatasetModule:
try:
# this file is deprecated and was created automatically in old versions of push_to_hub
dataset_infos_path = cached_path(
hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision),
hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision),
download_config=download_config,
)
with open(dataset_infos_path, encoding="utf-8") as f:
@@ -1444,14 +1444,14 @@ def __init__(
increase_load_count(name, resource_type="dataset")

def download_loading_script(self) -> str:
file_path = hf_hub_url(self.name, self.name.split("/")[-1] + ".py", revision=self.revision)
file_path = hf_dataset_url(self.name, self.name.split("/")[-1] + ".py", revision=self.revision)
download_config = self.download_config.copy()
if download_config.download_desc is None:
download_config.download_desc = "Downloading builder script"
return cached_path(file_path, download_config=download_config)

def download_dataset_infos_file(self) -> str:
dataset_infos = hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision)
dataset_infos = hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision)
# Download the dataset infos file if available
download_config = self.download_config.copy()
if download_config.download_desc is None:
@@ -1465,7 +1465,7 @@ def download_dataset_infos_file(self) -> str:
return None

def download_dataset_readme_file(self) -> str:
readme_url = hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
readme_url = hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
# Download the dataset infos file if available
download_config = self.download_config.copy()
if download_config.download_desc is None:
@@ -1494,7 +1494,7 @@ def get_module(self) -> DatasetModule:
imports = get_imports(local_path)
local_imports = _download_additional_modules(
name=self.name,
base_path=hf_hub_url(self.name, "", revision=self.revision),
base_path=hf_dataset_url(self.name, "", revision=self.revision),
imports=imports,
download_config=self.download_config,
)
@@ -1540,7 +1540,7 @@ def get_module(self) -> DatasetModule:
# make the new module to be noticed by the import system
importlib.invalidate_caches()
builder_kwargs = {
"base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"),
"base_path": hf_dataset_url(self.name, "", revision=self.revision).rstrip("/"),
"repo_id": self.name,
}
return DatasetModule(module_path, hash, builder_kwargs, importable_file_path=importable_file_path)
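
All the `hf_hub_url` call sites in `load.py` are renamed to `hf_dataset_url`; the behaviour is unchanged, since both build dataset-repo URLs that are then fetched through `cached_path`. A hedged sketch of that flow, using a public dataset name as a stand-in:

```python
from datasets import DownloadConfig
from datasets.utils.file_utils import cached_path
from datasets.utils.hub import hf_dataset_url

# Resolve the dataset card URL the same way the Hub dataset module factory does.
readme_url = hf_dataset_url("squad", "README.md", revision="main")
download_config = DownloadConfig(download_desc="Downloading readme")

# cached_path downloads the file (network access required) and returns a local path.
local_readme = cached_path(readme_url, download_config=download_config)
print(local_readme)
```
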
62 changes: 2 additions & 60 deletions src/datasets/utils/hub.py
@@ -1,64 +1,6 @@
import time
from functools import partial

from huggingface_hub import HfApi, hf_hub_url
from huggingface_hub.hf_api import RepoFile
from packaging import version
from requests import ConnectionError, HTTPError
from huggingface_hub import hf_hub_url

from .. import config
from . import logging


logger = logging.get_logger(__name__)

# Retry `preupload_lfs_files` in `huggingface_hub<0.20.0` on the "500 (Internal Server Error)" and "503 (Service Unavailable)" HTTP errors
if config.HF_HUB_VERSION.release < version.parse("0.20.0").release:

def preupload_lfs_files(hf_api: HfApi, **kwargs):
max_retries = 5
base_wait_time = 1
max_wait_time = 8
retry = 0
while True:
try:
hf_api.preupload_lfs_files(**kwargs)
except (RuntimeError, HTTPError, ConnectionError) as err:
if isinstance(err, RuntimeError):
if isinstance(err.__cause__, (HTTPError, ConnectionError)):
err = err.__cause__
else:
raise err
if retry >= max_retries or err.response and err.response.status_code not in [500, 503]:
raise err
else:
sleep_time = min(max_wait_time, base_wait_time * 2**retry) # Exponential backoff
logger.info(
f"{hf_api.preupload_lfs_files} timed out, retrying in {sleep_time}s... [{retry/max_retries}]"
)
time.sleep(sleep_time)
retry += 1
else:
break
else:

def preupload_lfs_files(hf_api: HfApi, **kwargs):
hf_api.preupload_lfs_files(**kwargs)


# `list_files_info` is deprecated in favor of `list_repo_tree` in `huggingface_hub>=0.20.0`
if config.HF_HUB_VERSION.release < version.parse("0.20.0").release:

def list_files_info(hf_api: HfApi, **kwargs):
yield from hf_api.list_files_info(**kwargs)
else:

def list_files_info(hf_api: HfApi, **kwargs):
kwargs = {**kwargs, "recursive": True}
for repo_path in hf_api.list_repo_tree(**kwargs):
if isinstance(repo_path, RepoFile):
yield repo_path


# backward compatibility
hf_hub_url = partial(hf_hub_url, repo_type="dataset")
hf_dataset_url = partial(hf_hub_url, repo_type="dataset")
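
After this change, `datasets/utils/hub.py` reduces to a single `partial` that pins `repo_type="dataset"`, renamed to `hf_dataset_url` so it no longer shares a name with the upstream `hf_hub_url`. A quick illustration of the URLs it produces (repo id and file names are arbitrary examples):

```python
from functools import partial

from huggingface_hub import hf_hub_url

# Same one-liner as the new datasets.utils.hub module.
hf_dataset_url = partial(hf_hub_url, repo_type="dataset")

print(hf_dataset_url("org-name/dataset-name", "data/train.csv"))
# -> https://huggingface.co/datasets/org-name/dataset-name/resolve/main/data/train.csv

print(hf_dataset_url("org-name/dataset-name", "README.md", revision="v2"))
# -> https://huggingface.co/datasets/org-name/dataset-name/resolve/v2/README.md
```
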
6 changes: 3 additions & 3 deletions tests/test_hub.py
@@ -2,12 +2,12 @@

import pytest

from datasets.utils.hub import hf_hub_url
from datasets.utils.hub import hf_dataset_url


@pytest.mark.parametrize("repo_id", ["canonical_dataset_name", "org-name/dataset-name"])
@pytest.mark.parametrize("filename", ["filename.csv", "filename with blanks.csv"])
@pytest.mark.parametrize("revision", [None, "v2"])
def test_hf_hub_url(repo_id, filename, revision):
url = hf_hub_url(repo_id=repo_id, filename=filename, revision=revision)
def test_dataset_url(repo_id, filename, revision):
url = hf_dataset_url(repo_id=repo_id, filename=filename, revision=revision)
assert url == f"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{quote(filename)}"
16 changes: 8 additions & 8 deletions tests/test_streaming_download_manager.py
@@ -28,7 +28,7 @@
xwalk,
)
from datasets.filesystems import COMPRESSION_FILESYSTEMS
from datasets.utils.hub import hf_hub_url
from datasets.utils.hub import hf_dataset_url

from .utils import require_lz4, require_zstandard, slow

@@ -236,7 +236,7 @@ def test_xexists(input_path, exists, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xexists_private(hf_private_dataset_repo_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "")
download_config = DownloadConfig(token=hf_token)
assert xexists(root_url + "data/text_data.txt", download_config=download_config)
assert not xexists(root_url + "file_that_doesnt_exist.txt", download_config=download_config)
@@ -321,7 +321,7 @@ def test_xlistdir(input_path, expected_paths, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xlistdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
download_config = DownloadConfig(token=hf_token)
assert len(xlistdir("zip://::" + root_url, download_config=download_config)) == 1
assert len(xlistdir("zip://main_dir::" + root_url, download_config=download_config)) == 2
@@ -350,7 +350,7 @@ def test_xisdir(input_path, isdir, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xisdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
download_config = DownloadConfig(token=hf_token)
assert xisdir("zip://::" + root_url, download_config=download_config) is True
assert xisdir("zip://main_dir::" + root_url, download_config=download_config) is True
@@ -376,7 +376,7 @@ def test_xisfile(input_path, isfile, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xisfile_private(hf_private_dataset_repo_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "")
download_config = DownloadConfig(token=hf_token)
assert xisfile(root_url + "data/text_data.txt", download_config=download_config) is True
assert xisfile(root_url + "qwertyuiop", download_config=download_config) is False
@@ -400,7 +400,7 @@ def test_xgetsize(input_path, size, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xgetsize_private(hf_private_dataset_repo_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "")
download_config = DownloadConfig(token=hf_token)
assert xgetsize(root_url + "data/text_data.txt", download_config=download_config) == 39
with pytest.raises(FileNotFoundError):
@@ -444,7 +444,7 @@ def test_xglob(input_path, expected_paths, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xglob_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
download_config = DownloadConfig(token=hf_token)
assert len(xglob("zip://**::" + root_url, download_config=download_config)) == 3
assert len(xglob("zip://qwertyuiop/*::" + root_url, download_config=download_config)) == 0
@@ -483,7 +483,7 @@ def test_xwalk(input_path, expected_outputs, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xwalk_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
download_config = DownloadConfig(token=hf_token)
assert len(list(xwalk("zip://::" + root_url, download_config=download_config))) == 2
assert len(list(xwalk("zip://main_dir::" + root_url, download_config=download_config))) == 1
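
The streaming tests only swap the URL helper; the `zip://…::<url>` strings they build are standard fsspec chained URLs. A minimal sketch of that pattern, assuming a public dataset repo that contains a `data.zip` archive (the CI fixture repos used above are private):

```python
from datasets import DownloadConfig
from datasets.download.streaming_download_manager import xlistdir
from datasets.utils.hub import hf_dataset_url

# URL of a zip archive stored in a dataset repo (placeholder repo id).
archive_url = hf_dataset_url("username/my-dataset", "data.zip")
download_config = DownloadConfig()  # pass token=... for private repos

# fsspec chained URL: "zip://<path inside the archive>::<url of the archive>".
print(xlistdir("zip://::" + archive_url, download_config=download_config))
```
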
6 changes: 3 additions & 3 deletions tests/test_upstream_hub.py
@@ -33,7 +33,7 @@
FolderBasedBuilderConfig,
)
from datasets.utils.file_utils import cached_path
from datasets.utils.hub import hf_hub_url
from datasets.utils.hub import hf_dataset_url
from tests.fixtures.hub import CI_HUB_ENDPOINT, CI_HUB_USER, CI_HUB_USER_TOKEN
from tests.utils import for_all_test_methods, require_pil, require_sndfile, xfail_if_500_502_http_error

@@ -608,7 +608,7 @@ def test_push_multiple_dataset_configs_to_hub_readme_metadata_content(
ds_config2.push_to_hub(ds_name, "config2", token=self._token)

# check that configs args was correctly pushed to README.md
ds_readme_path = cached_path(hf_hub_url(ds_name, "README.md"))
ds_readme_path = cached_path(hf_dataset_url(ds_name, "README.md"))
dataset_card_data = DatasetCard.load(ds_readme_path).data
assert METADATA_CONFIGS_FIELD in dataset_card_data
assert isinstance(dataset_card_data[METADATA_CONFIGS_FIELD], list)
@@ -757,7 +757,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_readme_metadata_content(
ds_config2.push_to_hub(ds_name, "config2", token=self._token)

# check that configs args was correctly pushed to README.md
ds_readme_path = cached_path(hf_hub_url(ds_name, "README.md"))
ds_readme_path = cached_path(hf_dataset_url(ds_name, "README.md"))
dataset_card_data = DatasetCard.load(ds_readme_path).data
assert METADATA_CONFIGS_FIELD in dataset_card_data
assert isinstance(dataset_card_data[METADATA_CONFIGS_FIELD], list)
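
For context on the two `test_push_multiple_*` changes: they push the same data under two named configs and then assert that a `configs` list appears in the README metadata. A condensed, hedged version of that round-trip against a placeholder repo:

```python
from datasets import Dataset
from huggingface_hub import DatasetCard

ds = Dataset.from_dict({"a": [1, 2, 3]})

# Push the same dataset under two named configs (placeholder repo id; an
# authentication token must be available, e.g. via `huggingface-cli login`).
ds.push_to_hub("username/my-dataset", "config1")
ds.push_to_hub("username/my-dataset", "config2")

# The per-config metadata lands in the `configs` field of the dataset card.
card = DatasetCard.load("username/my-dataset")
print(card.data["configs"])
```
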