21 changes: 15 additions & 6 deletions src/datasets/arrow_dataset.py
@@ -60,7 +60,15 @@
import pyarrow as pa
import pyarrow.compute as pc
from fsspec.core import url_to_fs
from huggingface_hub import CommitInfo, CommitOperationAdd, CommitOperationDelete, DatasetCard, DatasetCardData, HfApi
from huggingface_hub import (
CommitInfo,
CommitOperationAdd,
CommitOperationDelete,
DatasetCard,
DatasetCardData,
HfApi,
)
from huggingface_hub.hf_api import RepoFile
from multiprocess import Pool
from tqdm.contrib.concurrent import thread_map

@@ -115,7 +123,6 @@
from .utils import tqdm as hf_tqdm
from .utils.deprecation_utils import deprecated
from .utils.file_utils import estimate_dataset_size
from .utils.hub import list_files_info, preupload_lfs_files
from .utils.info_utils import is_small_dataset
from .utils.metadata import MetadataConfigs
from .utils.py_utils import (
@@ -5388,11 +5395,9 @@ def shards_with_embedded_external_files(shards):
shard.to_parquet(buffer)
uploaded_size += buffer.tell()
shard_addition = CommitOperationAdd(path_in_repo=shard_path_in_repo, path_or_fileobj=buffer)
preupload_lfs_files(
api,
api.preupload_lfs_files(
repo_id=repo_id,
additions=[shard_addition],
token=token,
repo_type="dataset",
revision=revision,
create_pr=create_pr,
@@ -5577,7 +5582,11 @@ def push_to_hub(
deletions, deleted_size = [], 0
repo_splits = [] # use a list to keep the order of the splits
repo_files_to_add = [addition.path_in_repo for addition in additions]
for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
for repo_file in api.list_repo_tree(
repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True
):
if not isinstance(repo_file, RepoFile):
continue
if repo_file.rfilename == config.REPOCARD_FILENAME:
repo_with_dataset_card = True
elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
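
Note on the hunk above: `push_to_hub` now calls `HfApi.preupload_lfs_files` directly instead of going through the retry wrapper in `datasets.utils.hub`. A minimal sketch of that upload flow, assuming a placeholder repo id and an already-authenticated `HfApi` client (neither comes from this PR):

```python
import io

from datasets import Dataset
from huggingface_hub import CommitOperationAdd, HfApi

api = HfApi()  # assumes a token is configured, e.g. via `huggingface-cli login`
repo_id = "username/my-dataset"  # placeholder; the repo is assumed to already exist

shard = Dataset.from_dict({"text": ["hello", "world"]})

# Serialize the shard to an in-memory Parquet buffer, as in the loop above.
buffer = io.BytesIO()
shard.to_parquet(buffer)

# Pre-upload the LFS blob right away; this direct call replaces the
# exponential-backoff wrapper that datasets.utils.hub provided for huggingface_hub < 0.20.0.
addition = CommitOperationAdd(
    path_in_repo="data/train-00000-of-00001.parquet", path_or_fileobj=buffer
)
api.preupload_lfs_files(repo_id=repo_id, additions=[addition], repo_type="dataset")

# A single commit later references every pre-uploaded shard.
api.create_commit(
    repo_id=repo_id,
    repo_type="dataset",
    operations=[addition],
    commit_message="Upload dataset shards",
)
```
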
8 changes: 6 additions & 2 deletions src/datasets/dataset_dict.py
@@ -21,6 +21,7 @@
DatasetCardData,
HfApi,
)
from huggingface_hub.hf_api import RepoFile

from . import config
from .arrow_dataset import PUSH_TO_HUB_WITHOUT_METADATA_CONFIGS_SPLIT_PATTERN_SHARDED, Dataset
@@ -34,7 +35,6 @@
from .utils import logging
from .utils.deprecation_utils import deprecated
from .utils.doc_utils import is_documented_by
from .utils.hub import list_files_info
from .utils.metadata import MetadataConfigs
from .utils.py_utils import asdict, glob_pattern_to_regex, string_to_dict
from .utils.typing import PathLike
@@ -1745,7 +1745,11 @@ def push_to_hub(
repo_splits = [] # use a list to keep the order of the splits
deletions = []
repo_files_to_add = [addition.path_in_repo for addition in additions]
for repo_file in list_files_info(api, repo_id=repo_id, revision=revision, repo_type="dataset", token=token):
for repo_file in api.list_repo_tree(
repo_id=repo_id, revision=revision, repo_type="dataset", token=token, recursive=True
):
if not isinstance(repo_file, RepoFile):
continue
if repo_file.rfilename == config.REPOCARD_FILENAME:
repo_with_dataset_card = True
elif repo_file.rfilename == config.DATASETDICT_INFOS_FILENAME:
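
The deprecated `list_files_info` helper yielded files only, while `list_repo_tree` also yields folders, hence the new `isinstance(repo_file, RepoFile)` guard in both `push_to_hub` implementations. A small sketch of the pattern against a placeholder repo:

```python
from huggingface_hub import HfApi
from huggingface_hub.hf_api import RepoFile

api = HfApi()

# list_repo_tree yields RepoFile and RepoFolder entries; folders are skipped to
# reproduce what the deprecated list_files_info used to return.
for entry in api.list_repo_tree(
    repo_id="username/my-dataset",  # placeholder repo id
    repo_type="dataset",
    revision="main",
    recursive=True,
):
    if not isinstance(entry, RepoFile):
        continue
    print(entry.path, entry.size)
```

Recent huggingface_hub versions expose the file path as `.path`; the diff itself keeps reading `repo_file.rfilename`, the older attribute name that is still accepted for backward compatibility.
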
20 changes: 10 additions & 10 deletions src/datasets/load.py
@@ -84,7 +84,7 @@
relative_to_absolute_path,
url_or_path_join,
)
from .utils.hub import hf_hub_url
from .utils.hub import hf_dataset_url
from .utils.info_utils import VerificationMode, is_small_dataset
from .utils.logging import get_logger
from .utils.metadata import MetadataConfigs
@@ -1211,7 +1211,7 @@ def get_module(self) -> DatasetModule:
download_config.download_desc = "Downloading readme"
try:
dataset_readme_path = cached_path(
hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=revision),
hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=revision),
download_config=download_config,
)
dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data
@@ -1222,7 +1222,7 @@ def get_module(self) -> DatasetModule:
download_config.download_desc = "Downloading standalone yaml"
try:
standalone_yaml_path = cached_path(
hf_hub_url(self.name, config.REPOYAML_FILENAME, revision=revision),
hf_dataset_url(self.name, config.REPOYAML_FILENAME, revision=revision),
download_config=download_config,
)
with open(standalone_yaml_path, "r", encoding="utf-8") as f:
@@ -1308,7 +1308,7 @@ def get_module(self) -> DatasetModule:
]
default_config_name = None
builder_kwargs = {
"base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"),
"base_path": hf_dataset_url(self.name, "", revision=revision).rstrip("/"),
"repo_id": self.name,
"dataset_name": camelcase_to_snakecase(Path(self.name).name),
}
@@ -1320,7 +1320,7 @@ def get_module(self) -> DatasetModule:
try:
# this file is deprecated and was created automatically in old versions of push_to_hub
dataset_infos_path = cached_path(
hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision),
hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=revision),
download_config=download_config,
)
with open(dataset_infos_path, encoding="utf-8") as f:
@@ -1444,14 +1444,14 @@ def __init__(
increase_load_count(name, resource_type="dataset")

def download_loading_script(self) -> str:
file_path = hf_hub_url(self.name, self.name.split("/")[-1] + ".py", revision=self.revision)
file_path = hf_dataset_url(self.name, self.name.split("/")[-1] + ".py", revision=self.revision)
download_config = self.download_config.copy()
if download_config.download_desc is None:
download_config.download_desc = "Downloading builder script"
return cached_path(file_path, download_config=download_config)

def download_dataset_infos_file(self) -> str:
dataset_infos = hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision)
dataset_infos = hf_dataset_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision)
# Download the dataset infos file if available
download_config = self.download_config.copy()
if download_config.download_desc is None:
@@ -1465,7 +1465,7 @@ def download_dataset_infos_file(self) -> str:
return None

def download_dataset_readme_file(self) -> str:
readme_url = hf_hub_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
readme_url = hf_dataset_url(self.name, config.REPOCARD_FILENAME, revision=self.revision)
# Download the dataset infos file if available
download_config = self.download_config.copy()
if download_config.download_desc is None:
@@ -1494,7 +1494,7 @@ def get_module(self) -> DatasetModule:
imports = get_imports(local_path)
local_imports = _download_additional_modules(
name=self.name,
base_path=hf_hub_url(self.name, "", revision=self.revision),
base_path=hf_dataset_url(self.name, "", revision=self.revision),
imports=imports,
download_config=self.download_config,
)
@@ -1540,7 +1540,7 @@ def get_module(self) -> DatasetModule:
# make the new module to be noticed by the import system
importlib.invalidate_caches()
builder_kwargs = {
"base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"),
"base_path": hf_dataset_url(self.name, "", revision=self.revision).rstrip("/"),
"repo_id": self.name,
}
return DatasetModule(module_path, hash, builder_kwargs, importable_file_path=importable_file_path)
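
All the `hf_hub_url` call sites in `load.py` are renamed to `hf_dataset_url`; the behaviour is unchanged, since both build dataset-repo URLs that are then fetched through `cached_path`. A hedged sketch of that flow, using a public dataset name as a stand-in:

```python
from datasets import DownloadConfig
from datasets.utils.file_utils import cached_path
from datasets.utils.hub import hf_dataset_url

# Resolve the dataset card URL the same way the Hub dataset module factory does.
readme_url = hf_dataset_url("squad", "README.md", revision="main")
download_config = DownloadConfig(download_desc="Downloading readme")

# cached_path downloads the file (network access required) and returns a local path.
local_readme = cached_path(readme_url, download_config=download_config)
print(local_readme)
```
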
62 changes: 2 additions & 60 deletions src/datasets/utils/hub.py
@@ -1,64 +1,6 @@
import time
from functools import partial

from huggingface_hub import HfApi, hf_hub_url
from huggingface_hub.hf_api import RepoFile
from packaging import version
from requests import ConnectionError, HTTPError
from huggingface_hub import hf_hub_url

from .. import config
from . import logging


logger = logging.get_logger(__name__)

# Retry `preupload_lfs_files` in `huggingface_hub<0.20.0` on the "500 (Internal Server Error)" and "503 (Service Unavailable)" HTTP errors
if config.HF_HUB_VERSION.release < version.parse("0.20.0").release:

def preupload_lfs_files(hf_api: HfApi, **kwargs):
max_retries = 5
base_wait_time = 1
max_wait_time = 8
retry = 0
while True:
try:
hf_api.preupload_lfs_files(**kwargs)
except (RuntimeError, HTTPError, ConnectionError) as err:
if isinstance(err, RuntimeError):
if isinstance(err.__cause__, (HTTPError, ConnectionError)):
err = err.__cause__
else:
raise err
if retry >= max_retries or err.response and err.response.status_code not in [500, 503]:
raise err
else:
sleep_time = min(max_wait_time, base_wait_time * 2**retry) # Exponential backoff
logger.info(
f"{hf_api.preupload_lfs_files} timed out, retrying in {sleep_time}s... [{retry/max_retries}]"
)
time.sleep(sleep_time)
retry += 1
else:
break
else:

def preupload_lfs_files(hf_api: HfApi, **kwargs):
hf_api.preupload_lfs_files(**kwargs)


# `list_files_info` is deprecated in favor of `list_repo_tree` in `huggingface_hub>=0.20.0`
if config.HF_HUB_VERSION.release < version.parse("0.20.0").release:

def list_files_info(hf_api: HfApi, **kwargs):
yield from hf_api.list_files_info(**kwargs)
else:

def list_files_info(hf_api: HfApi, **kwargs):
kwargs = {**kwargs, "recursive": True}
for repo_path in hf_api.list_repo_tree(**kwargs):
if isinstance(repo_path, RepoFile):
yield repo_path


# backward compatibility
hf_hub_url = partial(hf_hub_url, repo_type="dataset")
hf_dataset_url = partial(hf_hub_url, repo_type="dataset")
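
After this change, `datasets/utils/hub.py` reduces to a single `partial` that pins `repo_type="dataset"`, renamed to `hf_dataset_url` so it no longer shares a name with the upstream `hf_hub_url`. A quick illustration of the URLs it produces (repo id and file names are arbitrary examples):

```python
from functools import partial

from huggingface_hub import hf_hub_url

# Same one-liner as the new datasets.utils.hub module.
hf_dataset_url = partial(hf_hub_url, repo_type="dataset")

print(hf_dataset_url("org-name/dataset-name", "data/train.csv"))
# -> https://huggingface.co/datasets/org-name/dataset-name/resolve/main/data/train.csv

print(hf_dataset_url("org-name/dataset-name", "README.md", revision="v2"))
# -> https://huggingface.co/datasets/org-name/dataset-name/resolve/v2/README.md
```
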
6 changes: 3 additions & 3 deletions tests/test_hub.py
@@ -2,12 +2,12 @@

import pytest

from datasets.utils.hub import hf_hub_url
from datasets.utils.hub import hf_dataset_url


@pytest.mark.parametrize("repo_id", ["canonical_dataset_name", "org-name/dataset-name"])
@pytest.mark.parametrize("filename", ["filename.csv", "filename with blanks.csv"])
@pytest.mark.parametrize("revision", [None, "v2"])
def test_hf_hub_url(repo_id, filename, revision):
url = hf_hub_url(repo_id=repo_id, filename=filename, revision=revision)
def test_dataset_url(repo_id, filename, revision):
url = hf_dataset_url(repo_id=repo_id, filename=filename, revision=revision)
assert url == f"https://huggingface.co/datasets/{repo_id}/resolve/{revision or 'main'}/{quote(filename)}"
16 changes: 8 additions & 8 deletions tests/test_streaming_download_manager.py
@@ -28,7 +28,7 @@
xwalk,
)
from datasets.filesystems import COMPRESSION_FILESYSTEMS
from datasets.utils.hub import hf_hub_url
from datasets.utils.hub import hf_dataset_url

from .utils import require_lz4, require_zstandard, slow

@@ -236,7 +236,7 @@ def test_xexists(input_path, exists, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xexists_private(hf_private_dataset_repo_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "")
download_config = DownloadConfig(token=hf_token)
assert xexists(root_url + "data/text_data.txt", download_config=download_config)
assert not xexists(root_url + "file_that_doesnt_exist.txt", download_config=download_config)
@@ -321,7 +321,7 @@ def test_xlistdir(input_path, expected_paths, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xlistdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
download_config = DownloadConfig(token=hf_token)
assert len(xlistdir("zip://::" + root_url, download_config=download_config)) == 1
assert len(xlistdir("zip://main_dir::" + root_url, download_config=download_config)) == 2
@@ -350,7 +350,7 @@ def test_xisdir(input_path, isdir, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xisdir_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
download_config = DownloadConfig(token=hf_token)
assert xisdir("zip://::" + root_url, download_config=download_config) is True
assert xisdir("zip://main_dir::" + root_url, download_config=download_config) is True
@@ -376,7 +376,7 @@ def test_xisfile(input_path, isfile, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xisfile_private(hf_private_dataset_repo_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "")
download_config = DownloadConfig(token=hf_token)
assert xisfile(root_url + "data/text_data.txt", download_config=download_config) is True
assert xisfile(root_url + "qwertyuiop", download_config=download_config) is False
@@ -400,7 +400,7 @@ def test_xgetsize(input_path, size, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xgetsize_private(hf_private_dataset_repo_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_txt_data, "")
root_url = hf_dataset_url(hf_private_dataset_repo_txt_data, "")
download_config = DownloadConfig(token=hf_token)
assert xgetsize(root_url + "data/text_data.txt", download_config=download_config) == 39
with pytest.raises(FileNotFoundError):
@@ -444,7 +444,7 @@ def test_xglob(input_path, expected_paths, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xglob_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
download_config = DownloadConfig(token=hf_token)
assert len(xglob("zip://**::" + root_url, download_config=download_config)) == 3
assert len(xglob("zip://qwertyuiop/*::" + root_url, download_config=download_config)) == 0
@@ -483,7 +483,7 @@ def test_xwalk(input_path, expected_outputs, tmp_path, mock_fsspec):

@pytest.mark.integration
def test_xwalk_private(hf_private_dataset_repo_zipped_txt_data, hf_token):
root_url = hf_hub_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
root_url = hf_dataset_url(hf_private_dataset_repo_zipped_txt_data, "data.zip")
download_config = DownloadConfig(token=hf_token)
assert len(list(xwalk("zip://::" + root_url, download_config=download_config))) == 2
assert len(list(xwalk("zip://main_dir::" + root_url, download_config=download_config))) == 1
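
The streaming tests only swap the URL helper; the `zip://…::<url>` strings they build are standard fsspec chained URLs. A minimal sketch of that pattern, assuming a public dataset repo that contains a `data.zip` archive (the CI fixture repos used above are private):

```python
from datasets import DownloadConfig
from datasets.download.streaming_download_manager import xlistdir
from datasets.utils.hub import hf_dataset_url

# URL of a zip archive stored in a dataset repo (placeholder repo id).
archive_url = hf_dataset_url("username/my-dataset", "data.zip")
download_config = DownloadConfig()  # pass token=... for private repos

# fsspec chained URL: "zip://<path inside the archive>::<url of the archive>".
print(xlistdir("zip://::" + archive_url, download_config=download_config))
```
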
6 changes: 3 additions & 3 deletions tests/test_upstream_hub.py
@@ -33,7 +33,7 @@
FolderBasedBuilderConfig,
)
from datasets.utils.file_utils import cached_path
from datasets.utils.hub import hf_hub_url
from datasets.utils.hub import hf_dataset_url
from tests.fixtures.hub import CI_HUB_ENDPOINT, CI_HUB_USER, CI_HUB_USER_TOKEN
from tests.utils import for_all_test_methods, require_pil, require_sndfile, xfail_if_500_502_http_error

@@ -608,7 +608,7 @@ def test_push_multiple_dataset_configs_to_hub_readme_metadata_content(
ds_config2.push_to_hub(ds_name, "config2", token=self._token)

# check that configs args was correctly pushed to README.md
ds_readme_path = cached_path(hf_hub_url(ds_name, "README.md"))
ds_readme_path = cached_path(hf_dataset_url(ds_name, "README.md"))
dataset_card_data = DatasetCard.load(ds_readme_path).data
assert METADATA_CONFIGS_FIELD in dataset_card_data
assert isinstance(dataset_card_data[METADATA_CONFIGS_FIELD], list)
@@ -757,7 +757,7 @@ def test_push_multiple_dataset_dict_configs_to_hub_readme_metadata_content(
ds_config2.push_to_hub(ds_name, "config2", token=self._token)

# check that configs args was correctly pushed to README.md
ds_readme_path = cached_path(hf_hub_url(ds_name, "README.md"))
ds_readme_path = cached_path(hf_dataset_url(ds_name, "README.md"))
dataset_card_data = DatasetCard.load(ds_readme_path).data
assert METADATA_CONFIGS_FIELD in dataset_card_data
assert isinstance(dataset_card_data[METADATA_CONFIGS_FIELD], list)
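
For context on the two `test_push_multiple_*` changes: they push the same data under two named configs and then assert that a `configs` list appears in the README metadata. A condensed, hedged version of that round-trip against a placeholder repo:

```python
from datasets import Dataset
from huggingface_hub import DatasetCard

ds = Dataset.from_dict({"a": [1, 2, 3]})

# Push the same dataset under two named configs (placeholder repo id; an
# authentication token must be available, e.g. via `huggingface-cli login`).
ds.push_to_hub("username/my-dataset", "config1")
ds.push_to_hub("username/my-dataset", "config2")

# The per-config metadata lands in the `configs` field of the dataset card.
card = DatasetCard.load("username/my-dataset")
print(card.data["configs"])
```
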