From f84482cda48429f6c3a804e73fb8d9f843fe99fc Mon Sep 17 00:00:00 2001
From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Date: Mon, 29 Jan 2024 16:33:01 +0100
Subject: [PATCH] Revert "Support push_to_hub canonical datasets (#6519)"

This reverts commit a887ee78835573f5d80f9e414e8443b4caff3541.
---
 src/datasets/arrow_dataset.py |  3 ++-
 src/datasets/dataset_dict.py  |  3 ++-
 tests/test_upstream_hub.py    | 14 ++++++++++----
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index c720c75600f..bcdc846e663 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -5365,13 +5365,14 @@ def push_to_hub(
 
         api = HfApi(endpoint=config.HF_ENDPOINT, token=token)
 
-        _ = api.create_repo(
+        repo_url = api.create_repo(
             repo_id,
             token=token,
             repo_type="dataset",
             private=private,
             exist_ok=True,
         )
+        repo_id = repo_url.repo_id
 
         if revision is not None:
             api.create_branch(repo_id, branch=revision, token=token, repo_type="dataset", exist_ok=True)
diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index 06fd124e2c3..1c0b6794132 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -1685,13 +1685,14 @@ def push_to_hub(
 
         api = HfApi(endpoint=config.HF_ENDPOINT, token=token)
 
-        _ = api.create_repo(
+        repo_url = api.create_repo(
             repo_id,
             token=token,
             repo_type="dataset",
             private=private,
             exist_ok=True,
         )
+        repo_id = repo_url.repo_id
 
         if revision is not None:
             api.create_branch(repo_id, branch=revision, token=token, repo_type="dataset", exist_ok=True)
diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py
index c99e3559ae6..2952cf310bf 100644
--- a/tests/test_upstream_hub.py
+++ b/tests/test_upstream_hub.py
@@ -13,7 +13,6 @@
 import numpy as np
 import pytest
 from huggingface_hub import DatasetCard, HfApi
-from huggingface_hub.utils import RepositoryNotFoundError
 
 from datasets import (
     Audio,
@@ -71,9 +70,16 @@ def test_push_dataset_dict_to_hub_name_without_namespace(self, temporary_repo):
         local_ds = DatasetDict({"train": ds})
 
         with temporary_repo() as ds_name:
-            # cannot create a repo without namespace
-            with pytest.raises(RepositoryNotFoundError):
-                local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token)
+            local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token)
+            hub_ds = load_dataset(ds_name, download_mode="force_redownload")
+
+            assert local_ds.column_names == hub_ds.column_names
+            assert list(local_ds["train"].features.keys()) == list(hub_ds["train"].features.keys())
+            assert local_ds["train"].features == hub_ds["train"].features
+
+            # Ensure that there is a single file on the repository that has the correct name
+            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset"))
+            assert files == [".gitattributes", "README.md", "data/train-00000-of-00001.parquet"]
 
     def test_push_dataset_dict_to_hub_datasets_with_different_features(self, cleanup_repo):
         ds_train = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})