Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4073,7 +4073,7 @@ def _push_parquet_shards_to_hub(

if token is None:
raise OSError(
"You need to provide a `token` or be logged in to Hugging Face with " "`huggingface-cli login`."
"You need to provide a `token` or be logged in to Hugging Face with `huggingface-cli login`."
)

if split is None:
Expand Down
9 changes: 6 additions & 3 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,7 +783,7 @@ def get_module(self) -> DatasetModule:
hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
self.name,
revision=self.revision,
token=token,
token=token if token else "no-token",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would passing token=False work instead?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, don't hesitate to ping @SBrandeis / @coyotte508 / @Pierrci on those kinds of PRs =)

Copy link
Member Author

@lhoestq lhoestq Jun 23, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked the source code of `dataset_info`: it would use `token="False"` and pass `Bearer False` for the authentication x), so yes, it would work.

However, the type hint requires `token` to be a string, not a boolean. So unless we're OK with saying that the type hint can be ignored, I'll keep `"no-token"`.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"no-token" won't trigger the type checker so I think it's better

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If HFH maintainers prefer "no-token", then this is OK! ;)

timeout=100.0,
)
patterns = (
Expand Down Expand Up @@ -1140,7 +1140,7 @@ def dataset_module_factory(
dataset_info = hf_api.dataset_info(
repo_id=path,
revision=revision,
token=token,
token=token if token else "no-token",
timeout=100.0,
)
except Exception as e: # noqa: catch any exception of hf_hub and consider that the dataset doesn't exist
Expand All @@ -1159,7 +1159,10 @@ def dataset_module_factory(
elif "401" in str(e):
msg = f"Dataset '{path}' doesn't exist on the Hub"
msg = msg + f" at revision '{revision}'" if revision else msg
raise FileNotFoundError(msg + ". If the repo is private, make sure you are authenticated.")
raise FileNotFoundError(
msg
+ ". If the repo is private, make sure you are authenticated with `use_auth_token=True` after logging in with `huggingface-cli login`."
)
else:
raise e
if filename in [sibling.rfilename for sibling in dataset_info.siblings]:
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ def get_from_cache(
logger.info(f"Couldn't get ETag version for url {url}")
elif response.status_code == 401 and config.HF_ENDPOINT in url and use_auth_token is None:
raise ConnectionError(
f"Unauthorized for URL {url}. Please use the parameter ``use_auth_token=True`` after logging in with ``huggingface-cli login``"
f"Unauthorized for URL {url}. Please use the parameter `use_auth_token=True` after logging in with `huggingface-cli login`"
)
except (OSError, requests.exceptions.Timeout) as e:
# not connected
Expand Down
6 changes: 0 additions & 6 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,19 +781,13 @@ def assert_auth(url, *args, headers, **kwargs):
mock_head.assert_called()


@pytest.mark.skipif(
os.name == "nt", reason="skip on windows because of SSL issues with moon-staging.huggingface.co:443"
)
def test_load_streaming_private_dataset(hf_token, hf_private_dataset_repo_txt_data):
    """Check streaming access to a private dataset repo with and without a token.

    Loading the repo in streaming mode without credentials is expected to raise
    ``FileNotFoundError``; loading it with ``use_auth_token=hf_token`` is
    expected to yield at least one non-``None`` item.
    """
    repo_id = hf_private_dataset_repo_txt_data

    # Unauthenticated access to the private repo must fail.
    with pytest.raises(FileNotFoundError):
        load_dataset(repo_id, streaming=True)

    # Authenticated access must succeed and produce a first item.
    streamed = load_dataset(repo_id, streaming=True, use_auth_token=hf_token)
    first_item = next(iter(streamed))
    assert first_item is not None


@pytest.mark.skipif(
os.name == "nt", reason="skip on windows because of SSL issues with moon-staging.huggingface.co:443"
)
def test_load_streaming_private_dataset_with_zipped_data(hf_token, hf_private_dataset_repo_zipped_txt_data):
with pytest.raises(FileNotFoundError):
load_dataset(hf_private_dataset_repo_zipped_txt_data, streaming=True)
Expand Down