Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,7 +783,7 @@ def get_module(self) -> DatasetModule:
hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
self.name,
revision=self.revision,
token=token,
token=token if token else "no-token",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would passing `token=False` work instead?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, don't hesitate to ping @SBrandeis / @coyotte508 / @Pierrci on these kinds of PRs =)

Copy link
Member Author

@lhoestq lhoestq Jun 23, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked the source code of `dataset_info`: it would use `token="False"` and pass `Bearer False` for the authentication x), so yes, it would work.

However, the type hint requires `token` to be a string, not a boolean. So unless we're OK with saying that the type hint can be ignored, I'll keep `"no-token"`.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"no-token" won't trigger the type checker so I think it's better

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If HFH maintainers prefer "no-token", then this is OK! ;)

timeout=100.0,
)
patterns = (
Expand Down Expand Up @@ -1140,7 +1140,7 @@ def dataset_module_factory(
dataset_info = hf_api.dataset_info(
repo_id=path,
revision=revision,
token=token,
token=token if token else "no-token",
timeout=100.0,
)
except Exception as e: # noqa: catch any exception of hf_hub and consider that the dataset doesn't exist
Expand All @@ -1159,7 +1159,10 @@ def dataset_module_factory(
elif "401" in str(e):
msg = f"Dataset '{path}' doesn't exist on the Hub"
msg = msg + f" at revision '{revision}'" if revision else msg
raise FileNotFoundError(msg + ". If the repo is private, make sure you are authenticated.")
raise FileNotFoundError(
msg
+ ". If the repo is private, make sure you are authenticated with ``use_auth_token=True`` after logging in with ``huggingface-cli login```."
)
else:
raise e
if filename in [sibling.rfilename for sibling in dataset_info.siblings]:
Expand Down
6 changes: 0 additions & 6 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,19 +781,13 @@ def assert_auth(url, *args, headers, **kwargs):
mock_head.assert_called()


@pytest.mark.skipif(
os.name == "nt", reason="skip on windows because of SSL issues with moon-staging.huggingface.co:443"
)
def test_load_streaming_private_dataset(hf_token, hf_private_dataset_repo_txt_data):
with pytest.raises(FileNotFoundError):
load_dataset(hf_private_dataset_repo_txt_data, streaming=True)
ds = load_dataset(hf_private_dataset_repo_txt_data, streaming=True, use_auth_token=hf_token)
assert next(iter(ds)) is not None


@pytest.mark.skipif(
os.name == "nt", reason="skip on windows because of SSL issues with moon-staging.huggingface.co:443"
)
def test_load_streaming_private_dataset_with_zipped_data(hf_token, hf_private_dataset_repo_zipped_txt_data):
with pytest.raises(FileNotFoundError):
load_dataset(hf_private_dataset_repo_zipped_txt_data, streaming=True)
Expand Down