6 changes: 3 additions & 3 deletions docs/source/upload_dataset.mdx
@@ -113,16 +113,16 @@ To set your dataset as private, set the `private` parameter to `True`. This para

A private dataset is only accessible by you. Similarly, if you share a dataset within your organization, then members of the organization can also access the dataset.

-Load a private dataset by providing your authentication token to the `use_auth_token` parameter:
+Load a private dataset by providing your authentication token to the `token` parameter:

```py
>>> from datasets import load_dataset

# Load a private individual dataset
>>> dataset = load_dataset("stevhliu/demo", use_auth_token=True)
>>> dataset = load_dataset("stevhliu/demo", token=True)

# Load a private organization dataset
>>> dataset = load_dataset("organization/dataset_name", use_auth_token=True)
>>> dataset = load_dataset("organization/dataset_name", token=True)
```

## What's next?
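To try the rename end to end, here is a minimal sketch using the same repo id as the docs above. It assumes an install from this branch and a token saved by `huggingface-cli login`; the `FutureWarning` check also assumes `load_dataset` forwards `use_auth_token` through the same deprecation shim added to `DatasetBuilder` below, which this diff does not show.

```py
import warnings

from datasets import load_dataset

# New spelling: `token=True` reads the locally saved token.
dataset = load_dataset("stevhliu/demo", token=True)

# Old spelling: expected to keep working until 3.0.0, but to warn.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    dataset = load_dataset("stevhliu/demo", use_auth_token=True)
assert any(issubclass(w.category, FutureWarning) for w in caught)
```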
8 changes: 3 additions & 5 deletions src/datasets/arrow_dataset.py
@@ -5316,9 +5316,7 @@ def path_in_repo(_index, shard):
for data_file in data_files
if data_file.startswith(f"data/{split}-") and data_file not in shards_path_in_repo
]
-deleted_size = sum(
-    xgetsize(hf_hub_url(repo_id, data_file), use_auth_token=token) for data_file in data_files_to_delete
-)
+deleted_size = sum(xgetsize(hf_hub_url(repo_id, data_file), token=token) for data_file in data_files_to_delete)

def delete_file(file):
api.delete_file(file, repo_id=repo_id, token=token, repo_type="dataset", revision=branch)
@@ -5420,7 +5418,7 @@ def push_to_hub(
if "README.md" in repo_files:
download_config = DownloadConfig()
download_config.download_desc = "Downloading metadata"
-download_config.use_auth_token = token
+download_config.token = token
dataset_readme_path = cached_path(
hf_hub_url(repo_id, "README.md"),
download_config=download_config,
@@ -5438,7 +5436,7 @@
dataset_card_data = DatasetCardData()
download_config = DownloadConfig()
download_config.download_desc = "Downloading metadata"
-download_config.use_auth_token = token
+download_config.token = token
dataset_infos_path = cached_path(
hf_hub_url(repo_id, config.DATASETDICT_INFOS_FILENAME),
download_config=download_config,
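The two `download_config` assignments above sit on the `push_to_hub` path that refreshes an existing dataset card. A quick way to exercise them is to push twice, so the second call finds `README.md` in the repo and downloads its metadata with the token. A rough sketch (the repo id is hypothetical; the token string is a placeholder, or omit `token` to use the cached login):

```py
from datasets import Dataset

TOKEN = "hf_xxx"  # hypothetical placeholder
ds = Dataset.from_dict({"text": ["a", "b"]})

# First push creates the private repo and its README.
ds.push_to_hub("my-username/private-demo", private=True, token=TOKEN)

# Second push hits the `"README.md" in repo_files` branch above and
# downloads the existing metadata via `download_config.token`.
ds.push_to_hub("my-username/private-demo", token=TOKEN)
```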
31 changes: 21 additions & 10 deletions src/datasets/builder.py
@@ -253,7 +253,7 @@ class DatasetBuilder:
features ([`Features`], *optional*):
Features types to use with this dataset.
It can be used to change the [`Features`] types of a dataset, for example.
-use_auth_token (`str` or `bool`, *optional*):
+token (`str` or `bool`, *optional*):
String or boolean to use as Bearer token for remote files on the
Datasets Hub. If `True`, will get token from `"~/.huggingface"`.
repo_id (`str`, *optional*):
@@ -316,7 +316,8 @@ def __init__(
base_path: Optional[str] = None,
info: Optional[DatasetInfo] = None,
features: Optional[Features] = None,
-use_auth_token: Optional[Union[bool, str]] = None,
+token: Optional[Union[bool, str]] = None,
+use_auth_token="deprecated",
repo_id: Optional[str] = None,
data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,
data_dir: Optional[str] = None,
@@ -325,6 +326,13 @@
name="deprecated",
**config_kwargs,
):
if use_auth_token != "deprecated":
warnings.warn(
"'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n"
f"You can remove this warning by passing 'token={use_auth_token}' instead.",
FutureWarning,
)
token = use_auth_token
if name != "deprecated":
warnings.warn(
"Parameter 'name' was renamed to 'config_name' in version 2.3.0 and will be removed in 3.0.0.",
@@ -335,14 +343,16 @@
self.name: str = camelcase_to_snakecase(self.__module__.split(".")[-1])
self.hash: Optional[str] = hash
self.base_path = base_path
-self.use_auth_token = use_auth_token
+self.token = token
+# For backwards compatibility (e.g. if accessed in a dataset script)
+self.use_auth_token = token
self.repo_id = repo_id
self.storage_options = storage_options
self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE

if data_files is not None and not isinstance(data_files, DataFilesDict):
data_files = DataFilesDict.from_local_or_remote(
-sanitize_patterns(data_files), base_path=base_path, use_auth_token=use_auth_token
+sanitize_patterns(data_files), base_path=base_path, token=token
)

# Prepare config: DatasetConfig contains name, version and description but can be extended by each dataset
@@ -699,7 +709,7 @@ def download_and_prepare(

<Deprecated version="2.7.1">

-Pass `use_auth_token` to the initializer/`load_dataset_builder` instead.
+Pass `use_auth_token` to `load_dataset_builder` instead.

</Deprecated>
file_format (`str`, *optional*):
@@ -761,11 +771,12 @@
)
if use_auth_token != "deprecated":
warnings.warn(
"'use_auth_token' was deprecated in version 2.7.1 and will be removed in 3.0.0. Pass `use_auth_token` to the initializer/`load_dataset_builder` instead.",
"'use_auth_token' was deprecated in version 2.7.1 and will be removed in 3.0.0. Pass `token` to `load_dataset_builder` instead.",
FutureWarning,
)
+token = use_auth_token
else:
-    use_auth_token = self.use_auth_token
+    token = self.token

output_dir = output_dir if output_dir is not None else self._cache_dir
# output_dir can be a remote bucket on GCS or S3 (when using BeamBasedBuilder for distributed data processing)
@@ -799,7 +810,7 @@
force_extract=download_mode == DownloadMode.FORCE_REDOWNLOAD,
use_etag=False,
num_proc=num_proc,
-use_auth_token=use_auth_token,
+token=token,
storage_options=self.storage_options,
) # We don't use etag for data files to speed up the process

@@ -1273,7 +1284,7 @@ def as_streaming_dataset(

dl_manager = StreamingDownloadManager(
base_path=base_path or self.base_path,
-download_config=DownloadConfig(use_auth_token=self.use_auth_token, storage_options=self.storage_options),
+download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),
dataset_name=self.name,
data_dir=self.config.data_dir,
)
@@ -1303,7 +1314,7 @@ def _as_streaming_dataset_single(
) -> IterableDataset:
ex_iterable = self._get_examples_iterable_for_split(splits_generator)
# add auth to be able to access and decode audio/image files from private repositories.
-token_per_repo_id = {self.repo_id: self.use_auth_token} if self.repo_id else {}
+token_per_repo_id = {self.repo_id: self.token} if self.repo_id else {}
return IterableDataset(
ex_iterable, info=self.info, split=splits_generator.name, token_per_repo_id=token_per_repo_id
)
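The sentinel-based forwarding added to `DatasetBuilder.__init__` is worth spelling out, since the same shape recurs in the files below. A standalone sketch (names are illustrative, not part of `datasets`): the default is the string `"deprecated"` rather than `None`, so even an explicit `use_auth_token=None` gets forwarded to `token`.

```py
import warnings
from typing import Optional, Union


def fetch(token: Optional[Union[bool, str]] = None, use_auth_token="deprecated"):
    # Mirror of the shim above: forward the old keyword, then warn.
    if use_auth_token != "deprecated":
        warnings.warn(
            "'use_auth_token' was deprecated in favor of 'token'.\n"
            f"You can remove this warning by passing 'token={use_auth_token}' instead.",
            FutureWarning,
        )
        token = use_auth_token
    return token


assert fetch(token="abc") == "abc"  # new spelling, no warning
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    assert fetch(use_auth_token="abc") == "abc"  # old spelling still works
assert caught and issubclass(caught[0].category, FutureWarning)
```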
16 changes: 8 additions & 8 deletions src/datasets/data_files.py
@@ -679,21 +679,21 @@ def get_metadata_patterns_in_dataset_repository(


def _get_single_origin_metadata_locally_or_by_urls(
-    data_file: Union[Path, Url], use_auth_token: Optional[Union[bool, str]] = None
+    data_file: Union[Path, Url], token: Optional[Union[bool, str]] = None
) -> Tuple[str]:
if isinstance(data_file, Url):
data_file = str(data_file)
-return (request_etag(data_file, use_auth_token=use_auth_token),)
+return (request_etag(data_file, token=token),)
else:
data_file = str(data_file.resolve())
return (str(os.path.getmtime(data_file)),)


def _get_origin_metadata_locally_or_by_urls(
-    data_files: List[Union[Path, Url]], max_workers=64, use_auth_token: Optional[Union[bool, str]] = None
+    data_files: List[Union[Path, Url]], max_workers=64, token: Optional[Union[bool, str]] = None
) -> Tuple[str]:
return thread_map(
-    partial(_get_single_origin_metadata_locally_or_by_urls, use_auth_token=use_auth_token),
+    partial(_get_single_origin_metadata_locally_or_by_urls, token=token),
data_files,
max_workers=max_workers,
tqdm_class=logging.tqdm,
@@ -742,11 +742,11 @@ def from_local_or_remote(
patterns: List[str],
base_path: Optional[str] = None,
allowed_extensions: Optional[List[str]] = None,
-use_auth_token: Optional[Union[bool, str]] = None,
+token: Optional[Union[bool, str]] = None,
) -> "DataFilesList":
base_path = base_path if base_path is not None else str(Path().resolve())
data_files = resolve_patterns_locally_or_by_urls(base_path, patterns, allowed_extensions)
-origin_metadata = _get_origin_metadata_locally_or_by_urls(data_files, use_auth_token=use_auth_token)
+origin_metadata = _get_origin_metadata_locally_or_by_urls(data_files, token=token)
return cls(data_files, origin_metadata)

def filter_extensions(self, extensions: List[str]) -> "DataFilesList":
@@ -784,7 +784,7 @@ def from_local_or_remote(
patterns: Dict[str, Union[List[str], DataFilesList]],
base_path: Optional[str] = None,
allowed_extensions: Optional[List[str]] = None,
-use_auth_token: Optional[Union[bool, str]] = None,
+token: Optional[Union[bool, str]] = None,
) -> "DataFilesDict":
out = cls()
for key, patterns_for_key in patterns.items():
@@ -793,7 +793,7 @@
patterns_for_key,
base_path=base_path,
allowed_extensions=allowed_extensions,
-use_auth_token=use_auth_token,
+token=token,
)
if not isinstance(patterns_for_key, DataFilesList)
else patterns_for_key
2 changes: 1 addition & 1 deletion src/datasets/dataset_dict.py
@@ -1669,7 +1669,7 @@ def push_to_hub(
if "README.md" in repo_files:
download_config = DownloadConfig()
download_config.download_desc = "Downloading metadata"
-download_config.use_auth_token = token
+download_config.token = token
dataset_readme_path = cached_path(
hf_hub_url(repo_id, "README.md"),
download_config=download_config,
23 changes: 22 additions & 1 deletion src/datasets/download/download_config.py
@@ -1,4 +1,5 @@
import copy
+import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional, Union
@@ -36,9 +37,19 @@ class DownloadConfig:
The number of processes to launch to download the files in parallel.
max_retries (`int`, default to `1`):
The number of times to retry an HTTP request if it fails.
+token (`str` or `bool`, *optional*):
+    Optional string or boolean to use as Bearer token
+    for remote files on the Datasets Hub. If `True`, or not specified, will get token from `~/.huggingface`.
use_auth_token (`str` or `bool`, *optional*):
Optional string or boolean to use as Bearer token
for remote files on the Datasets Hub. If `True`, or not specified, will get token from `~/.huggingface`.

+<Deprecated version="2.14.0">
+
+`use_auth_token` was deprecated in favor of `token` in version 2.14.0 and will be removed in 3.0.0.
+
+</Deprecated>

ignore_url_params (`bool`, defaults to `False`):
Whether to strip all query parameters and fragments from
the download URL before using it for caching the file.
@@ -60,10 +71,20 @@ class DownloadConfig:
use_etag: bool = True
num_proc: Optional[int] = None
max_retries: int = 1
-use_auth_token: Optional[Union[str, bool]] = None
+token: Optional[Union[str, bool]] = None
+use_auth_token = "deprecated"
ignore_url_params: bool = False
storage_options: Optional[Dict] = None
download_desc: Optional[str] = None

+def __post_init__(self):
+    if self.use_auth_token != "deprecated":
+        warnings.warn(
+            "'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n"
+            f"You can remove this warning by passing 'token={self.use_auth_token}' instead.",
+            FutureWarning,
+        )
+        self.token = self.use_auth_token

def copy(self) -> "DownloadConfig":
return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})
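Callers migrate by populating the new field, as `arrow_dataset.py` and `dataset_dict.py` do above. A small usage sketch (values are illustrative):

```py
from datasets import DownloadConfig

# New spelling: `token` is a regular dataclass field.
download_config = DownloadConfig(token=True)

# Attribute-style assignment, matching the `push_to_hub` call sites above.
download_config = DownloadConfig()
download_config.download_desc = "Downloading metadata"
download_config.token = "hf_xxx"  # hypothetical token string
```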