Skip to content
19 changes: 16 additions & 3 deletions src/datasets/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,23 @@
class DatasetsError(Exception):
    """Base class for all exceptions raised by this library.

    Users can catch this single type to handle any `datasets`-specific error.
    """
    # NOTE: a docstring already serves as the class body, so no `pass` is needed.


class DefunctDatasetError(DatasetsError):
    """Raised when the requested dataset has been defunct (no longer usable)."""
    # NOTE: the docstring is a sufficient class body; the redundant `pass` was removed.

class FileNotFoundDatasetsError(DatasetsError, FileNotFoundError):
    """Base class for `FileNotFoundError`s raised by this library.

    Subclasses both `DatasetsError` (so `except DatasetsError` catches it) and the
    built-in `FileNotFoundError` (so existing callers catching that still work).
    """


class DataFilesNotFoundError(FileNotFoundDatasetsError):
    """Raised when no (supported) data files are found for a dataset."""


class DatasetNotFoundError(FileNotFoundDatasetsError):
    """Dataset not found.

    Raised when trying to access:
    - a missing dataset, or
    - a private/gated dataset and the user is not authenticated.
    """
Comment on lines +21 to +27
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we could re-use huggingface_hub's RepositoryNotFound and GatedRepo errors instead of introducing our own (this exception should at least subclass them)

For example, transformers throws an EnvironmentError (with the error description) and chains the caught (huggingface_hub) exception, so consistency with them would be nice.

Copy link
Member Author

@albertvillanova albertvillanova Nov 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree we could sub-class huggingface_hub errors as well, but at the same time, sub-classing our datasets DatasetsError base class, so that a user of datasets can catch all datasets errors by using this DatasetsError class.

But anyway, we are just replacing the `FileNotFoundError`s we were previously raising from `datasets`, whereas huggingface_hub's `RepositoryNotFoundError` subclasses `requests.HTTPError` (through the class `HfHubHTTPError`), not `FileNotFoundError`. I see this as adding unnecessary complexity to our `datasets` error hierarchy.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would propose to address in a subsequent PR to catch specific huggingface_hub errors (instead of parsing the HTTP error status code: this is done by huggingface_hub) and raise our specific errors accordingly.

30 changes: 21 additions & 9 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
from .download.download_config import DownloadConfig
from .download.download_manager import DownloadMode
from .download.streaming_download_manager import StreamingDownloadManager, xbasename, xglob, xjoin
from .exceptions import DataFilesNotFoundError, DatasetNotFoundError
from .features import Features
from .fingerprint import Hasher
from .info import DatasetInfo, DatasetInfosDict
Expand Down Expand Up @@ -494,9 +495,10 @@ def infer_module_for_data_files(
"""Infer module (and builder kwargs) from data files. Raise if module names for different splits don't match.

Args:
data_files (DataFilesDict): List of data files.
path (str, optional): Dataset name or path.
DownloadConfig (bool or str, optional): for authenticate on the Hugging Face Hub for private remote files.
data_files ([`DataFilesDict`]): Dict of list of data files.
path (str, *optional*): Dataset name or path.
download_config ([`DownloadConfig`], *optional*):
Specific download configuration parameters to authenticate on the Hugging Face Hub for private remote files.

Returns:
tuple[str, dict[str, Any]]: Tuple with
Expand All @@ -512,7 +514,7 @@ def infer_module_for_data_files(
raise ValueError(f"Couldn't infer the same data file format for all splits. Got {split_modules}")
if not module_name:
path = f" in {path}. " if path else ". "
raise FileNotFoundError(f"No (supported) data files or dataset script found{path}")
raise DataFilesNotFoundError(f"No (supported) data files or dataset script found{path}")
return module_name, default_builder_kwargs


Expand Down Expand Up @@ -1471,7 +1473,7 @@ def dataset_module_factory(
elif "401" in str(e):
msg = f"Dataset '{path}' doesn't exist on the Hub"
msg = msg + f" at revision '{revision}'" if revision else msg
raise FileNotFoundError(
raise DatasetNotFoundError(
msg + ". If the repo is private or gated, make sure to log in with `huggingface-cli login`."
)
else:
Expand All @@ -1493,16 +1495,26 @@ def dataset_module_factory(
download_config=download_config,
download_mode=download_mode,
).get_module()
except (
Exception
) as e1: # noqa all the attempts failed, before raising the error we should check if the module is already cached.
except Exception as e1:
# All the attempts failed, before raising the error we should check if the module is already cached
try:
return CachedDatasetModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module()
except Exception: # noqa if it's not in the cache, then it doesn't exist.
except Exception:
# If it's not in the cache, then it doesn't exist.
if isinstance(e1, OfflineModeIsEnabled):
raise ConnectionError(f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}") from None
if isinstance(e1, EmptyDatasetError):
raise e1 from None
if isinstance(e1, DataFilesNotFoundError):
raise DataFilesNotFoundError(
f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
) from None
if isinstance(e1, DatasetNotFoundError):
raise DatasetNotFoundError(
f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}"
) from None
if isinstance(e1, FileNotFoundError):
raise FileNotFoundError(
f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. "
Expand Down