Skip to content
19 changes: 16 additions & 3 deletions src/datasets/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,23 @@
class DatasetsError(Exception):
    """Base class for all exceptions raised by this library.

    Catching this type catches every `datasets`-specific error.
    """


class DefunctDatasetError(DatasetsError):
    """The dataset is defunct and no longer accessible."""

class FileNotFoundDatasetsError(DatasetsError, FileNotFoundError):
    """Library-specific `FileNotFoundError`.

    Subclasses both `DatasetsError` and the builtin `FileNotFoundError`,
    so callers can catch it through either hierarchy.
    """


class DataFilesNotFoundError(FileNotFoundDatasetsError):
    """None of the data files (in a supported format) could be found."""


class DatasetNotFoundError(FileNotFoundDatasetsError):
    """Dataset not found.

    Raised when trying to access:
    - a missing dataset, or
    - a private/gated dataset and the user is not authenticated.
    """
Comment on lines +21 to +27
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we could re-use huggingface_hub's `RepositoryNotFoundError` and `GatedRepoError` instead of introducing our own (this exception should at least subclass them)

For example, transformers throws an EnvironmentError (with the error description) and chains the caught (huggingface_hub) exception, so consistency with them would be nice.

Copy link
Member Author

@albertvillanova albertvillanova Nov 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree we could sub-class huggingface_hub errors as well, but at the same time, sub-classing our datasets DatasetsError base class, so that a user of datasets can catch all datasets errors by using this DatasetsError class.

But anyway, we are just replacing the `FileNotFoundError` errors we were previously raising from `datasets`, whereas huggingface_hub's `RepositoryNotFoundError` subclasses `requests.HTTPError` (through the class `HfHubHTTPError`), not `FileNotFoundError`. I see this as adding unnecessary complexity to our `datasets` error hierarchy.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would propose to address in a subsequent PR to catch specific huggingface_hub errors (instead of parsing the HTTP error status code: this is done by huggingface_hub) and raise our specific errors accordingly.

22 changes: 12 additions & 10 deletions src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
from .download.download_config import DownloadConfig
from .download.download_manager import DownloadMode
from .download.streaming_download_manager import StreamingDownloadManager, xbasename, xglob, xjoin
from .exceptions import DataFilesNotFoundError, DatasetNotFoundError
from .features import Features
from .fingerprint import Hasher
from .info import DatasetInfo, DatasetInfosDict
Expand Down Expand Up @@ -494,9 +495,10 @@ def infer_module_for_data_files(
"""Infer module (and builder kwargs) from data files. Raise if module names for different splits don't match.

Args:
data_files (DataFilesDict): List of data files.
path (str, optional): Dataset name or path.
DownloadConfig (bool or str, optional): for authenticate on the Hugging Face Hub for private remote files.
data_files ([`DataFilesDict`]): Dict of list of data files.
path (str, *optional*): Dataset name or path.
download_config ([`DownloadConfig`], *optional*):
Specific download configuration parameters to authenticate on the Hugging Face Hub for private remote files.

Returns:
tuple[str, dict[str, Any]]: Tuple with
Expand All @@ -512,7 +514,7 @@ def infer_module_for_data_files(
raise ValueError(f"Couldn't infer the same data file format for all splits. Got {split_modules}")
if not module_name:
path = f" in {path}. " if path else ". "
raise FileNotFoundError(f"No (supported) data files or dataset script found{path}")
raise DataFilesNotFoundError(f"No (supported) data files or dataset script found{path}")
return module_name, default_builder_kwargs


Expand Down Expand Up @@ -1471,7 +1473,7 @@ def dataset_module_factory(
elif "401" in str(e):
msg = f"Dataset '{path}' doesn't exist on the Hub"
msg = msg + f" at revision '{revision}'" if revision else msg
raise FileNotFoundError(
raise DatasetNotFoundError(
msg + ". If the repo is private or gated, make sure to log in with `huggingface-cli login`."
)
else:
Expand All @@ -1493,15 +1495,15 @@ def dataset_module_factory(
download_config=download_config,
download_mode=download_mode,
).get_module()
except (
Exception
) as e1: # noqa all the attempts failed, before raising the error we should check if the module is already cached.
except Exception as e1:
# All the attempts failed, before raising the error we should check if the module is already cached
try:
return CachedDatasetModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module()
except Exception: # noqa if it's not in the cache, then it doesn't exist.
except Exception:
# If it's not in the cache, then it doesn't exist.
if isinstance(e1, OfflineModeIsEnabled):
raise ConnectionError(f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}") from None
if isinstance(e1, EmptyDatasetError):
if isinstance(e1, (DataFilesNotFoundError, DatasetNotFoundError, EmptyDatasetError)):
raise e1 from None
if isinstance(e1, FileNotFoundError):
raise FileNotFoundError(
Expand Down
16 changes: 9 additions & 7 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from datasets.data_files import DataFilesDict
from datasets.dataset_dict import DatasetDict, IterableDatasetDict
from datasets.download.download_config import DownloadConfig
from datasets.exceptions import DatasetNotFoundError
from datasets.features import Features, Value
from datasets.iterable_dataset import IterableDataset
from datasets.load import (
Expand Down Expand Up @@ -819,7 +820,9 @@ def test_dataset_module_factory(self):
# missing module
for offline_simulation_mode in list(OfflineSimulationMode):
with offline(offline_simulation_mode):
with self.assertRaises((FileNotFoundError, ConnectionError, requests.exceptions.ConnectionError)):
with self.assertRaises(
(DatasetNotFoundError, ConnectionError, requests.exceptions.ConnectionError)
):
datasets.load.dataset_module_factory(
"__missing_dummy_module_name__", dynamic_modules_path=self.dynamic_modules_path
)
Expand Down Expand Up @@ -850,13 +853,13 @@ def test_offline_dataset_module_factory(self):
self.assertIn("Using the latest cached version of the module", self._caplog.text)

def test_load_dataset_from_hub(self):
with self.assertRaises(FileNotFoundError) as context:
with self.assertRaises(DatasetNotFoundError) as context:
datasets.load_dataset("_dummy")
self.assertIn(
"Dataset '_dummy' doesn't exist on the Hub",
str(context.exception),
)
with self.assertRaises(FileNotFoundError) as context:
with self.assertRaises(DatasetNotFoundError) as context:
datasets.load_dataset("_dummy", revision="0.0.0")
self.assertIn(
"Dataset '_dummy' doesn't exist on the Hub",
Expand All @@ -877,7 +880,7 @@ def test_load_dataset_from_hub(self):
)

def test_load_dataset_namespace(self):
with self.assertRaises(FileNotFoundError) as context:
with self.assertRaises(DatasetNotFoundError) as context:
datasets.load_dataset("hf-internal-testing/_dummy")
self.assertIn(
"hf-internal-testing/_dummy",
Expand Down Expand Up @@ -1018,7 +1021,7 @@ def test_load_dataset_builder_for_community_dataset_without_script():


def test_load_dataset_builder_fail():
with pytest.raises(FileNotFoundError):
with pytest.raises(DatasetNotFoundError):
datasets.load_dataset_builder("blabla")


Expand All @@ -1037,10 +1040,9 @@ def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory
dataset = datasets.load_dataset(DATASET_LOADING_SCRIPT_NAME, data_dir=data_dir)
assert len(dataset) == 2
assert "Using the latest cached version of the module" in caplog.text
with pytest.raises(FileNotFoundError) as exc_info:
with pytest.raises(DatasetNotFoundError) as exc_info:
datasets.load_dataset(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST)
assert f"Dataset '{SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST}' doesn't exist on the Hub" in str(exc_info.value)
assert os.path.abspath(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST) in str(exc_info.value)


def test_load_dataset_streaming(dataset_loading_script_dir, data_dir):
Expand Down