diff --git a/src/datasets/exceptions.py b/src/datasets/exceptions.py
index 8ff036cbe60..a6a7aa1acf9 100644
--- a/src/datasets/exceptions.py
+++ b/src/datasets/exceptions.py
@@ -5,10 +5,23 @@
 class DatasetsError(Exception):
     """Base class for exceptions in this library."""
 
-    pass
-
 
 class DefunctDatasetError(DatasetsError):
     """The dataset has been defunct."""
 
-    pass
+
+class FileNotFoundDatasetsError(DatasetsError, FileNotFoundError):
+    """FileNotFoundError raised by this library."""
+
+
+class DataFilesNotFoundError(FileNotFoundDatasetsError):
+    """No (supported) data files found."""
+
+
+class DatasetNotFoundError(FileNotFoundDatasetsError):
+    """Dataset not found.
+
+    Raised when trying to access:
+    - a missing dataset, or
+    - a private/gated dataset and the user is not authenticated.
+    """
diff --git a/src/datasets/load.py b/src/datasets/load.py
index b84f794cb24..88c8948b052 100644
--- a/src/datasets/load.py
+++ b/src/datasets/load.py
@@ -48,6 +48,7 @@
 from .download.download_config import DownloadConfig
 from .download.download_manager import DownloadMode
 from .download.streaming_download_manager import StreamingDownloadManager, xbasename, xglob, xjoin
+from .exceptions import DataFilesNotFoundError, DatasetNotFoundError
 from .features import Features
 from .fingerprint import Hasher
 from .info import DatasetInfo, DatasetInfosDict
@@ -494,9 +495,10 @@ def infer_module_for_data_files(
     """Infer module (and builder kwargs) from data files. Raise if module names for different splits don't match.
 
     Args:
-        data_files (DataFilesDict): List of data files.
-        path (str, optional): Dataset name or path.
-        DownloadConfig (bool or str, optional): for authenticate on the Hugging Face Hub for private remote files.
+        data_files ([`DataFilesDict`]): Dict of list of data files.
+        path (str, *optional*): Dataset name or path.
+        download_config ([`DownloadConfig`], *optional*):
+            Specific download configuration parameters to authenticate on the Hugging Face Hub for private remote files.
 
     Returns:
         tuple[str, dict[str, Any]]: Tuple with
@@ -511,8 +513,7 @@ def infer_module_for_data_files(
     if any((module_name, default_builder_kwargs) != split_module for split_module in split_modules.values()):
         raise ValueError(f"Couldn't infer the same data file format for all splits. Got {split_modules}")
     if not module_name:
-        path = f" in {path}. " if path else ". "
-        raise FileNotFoundError(f"No (supported) data files or dataset script found{path}")
+        raise DataFilesNotFoundError("No (supported) data files found" + (f" in {path}" if path else ""))
     return module_name, default_builder_kwargs
 
 
@@ -1471,7 +1472,7 @@ def dataset_module_factory(
             elif "401" in str(e):
                 msg = f"Dataset '{path}' doesn't exist on the Hub"
                 msg = msg + f" at revision '{revision}'" if revision else msg
-                raise FileNotFoundError(
+                raise DatasetNotFoundError(
                     msg + ". If the repo is private or gated, make sure to log in with `huggingface-cli login`."
                 )
             else:
@@ -1493,13 +1494,15 @@ def dataset_module_factory(
                 download_config=download_config,
                 download_mode=download_mode,
             ).get_module()
-        except Exception as e1:  # noqa all the attempts failed, before raising the error we should check if the module is already cached.
+        except Exception as e1:
+            # All the attempts failed, before raising the error we should check if the module is already cached
             try:
                 return CachedDatasetModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module()
-            except Exception:  # noqa if it's not in the cache, then it doesn't exist.
+            except Exception:
+                # If it's not in the cache, then it doesn't exist.
                 if isinstance(e1, OfflineModeIsEnabled):
                     raise ConnectionError(f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}") from None
-                if isinstance(e1, EmptyDatasetError):
+                if isinstance(e1, (DataFilesNotFoundError, DatasetNotFoundError, EmptyDatasetError)):
                     raise e1 from None
                 if isinstance(e1, FileNotFoundError):
                     raise FileNotFoundError(
diff --git a/tests/test_load.py b/tests/test_load.py
index 41ca010ab12..f7afe9246e6 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -24,6 +24,7 @@
 from datasets.data_files import DataFilesDict
 from datasets.dataset_dict import DatasetDict, IterableDatasetDict
 from datasets.download.download_config import DownloadConfig
+from datasets.exceptions import DatasetNotFoundError
 from datasets.features import Features, Value
 from datasets.iterable_dataset import IterableDataset
 from datasets.load import (
@@ -819,7 +820,9 @@ def test_dataset_module_factory(self):
         # missing module
         for offline_simulation_mode in list(OfflineSimulationMode):
             with offline(offline_simulation_mode):
-                with self.assertRaises((FileNotFoundError, ConnectionError, requests.exceptions.ConnectionError)):
+                with self.assertRaises(
+                    (DatasetNotFoundError, ConnectionError, requests.exceptions.ConnectionError)
+                ):
                     datasets.load.dataset_module_factory(
                         "__missing_dummy_module_name__", dynamic_modules_path=self.dynamic_modules_path
                     )
@@ -850,13 +853,13 @@ def test_offline_dataset_module_factory(self):
         self.assertIn("Using the latest cached version of the module", self._caplog.text)
 
     def test_load_dataset_from_hub(self):
-        with self.assertRaises(FileNotFoundError) as context:
+        with self.assertRaises(DatasetNotFoundError) as context:
             datasets.load_dataset("_dummy")
         self.assertIn(
             "Dataset '_dummy' doesn't exist on the Hub",
             str(context.exception),
         )
-        with self.assertRaises(FileNotFoundError) as context:
+        with self.assertRaises(DatasetNotFoundError) as context:
             datasets.load_dataset("_dummy", revision="0.0.0")
         self.assertIn(
             "Dataset '_dummy' doesn't exist on the Hub",
@@ -877,7 +880,7 @@ def test_load_dataset_from_hub(self):
         )
 
     def test_load_dataset_namespace(self):
-        with self.assertRaises(FileNotFoundError) as context:
+        with self.assertRaises(DatasetNotFoundError) as context:
             datasets.load_dataset("hf-internal-testing/_dummy")
         self.assertIn(
             "hf-internal-testing/_dummy",
@@ -1018,7 +1021,7 @@ def test_load_dataset_builder_for_community_dataset_without_script():
 
 
 def test_load_dataset_builder_fail():
-    with pytest.raises(FileNotFoundError):
+    with pytest.raises(DatasetNotFoundError):
         datasets.load_dataset_builder("blabla")
 
 
@@ -1037,10 +1040,9 @@ def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory
         dataset = datasets.load_dataset(DATASET_LOADING_SCRIPT_NAME, data_dir=data_dir)
         assert len(dataset) == 2
         assert "Using the latest cached version of the module" in caplog.text
-    with pytest.raises(FileNotFoundError) as exc_info:
+    with pytest.raises(DatasetNotFoundError) as exc_info:
         datasets.load_dataset(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST)
     assert f"Dataset '{SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST}' doesn't exist on the Hub" in str(exc_info.value)
-    assert os.path.abspath(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST) in str(exc_info.value)
 
 
 def test_load_dataset_streaming(dataset_loading_script_dir, data_dir):