From 4ed512a1e05bd56c3926f2226b31a1c08e07cd60 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 16 Nov 2023 16:59:01 +0100 Subject: [PATCH 1/8] Create DatasetNotFoundError and DataFilesNotFoundError --- src/datasets/exceptions.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/datasets/exceptions.py b/src/datasets/exceptions.py index 8ff036cbe60..a6a7aa1acf9 100644 --- a/src/datasets/exceptions.py +++ b/src/datasets/exceptions.py @@ -5,10 +5,23 @@ class DatasetsError(Exception): """Base class for exceptions in this library.""" - pass - class DefunctDatasetError(DatasetsError): """The dataset has been defunct.""" - pass + +class FileNotFoundDatasetsError(DatasetsError, FileNotFoundError): + """FileNotFoundError raised by this library.""" + + +class DataFilesNotFoundError(FileNotFoundDatasetsError): + """No (supported) data files found.""" + + +class DatasetNotFoundError(FileNotFoundDatasetsError): + """Dataset not found. + + Raised when trying to access: + - a missing dataset, or + - a private/gated dataset and the user is not authenticated. + """ From f2b6c3afbd862025f4ce9a2a9f48fe2935d7c4f7 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 16 Nov 2023 17:00:08 +0100 Subject: [PATCH 2/8] Raise DatasetNotFoundError and DataFilesNotFoundError --- src/datasets/load.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 23a34a9946f..dfd724008ea 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -48,6 +48,7 @@ from .download.download_config import DownloadConfig from .download.download_manager import DownloadMode from .download.streaming_download_manager import StreamingDownloadManager, xbasename, xglob, xjoin +from .exceptions import DataFilesNotFoundError, DatasetNotFoundError from .features import Features from .fingerprint import Hasher from .info import DatasetInfo, DatasetInfosDict @@ -512,7 +513,7 @@ def infer_module_for_data_files( raise ValueError(f"Couldn't infer the same data file format for all splits. Got {split_modules}") if not module_name: path = f" in {path}. " if path else ". " - raise FileNotFoundError(f"No (supported) data files or dataset script found{path}") + raise DataFilesNotFoundError(f"No (supported) data files or dataset script found{path}") return module_name, default_builder_kwargs @@ -1471,7 +1472,7 @@ def dataset_module_factory( elif "401" in str(e): msg = f"Dataset '{path}' doesn't exist on the Hub" msg = msg + f" at revision '{revision}'" if revision else msg - raise FileNotFoundError( + raise DatasetNotFoundError( msg + ". If the repo is private or gated, make sure to log in with `huggingface-cli login`." ) else: From 2415efb7e4c854e30e03e1a241382c772c818b92 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 16 Nov 2023 17:00:54 +0100 Subject: [PATCH 3/8] Catch and re-raise DatasetNotFoundError and DataFilesNotFoundError --- src/datasets/load.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/datasets/load.py b/src/datasets/load.py index dfd724008ea..e018470cfaa 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1504,6 +1504,16 @@ def dataset_module_factory( raise ConnectionError(f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}") from None if isinstance(e1, EmptyDatasetError): raise e1 from None + if isinstance(e1, DataFilesNotFoundError): + raise DataFilesNotFoundError( + f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. " + f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}" + ) from None + if isinstance(e1, DatasetNotFoundError): + raise DatasetNotFoundError( + f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. " + f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}" + ) from None if isinstance(e1, FileNotFoundError): raise FileNotFoundError( f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. " From a70812b2c7dbc6be7ba73f0eb046d061552b81e0 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 16 Nov 2023 17:01:15 +0100 Subject: [PATCH 4/8] Fix docstring --- src/datasets/load.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index e018470cfaa..3a7c88845ef 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -495,9 +495,10 @@ def infer_module_for_data_files( """Infer module (and builder kwargs) from data files. Raise if module names for different splits don't match. Args: - data_files (DataFilesDict): List of data files. - path (str, optional): Dataset name or path. - DownloadConfig (bool or str, optional): for authenticate on the Hugging Face Hub for private remote files. + data_files ([`DataFilesDict`]): Dict of list of data files. + path (str, *optional*): Dataset name or path. + download_config ([`DownloadConfig`], *optional*): + Specific download configuration parameters to authenticate on the Hugging Face Hub for private remote files. Returns: tuple[str, dict[str, Any]]: Tuple with From cf4ba6f0e2641056774c01f62984aef5de5d68f1 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 16 Nov 2023 17:01:27 +0100 Subject: [PATCH 5/8] Fix style --- src/datasets/load.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index 3a7c88845ef..bb3934864f7 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1495,12 +1495,12 @@ def dataset_module_factory( download_config=download_config, download_mode=download_mode, ).get_module() - except ( - Exception - ) as e1: # noqa all the attempts failed, before raising the error we should check if the module is already cached. + except Exception as e1: + # All the attempts failed, before raising the error we should check if the module is already cached try: return CachedDatasetModuleFactory(path, dynamic_modules_path=dynamic_modules_path).get_module() - except Exception: # noqa if it's not in the cache, then it doesn't exist. + except Exception: + # If it's not in the cache, then it doesn't exist. if isinstance(e1, OfflineModeIsEnabled): raise ConnectionError(f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}") from None if isinstance(e1, EmptyDatasetError): From 6f3f3e3feec9d7d4d36111401787eb7b5fd51836 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Fri, 17 Nov 2023 14:09:29 +0100 Subject: [PATCH 6/8] Re-raise e1 --- src/datasets/load.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index bb3934864f7..0ccdb4bc687 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1503,18 +1503,8 @@ def dataset_module_factory( # If it's not in the cache, then it doesn't exist. if isinstance(e1, OfflineModeIsEnabled): raise ConnectionError(f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}") from None - if isinstance(e1, EmptyDatasetError): + if isinstance(e1, (DataFilesNotFoundError, DatasetNotFoundError, EmptyDatasetError)): raise e1 from None - if isinstance(e1, DataFilesNotFoundError): - raise DataFilesNotFoundError( - f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. " - f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}" - ) from None - if isinstance(e1, DatasetNotFoundError): - raise DatasetNotFoundError( - f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. " - f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}" - ) from None if isinstance(e1, FileNotFoundError): raise FileNotFoundError( f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or any data file in the same directory. " From bf8fa7ad7609ad34d4cc689f529ea606dd2560e0 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Mon, 20 Nov 2023 18:02:03 +0100 Subject: [PATCH 7/8] Fix tests --- tests/test_load.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/test_load.py b/tests/test_load.py index 41ca010ab12..f7afe9246e6 100644 --- a/tests/test_load.py +++ b/tests/test_load.py @@ -24,6 +24,7 @@ from datasets.data_files import DataFilesDict from datasets.dataset_dict import DatasetDict, IterableDatasetDict from datasets.download.download_config import DownloadConfig +from datasets.exceptions import DatasetNotFoundError from datasets.features import Features, Value from datasets.iterable_dataset import IterableDataset from datasets.load import ( @@ -819,7 +820,9 @@ def test_dataset_module_factory(self): # missing module for offline_simulation_mode in list(OfflineSimulationMode): with offline(offline_simulation_mode): - with self.assertRaises((FileNotFoundError, ConnectionError, requests.exceptions.ConnectionError)): + with self.assertRaises( + (DatasetNotFoundError, ConnectionError, requests.exceptions.ConnectionError) + ): datasets.load.dataset_module_factory( "__missing_dummy_module_name__", dynamic_modules_path=self.dynamic_modules_path ) @@ -850,13 +853,13 @@ def test_offline_dataset_module_factory(self): self.assertIn("Using the latest cached version of the module", self._caplog.text) def test_load_dataset_from_hub(self): - with self.assertRaises(FileNotFoundError) as context: + with self.assertRaises(DatasetNotFoundError) as context: datasets.load_dataset("_dummy") self.assertIn( "Dataset '_dummy' doesn't exist on the Hub", str(context.exception), ) - with self.assertRaises(FileNotFoundError) as context: + with self.assertRaises(DatasetNotFoundError) as context: datasets.load_dataset("_dummy", revision="0.0.0") self.assertIn( "Dataset '_dummy' doesn't exist on the Hub", @@ -877,7 +880,7 @@ def test_load_dataset_from_hub(self): ) def test_load_dataset_namespace(self): - with self.assertRaises(FileNotFoundError) as context: + with self.assertRaises(DatasetNotFoundError) as context: datasets.load_dataset("hf-internal-testing/_dummy") self.assertIn( "hf-internal-testing/_dummy", @@ -1018,7 +1021,7 @@ def test_load_dataset_builder_for_community_dataset_without_script(): def test_load_dataset_builder_fail(): - with pytest.raises(FileNotFoundError): + with pytest.raises(DatasetNotFoundError): datasets.load_dataset_builder("blabla") @@ -1037,10 +1040,9 @@ def test_load_dataset_local(dataset_loading_script_dir, data_dir, keep_in_memory dataset = datasets.load_dataset(DATASET_LOADING_SCRIPT_NAME, data_dir=data_dir) assert len(dataset) == 2 assert "Using the latest cached version of the module" in caplog.text - with pytest.raises(FileNotFoundError) as exc_info: + with pytest.raises(DatasetNotFoundError) as exc_info: datasets.load_dataset(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST) assert f"Dataset '{SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST}' doesn't exist on the Hub" in str(exc_info.value) - assert os.path.abspath(SAMPLE_DATASET_NAME_THAT_DOESNT_EXIST) in str(exc_info.value) def test_load_dataset_streaming(dataset_loading_script_dir, data_dir): From 08ceb927025575c453228cab31291b74043dba1a Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 22 Nov 2023 15:03:47 +0100 Subject: [PATCH 8/8] Remove mention to script from DataFilesNotFoundError --- src/datasets/load.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/datasets/load.py b/src/datasets/load.py index edab92581a8..88c8948b052 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -513,8 +513,7 @@ def infer_module_for_data_files( if any((module_name, default_builder_kwargs) != split_module for split_module in split_modules.values()): raise ValueError(f"Couldn't infer the same data file format for all splits. Got {split_modules}") if not module_name: - path = f" in {path}. " if path else ". " - raise DataFilesNotFoundError(f"No (supported) data files or dataset script found{path}") + raise DataFilesNotFoundError("No (supported) data files found" + (f" in {path}" if path else "")) return module_name, default_builder_kwargs