From b9a2b30647f041837944045e48bf4151adf43c28 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Thu, 17 Aug 2023 11:00:34 +0200
Subject: [PATCH 1/4] raise filenotfound

---
 src/datasets/data_files.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index 395f03ceba9..869a2e1670d 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -320,7 +320,7 @@ def resolve_pattern(
         allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
             For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
     Returns:
-        List[Union[Path, Url]]: List of paths or URLs to the local or remote files that match the patterns.
+        List[str]: List of paths or URLs to the local or remote files that match the patterns.
     """
     if is_relative_path(pattern):
         pattern = xjoin(base_path, pattern)
@@ -573,17 +573,14 @@ def from_patterns(
         base_path = base_path if base_path is not None else Path().resolve().as_posix()
         data_files = []
         for pattern in patterns:
-            try:
-                data_files.extend(
-                    resolve_pattern(
-                        pattern,
-                        base_path=base_path,
-                        allowed_extensions=allowed_extensions,
-                        download_config=download_config,
-                    )
+            data_files.extend(
+                resolve_pattern(
+                    pattern,
+                    base_path=base_path,
+                    allowed_extensions=allowed_extensions,
+                    download_config=download_config,
                 )
-            except FileNotFoundError:
-                pass
+            )
         origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
         return cls(data_files, origin_metadata)
 

From 0adfa9ada14c38fce5973b5e3f196a2c46dc9170 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Thu, 17 Aug 2023 15:57:34 +0200
Subject: [PATCH 2/4] only raise if no magic char in pattern

---
 src/datasets/data_files.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index 869a2e1670d..d5149eeab92 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -573,14 +573,18 @@ def from_patterns(
         base_path = base_path if base_path is not None else Path().resolve().as_posix()
         data_files = []
         for pattern in patterns:
-            data_files.extend(
-                resolve_pattern(
-                    pattern,
-                    base_path=base_path,
-                    allowed_extensions=allowed_extensions,
-                    download_config=download_config,
+            try:
+                data_files.extend(
+                    resolve_pattern(
+                        pattern,
+                        base_path=base_path,
+                        allowed_extensions=allowed_extensions,
+                        download_config=download_config,
+                    )
                 )
-            )
+            except FileNotFoundError:
+                if "*" not in pattern and "[" not in pattern and "?" not in pattern:
+                    raise
         origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
         return cls(data_files, origin_metadata)
 

From a46ca9cc138754629be261522301e725c7d14152 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Thu, 17 Aug 2023 16:27:16 +0200
Subject: [PATCH 3/4] add test

---
 tests/test_data_files.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_data_files.py b/tests/test_data_files.py
index 3c2ee59ec9e..34bfb26332e 100644
--- a/tests/test_data_files.py
+++ b/tests/test_data_files.py
@@ -380,6 +380,11 @@ def test_DataFilesList_from_patterns_locally_with_extra_files(complex_data_dir,
     assert len(data_files_list.origin_metadata) == 2
 
 
+def test_DataFilesList_from_patterns_raises_FileNotFoundError(complex_data_dir):
+    with pytest.raises(FileNotFoundError):
+        DataFilesList.from_patterns(["file_that_doesnt_exist.txt"], complex_data_dir)
+
+
 @pytest.mark.parametrize("pattern", _TEST_PATTERNS)
 def test_DataFilesDict_from_patterns_in_dataset_repository(
     hub_dataset_repo_path, hub_dataset_repo_patterns_results, pattern

From d84cd1d6f51ca75ec5f5c3db3f372f093758cac9 Mon Sep 17 00:00:00 2001
From: Quentin Lhoest
Date: Fri, 18 Aug 2023 12:24:14 +0200
Subject: [PATCH 4/4] use has_magic

---
 src/datasets/data_files.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index d5149eeab92..fd1eb739c99 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -1,6 +1,7 @@
 import os
 import re
 from functools import partial
+from glob import has_magic
 from pathlib import Path, PurePath
 from typing import Callable, Dict, List, Optional, Set, Tuple, Union
 
@@ -583,7 +584,7 @@ def from_patterns(
                     )
                 )
             except FileNotFoundError:
-                if "*" not in pattern and "[" not in pattern and "?" not in pattern:
+                if not has_magic(pattern):
                     raise
         origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
         return cls(data_files, origin_metadata)