diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index 9592355adfa..915c603d56c 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -10,7 +10,7 @@ from .filesystems.hffilesystem import HfFileSystem from .splits import Split from .utils import logging -from .utils.file_utils import hf_hub_url, is_remote_url, request_etag +from .utils.file_utils import hf_hub_url, is_relative_path, is_remote_url, request_etag from .utils.py_utils import string_to_dict @@ -76,6 +76,101 @@ def sanitize_patterns(patterns: Union[Dict, List, str]) -> Dict[str, Union[List[ return {DEFAULT_SPLIT: list(patterns)} +def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool: + """ + When a path matches a pattern, we additionally check if it's inside a special directory + we ignore by default (if it starts with a double underscore). + + Users can still explicitly request a filepath inside such a directory if "__pycache__" is + mentioned explicitly in the requested pattern. + + Some examples: + + base directory: + + ./ + └── __pycache__ + └── b.txt + + >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**") + True + >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt") + True + >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*") + False + >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*") + False + """ + # We just need to check if every special directory from the path is present explicitly in the pattern. + # Since we assume that the path matches the pattern, it's equivalent to counting that both + # the parent path and the parent pattern have the same number of special directories. 
+ data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")] + data_dirs_to_ignore_in_pattern = [part for part in PurePath(pattern).parent.parts if part.startswith("__")] + return len(data_dirs_to_ignore_in_path) != len(data_dirs_to_ignore_in_pattern) + + +def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool: + """ + When a path matches a pattern, we additionally check if it's a hidden file or if it's inside + a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot. + + Users can still explicitly request a filepath that is hidden or is inside a hidden directory + if the hidden part is mentioned explicitly in the requested pattern. + + Some examples: + + base directory: + + ./ + └── .hidden_file.txt + + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**") + True + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*") + False + + base directory: + + ./ + └── .hidden_dir + └── a.txt + + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**") + True + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*") + False + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*") + False + + base directory: + + ./ + └── .hidden_dir + └── .hidden_file.txt + + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**") + True + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*") + True + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*") + False + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", 
".hidden_dir/*") + True + >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*") + False + """ + # We just need to check if every hidden part from the path is present explicitly in the pattern. + # Since we assume that the path matches the pattern, it's equivalent to counting that both + # the path and the pattern have the same number of hidden parts. + hidden_directories_in_path = [ + part for part in PurePath(matched_rel_path).parts if part.startswith(".") and not set(part) == {"."} + ] + hidden_directories_in_pattern = [ + part for part in PurePath(pattern).parts if part.startswith(".") and not set(part) == {"."} + ] + return len(hidden_directories_in_path) != len(hidden_directories_in_pattern) + + def _get_data_files_patterns(pattern_resolver: Callable[[str], List[PurePath]]) -> Dict[str, List[str]]: """ Get the default pattern from a directory or repository by testing all the supported patterns. @@ -133,15 +228,22 @@ def _resolve_single_pattern_locally( It also supports absolute paths in patterns. If an URL is passed, it is returned as is. 
""" - pattern = os.path.join(base_path, pattern) - data_files_ignore = FILES_TO_IGNORE + if is_relative_path(pattern): + pattern = os.path.join(base_path, pattern) + else: + base_path = "/" fs = LocalFileSystem() glob_iter = [PurePath(filepath) for filepath in fs.glob(pattern) if fs.isfile(filepath)] matched_paths = [ Path(filepath).resolve() for filepath in glob_iter - if filepath.name not in data_files_ignore - and not any(part.startswith((".", "__")) and set(part) != {"."} for part in filepath.parts) + if (filepath.name not in FILES_TO_IGNORE or PurePath(pattern).name == filepath.name) + and not _is_inside_unrequested_special_dir( + os.path.relpath(filepath, base_path), os.path.relpath(pattern, base_path) + ) + and not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir( + os.path.relpath(filepath, base_path), os.path.relpath(pattern, base_path) + ) ] # ignore .ipynb and __pycache__, but keep /../ if allowed_extensions is not None: out = [ @@ -187,9 +289,15 @@ def resolve_patterns_locally_or_by_urls( - '*' matches any character except a forward-slash (to match just the file or directory name) - '**' matches any character including a forward-slash / + Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested. + The same applies to special directories that start with a double underscore like "__pycache__". + You can still include one if the pattern explicitly mentions it: + - to include a hidden file: "*/.hidden.txt" or "*/.*" + - to include a hidden directory: ".hidden/*" or ".*/*" + - to include a special directory: "__special__/*" or "__*/*" + Example:: - >>> import huggingface_hub >>> from datasets.data_files import resolve_patterns_locally_or_by_urls >>> base_path = "." 
>>> resolve_patterns_locally_or_by_urls(base_path, ["src/**/*.yaml"]) @@ -329,16 +437,22 @@ def _resolve_single_pattern_in_dataset_repository( base_path: Optional[str] = None, allowed_extensions: Optional[list] = None, ) -> List[PurePath]: - data_files_ignore = FILES_TO_IGNORE fs = HfFileSystem(repo_info=dataset_info) if base_path: pattern = f"{base_path}/{pattern}" + else: + base_path = "/" glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)] matched_paths = [ filepath for filepath in glob_iter - if filepath.name not in data_files_ignore - and not any(part.startswith((".", "__")) and set(part) != {"."} for part in filepath.parts) + if (filepath.name not in FILES_TO_IGNORE or PurePath(pattern).name == filepath.name) + and not _is_inside_unrequested_special_dir( + os.path.relpath(filepath, base_path), os.path.relpath(pattern, base_path) + ) + and not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir( + os.path.relpath(filepath, base_path), os.path.relpath(pattern, base_path) + ) ] # ignore .ipynb and __pycache__, but keep /../ if allowed_extensions is not None: out = [ @@ -386,6 +500,13 @@ def resolve_patterns_in_dataset_repository( - '*' matches any character except a forward-slash (to match just the file or directory name) - '**' matches any character including a forward-slash / + Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested. + The same applies to special directories that start with a double underscore like "__pycache__". 
+ You can still include one if the pattern explicitly mentions it: + - to include a hidden file: "*/.hidden.txt" or "*/.*" + - to include a hidden directory: ".hidden/*" or ".*/*" + - to include a special directory: "__special__/*" or "__*/*" + Example:: >>> import huggingface_hub diff --git a/tests/test_data_files.py b/tests/test_data_files.py index 53e153ee573..b66fe1960fb 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -13,6 +13,8 @@ Url, _get_data_files_patterns, _get_metadata_files_patterns, + _is_inside_unrequested_special_dir, + _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir, resolve_patterns_in_dataset_repository, resolve_patterns_locally_or_by_urls, ) @@ -22,7 +24,7 @@ _TEST_PATTERNS = ["*", "**", "**/*", "*.txt", "data/*", "**/*.txt", "**/train.txt"] _FILES_TO_IGNORE = {".dummy", "README.md", "dummy_data.zip", "dataset_infos.json"} -_DIRS_TO_IGNORE = {"data/.dummy_subdir"} +_DIRS_TO_IGNORE = {"data/.dummy_subdir", "__pycache__"} _TEST_PATTERNS_SIZES = dict( [ ("*", 0), @@ -65,6 +67,10 @@ def complex_data_dir(tmp_path): with open(data_dir / "data" / ".dummy_subdir" / "test.txt", "w") as f: f.write("bar\n" * 10) + (data_dir / "__pycache__").mkdir() + with open(data_dir / "__pycache__" / "script.py", "w") as f: + f.write("foo\n" * 10) + return str(data_dir) @@ -131,6 +137,42 @@ def hub_dataset_info_patterns_results(hub_dataset_info, complex_data_dir, patter } +def test_is_inside_unrequested_special_dir(complex_data_dir, pattern_results): + # usual patterns outside special dir work fine + for pattern, result in pattern_results.items(): + if result: + matched_rel_path = str(Path(result[0]).relative_to(complex_data_dir)) + assert _is_inside_unrequested_special_dir(matched_rel_path, pattern) is False + # check behavior for special dir + f = _is_inside_unrequested_special_dir + assert f("__pycache__/b.txt", "**") is True + assert f("__pycache__/b.txt", "*/b.txt") is True + assert f("__pycache__/b.txt", "__pycache__/*") is 
False + assert f("__pycache__/__b.txt", "__pycache__/*") is False + assert f("__pycache__/__b.txt", "__*/*") is False + assert f("__b.txt", "*") is False + + +def test_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(complex_data_dir, pattern_results): + # usual patterns outside hidden dir work fine + for pattern, result in pattern_results.items(): + if result: + matched_rel_path = str(Path(result[0]).relative_to(complex_data_dir)) + assert _is_inside_unrequested_special_dir(matched_rel_path, pattern) is False + # check behavior for hidden dir and file + f = _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir + assert f(".hidden_file.txt", "**") is True + assert f(".hidden_file.txt", ".*") is False + assert f(".hidden_dir/a.txt", "**") is True + assert f(".hidden_dir/a.txt", ".*/*") is False + assert f(".hidden_dir/a.txt", ".hidden_dir/*") is False + assert f(".hidden_dir/.hidden_file.txt", "**") is True + assert f(".hidden_dir/.hidden_file.txt", ".*/*") is True + assert f(".hidden_dir/.hidden_file.txt", ".*/.*") is False + assert f(".hidden_dir/.hidden_file.txt", ".hidden_dir/*") is True + assert f(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*") is False + + @pytest.mark.parametrize("pattern", _TEST_PATTERNS) def test_pattern_results_fixture(pattern_results, pattern): assert len(pattern_results[pattern]) == _TEST_PATTERNS_SIZES[pattern] @@ -147,6 +189,14 @@ def test_resolve_patterns_locally_or_by_urls(complex_data_dir, pattern, pattern_ assert len(pattern_results[pattern]) == 0 +def test_resolve_patterns_locally_or_by_urls_with_dot_in_base_path(complex_data_dir): + base_path_with_dot = os.path.join(complex_data_dir, "data", ".dummy_subdir") + resolved_data_files = resolve_patterns_locally_or_by_urls( + base_path_with_dot, [os.path.join(base_path_with_dot, "train.txt")] + ) + assert len(resolved_data_files) == 1 + + def test_resolve_patterns_locally_or_by_urls_with_absolute_path(tmp_path, complex_data_dir): abs_path = 
os.path.join(complex_data_dir, "data", "train.txt") resolved_data_files = resolve_patterns_locally_or_by_urls(str(tmp_path / "blabla"), [abs_path]) @@ -159,6 +209,47 @@ def test_resolve_patterns_locally_or_by_urls_with_double_dots(tmp_path, complex_ assert len(resolved_data_files) == 1 +def test_resolve_patterns_locally_or_by_urls_returns_hidden_file_only_if_requested(complex_data_dir): + with pytest.raises(FileNotFoundError): + resolve_patterns_locally_or_by_urls(complex_data_dir, ["*dummy"]) + resolved_data_files = resolve_patterns_locally_or_by_urls(complex_data_dir, [".dummy"]) + assert len(resolved_data_files) == 1 + + +def test_resolve_patterns_locally_or_by_urls_hidden_base_path(tmp_path): + hidden = tmp_path / ".test_hidden_base_path" + hidden.mkdir() + (tmp_path / ".test_hidden_base_path" / "a.txt").touch() + resolved_data_files = resolve_patterns_locally_or_by_urls(str(hidden), ["*"]) + assert len(resolved_data_files) == 1 + + +def test_resolve_patterns_locally_or_by_urls_returns_hidden_dir_only_if_requested(complex_data_dir): + with pytest.raises(FileNotFoundError): + resolve_patterns_locally_or_by_urls(complex_data_dir, ["data/*dummy_subdir/train.txt"]) + resolved_data_files = resolve_patterns_locally_or_by_urls(complex_data_dir, ["data/.dummy_subdir/train.txt"]) + assert len(resolved_data_files) == 1 + resolved_data_files = resolve_patterns_locally_or_by_urls(complex_data_dir, ["*/.dummy_subdir/train.txt"]) + assert len(resolved_data_files) == 1 + + +def test_resolve_patterns_locally_or_by_urls_returns_special_dir_only_if_requested(complex_data_dir): + with pytest.raises(FileNotFoundError): + resolve_patterns_locally_or_by_urls(complex_data_dir, ["data/*dummy_subdir/train.txt"]) + resolved_data_files = resolve_patterns_locally_or_by_urls(complex_data_dir, ["data/.dummy_subdir/train.txt"]) + assert len(resolved_data_files) == 1 + resolved_data_files = resolve_patterns_locally_or_by_urls(complex_data_dir, ["*/.dummy_subdir/train.txt"]) + assert 
len(resolved_data_files) == 1 + + +def test_resolve_patterns_locally_or_by_urls_special_base_path(tmp_path): + special = tmp_path / "__test_special_base_path__" + special.mkdir() + (tmp_path / "__test_special_base_path__" / "a.txt").touch() + resolved_data_files = resolve_patterns_locally_or_by_urls(str(special), ["*"]) + assert len(resolved_data_files) == 1 + + @pytest.mark.parametrize("pattern,size,extensions", [("**", 4, ["txt"]), ("**", 4, None), ("**", 0, ["blablabla"])]) def test_resolve_patterns_locally_or_by_urls_with_extensions(complex_data_dir, pattern, size, extensions): if size > 0: @@ -239,6 +330,45 @@ def test_resolve_patterns_in_dataset_repository_sorted_files(): assert resolved_names == sorted(unsorted_names) +def test_resolve_patterns_in_dataset_repository_returns_hidden_file_only_if_requested(hub_dataset_info): + with pytest.raises(FileNotFoundError): + resolve_patterns_in_dataset_repository(hub_dataset_info, ["*dummy"]) + resolved_data_files = resolve_patterns_in_dataset_repository(hub_dataset_info, [".dummy"]) + assert len(resolved_data_files) == 1 + + +def test_resolve_patterns_in_dataset_repository_hidden_base_path(): + siblings = [{"rfilename": ".hidden/a.txt"}] + datasets_infos = DatasetInfo(id="test_hidden_base_path", siblings=siblings, sha="foobar") + resolved_data_files = resolve_patterns_in_dataset_repository(datasets_infos, ["*"], base_path=".hidden") + assert len(resolved_data_files) == 1 + + +def test_resolve_patterns_in_dataset_repository_returns_hidden_dir_only_if_requested(hub_dataset_info): + with pytest.raises(FileNotFoundError): + resolve_patterns_in_dataset_repository(hub_dataset_info, ["data/*dummy_subdir/train.txt"]) + resolved_data_files = resolve_patterns_in_dataset_repository(hub_dataset_info, ["data/.dummy_subdir/train.txt"]) + assert len(resolved_data_files) == 1 + resolved_data_files = resolve_patterns_in_dataset_repository(hub_dataset_info, ["*/.dummy_subdir/train.txt"]) + assert len(resolved_data_files) == 1 + + +def 
test_resolve_patterns_in_dataset_repository_returns_special_dir_only_if_requested(hub_dataset_info): + with pytest.raises(FileNotFoundError): + resolve_patterns_in_dataset_repository(hub_dataset_info, ["data/*dummy_subdir/train.txt"]) + resolved_data_files = resolve_patterns_in_dataset_repository(hub_dataset_info, ["data/.dummy_subdir/train.txt"]) + assert len(resolved_data_files) == 1 + resolved_data_files = resolve_patterns_in_dataset_repository(hub_dataset_info, ["*/.dummy_subdir/train.txt"]) + assert len(resolved_data_files) == 1 + + +def test_resolve_patterns_in_dataset_repository_special_base_path(): + siblings = [{"rfilename": "__special__/a.txt"}] + datasets_infos = DatasetInfo(id="test_hidden_base_path", siblings=siblings, sha="foobar") + resolved_data_files = resolve_patterns_in_dataset_repository(datasets_infos, ["*"], base_path="__special__") + assert len(resolved_data_files) == 1 + + @pytest.mark.parametrize("pattern", _TEST_PATTERNS) def test_DataFilesList_from_hf_repo(hub_dataset_info, hub_dataset_info_patterns_results, pattern): try: