Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 130 additions & 9 deletions src/datasets/data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from .filesystems.hffilesystem import HfFileSystem
from .splits import Split
from .utils import logging
from .utils.file_utils import hf_hub_url, is_remote_url, request_etag
from .utils.file_utils import hf_hub_url, is_relative_path, is_remote_url, request_etag
from .utils.py_utils import string_to_dict


Expand Down Expand Up @@ -76,6 +76,101 @@ def sanitize_patterns(patterns: Union[Dict, List, str]) -> Dict[str, Union[List[
return {DEFAULT_SPLIT: list(patterns)}


def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:
"""
When a path matches a pattern, we additionnally check if it's inside a special directory
we ignore by default (if it starts with a double underscore).

Users can still explicitly request a filepath inside such a directory if "__pycache__" is
mentioned explicitly in the requested pattern.

Some examples:

base directory:

./
└── __pycache__
└── b.txt

>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
True
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
True
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
False
>>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
False
"""
# We just need to check if every special directories from the path is present explicly in the pattern.
# Since we assume that the path matches the pattern, it's equivalent to counting that both
# the parent path and the parent pattern have the same number of special directories.
data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")]
data_dirs_to_ignore_in_pattern = [part for part in PurePath(pattern).parent.parts if part.startswith("__")]
return len(data_dirs_to_ignore_in_path) != len(data_dirs_to_ignore_in_pattern)


def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool:
"""
When a path matches a pattern, we additionnally check if it's a hidden file or if it's inside
a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

Users can still explicitly request a filepath that is hidden or is inside a hidden directory
if the hidden part is mentioned explicitly in the requested pattern.

Some examples:

base directory:

./
└── .hidden_file.txt

>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
True
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
False

base directory:

./
└── .hidden_dir
└── a.txt

>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
True
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
False
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
False

base directory:

./
└── .hidden_dir
└── .hidden_file.txt

>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
True
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
True
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
False
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
True
>>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
False
"""
# We just need to check if every hidden part from the path is present explicly in the pattern.
# Since we assume that the path matches the pattern, it's equivalent to counting that both
# the path and the pattern have the same number of hidden parts.
hidden_directories_in_path = [
part for part in PurePath(matched_rel_path).parts if part.startswith(".") and not set(part) == {"."}
]
hidden_directories_in_pattern = [
part for part in PurePath(pattern).parts if part.startswith(".") and not set(part) == {"."}
]
return len(hidden_directories_in_path) != len(hidden_directories_in_pattern)


def _get_data_files_patterns(pattern_resolver: Callable[[str], List[PurePath]]) -> Dict[str, List[str]]:
"""
Get the default pattern from a directory or repository by testing all the supported patterns.
Expand Down Expand Up @@ -133,15 +228,22 @@ def _resolve_single_pattern_locally(
It also supports absolute paths in patterns.
If an URL is passed, it is returned as is.
"""
pattern = os.path.join(base_path, pattern)
data_files_ignore = FILES_TO_IGNORE
if is_relative_path(pattern):
pattern = os.path.join(base_path, pattern)
else:
base_path = "/"
fs = LocalFileSystem()
glob_iter = [PurePath(filepath) for filepath in fs.glob(pattern) if fs.isfile(filepath)]
matched_paths = [
Path(filepath).resolve()
for filepath in glob_iter
if filepath.name not in data_files_ignore
and not any(part.startswith((".", "__")) and set(part) != {"."} for part in filepath.parts)
if (filepath.name not in FILES_TO_IGNORE or PurePath(pattern).name == filepath.name)
and not _is_inside_unrequested_special_dir(
os.path.relpath(filepath, base_path), os.path.relpath(pattern, base_path)
)
and not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(
os.path.relpath(filepath, base_path), os.path.relpath(pattern, base_path)
)
] # ignore .ipynb and __pycache__, but keep /../
if allowed_extensions is not None:
out = [
Expand Down Expand Up @@ -187,9 +289,15 @@ def resolve_patterns_locally_or_by_urls(
- '*' matches any character except a forward-slash (to match just the file or directory name)
- '**' matches any character including a forward-slash /

Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
The same applies to special directories that start with a double underscore like "__pycache__".
You can still include one if the pattern explicitly mentions it:
- to include a hidden file: "*/.hidden.txt" or "*/.*"
- to include a hidden directory: ".hidden/*" or ".*/*"
- to include a special directory: "__special__/*" or "__*/*"

Example::

>>> import huggingface_hub
>>> from datasets.data_files import resolve_patterns_locally_or_by_urls
>>> base_path = "."
>>> resolve_patterns_locally_or_by_urls(base_path, ["src/**/*.yaml"])
Expand Down Expand Up @@ -329,16 +437,22 @@ def _resolve_single_pattern_in_dataset_repository(
base_path: Optional[str] = None,
allowed_extensions: Optional[list] = None,
) -> List[PurePath]:
data_files_ignore = FILES_TO_IGNORE
fs = HfFileSystem(repo_info=dataset_info)
if base_path:
pattern = f"{base_path}/{pattern}"
else:
base_path = "/"
glob_iter = [PurePath(filepath) for filepath in fs.glob(PurePath(pattern).as_posix()) if fs.isfile(filepath)]
matched_paths = [
filepath
for filepath in glob_iter
if filepath.name not in data_files_ignore
and not any(part.startswith((".", "__")) and set(part) != {"."} for part in filepath.parts)
if (filepath.name not in FILES_TO_IGNORE or PurePath(pattern).name == filepath.name)
and not _is_inside_unrequested_special_dir(
os.path.relpath(filepath, base_path), os.path.relpath(pattern, base_path)
)
and not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(
os.path.relpath(filepath, base_path), os.path.relpath(pattern, base_path)
)
] # ignore .ipynb and __pycache__, but keep /../
if allowed_extensions is not None:
out = [
Expand Down Expand Up @@ -386,6 +500,13 @@ def resolve_patterns_in_dataset_repository(
- '*' matches any character except a forward-slash (to match just the file or directory name)
- '**' matches any character including a forward-slash /

Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
The same applies to special directories that start with a double underscore like "__pycache__".
You can still include one if the pattern explicitly mentions it:
- to include a hidden file: "*/.hidden.txt" or "*/.*"
- to include a hidden directory: ".hidden/*" or ".*/*"
- to include a special directory: "__special__/*" or "__*/*"

Example::

>>> import huggingface_hub
Expand Down
132 changes: 131 additions & 1 deletion tests/test_data_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
Url,
_get_data_files_patterns,
_get_metadata_files_patterns,
_is_inside_unrequested_special_dir,
_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir,
resolve_patterns_in_dataset_repository,
resolve_patterns_locally_or_by_urls,
)
Expand All @@ -22,7 +24,7 @@

_TEST_PATTERNS = ["*", "**", "**/*", "*.txt", "data/*", "**/*.txt", "**/train.txt"]
_FILES_TO_IGNORE = {".dummy", "README.md", "dummy_data.zip", "dataset_infos.json"}
_DIRS_TO_IGNORE = {"data/.dummy_subdir"}
_DIRS_TO_IGNORE = {"data/.dummy_subdir", "__pycache__"}
_TEST_PATTERNS_SIZES = dict(
[
("*", 0),
Expand Down Expand Up @@ -65,6 +67,10 @@ def complex_data_dir(tmp_path):
with open(data_dir / "data" / ".dummy_subdir" / "test.txt", "w") as f:
f.write("bar\n" * 10)

(data_dir / "__pycache__").mkdir()
with open(data_dir / "__pycache__" / "script.py", "w") as f:
f.write("foo\n" * 10)

return str(data_dir)


Expand Down Expand Up @@ -131,6 +137,42 @@ def hub_dataset_info_patterns_results(hub_dataset_info, complex_data_dir, patter
}


def test_is_inside_unrequested_special_dir(complex_data_dir, pattern_results):
    """Check `_is_inside_unrequested_special_dir` on regular matches and on special-directory cases."""
    # usual patterns outside special dir work fine
    for pattern, matches in pattern_results.items():
        if not matches:
            continue
        rel_path = str(Path(matches[0]).relative_to(complex_data_dir))
        assert _is_inside_unrequested_special_dir(rel_path, pattern) is False

    # check behavior for special dir: (matched relative path, pattern, expected answer)
    expectations = [
        ("__pycache__/b.txt", "**", True),
        ("__pycache__/b.txt", "*/b.txt", True),
        ("__pycache__/b.txt", "__pycache__/*", False),
        ("__pycache__/__b.txt", "__pycache__/*", False),
        ("__pycache__/__b.txt", "__*/*", False),
        ("__b.txt", "*", False),
    ]
    for rel_path, pattern, expected in expectations:
        assert _is_inside_unrequested_special_dir(rel_path, pattern) is expected


def test_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(complex_data_dir, pattern_results):
    """Check the hidden file/dir helper on regular matches and on hidden file/dir cases."""
    f = _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir
    # usual patterns outside hidden dir work fine
    # (this loop must exercise the hidden-file helper, not the special-dir one)
    for pattern, result in pattern_results.items():
        if result:
            matched_rel_path = str(Path(result[0]).relative_to(complex_data_dir))
            assert f(matched_rel_path, pattern) is False
    # check behavior for hidden dir and file
    assert f(".hidden_file.txt", "**") is True
    assert f(".hidden_file.txt", ".*") is False
    assert f(".hidden_dir/a.txt", "**") is True
    assert f(".hidden_dir/a.txt", ".*/*") is False
    assert f(".hidden_dir/a.txt", ".hidden_dir/*") is False
    assert f(".hidden_dir/.hidden_file.txt", "**") is True
    assert f(".hidden_dir/.hidden_file.txt", ".*/*") is True
    assert f(".hidden_dir/.hidden_file.txt", ".*/.*") is False
    assert f(".hidden_dir/.hidden_file.txt", ".hidden_dir/*") is True
    assert f(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*") is False


@pytest.mark.parametrize("pattern", _TEST_PATTERNS)
def test_pattern_results_fixture(pattern_results, pattern):
assert len(pattern_results[pattern]) == _TEST_PATTERNS_SIZES[pattern]
Expand All @@ -147,6 +189,14 @@ def test_resolve_patterns_locally_or_by_urls(complex_data_dir, pattern, pattern_
assert len(pattern_results[pattern]) == 0


def test_resolve_patterns_locally_or_by_urls_with_dot_in_base_path(complex_data_dir):
    """A pattern built from a base path containing a hidden directory still resolves to its file."""
    hidden_base = os.path.join(complex_data_dir, "data", ".dummy_subdir")
    requested_patterns = [os.path.join(hidden_base, "train.txt")]
    matches = resolve_patterns_locally_or_by_urls(hidden_base, requested_patterns)
    assert len(matches) == 1


def test_resolve_patterns_locally_or_by_urls_with_absolute_path(tmp_path, complex_data_dir):
abs_path = os.path.join(complex_data_dir, "data", "train.txt")
resolved_data_files = resolve_patterns_locally_or_by_urls(str(tmp_path / "blabla"), [abs_path])
Expand All @@ -159,6 +209,47 @@ def test_resolve_patterns_locally_or_by_urls_with_double_dots(tmp_path, complex_
assert len(resolved_data_files) == 1


def test_resolve_patterns_locally_or_by_urls_returns_hidden_file_only_if_requested(complex_data_dir):
    """The hidden file ".dummy" is ignored by a wildcard pattern and returned when named explicitly."""
    wildcard_patterns = ["*dummy"]
    with pytest.raises(FileNotFoundError):
        resolve_patterns_locally_or_by_urls(complex_data_dir, wildcard_patterns)

    explicit_patterns = [".dummy"]
    matches = resolve_patterns_locally_or_by_urls(complex_data_dir, explicit_patterns)
    assert len(matches) == 1


def test_resolve_patterns_locally_or_by_urls_hidden_base_path(tmp_path):
    """A hidden base directory does not cause the regular files it contains to be ignored."""
    hidden_base = tmp_path / ".test_hidden_base_path"
    hidden_base.mkdir()
    (hidden_base / "a.txt").touch()

    matches = resolve_patterns_locally_or_by_urls(str(hidden_base), ["*"])
    assert len(matches) == 1


def test_resolve_patterns_locally_or_by_urls_returns_hidden_dir_only_if_requested(complex_data_dir):
    """Files in a hidden directory are returned only when the pattern names the hidden directory."""
    # The wildcard can match ".dummy_subdir" but does not mention the hidden part explicitly.
    with pytest.raises(FileNotFoundError):
        resolve_patterns_locally_or_by_urls(complex_data_dir, ["data/*dummy_subdir/train.txt"])

    # Patterns spelling out the hidden directory each resolve to the single file.
    for explicit_pattern in ("data/.dummy_subdir/train.txt", "*/.dummy_subdir/train.txt"):
        matches = resolve_patterns_locally_or_by_urls(complex_data_dir, [explicit_pattern])
        assert len(matches) == 1


def test_resolve_patterns_locally_or_by_urls_returns_special_dir_only_if_requested(complex_data_dir):
    """Files in a double-underscore directory are returned only when the pattern names that directory.

    Relies on the ``__pycache__/script.py`` file created by the ``complex_data_dir`` fixture.
    """
    # "*pycache__" can match the directory name but does not itself start with "__",
    # so the special directory is not explicitly requested and the file stays ignored.
    with pytest.raises(FileNotFoundError):
        resolve_patterns_locally_or_by_urls(complex_data_dir, ["*pycache__/script.py"])

    # Patterns whose directory component starts with "__" explicitly request the special directory.
    for explicit_pattern in ("__pycache__/script.py", "__*/script.py"):
        resolved_data_files = resolve_patterns_locally_or_by_urls(complex_data_dir, [explicit_pattern])
        assert len(resolved_data_files) == 1


def test_resolve_patterns_locally_or_by_urls_special_base_path(tmp_path):
    """A double-underscore base directory does not cause the regular files it contains to be ignored."""
    special_base = tmp_path / "__test_special_base_path__"
    special_base.mkdir()
    (special_base / "a.txt").touch()

    matches = resolve_patterns_locally_or_by_urls(str(special_base), ["*"])
    assert len(matches) == 1


@pytest.mark.parametrize("pattern,size,extensions", [("**", 4, ["txt"]), ("**", 4, None), ("**", 0, ["blablabla"])])
def test_resolve_patterns_locally_or_by_urls_with_extensions(complex_data_dir, pattern, size, extensions):
if size > 0:
Expand Down Expand Up @@ -239,6 +330,45 @@ def test_resolve_patterns_in_dataset_repository_sorted_files():
assert resolved_names == sorted(unsorted_names)


def test_resolve_patterns_in_dataset_repository_returns_hidden_file_only_if_requested(hub_dataset_info):
    """The hidden file ".dummy" is ignored by a wildcard pattern and returned when named explicitly."""
    wildcard_patterns = ["*dummy"]
    with pytest.raises(FileNotFoundError):
        resolve_patterns_in_dataset_repository(hub_dataset_info, wildcard_patterns)

    explicit_patterns = [".dummy"]
    matches = resolve_patterns_in_dataset_repository(hub_dataset_info, explicit_patterns)
    assert len(matches) == 1


def test_resolve_patterns_in_dataset_repository_hidden_base_path():
    """A hidden base path does not cause the regular files it contains to be ignored."""
    repo_info = DatasetInfo(
        id="test_hidden_base_path",
        siblings=[{"rfilename": ".hidden/a.txt"}],
        sha="foobar",
    )
    matches = resolve_patterns_in_dataset_repository(repo_info, ["*"], base_path=".hidden")
    assert len(matches) == 1


def test_resolve_patterns_in_dataset_repository_returns_hidden_dir_only_if_requested(hub_dataset_info):
    """Files in a hidden directory are returned only when the pattern names the hidden directory."""
    # The wildcard can match ".dummy_subdir" but does not mention the hidden part explicitly.
    with pytest.raises(FileNotFoundError):
        resolve_patterns_in_dataset_repository(hub_dataset_info, ["data/*dummy_subdir/train.txt"])

    # Patterns spelling out the hidden directory each resolve to the single file.
    for explicit_pattern in ("data/.dummy_subdir/train.txt", "*/.dummy_subdir/train.txt"):
        matches = resolve_patterns_in_dataset_repository(hub_dataset_info, [explicit_pattern])
        assert len(matches) == 1


def test_resolve_patterns_in_dataset_repository_returns_special_dir_only_if_requested(hub_dataset_info):
    """Files in a double-underscore directory are returned only when the pattern names that directory.

    NOTE(review): assumes the ``hub_dataset_info`` fixture lists the same files as ``complex_data_dir``,
    including ``__pycache__/script.py`` - confirm against the fixture definition.
    """
    # "*pycache__" can match the directory name but does not itself start with "__",
    # so the special directory is not explicitly requested and the file stays ignored.
    with pytest.raises(FileNotFoundError):
        resolve_patterns_in_dataset_repository(hub_dataset_info, ["*pycache__/script.py"])

    # Patterns whose directory component starts with "__" explicitly request the special directory.
    for explicit_pattern in ("__pycache__/script.py", "__*/script.py"):
        resolved_data_files = resolve_patterns_in_dataset_repository(hub_dataset_info, [explicit_pattern])
        assert len(resolved_data_files) == 1


def test_resolve_patterns_in_dataset_repository_special_base_path():
    """A double-underscore base path does not cause the regular files it contains to be ignored."""
    repo_info = DatasetInfo(
        id="test_hidden_base_path",
        siblings=[{"rfilename": "__special__/a.txt"}],
        sha="foobar",
    )
    matches = resolve_patterns_in_dataset_repository(repo_info, ["*"], base_path="__special__")
    assert len(matches) == 1


@pytest.mark.parametrize("pattern", _TEST_PATTERNS)
def test_DataFilesList_from_hf_repo(hub_dataset_info, hub_dataset_info_patterns_results, pattern):
try:
Expand Down