diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py index e7df83f53f0..8bdfbd4323f 100644 --- a/src/datasets/data_files.py +++ b/src/datasets/data_files.py @@ -275,7 +275,7 @@ def _resolve_single_pattern_locally( fs = LocalFileSystem() glob_iter = [PurePath(filepath) for filepath in fs.glob(pattern) if fs.isfile(filepath)] matched_paths = [ - Path(filepath).resolve() + Path(os.path.abspath(filepath)) for filepath in glob_iter if (filepath.name not in FILES_TO_IGNORE or PurePath(pattern).name == filepath.name) and not _is_inside_unrequested_special_dir( diff --git a/tests/test_data_files.py b/tests/test_data_files.py index 75d2109ce6f..f236bb49f07 100644 --- a/tests/test_data_files.py +++ b/tests/test_data_files.py @@ -46,11 +46,13 @@ def complex_data_dir(tmp_path): data_dir = tmp_path / "complex_data_dir" data_dir.mkdir() + (data_dir / "data").mkdir() with open(data_dir / "data" / "train.txt", "w") as f: f.write("foo\n" * 10) with open(data_dir / "data" / "test.txt", "w") as f: f.write("bar\n" * 10) + with open(data_dir / "README.md", "w") as f: f.write("This is a readme") with open(data_dir / ".dummy", "w") as f: @@ -100,7 +102,7 @@ def pattern_results(complex_data_dir): return { pattern: sorted( - str(Path(path).resolve()) + str(Path(os.path.abspath(path))) for path in fsspec.filesystem("file").glob(os.path.join(complex_data_dir, pattern)) if Path(path).name not in _FILES_TO_IGNORE and not any( @@ -268,6 +270,14 @@ def test_fail_resolve_patterns_locally_or_by_urls(complex_data_dir): resolve_patterns_locally_or_by_urls(complex_data_dir, ["blablabla"]) +@pytest.mark.skipif(os.name == "nt", reason="Windows does not support symlinks in the default mode") +def test_resolve_patterns_locally_or_by_urls_does_not_resolve_symbolic_links(tmp_path, complex_data_dir): + (tmp_path / "train_data_symlink.txt").symlink_to(os.path.join(complex_data_dir, "data", "train.txt")) + resolved_data_files = resolve_patterns_locally_or_by_urls(str(tmp_path), ["train_data_symlink.txt"]) + assert len(resolved_data_files) == 1 + assert resolved_data_files[0] == tmp_path / "train_data_symlink.txt" + + def test_resolve_patterns_locally_or_by_urls_sorted_files(tmp_path_factory): path = str(tmp_path_factory.mktemp("unsorted_text_files")) unsorted_names = ["0.txt", "2.txt", "3.txt"]