diff --git a/src/datasets/download/download_manager.py b/src/datasets/download/download_manager.py index ef06e6a7fe3..d3d89255d13 100644 --- a/src/datasets/download/download_manager.py +++ b/src/datasets/download/download_manager.py @@ -130,11 +130,11 @@ def _iter_from_paths(cls, urlpaths: Union[str, List[str]]) -> Generator[str, Non for dirpath, dirnames, filenames in os.walk(urlpath): # skipping hidden directories; prune the search # [:] for the in-place list modification required by os.walk - dirnames[:] = [dirname for dirname in dirnames if not dirname.startswith((".", "__"))] + dirnames[:] = sorted([dirname for dirname in dirnames if not dirname.startswith((".", "__"))]) if os.path.basename(dirpath).startswith((".", "__")): # skipping hidden directories continue - for filename in filenames: + for filename in sorted(filenames): if filename.startswith((".", "__")): # skipping hidden files continue diff --git a/src/datasets/download/mock_download_manager.py b/src/datasets/download/mock_download_manager.py index 46ac34076e4..84321e708d5 100644 --- a/src/datasets/download/mock_download_manager.py +++ b/src/datasets/download/mock_download_manager.py @@ -236,10 +236,11 @@ def iter_files(self, paths): return yield path else: - for dirpath, _, filenames in os.walk(path): + for dirpath, dirnames, filenames in os.walk(path): if os.path.basename(dirpath).startswith((".", "__")): continue - for filename in filenames: + dirnames.sort() + for filename in sorted(filenames): if filename.startswith((".", "__")): continue yield os.path.join(dirpath, filename) diff --git a/src/datasets/download/streaming_download_manager.py b/src/datasets/download/streaming_download_manager.py index 42edd42ce85..37d7b732e65 100644 --- a/src/datasets/download/streaming_download_manager.py +++ b/src/datasets/download/streaming_download_manager.py @@ -835,11 +835,11 @@ def _iter_from_urlpaths( # skipping hidden directories; prune the search # [:] for the in-place list modification required by os.walk # (only works for local paths as fsspec's walk doesn't support the in-place modification) - dirnames[:] = [dirname for dirname in dirnames if not dirname.startswith((".", "__"))] + dirnames[:] = sorted([dirname for dirname in dirnames if not dirname.startswith((".", "__"))]) if xbasename(dirpath).startswith((".", "__")): # skipping hidden directories continue - for filename in filenames: + for filename in sorted(filenames): if filename.startswith((".", "__")): # skipping hidden files continue diff --git a/tests/test_download_manager.py b/tests/test_download_manager.py index 695884cd0f7..0dfc0a2098e 100644 --- a/tests/test_download_manager.py +++ b/tests/test_download_manager.py @@ -141,5 +141,5 @@ def test_iter_archive_file(tar_nested_jsonl_path): def test_iter_files(data_dir_with_hidden_files): dl_manager = DownloadManager() for num_file, file in enumerate(dl_manager.iter_files(data_dir_with_hidden_files), start=1): - pass + assert os.path.basename(file) == ("test.txt" if num_file == 1 else "train.txt") assert num_file == 2 diff --git a/tests/test_streaming_download_manager.py b/tests/test_streaming_download_manager.py index 7452e5dacf6..34a5b45e4b0 100644 --- a/tests/test_streaming_download_manager.py +++ b/tests/test_streaming_download_manager.py @@ -834,5 +834,5 @@ def test_iter_archive_file(tar_nested_jsonl_path): def test_iter_files(data_dir_with_hidden_files): dl_manager = StreamingDownloadManager() for num_file, file in enumerate(dl_manager.iter_files(data_dir_with_hidden_files), start=1): - pass + assert os.path.basename(file) == ("test.txt" if num_file == 1 else "train.txt") assert num_file == 2