Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/datasets/download/download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,11 @@ def _iter_from_paths(cls, urlpaths: Union[str, List[str]]) -> Generator[str, Non
for dirpath, dirnames, filenames in os.walk(urlpath):
# skipping hidden directories; prune the search
# [:] for the in-place list modification required by os.walk
dirnames[:] = [dirname for dirname in dirnames if not dirname.startswith((".", "__"))]
dirnames[:] = sorted([dirname for dirname in dirnames if not dirname.startswith((".", "__"))])
if os.path.basename(dirpath).startswith((".", "__")):
# skipping hidden directories
continue
for filename in filenames:
for filename in sorted(filenames):
if filename.startswith((".", "__")):
# skipping hidden files
continue
Expand Down
5 changes: 3 additions & 2 deletions src/datasets/download/mock_download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,10 +236,11 @@ def iter_files(self, paths):
return
yield path
else:
for dirpath, _, filenames in os.walk(path):
for dirpath, dirnames, filenames in os.walk(path):
if os.path.basename(dirpath).startswith((".", "__")):
continue
for filename in filenames:
dirnames.sort()
for filename in sorted(filenames):
if filename.startswith((".", "__")):
continue
yield os.path.join(dirpath, filename)
4 changes: 2 additions & 2 deletions src/datasets/download/streaming_download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,11 +835,11 @@ def _iter_from_urlpaths(
# skipping hidden directories; prune the search
# [:] for the in-place list modification required by os.walk
# (only works for local paths as fsspec's walk doesn't support the in-place modification)
dirnames[:] = [dirname for dirname in dirnames if not dirname.startswith((".", "__"))]
dirnames[:] = sorted([dirname for dirname in dirnames if not dirname.startswith((".", "__"))])
if xbasename(dirpath).startswith((".", "__")):
# skipping hidden directories
continue
for filename in filenames:
for filename in sorted(filenames):
if filename.startswith((".", "__")):
# skipping hidden files
continue
Expand Down
2 changes: 1 addition & 1 deletion tests/test_download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,5 +141,5 @@ def test_iter_archive_file(tar_nested_jsonl_path):
def test_iter_files(data_dir_with_hidden_files):
dl_manager = DownloadManager()
for num_file, file in enumerate(dl_manager.iter_files(data_dir_with_hidden_files), start=1):
pass
assert os.path.basename(file) == ("test.txt" if num_file == 1 else "train.txt")
assert num_file == 2
2 changes: 1 addition & 1 deletion tests/test_streaming_download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,5 +834,5 @@ def test_iter_archive_file(tar_nested_jsonl_path):
def test_iter_files(data_dir_with_hidden_files):
dl_manager = StreamingDownloadManager()
for num_file, file in enumerate(dl_manager.iter_files(data_dir_with_hidden_files), start=1):
pass
assert os.path.basename(file) == ("test.txt" if num_file == 1 else "train.txt")
assert num_file == 2