Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,8 @@ def _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs
+ str(e)
)

dl_manager.manage_extracted_files()

if verify_infos:
verify_splits(self.info.splits, split_dict)

Expand Down
11 changes: 11 additions & 0 deletions src/datasets/utils/download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,3 +290,14 @@ def download_and_extract(self, url_or_urls):

def get_recorded_sizes_checksums(self):
return self._recorded_sizes_checksums.copy()

def delete_extracted_files(self):
    """Delete extracted files/directories that are not also downloaded files.

    Paths present in both ``extracted_paths`` and ``downloaded_paths`` are
    kept, since removing them would delete the original download. Entries
    for deleted paths are also dropped from ``extracted_paths``.
    """
    import shutil

    # Only delete extracted artifacts that are not themselves downloads.
    paths_to_delete = set(self.extracted_paths.values()) - set(self.downloaded_paths.values())
    # Iterate over a snapshot because we mutate the dict inside the loop.
    for key, path in list(self.extracted_paths.items()):
        if path in paths_to_delete:
            # Extracted archives may be directories (presumably zip/tar
            # extraction — confirm against the extract implementation);
            # os.remove only handles plain files.
            if os.path.isdir(path):
                shutil.rmtree(path)
            elif os.path.exists(path):
                os.remove(path)
            del self.extracted_paths[key]

def manage_extracted_files(self):
    """Honor the ``keep_extracted`` download option.

    When ``keep_extracted`` is False (the default), delete extracted files
    that are not needed anymore; otherwise leave everything in place.
    """
    if self._download_config.keep_extracted:
        return
    self.delete_extracted_files()
2 changes: 2 additions & 0 deletions src/datasets/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ class DownloadConfig:
extract the compressed file in a folder along the archive.
force_extract (:obj:`bool`, default ``False``): If True when extract_compressed_file is True and the archive
was already extracted, re-extract the archive and override the folder where it was extracted.
keep_extracted (:obj:`bool`, default ``False``): Whether to keep (or delete) the extracted files.
use_etag (:obj:`bool`, default ``True``):
num_proc (:obj:`int`, optional):
max_retries (:obj:`int`, default ``1``): The number of times to retry an HTTP request if it fails.
Expand All @@ -232,6 +233,7 @@ class DownloadConfig:
user_agent: Optional[str] = None
extract_compressed_file: bool = False
force_extract: bool = False
keep_extracted: bool = False
use_etag: bool = True
num_proc: Optional[int] = None
max_retries: int = 1
Expand Down
6 changes: 6 additions & 0 deletions src/datasets/utils/mock_download_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,3 +201,9 @@ def create_dummy_data_single(self, path_to_dummy_data, data_url):
# while now we expected the dummy_data.zip file to be a directory containing
# the downloaded file.
return path_to_dummy_data

def delete_extracted_files(self):
    """No-op override: the mock download manager has nothing to delete."""
    pass

def manage_extracted_files(self):
    """No-op override so builders can call this on the mock manager without side effects."""
    pass
8 changes: 8 additions & 0 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,3 +345,11 @@ def test_remote_data_files():
assert isinstance(ds, IterableDataset)
ds_item = next(iter(ds))
assert ds_item.keys() == {"langs", "ner_tags", "spans", "tokens"}


def test_load_dataset_deletes_extracted_files(jsonl_gz_path, tmp_path):
    """Loading a compressed dataset leaves no files behind in the extraction cache."""
    cache_dir = tmp_path / "cache"
    dataset = load_dataset("json", split="train", data_files=jsonl_gz_path, cache_dir=cache_dir)
    expected_first_row = {"col_1": "0", "col_2": 0, "col_3": 0.0}
    assert dataset[0] == expected_first_row
    extracted_dir = cache_dir / "downloads" / "extracted"
    assert sorted(extracted_dir.iterdir()) == []