From ddbad4d1691ba2c5f1adad78dd27207d25a23b8b Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 15 Nov 2023 19:48:39 +0100 Subject: [PATCH 1/5] More robust temporary directory deletion --- src/datasets/arrow_dataset.py | 2 - src/datasets/commands/datasets_cli.py | 2 + src/datasets/commands/delete_temp_cache.py | 30 ++++++++++ src/datasets/config.py | 3 + src/datasets/fingerprint.py | 65 +++++++--------------- tests/commands/test_delete_temp_cache.py | 21 +++++++ 6 files changed, 76 insertions(+), 47 deletions(-) create mode 100644 src/datasets/commands/delete_temp_cache.py create mode 100644 tests/commands/test_delete_temp_cache.py diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index aeee3b13419..b8b8e63fa14 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -86,7 +86,6 @@ generate_random_fingerprint, get_temporary_cache_files_directory, is_caching_enabled, - maybe_register_dataset_for_temp_dir_deletion, update_fingerprint, validate_fingerprint, ) @@ -671,7 +670,6 @@ def __init__( self._data: Table = _check_table(arrow_table) self._indices: Optional[Table] = _check_table(indices_table) if indices_table is not None else None - maybe_register_dataset_for_temp_dir_deletion(self) self._format_type: Optional[str] = None self._format_kwargs: dict = {} diff --git a/src/datasets/commands/datasets_cli.py b/src/datasets/commands/datasets_cli.py index 927518e311c..e61d319b657 100644 --- a/src/datasets/commands/datasets_cli.py +++ b/src/datasets/commands/datasets_cli.py @@ -2,6 +2,7 @@ from argparse import ArgumentParser from datasets.commands.convert import ConvertCommand +from datasets.commands.delete_temp_cache import DeleteTempCacheCommand from datasets.commands.dummy_data import DummyDataCommand from datasets.commands.env import EnvironmentCommand from datasets.commands.run_beam import RunBeamCommand @@ -26,6 +27,7 @@ def main(): TestCommand.register_subcommand(commands_parser) RunBeamCommand.register_subcommand(commands_parser) DummyDataCommand.register_subcommand(commands_parser) + DeleteTempCacheCommand.register_subcommand(commands_parser) # Parse args args, unknown_args = parser.parse_known_args() diff --git a/src/datasets/commands/delete_temp_cache.py b/src/datasets/commands/delete_temp_cache.py new file mode 100644 index 00000000000..3c83b5da5c4 --- /dev/null +++ b/src/datasets/commands/delete_temp_cache.py @@ -0,0 +1,30 @@ +import os +import shutil +import tempfile +from argparse import ArgumentParser + +from datasets import config +from datasets.commands import BaseDatasetsCLICommand + + +def delete_temp_cache_command_factory(_): + return DeleteTempCacheCommand() + + +class DeleteTempCacheCommand(BaseDatasetsCLICommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + download_parser = parser.add_parser("delete-temp-cache", help="Print relevant system environment info.") + download_parser.set_defaults(func=delete_temp_cache_command_factory) + + def run(self): + num_deleted = 0 + for name in os.listdir(tempfile.gettempdir()): + path = os.path.join(tempfile.gettempdir(), name) + if name.startswith(config.TEMP_CACHE_DIR_PREFIX) and os.path.isdir(path): + shutil.rmtree(path) + num_deleted += 1 + if num_deleted == 0: + print("No temporary cache directories to delete.") + else: + print(f"Deleted {num_deleted} temporary cache" + "directories" if num_deleted > 1 else "directory" + ".") diff --git a/src/datasets/config.py b/src/datasets/config.py index 3e1f20475ac..5ef4bd54a6d 100644 --- a/src/datasets/config.py +++ b/src/datasets/config.py @@ -209,6 +209,9 @@ MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules" +# Temporary cache directory prefix +TEMP_CACHE_DIR_PREFIX = "hf_datasets-" + MAX_DATASET_CONFIG_ID_READABLE_LENGTH = 255 # Streaming diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py index 7d73758a049..76488f980bb 100644 --- a/src/datasets/fingerprint.py +++ b/src/datasets/fingerprint.py @@ -6,13 +6,14 @@ import tempfile import weakref from functools import wraps -from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import pyarrow as pa import xxhash +from packaging import version +from . import config from .info import DatasetInfo from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH from .table import ConcatenationTable, InMemoryTable, MemoryMappedTable, Table @@ -42,57 +43,37 @@ ################# _CACHING_ENABLED = True -_TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional["_TempDirWithCustomCleanup"] = None -_DATASETS_WITH_TABLE_IN_TEMP_DIR: Optional[weakref.WeakSet] = None +_TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional["_TempCacheDir"] = None -class _TempDirWithCustomCleanup: +class _TempCacheDir: """ - A temporary directory with a custom cleanup function. - We need a custom temporary directory cleanup in order to delete the dataset objects that have - cache files in the temporary directory before deleting the dorectory itself. + A temporary directory for storing cached Arrow files with a custom cleanup. """ - def __init__(self, cleanup_func=None, *cleanup_func_args, **cleanup_func_kwargs): - self.name = tempfile.mkdtemp() + def __init__(self): + self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX) self._finalizer = weakref.finalize(self, self._cleanup) - self._cleanup_func = cleanup_func - self._cleanup_func_args = cleanup_func_args - self._cleanup_func_kwargs = cleanup_func_kwargs def _cleanup(self): - self._cleanup_func(*self._cleanup_func_args, **self._cleanup_func_kwargs) - if os.path.exists(self.name): - shutil.rmtree(self.name) + # A PermissionError can happen on Windows if a datasets referencing + # the files from the cache directory is not garbage collected + def onexc(func, path, exc_info): + if isinstance(exc_info, PermissionError): + logger.warning( + f"Failed to remove temporary cache directory {self.name}. Run `datasets-cli delete-temp-cache` to remove this directory manually." + ) + + if config.PY_VERSION <= version.parse("3.12"): + shutil.rmtree(self.name, onerror=onexc) + else: + shutil.rmtree(self.name, onexc=onexc) def cleanup(self): if self._finalizer.detach(): self._cleanup() -def maybe_register_dataset_for_temp_dir_deletion(dataset): - """ - This function registers the datasets that have cache files in _TEMP_DIR_FOR_TEMP_CACHE_FILES in order - to properly delete them before deleting the temporary directory. - The temporary directory _TEMP_DIR_FOR_TEMP_CACHE_FILES is used when caching is disabled. - """ - if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None: - return - - global _DATASETS_WITH_TABLE_IN_TEMP_DIR - if _DATASETS_WITH_TABLE_IN_TEMP_DIR is None: - _DATASETS_WITH_TABLE_IN_TEMP_DIR = weakref.WeakSet() - if any( - Path(_TEMP_DIR_FOR_TEMP_CACHE_FILES.name) in Path(cache_file["filename"]).parents - for cache_file in dataset.cache_files - ): - _DATASETS_WITH_TABLE_IN_TEMP_DIR.add(dataset) - - -def get_datasets_with_cache_file_in_temp_dir(): - return list(_DATASETS_WITH_TABLE_IN_TEMP_DIR) if _DATASETS_WITH_TABLE_IN_TEMP_DIR is not None else [] - - def enable_caching(): """ When applying transforms on a dataset, the data are stored in cache files. @@ -184,13 +165,7 @@ def get_temporary_cache_files_directory() -> str: """Return a directory that is deleted when session closes.""" global _TEMP_DIR_FOR_TEMP_CACHE_FILES if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None: - # Avoids a PermissionError on Windows caused by the datasets referencing - # the files from the cache directory on clean-up - def cleanup_func(): - for dset in get_datasets_with_cache_file_in_temp_dir(): - dset.__del__() - - _TEMP_DIR_FOR_TEMP_CACHE_FILES = _TempDirWithCustomCleanup(cleanup_func=cleanup_func) + _TEMP_DIR_FOR_TEMP_CACHE_FILES = _TempCacheDir() return _TEMP_DIR_FOR_TEMP_CACHE_FILES.name diff --git a/tests/commands/test_delete_temp_cache.py b/tests/commands/test_delete_temp_cache.py new file mode 100644 index 00000000000..300a8e73713 --- /dev/null +++ b/tests/commands/test_delete_temp_cache.py @@ -0,0 +1,21 @@ +from pathlib import Path + +import pytest + +from datasets.commands.delete_temp_cache import DeleteTempCacheCommand +from datasets.fingerprint import get_temporary_cache_files_directory + + +@pytest.mark.integration +def test_delete_temp_cache_command(tmp_path, monkeypatch): + monkeypatch.setattr("tempfile.tempdir", str(tmp_path)) + hf_temp_cache_dir = Path(get_temporary_cache_files_directory()) + dummy_dir = tmp_path / "dummy_dir" + dummy_dir.mkdir() + dummy_file = tmp_path / "dummy.txt" + with open(dummy_file, "w"): + pass + assert hf_temp_cache_dir.exists() and dummy_dir.exists() and dummy_file.exists() + command = DeleteTempCacheCommand() + command.run() + assert not hf_temp_cache_dir.exists() and dummy_dir.exists() and dummy_file.exists() From 2d51f37eb9996d4c52250ee6e987ccce0d74f2f4 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Wed, 15 Nov 2023 20:43:15 +0100 Subject: [PATCH 2/5] Fix test --- tests/test_arrow_dataset.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py index 75a0d3559b9..63a73a3b122 100644 --- a/tests/test_arrow_dataset.py +++ b/tests/test_arrow_dataset.py @@ -1392,8 +1392,14 @@ def test_map_caching(self, in_memory): self.assertEqual(len(dset_test2.cache_files), 1) self.assertNotIn("Loading cached processed dataset", self._caplog.text) # make sure the arrow files are going to be removed - self.assertIn("tmp", dset_test1.cache_files[0]["filename"]) - self.assertIn("tmp", dset_test2.cache_files[0]["filename"]) + self.assertIn( + Path(tempfile.gettempdir()), + Path(dset_test1.cache_files[0]["filename"]).parents, + ) + self.assertIn( + Path(tempfile.gettempdir()), + Path(dset_test2.cache_files[0]["filename"]).parents, + ) finally: datasets.enable_caching() @@ -3987,11 +3993,11 @@ def test_build_local_temp_path(uri_or_path): extracted_path = strip_protocol(uri_or_path) local_temp_path = Dataset._build_local_temp_path(extracted_path).as_posix() extracted_path_without_anchor = Path(extracted_path).relative_to(Path(extracted_path).anchor).as_posix() - path_relative_to_tmp_dir = local_temp_path.split("tmp")[-1].split("/", 1)[1] + # Check that the local temp path is relative to the system temp dir + path_relative_to_tmp_dir = Path(local_temp_path).relative_to(Path(tempfile.gettempdir())).as_posix() assert ( - "tmp" in local_temp_path - and "hdfs" not in path_relative_to_tmp_dir + "hdfs" not in path_relative_to_tmp_dir and "s3" not in path_relative_to_tmp_dir and not local_temp_path.startswith(extracted_path_without_anchor) and local_temp_path.endswith(extracted_path_without_anchor) From 7eaad71464e85c7358eaa36494227a43257ffcd8 Mon Sep 17 00:00:00 2001 From: mariosasko Date: Thu, 16 Nov 2023 02:12:28 +0100 Subject: [PATCH 3/5] Improve error message --- src/datasets/fingerprint.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py index 76488f980bb..d64c710c1d3 100644 --- a/src/datasets/fingerprint.py +++ b/src/datasets/fingerprint.py @@ -58,10 +58,10 @@ def __init__(self): def _cleanup(self): # A PermissionError can happen on Windows if a datasets referencing # the files from the cache directory is not garbage collected - def onexc(func, path, exc_info): - if isinstance(exc_info, PermissionError): + def onexc(func, path, exc): + if isinstance(exc, PermissionError): logger.warning( - f"Failed to remove temporary cache directory {self.name}. Run `datasets-cli delete-temp-cache` to remove this directory manually." + f"Failed to remove temporary cache directory {self.name} ({type(exc).__name__}: {exc}). Run `datasets-cli delete-temp-cache` to remove this directory manually." ) if config.PY_VERSION <= version.parse("3.12"): From 3f42f2269c0405584633c25e1381228b31aaa85d Mon Sep 17 00:00:00 2001 From: mariosasko Date: Fri, 1 Dec 2023 00:53:08 +0100 Subject: [PATCH 4/5] Revert "More robust temporary directory deletion" This reverts commit ddbad4d1691ba2c5f1adad78dd27207d25a23b8b. --- src/datasets/arrow_dataset.py | 2 + src/datasets/commands/datasets_cli.py | 2 - src/datasets/commands/delete_temp_cache.py | 30 ---------- src/datasets/config.py | 3 - src/datasets/fingerprint.py | 65 +++++++++++++++------- tests/commands/test_delete_temp_cache.py | 21 ------- 6 files changed, 47 insertions(+), 76 deletions(-) delete mode 100644 src/datasets/commands/delete_temp_cache.py delete mode 100644 tests/commands/test_delete_temp_cache.py diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 26f4bb1d558..cecf8d454a3 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -86,6 +86,7 @@ generate_random_fingerprint, get_temporary_cache_files_directory, is_caching_enabled, + maybe_register_dataset_for_temp_dir_deletion, update_fingerprint, validate_fingerprint, ) @@ -670,6 +671,7 @@ def __init__( self._data: Table = _check_table(arrow_table) self._indices: Optional[Table] = _check_table(indices_table) if indices_table is not None else None + maybe_register_dataset_for_temp_dir_deletion(self) self._format_type: Optional[str] = None self._format_kwargs: dict = {} diff --git a/src/datasets/commands/datasets_cli.py b/src/datasets/commands/datasets_cli.py index e61d319b657..927518e311c 100644 --- a/src/datasets/commands/datasets_cli.py +++ b/src/datasets/commands/datasets_cli.py @@ -2,7 +2,6 @@ from argparse import ArgumentParser from datasets.commands.convert import ConvertCommand -from datasets.commands.delete_temp_cache import DeleteTempCacheCommand from datasets.commands.dummy_data import DummyDataCommand from datasets.commands.env import EnvironmentCommand from datasets.commands.run_beam import RunBeamCommand @@ -27,7 +26,6 @@ def main(): TestCommand.register_subcommand(commands_parser) RunBeamCommand.register_subcommand(commands_parser) DummyDataCommand.register_subcommand(commands_parser) - DeleteTempCacheCommand.register_subcommand(commands_parser) # Parse args args, unknown_args = parser.parse_known_args() diff --git a/src/datasets/commands/delete_temp_cache.py b/src/datasets/commands/delete_temp_cache.py deleted file mode 100644 index 3c83b5da5c4..00000000000 --- a/src/datasets/commands/delete_temp_cache.py +++ /dev/null @@ -1,30 +0,0 @@ -import os -import shutil -import tempfile -from argparse import ArgumentParser - -from datasets import config -from datasets.commands import BaseDatasetsCLICommand - - -def delete_temp_cache_command_factory(_): - return DeleteTempCacheCommand() - - -class DeleteTempCacheCommand(BaseDatasetsCLICommand): - @staticmethod - def register_subcommand(parser: ArgumentParser): - download_parser = parser.add_parser("delete-temp-cache", help="Print relevant system environment info.") - download_parser.set_defaults(func=delete_temp_cache_command_factory) - - def run(self): - num_deleted = 0 - for name in os.listdir(tempfile.gettempdir()): - path = os.path.join(tempfile.gettempdir(), name) - if name.startswith(config.TEMP_CACHE_DIR_PREFIX) and os.path.isdir(path): - shutil.rmtree(path) - num_deleted += 1 - if num_deleted == 0: - print("No temporary cache directories to delete.") - else: - print(f"Deleted {num_deleted} temporary cache" + "directories" if num_deleted > 1 else "directory" + ".") diff --git a/src/datasets/config.py b/src/datasets/config.py index 5ef4bd54a6d..3e1f20475ac 100644 --- a/src/datasets/config.py +++ b/src/datasets/config.py @@ -209,9 +209,6 @@ MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules" -# Temporary cache directory prefix -TEMP_CACHE_DIR_PREFIX = "hf_datasets-" - MAX_DATASET_CONFIG_ID_READABLE_LENGTH = 255 # Streaming diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py index d64c710c1d3..7d73758a049 100644 --- a/src/datasets/fingerprint.py +++ b/src/datasets/fingerprint.py @@ -6,14 +6,13 @@ import tempfile import weakref from functools import wraps +from pathlib import Path from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import pyarrow as pa import xxhash -from packaging import version -from . import config from .info import DatasetInfo from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH from .table import ConcatenationTable, InMemoryTable, MemoryMappedTable, Table @@ -43,37 +42,57 @@ ################# _CACHING_ENABLED = True -_TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional["_TempCacheDir"] = None +_TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional["_TempDirWithCustomCleanup"] = None +_DATASETS_WITH_TABLE_IN_TEMP_DIR: Optional[weakref.WeakSet] = None -class _TempCacheDir: +class _TempDirWithCustomCleanup: """ - A temporary directory for storing cached Arrow files with a custom cleanup. + A temporary directory with a custom cleanup function. + We need a custom temporary directory cleanup in order to delete the dataset objects that have + cache files in the temporary directory before deleting the dorectory itself. """ - def __init__(self): - self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX) + def __init__(self, cleanup_func=None, *cleanup_func_args, **cleanup_func_kwargs): + self.name = tempfile.mkdtemp() self._finalizer = weakref.finalize(self, self._cleanup) + self._cleanup_func = cleanup_func + self._cleanup_func_args = cleanup_func_args + self._cleanup_func_kwargs = cleanup_func_kwargs def _cleanup(self): - # A PermissionError can happen on Windows if a datasets referencing - # the files from the cache directory is not garbage collected - def onexc(func, path, exc): - if isinstance(exc, PermissionError): - logger.warning( - f"Failed to remove temporary cache directory {self.name} ({type(exc).__name__}: {exc}). Run `datasets-cli delete-temp-cache` to remove this directory manually." - ) - - if config.PY_VERSION <= version.parse("3.12"): - shutil.rmtree(self.name, onerror=onexc) - else: - shutil.rmtree(self.name, onexc=onexc) + self._cleanup_func(*self._cleanup_func_args, **self._cleanup_func_kwargs) + if os.path.exists(self.name): + shutil.rmtree(self.name) def cleanup(self): if self._finalizer.detach(): self._cleanup() +def maybe_register_dataset_for_temp_dir_deletion(dataset): + """ + This function registers the datasets that have cache files in _TEMP_DIR_FOR_TEMP_CACHE_FILES in order + to properly delete them before deleting the temporary directory. + The temporary directory _TEMP_DIR_FOR_TEMP_CACHE_FILES is used when caching is disabled. + """ + if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None: + return + + global _DATASETS_WITH_TABLE_IN_TEMP_DIR + if _DATASETS_WITH_TABLE_IN_TEMP_DIR is None: + _DATASETS_WITH_TABLE_IN_TEMP_DIR = weakref.WeakSet() + if any( + Path(_TEMP_DIR_FOR_TEMP_CACHE_FILES.name) in Path(cache_file["filename"]).parents + for cache_file in dataset.cache_files + ): + _DATASETS_WITH_TABLE_IN_TEMP_DIR.add(dataset) + + +def get_datasets_with_cache_file_in_temp_dir(): + return list(_DATASETS_WITH_TABLE_IN_TEMP_DIR) if _DATASETS_WITH_TABLE_IN_TEMP_DIR is not None else [] + + def enable_caching(): """ When applying transforms on a dataset, the data are stored in cache files. @@ -165,7 +184,13 @@ def get_temporary_cache_files_directory() -> str: """Return a directory that is deleted when session closes.""" global _TEMP_DIR_FOR_TEMP_CACHE_FILES if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None: - _TEMP_DIR_FOR_TEMP_CACHE_FILES = _TempCacheDir() + # Avoids a PermissionError on Windows caused by the datasets referencing + # the files from the cache directory on clean-up + def cleanup_func(): + for dset in get_datasets_with_cache_file_in_temp_dir(): + dset.__del__() + + _TEMP_DIR_FOR_TEMP_CACHE_FILES = _TempDirWithCustomCleanup(cleanup_func=cleanup_func) return _TEMP_DIR_FOR_TEMP_CACHE_FILES.name diff --git a/tests/commands/test_delete_temp_cache.py b/tests/commands/test_delete_temp_cache.py deleted file mode 100644 index 300a8e73713..00000000000 --- a/tests/commands/test_delete_temp_cache.py +++ /dev/null @@ -1,21 +0,0 @@ -from pathlib import Path - -import pytest - -from datasets.commands.delete_temp_cache import DeleteTempCacheCommand -from datasets.fingerprint import get_temporary_cache_files_directory - - -@pytest.mark.integration -def test_delete_temp_cache_command(tmp_path, monkeypatch): - monkeypatch.setattr("tempfile.tempdir", str(tmp_path)) - hf_temp_cache_dir = Path(get_temporary_cache_files_directory()) - dummy_dir = tmp_path / "dummy_dir" - dummy_dir.mkdir() - dummy_file = tmp_path / "dummy.txt" - with open(dummy_file, "w"): - pass - assert hf_temp_cache_dir.exists() and dummy_dir.exists() and dummy_file.exists() - command = DeleteTempCacheCommand() - command.run() - assert not hf_temp_cache_dir.exists() and dummy_dir.exists() and dummy_file.exists() From c166692aa955528180dd4d55474a984f6044896d Mon Sep 17 00:00:00 2001 From: mariosasko Date: Fri, 1 Dec 2023 01:16:45 +0100 Subject: [PATCH 5/5] Register temp cache files for deletion on copy/pickle --- src/datasets/arrow_dataset.py | 5 +++++ src/datasets/config.py | 3 +++ src/datasets/fingerprint.py | 35 ++++++++++++++++------------------- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index cecf8d454a3..178e43eb8ca 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1360,6 +1360,11 @@ def from_sql( **kwargs, ).read() + def __setstate__(self, state): + self.__dict__.update(state) + maybe_register_dataset_for_temp_dir_deletion(self) + return self + def __del__(self): if hasattr(self, "_data"): del self._data diff --git a/src/datasets/config.py b/src/datasets/config.py index 3e1f20475ac..e604593949a 100644 --- a/src/datasets/config.py +++ b/src/datasets/config.py @@ -211,6 +211,9 @@ MAX_DATASET_CONFIG_ID_READABLE_LENGTH = 255 +# Temporary cache directory prefix +TEMP_CACHE_DIR_PREFIX = "hf_datasets-" + # Streaming STREAMING_READ_MAX_RETRIES = 20 STREAMING_READ_RETRY_INTERVAL = 5 diff --git a/src/datasets/fingerprint.py b/src/datasets/fingerprint.py index 7d73758a049..f1b8d458899 100644 --- a/src/datasets/fingerprint.py +++ b/src/datasets/fingerprint.py @@ -13,6 +13,7 @@ import pyarrow as pa import xxhash +from . import config from .info import DatasetInfo from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH from .table import ConcatenationTable, InMemoryTable, MemoryMappedTable, Table @@ -42,28 +43,30 @@ ################# _CACHING_ENABLED = True -_TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional["_TempDirWithCustomCleanup"] = None +_TEMP_DIR_FOR_TEMP_CACHE_FILES: Optional["_TempCacheDir"] = None _DATASETS_WITH_TABLE_IN_TEMP_DIR: Optional[weakref.WeakSet] = None -class _TempDirWithCustomCleanup: +class _TempCacheDir: """ - A temporary directory with a custom cleanup function. - We need a custom temporary directory cleanup in order to delete the dataset objects that have - cache files in the temporary directory before deleting the dorectory itself. + A temporary directory for storing cached Arrow files with a cleanup that frees references to the Arrow files + before deleting the directory itself to avoid permission errors on Windows. """ - def __init__(self, cleanup_func=None, *cleanup_func_args, **cleanup_func_kwargs): - self.name = tempfile.mkdtemp() + def __init__(self): + self.name = tempfile.mkdtemp(prefix=config.TEMP_CACHE_DIR_PREFIX) self._finalizer = weakref.finalize(self, self._cleanup) - self._cleanup_func = cleanup_func - self._cleanup_func_args = cleanup_func_args - self._cleanup_func_kwargs = cleanup_func_kwargs def _cleanup(self): - self._cleanup_func(*self._cleanup_func_args, **self._cleanup_func_kwargs) + for dset in get_datasets_with_cache_file_in_temp_dir(): + dset.__del__() if os.path.exists(self.name): - shutil.rmtree(self.name) + try: + shutil.rmtree(self.name) + except Exception as e: + raise OSError( + f"An error occured while trying to delete temporary cache directory {self.name}. Please delete it manually." + ) from e def cleanup(self): if self._finalizer.detach(): @@ -184,13 +187,7 @@ def get_temporary_cache_files_directory() -> str: """Return a directory that is deleted when session closes.""" global _TEMP_DIR_FOR_TEMP_CACHE_FILES if _TEMP_DIR_FOR_TEMP_CACHE_FILES is None: - # Avoids a PermissionError on Windows caused by the datasets referencing - # the files from the cache directory on clean-up - def cleanup_func(): - for dset in get_datasets_with_cache_file_in_temp_dir(): - dset.__del__() - - _TEMP_DIR_FOR_TEMP_CACHE_FILES = _TempDirWithCustomCleanup(cleanup_func=cleanup_func) + _TEMP_DIR_FOR_TEMP_CACHE_FILES = _TempCacheDir() return _TEMP_DIR_FOR_TEMP_CACHE_FILES.name