Merged
Changes from 6 commits
13 changes: 11 additions & 2 deletions src/datasets/builder.py
@@ -381,7 +381,7 @@ def builder_configs(cls):
     def cache_dir(self):
         return self._cache_dir

-    def _relative_data_dir(self, with_version=True, with_hash=True):
+    def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
         """Relative path of this dataset in cache_dir:
         Will be:
             self.name/self.config.version/self.hash/
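For concreteness, a hypothetical illustration (not from this PR) of the layout this method describes:

# Hypothetical values: builder "squad", version "1.0.0", config hash "1a2b3c4d"
#   _relative_data_dir() -> "squad/1.0.0/1a2b3c4d"
# The absolute cache_dir prefix is never part of this identifier.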
@@ -845,7 +845,16 @@ def _as_dataset(self, split: Union[ReadInstruction, Split] = Split.TRAIN, in_mem
             split_infos=self.info.splits.values(),
             in_memory=in_memory,
         )
-        return Dataset(**dataset_kwargs)
+        fingerprint = self._get_dataset_fingerprint(split)
+        return Dataset(fingerprint=fingerprint, **dataset_kwargs)
+
+    def _get_dataset_fingerprint(self, split: Union[ReadInstruction, Split]) -> str:
+        """The dataset fingerprint is the hash of the relative directory dataset_name/config_name/version/hash, as well as the split specs."""
+        hasher = Hasher()
+        hasher.update(self._relative_data_dir().replace(os.sep, "/"))
+        hasher.update(str(split))  # for example: train, train+test, train[:10%], test[:33%](pct1_dropremainder)
+        fingerprint = hasher.hexdigest()
+        return fingerprint

     def _post_process(self, dataset: Dataset, resources_paths: Dict[str, str]) -> Optional[Dataset]:
         """Run dataset transforms or add indexes"""
23 changes: 21 additions & 2 deletions src/datasets/utils/filelock.py
@@ -62,7 +62,14 @@

 # Data
 # ------------------------------------------------
-__all__ = ["Timeout", "BaseFileLock", "WindowsFileLock", "UnixFileLock", "SoftFileLock", "FileLock"]
+__all__ = [
+    "Timeout",
+    "BaseFileLock",
+    "WindowsFileLock",
+    "UnixFileLock",
+    "SoftFileLock",
+    "FileLock",
+]

 __version__ = "3.0.12"

@@ -125,8 +132,10 @@ class BaseFileLock:
     Implements the base class of a file lock.
     """

-    def __init__(self, lock_file, timeout=-1):
+    def __init__(self, lock_file, timeout=-1, filename_max_length=255):
         """ """
+        # Hash the filename if it's too long
+        lock_file = self.hash_filename_if_too_long(lock_file, filename_max_length)
         # The path to the lock file.
         self._lock_file = lock_file

@@ -322,6 +331,16 @@ def __del__(self):
         self.release(force=True)
         return None

+    def hash_filename_if_too_long(self, path: str, max_length: int) -> str:
+        filename = os.path.basename(path)
+        if len(filename) > max_length and max_length > 0:
+            dirname = os.path.dirname(path)
+            hashed_filename = str(hash(filename))
+            new_filename = filename[: max_length - len(hashed_filename) - 8] + "..." + hashed_filename + ".lock"
+            return os.path.join(dirname, new_filename)
+        else:
+            return path
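To make the arithmetic concrete, here is a standalone sketch of the same rule (shorten is a hypothetical helper mirroring hash_filename_if_too_long): the new basename is prefix + "..." + hash + ".lock", which comes out to exactly max_length characters. Note that Python's built-in hash() is salted per process unless PYTHONHASHSEED is fixed, so the shortened name is stable within one run but can differ across runs.

import os

def shorten(path: str, max_length: int = 255) -> str:
    # Hypothetical mirror of hash_filename_if_too_long above.
    filename = os.path.basename(path)
    if 0 < max_length < len(filename):
        hashed = str(hash(filename))  # per-process salted unless PYTHONHASHSEED is set
        # len(prefix) + len("...") + len(hashed) + len(".lock") == max_length
        prefix = filename[: max_length - len(hashed) - 8]
        return os.path.join(os.path.dirname(path), prefix + "..." + hashed + ".lock")
    return path

short = shorten("/tmp/" + "a" * 1000 + ".lock")
assert len(os.path.basename(short)) == 255 and short.endswith(".lock")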


# Windows locking mechanism
# ~~~~~~~~~~~~~~~~~~~~~~~~~
29 changes: 29 additions & 0 deletions tests/test_filelock.py
@@ -0,0 +1,29 @@
import os
import time

import pytest

from datasets.utils.filelock import FileLock, Timeout


def test_filelock(tmpdir):
    lock1 = FileLock(tmpdir / "foo.lock")
    lock2 = FileLock(tmpdir / "foo.lock")
    timeout = 0.01
    with lock1.acquire():
        with pytest.raises(Timeout):
            _start = time.time()
            lock2.acquire(timeout)
        assert time.time() - _start > timeout


def test_long_filename(tmpdir):
    filename = "a" * 1000 + ".lock"
    lock1 = FileLock(tmpdir / filename)
    assert lock1._lock_file.endswith(".lock")
    assert not lock1._lock_file.endswith(filename)
    assert len(os.path.basename(lock1._lock_file)) <= 255
    lock2 = FileLock(tmpdir / filename)
    with lock1.acquire():
        with pytest.raises(Timeout):
            lock2.acquire(0)
15 changes: 15 additions & 0 deletions tests/test_load.py
@@ -227,6 +227,21 @@ def assert_auth(url, *args, headers, **kwargs):
     mock_head.assert_called()


+def test_load_dataset_then_move_then_reload(dataset_loading_script_dir, data_dir, tmp_path, caplog):
+    cache_dir1 = tmp_path / "cache1"
+    cache_dir2 = tmp_path / "cache2"
+    dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="train", cache_dir=cache_dir1)
+    fingerprint1 = dataset._fingerprint
+    del dataset
+    os.rename(cache_dir1, cache_dir2)
+    caplog.clear()
+    dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="train", cache_dir=cache_dir2)
+    assert "Reusing dataset" in caplog.text
+    assert dataset._fingerprint == fingerprint1, "for the caching mechanism to work, fingerprint should stay the same"
+    dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="test", cache_dir=cache_dir2)
+    assert dataset._fingerprint != fingerprint1


 @pytest.mark.parametrize("max_in_memory_dataset_size", ["default", 0, 50, 500])
 def test_load_dataset_local_with_default_in_memory(
     max_in_memory_dataset_size, dataset_loading_script_dir, data_dir, monkeypatch