Merged
Changes from 6 commits
13 changes: 11 additions & 2 deletions src/datasets/builder.py
@@ -381,7 +381,7 @@ def builder_configs(cls):
     def cache_dir(self):
         return self._cache_dir

-    def _relative_data_dir(self, with_version=True, with_hash=True):
+    def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
         """Relative path of this dataset in cache_dir:
         Will be:
             self.name/self.config.version/self.hash/
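For concreteness, a hypothetical illustration (not from this PR) of the layout this method describes:

# Hypothetical values: builder "squad", version "1.0.0", config hash "1a2b3c4d"
#   _relative_data_dir() -> "squad/1.0.0/1a2b3c4d"
# The absolute cache_dir prefix is never part of this identifier.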
@@ -845,7 +845,16 @@ def _as_dataset(self, split: Union[ReadInstruction, Split] = Split.TRAIN, in_mem
             split_infos=self.info.splits.values(),
             in_memory=in_memory,
         )
-        return Dataset(**dataset_kwargs)
+        fingerprint = self._get_dataset_fingerprint(split)
+        return Dataset(fingerprint=fingerprint, **dataset_kwargs)
+
+    def _get_dataset_fingerprint(self, split: Union[ReadInstruction, Split]) -> str:
+        """The dataset fingerprint is the hash of the relative directory dataset_name/config_name/version/hash, as well as the split specs."""
+        hasher = Hasher()
+        hasher.update(self._relative_data_dir().replace(os.sep, "/"))
+        hasher.update(str(split))  # for example: train, train+test, train[:10%], test[:33%](pct1_dropremainder)
+        fingerprint = hasher.hexdigest()
+        return fingerprint

     def _post_process(self, dataset: Dataset, resources_paths: Dict[str, str]) -> Optional[Dataset]:
         """Run dataset transforms or add indexes"""
23 changes: 21 additions & 2 deletions src/datasets/utils/filelock.py
@@ -62,7 +62,14 @@

 # Data
 # ------------------------------------------------
-__all__ = ["Timeout", "BaseFileLock", "WindowsFileLock", "UnixFileLock", "SoftFileLock", "FileLock"]
+__all__ = [
+    "Timeout",
+    "BaseFileLock",
+    "WindowsFileLock",
+    "UnixFileLock",
+    "SoftFileLock",
+    "FileLock",
+]

 __version__ = "3.0.12"

@@ -125,8 +132,10 @@ class BaseFileLock:
     Implements the base class of a file lock.
     """

-    def __init__(self, lock_file, timeout=-1):
+    def __init__(self, lock_file, timeout=-1, filename_max_length=255):
         """ """
+        # Hash the filename if it's too long
+        lock_file = self.hash_filename_if_too_long(lock_file, filename_max_length)
         # The path to the lock file.
         self._lock_file = lock_file

@@ -322,6 +331,16 @@ def __del__(self):
         self.release(force=True)
         return None

+    def hash_filename_if_too_long(self, path: str, max_length: int) -> str:
+        filename = os.path.basename(path)
+        if len(filename) > max_length and max_length > 0:
+            dirname = os.path.dirname(path)
+            hashed_filename = str(hash(filename))
+            new_filename = filename[: max_length - len(hashed_filename) - 8] + "..." + hashed_filename + ".lock"
+            return os.path.join(dirname, new_filename)
+        else:
+            return path
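To make the arithmetic concrete, here is a standalone sketch of the same rule (shorten is a hypothetical helper mirroring hash_filename_if_too_long): the new basename is prefix + "..." + hash + ".lock", which comes out to exactly max_length characters. Note that Python's built-in hash() is salted per process unless PYTHONHASHSEED is fixed, so the shortened name is stable within one run but can differ across runs.

import os

def shorten(path: str, max_length: int = 255) -> str:
    # Hypothetical mirror of hash_filename_if_too_long above.
    filename = os.path.basename(path)
    if 0 < max_length < len(filename):
        hashed = str(hash(filename))  # per-process salted unless PYTHONHASHSEED is set
        # len(prefix) + len("...") + len(hashed) + len(".lock") == max_length
        prefix = filename[: max_length - len(hashed) - 8]
        return os.path.join(os.path.dirname(path), prefix + "..." + hashed + ".lock")
    return path

short = shorten("/tmp/" + "a" * 1000 + ".lock")
assert len(os.path.basename(short)) == 255 and short.endswith(".lock")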


# Windows locking mechanism
# ~~~~~~~~~~~~~~~~~~~~~~~~~
29 changes: 29 additions & 0 deletions tests/test_filelock.py
@@ -0,0 +1,29 @@
import os
import time

import pytest

from datasets.utils.filelock import FileLock, Timeout


def test_filelock(tmpdir):
    lock1 = FileLock(tmpdir / "foo.lock")
    lock2 = FileLock(tmpdir / "foo.lock")
    timeout = 0.01
    with lock1.acquire():
        with pytest.raises(Timeout):
            _start = time.time()
            lock2.acquire(timeout)
        assert time.time() - _start > timeout


def test_long_filename(tmpdir):
    filename = "a" * 1000 + ".lock"
    lock1 = FileLock(tmpdir / filename)
    assert lock1._lock_file.endswith(".lock")
    assert not lock1._lock_file.endswith(filename)
    assert len(os.path.basename(lock1._lock_file)) <= 255
    lock2 = FileLock(tmpdir / filename)
    with lock1.acquire():
        with pytest.raises(Timeout):
            lock2.acquire(0)
15 changes: 15 additions & 0 deletions tests/test_load.py
@@ -227,6 +227,21 @@ def assert_auth(url, *args, headers, **kwargs):
     mock_head.assert_called()


+def test_load_dataset_then_move_then_reload(dataset_loading_script_dir, data_dir, tmp_path, caplog):
+    cache_dir1 = tmp_path / "cache1"
+    cache_dir2 = tmp_path / "cache2"
+    dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="train", cache_dir=cache_dir1)
+    fingerprint1 = dataset._fingerprint
+    del dataset
+    os.rename(cache_dir1, cache_dir2)
+    caplog.clear()
+    dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="train", cache_dir=cache_dir2)
+    assert "Reusing dataset" in caplog.text
+    assert dataset._fingerprint == fingerprint1, "for the caching mechanism to work, fingerprint should stay the same"
+    dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="test", cache_dir=cache_dir2)
+    assert dataset._fingerprint != fingerprint1


 @pytest.mark.parametrize("max_in_memory_dataset_size", ["default", 0, 50, 500])
 def test_load_dataset_local_with_default_in_memory(
     max_in_memory_dataset_size, dataset_loading_script_dir, data_dir, monkeypatch