diff --git a/src/datasets/utils/py_utils.py b/src/datasets/utils/py_utils.py
index e233cd389e5..dbe17d45ed2 100644
--- a/src/datasets/utils/py_utils.py
+++ b/src/datasets/utils/py_utils.py
@@ -405,12 +405,18 @@ def _save_code(pickler, obj):
     # ex: <ipython-input-13-9ed2afe61d25> for ipython, and <stdin> for shell
     # Moreover lambda functions have a special name: '<lambda>'
     # ex: (lambda x: x).__code__.co_name == "<lambda>"  # True
+    #
     # For the hashing mechanism we ignore where the function has been defined
     # More specifically:
     # - we ignore the filename of special functions (filename starts with '<')
     # - we always ignore the line number
+    # - we only use the base name of the file instead of the whole path,
+    # to be robust in case a script is moved for example.
+    #
     # Only those two lines are different from the original implementation:
-    co_filename = "" if obj.co_filename.startswith("<") or obj.co_name == "<lambda>" else obj.co_filename
+    co_filename = (
+        "" if obj.co_filename.startswith("<") or obj.co_name == "<lambda>" else os.path.basename(obj.co_filename)
+    )
     co_firstlineno = 1
     # The rest is the same as in the original dill implementation
     if dill._dill.PY3:
diff --git a/tests/test_caching.py b/tests/test_caching.py
index 2cb39c23c10..6442f310b97 100644
--- a/tests/test_caching.py
+++ b/tests/test_caching.py
@@ -1,5 +1,8 @@
 import pickle
+import subprocess
 from hashlib import md5
+from pathlib import Path
+from textwrap import dedent
 from types import CodeType, FunctionType
 from unittest import TestCase
 from unittest.mock import patch
@@ -258,3 +261,28 @@ def test_hash_update(self):
     def test_hash_unpicklable(self):
         with self.assertRaises(pickle.PicklingError):
             Hasher.hash(UnpicklableCallable(Foo("hello")))
+
+
+def test_move_script_doesnt_change_hash(tmp_path: Path):
+    dir1 = tmp_path / "dir1"
+    dir2 = tmp_path / "dir2"
+    dir1.mkdir()
+    dir2.mkdir()
+    script_filename = "script.py"
+    code = dedent(
+        """
+        from datasets.fingerprint import Hasher
+        def foo():
+            pass
+        print(Hasher.hash(foo))
+        """
+    )
+    script_path1 = dir1 / script_filename
+    script_path2 = dir2 / script_filename
+    with script_path1.open("w") as f:
+        f.write(code)
+    with script_path2.open("w") as f:
+        f.write(code)
+    fingerprint1 = subprocess.check_output(["python", str(script_path1)])
+    fingerprint2 = subprocess.check_output(["python", str(script_path2)])
+    assert fingerprint1 == fingerprint2