Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/datasets/utils/py_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,12 +405,18 @@ def _save_code(pickler, obj):
# ex: <ipython-input-13-9ed2afe61d25> for ipython, and <stdin> for shell
# Moreover lambda functions have a special name: '<lambda>'
# ex: (lambda x: x).__code__.co_name == "<lambda>" # True
#
# For the hashing mechanism we ignore where the function has been defined
# More specifically:
# - we ignore the filename of special functions (filename starts with '<')
# - we always ignore the line number
# - we only use the base name of the file instead of the whole path,
# to be robust in case a script is moved for example.
#
# Only those two lines are different from the original implementation:
co_filename = "" if obj.co_filename.startswith("<") or obj.co_name == "<lambda>" else obj.co_filename
co_filename = (
"" if obj.co_filename.startswith("<") or obj.co_name == "<lambda>" else os.path.basename(obj.co_filename)
)
co_firstlineno = 1
# The rest is the same as in the original dill implementation
if dill._dill.PY3:
Expand Down
28 changes: 28 additions & 0 deletions tests/test_caching.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
import pickle
import subprocess
from hashlib import md5
from pathlib import Path
from textwrap import dedent
from types import CodeType, FunctionType
from unittest import TestCase
from unittest.mock import patch
Expand Down Expand Up @@ -258,3 +261,28 @@ def test_hash_update(self):
def test_hash_unpicklable(self):
with self.assertRaises(pickle.PicklingError):
Hasher.hash(UnpicklableCallable(Foo("hello")))


def test_move_script_doesnt_change_hash(tmp_path: Path):
dir1 = tmp_path / "dir1"
dir2 = tmp_path / "dir2"
dir1.mkdir()
dir2.mkdir()
script_filename = "script.py"
code = dedent(
"""
from datasets.fingerprint import Hasher
def foo():
pass
print(Hasher.hash(foo))
"""
)
script_path1 = dir1 / script_filename
script_path2 = dir2 / script_filename
with script_path1.open("w") as f:
f.write(code)
with script_path2.open("w") as f:
f.write(code)
fingerprint1 = subprocess.check_output(["python", str(script_path1)])
fingerprint2 = subprocess.check_output(["python", str(script_path2)])
assert fingerprint1 == fingerprint2