Commit ad7447d

Deprecate hashregister API
1 parent 4413b2d commit ad7447d

File tree

1 file changed (+4, -34 lines)

src/datasets/fingerprint.py

Lines changed: 4 additions & 34 deletions
@@ -1,5 +1,4 @@
 import inspect
-import json
 import os
 import random
 import shutil
@@ -10,18 +9,12 @@
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
-import pyarrow as pa
 import xxhash
 
-from .info import DatasetInfo
 from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH
-from .table import ConcatenationTable, InMemoryTable, MemoryMappedTable, Table
 from .utils._dill import dumps
-
-# from .utils.py_utils import dumps
 from .utils.deprecation_utils import deprecated
 from .utils.logging import get_logger
-from .utils.py_utils import asdict
 
 
 if TYPE_CHECKING:
@@ -202,6 +195,7 @@ def cleanup_func():
 #################
 
 
+@deprecated("Use `copyreg.pickle` to register a custom reducer.")
 def hashregister(*types):
     def proxy(func):
         for t in types:
@@ -228,15 +222,13 @@ def hash_bytes(cls, value: Union[bytes, List[bytes]]) -> str:
         return m.hexdigest()
 
     @classmethod
+    @deprecated("Use `Hasher.hash` instead.")
     def hash_default(cls, value: Any) -> str:
-        return cls.hash_bytes(dumps(value))
+        return cls.hash(value)
 
     @classmethod
     def hash(cls, value: Any) -> str:
-        if type(value) in cls.dispatch:
-            return cls.dispatch[type(value)](cls, value)
-        else:
-            return cls.hash_default(value)
+        return cls.hash_bytes(dumps(value))
 
     def update(self, value: Any) -> None:
         header_for_update = f"=={type(value)}=="
@@ -248,28 +240,6 @@ def hexdigest(self) -> str:
         return self.m.hexdigest()
 
 
-# Register a new hasher can be useful for two possible reasons:
-# 1 - optimize the hashing of large amount of data (e.g. pa.Table)
-# 2 - take advantage of a custom serialization method (e.g. DatasetInfo)
-
-
-@hashregister(pa.Table, Table, InMemoryTable, MemoryMappedTable, ConcatenationTable)
-def _hash_pa_table(hasher, value):
-    def _hash_pa_array(value):
-        if isinstance(value, pa.ChunkedArray):
-            return hasher.hash_bytes(c.to_string().encode("utf-8") for c in value.chunks)
-        else:
-            return hasher.hash_bytes(value.to_string().encode("utf-8"))
-
-    value = "-".join(col + "-" + _hash_pa_array(value[col]) for col in sorted(value.column_names))
-    return hasher.hash_bytes(value.encode("utf-8"))
-
-
-@hashregister(DatasetInfo)
-def _hash_dataset_info(hasher, value):
-    return hasher.hash_bytes(json.dumps(asdict(value), sort_keys=True).encode("utf-8"))
-
-
 #################
 # Fingerprinting
 #################
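The deprecation message points at `copyreg.pickle` as the replacement for the removed `hashregister` API. A minimal sketch of that pattern, using a hypothetical user-defined class MyResource (nothing below is part of this commit): because `Hasher.hash` now fingerprints a value by serializing it with `dumps` and hashing the resulting bytes, registering a custom reducer changes how a type serializes, and therefore how it hashes.

import copyreg


class MyResource:
    # Hypothetical example class, not part of the datasets codebase.
    def __init__(self, path):
        self.path = path
        self._handle = object()  # unstable runtime state we want to ignore


def _reduce_my_resource(obj):
    # A reducer returns (callable, args); pickle rebuilds the object from
    # its path alone, so the serialized bytes stay deterministic.
    return (MyResource, (obj.path,))


# Register the reducer in copyreg's global dispatch table, which the
# standard pickle machinery (and dill, which builds on it) consults.
copyreg.pickle(MyResource, _reduce_my_resource)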

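The second deprecation makes `hash_default` a thin alias that forwards to `Hasher.hash`, which now serializes the value with the dill-based `dumps` and hashes the bytes directly instead of consulting the per-type dispatch table. A usage sketch, assuming the import path `datasets.fingerprint` where this file lives:

from datasets.fingerprint import Hasher

# One-shot hashing: the value is serialized with dumps() and the bytes
# are hashed, yielding a hex digest suitable as a cache fingerprint.
fingerprint = Hasher.hash({"split": "train", "seed": 42})

# Incremental hashing through the instance API also remains available.
hasher = Hasher()
hasher.update("train")
hasher.update(42)
print(hasher.hexdigest())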