11import inspect
2- import json
32import os
43import random
54import shutil
109from typing import TYPE_CHECKING , Any , Callable , Dict , List , Optional , Tuple , Union
1110
1211import numpy as np
13- import pyarrow as pa
1412import xxhash
1513
16- from .info import DatasetInfo
1714from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH
18- from .table import ConcatenationTable , InMemoryTable , MemoryMappedTable , Table
1915from .utils ._dill import dumps
20-
21- # from .utils.py_utils import dumps
2216from .utils .deprecation_utils import deprecated
2317from .utils .logging import get_logger
24- from .utils .py_utils import asdict
2518
2619
2720if TYPE_CHECKING :
@@ -202,6 +195,7 @@ def cleanup_func():
202195#################
203196
204197
198+ @deprecated ("Use `copyreg.pickle` to register a custom reducer." )
205199def hashregister (* types ):
206200 def proxy (func ):
207201 for t in types :
@@ -228,15 +222,13 @@ def hash_bytes(cls, value: Union[bytes, List[bytes]]) -> str:
228222 return m .hexdigest ()
229223
230224 @classmethod
225+ @deprecated ("Use `Hasher.hash` instead." )
231226 def hash_default (cls , value : Any ) -> str :
232- return cls .hash_bytes ( dumps ( value ) )
227+ return cls .hash ( value )
233228
234229 @classmethod
235230 def hash (cls , value : Any ) -> str :
236- if type (value ) in cls .dispatch :
237- return cls .dispatch [type (value )](cls , value )
238- else :
239- return cls .hash_default (value )
231+ return cls .hash_bytes (dumps (value ))
240232
241233 def update (self , value : Any ) -> None :
242234 header_for_update = f"=={ type (value )} =="
@@ -248,28 +240,6 @@ def hexdigest(self) -> str:
248240 return self .m .hexdigest ()
249241
250242
251- # Register a new hasher can be useful for two possible reasons:
252- # 1 - optimize the hashing of large amount of data (e.g. pa.Table)
253- # 2 - take advantage of a custom serialization method (e.g. DatasetInfo)
254-
255-
256- @hashregister (pa .Table , Table , InMemoryTable , MemoryMappedTable , ConcatenationTable )
257- def _hash_pa_table (hasher , value ):
258- def _hash_pa_array (value ):
259- if isinstance (value , pa .ChunkedArray ):
260- return hasher .hash_bytes (c .to_string ().encode ("utf-8" ) for c in value .chunks )
261- else :
262- return hasher .hash_bytes (value .to_string ().encode ("utf-8" ))
263-
264- value = "-" .join (col + "-" + _hash_pa_array (value [col ]) for col in sorted (value .column_names ))
265- return hasher .hash_bytes (value .encode ("utf-8" ))
266-
267-
268- @hashregister (DatasetInfo )
269- def _hash_dataset_info (hasher , value ):
270- return hasher .hash_bytes (json .dumps (asdict (value ), sort_keys = True ).encode ("utf-8" ))
271-
272-
273243#################
274244# Fingerprinting
275245#################