Resolved comments

Pringled · Pringled · commit b40ef688ce89 · 2025-08-13T11:06:18.000+02:00
diff --git a/semhash/datamodels.py b/semhash/datamodels.py
@@ -1,11 +1,14 @@
 import warnings
 from collections import defaultdict
 from dataclasses import dataclass, field
-from typing import Any, Generic, Hashable, Sequence, TypeVar
+from typing import Any, Generic, Hashable, Sequence, TypeAlias, TypeVar
+
+from frozendict import frozendict
 
 from semhash.utils import to_frozendict
 
 Record = TypeVar("Record", str, dict[str, Any])
+DuplicateList: TypeAlias = list[tuple[Record, float]]
 
 
 @dataclass
@@ -23,13 +26,29 @@ class DuplicateRecord(Generic[Record]):
 
     record: Record
     exact: bool
-    duplicates: list[tuple[Record, float]] = field(default_factory=list)
+    duplicates: DuplicateList = field(default_factory=list)
 
     def _rethreshold(self, threshold: float) -> None:
         """Rethreshold the duplicates."""
         self.duplicates = [(d, score) for d, score in self.duplicates if score >= threshold]
 
 
+@dataclass
+class SelectedWithDuplicates(Generic[Record]):
+    """
+    A record that has been selected along with its duplicates.
+
+    Attributes
+    ----------
+        record: The original record being selected.
+        duplicates: List of tuples consisting of duplicate records and their associated scores.
+
+    """
+
+    record: Record
+    duplicates: DuplicateList = field(default_factory=list)
+
+
 @dataclass
 class DeduplicationResult(Generic[Record]):
     """
@@ -49,7 +68,7 @@ class DeduplicationResult(Generic[Record]):
     selected: list[Record] = field(default_factory=list)
     filtered: list[DuplicateRecord] = field(default_factory=list)
     threshold: float = field(default=0.9)
-    columns: Sequence[str] = field(default_factory=list)
+    columns: Sequence[str] | None = field(default=None)
     deduplicated: list[Record] = field(default_factory=list)  # Deprecated
     duplicates: list[DuplicateRecord] = field(default_factory=list)  # Deprecated
 
@@ -108,33 +127,34 @@ def rethreshold(self, threshold: float) -> None:
         self.threshold = threshold
 
     @property
-    def selected_with_duplicates(self) -> list[tuple[Record, list[tuple[Record, float]]]]:
+    def selected_with_duplicates(self) -> list[SelectedWithDuplicates[Record]]:
         """
         For every kept record, return the duplicates that were removed along with their similarity scores.
 
-        :return: A list of tuples where each tuple contains a kept record
+        :return: A list of SelectedWithDuplicates, each pairing a kept record
                 and a list of its duplicates with their similarity scores.
         """
 
-        def _to_hashable(record: Record) -> Hashable:
-            if isinstance(record, dict):
+        def _to_hashable(record: Record) -> frozendict[str, str] | str:
+            """Convert a record to a hashable representation."""
+            if isinstance(record, dict) and self.columns is not None:
                 # Convert dict to frozendict for immutability and hashability
                 return to_frozendict(record, set(self.columns))
-            return record
+            return str(record)
 
         # Build a mapping from original-record  to  [(duplicate, score), …]
-        buckets: defaultdict[Hashable, list[tuple[Record, float]]] = defaultdict(list)
+        buckets: defaultdict[Hashable, DuplicateList] = defaultdict(list)
         for duplicate_record in self.filtered:
             for original_record, score in duplicate_record.duplicates:
                 buckets[_to_hashable(original_record)].append((duplicate_record.record, float(score)))
 
-        result: list[tuple[Record, list[tuple[Record, float]]]] = []
+        result: list[SelectedWithDuplicates[Record]] = []
         for selected in self.selected:
             # Get the list of duplicates for the selected record
             raw_list = buckets.get(_to_hashable(selected), [])
             # Ensure we don't have duplicates in the list
             deduped = {_to_hashable(rec): (rec, score) for rec, score in raw_list}
-            result.append((selected, list(deduped.values())))
+            result.append(SelectedWithDuplicates(record=selected, duplicates=list(deduped.values())))
 
         return result
 
diff --git a/tests/test_datamodels.py b/tests/test_datamodels.py
@@ -2,7 +2,7 @@
 
 import semhash
 import semhash.version
-from semhash.datamodels import DeduplicationResult, DuplicateRecord
+from semhash.datamodels import DeduplicationResult, DuplicateRecord, SelectedWithDuplicates
 
 
 def test_deduplication_scoring() -> None:
@@ -11,7 +11,6 @@ def test_deduplication_scoring() -> None:
         ["a", "b", "c"],
         [DuplicateRecord("a", False, [("b", 0.9)]), DuplicateRecord("b", False, [("c", 0.8)])],
         0.8,
-        columns=["text"],
     )
     assert d.duplicate_ratio == 0.4
 
@@ -22,7 +21,6 @@ def test_deduplication_scoring_exact() -> None:
         ["a", "b", "c"],
         [DuplicateRecord("a", True, [("b", 0.9)]), DuplicateRecord("b", False, [("c", 0.8)])],
         0.8,
-        columns=["text"],
     )
     assert d.exact_duplicate_ratio == 0.2
 
@@ -59,7 +57,6 @@ def test_get_least_similar_from_duplicates() -> None:
         ["a", "b", "c"],
         [DuplicateRecord("a", False, [("b", 0.9), ("c", 0.7)]), DuplicateRecord("b", False, [("c", 0.8)])],
         0.8,
-        columns=["text"],
     )
     result = d.get_least_similar_from_duplicates(1)
     assert result == [("a", "c", 0.7)]
@@ -80,7 +77,6 @@ def test_rethreshold_deduplication_result() -> None:
             DuplicateRecord("e", False, [("z", 0.8)]),
         ],
         0.8,
-        columns=["text"],
     )
     d.rethreshold(0.85)
     assert d.filtered == [DuplicateRecord("d", False, [("x", 0.9)])]
@@ -96,7 +92,6 @@ def test_rethreshold_exception() -> None:
             DuplicateRecord("e", False, [("z", 0.8)]),
         ],
         0.7,
-        columns=["text"],
     )
     with pytest.raises(ValueError):
         d.rethreshold(0.6)
@@ -113,7 +108,6 @@ def test_deprecation_deduplicated_duplicates() -> None:
                     DuplicateRecord("e", False, [("z", 0.8)]),
                 ],
                 threshold=0.8,
-                columns=["text"],
             )
     else:
         raise ValueError("deprecate `deduplicated` and `duplicates` fields in `DeduplicationResult`")
@@ -133,10 +127,14 @@ def test_selected_with_duplicates_strings() -> None:
             DuplicateRecord("duplicate_2", False, [("original", 0.8)]),
         ],
         threshold=0.8,
-        columns=["text"],
     )
 
-    expected = [("original", [("duplicate_1", 0.9), ("duplicate_2", 0.8)])]
+    expected = [
+        SelectedWithDuplicates(
+            record="original",
+            duplicates=[("duplicate_1", 0.9), ("duplicate_2", 0.8)],
+        )
+    ]
     assert d.selected_with_duplicates == expected
 
 
@@ -153,9 +151,10 @@ def test_selected_with_duplicates_dicts() -> None:
         columns=["text"],
     )
 
-    pairs = d.selected_with_duplicates
-    assert len(pairs) == 1
-    kept, dups = pairs[0]
+    items = d.selected_with_duplicates
+    assert len(items) == 1
+    kept = items[0].record
+    dups = items[0].duplicates
     assert kept == selected
     assert {r["id"] for r, _ in dups} == {1, 2}
 
@@ -173,16 +172,16 @@ def test_selected_with_duplicates_multi_column() -> None:
         columns=["text", "text2"],
     )
 
-    pairs = d.selected_with_duplicates
-    assert len(pairs) == 1
-    kept, _ = pairs[0]
+    items = d.selected_with_duplicates
+    assert len(items) == 1
+    kept = items[0].record
     assert kept == selected
 
 
 def test_selected_with_duplicates_unhashable_values() -> None:
     """Test selected_with_duplicates with unhashable values in records."""
-    selected = {"a": [1, 2, 3]}  # list -> unhashable value
-    filtered = {"a": [1, 2, 3], "flag": True}
+    selected = {"text": "hello", "a": [1, 2, 3]}  # list -> unhashable value
+    filtered = {"text": "hello", "a": [1, 2, 3], "flag": True}
 
     d = DeduplicationResult(
         selected=[selected],
@@ -191,8 +190,8 @@ def test_selected_with_duplicates_unhashable_values() -> None:
         columns=["text"],
     )
 
-    pairs = d.selected_with_duplicates
-    assert pairs == [(selected, [(filtered, 1.0)])]
+    items = d.selected_with_duplicates
+    assert items == [SelectedWithDuplicates(record=selected, duplicates=[(filtered, 1.0)])]
 
 
 def test_selected_with_duplicates_removes_internal_duplicates() -> None:
@@ -210,11 +209,11 @@ def test_selected_with_duplicates_removes_internal_duplicates() -> None:
         columns=["text"],
     )
 
-    selected_with_duplicates = d.selected_with_duplicates
-
-    assert len(selected_with_duplicates) == 1
+    items = d.selected_with_duplicates
+    assert len(items) == 1
 
-    selected_record, duplicate_list = selected_with_duplicates[0]
+    selected_record = items[0].record
+    duplicate_list = items[0].duplicates
     # Should keep the kept record unchanged
     assert selected_record == selected
     # The duplicate row must appear only once