22
33import semhash
44import semhash .version
5- from semhash .datamodels import DeduplicationResult , DuplicateRecord
5+ from semhash .datamodels import DeduplicationResult , DuplicateRecord , SelectedWithDuplicates
66
77
88def test_deduplication_scoring () -> None :
@@ -11,7 +11,6 @@ def test_deduplication_scoring() -> None:
1111 ["a" , "b" , "c" ],
1212 [DuplicateRecord ("a" , False , [("b" , 0.9 )]), DuplicateRecord ("b" , False , [("c" , 0.8 )])],
1313 0.8 ,
14- columns = ["text" ],
1514 )
1615 assert d .duplicate_ratio == 0.4
1716
@@ -22,7 +21,6 @@ def test_deduplication_scoring_exact() -> None:
2221 ["a" , "b" , "c" ],
2322 [DuplicateRecord ("a" , True , [("b" , 0.9 )]), DuplicateRecord ("b" , False , [("c" , 0.8 )])],
2423 0.8 ,
25- columns = ["text" ],
2624 )
2725 assert d .exact_duplicate_ratio == 0.2
2826
@@ -59,7 +57,6 @@ def test_get_least_similar_from_duplicates() -> None:
5957 ["a" , "b" , "c" ],
6058 [DuplicateRecord ("a" , False , [("b" , 0.9 ), ("c" , 0.7 )]), DuplicateRecord ("b" , False , [("c" , 0.8 )])],
6159 0.8 ,
62- columns = ["text" ],
6360 )
6461 result = d .get_least_similar_from_duplicates (1 )
6562 assert result == [("a" , "c" , 0.7 )]
@@ -80,7 +77,6 @@ def test_rethreshold_deduplication_result() -> None:
8077 DuplicateRecord ("e" , False , [("z" , 0.8 )]),
8178 ],
8279 0.8 ,
83- columns = ["text" ],
8480 )
8581 d .rethreshold (0.85 )
8682 assert d .filtered == [DuplicateRecord ("d" , False , [("x" , 0.9 )])]
@@ -96,7 +92,6 @@ def test_rethreshold_exception() -> None:
9692 DuplicateRecord ("e" , False , [("z" , 0.8 )]),
9793 ],
9894 0.7 ,
99- columns = ["text" ],
10095 )
10196 with pytest .raises (ValueError ):
10297 d .rethreshold (0.6 )
@@ -113,7 +108,6 @@ def test_deprecation_deduplicated_duplicates() -> None:
113108 DuplicateRecord ("e" , False , [("z" , 0.8 )]),
114109 ],
115110 threshold = 0.8 ,
116- columns = ["text" ],
117111 )
118112 else :
119113 raise ValueError ("deprecate `deduplicated` and `duplicates` fields in `DeduplicationResult`" )
@@ -133,10 +127,14 @@ def test_selected_with_duplicates_strings() -> None:
133127 DuplicateRecord ("duplicate_2" , False , [("original" , 0.8 )]),
134128 ],
135129 threshold = 0.8 ,
136- columns = ["text" ],
137130 )
138131
139- expected = [("original" , [("duplicate_1" , 0.9 ), ("duplicate_2" , 0.8 )])]
132+ expected = [
133+ SelectedWithDuplicates (
134+ record = "original" ,
135+ duplicates = [("duplicate_1" , 0.9 ), ("duplicate_2" , 0.8 )],
136+ )
137+ ]
140138 assert d .selected_with_duplicates == expected
141139
142140
@@ -153,9 +151,10 @@ def test_selected_with_duplicates_dicts() -> None:
153151 columns = ["text" ],
154152 )
155153
156- pairs = d .selected_with_duplicates
157- assert len (pairs ) == 1
158- kept , dups = pairs [0 ]
154+ items = d .selected_with_duplicates
155+ assert len (items ) == 1
156+ kept = items [0 ].record
157+ dups = items [0 ].duplicates
159158 assert kept == selected
160159 assert {r ["id" ] for r , _ in dups } == {1 , 2 }
161160
@@ -173,16 +172,16 @@ def test_selected_with_duplicates_multi_column() -> None:
173172 columns = ["text" , "text2" ],
174173 )
175174
176- pairs = d .selected_with_duplicates
177- assert len (pairs ) == 1
178- kept , _ = pairs [0 ]
175+ items = d .selected_with_duplicates
176+ assert len (items ) == 1
177+ kept = items [0 ]. record
179178 assert kept == selected
180179
181180
182181def test_selected_with_duplicates_unhashable_values () -> None :
183182 """Test selected_with_duplicates with unhashable values in records."""
184- selected = {"a" : [1 , 2 , 3 ]} # list -> unhashable value
185- filtered = {"a" : [1 , 2 , 3 ], "flag" : True }
183+ selected = {"text" : "hello" , " a" : [1 , 2 , 3 ]} # list -> unhashable value
184+ filtered = {"text" : "hello" , " a" : [1 , 2 , 3 ], "flag" : True }
186185
187186 d = DeduplicationResult (
188187 selected = [selected ],
@@ -191,8 +190,8 @@ def test_selected_with_duplicates_unhashable_values() -> None:
191190 columns = ["text" ],
192191 )
193192
194- pairs = d .selected_with_duplicates
195- assert pairs == [( selected , [(filtered , 1.0 )])]
193+ items = d .selected_with_duplicates
194+ assert items == [SelectedWithDuplicates ( record = selected , duplicates = [(filtered , 1.0 )])]
196195
197196
198197def test_selected_with_duplicates_removes_internal_duplicates () -> None :
@@ -210,11 +209,11 @@ def test_selected_with_duplicates_removes_internal_duplicates() -> None:
210209 columns = ["text" ],
211210 )
212211
213- selected_with_duplicates = d .selected_with_duplicates
214-
215- assert len (selected_with_duplicates ) == 1
212+ items = d .selected_with_duplicates
213+ assert len (items ) == 1
216214
217- selected_record , duplicate_list = selected_with_duplicates [0 ]
215+ selected_record = items [0 ].record
216+ duplicate_list = items [0 ].duplicates
218217 # Should keep the kept record unchanged
219218 assert selected_record == selected
220219 # The duplicate row must appear only once
0 commit comments