Commit 8910eda

Make DuplicatedKeysError more user friendly [For Issue #2556] (#4545)
* Make DuplicatedKeysError more user friendly. DuplicatedKeysError did not provide any information about the examples that share the same key; this information is very helpful for debugging a dataset generator script.
* Remove the unused hkey_index variable.
* Update the dataset path in DuplicatedKeysError. The DuplicatedKeysError class now always expects duplicate_key_indices, and the old error message is removed. The path to the dataset in the error message is built from the dataset name to make it more user friendly.
* Update the index calculation with _num_examples. The previous index calculation did not produce absolute indices. The error message used str.replace; it was changed to a simple string append.
* Fix lint error.
* Apply suggestions from code review.

Co-authored-by: Quentin Lhoest <[email protected]>
1 parent b8363e0 · commit 8910eda
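
For context, the change swaps the old three-line message for one that names the offending example indices. As reconstructed from the keyhash.py diff below (the key, indices, and dataset name here are made-up illustration values), the messages compare as follows.

Before:

FAILURE TO GENERATE DATASET !
Found duplicate Key: some_key
Keys should be unique and deterministic in nature

After:

Found multiple examples generated with the same key
The examples at index 100, 102, 104 have the key some_key
To avoid duplicate keys, please fix the dataset script my_dataset.py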

File tree: 3 files changed, +25 −6 lines changed

src/datasets/arrow_writer.py

Lines changed: 7 additions & 1 deletion

@@ -456,7 +456,13 @@ def check_duplicate_keys(self):
         tmp_record = set()
         for hash, key in self.hkey_record:
             if hash in tmp_record:
-                raise DuplicatedKeysError(key)
+                duplicate_key_indices = [
+                    str(self._num_examples + index)
+                    for index, (duplicate_hash, _) in enumerate(self.hkey_record)
+                    if duplicate_hash == hash
+                ]
+
+                raise DuplicatedKeysError(key, duplicate_key_indices)
             else:
                 tmp_record.add(hash)
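
The key detail in this hunk is the offset: the positions in hkey_record are relative to the current batch of examples, so self._num_examples is added to turn them into absolute indices over the whole split (this is what the "index calculation with _num_examples" item in the commit message refers to). Below is a minimal, self-contained sketch of that computation; hkey_record and num_examples are plain stand-ins for the writer's internal state, not the real ArrowWriter API.

# Sketch of the duplicate-index computation from check_duplicate_keys.
# hkey_record holds (hash, key) pairs for the current batch; num_examples
# counts the examples already written by earlier batches (stand-in values).
hkey_record = [("h1", "a"), ("h2", "b"), ("h1", "a"), ("h3", "c"), ("h1", "a")]
num_examples = 100

tmp_record = set()
for hash_, key in hkey_record:
    if hash_ in tmp_record:
        # Offsetting by num_examples turns batch-relative positions into
        # absolute indices over the whole split.
        duplicate_key_indices = [
            str(num_examples + index)
            for index, (duplicate_hash, _) in enumerate(hkey_record)
            if duplicate_hash == hash_
        ]
        print(f"Duplicate key {key!r} at indices {', '.join(duplicate_key_indices)}")
        break
    tmp_record.add(hash_)

# Prints: Duplicate key 'a' at indices 100, 102, 104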

src/datasets/builder.py

Lines changed: 8 additions & 1 deletion

@@ -49,6 +49,7 @@
 from .fingerprint import Hasher
 from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
 from .iterable_dataset import ExamplesIterable, IterableDataset, _generate_examples_from_tables_wrapper
+from .keyhash import DuplicatedKeysError
 from .naming import camelcase_to_snakecase
 from .splits import Split, SplitDict, SplitGenerator
 from .streaming import extend_dataset_builder_for_streaming

@@ -798,7 +799,13 @@ def _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs
                     + "\nOriginal error:\n"
                     + str(e)
                 ) from None
-
+            # If check_duplicates is set to True , then except DuplicatedKeysError
+            except DuplicatedKeysError as e:
+                raise DuplicatedKeysError(
+                    e.key,
+                    e.duplicate_key_indices,
+                    fix_msg=f"To avoid duplicate keys, please fix the dataset script {self.name}.py",
+                ) from None
         dl_manager.manage_extracted_files()

         if verify_infos:
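
The second hunk catches the writer's DuplicatedKeysError and re-raises it with the same key and indices, attaching a dataset-specific fix_msg that points at the script to fix. A hedged sketch of that catch-and-re-raise pattern, assuming a datasets version that already contains this commit; generate_examples and prepare_split are hypothetical stand-ins for the builder internals:

from datasets.keyhash import DuplicatedKeysError

def generate_examples():
    # Hypothetical stand-in for the real split preparation step.
    raise DuplicatedKeysError("some_key", ["100", "102", "104"])

def prepare_split(dataset_name):
    try:
        generate_examples()
    except DuplicatedKeysError as e:
        # Re-raise with the same key and indices, adding a pointer to the
        # dataset script; `from None` suppresses the chained traceback so
        # only the enriched error reaches the user.
        raise DuplicatedKeysError(
            e.key,
            e.duplicate_key_indices,
            fix_msg=f"To avoid duplicate keys, please fix the dataset script {dataset_name}.py",
        ) from None

try:
    prepare_split("my_dataset")
except DuplicatedKeysError as e:
    print(e)
# Found multiple examples generated with the same key
# The examples at index 100, 102, 104 have the key some_key
# To avoid duplicate keys, please fix the dataset script my_dataset.py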

src/datasets/keyhash.py

Lines changed: 10 additions & 4 deletions

@@ -70,10 +70,16 @@ def __init__(self, hash_data):
 class DuplicatedKeysError(Exception):
     """Raise an error when duplicate key found."""

-    def __init__(self, key):
-        self.prefix = "FAILURE TO GENERATE DATASET !"
-        self.err_msg = f"\nFound duplicate Key: {key}"
-        self.suffix = "\nKeys should be unique and deterministic in nature"
+    def __init__(self, key, duplicate_key_indices, fix_msg=""):
+        self.key = key
+        self.duplicate_key_indices = duplicate_key_indices
+        self.fix_msg = fix_msg
+        self.prefix = "Found multiple examples generated with the same key"
+        if len(duplicate_key_indices) <= 20:
+            self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices)} have the key {key}"
+        else:
+            self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices[:20])}... ({len(duplicate_key_indices) - 20} more) have the key {key}"
+        self.suffix = "\n" + fix_msg if fix_msg else ""
         super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")
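
One subtlety worth demonstrating is the cap in the branch above: once more than 20 duplicate indices are collected, only the first 20 are listed and the rest are summarized as a count. A small usage sketch with made-up values:

from datasets.keyhash import DuplicatedKeysError

# 25 duplicate indices: the message lists the first 20, then a count.
print(DuplicatedKeysError("some_key", [str(i) for i in range(25)]))
# Found multiple examples generated with the same key
# The examples at index 0, 1, 2, ..., 19... (5 more) have the key some_key
#   (this comment elides indices 3-18; the real message lists all 20)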
