diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index 51788a1514d..4f616110ac2 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -456,7 +456,14 @@ def check_duplicate_keys(self):
         tmp_record = set()
         for hash, key in self.hkey_record:
             if hash in tmp_record:
-                raise DuplicatedKeysError(key)
+                # Collect the index, offset by self._num_examples, of every recorded key with this hash
+                duplicate_key_indices = [
+                    str(self._num_examples + index)
+                    for index, (duplicate_hash, _) in enumerate(self.hkey_record)
+                    if duplicate_hash == hash
+                ]
+
+                raise DuplicatedKeysError(key, duplicate_key_indices)
             else:
                 tmp_record.add(hash)
 
diff --git a/src/datasets/builder.py b/src/datasets/builder.py
index 999c15953f2..901bab73ab6 100644
--- a/src/datasets/builder.py
+++ b/src/datasets/builder.py
@@ -49,6 +49,7 @@
 from .fingerprint import Hasher
 from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
 from .iterable_dataset import ExamplesIterable, IterableDataset, _generate_examples_from_tables_wrapper
+from .keyhash import DuplicatedKeysError
 from .naming import camelcase_to_snakecase
 from .splits import Split, SplitDict, SplitGenerator
 from .streaming import extend_dataset_builder_for_streaming
@@ -798,7 +799,13 @@ def _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs
                     + "\nOriginal error:\n"
                     + str(e)
                 ) from None
-
+            # With check_duplicates enabled, duplicate keys raise DuplicatedKeysError: re-raise with a fix hint
+            except DuplicatedKeysError as e:
+                raise DuplicatedKeysError(
+                    e.key,
+                    e.duplicate_key_indices,
+                    fix_msg=f"To avoid duplicate keys, please fix the dataset script {self.name}.py",
+                ) from None
             dl_manager.manage_extracted_files()
 
         if verify_infos:
diff --git a/src/datasets/keyhash.py b/src/datasets/keyhash.py
index a6692c1f96c..5b3babf677f 100644
--- a/src/datasets/keyhash.py
+++ b/src/datasets/keyhash.py
@@ -70,10 +70,24 @@ def __init__(self, hash_data):
 class DuplicatedKeysError(Exception):
     """Raise an error when duplicate key found."""
 
-    def __init__(self, key):
-        self.prefix = "FAILURE TO GENERATE DATASET !"
-        self.err_msg = f"\nFound duplicate Key: {key}"
-        self.suffix = "\nKeys should be unique and deterministic in nature"
+    def __init__(self, key, duplicate_key_indices, fix_msg=""):
+        """Build the error message for a key shared by several examples.
+
+        Args:
+            key: The key that was generated more than once.
+            duplicate_key_indices: Indices, as strings, of the examples sharing ``key``.
+            fix_msg: Optional hint appended to the message on its own line.
+        """
+        self.key = key
+        self.duplicate_key_indices = duplicate_key_indices
+        self.fix_msg = fix_msg
+        self.prefix = "Found multiple examples generated with the same key"
+        # Only list the first 20 indices so the message stays readable
+        if len(duplicate_key_indices) <= 20:
+            self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices)} have the key {key}"
+        else:
+            self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices[:20])}... ({len(duplicate_key_indices) - 20} more) have the key {key}"
+        self.suffix = "\n" + fix_msg if fix_msg else ""
         super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")
 
 