Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/datasets/arrow_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,13 @@ def check_duplicate_keys(self):
tmp_record = set()
for hash, key in self.hkey_record:
if hash in tmp_record:
raise DuplicatedKeysError(key)
duplicate_key_indices = [
str(self._num_examples + index)
for index, (duplicate_hash, _) in enumerate(self.hkey_record)
if duplicate_hash == hash
]

raise DuplicatedKeysError(key, duplicate_key_indices)
else:
tmp_record.add(hash)

Expand Down
9 changes: 8 additions & 1 deletion src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
from .fingerprint import Hasher
from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
from .iterable_dataset import ExamplesIterable, IterableDataset, _generate_examples_from_tables_wrapper
from .keyhash import DuplicatedKeysError
from .naming import camelcase_to_snakecase
from .splits import Split, SplitDict, SplitGenerator
from .streaming import extend_dataset_builder_for_streaming
Expand Down Expand Up @@ -798,7 +799,13 @@ def _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs
+ "\nOriginal error:\n"
+ str(e)
) from None

# If check_duplicates is set to True, catch DuplicatedKeysError and re-raise it with a dataset-specific fix message
except DuplicatedKeysError as e:
raise DuplicatedKeysError(
e.key,
e.duplicate_key_indices,
fix_msg=f"To avoid duplicate keys, please fix the dataset script {self.name}.py",
) from None
dl_manager.manage_extracted_files()

if verify_infos:
Expand Down
14 changes: 10 additions & 4 deletions src/datasets/keyhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,16 @@ def __init__(self, hash_data):
class DuplicatedKeysError(Exception):
"""Raise an error when duplicate key found."""

def __init__(self, key):
self.prefix = "FAILURE TO GENERATE DATASET !"
self.err_msg = f"\nFound duplicate Key: {key}"
self.suffix = "\nKeys should be unique and deterministic in nature"
def __init__(self, key, duplicate_key_indices, fix_msg=""):
self.key = key
self.duplicate_key_indices = duplicate_key_indices
self.fix_msg = fix_msg
self.prefix = "Found multiple examples generated with the same key"
if len(duplicate_key_indices) <= 20:
self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices)} have the key {key}"
else:
self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices[:20])}... ({len(duplicate_key_indices) - 20} more) have the key {key}"
self.suffix = "\n" + fix_msg if fix_msg else ""
super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")


Expand Down