Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion src/datasets/arrow_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,13 @@ def check_duplicate_keys(self):
tmp_record = set()
for hash, key in self.hkey_record:
if hash in tmp_record:
raise DuplicatedKeysError(key)
duplicate_key_indices = [
str(self._num_examples + index)
for index, (duplicate_hash, _) in enumerate(self.hkey_record)
if duplicate_hash == hash
]

raise DuplicatedKeysError(key, duplicate_key_indices)
else:
tmp_record.add(hash)

Expand Down
6 changes: 5 additions & 1 deletion src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
from .fingerprint import Hasher
from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
from .iterable_dataset import ExamplesIterable, IterableDataset, _generate_examples_from_tables_wrapper
from .keyhash import DuplicatedKeysError
from .naming import camelcase_to_snakecase
from .splits import Split, SplitDict, SplitGenerator
from .streaming import extend_dataset_builder_for_streaming
Expand Down Expand Up @@ -798,7 +799,10 @@ def _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs
+ "\nOriginal error:\n"
+ str(e)
) from None

# If check_duplicates is set to True, catch DuplicatedKeysError
except DuplicatedKeysError as e:
e.args = (e.args[0] + f"datasets/{self.name}/{self.name}.py",)
raise (e) from None
dl_manager.manage_extracted_files()

if verify_infos:
Expand Down
8 changes: 4 additions & 4 deletions src/datasets/keyhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,10 @@ def __init__(self, hash_data):
class DuplicatedKeysError(Exception):
"""Raise an error when duplicate key found."""

def __init__(self, key):
self.prefix = "FAILURE TO GENERATE DATASET !"
self.err_msg = f"\nFound duplicate Key: {key}"
self.suffix = "\nKeys should be unique and deterministic in nature"
def __init__(self, key, duplicate_key_indices):
self.prefix = "Found multiple examples generated with the same key"
self.err_msg = f"\nThe following examples {', '.join(duplicate_key_indices)} have the key {key}"
self.suffix = "\nTo avoid duplicate keys, please fix the dataset script at "
super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")


Expand Down