Skip to content

Commit ccba4f5

Browse files
committed
Update Dataset Path in DuplicatedKeysError
The DuplicatedKeysError class was updated to always accept duplicate_key_indices, and the old error message was removed. The path to the dataset in the error message now uses the dataset name, to make it more user-friendly.
1 parent 49d4561 commit ccba4f5

File tree

2 files changed

+10
-12
lines changed

2 files changed

+10
-12
lines changed

src/datasets/builder.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
from .fingerprint import Hasher
5050
from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
5151
from .iterable_dataset import ExamplesIterable, IterableDataset, _generate_examples_from_tables_wrapper
52+
from .keyhash import DuplicatedKeysError
5253
from .naming import camelcase_to_snakecase
5354
from .splits import Split, SplitDict, SplitGenerator
5455
from .streaming import extend_dataset_builder_for_streaming
@@ -798,7 +799,10 @@ def _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs
798799
+ "\nOriginal error:\n"
799800
+ str(e)
800801
) from None
801-
802+
# If check_duplicates is set to True, catch DuplicatedKeysError and rewrite its placeholder path
803+
except DuplicatedKeysError as e:
804+
e.args = (e.args[0].replace("<Path to Dataset>", f"datasets/{self.name}/{self.name}.py"),)
805+
raise (e) from None
802806
dl_manager.manage_extracted_files()
803807

804808
if verify_infos:

src/datasets/keyhash.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -70,17 +70,11 @@ def __init__(self, hash_data):
7070
class DuplicatedKeysError(Exception):
    """Raised when multiple generated examples share the same key.

    Args:
        key: The key that was produced by more than one example.
        duplicate_key_indices: String identifiers (e.g. example indices) of
            the examples that share ``key``; they are joined into the
            error message.
    """

    def __init__(self, key, duplicate_key_indices):
        # Three-part message; "<Path to Dataset>" is a placeholder that the
        # caller (builder.py) replaces with the actual dataset script path.
        # NOTE: plain string here — the original used an f-string with no
        # placeholders (ruff F541), which is a no-op prefix.
        self.prefix = "Found multiple examples generated with the same key"
        self.err_msg = f"\nThe following examples {', '.join(duplicate_key_indices)} have the key {key}"
        self.suffix = "\nPlease fix the dataset script at <Path to Dataset> to avoid duplicate keys"
        super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")
8478

8579

8680
class KeyHasher:

0 commit comments

Comments
 (0)