Commit 8910eda

Make DuplicatedKeysError more user friendly [For Issue #2556] (#4545)
* Make DuplicatedKeysError more user friendly. DuplicatedKeysError did not provide any information about the examples that share the same key; this information is very helpful for debugging a dataset generator script.
* Remove the unused hkey_index variable.
* Update the dataset path in DuplicatedKeysError. The DuplicatedKeysError class now always expects duplicate_key_indices, and the old error message is removed. The path to the dataset in the error message is built from the dataset name to make it more user friendly.
* Update the index calculation with _num_examples. The previous index calculation did not produce absolute indices. The error message used str.replace; it was changed to a simple string append.
* Fix lint error.
* Apply suggestions from code review.

Co-authored-by: Quentin Lhoest <[email protected]>
1 parent b8363e0 · commit 8910eda
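
For context, the change swaps the old three-line message for one that names the offending example indices. As reconstructed from the keyhash.py diff below (the key, indices, and dataset name here are made-up illustration values), the messages compare as follows.

Before:

FAILURE TO GENERATE DATASET !
Found duplicate Key: some_key
Keys should be unique and deterministic in nature

After:

Found multiple examples generated with the same key
The examples at index 100, 102, 104 have the key some_key
To avoid duplicate keys, please fix the dataset script my_dataset.py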

File tree: 3 files changed, +25 −6 lines changed

src/datasets/arrow_writer.py

Lines changed: 7 additions & 1 deletion

@@ -456,7 +456,13 @@ def check_duplicate_keys(self):
         tmp_record = set()
         for hash, key in self.hkey_record:
             if hash in tmp_record:
-                raise DuplicatedKeysError(key)
+                duplicate_key_indices = [
+                    str(self._num_examples + index)
+                    for index, (duplicate_hash, _) in enumerate(self.hkey_record)
+                    if duplicate_hash == hash
+                ]
+
+                raise DuplicatedKeysError(key, duplicate_key_indices)
             else:
                 tmp_record.add(hash)
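
The key detail in this hunk is the offset: the positions in hkey_record are relative to the current batch of examples, so self._num_examples is added to turn them into absolute indices over the whole split (this is what the "index calculation with _num_examples" item in the commit message refers to). Below is a minimal, self-contained sketch of that computation; hkey_record and num_examples are plain stand-ins for the writer's internal state, not the real ArrowWriter API.

# Sketch of the duplicate-index computation from check_duplicate_keys.
# hkey_record holds (hash, key) pairs for the current batch; num_examples
# counts the examples already written by earlier batches (stand-in values).
hkey_record = [("h1", "a"), ("h2", "b"), ("h1", "a"), ("h3", "c"), ("h1", "a")]
num_examples = 100

tmp_record = set()
for hash_, key in hkey_record:
    if hash_ in tmp_record:
        # Offsetting by num_examples turns batch-relative positions into
        # absolute indices over the whole split.
        duplicate_key_indices = [
            str(num_examples + index)
            for index, (duplicate_hash, _) in enumerate(hkey_record)
            if duplicate_hash == hash_
        ]
        print(f"Duplicate key {key!r} at indices {', '.join(duplicate_key_indices)}")
        break
    tmp_record.add(hash_)

# Prints: Duplicate key 'a' at indices 100, 102, 104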

src/datasets/builder.py

Lines changed: 8 additions & 1 deletion

@@ -49,6 +49,7 @@
 from .fingerprint import Hasher
 from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
 from .iterable_dataset import ExamplesIterable, IterableDataset, _generate_examples_from_tables_wrapper
+from .keyhash import DuplicatedKeysError
 from .naming import camelcase_to_snakecase
 from .splits import Split, SplitDict, SplitGenerator
 from .streaming import extend_dataset_builder_for_streaming

@@ -798,7 +799,13 @@ def _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs
                     + "\nOriginal error:\n"
                     + str(e)
                 ) from None
-
+            # If check_duplicates is set to True , then except DuplicatedKeysError
+            except DuplicatedKeysError as e:
+                raise DuplicatedKeysError(
+                    e.key,
+                    e.duplicate_key_indices,
+                    fix_msg=f"To avoid duplicate keys, please fix the dataset script {self.name}.py",
+                ) from None
         dl_manager.manage_extracted_files()

         if verify_infos:
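
The second hunk catches the writer's DuplicatedKeysError and re-raises it with the same key and indices, attaching a dataset-specific fix_msg that points at the script to fix. A hedged sketch of that catch-and-re-raise pattern, assuming a datasets version that already contains this commit; generate_examples and prepare_split are hypothetical stand-ins for the builder internals:

from datasets.keyhash import DuplicatedKeysError

def generate_examples():
    # Hypothetical stand-in for the real split preparation step.
    raise DuplicatedKeysError("some_key", ["100", "102", "104"])

def prepare_split(dataset_name):
    try:
        generate_examples()
    except DuplicatedKeysError as e:
        # Re-raise with the same key and indices, adding a pointer to the
        # dataset script; `from None` suppresses the chained traceback so
        # only the enriched error reaches the user.
        raise DuplicatedKeysError(
            e.key,
            e.duplicate_key_indices,
            fix_msg=f"To avoid duplicate keys, please fix the dataset script {dataset_name}.py",
        ) from None

try:
    prepare_split("my_dataset")
except DuplicatedKeysError as e:
    print(e)
# Found multiple examples generated with the same key
# The examples at index 100, 102, 104 have the key some_key
# To avoid duplicate keys, please fix the dataset script my_dataset.py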

src/datasets/keyhash.py

Lines changed: 10 additions & 4 deletions

@@ -70,10 +70,16 @@ def __init__(self, hash_data):
 class DuplicatedKeysError(Exception):
     """Raise an error when duplicate key found."""

-    def __init__(self, key):
-        self.prefix = "FAILURE TO GENERATE DATASET !"
-        self.err_msg = f"\nFound duplicate Key: {key}"
-        self.suffix = "\nKeys should be unique and deterministic in nature"
+    def __init__(self, key, duplicate_key_indices, fix_msg=""):
+        self.key = key
+        self.duplicate_key_indices = duplicate_key_indices
+        self.fix_msg = fix_msg
+        self.prefix = "Found multiple examples generated with the same key"
+        if len(duplicate_key_indices) <= 20:
+            self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices)} have the key {key}"
+        else:
+            self.err_msg = f"\nThe examples at index {', '.join(duplicate_key_indices[:20])}... ({len(duplicate_key_indices) - 20} more) have the key {key}"
+        self.suffix = "\n" + fix_msg if fix_msg else ""
         super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")
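
One subtlety worth demonstrating is the cap in the branch above: once more than 20 duplicate indices are collected, only the first 20 are listed and the rest are summarized as a count. A small usage sketch with made-up values:

from datasets.keyhash import DuplicatedKeysError

# 25 duplicate indices: the message lists the first 20, then a count.
print(DuplicatedKeysError("some_key", [str(i) for i in range(25)]))
# Found multiple examples generated with the same key
# The examples at index 0, 1, 2, ..., 19... (5 more) have the key some_key
#   (this comment elides indices 3-18; the real message lists all 20)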
