huggingface · lhoestq · Jun 17, 2021 · Jun 8, 2021 · Jun 10, 2021 · Jun 10, 2021
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
@@ -3097,6 +3097,40 @@ def add_item(self, item: dict, new_fingerprint: str):
             fingerprint=new_fingerprint,
         )
 
+    def align_labels_with_mapping(self, label2id: Dict, label_column: str = "labels") -> "Dataset":
+        """Remap the integer labels of ``label_column`` so that they follow ``label2id``.
+
+        Label names are matched case-insensitively: both the keys of ``label2id`` and the
+        names decoded from the column are lowercased before the lookup.
+
+        Args:
+            label2id (:obj:`dict`): Mapping from label name to the label id to use.
+            label_column (:obj:`str`, defaults to ``"labels"``): Column holding the labels to
+                align. Its feature must expose ``int2str``.
+
+        Returns:
+            :class:`Dataset`: The result of ``map`` with the remapped ids and a new
+            ``ClassLabel`` feature for ``label_column``.
+
+        Raises:
+            KeyError: If a label name of the dataset has no lowercased entry in ``label2id``.
+        """
+        int2str_function = self.features[label_column].int2str
+        # Some label mappings use uppercase label names so we lowercase them
+        label2id = {name.lower(): label_id for name, label_id in label2id.items()}
+        # ClassLabel assigns ids by position in ``names``, so the names must be ordered by target id;
+        # otherwise the new feature would decode the remapped ids to the wrong names
+        label_names = sorted(label2id, key=label2id.get)
+        features = self.features.copy()
+        features[label_column] = ClassLabel(num_classes=len(label_names), names=label_names)
+
+        def process_label_ids(batch):
+            dset_label_names = [int2str_function(label_id).lower() for label_id in batch[label_column]]
+            batch[label_column] = [label2id[label_name] for label_name in dset_label_names]
+            return batch
+
+        return self.map(process_label_ids, features=features, batched=True)
+
 
 def concatenate_datasets(
     dsets: List[Dataset],

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
@@ -2202,6 +2202,21 @@ def test_task_templates_empty_after_preparation(self, in_memory):
                 with dset.prepare_for_task(task="text-classification") as dset:
                     self.assertIsNone(dset.info.task_templates)
 
+    def test_align_labels_with_mapping(self, in_memory):
+        """Label ids must be remapped to follow an upper-cased ``label2id`` mapping."""
+        label_feature = ClassLabel(num_classes=3, names=["entailment", "neutral", "contradiction"])
+        features = Features({"input_text": Value("string"), "input_labels": label_feature})
+        data = {"input_text": ["a", "a", "b", "b", "c", "c"], "input_labels": [0, 0, 1, 1, 2, 2]}
+        # Target ids differ from the dataset's own ones for "entailment" (0 -> 2) and "contradiction" (2 -> 0)
+        label2id = {"CONTRADICTION": 0, "ENTAILMENT": 2, "NEUTRAL": 1}
+        expected_labels = [2, 2, 1, 1, 0, 0]
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            with Dataset.from_dict(data, features=features) as source_dset:
+                with self._to(in_memory, tmp_dir, source_dset) as dset:
+                    with dset.align_labels_with_mapping(label2id, "input_labels") as aligned_dset:
+                        aligned_labels = aligned_dset["input_labels"]
+                        self.assertListEqual(expected_labels, aligned_labels)
+
 
 class MiscellaneousDatasetTest(TestCase):
     def test_from_pandas(self):