Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3097,6 +3097,21 @@ def add_item(self, item: dict, new_fingerprint: str):
fingerprint=new_fingerprint,
)

def align_labels_with_mapping(self, label2id: Dict, label_column: str = "labels") -> "Dataset":
    """Remap the class label ids of ``label_column`` so they follow ``label2id``.

    Label names are compared in lowercase, so a mapping keyed by
    ``"ENTAILMENT"`` matches a dataset label named ``"entailment"``.

    Args:
        label2id: Mapping from label name to the integer id that name should
            have in the returned dataset.
        label_column: Name of the column holding the class labels to remap.

    Returns:
        The result of ``self.map`` with every id in ``label_column`` replaced
        by the id given in ``label2id``, and with the column's feature
        replaced by a ``ClassLabel`` built from the (lowercased) mapping names.

    Raises:
        KeyError: If a label name found in the dataset has no (lowercased)
            counterpart in ``label2id``.
    """
    features = self.features.copy()
    int2str_function = features[label_column].int2str
    # Some label mappings use uppercase label names so we lowercase them
    label2id = {k.lower(): v for k, v in label2id.items()}
    # Order the names by their target id: the rows below are rewritten to the
    # ids stored in ``label2id``, so the feature's name list has to follow the
    # same id order rather than the mapping's insertion order, otherwise
    # decoding an id through the new feature returns the wrong name.
    label_names = sorted(label2id, key=label2id.get)
    features[label_column] = ClassLabel(num_classes=len(label_names), names=label_names)

    def process_label_ids(batch):
        # Decode the current ids to names, then re-encode them with the new mapping.
        dset_label_names = [int2str_function(label_id).lower() for label_id in batch[label_column]]
        batch[label_column] = [label2id[label_name] for label_name in dset_label_names]
        return batch

    return self.map(process_label_ids, features=features, batched=True)


def concatenate_datasets(
dsets: List[Dataset],
Expand Down
15 changes: 15 additions & 0 deletions tests/test_arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2202,6 +2202,21 @@ def test_task_templates_empty_after_preparation(self, in_memory):
with dset.prepare_for_task(task="text-classification") as dset:
self.assertIsNone(dset.info.task_templates)

def test_align_labels_with_mapping(self, in_memory):
    """Check that label ids are rewritten to follow an upper-cased ``label2id`` mapping."""
    label_feature = ClassLabel(num_classes=3, names=["entailment", "neutral", "contradiction"])
    features = Features({"input_text": Value("string"), "input_labels": label_feature})
    data = {
        "input_text": ["a", "a", "b", "b", "c", "c"],
        "input_labels": [0, 0, 1, 1, 2, 2],
    }
    # The mapping keys are upper-case while the dataset's label names are lower-case.
    label2id = {"CONTRADICTION": 0, "ENTAILMENT": 2, "NEUTRAL": 1}
    expected_labels = [2, 2, 1, 1, 0, 0]
    with tempfile.TemporaryDirectory() as tmp_dir:
        with Dataset.from_dict(data, features=features) as base_dset:
            with self._to(in_memory, tmp_dir, base_dset) as dset:
                with dset.align_labels_with_mapping(label2id, "input_labels") as aligned_dset:
                    self.assertListEqual(expected_labels, aligned_dset["input_labels"])


class MiscellaneousDatasetTest(TestCase):
def test_from_pandas(self):
Expand Down