diff --git a/metrics/accuracy/accuracy.py b/metrics/accuracy/accuracy.py
index c050338e331..1aae6f5e88d 100644
--- a/metrics/accuracy/accuracy.py
+++ b/metrics/accuracy/accuracy.py
@@ -83,5 +83,7 @@ def _info(self):
 
     def _compute(self, predictions, references, normalize=True, sample_weight=None):
         return {
-            "accuracy": accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight),
+            "accuracy": accuracy_score(
+                references, predictions, normalize=normalize, sample_weight=sample_weight
+            ).tolist(),
         }
diff --git a/metrics/f1/f1.py b/metrics/f1/f1.py
index d95f8c6553d..4abad5a9cda 100644
--- a/metrics/f1/f1.py
+++ b/metrics/f1/f1.py
@@ -106,5 +106,5 @@ def _compute(self, predictions, references, labels=None, pos_label=1, average="b
                 pos_label=pos_label,
                 average=average,
                 sample_weight=sample_weight,
-            ),
+            ).tolist(),
         }
diff --git a/metrics/glue/glue.py b/metrics/glue/glue.py
index 4a3d9eac84f..4de1e3a158d 100644
--- a/metrics/glue/glue.py
+++ b/metrics/glue/glue.py
@@ -81,12 +81,12 @@
 
 
 def simple_accuracy(preds, labels):
-    return (preds == labels).mean()
+    return (preds == labels).mean().tolist()
 
 
 def acc_and_f1(preds, labels):
     acc = simple_accuracy(preds, labels)
-    f1 = f1_score(y_true=labels, y_pred=preds)
+    f1 = f1_score(y_true=labels, y_pred=preds).tolist()
     return {
         "accuracy": acc,
         "f1": f1,
@@ -94,8 +94,8 @@ def acc_and_f1(preds, labels):
 
 
 def pearson_and_spearman(preds, labels):
-    pearson_corr = pearsonr(preds, labels)[0]
-    spearman_corr = spearmanr(preds, labels)[0]
+    pearson_corr = pearsonr(preds, labels)[0].tolist()
+    spearman_corr = spearmanr(preds, labels)[0].tolist()
     return {
         "pearson": pearson_corr,
         "spearmanr": spearman_corr,
diff --git a/metrics/indic_glue/indic_glue.py b/metrics/indic_glue/indic_glue.py
index 51309effbe1..989a521ca7a 100644
--- a/metrics/indic_glue/indic_glue.py
+++ b/metrics/indic_glue/indic_glue.py
@@ -74,12 +74,12 @@
 
 
 def simple_accuracy(preds, labels):
-    return (preds == labels).mean()
+    return (preds == labels).mean().tolist()
 
 
 def acc_and_f1(preds, labels):
     acc = simple_accuracy(preds, labels)
-    f1 = f1_score(y_true=labels, y_pred=preds)
+    f1 = f1_score(y_true=labels, y_pred=preds).tolist()
     return {
         "accuracy": acc,
         "f1": f1,
@@ -99,7 +99,7 @@ def precision_at_10(en_sentvecs, in_sentvecs):
     actual = np.array(range(n))
     preds = sim.argsort(axis=1)[:, :10]
     matches = np.any(preds == actual[:, None], axis=1)
-    return matches.mean()
+    return matches.mean().tolist()
 
 
 @datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
diff --git a/metrics/matthews_correlation/matthews_correlation.py b/metrics/matthews_correlation/matthews_correlation.py
index 212d395c99d..ae6250a0fbf 100644
--- a/metrics/matthews_correlation/matthews_correlation.py
+++ b/metrics/matthews_correlation/matthews_correlation.py
@@ -82,5 +82,5 @@ def _info(self):
 
     def _compute(self, predictions, references, sample_weight=None):
         return {
-            "matthews_correlation": matthews_corrcoef(references, predictions, sample_weight=sample_weight),
+            "matthews_correlation": matthews_corrcoef(references, predictions, sample_weight=sample_weight).tolist(),
         }
diff --git a/metrics/precision/precision.py b/metrics/precision/precision.py
index a13ead33ce3..fca5b0b21ce 100644
--- a/metrics/precision/precision.py
+++ b/metrics/precision/precision.py
@@ -108,5 +108,5 @@ def _compute(self, predictions, references, labels=None, pos_label=1, average="b
                 pos_label=pos_label,
                 average=average,
                 sample_weight=sample_weight,
-            ),
+            ).tolist(),
         }
diff --git a/metrics/recall/recall.py b/metrics/recall/recall.py
index 30c77a95464..8cc84fdea10 100644
--- a/metrics/recall/recall.py
+++ b/metrics/recall/recall.py
@@ -108,5 +108,5 @@ def _compute(self, predictions, references, labels=None, pos_label=1, average="b
                 pos_label=pos_label,
                 average=average,
                 sample_weight=sample_weight,
-            ),
+            ).tolist(),
         }
diff --git a/metrics/super_glue/super_glue.py b/metrics/super_glue/super_glue.py
index c80135d2bb5..7aff18a527c 100644
--- a/metrics/super_glue/super_glue.py
+++ b/metrics/super_glue/super_glue.py
@@ -107,12 +107,12 @@
 
 
 def simple_accuracy(preds, labels):
-    return (preds == labels).mean()
+    return (preds == labels).mean().tolist()
 
 
 def acc_and_f1(preds, labels, f1_avg="binary"):
     acc = simple_accuracy(preds, labels)
-    f1 = f1_score(y_true=labels, y_pred=preds, average=f1_avg)
+    f1 = f1_score(y_true=labels, y_pred=preds, average=f1_avg).tolist()
     return {
         "accuracy": acc,
         "f1": f1,
@@ -138,9 +138,9 @@ def evaluate_multirc(ids_preds, labels):
         f1s.append(f1)
         em = int(sum([p == l for p, l in preds_labels]) == len(preds_labels))
         ems.append(em)
-    f1_m = sum(f1s) / len(f1s)
+    f1_m = (sum(f1s) / len(f1s)).tolist()
     em = sum(ems) / len(ems)
-    f1_a = f1_score(y_true=labels, y_pred=[id_pred["prediction"] for id_pred in ids_preds])
+    f1_a = f1_score(y_true=labels, y_pred=[id_pred["prediction"] for id_pred in ids_preds]).tolist()
    return {"exact_match": em, "f1_m": f1_m, "f1_a": f1_a}
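
All of these hunks apply the same fix: the value produced by the metric computation (a NumPy scalar or array) gets `.tolist()` called on it, so the result dict holds a native Python `float` or `list` instead of a NumPy type. The standalone sketch below is illustrative only, not part of the patch: the arrays are made up, and exact return types can vary with the scikit-learn version, but it shows the conversion the diff relies on.

import numpy as np
from sklearn.metrics import f1_score

preds = np.array([0, 1, 1, 0])
refs = np.array([0, 1, 0, 0])

# A 0-d NumPy reduction such as `.mean()` yields a NumPy scalar (numpy.float64);
# calling `.tolist()` on a NumPy scalar returns a plain Python float.
acc = (preds == refs).mean()
print(type(acc), type(acc.tolist()))              # <class 'numpy.float64'> <class 'float'>

# With `average=None`, f1_score returns a per-class numpy.ndarray;
# `.tolist()` turns it into a plain Python list of floats.
per_class = f1_score(refs, preds, average=None)
print(type(per_class), type(per_class.tolist()))  # <class 'numpy.ndarray'> <class 'list'>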