Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

## [v0.8.5](https://github.com/allenai/OLMo-in-loop-evals/releases/tag/v0.8.5) - 2025-06-05

- Remove `sklearn` and `numpy` as dependencies. Manual implementation of F1 score.

## [v0.8.4](https://github.com/allenai/OLMo-in-loop-evals/releases/tag/v0.8.4) - 2025-06-05

- Add BOS token, when the BOS token exists in the tokenizer
Expand Down
2 changes: 0 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@ authors = [
requires-python = ">=3.9"
license = { file = "LICENSE" }
dependencies = [
"numpy<2.0",
"torch",
"torchmetrics",
"datasets",
"tokenizers",
"scikit-learn",
"cached-path",
"requests",
"packaging",
Expand Down
25 changes: 22 additions & 3 deletions src/olmo_eval/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score
from torchmetrics import Metric

from .util import all_gather_object
Expand Down Expand Up @@ -395,8 +394,10 @@ def compute(self) -> Dict[str, torch.Tensor]:
assert preds is not None
assert labels is not None
# for NLI tasks, continuations are yes, no, neither, so idx=0 assigned to pos label
score = f1_score(labels, preds, pos_label=0)
score_no_leading_space = f1_score(labels, preds_no_leading_space, pos_label=0)
score = self.custom_f1_score(labels, preds, pos_label=0)
score_no_leading_space = self.custom_f1_score(
labels, preds_no_leading_space, pos_label=0
)
return {
"f1_v1": torch.tensor(score),
"f1_v2": torch.tensor(score_no_leading_space),
Expand Down Expand Up @@ -432,3 +433,21 @@ def compute(self) -> Dict[str, torch.Tensor]:
),
"soft_log_v2": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
}

def custom_f1_score(self, y_true, y_pred, pos_label=1):
y_true = list(y_true)
y_pred = list(y_pred)
tp = sum((yt == pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
fp = sum((yt != pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
fn = sum((yt == pos_label) and (yp != pos_label) for yt, yp in zip(y_true, y_pred))

if tp + fp == 0 or tp + fn == 0:
return 0.0

precision = tp / (tp + fp)
recall = tp / (tp + fn)

if precision + recall == 0:
return 0.0

return 2 * precision * recall / (precision + recall)
2 changes: 1 addition & 1 deletion src/olmo_eval/version.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
_MAJOR = "0"
_MINOR = "8"
_PATCH = "4"
_PATCH = "5"
_SUFFIX = ""

VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
Expand Down