allenai · davidheineman · Jul 19, 2025 · Jul 19, 2025 · Jul 19, 2025 · Jul 19, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+## [v0.8.5](https://github.com/allenai/OLMo-in-loop-evals/releases/tag/v0.8.5) - 2025-06-05
+
+- Remove `sklearn` and `numpy` as dependencies. The F1 score is now computed by a manual implementation.
+
 ## [v0.8.4](https://github.com/allenai/OLMo-in-loop-evals/releases/tag/v0.8.4) - 2025-06-05
 
 - Add BOS token, when the BOS token exists in the tokenizer

diff --git a/pyproject.toml b/pyproject.toml
@@ -13,12 +13,10 @@ authors = [
 requires-python = ">=3.9"
 license = { file = "LICENSE" }
 dependencies = [
-    "numpy<2.0",
     "torch",
     "torchmetrics",
     "datasets",
     "tokenizers",
-    "scikit-learn",
     "cached-path",
     "requests",
     "packaging",

diff --git a/src/olmo_eval/metrics.py b/src/olmo_eval/metrics.py
@@ -3,7 +3,6 @@
 
 import torch
 import torch.nn.functional as F
-from sklearn.metrics import f1_score
 from torchmetrics import Metric
 
 from .util import all_gather_object
@@ -395,8 +394,10 @@ def compute(self) -> Dict[str, torch.Tensor]:
             assert preds is not None
             assert labels is not None
             # for NLI tasks, continuations are yes, no, neither, so idx=0 assigned to pos label
-            score = f1_score(labels, preds, pos_label=0)
-            score_no_leading_space = f1_score(labels, preds_no_leading_space, pos_label=0)
+            score = self.custom_f1_score(labels, preds, pos_label=0)
+            score_no_leading_space = self.custom_f1_score(
+                labels, preds_no_leading_space, pos_label=0
+            )
             return {
                 "f1_v1": torch.tensor(score),
                 "f1_v2": torch.tensor(score_no_leading_space),
@@ -432,3 +433,21 @@ def compute(self) -> Dict[str, torch.Tensor]:
                 ),
                 "soft_log_v2": torch.tensor(sum(soft_log_score) / len(soft_log_score)),
             }
+
+    def custom_f1_score(self, y_true, y_pred, pos_label=1):  # binary F1 score treating `pos_label` as the positive class
+        y_true = list(y_true)  # copy to lists so each of the three zip passes below can re-iterate the inputs
+        y_pred = list(y_pred)
+        tp = sum((yt == pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
+        fp = sum((yt != pos_label) and (yp == pos_label) for yt, yp in zip(y_true, y_pred))
+        fn = sum((yt == pos_label) and (yp != pos_label) for yt, yp in zip(y_true, y_pred))
+        # No predicted positives (tp + fp == 0) or no true positives in the labels (tp + fn == 0): return 0.0 instead of dividing by zero.
+        if tp + fp == 0 or tp + fn == 0:
+            return 0.0
+        # precision = tp / predicted positives; recall = tp / actual positives
+        precision = tp / (tp + fp)
+        recall = tp / (tp + fn)
+        # tp == 0 makes precision and recall both zero; return 0.0 rather than dividing by zero below
+        if precision + recall == 0:
+            return 0.0
+        # F1 is the harmonic mean of precision and recall
+        return 2 * precision * recall / (precision + recall)
diff --git a/src/olmo_eval/version.py b/src/olmo_eval/version.py
@@ -1,6 +1,6 @@
 _MAJOR = "0"
 _MINOR = "8"
-_PATCH = "4"
+_PATCH = "5"
 _SUFFIX = ""
 
 VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)