allenai · epwalsh · May 19, 2025 · May 17, 2025 · May 18, 2025 · May 17, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - The `BeakerCallback` will save the config and Python requirements to the results dataset.
 - Added `from_file` method to `Config` class.
 - Added in-loop evals for OLMES basic skills eval
+- Added fast MCQA in-loop evals and translated MBPP tasks.
 
 ### Changed
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
     "omegaconf",
     "safetensors",
     "importlib_resources",
-    "ai2-olmo-eval==0.7.2",
+    "ai2-olmo-eval==0.8.1",
 ]
 
 [project.urls]

diff --git a/src/olmo_core/train/config.py b/src/olmo_core/train/config.py
@@ -112,8 +112,12 @@ def with_recommended_evals(
             "minerva_math_precalculus_gold_bpb_0shot",
             "codex_humaneval_gold_bpb_0shot",
             "codex_mbpp_gold_bpb_0shot",
+            # MT MBPP tasks
+            "mt_mbpp_rust_gold_bpb_3shot",
+            "mt_mbpp_java_gold_bpb_3shot",
+            "mt_mbpp_cpp_gold_bpb_3shot",
             # Sanity check for MCQA ability
-            "copycolors_10way",
+            "copycolors_10way_fast",
             # Basic Skills
             "basic_skills_arithmetic_rc_5shot",
             "basic_skills_coding_rc_5shot",
@@ -126,25 +130,25 @@ def with_recommended_evals(
         # For training runs where we expect the model to acquire MC
         tasks_large_compute = [
             # OLMES Core 9(-ish) MC
-            "arc_challenge_test_mc_5shot",
-            "arc_easy_test_mc_5shot",
+            "arc_challenge_test_mc_5shot_fast",
+            "arc_easy_test_mc_5shot_fast",
             "hellaswag_rc_5shot",  # 1K subset of HellaSwag
-            "csqa_val_mc_5shot",
-            "piqa_val_mc_5shot",
-            "socialiqa_val_mc_5shot",
+            "csqa_val_mc_5shot_fast",
+            "piqa_val_mc_5shot_fast",
+            "socialiqa_val_mc_5shot_fast",
             "winogrande_val_rc_5shot",
             # Too noisy to be worth tracking
-            # "boolq_val_mc_5shot",
-            # "openbookqa_test_mc_5shot",
+            # "boolq_val_mc_5shot_fast",
+            # "openbookqa_test_mc_5shot_fast",
             # MMLU MC BPB
-            "mmlu_stem_val_mc_5shot",
-            "mmlu_humanities_val_mc_5shot",
-            "mmlu_social_sciences_val_mc_5shot",
-            "mmlu_other_val_mc_5shot",
-            "mmlu_stem_test_mc_5shot",
-            "mmlu_humanities_test_mc_5shot",
-            "mmlu_social_sciences_test_mc_5shot",
-            "mmlu_other_test_mc_5shot",
+            "mmlu_stem_val_mc_5shot_fast",
+            "mmlu_humanities_val_mc_5shot_fast",
+            "mmlu_social_sciences_val_mc_5shot_fast",
+            "mmlu_other_val_mc_5shot_fast",
+            "mmlu_stem_test_mc_5shot_fast",
+            "mmlu_humanities_test_mc_5shot_fast",
+            "mmlu_social_sciences_test_mc_5shot_fast",
+            "mmlu_other_test_mc_5shot_fast",
             # Gen tasks BPB
             "gsm8k_gold_bpb_5shot",
             "minerva_math_algebra_gold_bpb_0shot",
@@ -156,8 +160,12 @@ def with_recommended_evals(
             "minerva_math_precalculus_gold_bpb_0shot",
             "codex_humaneval_gold_bpb_0shot",
             "codex_mbpp_gold_bpb_0shot",
+            # MT MBPP tasks
+            "mt_mbpp_rust_gold_bpb_3shot",
+            "mt_mbpp_java_gold_bpb_3shot",
+            "mt_mbpp_cpp_gold_bpb_3shot",
             # Sanity check for MCQA ability
-            "copycolors_10way",
+            "copycolors_10way_fast",
             # Basic Skills
             "basic_skills_arithmetic_rc_5shot",
             "basic_skills_coding_rc_5shot",