diff --git a/CHANGELOG.md b/CHANGELOG.md index 3ee568291..94d9d46ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - The `BeakerCallback` will save the config and Python requirements to the results dataset. - Added `from_file` method to `Config` class. - Added in-loop evals for OLMES basic skills eval +- Added in-loop fast MCQA for in-loop evals and translated MBPP tasks ### Changed diff --git a/pyproject.toml b/pyproject.toml index e940cbf08..5a50700cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "omegaconf", "safetensors", "importlib_resources", - "ai2-olmo-eval==0.7.2", + "ai2-olmo-eval==0.8.1", ] [project.urls] diff --git a/src/olmo_core/train/config.py b/src/olmo_core/train/config.py index 84e10a4cb..b52f2ad2c 100644 --- a/src/olmo_core/train/config.py +++ b/src/olmo_core/train/config.py @@ -112,8 +112,12 @@ def with_recommended_evals( "minerva_math_precalculus_gold_bpb_0shot", "codex_humaneval_gold_bpb_0shot", "codex_mbpp_gold_bpb_0shot", + # MT MBPP tasks + "mt_mbpp_rust_gold_bpb_3shot", + "mt_mbpp_java_gold_bpb_3shot", + "mt_mbpp_cpp_gold_bpb_3shot", # Sanity check for MCQA ability - "copycolors_10way", + "copycolors_10way_fast", # Basic Skills "basic_skills_arithmetic_rc_5shot", "basic_skills_coding_rc_5shot", @@ -126,25 +130,25 @@ def with_recommended_evals( # For training runs where we expect the model to acquire MC tasks_large_compute = [ # OLMES Core 9(-ish) MC - "arc_challenge_test_mc_5shot", - "arc_easy_test_mc_5shot", + "arc_challenge_test_mc_5shot_fast", + "arc_easy_test_mc_5shot_fast", "hellaswag_rc_5shot", # 1K subset of HellaSwag - "csqa_val_mc_5shot", - "piqa_val_mc_5shot", - "socialiqa_val_mc_5shot", + "csqa_val_mc_5shot_fast", + "piqa_val_mc_5shot_fast", + "socialiqa_val_mc_5shot_fast", "winogrande_val_rc_5shot", # Too noisy to be worth tracking - # "boolq_val_mc_5shot", - # "openbookqa_test_mc_5shot", + # "boolq_val_mc_5shot_fast", + # "openbookqa_test_mc_5shot_fast", # MMLU MC BPB - "mmlu_stem_val_mc_5shot", - "mmlu_humanities_val_mc_5shot", - "mmlu_social_sciences_val_mc_5shot", - "mmlu_other_val_mc_5shot", - "mmlu_stem_test_mc_5shot", - "mmlu_humanities_test_mc_5shot", - "mmlu_social_sciences_test_mc_5shot", - "mmlu_other_test_mc_5shot", + "mmlu_stem_val_mc_5shot_fast", + "mmlu_humanities_val_mc_5shot_fast", + "mmlu_social_sciences_val_mc_5shot_fast", + "mmlu_other_val_mc_5shot_fast", + "mmlu_stem_test_mc_5shot_fast", + "mmlu_humanities_test_mc_5shot_fast", + "mmlu_social_sciences_test_mc_5shot_fast", + "mmlu_other_test_mc_5shot_fast", # Gen tasks BPB "gsm8k_gold_bpb_5shot", "minerva_math_algebra_gold_bpb_0shot", @@ -156,8 +160,12 @@ def with_recommended_evals( "minerva_math_precalculus_gold_bpb_0shot", "codex_humaneval_gold_bpb_0shot", "codex_mbpp_gold_bpb_0shot", + # MT MBPP tasks + "mt_mbpp_rust_gold_bpb_3shot", + "mt_mbpp_java_gold_bpb_3shot", + "mt_mbpp_cpp_gold_bpb_3shot", # Sanity check for MCQA ability - "copycolors_10way", + "copycolors_10way_fast", # Basic Skills "basic_skills_arithmetic_rc_5shot", "basic_skills_coding_rc_5shot",