Add few-shot (3-shot) HumanEval gold BPB task

davidheineman · davidheineman · commit d3d244862b3a · 2025-05-19T13:51:28.000-07:00
diff --git a/src/olmo_eval/oe_eval_tasks/codex_humaneval/gold_bpb_3shot/config.json b/src/olmo_eval/oe_eval_tasks/codex_humaneval/gold_bpb_3shot/config.json
@@ -0,0 +1 @@
+{"task_name": "codex_humaneval", "task_hash": "b271b0f127ae71cf79a80d6463f0c877", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "codex_humaneval", "task_core": "codex_humaneval", "limit": null, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {"answer_prefix": ""}, "generation_kwargs": {"max_gen_toks": 512, "do_sample": false, "temperature": 0.0, "stop_sequences": [], "repeats": 1}, "metric_kwargs": {"pass_at_ks": [1]}, "native_id_field": "task_id", "fewshot_source": null, "dataset_path": "openai_humaneval", "dataset_name": null, "use_chat_format": false, "version": 0.1, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "codex_humaneval:3shot:bpb::none"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.107417345046997, "current_date": "2025-05-19 20:42:07 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}
diff --git a/src/olmo_eval/oe_eval_tasks/codex_humaneval/gold_bpb_3shot/requests.jsonl.gz b/src/olmo_eval/oe_eval_tasks/codex_humaneval/gold_bpb_3shot/requests.jsonl.gz
diff --git a/src/olmo_eval/tasks.py b/src/olmo_eval/tasks.py
@@ -2620,6 +2620,10 @@ def doc_to_label(self, doc) -> int:
         OEEvalTask,
         {"dataset_path": "codex_humaneval", "dataset_name": "gold_bpb_0shot", "metric_type": "bpb"},
     ),
+    "codex_humaneval_gold_bpb_3shot": (
+        OEEvalTask,
+        {"dataset_path": "codex_humaneval", "dataset_name": "gold_bpb_3shot", "metric_type": "bpb"},
+    ),
     "codex_mbpp_gold_bpb_0shot": (
         OEEvalTask,
         {"dataset_path": "codex_mbpp", "dataset_name": "gold_bpb_0shot", "metric_type": "bpb"},

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+{"task_name": "codex_humaneval", "task_hash": "b271b0f127ae71cf79a80d6463f0c877", "model_hash": "99914b932bd37a50b983c5e7c90ae93b", "model_config": {"model": null, "revision": null, "trust_remote_code": null, "max_length": 2048, "model_path": null, "model_type": "hf"}, "task_config": {"task_name": "codex_humaneval", "task_core": "codex_humaneval", "limit": null, "split": "test", "num_shots": 3, "fewshot_seed": 1234, "primary_metric": "bits_per_byte_corr", "random_subsample_seed": 1234, "context_kwargs": {"answer_prefix": ""}, "generation_kwargs": {"max_gen_toks": 512, "do_sample": false, "temperature": 0.0, "stop_sequences": [], "repeats": 1}, "metric_kwargs": {"pass_at_ks": [1]}, "native_id_field": "task_id", "fewshot_source": null, "dataset_path": "openai_humaneval", "dataset_name": null, "use_chat_format": false, "version": 0.1, "revision": null, "compute_gold_bpb": true, "external_eval": null, "custom_kwargs": null, "skip_model_judges": null, "model_max_length": null, "metadata": {"alias": "codex_humaneval:3shot:bpb::none"}}, "compute_config": {"batch_size": "4", "max_batch_size": 32, "output_dir": "workspace", "num_recorded_inputs": 3, "save_raw_requests": true, "recompute_metrics": false, "check_datalake": false, "autofetch_datalake": false, "push_datalake": false, "no_datalake": false}, "processing_time": 2.107417345046997, "current_date": "2025-05-19 20:42:07 UTC", "num_instances": 0, "beaker_info": {"BEAKER_NODE_ID": "01JR1D5PVN5HA1E2SX4FNFAZN4", "BEAKER_JOB_KIND": "session", "BEAKER_JOB_ID": "01JVGSX2MHV1Z5AY6TBD7F6HAS", "BEAKER_WORKLOAD_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ENVIRONMENT_ID": "01JVGSX2MHNQK7G4K6C93E8WZR", "BEAKER_ASSIGNED_CPU_COUNT": "127.5", "BEAKER_ASSIGNED_GPU_COUNT": "4", "BEAKER_NODE_HOSTNAME": "neptune-cs-aus-264.reviz.ai2.in"}}