@@ -3,67 +3,48 @@
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml

-* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
-* export LM_EVAL_TP_SIZE=4
-* pytest -s test_lm_eval_correctness.py
+pytest -s -v test_lm_eval_correctness.py \
+    --config-list-file=configs/models-small.txt \
+    --tp-size=1
 """

-import os
-from pathlib import Path
-
 import lm_eval
-import numpy
-import pytest
+import numpy as np
 import yaml

 RTOL = 0.08
-TEST_DATA_FILE = os.environ.get(
-    "LM_EVAL_TEST_DATA_FILE",
-    ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")
-
-TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


-def launch_lm_eval(eval_config):
+def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get('trust_remote_code', False)
-
     model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={TP_SIZE}," \
+                 f"tensor_parallel_size={tp_size}," \
+                 f"enforce_eager=true," \
                  f"add_bos_token=true," \
                  f"trust_remote_code={trust_remote_code}"
-
     results = lm_eval.simple_evaluate(
         model="vllm",
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
         batch_size="auto")
-
     return results


-def test_lm_eval_correctness():
-    eval_config = yaml.safe_load(
-        Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
-
-    if eval_config[
-            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
-        pytest.skip("FBGEMM is currently failing on main.")
+def test_lm_eval_correctness_param(config_filename, tp_size):
+    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))

-    # Launch eval requests.
-    results = launch_lm_eval(eval_config)
+    results = launch_lm_eval(eval_config, tp_size)

-    # Confirm scores match ground truth.
     success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
             print(f'{task["name"]} | {metric["name"]}: '
                   f'ground_truth={ground_truth} | measured={measured_value}')
-            success = success and numpy.isclose(
+            success = success and np.isclose(
                 ground_truth, measured_value, rtol=RTOL)

-    # Assert at the end, print all scores even on failure for debugging.
     assert success
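
Note that the rewritten test takes config_filename and tp_size as pytest parameters instead of reading LM_EVAL_TEST_DATA_FILE/LM_EVAL_TP_SIZE from the environment, so a conftest.py has to register the --config-list-file and --tp-size options and parametrize the test. That conftest is not part of this diff; the following is a minimal sketch of how the wiring could look (the option names match the docstring above, but the path handling and defaults here are assumptions, not the repository's exact implementation):

# conftest.py -- a minimal sketch, not the file from this change; the real
# option wiring in the repo may differ.
from pathlib import Path


def pytest_addoption(parser):
    parser.addoption("--config-list-file",
                     action="store",
                     help="Text file listing one config YAML path per line.")
    parser.addoption("--tp-size",
                     action="store",
                     default="1",
                     help="Tensor parallel size passed through to vLLM.")


def pytest_generate_tests(metafunc):
    if "config_filename" in metafunc.fixturenames:
        list_file = Path(metafunc.config.getoption("--config-list-file"))
        # Resolve each listed config relative to the list file's directory,
        # skipping blank lines and comments.
        configs = [
            list_file.parent / line.strip()
            for line in list_file.read_text(encoding="utf-8").splitlines()
            if line.strip() and not line.strip().startswith("#")
        ]
        metafunc.parametrize("config_filename", configs)
    if "tp_size" in metafunc.fixturenames:
        metafunc.parametrize("tp_size",
                             [metafunc.config.getoption("--tp-size")])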
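
For reference, each per-model YAML under configs/ has to provide every key the test reads: model_name, optional trust_remote_code, num_fewshot, limit, and a tasks list whose metrics carry the offline HF baselines. A rough illustration of the structure after yaml.safe_load(), with placeholder task/metric names and numbers rather than real baselines:

# Shape of eval_config as consumed by the test above -- the names and values
# below are placeholders for illustration, not measured baselines.
example_eval_config = {
    "model_name": "meta-llama/Meta-Llama-3-8B-Instruct",
    "trust_remote_code": False,   # optional; the test defaults it to False
    "num_fewshot": 5,
    "limit": 250,                 # samples per task passed to lm_eval
    "tasks": [{
        "name": "gsm8k",
        "metrics": [{
            "name": "exact_match,strict-match",
            "value": 0.75,        # offline HF baseline, compared within RTOL
        }],
    }],
}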