
Commit 153c77b

Author: andy-neuma (committed)
add missing lm-eval file
1 parent 4e91c9d commit 153c77b

File tree: 1 file changed (+150, -0 lines)

Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,150 @@
import itertools
import os
from pathlib import Path
from typing import TYPE_CHECKING

import nltk
import numpy
import pandas as pd
import pytest
import yaml

if TYPE_CHECKING:
    import lm_eval as lm_eval_t

# requires a particular lm-evaluation-harness
# pip install lm_eval==0.4.3
lm_eval: "lm_eval_t" = pytest.importorskip("lm_eval",
                                           reason="lm_eval required")

MAX_MODEL_LEN = 4096
RTOL = 0.040
TEST_DATA_PATH = os.environ.get(
    "LM_EVAL_TEST_DATA_FILE",
    "../neuralmagic/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml")
# just show the test data file from the `neuralmagic/lm-eval-configs/models`
# directory. this could be a `model.yaml`, or a `leaderboard/model.yaml`
TEST_DATA_FILE = str(Path(TEST_DATA_PATH)).replace(
    str(Path.cwd() / "../neuralmagic/lm-eval-configs/models"), "")


def launch_lm_eval(eval_config, tp_size):
    model_args = {
        "pretrained": eval_config['model_name'],
    }
    eval_config_model_args = eval_config.get('model_args')
    if eval_config_model_args:
        model_args.update(eval_config_model_args)

    model_backend = eval_config.get("backend", "vllm")

    if model_backend == "vllm":
        model_args.update({
            "tensor_parallel_size": tp_size,
            "distributed_executor_backend": "ray",
            "max_model_len": MAX_MODEL_LEN
        })

    evaluate_args = {
        "model": model_backend,
        "model_args": ",".join([f"{k}={v}" for k, v in model_args.items()]),
        "tasks": [task["name"] for task in eval_config["tasks"]],
        "num_fewshot": eval_config["num_fewshot"],
        "batch_size": "auto"
    }
    if "limit" in eval_config:
        evaluate_args["limit"] = eval_config["limit"]
    if "fewshot_as_multiturn" in eval_config:
        evaluate_args["fewshot_as_multiturn"] = eval_config[
            "fewshot_as_multiturn"]
    if "apply_chat_template" in eval_config:
        evaluate_args["apply_chat_template"] = eval_config[
            "apply_chat_template"]

    simple_eval_args = ['{}={}'.format(k, v) for k, v in evaluate_args.items()]
    print(f"lm_eval.simple_evaluate({', '.join(simple_eval_args)})")
    results = lm_eval.simple_evaluate(**evaluate_args)

    return results


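# For illustration (hypothetical values, not from the repo): with a config
# such as {"model_name": "meta-llama/Meta-Llama-3-8B-Instruct",
# "num_fewshot": 5, "tasks": [{"name": "gsm8k", ...}]} and tp_size=1, the
# call assembled above would be roughly:
#
#   lm_eval.simple_evaluate(
#       model="vllm",
#       model_args="pretrained=meta-llama/Meta-Llama-3-8B-Instruct,"
#                  "tensor_parallel_size=1,distributed_executor_backend=ray,"
#                  "max_model_len=4096",
#       tasks=["gsm8k"],
#       num_fewshot=5,
#       batch_size="auto")
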
# pass the TEST_DATA_FILE in as a parameter so that the results
# are uniquely reported to TestMo
@pytest.mark.parametrize("test_data_file", [TEST_DATA_FILE])
def test_lm_eval_correctness(num_gpus_available, test_data_file):
    eval_config = yaml.safe_load(
        Path(TEST_DATA_PATH).read_text(encoding="utf-8"))
    eval_config_tasks = {
        t['name']: {m['name']: m['value']
                    for m in t['metrics']}
        for t in eval_config["tasks"]
    }
    # identify unique metrics we wish to report on.
    eval_config_metrics = set(
        itertools.chain.from_iterable([
            metric.keys() for metric in
            [eval_config_tasks[task] for task in eval_config_tasks]
        ]))

    # retrieve the ground truth values from the evaluation config
    # we transpose the info into a set of records indexed by
    # a "task" and "metric". The `dropna()` is necessary to remove extra
    # rows where there is no ground truth value for the "task" and "metric"
    ground_truth_df = pd.DataFrame.from_records(
        eval_config_tasks, index=eval_config_metrics).transpose()
    gt_listing_df = ground_truth_df.reset_index(names="task").melt(
        id_vars="task", var_name="metric",
        value_name="ground_truth").dropna().set_index(["task", "metric"])

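    # Illustrative sketch (hypothetical numbers): for a config that declares
    # task "gsm8k" with metric "exact_match,strict-match" = 0.75,
    # gt_listing_df ends up indexed by ("task", "metric") with a single
    # "ground_truth" column:
    #
    #   task   metric                     ground_truth
    #   gsm8k  exact_match,strict-match           0.75
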
    # the ifeval task requires an additional set of data
    if "leaderboard_ifeval" in [task["name"] for task in eval_config["tasks"]]:
        nltk.download('punkt_tab')

    # Launch eval requests.
    results = launch_lm_eval(eval_config, tp_size=num_gpus_available)

    # process the results into a dataframe that looks like the ground truth
    # with records indexed by "task" and "metric", but with the measured value
    # for each index.
    results_df = pd.DataFrame.from_records(
        results["results"], index=eval_config_metrics).transpose()
    r_listing_df = (results_df.reset_index(names="task").melt(
        id_vars="task", var_name="metric",
        value_name="measured").dropna().set_index(["task", "metric"]))

    # present the results
    # combine the ground truth and results into a single dataframe
    # but eliminate any rows that do not have both values
    # (This could happen if the eval_config includes a measure that's not
    # generated, or if the LM Evaluation harness generates a measure that
    # was not requested by the eval_config.)
    comparing_metrics_df = pd.concat(
        [gt_listing_df, r_listing_df],
        axis="columns").reset_index(names=["task", "metric"]).dropna()

    # Add a column with the relative tolerance level for the task
    task_rtol_map = {
        t["name"]: t.get("rtol", RTOL)
        for t in eval_config["tasks"]
    }
    comparing_metrics_df.loc[:, "rtol"] = comparing_metrics_df.apply(
        lambda metric: task_rtol_map[metric.task], axis=1)

    # and determine if measured is close to ground truth
    comparing_metrics_df.loc[:, "isclose"] = comparing_metrics_df.apply(
        lambda metric: numpy.isclose(
            metric.ground_truth, metric.measured, rtol=metric.rtol),
        axis=1)
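    # Note on tolerance: numpy.isclose(a, b, rtol=r) above passes when
    # abs(a - b) <= atol + r * abs(b) (default atol=1e-8), so the tolerance is
    # relative to the measured value. E.g. a hypothetical ground truth of 0.75
    # with rtol=0.04 accepts measured scores roughly in [0.72, 0.78].
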
    print("==== LM EVAL RESULT ====\n")
    comparing_metrics_df.sort_values(by=["task", "metric"], inplace=True)
    print(comparing_metrics_df.to_markdown(index=False))

    # save the results for later summary
    llm_results_md = Path("llmeval_results-" +
                          TEST_DATA_FILE.replace("/", "-")).with_suffix(".md")
    llm_results_md.write_text(
        f"## {eval_config['model_name']}\n"
        f"{comparing_metrics_df.to_markdown(index=False)}\n")

    # fail if any scores fail to match ground truth.
    assert comparing_metrics_df.loc[:, "isclose"].all()
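
The test above is driven entirely by the model YAML selected via LM_EVAL_TEST_DATA_FILE. As a rough sketch, assuming only the keys the code reads (model_name, optional backend/model_args/limit, num_fewshot, and tasks with per-metric ground-truth values and an optional rtol), the parsed config would look something like the following; the model, task, metric, and score values are hypothetical, not taken from an actual file under neuralmagic/lm-eval-configs/models:

# Hypothetical example of what yaml.safe_load(...) returns for a model config.
example_eval_config = {
    "model_name": "meta-llama/Meta-Llama-3-8B-Instruct",
    "backend": "vllm",           # optional; defaults to "vllm"
    "num_fewshot": 5,
    "limit": 250,                # optional; caps examples per task
    "tasks": [{
        "name": "gsm8k",
        "rtol": 0.05,            # optional; falls back to RTOL (0.040)
        "metrics": [{
            "name": "exact_match,strict-match",
            "value": 0.75,       # ground-truth score to compare against
        }],
    }],
}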
