Commit b1d2f7d

Tests for text gen output text (huggingface#1411)

vidyasiv authored and Liangyx2 committed
1 parent 497c9fa · commit b1d2f7d

4 files changed: 172 additions & 51 deletions

Files changed:
- Makefile
- examples/text-generation/run_generation.py
- tests/test_functional_text_generation_example.py (new)
- tests/test_text_generation_example.py

Makefile

Lines changed: 2 additions & 0 deletions

@@ -35,6 +35,8 @@ style: clean
 fast_tests:
 	python -m pip install .[tests]
 	python -m pytest tests/test_gaudi_configuration.py tests/test_trainer_distributed.py tests/test_trainer.py tests/test_trainer_seq2seq.py
+	# TODO enable when CI has more servers
+	# python -m pytest test_functional_text_generation_example.py

 # Run unit and integration tests related to Diffusers
 fast_tests_diffusers:

examples/text-generation/run_generation.py

Lines changed: 6 additions & 1 deletion

@@ -561,12 +561,16 @@ def rounder(x):

         print()
         print("Input/outputs:")
+        all_inputs = []
+        all_outputs = []
         for i, input_sentence in enumerate(zip(input_sentences)):
             print(f"input {i+1}: {input_sentence}")
+            all_inputs.append(input_sentence)
             for j, output in enumerate(
                 zip(generated[args.num_return_sequences * i : args.num_return_sequences * (i + 1)])
             ):
                 print(f"output {j+1}: {output}")
+                all_outputs.append(output)
             print()

         # Store results if necessary

@@ -576,7 +580,8 @@ def rounder(x):

         results = {
             "throughput": throughput,
-            "output": output,
+            "input": all_inputs,
+            "output": all_outputs,
         }
         with (output_dir / "results.json").open("w", encoding="utf-8") as f:
             json.dump(results, f, ensure_ascii=False, indent=4)
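With this change, results.json records every prompt and every generation instead of only the last output tuple from the loop. A minimal consumer sketch, assuming a results file written by a run with --output_dir out (the path is illustrative; the key layout matches the diff above):

import json
from pathlib import Path

results = json.loads((Path("out") / "results.json").read_text(encoding="utf-8"))
print(f"throughput: {results['throughput']}")
# Each entry of "output" comes from the zip() in the loop above, so it is a
# one-element tuple (serialized by json.dump as a one-element list); the
# generated text itself sits at index [i][0].
for i, generations in enumerate(results["output"]):
    print(f"output {i + 1}: {generations[0]}")

That nesting is why the tests below assert against results["output"][0][0].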
tests/test_functional_text_generation_example.py

Lines changed: 75 additions & 0 deletions

@@ -0,0 +1,75 @@
+import json
+import os
+import re
+import subprocess
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import pytest
+
+from optimum.habana.utils import set_seed
+
+
+if os.environ.get("GAUDI2_CI", "0") == "1":
+    MODEL_OUTPUTS = {
+        "bigcode/starcoder": 'def print_hello_world():\n    print("Hello World")\n\ndef print_hello_world_twice():\n    print_hello_world()\n    print_hello_world()\n\ndef print_hello_world_thrice():\n    print_hello_world()\n    print_hello_world()\n    print_hello_world()\n\ndef print_hello_world_four_times():\n    print_hello_world()\n    print_hello_world()\n    print_hello_world()\n    ',
+        "bigcode/starcoder2-3b": 'def print_hello_world():\n    print("Hello World")\n\ndef print_hello_world_with_name(name):\n    print("Hello World, " + name)\n\ndef print_hello_world_with_name_and_age(name, age):\n    print("Hello World, " + name + ", " + str(age))\n\ndef print_hello_world_with_name_and_age_and_gender(name, age, gender):\n    print("Hello',
+        "google/gemma-7b": "DeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and it is compatible with the existing PyTorch ecosystem. DeepSpeed is designed to be easy to use, and it provides a number of features that make it easy to train large-scale models.\n\nDeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and",
+        "meta-llama/Llama-2-7b-hf": "DeepSpeed is a machine learning framework for deep learning. It is designed to be fast and efficient, while also being easy to use. DeepSpeed is based on the TensorFlow framework, and it uses the TensorFlow library to perform computations.\nDeepSpeed is a deep learning framework that is designed to be fast and efficient. It is based on the TensorFlow library and uses the TensorFlow library to perform computations. DeepSpeed is designed to be easy to use and to provide a high level of flex",
+        "mistralai/Mistral-7B-v0.1": "DeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system.\n\nDeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system",
+        "mistralai/Mixtral-8x7B-v0.1": "DeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## Introduction\n\nDeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## What is DeepSpeed",
+        "Qwen/Qwen2-7B": "DeepSpeed is a machine learning framework that provides a unified interface for training deep learning models. It is designed to be easy to use and to provide high performance on a variety of hardware platforms. DeepSpeed is built on top of PyTorch and TensorFlow, and it supports a wide range of models architectures, including transformer models, convolutional neural networks, and recurrent neural networks.\nDeepSpeed is designed to be easy to use, and it provides a unified interface for training deep learning models. It supports a wide range of model architectures, including",
+    }
+else:
+    # Functional testing only on Gaudi2 onwards
+    MODEL_OUTPUTS = {}
+
+
+def _test_text_generation(
+    model_name: str,
+    token: str,
+):
+    set_seed(42)
+    command = ["python3"]
+    path_to_example_dir = Path(__file__).resolve().parent.parent / "examples"
+    env_variables = os.environ.copy()
+
+    command += [
+        f"{path_to_example_dir}/text-generation/run_generation.py",
+        f"--model_name_or_path {model_name}",
+        "--use_kv_cache",
+        "--use_hpu_graphs",
+        "--bf16",
+    ]
+
+    with TemporaryDirectory() as tmp_dir:
+        command.append(f"--output_dir {tmp_dir}")
+        command.append(f"--token {token.value}")
+
+        pattern = re.compile(r"([\"\'].+?[\"\'])|\s")
+
+        command = [x for y in command for x in re.split(pattern, y) if x]
+        if "starcoder" in model_name:
+            command.append("--prompt")
+            command.append("def print_hello_world():")
+        print(f"\n\nCommand to test: {' '.join(command)}\n")
+        proc = subprocess.run(command, env=env_variables)
+
+        # Ensure the run finished without any issue
+        # Use try-except to avoid logging the token if used
+        try:
+            assert proc.returncode == 0
+        except AssertionError as e:
+            if "'--token', 'hf_" in e.args[0]:
+                e.args = (f"The following command failed:\n{' '.join(command[:-2])}",)
+            raise
+
+        with open(Path(tmp_dir) / "results.json") as fp:
+            results = json.load(fp)
+
+        assert results["output"][0][0] == MODEL_OUTPUTS[model_name]
+
+
+@pytest.mark.parametrize("model_name", MODEL_OUTPUTS.keys())
+def test_text_generation_bf16_1x(model_name: str, token: str):
+    _test_text_generation(model_name, token)
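A note on the command handling in this new test: arguments are first assembled as "--flag value" strings, then flattened into argv tokens with a regex that splits on whitespace while keeping quoted substrings intact. A standalone sketch of that trick (the argument values here are made up):

import re

# Group 1 captures a quoted span; otherwise the pattern matches whitespace.
pattern = re.compile(r"([\"\'].+?[\"\'])|\s")
raw = ["--model_name_or_path gpt2", "--prompt 'def f():'"]
# re.split() emits the capture group for quoted matches and None for plain
# whitespace matches; the "if x" filter drops the Nones and empty strings.
flat = [x for y in raw for x in re.split(pattern, y) if x]
print(flat)  # ['--model_name_or_path', 'gpt2', '--prompt', "'def f():'"]

This is also why the starcoder prompt is appended only after the split: "def print_hello_world():" contains a space and must remain a single argv element.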

tests/test_text_generation_example.py

Lines changed: 89 additions & 50 deletions

@@ -9,6 +9,8 @@

 import pytest

+from optimum.habana.utils import set_seed
+
 from .test_examples import TIME_PERF_FACTOR


@@ -19,34 +21,40 @@
     # Gaudi2 CI baselines
     MODELS_TO_TEST = {
         "bf16_1x": [
-            ("bigscience/bloomz-7b1", 1, False, 130.0472971205316),
-            ("gpt2-xl", 1, False, 281.8734689674413),
-            ("EleutherAI/gpt-j-6b", 1, False, 160.5823842101192),
-            ("EleutherAI/gpt-neox-20b", 1, False, 50.67672679310354),
-            ("meta-llama/Llama-2-7b-hf", 1, True, 141.25776956002076),
-            ("tiiuae/falcon-40b", 1, True, 25.202450111088346),
-            ("bigcode/starcoder", 256, True, 6846.575763562658),
-            ("Salesforce/codegen2-1B", 1, False, 446.4029486883532),
-            ("mosaicml/mpt-30b", 1, False, 36.06464336116623),
-            ("mistralai/Mistral-7B-v0.1", 1, True, 130.2172236767782),
-            ("mistralai/Mixtral-8x7B-v0.1", 1, False, 23.7931001677926),
-            ("microsoft/phi-2", 1, False, 224.72307766211117),
-            ("meta-llama/Meta-Llama-3-8B", 1, True, 129),
-            ("meta-llama/Llama-2-7b-hf", 512, True, 12808),
-            ("meta-llama/Llama-2-7b-hf", 512, False, 8711),  # in some cases like TGI, reuse_cache isn't used
-            ("stabilityai/stablelm-2-12b", 1, False, 74.8904496532218),
-            ("codellama/CodeLlama-34b-hf", 1, True, 32.644),
-            ("bigcode/starcoder2-3b", 1, False, 261.07213776344133),
-            ("adept/persimmon-8b-base", 4, False, 366.73968820698406),
-            ("Qwen/Qwen1.5-7B", 4, False, 490.8621617893209),
-            ("google/gemma-7b", 1, False, 109.70751574382221),
-            ("state-spaces/mamba-130m-hf", 1536, False, 5385.511100161605),
-            ("Deci/DeciLM-7B", 1, False, 120),
-            ("Qwen/Qwen2-7B", 512, False, 9669.45787),
-            ("Qwen/Qwen1.5-MoE-A2.7B", 1, True, 44.25834541569395),
-            ("EleutherAI/gpt-neo-2.7B", 1, False, 257.2476416844122),
-            ("facebook/xglm-1.7B", 1, False, 357.46365062825083),
-            ("CohereForAI/c4ai-command-r-v01", 1, False, 29.50315234651154),
+            ("bigscience/bloomz-7b1", 1, False, 130.0472971205316, False),
+            ("gpt2-xl", 1, False, 281.8734689674413, False),
+            ("EleutherAI/gpt-j-6b", 1, False, 160.5823842101192, False),
+            ("EleutherAI/gpt-neox-20b", 1, False, 50.67672679310354, False),
+            ("meta-llama/Llama-2-7b-hf", 1, True, 141.25776956002076, True),
+            ("tiiuae/falcon-40b", 1, True, 25.202450111088346, False),
+            (
+                "bigcode/starcoder",
+                256,
+                True,
+                6846.575763562658,
+                False,
+            ),  # TODO: Enable check_output after model bigcode/starcoder is fixed
+            ("Salesforce/codegen2-1B", 1, False, 446.4029486883532, False),
+            ("mosaicml/mpt-30b", 1, False, 36.06464336116623, False),
+            ("mistralai/Mistral-7B-v0.1", 1, True, 130.2172236767782, True),
+            ("mistralai/Mixtral-8x7B-v0.1", 1, False, 23.7931001677926, True),
+            ("microsoft/phi-2", 1, False, 224.72307766211117, False),
+            ("meta-llama/Meta-Llama-3-8B", 1, True, 129, False),
+            ("meta-llama/Llama-2-7b-hf", 512, True, 12808, False),
+            ("meta-llama/Llama-2-7b-hf", 512, False, 8711, False),  # in some cases like TGI, reuse_cache isn't used
+            ("stabilityai/stablelm-2-12b", 1, False, 74.8904496532218, False),
+            ("codellama/CodeLlama-34b-hf", 1, True, 32.644, False),
+            ("bigcode/starcoder2-3b", 1, False, 261.07213776344133, True),
+            ("adept/persimmon-8b-base", 4, False, 366.73968820698406, False),
+            ("Qwen/Qwen1.5-7B", 4, False, 490.8621617893209, False),
+            ("google/gemma-7b", 1, False, 109.70751574382221, True),
+            ("state-spaces/mamba-130m-hf", 1536, False, 5385.511100161605, False),
+            ("Deci/DeciLM-7B", 1, False, 120, False),
+            ("Qwen/Qwen2-7B", 512, False, 9669.45787, True),
+            ("Qwen/Qwen1.5-MoE-A2.7B", 1, True, 44.25834541569395, False),
+            ("EleutherAI/gpt-neo-2.7B", 1, False, 257.2476416844122, False),
+            ("facebook/xglm-1.7B", 1, False, 357.46365062825083, False),
+            ("CohereForAI/c4ai-command-r-v01", 1, False, 29.50315234651154, False),
         ],
         "fp8": [
             ("tiiuae/falcon-180B", 4, 950, True, 128, 128, 2506.68),

@@ -91,41 +99,51 @@
             ("gpt2-xl", 1, False, 51.61471298016438),
         ],
     }
+    MODEL_OUTPUTS = {
+        "bigcode/starcoder": 'def print_hello_world():\n    print("Hello World")\n\ndef print_hello_world_twice():\n    print_hello_world()\n    print_hello_world()\n\ndef print_hello_world_thrice():\n    print_hello_world()\n    print_hello_world()\n    print_hello_world()\n\ndef print_hello_world_four_times():\n    print_hello_world()\n    print_hello_world()\n    print_hello_world()\n    ',
+        "bigcode/starcoder2-3b": 'def print_hello_world():\n    print("Hello World")\n\ndef print_hello_world_with_name(name):\n    print("Hello World, " + name)\n\ndef print_hello_world_with_name_and_age(name, age):\n    print("Hello World, " + name + ", " + str(age))\n\ndef print_hello_world_with_name_and_age_and_gender(name, age, gender):\n    print("Hello',
+        "google/gemma-7b": "DeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and it is compatible with the existing PyTorch ecosystem. DeepSpeed is designed to be easy to use, and it provides a number of features that make it easy to train large-scale models.\n\nDeepSpeed is a machine learning framework that enables training of large-scale models on commodity hardware. It is designed to be a drop-in replacement for PyTorch, and",
+        "meta-llama/Llama-2-7b-hf": "DeepSpeed is a machine learning framework for deep learning. It is designed to be fast and efficient, while also being easy to use. DeepSpeed is based on the TensorFlow framework, and it uses the TensorFlow library to perform computations.\nDeepSpeed is a deep learning framework that is designed to be fast and efficient. It is based on the TensorFlow library and uses the TensorFlow library to perform computations. DeepSpeed is designed to be easy to use and to provide a high level of flex",
+        "mistralai/Mistral-7B-v0.1": "DeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system.\n\nDeepSpeed is a machine learning framework that accelerates training of large models on a single machine or distributed systems. It is designed to be compatible with PyTorch and TensorFlow, and can be used to train models on a single machine or on a distributed system",
+        "mistralai/Mixtral-8x7B-v0.1": "DeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## Introduction\n\nDeepSpeed is a machine learning framework that enables training of large models on a single machine with a single GPU. It is designed to be easy to use and efficient, and it can be used to train models on a variety of tasks.\n\n## What is DeepSpeed",
+        "Qwen/Qwen2-7B": "DeepSpeed is a machine learning framework that provides a suite of toolskits for building and training deep learning models. It is designed to be highly scalable and efficient, and it supports a wide range of deep learning frameworks, including PyTorch, TensorFlow, and MXNet. DeepSpeed is particularly well-suited for training large-scale models on distributed systems, and it provides a number of features that make it easy to use and configure. Some of the key features of DeepSpeed include:\n\n- Distributed training: DeepSpeed supports distributed training on multiple",
+    }
 else:
     # Gaudi1 CI baselines
     MODELS_TO_TEST = {
         "bf16_1x": [
-            ("bigscience/bloomz-7b1", 1, False, 41.7555095197846),
-            ("gpt2-xl", 1, False, 142.11481820425706),
+            ("bigscience/bloomz-7b1", 1, False, 41.7555095197846, False),
+            ("gpt2-xl", 1, False, 142.11481820425706, False),
             # TODO: fix OPT 6.7B
             # ("facebook/opt-6.7b", 0.0),
-            ("EleutherAI/gpt-j-6b", 1, True, 156.2893125740893),
-            ("meta-llama/Llama-2-7b-hf", 1, True, 44.39616259946937),
-            ("tiiuae/falcon-7b", 1, True, 44.82870145718665),
-            ("bigcode/starcoder", 1, False, 15.945023767901013),
-            ("Salesforce/codegen2-1B", 1, False, 155.32071248826423),
-            ("mosaicml/mpt-7b", 1, False, 45.45168927038262),
-            ("mistralai/Mistral-7B-v0.1", 1, True, 41.21906841459711),
-            ("microsoft/phi-2", 1, False, 92.53083167241344),
-            ("google/gemma-7b", 1, False, 28.84284625836978),
-            ("stabilityai/stablelm-2-12b", 1, False, 26.80858949645992),
-            ("Qwen/Qwen1.5-7B", 1, False, 39.29068423087616),
-            ("adept/persimmon-8b-base", 1, False, 34.53559807384106),
-            ("bigcode/starcoder2-3b", 1, False, 82.09655684566117),
-            ("state-spaces/mamba-130m-hf", 224, False, 794.542),
+            ("EleutherAI/gpt-j-6b", 1, True, 156.2893125740893, False),
+            ("meta-llama/Llama-2-7b-hf", 1, True, 44.39616259946937, False),
+            ("tiiuae/falcon-7b", 1, True, 44.82870145718665, False),
+            ("bigcode/starcoder", 1, False, 15.945023767901013, False),
+            ("Salesforce/codegen2-1B", 1, False, 155.32071248826423, False),
+            ("mosaicml/mpt-7b", 1, False, 45.45168927038262, False),
+            ("mistralai/Mistral-7B-v0.1", 1, True, 41.21906841459711, False),
+            ("microsoft/phi-2", 1, False, 92.53083167241344, False),
+            ("google/gemma-7b", 1, False, 28.84284625836978, False),
+            ("stabilityai/stablelm-2-12b", 1, False, 26.80858949645992, False),
+            ("Qwen/Qwen1.5-7B", 1, False, 39.29068423087616, False),
+            ("adept/persimmon-8b-base", 1, False, 34.53559807384106, False),
+            ("bigcode/starcoder2-3b", 1, False, 82.09655684566117, False),
+            ("state-spaces/mamba-130m-hf", 224, False, 794.542, False),
         ],
         "fp8": [],
         "load_quantized_model_with_autogptq": [],
         "deepspeed": [
-            ("bigscience/bloomz-7b1", 8, 1, 31.994268212011505),
+            ("bigscience/bloomz-7b1", 8, 1, 31.994268212011505, False),
         ],
         "torch_compile": [],
         "torch_compile_distributed": [],
         "distributed_tp": [],
         "contrastive_search": [
-            ("gpt2-xl", 1, False, 34.48141280163397),
+            ("gpt2-xl", 1, False, 34.48141280163397, False),
         ],
     }
+    MODEL_OUTPUTS = {}


 def _test_text_generation(

@@ -143,6 +161,7 @@ def _test_text_generation(
     max_output_tokens: int = 100,
     parallel_strategy: str = None,
     contrastive_search: bool = False,
+    check_output: bool = False,
 ):
     command = ["python3"]
     path_to_example_dir = Path(__file__).resolve().parent.parent / "examples"

@@ -293,7 +312,13 @@ def _test_text_generation(
         )

         command = [x for y in command for x in re.split(pattern, y) if x]
-        print(f"\n\nCommand to test: {' '.join(command[:-2])}\n")
+        if "starcoder" in model_name and check_output:
+            command.append("--prompt")
+            command.append("def print_hello_world():")
+
+        set_seed(42)
+
+        print(f"\n\nCommand to test: {' '.join(command)}\n")
         proc = subprocess.run(command, env=env_variables)

         # Ensure the run finished without any issue

@@ -311,10 +336,24 @@
         # Ensure performance requirements (throughput) are met
         assert results["throughput"] >= (2 - TIME_PERF_FACTOR) * baseline

+        # Verify output for 1 HPU, BF16
+        if check_output and model_name in MODEL_OUTPUTS:
+            expected_output = MODEL_OUTPUTS[model_name]
+            assert results["output"][0][0] == expected_output
+

-@pytest.mark.parametrize("model_name, batch_size, reuse_cache, baseline", MODELS_TO_TEST["bf16_1x"])
-def test_text_generation_bf16_1x(model_name: str, baseline: float, batch_size: int, reuse_cache: bool, token: str):
-    _test_text_generation(model_name, baseline, token, batch_size, reuse_cache)
+@pytest.mark.parametrize("model_name, batch_size, reuse_cache, baseline, check_output", MODELS_TO_TEST["bf16_1x"])
+def test_text_generation_bf16_1x(
+    model_name: str, baseline: float, batch_size: int, reuse_cache: bool, token: str, check_output: bool
+):
+    _test_text_generation(
+        model_name=model_name,
+        baseline=baseline,
+        token=token,
+        batch_size=batch_size,
+        reuse_cache=reuse_cache,
+        check_output=check_output,
+    )


 @pytest.mark.parametrize(
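Each bf16_1x tuple now carries a fifth check_output flag, and the parametrized test forwards it to _test_text_generation, which compares results["output"][0][0] against MODEL_OUTPUTS only when the flag is set and a golden string exists for the model. To iterate on a single case locally, one hypothetical invocation (the -k substring is just an example) is:

import subprocess

# pytest derives test ids from the parametrize tuples, so -k can select a
# single model from MODELS_TO_TEST["bf16_1x"] by substring.
subprocess.run(
    [
        "python",
        "-m",
        "pytest",
        "tests/test_text_generation_example.py::test_text_generation_bf16_1x",
        "-k",
        "starcoder2",
        "-v",
    ],
    check=True,
)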
