1+ import dataclasses
12from typing import Dict , List , Optional
23
34import pytest
89from ..utils import compare_all_settings
910
1011
@dataclasses.dataclass
class TestSetting:
    """One model/launch configuration exercised by the compile correctness test.

    Instances are passed to ``test_compile_correctness`` through
    ``pytest.mark.parametrize``; the test unpacks every field.
    """

    # The class name starts with "Test", so pytest would try to collect it as
    # a test class and warn about its generated ``__init__``. Opt out
    # explicitly. The attribute is unannotated, so it is NOT a dataclass field.
    __test__ = False

    # model identifier, passed as the first argument of compare_all_settings
    model: str
    # extra command-line arguments spliced into the launch arguments
    model_args: List[str]
    # value passed after the "-pp" flag
    pp_size: int
    # value passed after the "-tp" flag
    tp_size: int
    # value written to the VLLM_ATTENTION_BACKEND environment variable
    attn_backend: str
    # comparison method name forwarded to compare_all_settings
    method: str
    # NOTE(review): presumably controls whether dynamo full-graph capture is
    # required for this model -- confirm against the test body.
    fullgraph: bool
21+
22+
# A hand-picked set of configurations, one per model family, used to
# parametrize the compile correctness test.
test_settings = [
    # plain Llama model, pp_size=2 and tp_size=2
    TestSetting(
        model="meta-llama/Llama-3.2-1B",
        model_args=[],
        pp_size=2,
        tp_size=2,
        attn_backend="FLASHINFER",
        method="generate",
        fullgraph=True,
    ),
    # GPTQ-quantized Llama model
    TestSetting(
        model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
        model_args=["--quantization", "gptq"],
        pp_size=1,
        tp_size=1,
        attn_backend="FLASH_ATTN",
        method="generate",
        fullgraph=True,
    ),
    # mixture-of-experts (MoE) model
    TestSetting(
        model="ibm/PowerMoE-3b",
        model_args=[],
        pp_size=1,
        tp_size=2,
        attn_backend="FLASH_ATTN",
        method="generate",
        fullgraph=True,
    ),
    # embedding model, compared through the "encode" method
    TestSetting(
        model="BAAI/bge-multilingual-gemma2",
        model_args=["--task", "embedding"],
        pp_size=1,
        tp_size=1,
        attn_backend="FLASHINFER",
        method="encode",
        fullgraph=True,
    ),
    # vision-language model; the only entry with fullgraph=False
    TestSetting(
        model="microsoft/Phi-3.5-vision-instruct",
        model_args=["--trust-remote-code", "--max-model-len", "2048"],
        pp_size=2,
        tp_size=1,
        attn_backend="FLASH_ATTN",
        method="generate_with_image",
        fullgraph=False,
    ),
]
76+
77+
1178# we cannot afford testing the full Cartesian product
1279# of all models and all levels
13- @pytest .mark .parametrize (
14- "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph" ,
15- [
16- ("meta-llama/Llama-3.2-1B" , [], 2 , 2 , "FLASHINFER" , "generate" , True ),
17- ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples" ,
18- ["--quantization" , "compressed-tensors"
19- ], 1 , 1 , "FLASH_ATTN" , "generate" , True ),
20- ("ibm/PowerMoE-3b" , [], 1 , 2 , "FLASH_ATTN" , "generate" , True ),
21- # TODO: add multi-modality test for llava
22- ("llava-hf/llava-1.5-7b-hf" , [], 2 , 1 , "FLASHINFER" , "generate" , False )
23- ])
24- def test_compile_correctness (model , model_args , pp_size , tp_size , attn_backend ,
25- method , fullgraph ):
80+ @pytest .mark .parametrize ("test_setting" , test_settings )
81+ def test_compile_correctness (test_setting : TestSetting ):
2682 # this test is run under multiple suites, with different GPUs.
2783 # make sure we only run the test with correct CUDA devices.
2884 # don't use "<", as it will duplicate the tests.
85+ model = test_setting .model
86+ model_args = test_setting .model_args
87+ pp_size = test_setting .pp_size
88+ tp_size = test_setting .tp_size
89+ attn_backend = test_setting .attn_backend
90+ method = test_setting .method
91+ fullgraph = test_setting .fullgraph
2992 if cuda_device_count_stateless () != pp_size * tp_size :
3093 pytest .skip ("Not correct CUDA devices for the test." )
3194 import os
3295 os .environ ["VLLM_ATTENTION_BACKEND" ] = attn_backend
33- all_args = [["--enforce-eager" ] + model_args + ["-pp" , str (pp_size )] +
34- ["-tp" , str (tp_size )]] * 3
35- # don't test VLLM_TORCH_COMPILE_LEVEL == 3 case
36- # inductor will change the output, so we cannot compare them.
96+ final_args = ["--enforce-eager" ] + model_args + ["-pp" , str (pp_size )] + \
97+ ["-tp" , str (tp_size )]
98+
3799 all_envs : List [Optional [Dict [str , str ]]] = []
100+
101+ for level in [
102+ CompilationLevel .NO_COMPILATION ,
103+ CompilationLevel .PIECEWISE ,
104+ ]:
105+ all_envs .append ({"VLLM_TORCH_COMPILE_LEVEL" : str (level )})
106+
107+ # inductor will change the output, so we only compare if the output
108+ # is close, not exactly the same.
109+ compare_all_settings (
110+ model , [final_args ] * 2 ,
111+ all_envs ,
112+ method = method if method != "generate" else "generate_close" )
113+ all_envs .clear ()
114+
38115 for level in [
39116 CompilationLevel .NO_COMPILATION ,
40117 CompilationLevel .DYNAMO_AS_IS ,
@@ -46,4 +123,4 @@ def test_compile_correctness(model, model_args, pp_size, tp_size, attn_backend,
46123 all_envs [- 1 ][
47124 "VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE" ] = "0" # type: ignore
48125
49- compare_all_settings (model , all_args , all_envs , method = method )
126+ compare_all_settings (model , [ final_args ] * 3 , all_envs , method = method )
0 commit comments