3 files changed: +12 −6 lines

@@ -308,8 +308,8 @@ def llama_2_7b_model_extra_embeddings(llama_2_7b_engine_extra_embeddings):
            model_runner.model)
 
 
-@pytest.fixture(params=[False, True])
-def run_with_both_engines_lora(request):
+@pytest.fixture(params=[True, False])
+def run_with_both_engines_lora(request, monkeypatch):
     # Automatically runs tests twice, once with V1 and once without
     use_v1 = request.param
     # Tests decorated with `@skip_v1` are only run without v1
@@ -318,8 +318,8 @@ def run_with_both_engines_lora(request):
     if use_v1:
         if skip_v1:
             pytest.skip("Skipping test on vllm V1")
-        with patch('vllm.envs.VLLM_USE_V1', True):
-            yield
+        monkeypatch.setenv('VLLM_USE_V1', '1')
     else:
-        with patch('vllm.envs.VLLM_USE_V1', False):
-            yield
+        monkeypatch.setenv('VLLM_USE_V1', '0')
+
+    yield
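The substantive change in this hunk is swapping unittest.mock.patch of the vllm.envs.VLLM_USE_V1 attribute for monkeypatch.setenv, so the engine selection travels through the process environment rather than a patched module attribute; a plausible motivation is that the environment variable is picked up anywhere the flag is re-read, while the attribute patch only affects the patching process. The flip to params=[True, False] simply makes the V1 run come first. The skip_v1 lookup lives in the elided context lines; the standalone conftest.py sketch below is an assumption-laden reconstruction, not the repository's code, and the get_closest_marker call in particular is a guess at how the marker is detected.

# Sketch only: a minimal conftest.py mirroring the fixture after this change.
import pytest


@pytest.fixture(params=[True, False])
def run_with_both_engines_lora(request, monkeypatch):
    # Runs every dependent test twice: first with V1, then without.
    use_v1 = request.param
    # Assumed marker lookup; the real check is in the elided context lines.
    skip_v1 = request.node.get_closest_marker("skip_v1") is not None

    if use_v1:
        if skip_v1:
            pytest.skip("Skipping test on vllm V1")
        monkeypatch.setenv("VLLM_USE_V1", "1")
    else:
        monkeypatch.setenv("VLLM_USE_V1", "0")

    # monkeypatch undoes setenv at teardown, which is why the two yields
    # inside the old with-blocks collapse into a single trailing yield.
    yield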
@@ -57,6 +57,7 @@ def v1(run_with_both_engines_lora):
     pass
 
 
+@pytest.mark.skip_v1
 @fork_new_process_for_each_test
 def test_chatglm3_lora(chatglm3_lora_files):
     llm = vllm.LLM(MODEL_PATH,
@@ -76,6 +77,7 @@ def test_chatglm3_lora(chatglm3_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
+@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @fork_new_process_for_each_test
 def test_chatglm3_lora_tp4(chatglm3_lora_files):
@@ -97,6 +99,7 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
+@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @fork_new_process_for_each_test
 def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
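The @pytest.mark.skip_v1 markers added above only take effect because this module opts into the engine-switching fixture through the autouse v1 wrapper visible in the hunk headers. A minimal sketch of that wiring, with a hypothetical test body standing in for the real chatglm3 LoRA checks:

# Sketch only: how a test module wires itself to the conftest fixture.
import pytest


@pytest.fixture(autouse=True)
def v1(run_with_both_engines_lora):
    # Opt the whole module into the V1/V0 parametrization from conftest.
    pass


@pytest.mark.skip_v1
def test_lora_sketch():
    # Hypothetical body: on the V1 parametrization the fixture calls
    # pytest.skip() before this runs; on V0 it executes normally.
    assert True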
@@ -38,6 +38,9 @@ def v1(run_with_both_engines_lora):
     pass
 
 
+# Skipping for V1 for now as we are hitting,
+# "Head size 80 is not supported by FlashAttention." error.
+@pytest.mark.skip_v1
 @pytest.mark.parametrize("lora_bias", [True])
 @pytest.mark.parametrize("fully_sharded", [True, False])
 def test_lora_bias(lora_bias_files: str, lora_bias: bool, fully_sharded: bool):
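One practical note, offered as an assumption rather than something shown in this diff: skip_v1 is a custom marker, so pytest will warn (or fail under --strict-markers) unless it is registered somewhere. If the repository does not already do so, a typical conftest.py registration looks like this:

# Sketch only: registering the custom marker used by these tests.
def pytest_configure(config):
    # Declare skip_v1 so @pytest.mark.skip_v1 is not flagged as unknown.
    config.addinivalue_line(
        "markers", "skip_v1: run this test only on the V0 engine")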