Skip to content

Commit 2d40adb

Browse files
authored
[https://nvbugs/5437405][fix] cherry-pick PR 7000 (qwen3 235b eagle3 ci) (#7702)
Signed-off-by: bhsueh <[email protected]>
1 parent 9d719dd commit 2d40adb

File tree

5 files changed

+53
-8
lines changed

5 files changed

+53
-8
lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 48 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2295,11 +2295,12 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
22952295
[
22962296
(8, 1, 8, True, True, True, "CUTLASS", False),
22972297
(8, 1, 8, True, True, True, "TRTLLM", False),
2298-
(8, 1, 8, False, False, False, "TRTLLM", True),
2298+
(8, 1, 8, True, True, True, "TRTLLM", True),
22992299
],
23002300
ids=[
2301-
"latency_moe_cutlass", "latency_moe_trtllm",
2302-
"latency_moe_trtllm_eagle3"
2301+
"latency_moe_cutlass",
2302+
"latency_moe_trtllm",
2303+
"latency_moe_trtllm_eagle3",
23032304
],
23042305
)
23052306
def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
@@ -2339,6 +2340,50 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
23392340
task = GSM8K(self.MODEL_NAME)
23402341
task.evaluate(llm)
23412342

2343+
@skip_pre_blackwell
2344+
@pytest.mark.skip_less_mpi_world_size(4)
2345+
@pytest.mark.parametrize(
2346+
"tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
2347+
[
2348+
(4, 1, 4, False, False, False, "TRTLLM",
2349+
True), # TP8 has bug when we use TRTLLM moe backend and eagle3
2350+
],
2351+
ids=[
2352+
"latency_moe_trtllm_eagle3",
2353+
],
2354+
)
2355+
def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
2356+
cuda_graph, overlap_scheduler, moe_backend, eagle3):
2357+
2358+
pytorch_config = dict(
2359+
disable_overlap_scheduler=not overlap_scheduler,
2360+
cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
2361+
moe_config=MoeConfig(backend=moe_backend))
2362+
2363+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4,
2364+
enable_block_reuse=not eagle3)
2365+
spec_config = None
2366+
if eagle3:
2367+
spec_config = EagleDecodingConfig(
2368+
max_draft_len=2,
2369+
speculative_model_dir=
2370+
f"{llm_models_root()}/Qwen3/qwen3-235B-eagle3/",
2371+
eagle3_one_model=True)
2372+
with LLM(
2373+
f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
2374+
tensor_parallel_size=tp_size,
2375+
pipeline_parallel_size=pp_size,
2376+
moe_expert_parallel_size=ep_size,
2377+
**pytorch_config,
2378+
enable_attention_dp=attention_dp,
2379+
kv_cache_config=kv_cache_config,
2380+
speculative_config=spec_config) as llm:
2381+
2382+
task = MMLU(self.MODEL_NAME)
2383+
task.evaluate(llm)
2384+
task = GSM8K(self.MODEL_NAME)
2385+
task.evaluate(llm)
2386+
23422387

23432388
class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
23442389
MODEL_NAME = "microsoft/Phi-4-mini-instruct"

tests/integration/test_lists/qa/llm_function_full.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -537,7 +537,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
537537
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
538538
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
539539
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
540-
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
540+
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
541541
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
542542
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
543543
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8

tests/integration/test_lists/qa/llm_function_sanity.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
100100
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
101101
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
102102
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
103-
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3]
103+
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3]
104104
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
105105
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
106106
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend

tests/integration/test_lists/test-db/l0_gb200.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,3 +66,4 @@ l0_gb200:
6666
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
6767
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
6868
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
69+
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)

tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ l0_gb200_multi_nodes:
1616
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
1717
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
1818
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
19-
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
20-
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)
21-
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] TIMEOUT (180)
19+
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90)
20+
- accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90)
2221
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] TIMEOUT (180)

0 commit comments

Comments (0)