Commit 6ba336c

add trtllm to global sf path
Signed-off-by: jiahanc <[email protected]>
1 parent d1d0bc5 commit 6ba336c

File tree: 5 files changed, +21 −42 lines

vllm/benchmarks/serve.py

Lines changed: 1 addition & 29 deletions
@@ -94,10 +94,6 @@ class BenchmarkMetrics:
     # Max output tokens per second and concurrent requests at that peak
     max_output_tokens_per_s: float
     max_concurrent_requests: int
-    # TPS per user - average tokens per second per request
-    tps_per_user: float
-    # TPS per GPU - output throughput divided by number of GPUs
-    tps_per_gpu: float


 @dataclass
@@ -292,7 +288,6 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
-    num_gpu: int = 1,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     """Calculate the metrics for the benchmark.
@@ -316,7 +311,6 @@ def calculate_metrics(
     all_tpots: list[float] = []
     ttfts: list[float] = []
     e2els: list[float] = []
-    tps_per_user: list[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_tokens
@@ -344,10 +338,6 @@ def calculate_metrics(
             itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
             e2els.append(outputs[i].latency)
-
-            # Calculate TPS per request: (input_tokens + output_tokens) / latency
-            tps_per_user.append(output_len / (outputs[i].latency))
-
             completed += 1
         else:
             actual_output_lens.append(0)
@@ -486,8 +476,6 @@ def calculate_metrics(
         ],
         max_output_tokens_per_s=max_output_tokens_per_s,
         max_concurrent_requests=max_concurrent_requests,
-        tps_per_user=np.mean(tps_per_user) if tps_per_user else 0.0,
-        tps_per_gpu=sum(actual_output_lens) / dur_s / num_gpu if num_gpu > 0 else 0.0,
     )

     return metrics, actual_output_lens
@@ -520,7 +508,6 @@ async def benchmark(
     ramp_up_start_rps: int | None = None,
     ramp_up_end_rps: int | None = None,
     ready_check_timeout_sec: int = 600,
-    num_gpu: int = 1,
 ):
     try:
         request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -752,7 +739,6 @@ async def limited_request_func(request_func_input, session, pbar):
             tokenizer=tokenizer,
             selected_percentiles=selected_percentiles,
             goodput_config_dict=goodput_config_dict,
-            num_gpu=num_gpu,
         )
     else:
         metrics = calculate_metrics_for_embeddings(
@@ -805,11 +791,6 @@ async def limited_request_func(request_func_input, session, pbar):
             "Total Token throughput (tok/s):", metrics.total_token_throughput
         )
     )
-    if isinstance(metrics, BenchmarkMetrics):
-        print("{:<40} {:<10.2f}".format("TPS per user (tok/s):",
-                                        metrics.tps_per_user))
-        print("{:<40} {:<10.2f}".format("TPS per GPU (tok/s):",
-                                        metrics.tps_per_gpu))

     if isinstance(metrics, BenchmarkMetrics):
         result = {
@@ -830,8 +811,6 @@ async def limited_request_func(request_func_input, session, pbar):
             "errors": [output.error for output in outputs],
             "max_output_tokens_per_s": metrics.max_output_tokens_per_s,
             "max_concurrent_requests": metrics.max_concurrent_requests,
-            "tps_per_user": metrics.tps_per_user,
-            "tps_per_gpu": metrics.tps_per_gpu,
         }
     else:
         result = {
@@ -1157,7 +1136,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--percentile-metrics",
         type=str,
-        default="ttft,tpot,itl,e2el",
+        default=None,
         help="Comma-separated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
         'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
@@ -1304,12 +1283,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "in seconds (default: 600 seconds / 10 minutes). If set to 0, "
         "the ready check will be skipped.",
     )
-    parser.add_argument(
-        "--num-gpu",
-        type=int,
-        default=1,
-        help="Number of GPUs used for serving. Used to calculate TPS per GPU metric."
-    )

     parser.add_argument(
         "--extra-body",
@@ -1470,7 +1443,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         ramp_up_start_rps=args.ramp_up_start_rps,
         ramp_up_end_rps=args.ramp_up_end_rps,
         ready_check_timeout_sec=args.ready_check_timeout_sec,
-        num_gpu=args.num_gpu,
     )

     # Save config and results to json
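Note on the removed metrics: both were simple derivations over per-request data the benchmark still records, so they can be recomputed offline if needed. A minimal sketch using the same formulas as the deleted lines above (output_lens, latencies, dur_s, and num_gpu are illustrative stand-ins for the corresponding values inside calculate_metrics; a zero-latency guard is added here that the original did not have):

import numpy as np

def removed_tps_metrics(output_lens: list[int], latencies: list[float],
                        dur_s: float, num_gpu: int = 1) -> tuple[float, float]:
    # TPS per user: mean over requests of output tokens / end-to-end latency,
    # mirroring the deleted tps_per_user accumulation.
    per_request = [n / lat for n, lat in zip(output_lens, latencies) if lat > 0]
    tps_per_user = float(np.mean(per_request)) if per_request else 0.0
    # TPS per GPU: total output tokens / benchmark duration / GPU count,
    # matching the deleted tps_per_gpu expression.
    tps_per_gpu = sum(output_lens) / dur_s / num_gpu if num_gpu > 0 else 0.0
    return tps_per_user, tps_per_gpu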

vllm/entrypoints/openai/serving_chat.py

Lines changed: 4 additions & 4 deletions
@@ -295,10 +295,10 @@ async def create_chat_completion(
                 self.model_config.logits_processor_pattern,
                 self.default_sampling_params,
             )
-            # validate_logits_processors_parameters(
-            #     self.logits_processors,
-            #     sampling_params,
-            # )
+            validate_logits_processors_parameters(
+                self.logits_processors,
+                sampling_params,
+            )

             self._log_inputs(
                 request_id,

vllm/entrypoints/openai/serving_completion.py

Lines changed: 4 additions & 4 deletions
@@ -186,10 +186,10 @@ async def create_completion(
                 self.model_config.logits_processor_pattern,
                 self.default_sampling_params,
             )
-            # validate_logits_processors_parameters(
-            #     self.logits_processors,
-            #     sampling_params,
-            # )
+            validate_logits_processors_parameters(
+                self.logits_processors,
+                sampling_params,
+            )

             request_id_item = f"{request_id}-{i}"
vllm/model_executor/layers/quantization/modelopt.py

Lines changed: 8 additions & 4 deletions
@@ -939,9 +939,9 @@ def __init__(self, quant_config: ModelOptNvFp4Config) -> None:

         self.backend = "none"
         if envs.VLLM_NVFP4_GEMM_BACKEND is None:
-            # if has_flashinfer():
-            #     self.backend = "flashinfer-cutlass"
-            if cutlass_fp4_supported():
+            if has_flashinfer():
+                self.backend = "flashinfer-cutlass"
+            elif cutlass_fp4_supported():
                 self.backend = "cutlass"
             elif is_fp4_marlin_supported():
                 self.backend = "marlin"
@@ -1654,7 +1654,11 @@ def apply(
         routing_method_type = layer.routing_method_type
         if use_llama4_routing:
             routing_method_type = RoutingMethodType.Llama4
-        router_logits = router_logits.to(torch.float32) if routing_method_type == RoutingMethodType.DeepSeekV3 else router_logits
+        router_logits = (
+            router_logits.to(torch.float32)
+            if routing_method_type == RoutingMethodType.DeepSeekV3
+            else router_logits
+        )
         routing_bias = e_score_correction_bias
         if routing_bias is not None:
             routing_bias = routing_bias.to(torch.bfloat16)
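With the commented-out branch restored, default NVFP4 GEMM backend selection (VLLM_NVFP4_GEMM_BACKEND unset) prefers FlashInfer CUTLASS again, then native CUTLASS, then Marlin. A minimal sketch of the equivalent priority order, with the capability probes reduced to plain booleans (in the real code they are has_flashinfer(), cutlass_fp4_supported(), and is_fp4_marlin_supported(), as in the hunk above):

def pick_nvfp4_backend(flashinfer_ok: bool, cutlass_ok: bool, marlin_ok: bool) -> str:
    # Same priority order as the restored if/elif chain in __init__.
    if flashinfer_ok:
        return "flashinfer-cutlass"
    if cutlass_ok:
        return "cutlass"
    if marlin_ok:
        return "marlin"
    return "none"

# A host with FlashInfer installed takes the FlashInfer path even when the
# native CUTLASS kernel is also available.
assert pick_nvfp4_backend(True, True, False) == "flashinfer-cutlass"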

vllm/model_executor/layers/quantization/utils/flashinfer_utils.py

Lines changed: 4 additions & 1 deletion
@@ -291,5 +291,8 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend:

 def is_flashinfer_supporting_global_sf(backend: FlashinferMoeBackend | None) -> bool:
     # TODO(shuw@nvidia): Update when new backends are added.
-    backends_supporting_global_sf = (FlashinferMoeBackend.CUTLASS,)
+    backends_supporting_global_sf = (
+        FlashinferMoeBackend.CUTLASS,
+        FlashinferMoeBackend.TENSORRT_LLM,
+    )
     return backend in backends_supporting_global_sf
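This hunk is what the commit title refers to: the TensorRT-LLM FlashInfer MoE backend now takes the global scale-factor path alongside CUTLASS. A small usage sketch of the updated helper (assuming the module is imported by the path shown above):

from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
    FlashinferMoeBackend,
    is_flashinfer_supporting_global_sf,
)

# After this change both CUTLASS and TENSORRT_LLM report global-sf support;
# passing None (no FlashInfer MoE backend selected) still returns False.
assert is_flashinfer_supporting_global_sf(FlashinferMoeBackend.CUTLASS)
assert is_flashinfer_supporting_global_sf(FlashinferMoeBackend.TENSORRT_LLM)
assert not is_flashinfer_supporting_global_sf(None)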
