Commit 6ba336c

add trtllm to global sf path
Signed-off-by: jiahanc <[email protected]>
1 parent d1d0bc5 commit 6ba336c

File tree: 5 files changed, +21 −42 lines

vllm/benchmarks/serve.py

Lines changed: 1 addition & 29 deletions
@@ -94,10 +94,6 @@ class BenchmarkMetrics:
     # Max output tokens per second and concurrent requests at that peak
     max_output_tokens_per_s: float
     max_concurrent_requests: int
-    # TPS per user - average tokens per second per request
-    tps_per_user: float
-    # TPS per GPU - output throughput divided by number of GPUs
-    tps_per_gpu: float


 @dataclass
@@ -292,7 +288,6 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
-    num_gpu: int = 1,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     """Calculate the metrics for the benchmark.
@@ -316,7 +311,6 @@ def calculate_metrics(
     all_tpots: list[float] = []
     ttfts: list[float] = []
     e2els: list[float] = []
-    tps_per_user: list[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_tokens
@@ -344,10 +338,6 @@ def calculate_metrics(
             itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
             e2els.append(outputs[i].latency)
-
-            # Calculate TPS per request: (input_tokens + output_tokens) / latency
-            tps_per_user.append(output_len / (outputs[i].latency))
-
             completed += 1
         else:
             actual_output_lens.append(0)
@@ -486,8 +476,6 @@ def calculate_metrics(
         ],
         max_output_tokens_per_s=max_output_tokens_per_s,
         max_concurrent_requests=max_concurrent_requests,
-        tps_per_user=np.mean(tps_per_user) if tps_per_user else 0.0,
-        tps_per_gpu=sum(actual_output_lens) / dur_s / num_gpu if num_gpu > 0 else 0.0,
     )

     return metrics, actual_output_lens
@@ -520,7 +508,6 @@ async def benchmark(
     ramp_up_start_rps: int | None = None,
     ramp_up_end_rps: int | None = None,
     ready_check_timeout_sec: int = 600,
-    num_gpu: int = 1,
 ):
     try:
         request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -752,7 +739,6 @@ async def limited_request_func(request_func_input, session, pbar):
             tokenizer=tokenizer,
             selected_percentiles=selected_percentiles,
             goodput_config_dict=goodput_config_dict,
-            num_gpu=num_gpu,
         )
     else:
         metrics = calculate_metrics_for_embeddings(
@@ -805,11 +791,6 @@ async def limited_request_func(request_func_input, session, pbar):
             "Total Token throughput (tok/s):", metrics.total_token_throughput
         )
     )
-    if isinstance(metrics, BenchmarkMetrics):
-        print("{:<40} {:<10.2f}".format("TPS per user (tok/s):",
-                                        metrics.tps_per_user))
-        print("{:<40} {:<10.2f}".format("TPS per GPU (tok/s):",
-                                        metrics.tps_per_gpu))

     if isinstance(metrics, BenchmarkMetrics):
         result = {
@@ -830,8 +811,6 @@ async def limited_request_func(request_func_input, session, pbar):
             "errors": [output.error for output in outputs],
             "max_output_tokens_per_s": metrics.max_output_tokens_per_s,
             "max_concurrent_requests": metrics.max_concurrent_requests,
-            "tps_per_user": metrics.tps_per_user,
-            "tps_per_gpu": metrics.tps_per_gpu,
         }
     else:
         result = {
@@ -1157,7 +1136,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--percentile-metrics",
         type=str,
-        default="ttft,tpot,itl,e2el",
+        default=None,
         help="Comma-separated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
         'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
@@ -1304,12 +1283,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "in seconds (default: 600 seconds / 10 minutes). If set to 0, "
         "the ready check will be skipped.",
     )
-    parser.add_argument(
-        "--num-gpu",
-        type=int,
-        default=1,
-        help="Number of GPUs used for serving. Used to calculate TPS per GPU metric."
-    )

     parser.add_argument(
         "--extra-body",
@@ -1470,7 +1443,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         ramp_up_start_rps=args.ramp_up_start_rps,
         ramp_up_end_rps=args.ramp_up_end_rps,
         ready_check_timeout_sec=args.ready_check_timeout_sec,
-        num_gpu=args.num_gpu,
     )

     # Save config and results to json
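Note on the removed metrics: both were simple derivations over per-request data the benchmark still records, so they can be recomputed offline if needed. A minimal sketch using the same formulas as the deleted lines above (output_lens, latencies, dur_s, and num_gpu are illustrative stand-ins for the corresponding values inside calculate_metrics; a zero-latency guard is added here that the original did not have):

import numpy as np

def removed_tps_metrics(output_lens: list[int], latencies: list[float],
                        dur_s: float, num_gpu: int = 1) -> tuple[float, float]:
    # TPS per user: mean over requests of output tokens / end-to-end latency,
    # mirroring the deleted tps_per_user accumulation.
    per_request = [n / lat for n, lat in zip(output_lens, latencies) if lat > 0]
    tps_per_user = float(np.mean(per_request)) if per_request else 0.0
    # TPS per GPU: total output tokens / benchmark duration / GPU count,
    # matching the deleted tps_per_gpu expression.
    tps_per_gpu = sum(output_lens) / dur_s / num_gpu if num_gpu > 0 else 0.0
    return tps_per_user, tps_per_gpu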

vllm/entrypoints/openai/serving_chat.py

Lines changed: 4 additions & 4 deletions
@@ -295,10 +295,10 @@ async def create_chat_completion(
                 self.model_config.logits_processor_pattern,
                 self.default_sampling_params,
             )
-            # validate_logits_processors_parameters(
-            #     self.logits_processors,
-            #     sampling_params,
-            # )
+            validate_logits_processors_parameters(
+                self.logits_processors,
+                sampling_params,
+            )

             self._log_inputs(
                 request_id,

vllm/entrypoints/openai/serving_completion.py

Lines changed: 4 additions & 4 deletions
@@ -186,10 +186,10 @@ async def create_completion(
                 self.model_config.logits_processor_pattern,
                 self.default_sampling_params,
             )
-            # validate_logits_processors_parameters(
-            #     self.logits_processors,
-            #     sampling_params,
-            # )
+            validate_logits_processors_parameters(
+                self.logits_processors,
+                sampling_params,
+            )

             request_id_item = f"{request_id}-{i}"
vllm/model_executor/layers/quantization/modelopt.py

Lines changed: 8 additions & 4 deletions
@@ -939,9 +939,9 @@ def __init__(self, quant_config: ModelOptNvFp4Config) -> None:

         self.backend = "none"
         if envs.VLLM_NVFP4_GEMM_BACKEND is None:
-            # if has_flashinfer():
-            #     self.backend = "flashinfer-cutlass"
-            if cutlass_fp4_supported():
+            if has_flashinfer():
+                self.backend = "flashinfer-cutlass"
+            elif cutlass_fp4_supported():
                 self.backend = "cutlass"
             elif is_fp4_marlin_supported():
                 self.backend = "marlin"
@@ -1654,7 +1654,11 @@ def apply(
         routing_method_type = layer.routing_method_type
         if use_llama4_routing:
             routing_method_type = RoutingMethodType.Llama4
-        router_logits = router_logits.to(torch.float32) if routing_method_type == RoutingMethodType.DeepSeekV3 else router_logits
+        router_logits = (
+            router_logits.to(torch.float32)
+            if routing_method_type == RoutingMethodType.DeepSeekV3
+            else router_logits
+        )
         routing_bias = e_score_correction_bias
         if routing_bias is not None:
             routing_bias = routing_bias.to(torch.bfloat16)
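With the commented-out branch restored, default NVFP4 GEMM backend selection (VLLM_NVFP4_GEMM_BACKEND unset) prefers FlashInfer CUTLASS again, then native CUTLASS, then Marlin. A minimal sketch of the equivalent priority order, with the capability probes reduced to plain booleans (in the real code they are has_flashinfer(), cutlass_fp4_supported(), and is_fp4_marlin_supported(), as in the hunk above):

def pick_nvfp4_backend(flashinfer_ok: bool, cutlass_ok: bool, marlin_ok: bool) -> str:
    # Same priority order as the restored if/elif chain in __init__.
    if flashinfer_ok:
        return "flashinfer-cutlass"
    if cutlass_ok:
        return "cutlass"
    if marlin_ok:
        return "marlin"
    return "none"

# A host with FlashInfer installed takes the FlashInfer path even when the
# native CUTLASS kernel is also available.
assert pick_nvfp4_backend(True, True, False) == "flashinfer-cutlass"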

vllm/model_executor/layers/quantization/utils/flashinfer_utils.py

Lines changed: 4 additions & 1 deletion
@@ -291,5 +291,8 @@ def get_flashinfer_moe_backend() -> FlashinferMoeBackend:

 def is_flashinfer_supporting_global_sf(backend: FlashinferMoeBackend | None) -> bool:
     # TODO(shuw@nvidia): Update when new backends are added.
-    backends_supporting_global_sf = (FlashinferMoeBackend.CUTLASS,)
+    backends_supporting_global_sf = (
+        FlashinferMoeBackend.CUTLASS,
+        FlashinferMoeBackend.TENSORRT_LLM,
+    )
     return backend in backends_supporting_global_sf
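This hunk is what the commit title refers to: the TensorRT-LLM FlashInfer MoE backend now takes the global scale-factor path alongside CUTLASS. A small usage sketch of the updated helper (assuming the module is imported by the path shown above):

from vllm.model_executor.layers.quantization.utils.flashinfer_utils import (
    FlashinferMoeBackend,
    is_flashinfer_supporting_global_sf,
)

# After this change both CUTLASS and TENSORRT_LLM report global-sf support;
# passing None (no FlashInfer MoE backend selected) still returns False.
assert is_flashinfer_supporting_global_sf(FlashinferMoeBackend.CUTLASS)
assert is_flashinfer_supporting_global_sf(FlashinferMoeBackend.TENSORRT_LLM)
assert not is_flashinfer_supporting_global_sf(None)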
