@@ -19,7 +19,6 @@
 import argparse
 import asyncio
 import contextlib
-import gc
 import importlib.util
 import json
 import os
@@ -49,6 +48,7 @@
 from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
 from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils.gc_utils import freeze_gc_heap
 
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000
 
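The new `freeze_gc_heap` helper replaces the inline `gc.collect()` / `gc.freeze()` pair removed further down in this diff. A minimal sketch of the behavior it presumably wraps, assuming it is a thin convenience over the two stdlib calls (the real implementation lives in `vllm.utils.gc_utils`):

```python
import gc

def freeze_gc_heap() -> None:
    # Collect first so only live, long-lived objects survive ...
    gc.collect()
    # ... then move every survivor into the permanent generation, so
    # subsequent GC passes skip this "static" data and pause times shrink.
    gc.freeze()
```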
@@ -94,10 +94,6 @@ class BenchmarkMetrics:
     # Max output tokens per second and concurrent requests at that peak
     max_output_tokens_per_s: float
     max_concurrent_requests: int
-    # TPS per user - average tokens per second per request
-    tps_per_user: float
-    # TPS per GPU - output throughput divided by number of GPUs
-    tps_per_gpu: float
 
 
 @dataclass
@@ -292,7 +288,6 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
-    num_gpu: int = 1,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     """Calculate the metrics for the benchmark.
 
@@ -316,7 +311,6 @@
     all_tpots: list[float] = []
     ttfts: list[float] = []
     e2els: list[float] = []
-    tps_per_user: list[float] = []
     for i in range(len(outputs)):
        if outputs[i].success:
            output_len = outputs[i].output_tokens
@@ -344,10 +338,6 @@
            itls += outputs[i].itl
            ttfts.append(outputs[i].ttft)
            e2els.append(outputs[i].latency)
-
-            # Calculate TPS per request: (input_tokens + output_tokens) / latency
-            tps_per_user.append(output_len / (outputs[i].latency))
-
            completed += 1
        else:
            actual_output_lens.append(0)
@@ -486,8 +476,6 @@
        ],
        max_output_tokens_per_s=max_output_tokens_per_s,
        max_concurrent_requests=max_concurrent_requests,
-        tps_per_user=np.mean(tps_per_user) if tps_per_user else 0.0,
-        tps_per_gpu=sum(actual_output_lens) / dur_s / num_gpu if num_gpu > 0 else 0.0,
    )
 
    return metrics, actual_output_lens
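The removed aggregates are straightforward to reproduce offline from per-request data. A standalone sketch of the same arithmetic the deleted lines performed (the function name and signature are illustrative, not part of vllm):

```python
import numpy as np

def tps_aggregates(
    output_lens: list[int], latencies: list[float], dur_s: float, num_gpu: int = 1
) -> tuple[float, float]:
    """Recompute the removed TPS metrics from per-request results."""
    # TPS per user: mean of output tokens / end-to-end latency per request.
    per_user = [n / lat for n, lat in zip(output_lens, latencies) if lat > 0]
    tps_per_user = float(np.mean(per_user)) if per_user else 0.0
    # TPS per GPU: total output tokens / benchmark duration / GPU count.
    tps_per_gpu = sum(output_lens) / dur_s / num_gpu if num_gpu > 0 else 0.0
    return tps_per_user, tps_per_gpu
```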
@@ -520,7 +508,6 @@ async def benchmark(
    ramp_up_start_rps: int | None = None,
    ramp_up_end_rps: int | None = None,
    ready_check_timeout_sec: int = 600,
-    num_gpu: int = 1,
 ):
    try:
        request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -752,7 +739,6 @@ async def limited_request_func(request_func_input, session, pbar):
            tokenizer=tokenizer,
            selected_percentiles=selected_percentiles,
            goodput_config_dict=goodput_config_dict,
-            num_gpu=num_gpu,
        )
    else:
        metrics = calculate_metrics_for_embeddings(
@@ -805,11 +791,6 @@
            "Total Token throughput (tok/s):", metrics.total_token_throughput
        )
    )
-    if isinstance(metrics, BenchmarkMetrics):
-        print("{:<40} {:<10.2f}".format("TPS per user (tok/s):",
-                                        metrics.tps_per_user))
-        print("{:<40} {:<10.2f}".format("TPS per GPU (tok/s):",
-                                        metrics.tps_per_gpu))
 
    if isinstance(metrics, BenchmarkMetrics):
        result = {
@@ -830,8 +811,6 @@
            "errors": [output.error for output in outputs],
            "max_output_tokens_per_s": metrics.max_output_tokens_per_s,
            "max_concurrent_requests": metrics.max_concurrent_requests,
-            "tps_per_user": metrics.tps_per_user,
-            "tps_per_gpu": metrics.tps_per_gpu,
        }
    else:
        result = {
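With the keys gone from the result dict, a per-GPU figure can still be derived from a saved result file in a post-processing step. A hedged sketch, assuming the JSON retains an aggregate `output_throughput` field (tok/s across all GPUs); the file name and key are assumptions, so check the actual output:

```python
import json

# Hypothetical post-processing: recover "TPS per GPU" after the fact.
with open("benchmark_result.json") as f:  # path is illustrative
    result = json.load(f)

num_gpu = 8  # however many GPUs actually served the run
print("TPS per GPU (tok/s):", result["output_throughput"] / num_gpu)
```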
@@ -1157,7 +1136,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
11571136 parser .add_argument (
11581137 "--percentile-metrics" ,
11591138 type = str ,
1160- default = "ttft,tpot,itl,e2el" ,
1139+ default = None ,
11611140 help = "Comma-separated list of selected metrics to report percentils. "
11621141 "This argument specifies the metrics to report percentiles. "
11631142 'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
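With `default=None`, the effective metric list can be resolved per task before metrics are computed (see `args.percentile_metrics or default_percentile_metrics` later in this diff). One plausible resolution, sketched here as an assumption since the diff does not show how `default_percentile_metrics` is chosen:

```python
def resolve_percentile_metrics(arg_value: str | None, task_type: str) -> str:
    # Fall back to a task-appropriate default when --percentile-metrics is
    # omitted; embedding runs have no TTFT/TPOT/ITL, only end-to-end latency.
    if arg_value is not None:
        return arg_value
    return "ttft,tpot,itl,e2el" if task_type == "generation" else "e2el"
```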
@@ -1304,12 +1283,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
13041283 "in seconds (default: 600 seconds / 10 minutes). If set to 0, "
13051284 "the ready check will be skipped." ,
13061285 )
1307- parser .add_argument (
1308- "--num-gpu" ,
1309- type = int ,
1310- default = 1 ,
1311- help = "Number of GPUs used for serving. Used to calculate TPS per GPU metric."
1312- )
13131286
13141287 parser .add_argument (
13151288 "--extra-body" ,
@@ -1441,8 +1414,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
    percentile_metrics: str = args.percentile_metrics or default_percentile_metrics
 
    # Avoid GC processing "static" data - reduce pause times.
-    gc.collect()
-    gc.freeze()
+    freeze_gc_heap()
 
    benchmark_result = await benchmark(
        task_type=task_type,
@@ -1471,7 +1443,6 @@
        ramp_up_start_rps=args.ramp_up_start_rps,
        ramp_up_end_rps=args.ramp_up_end_rps,
        ready_check_timeout_sec=args.ready_check_timeout_sec,
-        num_gpu=args.num_gpu,
    )
 
    # Save config and results to json
@@ -1555,4 +1526,4 @@
        json.dump(result_json, outfile)
        save_to_pytorch_benchmark_format(args, result_json, file_name)
 
-    return result_json
+    return result_json