Commit 462c8c2

revert unrelated work
Signed-off-by: jiahanc <[email protected]>
1 parent 72fc354 · commit 462c8c2

File tree

3 files changed: 12 additions, 41 deletions

vllm/benchmarks/serve.py

Lines changed: 4 additions & 33 deletions
@@ -19,7 +19,6 @@
 import argparse
 import asyncio
 import contextlib
-import gc
 import importlib.util
 import json
 import os
@@ -49,6 +48,7 @@
 from vllm.benchmarks.lib.ready_checker import wait_for_endpoint
 from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm.transformers_utils.tokenizer import get_tokenizer
+from vllm.utils.gc_utils import freeze_gc_heap

 MILLISECONDS_TO_SECONDS_CONVERSION = 1000

@@ -94,10 +94,6 @@ class BenchmarkMetrics:
     # Max output tokens per second and concurrent requests at that peak
     max_output_tokens_per_s: float
     max_concurrent_requests: int
-    # TPS per user - average tokens per second per request
-    tps_per_user: float
-    # TPS per GPU - output throughput divided by number of GPUs
-    tps_per_gpu: float


 @dataclass
@@ -292,7 +288,6 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
-    num_gpu: int = 1,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     """Calculate the metrics for the benchmark.

@@ -316,7 +311,6 @@ def calculate_metrics(
     all_tpots: list[float] = []
     ttfts: list[float] = []
     e2els: list[float] = []
-    tps_per_user: list[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_tokens
@@ -344,10 +338,6 @@ def calculate_metrics(
             itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
             e2els.append(outputs[i].latency)
-
-            # Calculate TPS per request: (input_tokens + output_tokens) / latency
-            tps_per_user.append(output_len / (outputs[i].latency))
-
             completed += 1
         else:
             actual_output_lens.append(0)
@@ -486,8 +476,6 @@ def calculate_metrics(
         ],
         max_output_tokens_per_s=max_output_tokens_per_s,
         max_concurrent_requests=max_concurrent_requests,
-        tps_per_user=np.mean(tps_per_user) if tps_per_user else 0.0,
-        tps_per_gpu=sum(actual_output_lens) / dur_s / num_gpu if num_gpu > 0 else 0.0,
     )

     return metrics, actual_output_lens
@@ -520,7 +508,6 @@ async def benchmark(
     ramp_up_start_rps: int | None = None,
     ramp_up_end_rps: int | None = None,
     ready_check_timeout_sec: int = 600,
-    num_gpu: int = 1,
 ):
     try:
         request_func = ASYNC_REQUEST_FUNCS[endpoint_type]
@@ -752,7 +739,6 @@ async def limited_request_func(request_func_input, session, pbar):
             tokenizer=tokenizer,
             selected_percentiles=selected_percentiles,
             goodput_config_dict=goodput_config_dict,
-            num_gpu=num_gpu,
         )
     else:
         metrics = calculate_metrics_for_embeddings(
@@ -805,11 +791,6 @@ async def limited_request_func(request_func_input, session, pbar):
             "Total Token throughput (tok/s):", metrics.total_token_throughput
         )
     )
-    if isinstance(metrics, BenchmarkMetrics):
-        print("{:<40} {:<10.2f}".format("TPS per user (tok/s):",
-                                        metrics.tps_per_user))
-        print("{:<40} {:<10.2f}".format("TPS per GPU (tok/s):",
-                                        metrics.tps_per_gpu))

     if isinstance(metrics, BenchmarkMetrics):
         result = {
@@ -830,8 +811,6 @@ async def limited_request_func(request_func_input, session, pbar):
             "errors": [output.error for output in outputs],
             "max_output_tokens_per_s": metrics.max_output_tokens_per_s,
             "max_concurrent_requests": metrics.max_concurrent_requests,
-            "tps_per_user": metrics.tps_per_user,
-            "tps_per_gpu": metrics.tps_per_gpu,
         }
     else:
         result = {
@@ -1157,7 +1136,7 @@ def add_cli_args(parser: argparse.ArgumentParser):
     parser.add_argument(
         "--percentile-metrics",
         type=str,
-        default="ttft,tpot,itl,e2el",
+        default=None,
         help="Comma-separated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
         'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
@@ -1304,12 +1283,6 @@ def add_cli_args(parser: argparse.ArgumentParser):
         "in seconds (default: 600 seconds / 10 minutes). If set to 0, "
         "the ready check will be skipped.",
     )
-    parser.add_argument(
-        "--num-gpu",
-        type=int,
-        default=1,
-        help="Number of GPUs used for serving. Used to calculate TPS per GPU metric."
-    )

     parser.add_argument(
         "--extra-body",
@@ -1441,8 +1414,7 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
     percentile_metrics: str = args.percentile_metrics or default_percentile_metrics

     # Avoid GC processing "static" data - reduce pause times.
-    gc.collect()
-    gc.freeze()
+    freeze_gc_heap()

     benchmark_result = await benchmark(
         task_type=task_type,
@@ -1471,7 +1443,6 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         ramp_up_start_rps=args.ramp_up_start_rps,
         ramp_up_end_rps=args.ramp_up_end_rps,
         ready_check_timeout_sec=args.ready_check_timeout_sec,
-        num_gpu=args.num_gpu,
     )

     # Save config and results to json
@@ -1555,4 +1526,4 @@ async def main_async(args: argparse.Namespace) -> dict[str, Any]:
         json.dump(result_json, outfile)
         save_to_pytorch_benchmark_format(args, result_json, file_name)

-    return result_json
+    return result_json
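
Note on the GC change above: the revert replaces the inline gc.collect() / gc.freeze() pair with the shared freeze_gc_heap() helper imported from vllm.utils.gc_utils. The helper's body is not part of this diff; a minimal sketch, assuming it simply wraps the two stdlib calls the old code made directly (the real implementation may differ):

import gc


def freeze_gc_heap() -> None:
    # Sketch only, not the vllm.utils.gc_utils implementation.
    # Collect once so the objects that survive are the long-lived "static" ones,
    # then freeze them so later GC passes skip that set and pauses stay short.
    gc.collect()
    gc.freeze()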

vllm/entrypoints/openai/serving_chat.py

Lines changed: 4 additions & 4 deletions
@@ -295,10 +295,10 @@ async def create_chat_completion(
             self.model_config.logits_processor_pattern,
             self.default_sampling_params,
         )
-        # validate_logits_processors_parameters(
-        #     self.logits_processors,
-        #     sampling_params,
-        # )
+        validate_logits_processors_parameters(
+            self.logits_processors,
+            sampling_params,
+        )

         self._log_inputs(
             request_id,

vllm/entrypoints/openai/serving_completion.py

Lines changed: 4 additions & 4 deletions
@@ -186,10 +186,10 @@ async def create_completion(
             self.model_config.logits_processor_pattern,
             self.default_sampling_params,
         )
-        # validate_logits_processors_parameters(
-        #     self.logits_processors,
-        #     sampling_params,
-        # )
+        validate_logits_processors_parameters(
+            self.logits_processors,
+            sampling_params,
+        )

         request_id_item = f"{request_id}-{i}"
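
Note: both serving_chat.py and serving_completion.py restore the previously commented-out call to validate_logits_processors_parameters(self.logits_processors, sampling_params). Its implementation is not part of this commit; purely as an illustration of the kind of early check such a call performs, a hypothetical sketch (the extra_args and argument_names attributes below are assumptions, not vLLM's documented API):

from typing import Any


def validate_logits_processors_parameters(
    logits_processors: list[Any] | None,
    sampling_params: Any,
) -> None:
    # Hypothetical sketch: reject per-request arguments that no configured
    # logits processor understands, so bad requests fail before generation.
    if not logits_processors:
        return
    extra_args = getattr(sampling_params, "extra_args", None) or {}
    accepted: set[str] = set()
    for processor in logits_processors:
        # "argument_names" is an illustrative hook, not a real vLLM attribute.
        accepted.update(getattr(processor, "argument_names", ()))
    unknown = set(extra_args) - accepted
    if unknown:
        raise ValueError(f"Unsupported logits processor parameters: {sorted(unknown)}")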
