diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index ec9b0fd6e969..88cabe9e54fe 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -20,15 +20,17 @@
 import logging
 import math
 import random
+import re
 from abc import ABC, abstractmethod
 from collections.abc import Callable, Iterator, Mapping
 from contextlib import suppress
 from copy import deepcopy
 from dataclasses import dataclass
+from datasets import load_dataset
 from functools import cache
 from io import BytesIO
 from tempfile import NamedTemporaryFile
 from typing import Any, cast
 
 import numpy as np
 from PIL import Image
@@ -63,6 +65,8 @@
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser
 
+from rich.progress import Progress
+
 logger = logging.getLogger(__name__)
 
 # -----------------------------------------------------------------------------
@@ -1288,6 +1292,119 @@ def sample(
         )
         return samples
 
+# -----------------------------------------------------------------------------
+# Project Gutenberg Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class GutenbergDataset(BenchmarkDataset):
+    """
+    Implements the Project Gutenberg dataset. Streams book text from a
+    HuggingFace dataset and chunks it into fixed-length sample requests.
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(self) -> None:
+        """Load data from HuggingFace datasets."""
+        self.data = load_dataset(self.dataset_path, split="en", streaming=True)
+        self.data = self.data.shuffle(seed=self.random_seed)
+
+    def clean_gutenberg_text(self, text: str) -> str:
+        """
+        Basic cleaning for Project Gutenberg text:
+        - Extract content between "*** START OF ..." and "*** END OF ..." markers (if available)
+        - Normalize whitespace and multiple newlines
+        - Trim leading/trailing spaces
+        """
+        # Extract content inside START/END markers
+        start_match = re.search(r"\*\*\* START OF.*?\*\*\*", text, re.IGNORECASE | re.DOTALL)
+        end_match = re.search(r"\*\*\* END OF.*?\*\*\*", text, re.IGNORECASE | re.DOTALL)
+
+        if start_match and end_match:
+            content = text[start_match.end(): end_match.start()]
+        else:
+            # Use the entire raw text if the markers are missing
+            content = text
+
+        # Normalize newlines
+        content = content.replace("\r\n", "\n").replace("\r", "\n")
+        # Collapse consecutive blank lines and runs of spaces/tabs
+        content = re.sub(r"\n{2,}", "\n\n", content)
+        content = re.sub(r"[ \t]{2,}", " ", content)
+        # Trim leading/trailing whitespace
+        content = content.strip()
+        return content
+
+    def chunk_tokens(self, input_ids: list[int], input_len: int = 4000) -> list[list[int]]:
+        """
+        Split a list of token IDs into chunks of at most `input_len` tokens.
+        """
+        return [
+            input_ids[i : i + input_len]
+            for i in range(0, len(input_ids), input_len)
+        ]
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        input_len: int,
+        output_len: int,
+        request_id_prefix: str = "",
+        no_oversample: bool = False,
+        **kwargs,
+    ) -> list:
+        samples = []
+        ind = 0
+
+        pbar = Progress()
+        task_id = pbar.add_task(
+            description="Preparing input prompts...",
+            total=num_requests,
+        )
+        pbar.start()
+        for book in self.data:
+            if len(samples) >= num_requests:
+                break
+            text = book["text"]
+            text = self.clean_gutenberg_text(text)
+            input_ids = tokenizer(text).input_ids
+            chunks = self.chunk_tokens(input_ids, input_len)
+            # Drop the last chunk, which may be shorter than input_len.
+            for i in range(len(chunks) - 1):
+                if len(samples) >= num_requests:
+                    break
+                prompt = tokenizer.decode(chunks[i])
+                prompt_len = len(chunks[i])
+                samples.append(
+                    SampleRequest(
+                        prompt=prompt,
+                        prompt_len=prompt_len,
+                        expected_output_len=output_len,
+                        multi_modal_data=None,
+                        request_id=request_id_prefix + str(ind),
+                    )
+                )
+                pbar.update(task_id=task_id, advance=1)
+                ind += 1
+
+        completed = pbar.tasks[task_id].completed
+        if completed < num_requests:
+            print(
+                f"Not enough compatible requests ({completed}/{num_requests}). "
+                f"Start oversampling..."
+            )
+            pbar.update(task_id=task_id, advance=(num_requests - completed))
+        pbar.stop()
+        self.maybe_oversample_requests(samples,
+                                       num_requests,
+                                       request_id_prefix,
+                                       no_oversample)
+        return samples
+
 
 class _ValidateDatasetArgs(argparse.Action):
     """Argparse action to validate dataset name and path compatibility."""
@@ -1333,6 +1450,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
             "custom",
             "prefix_repetition",
             "spec_bench",
+            "gutenberg",
         ],
         help="Name of the dataset to benchmark on.",
     )
@@ -1346,7 +1464,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser):
         type=str,
         default=None,
         action=_ValidateDatasetArgs,
-        help="Path to the sharegpt/sonnet dataset. "
+        help="Path to the sharegpt/sonnet/gutenberg dataset. "
         "Or the huggingface dataset ID if using HF dataset.",
     )
     parser.add_argument(
@@ -1367,6 +1485,13 @@
 
     # group for dataset specific arguments
     custom_group = parser.add_argument_group("custom dataset options")
+    custom_group.add_argument(
+        "--custom-input-len",
+        type=int,
+        default=None,
+        help="Number of input tokens per request, "
+        "used only for the custom dataset.",
+    )
     custom_group.add_argument(
         "--custom-output-len",
         type=int,
@@ -1417,6 +1542,22 @@
         "from the ShareGPT dataset.",
     )
 
+    gutenberg_group = parser.add_argument_group("gutenberg dataset options")
+    gutenberg_group.add_argument(
+        "--gutenberg-input-len",
+        type=int,
+        default=4000,
+        help="Number of input tokens per request; each Gutenberg book is "
+        "chunked into prompts of exactly this length.",
+    )
+    gutenberg_group.add_argument(
+        "--gutenberg-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request, i.e. the number of tokens "
+        "to generate for each Gutenberg prompt.",
+    )
+
     blazedit_group = parser.add_argument_group("blazedit dataset options")
     blazedit_group.add_argument(
         "--blazedit-min-distance",
@@ -1644,6 +1785,7 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
         input_requests = dataset.sample(
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
+            input_len=args.custom_input_len,
            output_len=args.custom_output_len,
             skip_chat_template=args.skip_chat_template,
             request_id_prefix=args.request_id_prefix,
@@ -1828,6 +1970,16 @@ def get_samples(args, tokenizer) -> list[SampleRequest]:
             request_id_prefix=args.request_id_prefix,
             no_oversample=args.no_oversample,
         ),
+        "gutenberg": lambda: GutenbergDataset(
+            random_seed=args.seed, dataset_path=args.dataset_path
+        ).sample(
+            tokenizer=tokenizer,
+            num_requests=args.num_prompts,
+            input_len=args.gutenberg_input_len,
+            output_len=args.gutenberg_output_len,
+            request_id_prefix=args.request_id_prefix,
+            no_oversample=args.no_oversample,
+        ),
         "burstgpt": lambda: BurstGPTDataset(
             random_seed=args.seed,
             dataset_path=args.dataset_path,
@@ -1974,6 +2126,7 @@ def sample(
         num_requests: int,
         lora_path: str | None = None,
         max_loras: int | None = None,
+        input_len: int | None = None,
         output_len: int | None = None,
         enable_multimodal_chat: bool = False,
         skip_chat_template: bool = False,
@@ -1991,11 +2144,27 @@
                 num_requests,
             )
 
 
+        pbar = Progress()
+        task_id = pbar.add_task(description="Preparing input prompts...", total=num_requests)
+        pbar.start()
         sampled_requests = []
         for i, item in enumerate(self.data):
             if len(sampled_requests) >= num_requests:
                 break
+            prompt = item["prompt"]
+            if prompt is None:
+                continue
+
+            prompt_ids = tokenizer(prompt).input_ids
+            prompt_len = len(prompt_ids)
+            # Keep only prompts that are at least `input_len` tokens long and
+            # truncate them to exactly `input_len` tokens.
+            if input_len is not None:
+                if prompt_len < input_len:
+                    continue
+                prompt = tokenizer.decode(prompt_ids[:input_len])
+                prompt_len = input_len
 
             # apply template
             if not skip_chat_template:
@@ -2004,8 +2173,8 @@
                     add_generation_prompt=True,
                     tokenize=False,
                 )
+                prompt_len = len(tokenizer(prompt).input_ids)
 
-            prompt_len = len(tokenizer(prompt).input_ids)
             sampled_requests.append(
                 SampleRequest(
                     prompt=prompt,
@@ -2014,6 +2183,12 @@
                     request_id=request_id_prefix + str(i),
                 )
             )
+            pbar.update(task_id=task_id, advance=1)
+        completed = pbar.tasks[task_id].completed
+        if completed < num_requests:
+            print(f"Not enough compatible requests ({completed}/{num_requests}). 
Start oversampling...") + pbar.update(task_id=task_id, advance=(num_requests-completed)) + pbar.stop() self.maybe_oversample_requests( sampled_requests, num_requests, request_id_prefix, no_oversample ) diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index f54ddee52fb6..b9dc47168618 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -813,7 +813,10 @@ async def warmup_limited_request_func(): print(f"num_workers: {num_workers}") pbar_q = Queue() + start_time = time.time() benchmark_start_time = time.perf_counter() + ts = datetime.fromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S') + print("Started at ", ts) if num_workers > 0: output_q = Queue() @@ -878,145 +881,149 @@ async def warmup_limited_request_func(): benchmark_end_time = time.perf_counter() benchmark_duration = benchmark_end_time - benchmark_start_time + end_time = time.time() + ts = datetime.fromtimestamp(end_time).strftime('%Y-%m-%d %H:%M:%S') + print("Finished at ", ts) - if task_type == TaskType.GENERATION: - metrics, actual_output_lens = calculate_metrics( - input_requests=input_requests, - outputs=outputs, - dur_s=benchmark_duration, - tokenizer=tokenizer, - selected_percentiles=selected_percentiles, - goodput_config_dict=goodput_config_dict, - ) - else: - metrics = calculate_metrics_for_embeddings( - outputs=outputs, - dur_s=benchmark_duration, - selected_percentiles=selected_percentiles, - ) - actual_output_lens = 0 - - print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) - print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - print("{:<40} {:<10}".format("Failed requests:", metrics.failed)) - if max_concurrency is not None: - print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) - if request_rate != float("inf"): - print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) - print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - if isinstance(metrics, BenchmarkMetrics): - print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) - print( - "{:<40} {:<10.2f}".format( - "Request throughput (req/s):", metrics.request_throughput - ) - ) - if goodput_config_dict: - print( - "{:<40} {:<10.2f}".format( - "Request goodput (req/s):", metrics.request_goodput + if warmup_time == 0 and cooldown_time == 0: + if task_type == TaskType.GENERATION: + metrics, actual_output_lens = calculate_metrics( + input_requests=input_requests, + outputs=outputs, + dur_s=benchmark_duration, + tokenizer=tokenizer, + selected_percentiles=selected_percentiles, + goodput_config_dict=goodput_config_dict, ) - ) - if isinstance(metrics, BenchmarkMetrics): - print( - "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", metrics.output_throughput + else: + metrics = calculate_metrics_for_embeddings( + outputs=outputs, + dur_s=benchmark_duration, + selected_percentiles=selected_percentiles, ) - ) + actual_output_lens = 0 + + print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) + print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) + print("{:<40} {:<10}".format("Failed requests:", metrics.failed)) + if max_concurrency is not None: + print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency)) + if request_rate != float("inf"): + print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate)) + print("{:<40} 
{:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) + print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) + if isinstance(metrics, BenchmarkMetrics): + print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) print( "{:<40} {:<10.2f}".format( - "Peak output token throughput (tok/s):", metrics.max_output_tokens_per_s + "Request throughput (req/s):", metrics.request_throughput ) ) + if goodput_config_dict: + print( + "{:<40} {:<10.2f}".format( + "Request goodput (req/s):", metrics.request_goodput + ) + ) + if isinstance(metrics, BenchmarkMetrics): + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", metrics.output_throughput + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Peak output token throughput (tok/s):", metrics.max_output_tokens_per_s + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Peak concurrent requests:", metrics.max_concurrent_requests + ) + ) print( "{:<40} {:<10.2f}".format( - "Peak concurrent requests:", metrics.max_concurrent_requests + "Total Token throughput (tok/s):", metrics.total_token_throughput ) ) - print( - "{:<40} {:<10.2f}".format( - "Total Token throughput (tok/s):", metrics.total_token_throughput - ) - ) - - if isinstance(metrics, BenchmarkMetrics): - result = { - "duration": benchmark_duration, - "completed": metrics.completed, - "failed": metrics.failed, - "total_input_tokens": metrics.total_input, - "total_output_tokens": metrics.total_output, - "request_throughput": metrics.request_throughput, - "request_goodput": metrics.request_goodput if goodput_config_dict else None, - "output_throughput": metrics.output_throughput, - "total_token_throughput": metrics.total_token_throughput, - "input_lens": [output.prompt_len for output in outputs], - "output_lens": actual_output_lens, - "ttfts": [output.ttft for output in outputs], - "itls": [output.itl for output in outputs], - "generated_texts": [output.generated_text for output in outputs], - "errors": [output.error for output in outputs], - "max_output_tokens_per_s": metrics.max_output_tokens_per_s, - "max_concurrent_requests": metrics.max_concurrent_requests, - } - else: - result = { - "duration": benchmark_duration, - "completed": metrics.completed, - "total_input_tokens": metrics.total_input, - "request_throughput": metrics.request_throughput, - "total_token_throughput": metrics.total_token_throughput, - "input_lens": [output.prompt_len for output in outputs], - "errors": [output.error for output in outputs], - } - def process_one_metric( - # E.g., "ttft" - metric_attribute_name: str, - # E.g., "TTFT" - metric_name: str, - # E.g., "Time to First Token" - metric_header: str, - ): - # This function prints and adds statistics of the specified - # metric. 
- if metric_attribute_name not in selected_percentile_metrics: - return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) - print( - "{:<40} {:<10.2f}".format( - f"Mean {metric_name} (ms):", - getattr(metrics, f"mean_{metric_attribute_name}_ms"), + if isinstance(metrics, BenchmarkMetrics): + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "failed": metrics.failed, + "total_input_tokens": metrics.total_input, + "total_output_tokens": metrics.total_output, + "request_throughput": metrics.request_throughput, + "request_goodput": metrics.request_goodput if goodput_config_dict else None, + "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "output_lens": actual_output_lens, + "ttfts": [output.ttft for output in outputs], + "itls": [output.itl for output in outputs], + "generated_texts": [output.generated_text for output in outputs], + "errors": [output.error for output in outputs], + "max_output_tokens_per_s": metrics.max_output_tokens_per_s, + "max_concurrent_requests": metrics.max_concurrent_requests, + } + else: + result = { + "duration": benchmark_duration, + "completed": metrics.completed, + "total_input_tokens": metrics.total_input, + "request_throughput": metrics.request_throughput, + "total_token_throughput": metrics.total_token_throughput, + "input_lens": [output.prompt_len for output in outputs], + "errors": [output.error for output in outputs], + } + + def process_one_metric( + # E.g., "ttft" + metric_attribute_name: str, + # E.g., "TTFT" + metric_name: str, + # E.g., "Time to First Token" + metric_header: str, + ): + # This function prints and adds statistics of the specified + # metric. 
+ if metric_attribute_name not in selected_percentile_metrics: + return + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"), + ) ) - ) - print( - "{:<40} {:<10.2f}".format( - f"Median {metric_name} (ms):", - getattr(metrics, f"median_{metric_attribute_name}_ms"), + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"), + ) ) - ) - result[f"mean_{metric_attribute_name}_ms"] = getattr( - metrics, f"mean_{metric_attribute_name}_ms" - ) - result[f"median_{metric_attribute_name}_ms"] = getattr( - metrics, f"median_{metric_attribute_name}_ms" - ) - result[f"std_{metric_attribute_name}_ms"] = getattr( - metrics, f"std_{metric_attribute_name}_ms" - ) - for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): - p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) - result[f"p{p_word}_{metric_attribute_name}_ms"] = value + result[f"mean_{metric_attribute_name}_ms"] = getattr( + metrics, f"mean_{metric_attribute_name}_ms" + ) + result[f"median_{metric_attribute_name}_ms"] = getattr( + metrics, f"median_{metric_attribute_name}_ms" + ) + result[f"std_{metric_attribute_name}_ms"] = getattr( + metrics, f"std_{metric_attribute_name}_ms" + ) + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): + p_word = str(int(p)) if int(p) == p else str(p) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) + result[f"p{p_word}_{metric_attribute_name}_ms"] = value - if task_type == TaskType.GENERATION: - process_one_metric("ttft", "TTFT", "Time to First Token") - process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") - process_one_metric("itl", "ITL", "Inter-token Latency") - process_one_metric("e2el", "E2EL", "End-to-end Latency") + if task_type == TaskType.GENERATION: + process_one_metric("ttft", "TTFT", "Time to First Token") + process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") + process_one_metric("itl", "ITL", "Inter-token Latency") + process_one_metric("e2el", "E2EL", "End-to-end Latency") - print("=" * 50) + print("=" * 50) if warmup_time > 0.0 or cooldown_time > 0.0: """ @@ -1033,10 +1040,13 @@ def process_one_metric( * error -- Copy at first * start_time -- Copy at first """ - min_start = min(e.start_time for e in outputs) - max_end = max(e.start_time + e.latency for e in outputs) + # min_start = min(e.start_time for e in outputs) + # max_end = max(e.start_time + e.latency for e in outputs) + min_start = benchmark_start_time + max_end = benchmark_end_time num_counted_tokens = 0 effective_outputs: list[RequestFuncOutput] = [] + full_effective_outputs: list[RequestFuncOutput] = [] warmup_sentinel = min_start + warmup_time cooldown_sentinel = max_end - cooldown_time @@ -1088,6 +1098,10 @@ def process_one_metric( effective_outputs.append(new_output) num_counted_tokens += new_output.output_tokens + # 5. 
Collect full_effective_outputs: requests that ran fully inside the window (for precise e2el and ttft)
+            if (output.start_time >= warmup_sentinel) and (output.start_time + output.latency <= cooldown_sentinel):
+                full_effective_outputs.append(output)
+
         # Get effective duration (t_duration)
         t_duration = cooldown_sentinel - warmup_sentinel
 
@@ -1100,21 +1114,37 @@ def process_one_metric(
             goodput_config_dict=goodput_config_dict,
             is_trim=True,
         )
-        print("{s:{c}^{n}}".format(s="Serving Benchmark Result after warmup before cooldown", n=50, c="="))
+
+        ft_metrics, ft_actual_output_lens = calculate_metrics(
+            input_requests=input_requests,
+            outputs=full_effective_outputs,
+            dur_s=t_duration,
+            tokenizer=tokenizer,
+            selected_percentiles=selected_percentiles,
+            goodput_config_dict=goodput_config_dict,
+            is_trim=False,
+        )
+
+        print("{s:{c}^{n}}".format(s="Serving Benchmark Result", n=50, c="="))
+        print("{:<40} {:<10}".format("Number of worker processes:", num_workers))
+        print("{:<40} {:<10}".format("Successful requests:", ft_metrics.completed))
+        if max_concurrency is not None:
+            print("{:<40} {:<10}".format("Maximum request concurrency:",
+                                          max_concurrency))
+        if request_rate != float("inf"):
+            print("{:<40} {:<10.2f}".format("Request rate configured (RPS):",
+                                             request_rate))
         print("{:<40} {:<10}".format("Warm-up Time:", warmup_time))
         print("{:<40} {:<10}".format("Cool-down Time:", cooldown_time))
-        print("{:<40} {:<10}".format("Total counted tokens at filtering:", num_counted_tokens))
         print("{:<40} {:<10.2f}".format("Benchmark duration (s):", t_duration))
-        if isinstance(metrics, BenchmarkMetrics):
-            print("{:<40} {:<10}".format("Total generated tokens:", t_metrics.total_output))
-        if isinstance(metrics, BenchmarkMetrics):
-            print(
-                "{:<40} {:<10.2f}".format(
-                    "Output token throughput (tok/s):", num_counted_tokens / t_duration
-                )
-            )
-
-        result_t = {
+        print("{:<40} {:<10}".format("Total input tokens:", ft_metrics.total_input))
+        print("{:<40} {:<10}".format("Total generated tokens:", t_metrics.total_output))
+        print("{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", num_counted_tokens / t_duration))
+        print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
+            (ft_metrics.total_input + num_counted_tokens) / t_duration))
+
+        t_result = {
             "duration": t_duration,
             "completed": t_metrics.completed,
             "total_input_tokens": t_metrics.total_input,
@@ -1133,7 +1163,29 @@
             "max_concurrent_requests": t_metrics.max_concurrent_requests,
         }
 
+        ft_result = {
+            "duration": t_duration,
+            "completed": ft_metrics.completed,
+            "total_input_tokens": ft_metrics.total_input,
+            "total_output_tokens": ft_metrics.total_output,
+            "request_throughput": ft_metrics.request_throughput,
+            "request_goodput": ft_metrics.request_goodput if goodput_config_dict else None,
+            "output_throughput": ft_metrics.output_throughput,
+            "total_token_throughput": ft_metrics.total_token_throughput,
+            "input_lens": [output.prompt_len for output in outputs],
+            "output_lens": ft_actual_output_lens,
+            "ttfts": [output.ttft for output in effective_outputs],
+            "itls": [output.itl for output in effective_outputs],
+            "generated_texts": [output.generated_text for output in effective_outputs],
+            "errors": [output.error for output in outputs],
+            "max_output_tokens_per_s": ft_metrics.max_output_tokens_per_s,
+            "max_concurrent_requests": ft_metrics.max_concurrent_requests,
+        }
+        result = ft_result
+
         def process_one_metric_trim(
+            o_metrics,
+            o_result,
             # E.g., "ttft"
             metric_attribute_name: str,
             # E.g., "TTFT"
             metric_name: str,
             # E.g., "Time to First Token"
             metric_header: str,
         ):
@@ -1149,31 +1201,33 @@ def process_one_metric_trim(
             print(
                 "{:<40} {:<10.2f}".format(
                     f"Mean {metric_name} (ms):",
-                    getattr(t_metrics, f"mean_{metric_attribute_name}_ms"),
+                    getattr(o_metrics, f"mean_{metric_attribute_name}_ms"),
                 )
             )
             print(
                 "{:<40} {:<10.2f}".format(
                     f"Median {metric_name} (ms):",
-                    getattr(t_metrics, f"median_{metric_attribute_name}_ms"),
+                    getattr(o_metrics, f"median_{metric_attribute_name}_ms"),
                 )
             )
-            result_t[f"mean_{metric_attribute_name}_ms"] = getattr(
-                t_metrics, f"mean_{metric_attribute_name}_ms"
+            o_result[f"mean_{metric_attribute_name}_ms"] = getattr(
+                o_metrics, f"mean_{metric_attribute_name}_ms"
             )
-            result_t[f"median_{metric_attribute_name}_ms"] = getattr(
-                t_metrics, f"median_{metric_attribute_name}_ms"
+            o_result[f"median_{metric_attribute_name}_ms"] = getattr(
+                o_metrics, f"median_{metric_attribute_name}_ms"
             )
-            result_t[f"std_{metric_attribute_name}_ms"] = getattr(
-                t_metrics, f"std_{metric_attribute_name}_ms"
+            o_result[f"std_{metric_attribute_name}_ms"] = getattr(
+                o_metrics, f"std_{metric_attribute_name}_ms"
             )
-            for p, value in getattr(t_metrics, f"percentiles_{metric_attribute_name}_ms"):
+            for p, value in getattr(o_metrics, f"percentiles_{metric_attribute_name}_ms"):
                 p_word = str(int(p)) if int(p) == p else str(p)
                 print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
-                result_t[f"p{p_word}_{metric_attribute_name}_ms"] = value
+                o_result[f"p{p_word}_{metric_attribute_name}_ms"] = value
 
         if task_type == TaskType.GENERATION:
-            process_one_metric_trim("itl", "ITL", "Inter-token Latency")
+            process_one_metric_trim(ft_metrics, ft_result, "ttft", "TTFT", "Time to First Token")
+            process_one_metric_trim(t_metrics, t_result, "itl", "ITL", "Inter-token Latency")
+            process_one_metric_trim(ft_metrics, ft_result, "e2el", "E2EL", "End-to-end Latency")
 
         print("=" * 50)
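
Note (illustrative, not part of the patch): the Gutenberg cleaning and chunking logic above is easiest to see on a toy input. The sketch below re-implements the two helpers as free functions and fakes token IDs with plain integers instead of a real tokenizer; the sample book text and all names here are made up for the example.

# Standalone sketch mirroring GutenbergDataset.clean_gutenberg_text /
# chunk_tokens. The real code uses tokenizer(text).input_ids and
# tokenizer.decode(); plain integers stand in for token IDs here.
import re


def clean_gutenberg_text(text: str) -> str:
    """Trim Project Gutenberg boilerplate and normalize whitespace."""
    start = re.search(r"\*\*\* START OF.*?\*\*\*", text, re.IGNORECASE | re.DOTALL)
    end = re.search(r"\*\*\* END OF.*?\*\*\*", text, re.IGNORECASE | re.DOTALL)
    content = text[start.end():end.start()] if start and end else text
    content = content.replace("\r\n", "\n").replace("\r", "\n")
    content = re.sub(r"\n{2,}", "\n\n", content)
    content = re.sub(r"[ \t]{2,}", " ", content)
    return content.strip()


def chunk_tokens(input_ids: list[int], input_len: int = 4000) -> list[list[int]]:
    """Split token IDs into consecutive chunks of at most `input_len` tokens."""
    return [input_ids[i:i + input_len] for i in range(0, len(input_ids), input_len)]


book = (
    "*** START OF THE PROJECT GUTENBERG EBOOK EXAMPLE ***\r\n"
    "Call me   Ishmael.\r\n\r\n\r\nSome years ago...\r\n"
    "*** END OF THE PROJECT GUTENBERG EBOOK EXAMPLE ***"
)
print(clean_gutenberg_text(book))   # -> "Call me Ishmael.\n\nSome years ago..."

fake_token_ids = list(range(10))    # stand-in for tokenizer(text).input_ids
print(chunk_tokens(fake_token_ids, input_len=4))
# -> [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
# The sampler iterates range(len(chunks) - 1), so the short trailing chunk
# ([8, 9] here) is dropped and every prompt has exactly `input_len` tokens.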
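
Note (illustrative, not part of the patch): how --custom-input-len is applied per prompt in CustomDataset.sample, assuming the skip-or-truncate rule above. The helper name and toy token IDs are invented for the example; the real code tokenizes and decodes with the HF tokenizer.

# Sketch of the per-prompt rule: shorter prompts are skipped, longer ones are
# truncated to exactly input_len tokens; with the flag unset the prompt is
# kept unchanged.
def normalize_prompt(prompt_ids: list[int], input_len: int | None) -> list[int] | None:
    if input_len is None:
        return prompt_ids            # flag not given: keep the prompt as-is
    if len(prompt_ids) < input_len:
        return None                  # too short: skip this request
    return prompt_ids[:input_len]    # long enough: truncate to input_len


print(normalize_prompt(list(range(6)), 4))     # [0, 1, 2, 3]
print(normalize_prompt(list(range(2)), 4))     # None (request skipped)
print(normalize_prompt(list(range(2)), None))  # [0, 1]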
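
Note (illustrative, not part of the patch): the serve.py changes compute ft_metrics from "full" requests, i.e. requests whose entire lifetime falls inside the measurement window between the warm-up and cool-down sentinels. A minimal sketch of that selection with made-up timings; only the field names start_time and latency mirror RequestFuncOutput, everything else is hypothetical.

# Sketch of the window selection behind full_effective_outputs.
from dataclasses import dataclass


@dataclass
class Req:
    start_time: float
    latency: float


def fully_inside_window(reqs: list[Req], warmup_sentinel: float,
                        cooldown_sentinel: float) -> list[Req]:
    """Keep only requests that both started and finished inside the window."""
    return [
        r for r in reqs
        if r.start_time >= warmup_sentinel
        and r.start_time + r.latency <= cooldown_sentinel
    ]


benchmark_start, benchmark_end = 0.0, 100.0
warmup_sentinel = benchmark_start + 10.0      # e.g. --warmup-time 10
cooldown_sentinel = benchmark_end - 10.0      # e.g. --cooldown-time 10

reqs = [Req(5.0, 3.0), Req(12.0, 4.0), Req(85.0, 8.0), Req(40.0, 2.0)]
kept = fully_inside_window(reqs, warmup_sentinel, cooldown_sentinel)
print([(r.start_time, r.latency) for r in kept])   # [(12.0, 4.0), (40.0, 2.0)]
# These full requests feed ft_metrics (TTFT/E2EL); requests straddling the
# window edges only contribute through the token-level trimming path
# (effective_outputs / t_metrics).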