Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1055,3 +1055,10 @@
- "Enable VLLM_USE_FLASHINFER_MOE_INT4=1 for Kimi K2.5 INT4 B200 benchmark"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/935

- config-keys:
- gptoss-fp4-b200-vllm
description:
- "test"
pr-link: test


8 changes: 4 additions & 4 deletions utils/bench_serving/benchmark_serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -930,18 +930,18 @@ def main(args: argparse.Namespace):
parser.add_argument(
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
default="ttft,tpot,itl,e2el",
help="Comma-separated list of selected metrics to report percentiles. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
"Default value is \"ttft,tpot,itl,e2el\".")
parser.add_argument(
"--metric-percentiles",
type=str,
default="99",
default="50,90,99,99.9",
help="Comma-separated list of percentiles for selected metrics. "
"To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
"Default value is \"99\". "
"Default value is \"50,90,99,99.9\". "
"Use \"--percentile-metrics\" to select metrics.",
)
parser.add_argument(
Expand Down
55 changes: 46 additions & 9 deletions utils/summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,19 @@
EP = "EP"
DP_ATTENTION = "DP Attention"
CONC = "Conc"
TTFT = "TTFT (ms)"
TTFT_MEDIAN = "TTFT Median (ms)"
TTFT_P90 = "TTFT P90 (ms)"
TTFT_P99 = "TTFT P99 (ms)"
TTFT_P999 = "TTFT P99.9 (ms)"
TPOT = "TPOT (ms)"
INTERACTIVITY = "Interactivity (tok/s/user)"
E2EL = "E2EL (s)"
INTVTY_MEDIAN = "Intvty Median (tok/s/user)"
INTVTY_P90 = "Intvty P90 (tok/s/user)"
INTVTY_P99 = "Intvty P99 (tok/s/user)"
INTVTY_P999 = "Intvty P99.9 (tok/s/user)"
E2EL_MEDIAN = "E2EL Median (s)"
E2EL_P90 = "E2EL P90 (s)"
E2EL_P99 = "E2EL P99 (s)"
E2EL_P999 = "E2EL P99.9 (s)"
TPUT_PER_GPU = "TPUT per GPU"
OUTPUT_TPUT_PER_GPU = "Output TPUT per GPU"
INPUT_TPUT_PER_GPU = "Input TPUT per GPU"
Expand Down Expand Up @@ -74,7 +83,12 @@ def main():

single_node_headers = [
MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, DP_ATTENTION,
CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
CONC,
TTFT_MEDIAN, TTFT_P90, TTFT_P99, TTFT_P999,
TPOT,
INTVTY_MEDIAN, INTVTY_P90, INTVTY_P99, INTVTY_P999,
E2EL_MEDIAN, E2EL_P90, E2EL_P99, E2EL_P999,
TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
]

single_node_rows = [
Expand All @@ -91,9 +105,18 @@ def main():
r['dp_attention'],
r['conc'],
f"{r['median_ttft'] * 1000:.4f}",
f"{r.get('p90_ttft', 0) * 1000:.4f}",
f"{r.get('p99_ttft', 0) * 1000:.4f}",
f"{r.get('p99.9_ttft', 0) * 1000:.4f}",
f"{r['median_tpot'] * 1000:.4f}",
f"{r['median_intvty']:.4f}",
f"{r['median_e2el']:.4f}",
f"{r.get('median_intvty', 0):.4f}",
f"{r.get('p90_intvty', 0):.4f}",
f"{r.get('p99_intvty', 0):.4f}",
f"{r.get('p99.9_intvty', 0):.4f}",
f"{r.get('median_e2el', 0):.4f}",
f"{r.get('p90_e2el', 0):.4f}",
f"{r.get('p99_e2el', 0):.4f}",
f"{r.get('p99.9_e2el', 0):.4f}",
f"{r['tput_per_gpu']:.4f}",
f"{r['output_tput_per_gpu']:.4f}",
f"{r['input_tput_per_gpu']:.4f}",
Expand All @@ -114,7 +137,12 @@ def main():
MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL,
PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, PREFILL_GPUS,
DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS,
CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
CONC,
TTFT_MEDIAN, TTFT_P90, TTFT_P99, TTFT_P999,
TPOT,
INTVTY_MEDIAN, INTVTY_P90, INTVTY_P99, INTVTY_P999,
E2EL_MEDIAN, E2EL_P90, E2EL_P99, E2EL_P999,
TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU
]

multinode_rows = [
Expand All @@ -138,9 +166,18 @@ def main():
r['num_decode_gpu'],
r['conc'],
f"{r['median_ttft'] * 1000:.4f}",
f"{r.get('p90_ttft', 0) * 1000:.4f}",
f"{r.get('p99_ttft', 0) * 1000:.4f}",
f"{r.get('p99.9_ttft', 0) * 1000:.4f}",
f"{r['median_tpot'] * 1000:.4f}",
f"{r['median_intvty']:.4f}",
f"{r['median_e2el']:.4f}",
f"{r.get('median_intvty', 0):.4f}",
f"{r.get('p90_intvty', 0):.4f}",
f"{r.get('p99_intvty', 0):.4f}",
f"{r.get('p99.9_intvty', 0):.4f}",
f"{r.get('median_e2el', 0):.4f}",
f"{r.get('p90_e2el', 0):.4f}",
f"{r.get('p99_e2el', 0):.4f}",
f"{r.get('p99.9_e2el', 0):.4f}",
f"{r['tput_per_gpu']:.4f}",
f"{r['output_tput_per_gpu']:.4f}",
f"{r['input_tput_per_gpu']:.4f}",
Expand Down
Loading