From 80a6fed4ecac3fe914157003b9038a40d43dd4cc Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 10 Sep 2025 00:16:19 -0700 Subject: [PATCH 01/16] quick fix Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/convert-results-json-to-markdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index a655a650cb32..6e3dd8d70e5e 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -368,7 +368,7 @@ def parse_client_command(cmd: str) -> dict[str, Any]: # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", # we want to turn it into "8xGPUTYPE" df["GPU"] = df["GPU"].apply( - lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}" + lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0]) ) # get markdown tables From 89d54aa54a185fb95543eb4f3aaafb0e7ce902b3 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 17 Sep 2025 12:01:16 -0700 Subject: [PATCH 02/16] seperate model into different files Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 54 ++++++++++--------- .../convert-results-json-to-markdown.py | 2 + 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 5ea5a50a258a..bd0c102685cf 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -278,30 +278,36 @@ def split_json_by_tp_pp( output_df_sorted = output_df.sort_values(by=existing_group_cols) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: + group_name = ",".join(map(str, name)).replace(",", "_").replace("/","-") + group_html_name = "perf_comparison_" + group_name + ".html" html = group.to_html() text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) - - if plot and plotly_found: - import plotly.express as px - - df = group[raw_data_cols] - df_sorted = df.sort_values(by=info_cols[y_axis_index]) - # Melt DataFrame for plotting - df_melted = df_sorted.melt( - id_vars=info_cols[y_axis_index], - var_name="Configuration", - value_name=data_cols_to_compare[i], - ) - title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] - # Create Plotly line chart - fig = px.line( - df_melted, - x=info_cols[y_axis_index], - y=data_cols_to_compare[i], - color="Configuration", - title=title, - markers=True, - ) - # Export to HTML - text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) + with open(group_html_name, "a") as sub_text_file: + sub_text_file.write(html_msgs_for_data_cols[i]) + sub_text_file.write(html) + + if plot and plotly_found: + import plotly.express as px + + df = group[raw_data_cols] + df_sorted = df.sort_values(by=info_cols[y_axis_index]) + # Melt DataFrame for plotting + df_melted = df_sorted.melt( + id_vars=info_cols[y_axis_index], + var_name="Configuration", + value_name=data_cols_to_compare[i], + ) + title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] + # Create Plotly line chart + fig = px.line( + df_melted, + x=info_cols[y_axis_index], + y=data_cols_to_compare[i], + 
color="Configuration", + title=title, + markers=True, + ) + # Export to HTML + text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) + sub_text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 6e3dd8d70e5e..a7544aeef4c7 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -63,9 +63,11 @@ "mean_ttft_ms": "Mean TTFT (ms)", "median_ttft_ms": "Median TTFT (ms)", "p99_ttft_ms": "P99 TTFT (ms)", + "std_ttft_ms": "STD TTFT (ms)", "mean_tpot_ms": "Mean TPOT (ms)", "median_tpot_ms": "Median", "p99_tpot_ms": "P99", + "std_tpot_ms": "STD TPOT (ms)", "mean_itl_ms": "Mean ITL (ms)", "median_itl_ms": "Median ITL (ms)", "p99_itl_ms": "P99 ITL (ms)", From 1cd2a6fd5cb3fcd26ae5b8614f6d7658e2aab23d Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 17 Sep 2025 12:28:38 -0700 Subject: [PATCH 03/16] add SLA lines Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index bd0c102685cf..a0ba289deb40 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -198,6 +198,25 @@ def split_json_by_tp_pp( return saved_paths +def _add_limit_line(fig, y_value, label): + # Visible dashed line + annotation + fig.add_hline( + y=y_value, + line_dash="dash", + line_color="red" if "ttft" in label.lower() else "blue", + annotation_text=f"{label}: {y_value} ms", + annotation_position="top left", + ) + # Optional: add a legend item (as a transparent helper trace) + if plot and plotly_found: + import plotly.graph_objects as go + fig.add_trace(go.Scatter( + x=[None], y=[None], + mode="lines", + line=dict(dash="dash", + color="red" if "ttft" in label.lower() else "blue"), + name=f"{label}" + )) if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -220,6 +239,12 @@ def split_json_by_tp_pp( default="# of max concurrency.", help="column name to use as X Axis in comparison graph", ) + parser.add_argument("--ttft-max-ms", type=float, default=5000.0, + help="Reference limit for TTFT plots (ms)") + parser.add_argument("--tpot-max-ms", type=float, default=150.0, + help="Reference limit for TPOT plots (ms)") + + args = parser.parse_args() drop_column = "P99" @@ -308,6 +333,13 @@ def split_json_by_tp_pp( title=title, markers=True, ) + + # ---- Add threshold lines based on metric name ---- + if i == 1: + _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") + if i == 2: + _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") + # Export to HTML text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) sub_text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) From e1bd425d93aecffa38002c705de342bde6f2bb53 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 17 Sep 2025 14:15:51 -0700 Subject: [PATCH 04/16] highlight cell within SLA Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 41 +++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git 
a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index a0ba289deb40..aafdbd054a5e 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -218,6 +218,30 @@ def _add_limit_line(fig, y_value, label): name=f"{label}" )) + +def _find_concurrency_col(df: pd.DataFrame) -> str: + for c in ["# of max concurrency.", "# of max concurrency", "Max Concurrency", + "max_concurrency", "Concurrency"]: + if c in df.columns: + return c + # Fallback: guess an integer-like column (harmless if unused) + for c in df.columns: + if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1: + return c + return "# of max concurrency." + +def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.style.Styler": + """Highlight numeric per-configuration columns with value <= threshold.""" + conc_col = _find_concurrency_col(df) + key_cols = [c for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] if c in df.columns] + conf_cols = [c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")] + conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] + return df.style.map( + lambda v: "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) and v <= threshold else "", + subset=conf_cols + ) + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -305,7 +329,18 @@ def _add_limit_line(fig, y_value, label): for name, group in output_groups: group_name = ",".join(map(str, name)).replace(",", "_").replace("/","-") group_html_name = "perf_comparison_" + group_name + ".html" - html = group.to_html() + + + metric_name = str(data_cols_to_compare[i]).lower() + if "tok/s" in metric_name: + html = group.to_html() + elif "ttft" in metric_name: + styler = _highlight_threshold(group, args.ttft_max_ms) + html = styler.to_html(table_attributes='border="1" class="dataframe"') + elif "tpot" in metric_name or "median" in metric_name: + styler = _highlight_threshold(group, args.tpot_max_ms) + html = styler.to_html(table_attributes='border="1" class="dataframe"') + text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) with open(group_html_name, "a") as sub_text_file: @@ -335,9 +370,9 @@ def _add_limit_line(fig, y_value, label): ) # ---- Add threshold lines based on metric name ---- - if i == 1: + if "ttft" in metric_name: _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") - if i == 2: + elif "tpot" in metric_name or "median" in metric_name: _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") # Export to HTML From 2500b6ed7f3a99d468b69e4776e0fe9edc4256ed Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 17 Sep 2025 14:21:21 -0700 Subject: [PATCH 05/16] sorted by x axis value Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .buildkite/nightly-benchmarks/scripts/compare-json-results.py | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index aafdbd054a5e..1cd7433e21f1 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -325,6 +325,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s f"but DataFrame has: {list(output_df.columns)}" ) output_df_sorted = 
output_df.sort_values(by=existing_group_cols) + output_df_sorted = output_df.sort_values(by=args.xaxis) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: group_name = ",".join(map(str, name)).replace(",", "_").replace("/","-") From 28a696dc6fa4260111c332a21e9d542399a76b0e Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 17 Sep 2025 16:26:32 -0700 Subject: [PATCH 06/16] reduce TTFT SLA Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .buildkite/nightly-benchmarks/scripts/compare-json-results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 1cd7433e21f1..a6f897b9bdab 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -263,7 +263,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s default="# of max concurrency.", help="column name to use as X Axis in comparison graph", ) - parser.add_argument("--ttft-max-ms", type=float, default=5000.0, + parser.add_argument("--ttft-max-ms", type=float, default=3000.0, help="Reference limit for TTFT plots (ms)") parser.add_argument("--tpot-max-ms", type=float, default=150.0, help="Reference limit for TPOT plots (ms)") From b06aab2fc0f1b4b7756c18b9ad0aa451b01c7d68 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 17 Sep 2025 17:05:21 -0700 Subject: [PATCH 07/16] change to compare p99 Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index a6f897b9bdab..841bab98e301 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -283,11 +283,17 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s "# of max concurrency.", "qps", ] - data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] + #data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] + #html_msgs_for_data_cols = [ + # "Compare Output Tokens /n", + # "Median TTFT /n", + # "Median TPOT /n", + #] + data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"] html_msgs_for_data_cols = [ "Compare Output Tokens /n", - "Median TTFT /n", - "Median TPOT /n", + "P99 TTFT /n", + "P99 TPOT /n", ] if len(args.file) == 1: @@ -300,7 +306,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s plot = args.plot # For Plot feature, assign y axis from one of info_cols y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6 - with open("perf_comparison.html", "w") as text_file: + with open("perf_comparison.html", "a") as text_file: for i in range(len(data_cols_to_compare)): output_df, raw_data_cols = compare_data_columns( files, @@ -338,7 +344,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s elif "ttft" in metric_name: styler = _highlight_threshold(group, args.ttft_max_ms) html = styler.to_html(table_attributes='border="1" class="dataframe"') - elif "tpot" in metric_name or "median" in metric_name: + elif "tpot" 
in metric_name or "median" in metric_name or "p99" in metric_name: styler = _highlight_threshold(group, args.tpot_max_ms) html = styler.to_html(table_attributes='border="1" class="dataframe"') @@ -373,7 +379,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s # ---- Add threshold lines based on metric name ---- if "ttft" in metric_name: _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") - elif "tpot" in metric_name or "median" in metric_name: + elif "tpot" in metric_name or "median" in metric_name or "p99" in metric_name : _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") # Export to HTML From f80f0ded1fcc83d601a15238d91ead6732691cb3 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Thu, 18 Sep 2025 13:30:53 -0700 Subject: [PATCH 08/16] make p99/median both available for latency Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 841bab98e301..bcfe3d1a32dd 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -263,6 +263,13 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s default="# of max concurrency.", help="column name to use as X Axis in comparison graph", ) + parser.add_argument( + "-l", + "--latency", + type=str, + default="p99", + help="take median|p99 for latency like TTFT/TPOT", + ) parser.add_argument("--ttft-max-ms", type=float, default=3000.0, help="Reference limit for TTFT plots (ms)") parser.add_argument("--tpot-max-ms", type=float, default=150.0, @@ -283,18 +290,22 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s "# of max concurrency.", "qps", ] - #data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] - #html_msgs_for_data_cols = [ - # "Compare Output Tokens /n", - # "Median TTFT /n", - # "Median TPOT /n", - #] - data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"] - html_msgs_for_data_cols = [ - "Compare Output Tokens /n", - "P99 TTFT /n", - "P99 TPOT /n", - ] + + if "median" in args.latency: + data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] + html_msgs_for_data_cols = [ + "Compare Output Tokens /n", + "Median TTFT /n", + "Median TPOT /n", + ] + drop_column = "P99" + elif "p99" in args.latency: + data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"] + html_msgs_for_data_cols = [ + "Compare Output Tokens /n", + "P99 TTFT /n", + "P99 TPOT /n", + ] if len(args.file) == 1: files = split_json_by_tp_pp(args.file[0], output_root="splits") From 0555169c0bc52b313d09fd63f66100ca01881b8c Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Wed, 1 Oct 2025 18:22:57 -0700 Subject: [PATCH 09/16] Update .buildkite/nightly-benchmarks/scripts/compare-json-results.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Louie Tsai Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .buildkite/nightly-benchmarks/scripts/compare-json-results.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py 
index bcfe3d1a32dd..17d44890b84d 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -317,7 +317,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s plot = args.plot # For Plot feature, assign y axis from one of info_cols y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6 - with open("perf_comparison.html", "a") as text_file: + with open("perf_comparison.html", "w") as text_file: for i in range(len(data_cols_to_compare)): output_df, raw_data_cols = compare_data_columns( files, @@ -341,7 +341,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s f"Expected subset: {filtered_info_cols}, " f"but DataFrame has: {list(output_df.columns)}" ) - output_df_sorted = output_df.sort_values(by=existing_group_cols) + #output_df_sorted = output_df.sort_values(by=existing_group_cols) output_df_sorted = output_df.sort_values(by=args.xaxis) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: From 598edcba90c9542ea823b6e4a2eee3abfa5c0b5e Mon Sep 17 00:00:00 2001 From: louie-tsai Date: Wed, 1 Oct 2025 22:35:50 -0700 Subject: [PATCH 10/16] keep only 2 decimial Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 17d44890b84d..13c04a912934 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -7,6 +7,7 @@ import pandas as pd +pd.options.display.float_format = "{:.2f}".format plotly_found = util.find_spec("plotly.express") is not None @@ -272,7 +273,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s ) parser.add_argument("--ttft-max-ms", type=float, default=3000.0, help="Reference limit for TTFT plots (ms)") - parser.add_argument("--tpot-max-ms", type=float, default=150.0, + parser.add_argument("--tpot-max-ms", type=float, default=100.0, help="Reference limit for TPOT plots (ms)") @@ -348,15 +349,20 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s group_name = ",".join(map(str, name)).replace(",", "_").replace("/","-") group_html_name = "perf_comparison_" + group_name + ".html" - metric_name = str(data_cols_to_compare[i]).lower() if "tok/s" in metric_name: html = group.to_html() elif "ttft" in metric_name: - styler = _highlight_threshold(group, args.ttft_max_ms) + styler = ( + _highlight_threshold(group, args.ttft_max_ms) + .format({c: "{:.2f}" for c in group.select_dtypes("number").columns}, na_rep="—") + ) html = styler.to_html(table_attributes='border="1" class="dataframe"') elif "tpot" in metric_name or "median" in metric_name or "p99" in metric_name: - styler = _highlight_threshold(group, args.tpot_max_ms) + styler = ( + _highlight_threshold(group, args.tpot_max_ms) + .format({c: "{:.2f}" for c in group.select_dtypes("number").columns}, na_rep="—") + ) html = styler.to_html(table_attributes='border="1" class="dataframe"') text_file.write(html_msgs_for_data_cols[i]) From 5c4b6579635ec94e29f7f47640e3c6943ce96f14 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 1 Oct 2025 23:01:05 -0700 Subject: [PATCH 11/16] fix for pre-commit 
Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 119 ++++++++++++------ 1 file changed, 84 insertions(+), 35 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 13c04a912934..0014140474ee 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -199,6 +199,7 @@ def split_json_by_tp_pp( return saved_paths + def _add_limit_line(fig, y_value, label): # Visible dashed line + annotation fig.add_hline( @@ -211,18 +212,28 @@ def _add_limit_line(fig, y_value, label): # Optional: add a legend item (as a transparent helper trace) if plot and plotly_found: import plotly.graph_objects as go - fig.add_trace(go.Scatter( - x=[None], y=[None], - mode="lines", - line=dict(dash="dash", - color="red" if "ttft" in label.lower() else "blue"), - name=f"{label}" - )) + + fig.add_trace( + go.Scatter( + x=[None], + y=[None], + mode="lines", + line=dict( + dash="dash", color="red" if "ttft" in label.lower() else "blue" + ), + name=f"{label}", + ) + ) def _find_concurrency_col(df: pd.DataFrame) -> str: - for c in ["# of max concurrency.", "# of max concurrency", "Max Concurrency", - "max_concurrency", "Concurrency"]: + for c in [ + "# of max concurrency.", + "# of max concurrency", + "Max Concurrency", + "max_concurrency", + "Concurrency", + ]: if c in df.columns: return c # Fallback: guess an integer-like column (harmless if unused) @@ -231,15 +242,26 @@ def _find_concurrency_col(df: pd.DataFrame) -> str: return c return "# of max concurrency." -def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.style.Styler": + +def _highlight_threshold( + df: pd.DataFrame, threshold: float +) -> "pd.io.formats.style.Styler": """Highlight numeric per-configuration columns with value <= threshold.""" conc_col = _find_concurrency_col(df) - key_cols = [c for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] if c in df.columns] - conf_cols = [c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")] + key_cols = [ + c + for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] + if c in df.columns + ] + conf_cols = [ + c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio") + ] conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] return df.style.map( - lambda v: "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) and v <= threshold else "", - subset=conf_cols + lambda v: "background-color:#e6ffe6;font-weight:bold;" + if pd.notna(v) and v <= threshold + else "", + subset=conf_cols, ) @@ -271,11 +293,18 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s default="p99", help="take median|p99 for latency like TTFT/TPOT", ) - parser.add_argument("--ttft-max-ms", type=float, default=3000.0, - help="Reference limit for TTFT plots (ms)") - parser.add_argument("--tpot-max-ms", type=float, default=100.0, - help="Reference limit for TPOT plots (ms)") - + parser.add_argument( + "--ttft-max-ms", + type=float, + default=3000.0, + help="Reference limit for TTFT plots (ms)", + ) + parser.add_argument( + "--tpot-max-ms", + type=float, + default=100.0, + help="Reference limit for TPOT plots (ms)", + ) args = parser.parse_args() @@ -342,29 +371,39 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> 
"pd.io.formats.s f"Expected subset: {filtered_info_cols}, " f"but DataFrame has: {list(output_df.columns)}" ) - #output_df_sorted = output_df.sort_values(by=existing_group_cols) + # output_df_sorted = output_df.sort_values(by=existing_group_cols) output_df_sorted = output_df.sort_values(by=args.xaxis) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: - group_name = ",".join(map(str, name)).replace(",", "_").replace("/","-") + group_name = ( + ",".join(map(str, name)).replace(",", "_").replace("/", "-") + ) group_html_name = "perf_comparison_" + group_name + ".html" metric_name = str(data_cols_to_compare[i]).lower() if "tok/s" in metric_name: html = group.to_html() elif "ttft" in metric_name: - styler = ( - _highlight_threshold(group, args.ttft_max_ms) - .format({c: "{:.2f}" for c in group.select_dtypes("number").columns}, na_rep="—") + styler = _highlight_threshold(group, args.ttft_max_ms).format( + {c: "{:.2f}" for c in group.select_dtypes("number").columns}, + na_rep="—", + ) + html = styler.to_html( + table_attributes='border="1" class="dataframe"' ) - html = styler.to_html(table_attributes='border="1" class="dataframe"') - elif "tpot" in metric_name or "median" in metric_name or "p99" in metric_name: - styler = ( - _highlight_threshold(group, args.tpot_max_ms) - .format({c: "{:.2f}" for c in group.select_dtypes("number").columns}, na_rep="—") + elif ( + "tpot" in metric_name + or "median" in metric_name + or "p99" in metric_name + ): + styler = _highlight_threshold(group, args.tpot_max_ms).format( + {c: "{:.2f}" for c in group.select_dtypes("number").columns}, + na_rep="—", ) - html = styler.to_html(table_attributes='border="1" class="dataframe"') - + html = styler.to_html( + table_attributes='border="1" class="dataframe"' + ) + text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) with open(group_html_name, "a") as sub_text_file: @@ -382,7 +421,9 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s var_name="Configuration", value_name=data_cols_to_compare[i], ) - title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] + title = ( + data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] + ) # Create Plotly line chart fig = px.line( df_melted, @@ -396,9 +437,17 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s # ---- Add threshold lines based on metric name ---- if "ttft" in metric_name: _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") - elif "tpot" in metric_name or "median" in metric_name or "p99" in metric_name : + elif ( + "tpot" in metric_name + or "median" in metric_name + or "p99" in metric_name + ): _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") # Export to HTML - text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) - sub_text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) + text_file.write( + fig.to_html(full_html=True, include_plotlyjs="cdn") + ) + sub_text_file.write( + fig.to_html(full_html=True, include_plotlyjs="cdn") + ) From 1bd8e3c06f0aee25a5b2620b36a9e9dc7e2fa0de Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Tue, 7 Oct 2025 13:19:24 -0700 Subject: [PATCH 12/16] make the latency ratio >1 Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../nightly-benchmarks/scripts/compare-json-results.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py 
b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 0014140474ee..c8bf7b045366 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -110,7 +110,10 @@ def compare_data_columns( if len(compare_frames) >= 2: base = compare_frames[0] current = compare_frames[-1] - ratio = current / base + if "P99" in data_column or "Median" in data_column: + ratio = base / current # for latency + else: + ratio = current / base ratio = ratio.mask(base == 0) # avoid inf when baseline is 0 ratio.name = f"Ratio 1 vs {len(compare_frames)}" frames.append(ratio) @@ -406,7 +409,7 @@ def _highlight_threshold( text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) - with open(group_html_name, "a") as sub_text_file: + with open(group_html_name, "a+") as sub_text_file: sub_text_file.write(html_msgs_for_data_cols[i]) sub_text_file.write(html) From ed7d3b219d4c52017daca9b111fde71c4e25cca6 Mon Sep 17 00:00:00 2001 From: louie-tsai Date: Thu, 9 Oct 2025 10:26:35 -0700 Subject: [PATCH 13/16] Add vllm collect-env Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../nightly-benchmarks/scripts/run-performance-benchmarks.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index c64e5638029e..5a47576483bb 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -471,6 +471,11 @@ main() { mkdir -p $RESULTS_FOLDER QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + # dump vllm info via vllm collect-env + env_output=$(vllm collect-env) + + echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt" + # benchmarking run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" From e2aaa8d27e474a3107b061d744cc2acd1350ac21 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Thu, 9 Oct 2025 15:24:59 -0700 Subject: [PATCH 14/16] add TP4 test cases according to findings from AWS benchmarking Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../tests/serving-tests-cpu-snc3.json | 203 ++++++++++++++++++ .../tests/serving-tests-cpu.json | 51 ++++- 2 files changed, 246 insertions(+), 8 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json index ce396d6e54f2..0b1a42e79025 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -95,6 +95,38 @@ "num_prompts": 200 } }, + { + "test_name": "serving_llama8B_bf16_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 
256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, { "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt", "qps_list": ["inf"], @@ -233,6 +265,41 @@ "num_prompts": 1000 } }, + { + "test_name": "serving_llama8B_bf16_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, { "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128", "qps_list": ["inf"], @@ -365,6 +432,38 @@ "num_prompts": 200 } }, + { + "test_name": "serving_llama8B_int8_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, { "test_name": "serving_llama8B_int8_tp2pp3_sharegpt", "qps_list": ["inf"], @@ -503,6 +602,41 @@ "num_prompts": 1000 } }, + { + "test_name": "serving_llama8B_int8_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, { 
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128", "qps_list": ["inf"], @@ -638,6 +772,39 @@ "num_prompts": 200 } }, + { + "test_name": "serving_llama8B_int4_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, { "test_name": "serving_llama8B_int4_tp2pp3_sharegpt", "qps_list": ["inf"], @@ -780,6 +947,42 @@ "num_prompts": 1000 } }, + { + "test_name": "serving_llama8B_int4_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, { "test_name": "serving_llama8B_int4_tp2pp3_random_128_128", "qps_list": ["inf"], diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index e21c8df0a9fe..5066f0d51895 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -96,7 +96,7 @@ } }, { - "test_name": "serving_llama8B_tp4_random_1024_128", + "test_name": "serving_llama8B_tp1_random_128_128", "qps_list": [1, 4, 16, "inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -108,7 +108,42 @@ }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 200 + } + }, + { + 
"test_name": "serving_llama8B_tp2_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, @@ -124,14 +159,14 @@ "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", - "random-input-len": 1024, + "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 100 + "num_prompts": 200 } }, { - "test_name": "serving_llama8B_pp6_random_1024_128", + "test_name": "serving_llama8B_tp4_random_128_128", "qps_list": [1, 4, 16, "inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -143,7 +178,7 @@ }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "pipeline_parallel_size": 6, + "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, @@ -159,10 +194,10 @@ "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", - "random-input-len": 1024, + "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 100 + "num_prompts": 200 } } ] From 92d187b998eac725a9a984cc8d060c64e225b39d Mon Sep 17 00:00:00 2001 From: louie-tsai Date: Mon, 13 Oct 2025 16:27:43 -0700 Subject: [PATCH 15/16] change serving-test-cpu.json for R8i.24xlarge Signed-off-by: Tsai, Louie --- .../tests/serving-tests-cpu.json | 83 ++----------------- 1 file changed, 8 insertions(+), 75 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index 5066f0d51895..844726635a7e 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -2,7 +2,7 @@ { "test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -28,13 +28,13 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 + "num_prompts": 32 } }, { "test_name": "serving_llama8B_tp2_sharegpt", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -60,45 +60,13 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_tp4_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - 
"block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 + "num_prompts": 32 } }, { "test_name": "serving_llama8B_tp1_random_128_128", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -127,13 +95,13 @@ "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 200 + "num_prompts": 32 } }, { "test_name": "serving_llama8B_tp2_random_128_128", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -162,42 +130,7 @@ "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_tp4_random_128_128", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 200 + "num_prompts": 32 } } ] From d8314706ea485ce82d14f04d1b05bffc1117e40b Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 15 Oct 2025 08:58:54 -0700 Subject: [PATCH 16/16] add two more use cases according to discussions Signed-off-by: Tsai, Louie --- .../tests/latency-tests-cpu.json | 30 ++-- .../tests/serving-tests-cpu.json | 140 ++++++++++++++++++ .../tests/throughput-tests-cpu.json | 31 ++-- 3 files changed, 166 insertions(+), 35 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json index 569117aae852..77d1694ec864 100644 --- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json @@ -1,28 +1,24 @@ [ { - "test_name": "latency_llama8B_tp1", + "test_name": "latency_llama8B_tp2", "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "load_format": "dummy", - "num_iters_warmup": 5, - "num_iters": 15 - } - }, - { - "test_name": "latency_llama8B_tp4", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": 
{ - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "num_iters_warmup": 5, "num_iters": 15 } diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index 844726635a7e..f792956f3947 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -132,5 +132,145 @@ "ignore-eos": "", "num_prompts": 32 } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048, + "ignore-eos": "", + "num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_2048", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048, + "ignore-eos": "", + "num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_128", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128, + "ignore-eos": "", + 
"num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp2_random_2048_128", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 32 + } } ] diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json index 48c015aa8403..dc214ddfb27e 100644 --- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json @@ -1,29 +1,24 @@ [ { - "test_name": "throughput_llama8B_tp1", + "test_name": "throughput_llama8B_tp2", "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_llama8B_tp4", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm"