From 80a6fed4ecac3fe914157003b9038a40d43dd4cc Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 10 Sep 2025 00:16:19 -0700 Subject: [PATCH 01/16] quick fix Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/convert-results-json-to-markdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index a655a650cb32..6e3dd8d70e5e 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -368,7 +368,7 @@ def parse_client_command(cmd: str) -> dict[str, Any]: # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", # we want to turn it into "8xGPUTYPE" df["GPU"] = df["GPU"].apply( - lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}" + lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0]) ) # get markdown tables From 89d54aa54a185fb95543eb4f3aaafb0e7ce902b3 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 17 Sep 2025 12:01:16 -0700 Subject: [PATCH 02/16] seperate model into different files Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 54 ++++++++++--------- .../convert-results-json-to-markdown.py | 2 + 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 5ea5a50a258a..bd0c102685cf 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -278,30 +278,36 @@ def split_json_by_tp_pp( output_df_sorted = output_df.sort_values(by=existing_group_cols) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: + group_name = ",".join(map(str, name)).replace(",", "_").replace("/","-") + group_html_name = "perf_comparison_" + group_name + ".html" html = group.to_html() text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) - - if plot and plotly_found: - import plotly.express as px - - df = group[raw_data_cols] - df_sorted = df.sort_values(by=info_cols[y_axis_index]) - # Melt DataFrame for plotting - df_melted = df_sorted.melt( - id_vars=info_cols[y_axis_index], - var_name="Configuration", - value_name=data_cols_to_compare[i], - ) - title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] - # Create Plotly line chart - fig = px.line( - df_melted, - x=info_cols[y_axis_index], - y=data_cols_to_compare[i], - color="Configuration", - title=title, - markers=True, - ) - # Export to HTML - text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) + with open(group_html_name, "a") as sub_text_file: + sub_text_file.write(html_msgs_for_data_cols[i]) + sub_text_file.write(html) + + if plot and plotly_found: + import plotly.express as px + + df = group[raw_data_cols] + df_sorted = df.sort_values(by=info_cols[y_axis_index]) + # Melt DataFrame for plotting + df_melted = df_sorted.melt( + id_vars=info_cols[y_axis_index], + var_name="Configuration", + value_name=data_cols_to_compare[i], + ) + title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] + # Create Plotly line chart + fig = px.line( + df_melted, + x=info_cols[y_axis_index], + y=data_cols_to_compare[i], + 
color="Configuration", + title=title, + markers=True, + ) + # Export to HTML + text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) + sub_text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py index 6e3dd8d70e5e..a7544aeef4c7 100644 --- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -63,9 +63,11 @@ "mean_ttft_ms": "Mean TTFT (ms)", "median_ttft_ms": "Median TTFT (ms)", "p99_ttft_ms": "P99 TTFT (ms)", + "std_ttft_ms": "STD TTFT (ms)", "mean_tpot_ms": "Mean TPOT (ms)", "median_tpot_ms": "Median", "p99_tpot_ms": "P99", + "std_tpot_ms": "STD TPOT (ms)", "mean_itl_ms": "Mean ITL (ms)", "median_itl_ms": "Median ITL (ms)", "p99_itl_ms": "P99 ITL (ms)", From 1cd2a6fd5cb3fcd26ae5b8614f6d7658e2aab23d Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 17 Sep 2025 12:28:38 -0700 Subject: [PATCH 03/16] add SLA lines Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index bd0c102685cf..a0ba289deb40 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -198,6 +198,25 @@ def split_json_by_tp_pp( return saved_paths +def _add_limit_line(fig, y_value, label): + # Visible dashed line + annotation + fig.add_hline( + y=y_value, + line_dash="dash", + line_color="red" if "ttft" in label.lower() else "blue", + annotation_text=f"{label}: {y_value} ms", + annotation_position="top left", + ) + # Optional: add a legend item (as a transparent helper trace) + if plot and plotly_found: + import plotly.graph_objects as go + fig.add_trace(go.Scatter( + x=[None], y=[None], + mode="lines", + line=dict(dash="dash", + color="red" if "ttft" in label.lower() else "blue"), + name=f"{label}" + )) if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -220,6 +239,12 @@ def split_json_by_tp_pp( default="# of max concurrency.", help="column name to use as X Axis in comparison graph", ) + parser.add_argument("--ttft-max-ms", type=float, default=5000.0, + help="Reference limit for TTFT plots (ms)") + parser.add_argument("--tpot-max-ms", type=float, default=150.0, + help="Reference limit for TPOT plots (ms)") + + args = parser.parse_args() drop_column = "P99" @@ -308,6 +333,13 @@ def split_json_by_tp_pp( title=title, markers=True, ) + + # ---- Add threshold lines based on metric name ---- + if i == 1: + _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") + if i == 2: + _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") + # Export to HTML text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) sub_text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) From e1bd425d93aecffa38002c705de342bde6f2bb53 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 17 Sep 2025 14:15:51 -0700 Subject: [PATCH 04/16] highlight cell within SLA Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 41 +++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git 
a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index a0ba289deb40..aafdbd054a5e 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -218,6 +218,30 @@ def _add_limit_line(fig, y_value, label): name=f"{label}" )) + +def _find_concurrency_col(df: pd.DataFrame) -> str: + for c in ["# of max concurrency.", "# of max concurrency", "Max Concurrency", + "max_concurrency", "Concurrency"]: + if c in df.columns: + return c + # Fallback: guess an integer-like column (harmless if unused) + for c in df.columns: + if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1: + return c + return "# of max concurrency." + +def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.style.Styler": + """Highlight numeric per-configuration columns with value <= threshold.""" + conc_col = _find_concurrency_col(df) + key_cols = [c for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] if c in df.columns] + conf_cols = [c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")] + conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] + return df.style.map( + lambda v: "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) and v <= threshold else "", + subset=conf_cols + ) + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -305,7 +329,18 @@ def _add_limit_line(fig, y_value, label): for name, group in output_groups: group_name = ",".join(map(str, name)).replace(",", "_").replace("/","-") group_html_name = "perf_comparison_" + group_name + ".html" - html = group.to_html() + + + metric_name = str(data_cols_to_compare[i]).lower() + if "tok/s" in metric_name: + html = group.to_html() + elif "ttft" in metric_name: + styler = _highlight_threshold(group, args.ttft_max_ms) + html = styler.to_html(table_attributes='border="1" class="dataframe"') + elif "tpot" in metric_name or "median" in metric_name: + styler = _highlight_threshold(group, args.tpot_max_ms) + html = styler.to_html(table_attributes='border="1" class="dataframe"') + text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) with open(group_html_name, "a") as sub_text_file: @@ -335,9 +370,9 @@ def _add_limit_line(fig, y_value, label): ) # ---- Add threshold lines based on metric name ---- - if i == 1: + if "ttft" in metric_name: _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") - if i == 2: + elif "tpot" in metric_name or "median" in metric_name: _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") # Export to HTML From 2500b6ed7f3a99d468b69e4776e0fe9edc4256ed Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 17 Sep 2025 14:21:21 -0700 Subject: [PATCH 05/16] sorted by x axis value Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .buildkite/nightly-benchmarks/scripts/compare-json-results.py | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index aafdbd054a5e..1cd7433e21f1 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -325,6 +325,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s f"but DataFrame has: {list(output_df.columns)}" ) output_df_sorted = 
output_df.sort_values(by=existing_group_cols) + output_df_sorted = output_df.sort_values(by=args.xaxis) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: group_name = ",".join(map(str, name)).replace(",", "_").replace("/","-") From 28a696dc6fa4260111c332a21e9d542399a76b0e Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 17 Sep 2025 16:26:32 -0700 Subject: [PATCH 06/16] reduce TTFT SLA Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .buildkite/nightly-benchmarks/scripts/compare-json-results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 1cd7433e21f1..a6f897b9bdab 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -263,7 +263,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s default="# of max concurrency.", help="column name to use as X Axis in comparison graph", ) - parser.add_argument("--ttft-max-ms", type=float, default=5000.0, + parser.add_argument("--ttft-max-ms", type=float, default=3000.0, help="Reference limit for TTFT plots (ms)") parser.add_argument("--tpot-max-ms", type=float, default=150.0, help="Reference limit for TPOT plots (ms)") From b06aab2fc0f1b4b7756c18b9ad0aa451b01c7d68 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 17 Sep 2025 17:05:21 -0700 Subject: [PATCH 07/16] change to compare p99 Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index a6f897b9bdab..841bab98e301 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -283,11 +283,17 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s "# of max concurrency.", "qps", ] - data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] + #data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] + #html_msgs_for_data_cols = [ + # "Compare Output Tokens /n", + # "Median TTFT /n", + # "Median TPOT /n", + #] + data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"] html_msgs_for_data_cols = [ "Compare Output Tokens /n", - "Median TTFT /n", - "Median TPOT /n", + "P99 TTFT /n", + "P99 TPOT /n", ] if len(args.file) == 1: @@ -300,7 +306,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s plot = args.plot # For Plot feature, assign y axis from one of info_cols y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6 - with open("perf_comparison.html", "w") as text_file: + with open("perf_comparison.html", "a") as text_file: for i in range(len(data_cols_to_compare)): output_df, raw_data_cols = compare_data_columns( files, @@ -338,7 +344,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s elif "ttft" in metric_name: styler = _highlight_threshold(group, args.ttft_max_ms) html = styler.to_html(table_attributes='border="1" class="dataframe"') - elif "tpot" in metric_name or "median" in metric_name: + elif "tpot" 
in metric_name or "median" in metric_name or "p99" in metric_name: styler = _highlight_threshold(group, args.tpot_max_ms) html = styler.to_html(table_attributes='border="1" class="dataframe"') @@ -373,7 +379,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s # ---- Add threshold lines based on metric name ---- if "ttft" in metric_name: _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") - elif "tpot" in metric_name or "median" in metric_name: + elif "tpot" in metric_name or "median" in metric_name or "p99" in metric_name : _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") # Export to HTML From f80f0ded1fcc83d601a15238d91ead6732691cb3 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Thu, 18 Sep 2025 13:30:53 -0700 Subject: [PATCH 08/16] make p99/median both available for latency Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 841bab98e301..bcfe3d1a32dd 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -263,6 +263,13 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s default="# of max concurrency.", help="column name to use as X Axis in comparison graph", ) + parser.add_argument( + "-l", + "--latency", + type=str, + default="p99", + help="take median|p99 for latency like TTFT/TPOT", + ) parser.add_argument("--ttft-max-ms", type=float, default=3000.0, help="Reference limit for TTFT plots (ms)") parser.add_argument("--tpot-max-ms", type=float, default=150.0, @@ -283,18 +290,22 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s "# of max concurrency.", "qps", ] - #data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] - #html_msgs_for_data_cols = [ - # "Compare Output Tokens /n", - # "Median TTFT /n", - # "Median TPOT /n", - #] - data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"] - html_msgs_for_data_cols = [ - "Compare Output Tokens /n", - "P99 TTFT /n", - "P99 TPOT /n", - ] + + if "median" in args.latency: + data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"] + html_msgs_for_data_cols = [ + "Compare Output Tokens /n", + "Median TTFT /n", + "Median TPOT /n", + ] + drop_column = "P99" + elif "p99" in args.latency: + data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"] + html_msgs_for_data_cols = [ + "Compare Output Tokens /n", + "P99 TTFT /n", + "P99 TPOT /n", + ] if len(args.file) == 1: files = split_json_by_tp_pp(args.file[0], output_root="splits") From 0555169c0bc52b313d09fd63f66100ca01881b8c Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Wed, 1 Oct 2025 18:22:57 -0700 Subject: [PATCH 09/16] Update .buildkite/nightly-benchmarks/scripts/compare-json-results.py Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Louie Tsai Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .buildkite/nightly-benchmarks/scripts/compare-json-results.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py 
index bcfe3d1a32dd..17d44890b84d 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -317,7 +317,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s plot = args.plot # For Plot feature, assign y axis from one of info_cols y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6 - with open("perf_comparison.html", "a") as text_file: + with open("perf_comparison.html", "w") as text_file: for i in range(len(data_cols_to_compare)): output_df, raw_data_cols = compare_data_columns( files, @@ -341,7 +341,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s f"Expected subset: {filtered_info_cols}, " f"but DataFrame has: {list(output_df.columns)}" ) - output_df_sorted = output_df.sort_values(by=existing_group_cols) + #output_df_sorted = output_df.sort_values(by=existing_group_cols) output_df_sorted = output_df.sort_values(by=args.xaxis) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: From 598edcba90c9542ea823b6e4a2eee3abfa5c0b5e Mon Sep 17 00:00:00 2001 From: louie-tsai Date: Wed, 1 Oct 2025 22:35:50 -0700 Subject: [PATCH 10/16] keep only 2 decimial Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 17d44890b84d..13c04a912934 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -7,6 +7,7 @@ import pandas as pd +pd.options.display.float_format = "{:.2f}".format plotly_found = util.find_spec("plotly.express") is not None @@ -272,7 +273,7 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s ) parser.add_argument("--ttft-max-ms", type=float, default=3000.0, help="Reference limit for TTFT plots (ms)") - parser.add_argument("--tpot-max-ms", type=float, default=150.0, + parser.add_argument("--tpot-max-ms", type=float, default=100.0, help="Reference limit for TPOT plots (ms)") @@ -348,15 +349,20 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s group_name = ",".join(map(str, name)).replace(",", "_").replace("/","-") group_html_name = "perf_comparison_" + group_name + ".html" - metric_name = str(data_cols_to_compare[i]).lower() if "tok/s" in metric_name: html = group.to_html() elif "ttft" in metric_name: - styler = _highlight_threshold(group, args.ttft_max_ms) + styler = ( + _highlight_threshold(group, args.ttft_max_ms) + .format({c: "{:.2f}" for c in group.select_dtypes("number").columns}, na_rep="—") + ) html = styler.to_html(table_attributes='border="1" class="dataframe"') elif "tpot" in metric_name or "median" in metric_name or "p99" in metric_name: - styler = _highlight_threshold(group, args.tpot_max_ms) + styler = ( + _highlight_threshold(group, args.tpot_max_ms) + .format({c: "{:.2f}" for c in group.select_dtypes("number").columns}, na_rep="—") + ) html = styler.to_html(table_attributes='border="1" class="dataframe"') text_file.write(html_msgs_for_data_cols[i]) From 5c4b6579635ec94e29f7f47640e3c6943ce96f14 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 1 Oct 2025 23:01:05 -0700 Subject: [PATCH 11/16] fix for pre-commit 
Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../scripts/compare-json-results.py | 119 ++++++++++++------ 1 file changed, 84 insertions(+), 35 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 13c04a912934..0014140474ee 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -199,6 +199,7 @@ def split_json_by_tp_pp( return saved_paths + def _add_limit_line(fig, y_value, label): # Visible dashed line + annotation fig.add_hline( @@ -211,18 +212,28 @@ def _add_limit_line(fig, y_value, label): # Optional: add a legend item (as a transparent helper trace) if plot and plotly_found: import plotly.graph_objects as go - fig.add_trace(go.Scatter( - x=[None], y=[None], - mode="lines", - line=dict(dash="dash", - color="red" if "ttft" in label.lower() else "blue"), - name=f"{label}" - )) + + fig.add_trace( + go.Scatter( + x=[None], + y=[None], + mode="lines", + line=dict( + dash="dash", color="red" if "ttft" in label.lower() else "blue" + ), + name=f"{label}", + ) + ) def _find_concurrency_col(df: pd.DataFrame) -> str: - for c in ["# of max concurrency.", "# of max concurrency", "Max Concurrency", - "max_concurrency", "Concurrency"]: + for c in [ + "# of max concurrency.", + "# of max concurrency", + "Max Concurrency", + "max_concurrency", + "Concurrency", + ]: if c in df.columns: return c # Fallback: guess an integer-like column (harmless if unused) @@ -231,15 +242,26 @@ def _find_concurrency_col(df: pd.DataFrame) -> str: return c return "# of max concurrency." -def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.style.Styler": + +def _highlight_threshold( + df: pd.DataFrame, threshold: float +) -> "pd.io.formats.style.Styler": """Highlight numeric per-configuration columns with value <= threshold.""" conc_col = _find_concurrency_col(df) - key_cols = [c for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] if c in df.columns] - conf_cols = [c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")] + key_cols = [ + c + for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col] + if c in df.columns + ] + conf_cols = [ + c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio") + ] conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])] return df.style.map( - lambda v: "background-color:#e6ffe6;font-weight:bold;" if pd.notna(v) and v <= threshold else "", - subset=conf_cols + lambda v: "background-color:#e6ffe6;font-weight:bold;" + if pd.notna(v) and v <= threshold + else "", + subset=conf_cols, ) @@ -271,11 +293,18 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s default="p99", help="take median|p99 for latency like TTFT/TPOT", ) - parser.add_argument("--ttft-max-ms", type=float, default=3000.0, - help="Reference limit for TTFT plots (ms)") - parser.add_argument("--tpot-max-ms", type=float, default=100.0, - help="Reference limit for TPOT plots (ms)") - + parser.add_argument( + "--ttft-max-ms", + type=float, + default=3000.0, + help="Reference limit for TTFT plots (ms)", + ) + parser.add_argument( + "--tpot-max-ms", + type=float, + default=100.0, + help="Reference limit for TPOT plots (ms)", + ) args = parser.parse_args() @@ -342,29 +371,39 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> 
"pd.io.formats.s f"Expected subset: {filtered_info_cols}, " f"but DataFrame has: {list(output_df.columns)}" ) - #output_df_sorted = output_df.sort_values(by=existing_group_cols) + # output_df_sorted = output_df.sort_values(by=existing_group_cols) output_df_sorted = output_df.sort_values(by=args.xaxis) output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False) for name, group in output_groups: - group_name = ",".join(map(str, name)).replace(",", "_").replace("/","-") + group_name = ( + ",".join(map(str, name)).replace(",", "_").replace("/", "-") + ) group_html_name = "perf_comparison_" + group_name + ".html" metric_name = str(data_cols_to_compare[i]).lower() if "tok/s" in metric_name: html = group.to_html() elif "ttft" in metric_name: - styler = ( - _highlight_threshold(group, args.ttft_max_ms) - .format({c: "{:.2f}" for c in group.select_dtypes("number").columns}, na_rep="—") + styler = _highlight_threshold(group, args.ttft_max_ms).format( + {c: "{:.2f}" for c in group.select_dtypes("number").columns}, + na_rep="—", + ) + html = styler.to_html( + table_attributes='border="1" class="dataframe"' ) - html = styler.to_html(table_attributes='border="1" class="dataframe"') - elif "tpot" in metric_name or "median" in metric_name or "p99" in metric_name: - styler = ( - _highlight_threshold(group, args.tpot_max_ms) - .format({c: "{:.2f}" for c in group.select_dtypes("number").columns}, na_rep="—") + elif ( + "tpot" in metric_name + or "median" in metric_name + or "p99" in metric_name + ): + styler = _highlight_threshold(group, args.tpot_max_ms).format( + {c: "{:.2f}" for c in group.select_dtypes("number").columns}, + na_rep="—", ) - html = styler.to_html(table_attributes='border="1" class="dataframe"') - + html = styler.to_html( + table_attributes='border="1" class="dataframe"' + ) + text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) with open(group_html_name, "a") as sub_text_file: @@ -382,7 +421,9 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s var_name="Configuration", value_name=data_cols_to_compare[i], ) - title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] + title = ( + data_cols_to_compare[i] + " vs " + info_cols[y_axis_index] + ) # Create Plotly line chart fig = px.line( df_melted, @@ -396,9 +437,17 @@ def _highlight_threshold(df: pd.DataFrame, threshold: float) -> "pd.io.formats.s # ---- Add threshold lines based on metric name ---- if "ttft" in metric_name: _add_limit_line(fig, args.ttft_max_ms, "TTFT limit") - elif "tpot" in metric_name or "median" in metric_name or "p99" in metric_name : + elif ( + "tpot" in metric_name + or "median" in metric_name + or "p99" in metric_name + ): _add_limit_line(fig, args.tpot_max_ms, "TPOT limit") # Export to HTML - text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) - sub_text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn")) + text_file.write( + fig.to_html(full_html=True, include_plotlyjs="cdn") + ) + sub_text_file.write( + fig.to_html(full_html=True, include_plotlyjs="cdn") + ) From 1bd8e3c06f0aee25a5b2620b36a9e9dc7e2fa0de Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Tue, 7 Oct 2025 13:19:24 -0700 Subject: [PATCH 12/16] make the latency ratio >1 Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../nightly-benchmarks/scripts/compare-json-results.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py 
b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py index 0014140474ee..c8bf7b045366 100644 --- a/.buildkite/nightly-benchmarks/scripts/compare-json-results.py +++ b/.buildkite/nightly-benchmarks/scripts/compare-json-results.py @@ -110,7 +110,10 @@ def compare_data_columns( if len(compare_frames) >= 2: base = compare_frames[0] current = compare_frames[-1] - ratio = current / base + if "P99" in data_column or "Median" in data_column: + ratio = base / current # for latency + else: + ratio = current / base ratio = ratio.mask(base == 0) # avoid inf when baseline is 0 ratio.name = f"Ratio 1 vs {len(compare_frames)}" frames.append(ratio) @@ -406,7 +409,7 @@ def _highlight_threshold( text_file.write(html_msgs_for_data_cols[i]) text_file.write(html) - with open(group_html_name, "a") as sub_text_file: + with open(group_html_name, "a+") as sub_text_file: sub_text_file.write(html_msgs_for_data_cols[i]) sub_text_file.write(html) From ed7d3b219d4c52017daca9b111fde71c4e25cca6 Mon Sep 17 00:00:00 2001 From: louie-tsai Date: Thu, 9 Oct 2025 10:26:35 -0700 Subject: [PATCH 13/16] Add vllm collect-env Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../nightly-benchmarks/scripts/run-performance-benchmarks.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index c64e5638029e..5a47576483bb 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -471,6 +471,11 @@ main() { mkdir -p $RESULTS_FOLDER QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + # dump vllm info via vllm collect-env + env_output=$(vllm collect-env) + + echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt" + # benchmarking run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}" run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}" From e2aaa8d27e474a3107b061d744cc2acd1350ac21 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Thu, 9 Oct 2025 15:24:59 -0700 Subject: [PATCH 14/16] add TP4 test cases according to findings from AWS benchmarking Signed-off-by: Tsai, Louie Signed-off-by: louie-tsai Signed-off-by: Tsai, Louie --- .../tests/serving-tests-cpu-snc3.json | 203 ++++++++++++++++++ .../tests/serving-tests-cpu.json | 51 ++++- 2 files changed, 246 insertions(+), 8 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json index ce396d6e54f2..0b1a42e79025 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json @@ -95,6 +95,38 @@ "num_prompts": 200 } }, + { + "test_name": "serving_llama8B_bf16_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 
256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, { "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt", "qps_list": ["inf"], @@ -233,6 +265,41 @@ "num_prompts": 1000 } }, + { + "test_name": "serving_llama8B_bf16_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, { "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128", "qps_list": ["inf"], @@ -365,6 +432,38 @@ "num_prompts": 200 } }, + { + "test_name": "serving_llama8B_int8_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, { "test_name": "serving_llama8B_int8_tp2pp3_sharegpt", "qps_list": ["inf"], @@ -503,6 +602,41 @@ "num_prompts": 1000 } }, + { + "test_name": "serving_llama8B_int8_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, { 
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128", "qps_list": ["inf"], @@ -638,6 +772,39 @@ "num_prompts": 200 } }, + { + "test_name": "serving_llama8B_int4_tp4_sharegpt", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, { "test_name": "serving_llama8B_int4_tp2pp3_sharegpt", "qps_list": ["inf"], @@ -780,6 +947,42 @@ "num_prompts": 1000 } }, + { + "test_name": "serving_llama8B_int4_tp4_random_128_128", + "qps_list": ["inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "quantization": "awq", + "tensor_parallel_size": 4, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 1000 + } + }, { "test_name": "serving_llama8B_int4_tp2pp3_random_128_128", "qps_list": ["inf"], diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index e21c8df0a9fe..5066f0d51895 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -96,7 +96,7 @@ } }, { - "test_name": "serving_llama8B_tp4_random_1024_128", + "test_name": "serving_llama8B_tp1_random_128_128", "qps_list": [1, 4, 16, "inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -108,7 +108,42 @@ }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 200 + } + }, + { + 
"test_name": "serving_llama8B_tp2_random_128_128", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, @@ -124,14 +159,14 @@ "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", - "random-input-len": 1024, + "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 100 + "num_prompts": 200 } }, { - "test_name": "serving_llama8B_pp6_random_1024_128", + "test_name": "serving_llama8B_tp4_random_128_128", "qps_list": [1, 4, 16, "inf"], "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], "server_environment_variables": { @@ -143,7 +178,7 @@ }, "server_parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "pipeline_parallel_size": 6, + "tensor_parallel_size": 4, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, @@ -159,10 +194,10 @@ "model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", - "random-input-len": 1024, + "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 100 + "num_prompts": 200 } } ] From 92d187b998eac725a9a984cc8d060c64e225b39d Mon Sep 17 00:00:00 2001 From: louie-tsai Date: Mon, 13 Oct 2025 16:27:43 -0700 Subject: [PATCH 15/16] change serving-test-cpu.json for R8i.24xlarge Signed-off-by: Tsai, Louie --- .../tests/serving-tests-cpu.json | 83 ++----------------- 1 file changed, 8 insertions(+), 75 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index 5066f0d51895..844726635a7e 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -2,7 +2,7 @@ { "test_name": "serving_llama8B_tp1_sharegpt", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -28,13 +28,13 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 + "num_prompts": 32 } }, { "test_name": "serving_llama8B_tp2_sharegpt", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -60,45 +60,13 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_tp4_sharegpt", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - 
"block_size": 128, - "trust_remote_code": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 + "num_prompts": 32 } }, { "test_name": "serving_llama8B_tp1_random_128_128", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -127,13 +95,13 @@ "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 200 + "num_prompts": 32 } }, { "test_name": "serving_llama8B_tp2_random_128_128", "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], + "max_concurrency_list": [32], "server_environment_variables": { "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, @@ -162,42 +130,7 @@ "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", - "num_prompts": 200 - } - }, - { - "test_name": "serving_llama8B_tp4_random_128_128", - "qps_list": [1, 4, 16, "inf"], - "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], - "server_environment_variables": { - "VLLM_RPC_TIMEOUT": 100000, - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, - "VLLM_CPU_SGL_KERNEL": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "server_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "dtype": "bfloat16", - "distributed_executor_backend": "mp", - "block_size": 128, - "trust_remote_code": "", - "enable_chunked_prefill": "", - "disable_log_stats": "", - "enforce_eager": "", - "max_num_batched_tokens": 2048, - "max_num_seqs": 256, - "load_format": "dummy" - }, - "client_parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "backend": "vllm", - "dataset_name": "random", - "random-input-len": 128, - "random-output-len": 128, - "ignore-eos": "", - "num_prompts": 200 + "num_prompts": 32 } } ] From d8314706ea485ce82d14f04d1b05bffc1117e40b Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 15 Oct 2025 08:58:54 -0700 Subject: [PATCH 16/16] add two more use cases according to discussions Signed-off-by: Tsai, Louie --- .../tests/latency-tests-cpu.json | 30 ++-- .../tests/serving-tests-cpu.json | 140 ++++++++++++++++++ .../tests/throughput-tests-cpu.json | 31 ++-- 3 files changed, 166 insertions(+), 35 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json index 569117aae852..77d1694ec864 100644 --- a/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json @@ -1,28 +1,24 @@ [ { - "test_name": "latency_llama8B_tp1", + "test_name": "latency_llama8B_tp2", "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "load_format": "dummy", - "num_iters_warmup": 5, - "num_iters": 15 - } - }, - { - "test_name": "latency_llama8B_tp4", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": 
{ - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "num_iters_warmup": 5, "num_iters": 15 } diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json index 844726635a7e..f792956f3947 100644 --- a/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json @@ -132,5 +132,145 @@ "ignore-eos": "", "num_prompts": 32 } + }, + { + "test_name": "serving_llama8B_tp1_random_128_2048", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048, + "ignore-eos": "", + "num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp2_random_128_2048", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 128, + "random-output-len": 2048, + "ignore-eos": "", + "num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp1_random_2048_128", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128, + "ignore-eos": "", + 
"num_prompts": 32 + } + }, + { + "test_name": "serving_llama8B_tp2_random_2048_128", + "qps_list": [1, 4, 16, "inf"], + "max_concurrency_list": [32], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 2048, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 32 + } } ] diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json index 48c015aa8403..dc214ddfb27e 100644 --- a/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests-cpu.json @@ -1,29 +1,24 @@ [ { - "test_name": "throughput_llama8B_tp1", + "test_name": "throughput_llama8B_tp2", "environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40 }, "parameters": { "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 1, - "load_format": "dummy", - "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "backend": "vllm" - } - }, - { - "test_name": "throughput_llama8B_tp4", - "environment_variables": { - "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, - "VLLM_CPU_KVCACHE_SPACE": 40 - }, - "parameters": { - "model": "meta-llama/Llama-3.1-8B-Instruct", - "tensor_parallel_size": 4, - "load_format": "dummy", + "tensor_parallel_size": 2, + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "enforce_eager": "", + "max_num_batched_tokens": 2048, + "max_num_seqs": 256, "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "backend": "vllm"