Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions python/sglang/test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import logging
import os
import random
import re
import subprocess
import threading
import time
Expand Down Expand Up @@ -840,12 +841,23 @@ def run_bench_one_batch(model, other_args):
print(f"Output: {output}", flush=True)
print(f"Error: {error}", flush=True)

lastline = output.split("\n")[-3]
output_throughput = float(lastline.split(" ")[-2])
# Return prefill_latency, decode_throughput, decode_latency
prefill_line = output.split("\n")[-9]
decode_line = output.split("\n")[-3]
pattern = (
r"latency: (?P<latency>\d+\.\d+).*?throughput:\s*(?P<throughput>\d+\.\d+)"
)
match = re.search(pattern, prefill_line)
if match:
prefill_latency = float(match.group("latency"))
match = re.search(pattern, decode_line)
if match:
decode_latency = float(match.group("latency"))
decode_throughput = float(match.group("throughput"))
finally:
kill_process_tree(process.pid)

return output_throughput
return prefill_latency, decode_throughput, decode_latency
Copy link
Copy Markdown
Collaborator

@Alcanderian Alcanderian Jul 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yanbing-j is this fixed now?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this is fixed by commit b180dfa.



def run_bench_offline_throughput(model, other_args):
Expand Down
2 changes: 1 addition & 1 deletion test/srt/models/test_dummy_grok_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
class TestDummyGrok1(CustomTestCase):

def test_dummy_grok_1(self):
output_throughput = run_bench_one_batch(
_, output_throughput, _ = run_bench_one_batch(
None,
[
"--model",
Expand Down
1 change: 1 addition & 0 deletions test/srt/run_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ class TestFile:
TestFile("cpu/test_rope.py"),
TestFile("cpu/test_shared_expert.py"),
TestFile("cpu/test_topk.py"),
TestFile("test_intel_amx_attention_backend.py"),
],
"nightly": [
TestFile("test_nightly_gsm8k_eval.py"),
Expand Down
2 changes: 1 addition & 1 deletion test/srt/test_bench_one_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
class TestBenchOneBatch(CustomTestCase):

def test_bs1_small(self):
output_throughput = run_bench_one_batch(
_, output_throughput, _ = run_bench_one_batch(
DEFAULT_SMALL_MODEL_NAME_FOR_TEST, ["--cuda-graph-max-bs", "2"]
)
self.assertGreater(output_throughput, 50)
Expand Down
2 changes: 1 addition & 1 deletion test/srt/test_flashmla.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def test_gsm8k(self):

class TestFlashMLAAttnLatency(unittest.TestCase):
def test_latency(self):
output_throughput = run_bench_one_batch(
_, output_throughput, _ = run_bench_one_batch(
DEFAULT_MODEL_NAME_FOR_TEST_MLA,
[
"--attention-backend",
Expand Down
79 changes: 79 additions & 0 deletions test/srt/test_intel_amx_attention_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""
Usage:
python3 -m unittest test_intel_amx_attention_backend.TestIntelAMXAttnBackend.test_mmlu
"""

import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_MLA_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
CustomTestCase,
is_in_ci,
popen_launch_server,
run_bench_one_batch,
)


class TestIntelAMXAttnBackend(CustomTestCase):
    """Tests for the Intel AMX attention backend: one-batch latency/throughput
    and MMLU evaluation accuracy against a launched server.
    """

    def test_latency(self):
        """Run bench-one-batch with the intel_amx backend and, in CI only,
        require a minimum decode throughput.
        """
        # run_bench_one_batch returns
        # (prefill_latency, decode_throughput, decode_latency).
        prefill_latency, decode_throughput, decode_latency = run_bench_one_batch(
            DEFAULT_MLA_MODEL_NAME_FOR_TEST,
            [
                "--attention-backend",
                "intel_amx",
                "--mem-fraction-static",
                "0.05",
                "--disable-radix",
                "--trust-remote-code",
                "--batch-size",
                "4",
            ],
        )

        print(f"{prefill_latency=}")
        print(f"{decode_throughput=}")
        print(f"{decode_latency=}")

        # Only enforce the threshold in CI, where the hardware baseline is
        # known; locally the numbers are printed for inspection only.
        if is_in_ci():
            self.assertGreater(decode_throughput, 10)

    def test_mmlu(self):
        """Launch a server with the intel_amx backend and check that the MMLU
        score exceeds 0.5.
        """
        model = DEFAULT_MLA_MODEL_NAME_FOR_TEST
        base_url = DEFAULT_URL_FOR_TEST
        process = popen_launch_server(
            model,
            base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--attention-backend",
                "intel_amx",
                "--mem-fraction-static",
                "0.05",
                "--disable-radix",
                "--trust-remote-code",
                "--disable-overlap-schedule",
            ],
        )

        try:
            # NOTE(review): 64 examples / 32 threads are ad-hoc CI sizes;
            # consider promoting them to named module-level constants if they
            # need tuning.
            args = SimpleNamespace(
                base_url=base_url,
                model=model,
                eval_name="mmlu",
                num_examples=64,
                num_threads=32,
            )

            metrics = run_eval(args)
            self.assertGreater(metrics["score"], 0.5)
        finally:
            # Always tear down the launched server, even if the eval fails.
            kill_process_tree(process.pid)


# Allow running this test file directly (outside a suite runner).
if __name__ == "__main__":
    unittest.main()
Loading