From dd5f880b7b918b9bea9331d7dd6a8916f30aa50a Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 15:48:15 -0700 Subject: [PATCH 01/30] Add deepep tests to CI --- .github/workflows/pr-test.yml | 41 ++++++++++++++++++- scripts/ci_install_deepep.sh | 75 +++++++++++++++++++++++++++++++++++ test/srt/run_suite.py | 11 +++-- 3 files changed, 121 insertions(+), 6 deletions(-) create mode 100644 scripts/ci_install_deepep.sh diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index ac93dc183706..8cd117aa071e 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -306,12 +306,49 @@ jobs: cd test/srt python3 test_moe_eval_accuracy_large.py + unittest-test-deepep-4-gpu: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: 4-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_deepep.sh + + - name: Run test + timeout-minutes: 30 + run: | + cd test/srt + python3 run_suite.py --suite per-commit-4-deepep + + unittest-test-deepep-8-gpu: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + needs: [unittest-test-deepep-4-gpu] + runs-on: 8-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_deepep.sh + + - name: Run test + timeout-minutes: 20 + run: | + cd test/srt + python3 run_suite.py --suite per-commit-8-deepep + finish: if: always() needs: [ unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unittest-test-backend-8-gpu, - performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu, - accuracy-test-1-gpu, accuracy-test-2-gpu, + unittest-test-deepep-4-gpu, unittest-test-deepep-8-gpu, performance-test-1-gpu-part-1, + performance-test-1-gpu-part-2, performance-test-2-gpu, accuracy-test-1-gpu, accuracy-test-2-gpu, ] runs-on: ubuntu-latest steps: diff --git a/scripts/ci_install_deepep.sh b/scripts/ci_install_deepep.sh new file mode 100644 index 000000000000..84b4be3f69d3 --- /dev/null +++ b/scripts/ci_install_deepep.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# Install the dependency in CI. +set -euxo pipefail + +bash scripts/ci_install_dependency.sh + +if python -c "import deep_ep" >/dev/null 2>&1; then + echo "deep_ep is already installed or importable. Skipping installation." + exit 0 +fi + +export GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ +export NVSHMEM_DIR=/opt/nvshmem/install +export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH" +export PATH="${NVSHMEM_DIR}/bin:$PATH" +export CUDA_HOME=/usr/local/cuda + +# Install system dependencies +apt install -y curl wget git sudo libibverbs-dev rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 build-essential cmake + +# Install GDRCopy +mkdir -p /opt/gdrcopy +mkdir -p /opt/nvshmem +cd /opt/gdrcopy +git clone https://github.com/NVIDIA/gdrcopy.git . +git checkout v2.4.4 +apt update +apt install -y nvidia-dkms-535 +apt install -y build-essential devscripts debhelper fakeroot pkg-config dkms +apt install -y check libsubunit0 libsubunit-dev +cd packages +CUDA=/usr/local/cuda ./build-deb-packages.sh +dpkg -i gdrdrv-dkms_*.deb +dpkg -i libgdrapi_*.deb +dpkg -i gdrcopy-tests_*.deb +dpkg -i gdrcopy_*.deb + +if [ ! -e "/usr/lib/x86_64-linux-gnu/libmlx5.so" ]; then + ln -s /usr/lib/x86_64-linux-gnu/libmlx5.so.1 /usr/lib/x86_64-linux-gnu/libmlx5.so +fi +apt-get update && apt-get install -y libfabric-dev + +# Clone DeepEP +git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep + +# Install NVSHMEM +cd /opt/nvshmem +wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz +tar -xf nvshmem_src_3.2.5-1.txz +mv nvshmem_src nvshmem +cd nvshmem +git apply /root/.cache/deepep/third-party/nvshmem.patch +NVSHMEM_SHMEM_SUPPORT=0 \ +NVSHMEM_UCX_SUPPORT=0 \ +NVSHMEM_USE_NCCL=0 \ +NVSHMEM_MPI_SUPPORT=0 \ +NVSHMEM_IBGDA_SUPPORT=1 \ +NVSHMEM_PMIX_SUPPORT=0 \ +NVSHMEM_TIMEOUT_DEVICE_POLLING=0 \ +NVSHMEM_USE_GDRCOPY=1 \ +cmake -S . -B build/ -DCMAKE_INSTALL_PREFIX=/opt/nvshmem/install -DCMAKE_CUDA_ARCHITECTURES=90 +cd build +make -j$(nproc) install + +# Install DeepEP +cd /root/.cache/deepep && python3 setup.py install + +# Verify configuration +echo "=== NCCL Configuration ===" +nvidia-smi topo -m +nvidia-smi nvlink -s +echo "=== Verify GDRCOPY ===" +gdrcopy_copybw +echo "=== Verify NVSHMEM ===" +nvshmem-info -a diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index b2c8c9252f02..cc29d80dfcef 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -169,21 +169,24 @@ class TestFile: TestFile("test_pp_single_node.py", 150), TestFile("test_multi_instance_release_memory_occupation.py", 64), ], + "per-commit-4-deepep": [ + TestFile("test_deepep_intranode.py", 50), + TestFile("test_deepep_low_latency.py", 50), + ], "per-commit-4-gpu-amd": [ TestFile("test_pp_single_node.py", 150), ], "per-commit-8-gpu": [ # Disabled deepep tests temporarily because it takes too much time. - # TODO: re-enable them after reducing the test time with compilation cache and smaller models. - # TestFile("test_deepep_intranode.py", 50), - # TestFile("test_deepep_low_latency.py", 50), - # TestFile("test_moe_deepep_eval_accuracy_large.py", 250), # Disabled because it hangs on the CI. # TestFile("test_moe_ep.py", 181), TestFile("test_disaggregation.py", 270), TestFile("test_disaggregation_different_tp.py", 155), TestFile("test_full_deepseek_v3.py", 463), ], + "per-commit-8-deepep": [ + TestFile("test_moe_deepep_eval_accuracy_large.py", 250), + ], "per-commit-8-gpu-amd": [ TestFile("test_full_deepseek_v3.py", 250), ], From 6ee70f71b9be772d6b8a324a07fcf35b96f12db9 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 16:38:52 -0700 Subject: [PATCH 02/30] update --- .github/workflows/pr-test.yml | 278 +--------------------------- test/srt/run_suite.py | 3 +- test/srt/test_dp_attention.py | 81 --------- test/srt/test_hybrid_dp_ep.py | 328 ++++++++++++++++++++++++++++++++++ 4 files changed, 330 insertions(+), 360 deletions(-) create mode 100644 test/srt/test_hybrid_dp_ep.py diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 8cd117aa071e..f920e4606dae 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -31,280 +31,6 @@ concurrency: cancel-in-progress: true jobs: - unit-test-frontend: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false - runs-on: 1-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 10 - run: | - cd test/lang - python3 run_suite.py --suite per-commit - - unit-test-backend-1-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false - runs-on: 1-gpu-runner - strategy: - fail-fast: false - matrix: - part: [0, 1, 2, 3, 4, 5, 6, 7, 8] - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 30 - run: | - cd test/srt - python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 9 - - unit-test-backend-2-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false - runs-on: 2-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 30 - run: | - cd test/srt - python3 run_suite.py --suite per-commit-2-gpu - - unittest-test-backend-4-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false - needs: [unit-test-frontend, unit-test-backend-2-gpu] - runs-on: 4-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 30 - run: | - cd test/srt - python3 run_suite.py --suite per-commit-4-gpu - - unittest-test-backend-8-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false - needs: [unit-test-frontend, unit-test-backend-2-gpu] - runs-on: 8-gpu-runner - strategy: - fail-fast: false - matrix: - part: [0, 1] - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci_install_dependency.sh - - - name: Run test - timeout-minutes: 20 - run: | - cd test/srt - python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 - - performance-test-1-gpu-part-1: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false - runs-on: 1-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci_install_dependency.sh - - - name: Benchmark single latency - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small - python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default - - - name: Benchmark online latency - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default - - - name: Benchmark offline throughput - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default - - - name: Benchmark offline throughput (Non-streaming, small batch size) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size - - - name: Benchmark online latency (EAGLE) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle - - performance-test-1-gpu-part-2: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false - runs-on: 1-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci_install_dependency.sh - - - name: Benchmark offline throughput (w/o RadixAttention) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache - - - name: Benchmark offline throughput (w/ Triton) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend - - - name: Benchmark offline throughput (w/ FP8) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8 - - - name: Benchmark VLM offline throughput - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput - - - name: Benchmark VLM online latency - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency - - performance-test-2-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false - runs-on: 2-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci_install_dependency.sh - - - name: Benchmark single latency (TP=2) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 - - - name: Benchmark single latency + torch.compile (TP=2) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1 - - - name: Benchmark offline throughput (TP=2) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default - - - name: Benchmark offline throughput (w/o RadixAttention) (TP=2) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache - - - name: Benchmark offline decode throughput (PP=2) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode - - - name: Benchmark offline prefill throughput (PP=2) - timeout-minutes: 10 - run: | - cd test/srt - python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill - - accuracy-test-1-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false - runs-on: 1-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci_install_dependency.sh - git clone https://github.com/merrymercy/human-eval.git - cd human-eval - pip install -e . - - - name: Evaluate accuracy - timeout-minutes: 20 - run: | - cd test/srt - python3 test_eval_accuracy_large.py - - accuracy-test-2-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false - runs-on: 2-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci_install_dependency.sh - git clone https://github.com/merrymercy/human-eval.git - cd human-eval - pip install -e . - - - name: Evaluate accuracy (TP=2) - timeout-minutes: 20 - run: | - cd test/srt - python3 test_moe_eval_accuracy_large.py unittest-test-deepep-4-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -346,9 +72,7 @@ jobs: finish: if: always() needs: [ - unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unittest-test-backend-8-gpu, - unittest-test-deepep-4-gpu, unittest-test-deepep-8-gpu, performance-test-1-gpu-part-1, - performance-test-1-gpu-part-2, performance-test-2-gpu, accuracy-test-1-gpu, accuracy-test-2-gpu, + unittest-test-deepep-4-gpu, unittest-test-deepep-8-gpu, ] runs-on: ubuntu-latest steps: diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index cc29d80dfcef..d7cfc0d42ea3 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -170,8 +170,7 @@ class TestFile: TestFile("test_multi_instance_release_memory_occupation.py", 64), ], "per-commit-4-deepep": [ - TestFile("test_deepep_intranode.py", 50), - TestFile("test_deepep_low_latency.py", 50), + TestFile("test_hybrid_dp_ep.py", 50), ], "per-commit-4-gpu-amd": [ TestFile("test_pp_single_node.py", 150), diff --git a/test/srt/test_dp_attention.py b/test/srt/test_dp_attention.py index 085dc206bb58..af50dc7803c1 100644 --- a/test/srt/test_dp_attention.py +++ b/test/srt/test_dp_attention.py @@ -137,86 +137,5 @@ def test_gsm8k(self): self.assertGreater(avg_spec_accept_length, 2.5) -# TODO: enable this test later -# class TestDPAttentionDP2TP2DeepseekV3MTPTBO(CustomTestCase): -# @classmethod -# def setUpClass(cls): -# import os - -# # print debug log for tbo -# os.environ["SGLANG_TBO_DEBUG"] = "1" -# cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA -# cls.base_url = DEFAULT_URL_FOR_TEST -# other_args = [ -# "--trust-remote-code", -# "--disable-radix", -# "--speculative-algorithm", -# "EAGLE", -# "--speculative-num-steps", -# "2", -# "--speculative-eagle-topk", -# "4", -# "--speculative-num-draft-tokens", -# "4", -# "--speculative-draft", -# DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, -# "--tp-size", -# "2", -# "--enable-dp-attention", -# "--dp-size", -# "2", -# "--enable-two-batch-overlap", -# "--enable-deepep-moe", -# "--deepep-mode", -# "low_latency", -# "--chunked-prefill-size", -# "256", -# "--cuda-graph-max-bs", -# "32", -# "--max-running-requests", -# "32", -# ] -# if not is_in_amd_ci(): -# other_args += ["--mem-frac", "0.7"] -# cls.process = popen_launch_server( -# cls.model, -# cls.base_url, -# timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, -# other_args=other_args, -# ) - -# @classmethod -# def tearDownClass(cls): -# kill_process_tree(cls.process.pid) - -# def test_gsm8k(self): -# requests.get(self.base_url + "/flush_cache") - -# args = SimpleNamespace( -# num_shots=5, -# data_path=None, -# num_questions=200, -# max_new_tokens=512, -# parallel=128, -# host="http://127.0.0.1", -# port=int(self.base_url.split(":")[-1]), -# ) -# metrics = run_eval_few_shot_gsm8k(args) -# print(metrics) - -# self.assertGreater(metrics["accuracy"], 0.60) - -# server_info = requests.get(self.base_url + "/get_server_info") -# avg_spec_accept_length = server_info.json()["internal_states"][0][ -# "avg_spec_accept_length" -# ] -# print( -# f"###test_gsm8k (deepseek-v3 mtp + dp + tbo):\n" -# f"accuracy={metrics['accuracy']=:.3f}\n" -# f"{avg_spec_accept_length=:.3f}\n" -# ) -# self.assertGreater(avg_spec_accept_length, 2.3) - - if __name__ == "__main__": unittest.main() diff --git a/test/srt/test_hybrid_dp_ep.py b/test/srt/test_hybrid_dp_ep.py new file mode 100644 index 000000000000..c3b4e40cca3a --- /dev/null +++ b/test/srt/test_hybrid_dp_ep.py @@ -0,0 +1,328 @@ +# Comprehensive test for hybrid parallelism (DP/TP attention, DP/TP Dense FFN, TP/EP Sparse FFN, DP/VP LM head) plus speculative decoding. +# These tests are not run by default but can be launched on demand. + +import unittest +from types import SimpleNamespace + +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.run_eval import run_eval +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.test_utils import ( + DEFAULT_MODEL_NAME_FOR_TEST_MLA, + DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + popen_launch_server, +) + + +class TestPureDP(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--tp", + "4", + "--enable-dp-attention", + "--dp", + "4", + "--enable-deepep-moe", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.62) + +class TestHybridDPTP(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--tp", + "4", + "--enable-dp-attention", + "--dp", + "2", + "--enable-deepep-moe", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.62) + + +class TestNoGatherdBuffer(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--tp", + "4", + "--enable-dp-attention", + "--dp", + "4", + "--moe-dense-tp-size", + "1", + "--enable-dp-lm-head", + "--enable-deepep-moe", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.62) + + +class TestTBO(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--tp", + "4", + "--enable-dp-attention", + "--dp", + "4", + "--moe-dense-tp-size", + "1", + "--enable-deepep-moe", + "--enable-two-batch-overlap", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.62) + + +class TestMTP(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--tp", + "4", + "--enable-dp-attention", + "--dp", + "2", + "--enable-dp-lm-head", + "--enable-deepep-moe", + "--speculative-algo", + "NEXTN", + "--speculative-draft", + DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, + "--speculative-num-steps", + "2", + "--speculative-eagle-topk", + "4", + "--speculative-num-draft-tokens", + "4", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + requests.get(self.base_url + "/flush_cache") + + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.60) + + server_info = requests.get(self.base_url + "/get_server_info") + avg_spec_accept_length = server_info.json()["internal_states"][0][ + "avg_spec_accept_length" + ] + print( + f"###test_gsm8k (deepseek-v3 mtp + dp + tbo):\n" + f"accuracy={metrics['accuracy']=:.3f}\n" + f"{avg_spec_accept_length=:.3f}\n" + ) + self.assertGreater(avg_spec_accept_length, 2.3) + + + +class TestMTPWithTBO(CustomTestCase): + @classmethod + def setUpClass(cls): + import os + + cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA + cls.base_url = DEFAULT_URL_FOR_TEST + other_args = [ + "--trust-remote-code", + "--speculative-algorithm", + "EAGLE", + "--speculative-num-steps", + "2", + "--speculative-eagle-topk", + "4", + "--speculative-num-draft-tokens", + "4", + "--speculative-draft", + DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, + "--tp-size", + "4", + "--enable-dp-attention", + "--dp-size", + "4", + "--enable-two-batch-overlap", + "--enable-deepep-moe", + "--deepep-mode", + "low_latency", + "--chunked-prefill-size", + "256", + "--cuda-graph-max-bs", + "32", + "--max-running-requests", + "32", + ] + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_args, + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + requests.get(self.base_url + "/flush_cache") + + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.60) + + server_info = requests.get(self.base_url + "/get_server_info") + avg_spec_accept_length = server_info.json()["internal_states"][0][ + "avg_spec_accept_length" + ] + print( + f"###test_gsm8k (deepseek-v3 mtp + dp + tbo):\n" + f"accuracy={metrics['accuracy']=:.3f}\n" + f"{avg_spec_accept_length=:.3f}\n" + ) + self.assertGreater(avg_spec_accept_length, 2.3) + + +if __name__ == "__main__": + unittest.main() From 2a2020bd44e00b0bdbf0e5d767c550b4064af897 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 16:42:49 -0700 Subject: [PATCH 03/30] fix --- scripts/ci_install_deepep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci_install_deepep.sh b/scripts/ci_install_deepep.sh index 84b4be3f69d3..b52374a43d43 100644 --- a/scripts/ci_install_deepep.sh +++ b/scripts/ci_install_deepep.sh @@ -4,7 +4,7 @@ set -euxo pipefail bash scripts/ci_install_dependency.sh -if python -c "import deep_ep" >/dev/null 2>&1; then +if python3 -c "import deep_ep" >/dev/null 2>&1; then echo "deep_ep is already installed or importable. Skipping installation." exit 0 fi From 574b4cfb3a5b92bc0b9479b43e4d8f58f6a6fa63 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 17:17:24 -0700 Subject: [PATCH 04/30] update --- scripts/ci_install_deepep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci_install_deepep.sh b/scripts/ci_install_deepep.sh index b52374a43d43..c15b514fdba5 100644 --- a/scripts/ci_install_deepep.sh +++ b/scripts/ci_install_deepep.sh @@ -41,7 +41,7 @@ fi apt-get update && apt-get install -y libfabric-dev # Clone DeepEP -git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep +git clone --branch eef7ab50fa5cf0ab1dd3fce4c6493c90bdf290ac --depth 1 https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep # Install NVSHMEM cd /opt/nvshmem From c616f387159b3ffcb9c38b45c31e366cf03209d1 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 17:39:41 -0700 Subject: [PATCH 05/30] move all tests to 8-gpu-runner --- .github/workflows/pr-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index f920e4606dae..9c5d2d792fa0 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -35,7 +35,7 @@ jobs: unittest-test-deepep-4-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false - runs-on: 4-gpu-runner + runs-on: 8-gpu-runner steps: - name: Checkout code uses: actions/checkout@v4 From 0e18a101a026e9dd750641f94e0d7234f511ff62 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 17:53:02 -0700 Subject: [PATCH 06/30] update install script --- scripts/ci_install_deepep.sh | 6 ++---- test/srt/test_hybrid_dp_ep.py | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/scripts/ci_install_deepep.sh b/scripts/ci_install_deepep.sh index c15b514fdba5..fb610776d154 100644 --- a/scripts/ci_install_deepep.sh +++ b/scripts/ci_install_deepep.sh @@ -40,9 +40,6 @@ if [ ! -e "/usr/lib/x86_64-linux-gnu/libmlx5.so" ]; then fi apt-get update && apt-get install -y libfabric-dev -# Clone DeepEP -git clone --branch eef7ab50fa5cf0ab1dd3fce4c6493c90bdf290ac --depth 1 https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep - # Install NVSHMEM cd /opt/nvshmem wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz @@ -63,7 +60,8 @@ cd build make -j$(nproc) install # Install DeepEP -cd /root/.cache/deepep && python3 setup.py install +git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep +cd /root/.cache/deepep && git checkout eef7ab50fa5cf0ab1dd3fce4c6493c90bdf290ac && python3 setup.py install # Verify configuration echo "=== NCCL Configuration ===" diff --git a/test/srt/test_hybrid_dp_ep.py b/test/srt/test_hybrid_dp_ep.py index c3b4e40cca3a..50b97a01186a 100644 --- a/test/srt/test_hybrid_dp_ep.py +++ b/test/srt/test_hybrid_dp_ep.py @@ -7,8 +7,8 @@ import requests from sglang.srt.utils import kill_process_tree -from sglang.test.run_eval import run_eval from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST_MLA, DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, @@ -58,6 +58,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.62) + class TestHybridDPTP(CustomTestCase): @classmethod def setUpClass(cls): @@ -248,7 +249,6 @@ def test_gsm8k(self): self.assertGreater(avg_spec_accept_length, 2.3) - class TestMTPWithTBO(CustomTestCase): @classmethod def setUpClass(cls): From dbe5e171e5034a24551315c83abfd5ef7f5e2705 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 17:58:57 -0700 Subject: [PATCH 07/30] update install script --- scripts/ci_install_deepep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci_install_deepep.sh b/scripts/ci_install_deepep.sh index fb610776d154..a71c0c0dbf8b 100644 --- a/scripts/ci_install_deepep.sh +++ b/scripts/ci_install_deepep.sh @@ -60,7 +60,7 @@ cd build make -j$(nproc) install # Install DeepEP -git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep +rm -rf /root/.cache/deepep && git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep cd /root/.cache/deepep && git checkout eef7ab50fa5cf0ab1dd3fce4c6493c90bdf290ac && python3 setup.py install # Verify configuration From 5ce2c065bbc80ee2125ab5f4c57ddacd691577e4 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 18:04:04 -0700 Subject: [PATCH 08/30] update install script --- scripts/ci_install_deepep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci_install_deepep.sh b/scripts/ci_install_deepep.sh index a71c0c0dbf8b..f9dea53fab74 100644 --- a/scripts/ci_install_deepep.sh +++ b/scripts/ci_install_deepep.sh @@ -22,7 +22,7 @@ apt install -y curl wget git sudo libibverbs-dev rdma-core infiniband-diags open mkdir -p /opt/gdrcopy mkdir -p /opt/nvshmem cd /opt/gdrcopy -git clone https://github.com/NVIDIA/gdrcopy.git . +rm -rf gdrcopy && git clone https://github.com/NVIDIA/gdrcopy.git . git checkout v2.4.4 apt update apt install -y nvidia-dkms-535 From 4395325688515c0447922f6db04c8b7d10ca80a1 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 18:04:39 -0700 Subject: [PATCH 09/30] fix --- scripts/ci_install_deepep.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ci_install_deepep.sh b/scripts/ci_install_deepep.sh index f9dea53fab74..d1f4c94525b1 100644 --- a/scripts/ci_install_deepep.sh +++ b/scripts/ci_install_deepep.sh @@ -19,10 +19,10 @@ export CUDA_HOME=/usr/local/cuda apt install -y curl wget git sudo libibverbs-dev rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 build-essential cmake # Install GDRCopy -mkdir -p /opt/gdrcopy +rm -rf gdrcopy && mkdir -p /opt/gdrcopy mkdir -p /opt/nvshmem cd /opt/gdrcopy -rm -rf gdrcopy && git clone https://github.com/NVIDIA/gdrcopy.git . +git clone https://github.com/NVIDIA/gdrcopy.git . git checkout v2.4.4 apt update apt install -y nvidia-dkms-535 From b3b89fed0aac0778bdb6eadd68efa3c01930f5c4 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 18:19:58 -0700 Subject: [PATCH 10/30] update --- .github/workflows/pr-test.yml | 21 +------- test/srt/run_suite.py | 6 +-- test/srt/test_hybrid_dp_ep.py | 96 ++++++++++++++++++++++++----------- 3 files changed, 68 insertions(+), 55 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 9c5d2d792fa0..257d75329de2 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -32,28 +32,9 @@ concurrency: jobs: - unittest-test-deepep-4-gpu: - if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && - github.event.pull_request.draft == false - runs-on: 8-gpu-runner - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Install dependencies - run: | - bash scripts/ci_install_deepep.sh - - - name: Run test - timeout-minutes: 30 - run: | - cd test/srt - python3 run_suite.py --suite per-commit-4-deepep - unittest-test-deepep-8-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false - needs: [unittest-test-deepep-4-gpu] runs-on: 8-gpu-runner steps: - name: Checkout code @@ -72,7 +53,7 @@ jobs: finish: if: always() needs: [ - unittest-test-deepep-4-gpu, unittest-test-deepep-8-gpu, + unittest-test-deepep-8-gpu, ] runs-on: ubuntu-latest steps: diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index d7cfc0d42ea3..4fd5fa91e26f 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -169,9 +169,6 @@ class TestFile: TestFile("test_pp_single_node.py", 150), TestFile("test_multi_instance_release_memory_occupation.py", 64), ], - "per-commit-4-deepep": [ - TestFile("test_hybrid_dp_ep.py", 50), - ], "per-commit-4-gpu-amd": [ TestFile("test_pp_single_node.py", 150), ], @@ -184,7 +181,8 @@ class TestFile: TestFile("test_full_deepseek_v3.py", 463), ], "per-commit-8-deepep": [ - TestFile("test_moe_deepep_eval_accuracy_large.py", 250), + TestFile("test_hybrid_dp_ep.py", 50), + # TestFile("test_moe_deepep_eval_accuracy_large.py", 250), ], "per-commit-8-gpu-amd": [ TestFile("test_full_deepseek_v3.py", 250), diff --git a/test/srt/test_hybrid_dp_ep.py b/test/srt/test_hybrid_dp_ep.py index 50b97a01186a..5e4271f0a608 100644 --- a/test/srt/test_hybrid_dp_ep.py +++ b/test/srt/test_hybrid_dp_ep.py @@ -99,6 +99,43 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.62) +class TestTP(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--tp", + "4", + "--enable-deepep-moe", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + args = SimpleNamespace( + num_shots=5, + data_path=None, + num_questions=200, + max_new_tokens=512, + parallel=128, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(metrics) + + self.assertGreater(metrics["accuracy"], 0.62) + + class TestNoGatherdBuffer(CustomTestCase): @classmethod def setUpClass(cls): @@ -119,6 +156,10 @@ def setUpClass(cls): "1", "--enable-dp-lm-head", "--enable-deepep-moe", + "--cuda-graph-max-bs", + "32", + "--max-running-requests", + "128", ], ) @@ -210,9 +251,9 @@ def setUpClass(cls): "--speculative-num-steps", "2", "--speculative-eagle-topk", - "4", + "3", "--speculative-num-draft-tokens", - "4", + "5", ], ) @@ -256,39 +297,32 @@ def setUpClass(cls): cls.model = DEFAULT_MODEL_NAME_FOR_TEST_MLA cls.base_url = DEFAULT_URL_FOR_TEST - other_args = [ - "--trust-remote-code", - "--speculative-algorithm", - "EAGLE", - "--speculative-num-steps", - "2", - "--speculative-eagle-topk", - "4", - "--speculative-num-draft-tokens", - "4", - "--speculative-draft", - DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, - "--tp-size", - "4", - "--enable-dp-attention", - "--dp-size", - "4", - "--enable-two-batch-overlap", - "--enable-deepep-moe", - "--deepep-mode", - "low_latency", - "--chunked-prefill-size", - "256", - "--cuda-graph-max-bs", - "32", - "--max-running-requests", - "32", - ] cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_args, + other_args=[ + "--tp-size", + "4", + "--enable-dp-attention", + "--dp-size", + "4", + "--enable-two-batch-overlap", + "--enable-deepep-moe", + "--trust-remote-code", + "--speculative-algorithm", + "EAGLE", + "--speculative-num-steps", + "2", + "--speculative-eagle-topk", + "3", + "--speculative-num-draft-tokens", + "5", + "--speculative-draft", + DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, + "--chunked-prefill-size", + "256", + ], ) @classmethod From cfff6b14cb20104140ba3f63384300b2270c016d Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 18:29:00 -0700 Subject: [PATCH 11/30] fix --- scripts/ci_install_deepep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci_install_deepep.sh b/scripts/ci_install_deepep.sh index d1f4c94525b1..0b12bbbcca40 100644 --- a/scripts/ci_install_deepep.sh +++ b/scripts/ci_install_deepep.sh @@ -19,7 +19,7 @@ export CUDA_HOME=/usr/local/cuda apt install -y curl wget git sudo libibverbs-dev rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 build-essential cmake # Install GDRCopy -rm -rf gdrcopy && mkdir -p /opt/gdrcopy +rm -rf /opt/gdrcopy && mkdir -p /opt/gdrcopy mkdir -p /opt/nvshmem cd /opt/gdrcopy git clone https://github.com/NVIDIA/gdrcopy.git . From aca4af9bb310c75b1bc9891520531712806c370a Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 18:41:04 -0700 Subject: [PATCH 12/30] fix --- scripts/ci_install_deepep.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/ci_install_deepep.sh b/scripts/ci_install_deepep.sh index 0b12bbbcca40..3b48fab7cc50 100644 --- a/scripts/ci_install_deepep.sh +++ b/scripts/ci_install_deepep.sh @@ -40,6 +40,9 @@ if [ ! -e "/usr/lib/x86_64-linux-gnu/libmlx5.so" ]; then fi apt-get update && apt-get install -y libfabric-dev +# Clone DeepEP +rm -rf /root/.cache/deepep && git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep && cd /root/.cache/deepep && git checkout eef7ab50fa5cf0ab1dd3fce4c6493c90bdf290ac + # Install NVSHMEM cd /opt/nvshmem wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz @@ -60,7 +63,6 @@ cd build make -j$(nproc) install # Install DeepEP -rm -rf /root/.cache/deepep && git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep cd /root/.cache/deepep && git checkout eef7ab50fa5cf0ab1dd3fce4c6493c90bdf290ac && python3 setup.py install # Verify configuration From 5cbcad4aa2fe547772fc28bedd332edb30b3770b Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 18:47:04 -0700 Subject: [PATCH 13/30] fix --- scripts/ci_install_deepep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 scripts/ci_install_deepep.sh diff --git a/scripts/ci_install_deepep.sh b/scripts/ci_install_deepep.sh old mode 100644 new mode 100755 index 3b48fab7cc50..e41dde6da6b1 --- a/scripts/ci_install_deepep.sh +++ b/scripts/ci_install_deepep.sh @@ -47,7 +47,7 @@ rm -rf /root/.cache/deepep && git clone https://github.com/deepseek-ai/DeepEP.gi cd /opt/nvshmem wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz tar -xf nvshmem_src_3.2.5-1.txz -mv nvshmem_src nvshmem +rm nvshmem && mv nvshmem_src nvshmem cd nvshmem git apply /root/.cache/deepep/third-party/nvshmem.patch NVSHMEM_SHMEM_SUPPORT=0 \ From 45dd430814c8f11a6bbde9da470a5d6fc28e903a Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 18:56:26 -0700 Subject: [PATCH 14/30] update BLOCK_D for ci --- .../sglang/srt/layers/moe/ep_moe/kernels.py | 3 ++- test/srt/test_hybrid_dp_ep.py | 22 +++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/python/sglang/srt/layers/moe/ep_moe/kernels.py b/python/sglang/srt/layers/moe/ep_moe/kernels.py index 1d661931cf75..7f9bdc7486a6 100644 --- a/python/sglang/srt/layers/moe/ep_moe/kernels.py +++ b/python/sglang/srt/layers/moe/ep_moe/kernels.py @@ -6,6 +6,7 @@ from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8 from sglang.srt.utils import ceil_div, dispose_tensor, is_cuda +from sglang.utils import is_in_ci logger = logging.getLogger(__name__) @@ -1058,7 +1059,7 @@ def ep_gather( input_index: torch.Tensor, output_tensor: torch.Tensor, ): - BLOCK_D = 1024 # block size of quantization + BLOCK_D = 1024 if not is_in_ci() else 128 # block size of quantization num_warps = 2 num_tokens = output_tensor.shape[0] hidden_size = input_tensor.shape[1] diff --git a/test/srt/test_hybrid_dp_ep.py b/test/srt/test_hybrid_dp_ep.py index 5e4271f0a608..8f6fd7a4280e 100644 --- a/test/srt/test_hybrid_dp_ep.py +++ b/test/srt/test_hybrid_dp_ep.py @@ -31,10 +31,10 @@ def setUpClass(cls): other_args=[ "--trust-remote-code", "--tp", - "4", + "8", "--enable-dp-attention", "--dp", - "4", + "8", "--enable-deepep-moe", ], ) @@ -71,7 +71,7 @@ def setUpClass(cls): other_args=[ "--trust-remote-code", "--tp", - "4", + "8", "--enable-dp-attention", "--dp", "2", @@ -111,7 +111,7 @@ def setUpClass(cls): other_args=[ "--trust-remote-code", "--tp", - "4", + "8", "--enable-deepep-moe", ], ) @@ -148,10 +148,10 @@ def setUpClass(cls): other_args=[ "--trust-remote-code", "--tp", - "4", + "8", "--enable-dp-attention", "--dp", - "4", + "8", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", @@ -195,10 +195,10 @@ def setUpClass(cls): other_args=[ "--trust-remote-code", "--tp", - "4", + "8", "--enable-dp-attention", "--dp", - "4", + "8", "--moe-dense-tp-size", "1", "--enable-deepep-moe", @@ -238,7 +238,7 @@ def setUpClass(cls): other_args=[ "--trust-remote-code", "--tp", - "4", + "8", "--enable-dp-attention", "--dp", "2", @@ -303,10 +303,10 @@ def setUpClass(cls): timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--tp-size", - "4", + "8", "--enable-dp-attention", "--dp-size", - "4", + "8", "--enable-two-batch-overlap", "--enable-deepep-moe", "--trust-remote-code", From 1ccdddc2153b445ba619316aa8950ab0ae2a66dc Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 20:12:41 -0700 Subject: [PATCH 15/30] fix cuda graph max bs --- test/srt/test_hybrid_dp_ep.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/test/srt/test_hybrid_dp_ep.py b/test/srt/test_hybrid_dp_ep.py index 8f6fd7a4280e..7149e9214d28 100644 --- a/test/srt/test_hybrid_dp_ep.py +++ b/test/srt/test_hybrid_dp_ep.py @@ -19,6 +19,9 @@ ) +DEFAULT_MODEL_NAME_FOR_TEST_MLA = "/dev/shm/DeepSeek-V3-0324" + + class TestPureDP(CustomTestCase): @classmethod def setUpClass(cls): @@ -36,6 +39,10 @@ def setUpClass(cls): "--dp", "8", "--enable-deepep-moe", + "--cuda-graph-max-bs", + "128", + "--max-running-requests", + "128", ], ) @@ -76,6 +83,11 @@ def setUpClass(cls): "--dp", "2", "--enable-deepep-moe", + "--cuda-graph-max-bs", + "128", + "--max-running-requests", + "128", + ], ) @@ -113,6 +125,10 @@ def setUpClass(cls): "--tp", "8", "--enable-deepep-moe", + "--cuda-graph-max-bs", + "128", + "--max-running-requests", + "128", ], ) @@ -203,6 +219,10 @@ def setUpClass(cls): "1", "--enable-deepep-moe", "--enable-two-batch-overlap", + "--cuda-graph-max-bs", + "128", + "--max-running-requests", + "128", ], ) @@ -254,6 +274,10 @@ def setUpClass(cls): "3", "--speculative-num-draft-tokens", "5", + "--cuda-graph-max-bs", + "128", + "--max-running-requests", + "128", ], ) @@ -322,6 +346,10 @@ def setUpClass(cls): DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, "--chunked-prefill-size", "256", + "--cuda-graph-max-bs", + "128", + "--max-running-requests", + "128", ], ) From 9112f1fbc5ab11f39d8b3419bfd56c7f73ef4886 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 22:16:30 -0700 Subject: [PATCH 16/30] fix --- test/srt/test_hybrid_dp_ep.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/srt/test_hybrid_dp_ep.py b/test/srt/test_hybrid_dp_ep.py index 7149e9214d28..f9390e2b8aee 100644 --- a/test/srt/test_hybrid_dp_ep.py +++ b/test/srt/test_hybrid_dp_ep.py @@ -19,9 +19,6 @@ ) -DEFAULT_MODEL_NAME_FOR_TEST_MLA = "/dev/shm/DeepSeek-V3-0324" - - class TestPureDP(CustomTestCase): @classmethod def setUpClass(cls): @@ -87,7 +84,6 @@ def setUpClass(cls): "128", "--max-running-requests", "128", - ], ) From f3ab5953ef0c4f162c92d5d2d602c9f212a1e33f Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 22:26:49 -0700 Subject: [PATCH 17/30] try 4 gpu and fix eagle --- .github/workflows/pr-test.yml | 20 +++++++++++++++++- test/srt/run_suite.py | 5 ++++- test/srt/test_hybrid_dp_ep.py | 39 +++++++++++++++++++---------------- 3 files changed, 44 insertions(+), 20 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 257d75329de2..837114dae657 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -32,6 +32,24 @@ concurrency: jobs: + unittest-test-deepep-8-gpu: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: 4-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_deepep.sh + + - name: Run test + timeout-minutes: 20 + run: | + cd test/srt + python3 run_suite.py --suite per-commit-4-gpu-deepep + unittest-test-deepep-8-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false @@ -48,7 +66,7 @@ jobs: timeout-minutes: 20 run: | cd test/srt - python3 run_suite.py --suite per-commit-8-deepep + python3 run_suite.py --suite per-commit-8-gpu-deepep finish: if: always() diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 0280e1eb40c5..33893314ff27 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -170,6 +170,9 @@ class TestFile: TestFile("test_pp_single_node.py", 150), TestFile("test_multi_instance_release_memory_occupation.py", 64), ], + "per-commit-4-gpu-deepep": [ + TestFile("test_hybrid_dp_ep.py", 50), + ], "per-commit-4-gpu-amd": [ TestFile("test_pp_single_node.py", 150), ], @@ -181,7 +184,7 @@ class TestFile: TestFile("test_disaggregation_different_tp.py", 155), TestFile("test_full_deepseek_v3.py", 463), ], - "per-commit-8-deepep": [ + "per-commit-8-gpu-deepep": [ TestFile("test_hybrid_dp_ep.py", 50), # TestFile("test_moe_deepep_eval_accuracy_large.py", 250), ], diff --git a/test/srt/test_hybrid_dp_ep.py b/test/srt/test_hybrid_dp_ep.py index f9390e2b8aee..1a4ddd79f6f2 100644 --- a/test/srt/test_hybrid_dp_ep.py +++ b/test/srt/test_hybrid_dp_ep.py @@ -19,6 +19,9 @@ ) +# DEFAULT_MODEL_NAME_FOR_TEST_MLA = "/dev/shm/DeepSeek-V3-0324" + + class TestPureDP(CustomTestCase): @classmethod def setUpClass(cls): @@ -31,10 +34,10 @@ def setUpClass(cls): other_args=[ "--trust-remote-code", "--tp", - "8", + "4", "--enable-dp-attention", "--dp", - "8", + "4", "--enable-deepep-moe", "--cuda-graph-max-bs", "128", @@ -75,7 +78,7 @@ def setUpClass(cls): other_args=[ "--trust-remote-code", "--tp", - "8", + "4", "--enable-dp-attention", "--dp", "2", @@ -119,7 +122,7 @@ def setUpClass(cls): other_args=[ "--trust-remote-code", "--tp", - "8", + "4", "--enable-deepep-moe", "--cuda-graph-max-bs", "128", @@ -160,10 +163,10 @@ def setUpClass(cls): other_args=[ "--trust-remote-code", "--tp", - "8", + "4", "--enable-dp-attention", "--dp", - "8", + "4", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", @@ -207,10 +210,10 @@ def setUpClass(cls): other_args=[ "--trust-remote-code", "--tp", - "8", + "4", "--enable-dp-attention", "--dp", - "8", + "4", "--moe-dense-tp-size", "1", "--enable-deepep-moe", @@ -254,7 +257,7 @@ def setUpClass(cls): other_args=[ "--trust-remote-code", "--tp", - "8", + "4", "--enable-dp-attention", "--dp", "2", @@ -269,11 +272,11 @@ def setUpClass(cls): "--speculative-eagle-topk", "3", "--speculative-num-draft-tokens", - "5", + "3", "--cuda-graph-max-bs", - "128", + "32", "--max-running-requests", - "128", + "32", ], ) @@ -323,29 +326,29 @@ def setUpClass(cls): timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--tp-size", - "8", + "4", "--enable-dp-attention", "--dp-size", - "8", + "4", "--enable-two-batch-overlap", "--enable-deepep-moe", "--trust-remote-code", "--speculative-algorithm", - "EAGLE", + "NEXTN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "3", "--speculative-num-draft-tokens", - "5", + "3", "--speculative-draft", DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, "--chunked-prefill-size", "256", "--cuda-graph-max-bs", - "128", + "32", "--max-running-requests", - "128", + "32", ], ) From af4b52e255c2e87711c7dccec6e59a6fbb5324ee Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 22:29:54 -0700 Subject: [PATCH 18/30] fix --- .github/workflows/pr-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 837114dae657..0d131c304635 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -32,7 +32,7 @@ concurrency: jobs: - unittest-test-deepep-8-gpu: + unittest-test-deepep-4-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false runs-on: 4-gpu-runner From 1a09e0ca6708ee0da3a5c2f16e0d5b19a831636d Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Tue, 8 Jul 2025 23:46:43 -0700 Subject: [PATCH 19/30] add large test --- .github/workflows/pr-test.yml | 5 +- test/srt/run_suite.py | 4 +- test/srt/test_deepep_large.py | 146 ++++++++++++++++++ ...t_hybrid_dp_ep.py => test_deepep_small.py} | 13 +- 4 files changed, 156 insertions(+), 12 deletions(-) create mode 100644 test/srt/test_deepep_large.py rename test/srt/{test_hybrid_dp_ep.py => test_deepep_small.py} (96%) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 0d131c304635..8b234ff3bbe9 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -54,6 +54,9 @@ jobs: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false runs-on: 8-gpu-runner + # needs: [ + # unittest-test-deepep-4-gpu, + # ] steps: - name: Checkout code uses: actions/checkout@v4 @@ -71,7 +74,7 @@ jobs: finish: if: always() needs: [ - unittest-test-deepep-8-gpu, + unittest-test-deepep-4-gpu, unittest-test-deepep-8-gpu, ] runs-on: ubuntu-latest steps: diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 33893314ff27..158e86eddca2 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -171,7 +171,7 @@ class TestFile: TestFile("test_multi_instance_release_memory_occupation.py", 64), ], "per-commit-4-gpu-deepep": [ - TestFile("test_hybrid_dp_ep.py", 50), + TestFile("test_deepep_small.py", 666), ], "per-commit-4-gpu-amd": [ TestFile("test_pp_single_node.py", 150), @@ -185,7 +185,7 @@ class TestFile: TestFile("test_full_deepseek_v3.py", 463), ], "per-commit-8-gpu-deepep": [ - TestFile("test_hybrid_dp_ep.py", 50), + TestFile("test_deepep_large.py", 666), # TestFile("test_moe_deepep_eval_accuracy_large.py", 250), ], "per-commit-8-gpu-amd": [ diff --git a/test/srt/test_deepep_large.py b/test/srt/test_deepep_large.py new file mode 100644 index 000000000000..a15c29eaaf03 --- /dev/null +++ b/test/srt/test_deepep_large.py @@ -0,0 +1,146 @@ +import unittest +from types import SimpleNamespace + +import requests + +from sglang.srt.utils import kill_process_tree +from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k +from sglang.test.test_utils import ( + DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST, + DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + DEFAULT_URL_FOR_TEST, + CustomTestCase, + popen_launch_server, +) + + +class TestDeepseek(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--tp", + "8", + "--enable-dp-attention", + "--dp", + "8", + "--moe-dense-tp-size", + "1", + "--enable-dp-lm-head", + "--enable-deepep-moe", + "--enable-two-batch-overlap", + "--ep-num-redundant-experts", + "32", + "--ep-dispatch-algorithm", + "dynamic", + "--eplb-algorithm", + "deepseek", + "--cuda-graph-bs", + "256", + "--max-running-requests", + "2048", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + args = SimpleNamespace( + num_shots=8, + data_path=None, + num_questions=1250, + parallel=1250, + max_new_tokens=512, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(f"Eval accuracy of GSM8K: {metrics=}") + + self.assertGreater(metrics["accuracy"], 0.93) + self.assertGreater(metrics["output_throughput"], 4000) + + +class TestDeepseekMTP(CustomTestCase): + @classmethod + def setUpClass(cls): + cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST + cls.base_url = DEFAULT_URL_FOR_TEST + cls.process = popen_launch_server( + cls.model, + cls.base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=[ + "--trust-remote-code", + "--tp", + "8", + "--enable-dp-attention", + "--dp", + "8", + "--moe-dense-tp-size", + "1", + "--enable-dp-lm-head", + "--enable-deepep-moe", + "--enable-two-batch-overlap", + "--ep-num-redundant-experts", + "32", + "--ep-dispatch-algorithm", + "dynamic", + "--eplb-algorithm", + "deepseek", + "--cuda-graph-bs", + "64", # TODO: increase it to 128 when TBO is supported in draft_extend + "--max-running-requests", + "512", + "--speculative-algorithm", + "NEXTN", + "--speculative-num-steps", + "1", + "--speculative-eagle-topk", + "1", + "--speculative-num-draft-tokens", + "2", + ], + ) + + @classmethod + def tearDownClass(cls): + kill_process_tree(cls.process.pid) + + def test_gsm8k(self): + args = SimpleNamespace( + num_shots=8, + data_path=None, + num_questions=1250, + parallel=1250, + max_new_tokens=512, + host="http://127.0.0.1", + port=int(self.base_url.split(":")[-1]), + ) + metrics = run_eval_few_shot_gsm8k(args) + print(f"Eval accuracy of GSM8K: {metrics=}") + + self.assertGreater(metrics["accuracy"], 0.93) + + server_info = requests.get(self.base_url + "/get_server_info") + avg_spec_accept_length = server_info.json()["internal_states"][0][ + "avg_spec_accept_length" + ] + print( + f"###test_gsm8k:\n" + f"accuracy={metrics['accuracy']=:.3f}\n" + f"{avg_spec_accept_length=:.3f}\n" + ) + self.assertGreater(avg_spec_accept_length, 1.9) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/srt/test_hybrid_dp_ep.py b/test/srt/test_deepep_small.py similarity index 96% rename from test/srt/test_hybrid_dp_ep.py rename to test/srt/test_deepep_small.py index 1a4ddd79f6f2..a60f8296c67b 100644 --- a/test/srt/test_hybrid_dp_ep.py +++ b/test/srt/test_deepep_small.py @@ -1,6 +1,3 @@ -# Comprehensive test for hybrid parallelism (DP/TP attention, DP/TP Dense FFN, TP/EP Sparse FFN, DP/VP LM head) plus speculative decoding. -# These tests are not run by default but can be launched on demand. - import unittest from types import SimpleNamespace @@ -8,7 +5,6 @@ from sglang.srt.utils import kill_process_tree from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k -from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST_MLA, DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN, @@ -19,9 +15,6 @@ ) -# DEFAULT_MODEL_NAME_FOR_TEST_MLA = "/dev/shm/DeepSeek-V3-0324" - - class TestPureDP(CustomTestCase): @classmethod def setUpClass(cls): @@ -151,6 +144,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.62) +@unittest.skip("covered in test_deepep_large.py") class TestNoGatherdBuffer(CustomTestCase): @classmethod def setUpClass(cls): @@ -245,6 +239,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.62) +@unittest.skip("covered in TestMTPWithTBO") class TestMTP(CustomTestCase): @classmethod def setUpClass(cls): @@ -310,7 +305,7 @@ def test_gsm8k(self): f"accuracy={metrics['accuracy']=:.3f}\n" f"{avg_spec_accept_length=:.3f}\n" ) - self.assertGreater(avg_spec_accept_length, 2.3) + self.assertGreater(avg_spec_accept_length, 2.1) class TestMTPWithTBO(CustomTestCase): @@ -382,7 +377,7 @@ def test_gsm8k(self): f"accuracy={metrics['accuracy']=:.3f}\n" f"{avg_spec_accept_length=:.3f}\n" ) - self.assertGreater(avg_spec_accept_length, 2.3) + self.assertGreater(avg_spec_accept_length, 2.1) if __name__ == "__main__": From e0103ae83beda5acf5cf7e6a7ea78e21a60f469f Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 9 Jul 2025 00:03:00 -0700 Subject: [PATCH 20/30] fix --- test/srt/test_deepep_large.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/srt/test_deepep_large.py b/test/srt/test_deepep_large.py index a15c29eaaf03..083b14034b16 100644 --- a/test/srt/test_deepep_large.py +++ b/test/srt/test_deepep_large.py @@ -66,7 +66,7 @@ def test_gsm8k(self): print(f"Eval accuracy of GSM8K: {metrics=}") self.assertGreater(metrics["accuracy"], 0.93) - self.assertGreater(metrics["output_throughput"], 4000) + self.assertGreater(metrics["output_throughput"], 3800) class TestDeepseekMTP(CustomTestCase): @@ -129,7 +129,7 @@ def test_gsm8k(self): print(f"Eval accuracy of GSM8K: {metrics=}") self.assertGreater(metrics["accuracy"], 0.93) - + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" From f3c2b84e00ad229b8a4dbf5020b78721e79f33c6 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 9 Jul 2025 00:22:46 -0700 Subject: [PATCH 21/30] recover original tests --- .github/workflows/pr-test.yml | 253 +++++++++++++++++++++++++++++++++- test/srt/run_suite.py | 5 +- 2 files changed, 251 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 8b234ff3bbe9..abe1a7abddd3 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -31,6 +31,249 @@ concurrency: cancel-in-progress: true jobs: + unit-test-frontend: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: 1-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_dependency.sh + - name: Run test + timeout-minutes: 10 + run: | + cd test/lang + python3 run_suite.py --suite per-commit + unit-test-backend-1-gpu: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: 1-gpu-runner + strategy: + fail-fast: false + matrix: + part: [0, 1, 2, 3, 4, 5, 6, 7, 8] + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_dependency.sh + - name: Run test + timeout-minutes: 30 + run: | + cd test/srt + python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 9 + unit-test-backend-2-gpu: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: 2-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_dependency.sh + - name: Run test + timeout-minutes: 30 + run: | + cd test/srt + python3 run_suite.py --suite per-commit-2-gpu + + unittest-test-backend-4-gpu: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + needs: [unit-test-frontend, unit-test-backend-2-gpu] + runs-on: 4-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_dependency.sh + - name: Run test + timeout-minutes: 30 + run: | + cd test/srt + python3 run_suite.py --suite per-commit-4-gpu + unittest-test-backend-8-gpu: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + needs: [unit-test-frontend, unit-test-backend-2-gpu] + runs-on: 8-gpu-runner + strategy: + fail-fast: false + matrix: + part: [0, 1] + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_dependency.sh + - name: Run test + timeout-minutes: 20 + run: | + cd test/srt + python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 + performance-test-1-gpu-part-1: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: 1-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_dependency.sh + - name: Benchmark single latency + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small + python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default + - name: Benchmark online latency + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default + - name: Benchmark offline throughput + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default + - name: Benchmark offline throughput (Non-streaming, small batch size) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size + - name: Benchmark online latency (EAGLE) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle + performance-test-1-gpu-part-2: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: 1-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_dependency.sh + - name: Benchmark offline throughput (w/o RadixAttention) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache + - name: Benchmark offline throughput (w/ Triton) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend + - name: Benchmark offline throughput (w/ FP8) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8 + - name: Benchmark VLM offline throughput + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput + - name: Benchmark VLM online latency + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency + performance-test-2-gpu: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: 2-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_dependency.sh + - name: Benchmark single latency (TP=2) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 + - name: Benchmark single latency + torch.compile (TP=2) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1 + - name: Benchmark offline throughput (TP=2) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default + - name: Benchmark offline throughput (w/o RadixAttention) (TP=2) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache + - name: Benchmark offline decode throughput (PP=2) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode + - name: Benchmark offline prefill throughput (PP=2) + timeout-minutes: 10 + run: | + cd test/srt + python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill + accuracy-test-1-gpu: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: 1-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_dependency.sh + git clone https://github.com/merrymercy/human-eval.git + cd human-eval + pip install -e . + - name: Evaluate accuracy + timeout-minutes: 20 + run: | + cd test/srt + python3 test_eval_accuracy_large.py + accuracy-test-2-gpu: + if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && + github.event.pull_request.draft == false + runs-on: 2-gpu-runner + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install dependencies + run: | + bash scripts/ci_install_dependency.sh + git clone https://github.com/merrymercy/human-eval.git + cd human-eval + pip install -e . + - name: Evaluate accuracy (TP=2) + timeout-minutes: 20 + run: | + cd test/srt + python3 test_moe_eval_accuracy_large.py unittest-test-deepep-4-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && @@ -54,9 +297,9 @@ jobs: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false runs-on: 8-gpu-runner - # needs: [ - # unittest-test-deepep-4-gpu, - # ] + needs: [ + unittest-test-deepep-4-gpu, + ] steps: - name: Checkout code uses: actions/checkout@v4 @@ -74,7 +317,9 @@ jobs: finish: if: always() needs: [ - unittest-test-deepep-4-gpu, unittest-test-deepep-8-gpu, + unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unittest-test-backend-8-gpu, + performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu, + accuracy-test-1-gpu, accuracy-test-2-gpu, unittest-test-deepep-4-gpu, unittest-test-deepep-8-gpu, ] runs-on: ubuntu-latest steps: diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 158e86eddca2..9513b03e36f8 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -171,7 +171,7 @@ class TestFile: TestFile("test_multi_instance_release_memory_occupation.py", 64), ], "per-commit-4-gpu-deepep": [ - TestFile("test_deepep_small.py", 666), + TestFile("test_deepep_small.py", 531), ], "per-commit-4-gpu-amd": [ TestFile("test_pp_single_node.py", 150), @@ -185,8 +185,7 @@ class TestFile: TestFile("test_full_deepseek_v3.py", 463), ], "per-commit-8-gpu-deepep": [ - TestFile("test_deepep_large.py", 666), - # TestFile("test_moe_deepep_eval_accuracy_large.py", 250), + TestFile("test_deepep_large.py", 485), ], "per-commit-8-gpu-amd": [ TestFile("test_full_deepseek_v3.py", 250), From ace18b3b9f5c7b47634f0a42c472deb76dc585da Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 9 Jul 2025 00:24:35 -0700 Subject: [PATCH 22/30] fix name --- .github/workflows/pr-test.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index abe1a7abddd3..860b8b08e5e9 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -84,7 +84,7 @@ jobs: cd test/srt python3 run_suite.py --suite per-commit-2-gpu - unittest-test-backend-4-gpu: + unit-test-backend-4-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false needs: [unit-test-frontend, unit-test-backend-2-gpu] @@ -101,7 +101,7 @@ jobs: run: | cd test/srt python3 run_suite.py --suite per-commit-4-gpu - unittest-test-backend-8-gpu: + unit-test-backend-8-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false needs: [unit-test-frontend, unit-test-backend-2-gpu] @@ -275,7 +275,7 @@ jobs: cd test/srt python3 test_moe_eval_accuracy_large.py - unittest-test-deepep-4-gpu: + unit-test-deepep-4-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false runs-on: 4-gpu-runner @@ -293,12 +293,12 @@ jobs: cd test/srt python3 run_suite.py --suite per-commit-4-gpu-deepep - unittest-test-deepep-8-gpu: + unit-test-deepep-8-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false runs-on: 8-gpu-runner needs: [ - unittest-test-deepep-4-gpu, + unit-test-deepep-4-gpu, ] steps: - name: Checkout code @@ -317,9 +317,9 @@ jobs: finish: if: always() needs: [ - unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unittest-test-backend-8-gpu, + unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unit-test-backend-8-gpu, performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu, - accuracy-test-1-gpu, accuracy-test-2-gpu, unittest-test-deepep-4-gpu, unittest-test-deepep-8-gpu, + accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-4-gpu, unit-test-deepep-8-gpu, ] runs-on: ubuntu-latest steps: From 4e92d4d9015b7f2a3efa81964f15285aaaf87f69 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 9 Jul 2025 00:26:13 -0700 Subject: [PATCH 23/30] update dependency --- .github/workflows/pr-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 860b8b08e5e9..6cf94ce29944 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -319,7 +319,7 @@ jobs: needs: [ unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unit-test-backend-8-gpu, performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu, - accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-4-gpu, unit-test-deepep-8-gpu, + accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-8-gpu, ] runs-on: ubuntu-latest steps: From 6b5442b819bd578efe32855fb6d19c8f2f9b6db8 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 9 Jul 2025 00:27:44 -0700 Subject: [PATCH 24/30] update dependency --- .github/workflows/pr-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 6cf94ce29944..860b8b08e5e9 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -319,7 +319,7 @@ jobs: needs: [ unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unit-test-backend-8-gpu, performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu, - accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-8-gpu, + accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-4-gpu, unit-test-deepep-8-gpu, ] runs-on: ubuntu-latest steps: From 311fc269bffc58af7f61c4a23734af27516960ef Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 9 Jul 2025 00:29:14 -0700 Subject: [PATCH 25/30] format --- .github/workflows/pr-test.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 860b8b08e5e9..b9a1ad98b30d 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -47,6 +47,7 @@ jobs: run: | cd test/lang python3 run_suite.py --suite per-commit + unit-test-backend-1-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false @@ -67,6 +68,7 @@ jobs: run: | cd test/srt python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 9 + unit-test-backend-2-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false @@ -101,6 +103,7 @@ jobs: run: | cd test/srt python3 run_suite.py --suite per-commit-4-gpu + unit-test-backend-8-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false @@ -122,6 +125,7 @@ jobs: run: | cd test/srt python3 run_suite.py --suite per-commit-8-gpu --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 + performance-test-1-gpu-part-1: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false @@ -159,6 +163,7 @@ jobs: run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle + performance-test-1-gpu-part-2: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false @@ -195,6 +200,7 @@ jobs: run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency + performance-test-2-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false @@ -236,6 +242,7 @@ jobs: run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill + accuracy-test-1-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false @@ -255,6 +262,7 @@ jobs: run: | cd test/srt python3 test_eval_accuracy_large.py + accuracy-test-2-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false From f85c7020de02c1eae544bfeb30df964b500f4599 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 9 Jul 2025 00:32:26 -0700 Subject: [PATCH 26/30] recover format --- .github/workflows/pr-test.yml | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index b9a1ad98b30d..73cf2e2a9368 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -42,6 +42,7 @@ jobs: - name: Install dependencies run: | bash scripts/ci_install_dependency.sh + - name: Run test timeout-minutes: 10 run: | @@ -63,6 +64,7 @@ jobs: - name: Install dependencies run: | bash scripts/ci_install_dependency.sh + - name: Run test timeout-minutes: 30 run: | @@ -80,6 +82,7 @@ jobs: - name: Install dependencies run: | bash scripts/ci_install_dependency.sh + - name: Run test timeout-minutes: 30 run: | @@ -98,6 +101,7 @@ jobs: - name: Install dependencies run: | bash scripts/ci_install_dependency.sh + - name: Run test timeout-minutes: 30 run: | @@ -120,6 +124,7 @@ jobs: - name: Install dependencies run: | bash scripts/ci_install_dependency.sh + - name: Run test timeout-minutes: 20 run: | @@ -137,27 +142,32 @@ jobs: - name: Install dependencies run: | bash scripts/ci_install_dependency.sh + - name: Benchmark single latency timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default + - name: Benchmark online latency timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default + - name: Benchmark offline throughput timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default + - name: Benchmark offline throughput (Non-streaming, small batch size) timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size + - name: Benchmark online latency (EAGLE) timeout-minutes: 10 run: | @@ -175,26 +185,31 @@ jobs: - name: Install dependencies run: | bash scripts/ci_install_dependency.sh + - name: Benchmark offline throughput (w/o RadixAttention) timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache + - name: Benchmark offline throughput (w/ Triton) timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend + - name: Benchmark offline throughput (w/ FP8) timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8 + - name: Benchmark VLM offline throughput timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput + - name: Benchmark VLM online latency timeout-minutes: 10 run: | @@ -212,37 +227,43 @@ jobs: - name: Install dependencies run: | bash scripts/ci_install_dependency.sh + - name: Benchmark single latency (TP=2) timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 + - name: Benchmark single latency + torch.compile (TP=2) timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1 + - name: Benchmark offline throughput (TP=2) timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default + - name: Benchmark offline throughput (w/o RadixAttention) (TP=2) timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache + - name: Benchmark offline decode throughput (PP=2) timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_pp_offline_throughput_default_decode + - name: Benchmark offline prefill throughput (PP=2) timeout-minutes: 10 run: | cd test/srt python3 -m unittest test_bench_serving.TestBenchServing.test_pp_long_context_prefill - + accuracy-test-1-gpu: if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && github.event.pull_request.draft == false @@ -257,6 +278,7 @@ jobs: git clone https://github.com/merrymercy/human-eval.git cd human-eval pip install -e . + - name: Evaluate accuracy timeout-minutes: 20 run: | @@ -277,6 +299,7 @@ jobs: git clone https://github.com/merrymercy/human-eval.git cd human-eval pip install -e . + - name: Evaluate accuracy (TP=2) timeout-minutes: 20 run: | From f76e0d18b64c7e00bdee0940d693d24d01dd00de Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 9 Jul 2025 00:35:10 -0700 Subject: [PATCH 27/30] fix dependency --- .github/workflows/pr-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 73cf2e2a9368..2378695e21ee 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -348,8 +348,8 @@ jobs: finish: if: always() needs: [ - unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unit-test-backend-8-gpu, - performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu, + unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unit-test-backend-4-gpu, + unit-test-backend-8-gpu, performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu, accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-4-gpu, unit-test-deepep-8-gpu, ] runs-on: ubuntu-latest From ebd4aba317673ba7253c19bc0d0aeea54ae26bc0 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 9 Jul 2025 00:37:51 -0700 Subject: [PATCH 28/30] fix --- test/srt/run_suite.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 9513b03e36f8..90e4f0094047 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -177,7 +177,6 @@ class TestFile: TestFile("test_pp_single_node.py", 150), ], "per-commit-8-gpu": [ - # Disabled deepep tests temporarily because it takes too much time. # Disabled because it hangs on the CI. # TestFile("test_moe_ep.py", 181), TestFile("test_disaggregation.py", 270), From 0598f368a64c2424f58ea4a9ab5831574d9dd238 Mon Sep 17 00:00:00 2001 From: Cheng Wan Date: Wed, 9 Jul 2025 01:17:18 -0700 Subject: [PATCH 29/30] remove throughput check --- test/srt/test_deepep_large.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/srt/test_deepep_large.py b/test/srt/test_deepep_large.py index 083b14034b16..8afb2896f8f8 100644 --- a/test/srt/test_deepep_large.py +++ b/test/srt/test_deepep_large.py @@ -66,7 +66,6 @@ def test_gsm8k(self): print(f"Eval accuracy of GSM8K: {metrics=}") self.assertGreater(metrics["accuracy"], 0.93) - self.assertGreater(metrics["output_throughput"], 3800) class TestDeepseekMTP(CustomTestCase): From 755611eedb123f234fe1a36709b57ce6801e50e1 Mon Sep 17 00:00:00 2001 From: Cheng Wan <54331508+ch-wan@users.noreply.github.com> Date: Wed, 9 Jul 2025 01:20:08 -0700 Subject: [PATCH 30/30] Apply suggestions from code review minor Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- scripts/ci_install_deepep.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ci_install_deepep.sh b/scripts/ci_install_deepep.sh index e41dde6da6b1..aa4dab097bb6 100755 --- a/scripts/ci_install_deepep.sh +++ b/scripts/ci_install_deepep.sh @@ -47,7 +47,7 @@ rm -rf /root/.cache/deepep && git clone https://github.com/deepseek-ai/DeepEP.gi cd /opt/nvshmem wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz tar -xf nvshmem_src_3.2.5-1.txz -rm nvshmem && mv nvshmem_src nvshmem +rm -rf nvshmem && mv nvshmem_src nvshmem cd nvshmem git apply /root/.cache/deepep/third-party/nvshmem.patch NVSHMEM_SHMEM_SUPPORT=0 \ @@ -63,7 +63,7 @@ cd build make -j$(nproc) install # Install DeepEP -cd /root/.cache/deepep && git checkout eef7ab50fa5cf0ab1dd3fce4c6493c90bdf290ac && python3 setup.py install +cd /root/.cache/deepep && python3 setup.py install # Verify configuration echo "=== NCCL Configuration ==="