95 commits
1e53e23
support w4a8 low latency deepep
ayrnb Jul 23, 2025
93cb396
clean code
ayrnb Jul 24, 2025
31d01f9
clean code
ayrnb Jul 24, 2025
157b979
clean code
ayrnb Jul 24, 2025
5dd0f87
[bug] fix pd completion protocol for batching support (#8317)
slin1237 Jul 24, 2025
f6e07f2
[router] fix pd model completion request (#8303)
slin1237 Jul 24, 2025
bfb118c
fix bug when eos_ids==0 (#8315)
bzantium Jul 24, 2025
2f86f3a
[router] add endpoint unit test (#8298)
slin1237 Jul 24, 2025
a167fd0
[code style] Clean dead triton kernel code in fused_moe and useless v…
BBuf Jul 24, 2025
96c5d85
fix
ayrnb Jul 24, 2025
0090240
fix
ayrnb Jul 24, 2025
8d1c5b9
chore: upgrade flashinfer v0.2.9rc1 (#8301)
Swipe4057 Jul 24, 2025
33c4b4d
[router] add streaming unit test (#8299)
slin1237 Jul 24, 2025
39fe1e8
[router] add request format unit test (#8300)
slin1237 Jul 24, 2025
145482f
HiCache Storage TP Refinement (#8307)
xiezhq-hermann Jul 25, 2025
d40846d
breakdown kernel update (#8334)
xiezhq-hermann Jul 25, 2025
f4674df
support idle batch for TBO (#8233)
sherry-1001 Jul 25, 2025
28d4d47
[Feature] Integrate quick allreduce and select the best allreduce imp…
lihaoyang-amd Jul 25, 2025
c0fb25e
DP Enhancement (#8280)
ch-wan Jul 25, 2025
7ad6b76
fix: Fix failed functional tests https://github.com/meta-llama/llama-…
ynwang007 Jul 25, 2025
af4b9ba
[AMD] Add silu_and_mul, gelu_and_mul, gelu_tanh_and_mul, and gelu_qui…
hubertlu-tw Jul 25, 2025
15d2759
[CPU] Add tutorial docs for SGL on CPU (#8000)
ZailiWang Jul 25, 2025
70e37b9
chore: upgrade mooncake 0.3.5 (#8341)
ShangmingCai Jul 25, 2025
9045cc1
[torch.compile bug] avoid biased_grouped_topk_impl func repeatedly tr…
BBuf Jul 25, 2025
1b9cea5
[P/D] Support ipv6 in P/D scenario (#7858)
thefacetakt Jul 25, 2025
12cb760
Add H20-3e fused MoE kernel tuning configs for Qwen3-Coder-480B-A35B-…
Xu-Wenqing Jul 25, 2025
f8260f2
[Bugfix][Feat] Add XML-ish grammar in EBNFComposer and fix misc bugs …
CatherineSue Jul 25, 2025
ed2e313
Clean up server_args, triton cache manager (#8332)
merrymercy Jul 25, 2025
7181ec8
fix: upgrade nccl version (#8359)
zhyncs Jul 25, 2025
d8ee156
[Feat] Add reasoning parser for Qwen/Qwen3-235B-A22B-Thinking-2507 (#…
CatherineSue Jul 25, 2025
f8ca236
fix: kimi k2 xgrammar crash (#8367)
zhyncs Jul 25, 2025
58c468f
Fix FP4 MoE accuracy from missing routed_scaling_factor (#8333)
trevor-m Jul 25, 2025
3ec0b21
[CI] Fix flaky threshold (#8370)
merrymercy Jul 25, 2025
2272c2a
chore: bump v0.4.9.post4 (#8305)
zhyncs Jul 26, 2025
8af145b
Fix test_moe_fused_gate_combined sgl-kernel ci test (#8374)
ispobock Jul 26, 2025
e6312d2
Uodate Dockerfile.gb200 to latest sglang (#8356)
kyleliang-nv Jul 26, 2025
4fa44d6
chore: improve mmmu benchmark (#7000)
mickqian Jul 26, 2025
e236d8f
Save peak memory in logits processor (#8343)
ch-wan Jul 26, 2025
ce32bc2
Extract update_weights from RL Engine to SGLang to keep simplicity an…
hebiao064 Jul 26, 2025
5347567
chore: improvements on mm_utils (#7737)
mickqian Jul 26, 2025
3212c2a
vlm: optimize tensor transport (#6003)
mickqian Jul 26, 2025
da0c026
Tiny assert EPLB is used together with expert parallel (#8381)
fzyzcjy Jul 26, 2025
b7094a5
model: support intern-s1 (#8350)
RunningLeon Jul 26, 2025
5c705b1
Add perf tests for LoRA (#8314)
lifuhuang Jul 26, 2025
7615463
Remove slot usage in code to be backward-compatible with python 3.9 (…
lifuhuang Jul 27, 2025
62a6b7c
Add docker release flow for gb200 (#8394)
kyleliang-nv Jul 27, 2025
528bd1e
HiCache, check before terminate prefetching (#8372)
xiezhq-hermann Jul 27, 2025
426b749
Add nvfp4 scaled mm benchmark. (#8401)
HydraQYH Jul 27, 2025
b602f42
Urgent Fix: intern-s1 chat-template matching (#8403)
JustinTong0323 Jul 27, 2025
ed0fdbf
Tool to dump and compare internal activation tensors (#7976)
fzyzcjy Jul 27, 2025
62222bd
Minor tool for comparison of benchmark results (#7974)
fzyzcjy Jul 27, 2025
e34cf6a
Fix bench script making input data on L2 cache (#7739)
fzyzcjy Jul 27, 2025
85486b6
[NVIDIA] Add Flashinfer MoE blockscale fp8 backend (#8036)
kaixih Jul 27, 2025
91e3d15
Update Cutlass in sgl-kernel to v4.1 (#8392)
Fridge003 Jul 27, 2025
0bcc195
fix: minor fix TransportProxyTensor under tp (#8382)
mickqian Jul 27, 2025
2ab9702
[router] add different policies for p node and d node (#8395)
slin1237 Jul 27, 2025
2a1936d
Add A800 fused MoE kernel tuning configs for Qwen3-Coder-480B-A35B-In…
lambert0312 Jul 27, 2025
36d6f0b
fix: fix the missing metrics on non-rank0 nodes (#7720)
acelyc111 Jul 27, 2025
bf0f448
[2/N] MoE Refactor: Unify weight loader and quant methods (#8397)
ch-wan Jul 27, 2025
5c9c275
Use FlashInfer FP4 gemm. (#8241)
elfiegg Jul 27, 2025
44d600c
Support precomputed_embeddings for Llama 4 (#8156)
AlienKevin Jul 27, 2025
4d921f2
[hotfix] fix merge conflicts in FlashInferEPMoE (#8405)
ch-wan Jul 27, 2025
bf3352c
chore: update CODEOWNERS (#8407)
zhyncs Jul 27, 2025
10ee895
chore: upgrade flashinfer v0.2.9rc2 (#8406)
zhyncs Jul 27, 2025
b3eac16
Support triton kernels v3.4.0 for fused_moe (#8258)
yuan-luo Jul 27, 2025
22e00ee
[Bugfix] Prevent PD server crash from invalid grammar (#8062)
ShangmingCai Jul 27, 2025
95217a9
Change to use native arm runner (#8414)
kyleliang-nv Jul 27, 2025
df90645
Support overlapped lora updates (#8213)
lifuhuang Jul 27, 2025
b58c3c2
Support ue8m0 for triton quant kernel (#7603)
fzyzcjy Jul 27, 2025
e983d66
Fix: Improve test_openai_function_calling unit test and fix reasoning…
byjiang1996 Jul 27, 2025
b47eda3
bugfix: Fix multiple finish_reason chunks and tool_calls finish reaso…
CatherineSue Jul 27, 2025
58dd95f
Fix test_openai_server (#8419)
CatherineSue Jul 27, 2025
bb81dae
Fix docker buildx push error (#8425)
kyleliang-nv Jul 28, 2025
dd487e5
bugfix: Fix XGrammar backend to use model's EOS tokens for constraine…
CatherineSue Jul 28, 2025
fe6a445
[router] improve router logs and request id header (#8415)
slin1237 Jul 28, 2025
2810338
[feat] Support different attention backends for prefill and decode (…
Qiaolin-Yu Jul 28, 2025
4ad9737
chore: bump transformer to 4.54.0 (#8416)
hebiao064 Jul 28, 2025
2fd5c70
[PD] Fix abort_request for PD disaggregation (#8352)
ShangmingCai Jul 28, 2025
6d6a8bc
GLM-4.5 Model Support (#8224)
zRzRzRzRzRzRzR Jul 28, 2025
5922c0c
Remove zstd compression for building Dockerfile.gb200 (#8442)
kyleliang-nv Jul 28, 2025
484d0e0
doc: add bench_one_batch_server in the benchmark doc (#8441)
Qiaolin-Yu Jul 28, 2025
581e7dc
GLM-4.5 Model Support Follow-up (#8445)
byjiang1996 Jul 28, 2025
25f73c6
fix GLM4_MOE launch with compressed_tensor quant model (#8456)
zminglei Jul 28, 2025
fb4ce17
Fix per_token_group_quant_8bit when hidden_dim // group_size is not d…
strgrb Jul 28, 2025
2262369
Revert "[kernel] opt moe align block kernel by block/warp scan algori…
BBuf Jul 28, 2025
45bc170
chore: bump v0.4.9.post5 (#8458)
zhyncs Jul 28, 2025
a9dd3ec
fix:reorder topk experts to ensure shared expert replaces minimal sco…
erictanjn Jul 28, 2025
712877a
support w4a8 low latency deepep
ayrnb Jul 23, 2025
77351b7
clean code
ayrnb Jul 24, 2025
c15e34a
clean code
ayrnb Jul 24, 2025
f770ea6
clean code
ayrnb Jul 24, 2025
cfe7d62
fix
ayrnb Jul 24, 2025
d2afdb4
fix
ayrnb Jul 24, 2025
eb39568
Merge branch 'feat/w4a8_support_ll_deepep' of github.com:bytedance-ia…
ayrnb Jul 28, 2025
1e721d4
support cudagraph
ayrnb Jul 28, 2025
6 changes: 3 additions & 3 deletions .github/CODEOWNERS
@@ -6,19 +6,19 @@
 /python/sglang/srt/constrained @hnyls2002
 /python/sglang/srt/disaggregation @ByronHsu @hnyls2002
 /python/sglang/srt/distributed @yizhang2077
-/python/sglang/srt/entrypoints @zhaochenyang20 @CatherineSue
+/python/sglang/srt/entrypoints @ispobock @CatherineSue @slin1237
 /python/sglang/srt/eplb @fzyzcjy
 /python/sglang/srt/function_call @CatherineSue
 /python/sglang/srt/layers @merrymercy @Ying1123 @zhyncs @ispobock @HaiShaw @ch-wan @BBuf
 /python/sglang/srt/lora @Ying1123 @Fridge003
 /python/sglang/srt/managers @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
 /python/sglang/srt/mem_cache @merrymercy @Ying1123 @hnyls2002 @xiezhq-hermann
 /python/sglang/srt/model_executor @merrymercy @Ying1123 @hnyls2002 @zhyncs @ispobock
-/python/sglang/srt/models @zhyncs @ispobock @ByronHsu @zhaochenyang20
+/python/sglang/srt/models @zhyncs @ispobock @ByronHsu @JustinTong0323
 /python/sglang/srt/multimodal @mickqian @JustinTong0323
 /python/sglang/srt/sampling @hnyls2002
 /python/sglang/srt/speculative @Ying1123 @merrymercy @rkooo567 @kssteven418
 /test/lang @merrymercy @Ying1123
 /test/srt @merrymercy @Ying1123 @zhyncs
 /sgl-router @ByronHsu @slin1237
-/sgl-kernel @zhyncs @ispobock @HandH1998 @BBuf @yizhang2077 @merrymercy @yinfan98 @HaiShaw
+/sgl-kernel @zhyncs @ispobock @HandH1998 @BBuf @yizhang2077 @merrymercy @FlamingoPg @HaiShaw
2 changes: 1 addition & 1 deletion .github/workflows/pr-test-pd-router.yml
@@ -114,7 +114,7 @@ jobs:
       run: |
         echo "Installing SGLang with all extras..."
         python3 -m pip --no-cache-dir install -e "python[all]" --break-system-packages
-        python3 -m pip --no-cache-dir install mooncake-transfer-engine==0.3.4.post2
+        python3 -m pip --no-cache-dir install mooncake-transfer-engine==0.3.5

     - name: Build and install sgl-router
       run: |
7 changes: 7 additions & 0 deletions .github/workflows/pr-test.yml
@@ -174,6 +174,13 @@ jobs:
         cd test/srt
         python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle

+      - name: Benchmark online latency (LoRA)
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency
+          python3 -m unittest test_bench_serving.TestBenchServing.test_lora_online_latency_with_concurrent_adapter_updates
+
   performance-test-1-gpu-part-2:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
36 changes: 36 additions & 0 deletions .github/workflows/release-docker-gb200.yml
@@ -0,0 +1,36 @@
+name: Release Docker Images (GB200)
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "python/sglang/version.py"
+  workflow_dispatch:
+
+jobs:
+  publish:
+    if: github.repository == 'sgl-project/sglang'
+    runs-on: ubuntu-22.04-arm
+    environment: 'prod'
+    steps:
+      - name: Delete huge unnecessary tools folder
+        run: rm -rf /opt/hostedtoolcache
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Login to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+      - name: Build and Push
+        run: |
+          version=$(cat python/sglang/version.py | cut -d'"' -f2)
+          tag=v${version}-cu128-gb200
+
+          docker buildx build --platform linux/arm64 --push --output type=image -t lmsysorg/sglang:${tag} -f docker/Dockerfile.gb200 --build-arg CUDA_VERSION=12.8.1 --build-arg BUILD_TYPE=blackwell --no-cache .
2 changes: 1 addition & 1 deletion .github/workflows/vllm-dependency-test.yml
@@ -30,7 +30,7 @@ jobs:
     - name: Install dependencies
       run: |
        bash scripts/ci_install_dependency.sh
-        pip install "vllm==0.9.0.1"
+        pip install "vllm==0.10.0"
        pip install "bitsandbytes>=0.44.0"

     - name: Run VLLM dependency tests
6 changes: 5 additions & 1 deletion .pre-commit-config.yaml
@@ -39,7 +39,11 @@ repos:
       - id: codespell
        additional_dependencies: ['tomli']
        args: ['--toml', 'python/pyproject.toml', '-L', 'cann']
-       exclude: test/srt/test_reasoning_parser.py # Exclude the test file that is expected to fail
+       exclude: |
+         (?x)^(
+           test/srt/test_reasoning_parser\.py|
+           docs/backend/vlm_query\.ipynb
+         )$
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v18.1.8
     hooks:
2 changes: 1 addition & 1 deletion benchmark/deepseek_v3/README.md
@@ -33,7 +33,7 @@ Add [performance optimization options](#performance-optimization-options) as needed.

 ```bash
 # Installation
-pip install "sglang[all]>=0.4.9.post3"
+pip install "sglang[all]>=0.4.9.post5"

 # Launch
 python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code
7 changes: 7 additions & 0 deletions benchmark/gsm8k/bench_sglang.py
@@ -10,6 +10,7 @@
 from sglang.api import set_default_backend
 from sglang.test.test_utils import (
     add_common_sglang_args_and_parse,
+    dump_bench_raw_result,
     select_sglang_backend,
 )
 from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
@@ -115,6 +116,12 @@ def few_shot_gsm8k(s, question):

     # Dump results
     dump_state_text(f"tmp_output_{args.backend}.txt", states)
+    dump_bench_raw_result(
+        path=args.raw_result_file,
+        states=states,
+        preds=preds,
+        labels=labels,
+    )

     with open(args.result_file, "a") as fout:
         value = {
@@ -33,7 +33,11 @@ def get_model_config(model_name: str, tp_size: int):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
-    elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
+    elif config.architectures[0] in [
+        "DeepseekV2ForCausalLM",
+        "DeepseekV3ForCausalLM",
+        "Glm4MoeForCausalLM",
+    ]:
         E = (
             config.n_routed_experts + 1
             if config.architectures[0] in ["DeepseekV3ForCausalLM"]
@@ -42,7 +42,11 @@ def get_model_config(model_name: str, tp_size: int):
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
         shard_intermediate_size = 2 * intermediate_size // tp_size
-    elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
+    elif config.architectures[0] in [
+        "DeepseekV2ForCausalLM",
+        "DeepseekV3ForCausalLM",
+        "Glm4MoeForCausalLM",
+    ]:
         E = (
             config.n_routed_experts + 1
             if config.architectures[0] in ["DeepseekV3ForCausalLM"]
8 changes: 8 additions & 0 deletions benchmark/mmlu/bench_sglang.py
@@ -9,6 +9,7 @@

 from sglang.test.test_utils import (
     add_common_sglang_args_and_parse,
+    dump_bench_raw_result,
     select_sglang_backend,
 )

@@ -142,6 +143,13 @@ def few_shot_mmlu(s, examples, question):
     assert pt == len(cors)
     weighted_acc = np.mean(cors)

+    dump_bench_raw_result(
+        path=args.raw_result_file,
+        states=states,
+        preds=preds,
+        labels=labels,
+    )
+
     # Print results
     print("Total latency: {:.3f}".format(latency))
     print("Average accuracy: {:.3f}".format(weighted_acc))
31 changes: 20 additions & 11 deletions benchmark/mmmu/bench_sglang.py
@@ -125,7 +125,6 @@ async def eval_mmmu(args) -> None:
     client = openai.AsyncOpenAI(
         api_key="sk", base_url=f"http://127.0.0.1:{args.port}/v1"
     )
-    semaphore = asyncio.Semaphore(args.concurrency)
     start = time.perf_counter()
     base_url = f"http://127.0.0.1:{args.port}"

@@ -139,16 +138,26 @@ async def eval_mmmu(args) -> None:

     samples = samples[: args.profile_number]

-    tasks = [
-        process_sample_with_semaphore(
-            semaphore, client, sample, sampling_params, lora_path
-        )
-        for sample in samples
-    ]
-
-    for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
-        sample, response = await coro
-        process_result(response, sample, answer_dict, out_samples)
+    if args.concurrency == 1:
+        # For concurrency == 1, run in sequential mode to ensure consistent order
+        # this is mainly for profiling
+        for sample in tqdm(samples):
+            _, response = await process_sample(
+                client, sample, sampling_params, lora_path
+            )
+            process_result(response, sample, answer_dict, out_samples)
+    else:
+        semaphore = asyncio.Semaphore(args.concurrency)
+        tasks = [
+            process_sample_with_semaphore(
+                semaphore, client, sample, sampling_params, lora_path
+            )
+            for sample in samples
+        ]
+
+        for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
+            sample, response = await coro
+            process_result(response, sample, answer_dict, out_samples)

     if args.profile:
         print("Stopping profiler...")
7 changes: 4 additions & 3 deletions benchmark/mmmu/eval_utils.py
@@ -27,8 +27,7 @@
 class EvalArgs:
     seed: int = 42
     split: str = "validation"
-    # Default setting to make the benchmark available on A100 for most 7B models
-    image_pixels_limit: int = 4300000
+    image_pixels_limit: int = -1
     result_filename: str = ""
     prompt_format_file: str = "prompt_format.yaml"
     dataset_path: str = "MMMU/MMMU"
@@ -190,7 +189,7 @@ def process_sample(i, sample):
     sample = construct_prompt(sample, eval_args.config)
     image = sample["image"]
     width, height = image.size
-    if width * height >= eval_args.image_pixels_limit:
+    if 0 < eval_args.image_pixels_limit <= width * height:
         return None, True
     # Use a unique identifier for the image path to avoid potential collisions if indices reset
     image_path = f"{images_path}/image_{sample['id']}.png"
@@ -217,6 +216,8 @@ def process_sample(i, sample):
     elif sample:
         samples.append(sample)

+    samples.sort(key=lambda x: x["final_input_prompt"])
+
     print(
         f"Skipping {skip_count} samples with large images, {round((float(skip_count) / len(dataset)) * 100, 2)}% of dataset"
     )
4 changes: 2 additions & 2 deletions docker/Dockerfile
@@ -58,8 +58,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
     *) echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1 ;; \
     esac \
     && python3 -m pip install --no-cache-dir -e "python[${BUILD_TYPE}]" --extra-index-url https://download.pytorch.org/whl/cu${CUINDEX} \
-    && python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \
     && if [ "$CUDA_VERSION" = "12.8.1" ]; then \
+         python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps ; \
          python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.2.7/sgl_kernel-0.2.7+cu128-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
        fi

@@ -86,7 +86,7 @@ RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/sour
 # Python tools
 RUN python3 -m pip install --no-cache-dir \
     datamodel_code_generator \
-    mooncake_transfer_engine==0.3.4.post2 \
+    mooncake-transfer-engine==0.3.5 \
     pre-commit \
     pytest \
     black \