cnwenf
diff --git a/‎.github/workflows/pr-test-amd.yml‎
Lines changed: 44 additions & 3 deletions b/‎.github/workflows/pr-test-amd.yml‎
Lines changed: 44 additions & 3 deletions
diff --git a/‎docs/backend/native_api.ipynb‎
Lines changed: 10 additions & 4 deletions b/‎docs/backend/native_api.ipynb‎
Lines changed: 10 additions & 4 deletions
diff --git a/‎docs/references/deploy_on_k8s.md‎
Lines changed: 1 addition & 9 deletions b/‎docs/references/deploy_on_k8s.md‎
Lines changed: 1 addition & 9 deletions
diff --git a/‎python/sglang/bench_one_batch.py‎
Lines changed: 20 additions & 0 deletions b/‎python/sglang/bench_one_batch.py‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎python/sglang/srt/configs/model_config.py‎
Lines changed: 2 additions & 1 deletion b/‎python/sglang/srt/configs/model_config.py‎
Lines changed: 2 additions & 1 deletion
@@ -56,16 +56,15 @@ jobs:
           docker exec -w /human-eval ci_sglang pip install -e .
 
           docker exec -w / ci_sglang mkdir -p /dummy-grok
-          mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -P dummy-grok
-          docker cp ./dummy-grok ci_sglang:/dummy-grok/
+          mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
+          docker cp ./dummy-grok ci_sglang:/
 
       - name: Evaluate Accuracy
         timeout-minutes: 20
         run: |
           docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_accuracy_large.py
           docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_fp8_accuracy.py
           docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_qwen_models.py
-          docker exec -w /sglang-checkout -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 8 --model /dummy-grok --tokenizer-path Xenova/grok-1-tokenizer --load-format dummy --tp 8 --quantization fp8
 
   mla-test-1-gpu-amd:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -105,6 +104,48 @@ jobs:
         run: |
           docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_mla.py
 
+  # Benchmark job: starts a ROCm sglang container on the `linux-mi300-gpu-2`
+  # runner, installs the checked-out sources into it, and runs the dummy Grok
+  # model test script. Runs for the sgl-project/sglang repository or for any
+  # pull_request event, and only when the pull request is not a draft.
+  bench-test-2-gpu-amd:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+      github.event.pull_request.draft == false
+    runs-on: linux-mi300-gpu-2
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Start a detached container named `ci_sglang` with the workspace
+      # mounted at /sglang-checkout; later steps reach it via `docker exec`.
+      - name: Setup docker
+        run: |
+          # Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
+          if [ -f "/etc/podinfo/gha-render-devices" ]; then
+            DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
+          else
+            DEVICE_FLAG="--device /dev/dri"
+          fi
+          docker pull lmsysorg/sglang:v0.4.5-rocm630
+          docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
+            -v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
+            --cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
+            -w /sglang-checkout --name ci_sglang \
+            lmsysorg/sglang:v0.4.5-rocm630
+
+      # Inside the container: upgrade pip, replace the installed sgl-kernel
+      # with one built from the checkout using the ROCm pyproject/setup files,
+      # install the `python` package in editable mode with the `dev_hip`
+      # extra, and install human-eval from its git repository. Finally, the
+      # dummy Grok JSON is downloaded on the host as dummy-grok/config.json
+      # and copied into the container as /dummy-grok/config.json.
+      - name: Install dependencies
+        run: |
+          docker exec ci_sglang pip install --upgrade pip
+          docker exec ci_sglang pip uninstall sgl-kernel -y || true
+          docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
+          docker exec ci_sglang pip install -e "python[dev_hip]"
+
+          docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
+          docker exec -w /human-eval ci_sglang pip install -e .
+
+          docker exec -w / ci_sglang mkdir -p /dummy-grok
+          mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
+          docker cp ./dummy-grok ci_sglang:/
+
+      # Run the dummy Grok test script from test/srt with SGLANG_IS_IN_CI=1;
+      # the step is limited to 20 minutes.
+      - name: Evaluate Benchmark
+        timeout-minutes: 20
+        run: |
+          docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_dummy_grok_models.py
+
   finish:
     if: always()
     needs: [
 
@@ -371,7 +371,9 @@
    "source": [
     "## Capture expert selection distribution in MoE models\n",
     "\n",
-    "SGLang Runtime supports recording the number of times an expert is selected in a MoE model run for each expert in the model. This is useful when analyzing the throughput of the model and plan for optimization."
+    "SGLang Runtime supports recording the number of times an expert is selected in a MoE model run for each expert in the model. This is useful when analyzing the throughput of the model and planning optimizations.\n",
+    "\n",
+    "*Note: Only the first 10 lines of the CSV file (the header row plus the first 9 records) are shown below for readability. Please adjust the limit if you want to analyze the results more deeply.*"
    ]
   },
   {
@@ -412,9 +414,13 @@
     "\n",
     "output_file = glob.glob(\"expert_distribution_*.csv\")[0]\n",
     "with open(output_file, \"r\") as f:\n",
-    "    print_highlight(\"Content of dumped record:\")\n",
-    "    for line in f:\n",
-    "        print_highlight(line.strip())"
+    "    print_highlight(\"\\n| Layer ID | Expert ID | Count |\")\n",
+    "    print_highlight(\"|----------|-----------|--------|\")\n",
+    "    next(f)\n",
+    "    for i, line in enumerate(f):\n",
+    "        if i < 9:\n",
+    "            layer_id, expert_id, count = line.strip().split(\",\")\n",
+    "            print_highlight(f\"| {layer_id:8} | {expert_id:9} | {count:6} |\")"
    ]
   },
   {
 
@@ -14,7 +14,7 @@ Here we take the deployment of DeepSeek-R1 as an example.
 
 1. At least two Kubernetes nodes, each with two H20 systems and eight GPUs, are required.
 
-2. Make sure your K8S cluster has LWS correctly installed. If it hasn't been set up yet, please follow the [installation instructions](https://github.com/kubernetes-sigs/lws/blob/main/site/content/en/docs/installation/_index.md).
+2. Make sure your K8S cluster has LWS correctly installed. If it hasn't been set up yet, please follow the [installation instructions](https://github.com/kubernetes-sigs/lws/blob/main/site/content/en/docs/installation/_index.md). **Note:** For LWS versions ≤0.5.x, you must use the Downward API to obtain `LWS_WORKER_INDEX`, as native support for this feature was introduced in v0.6.0.
 
 ## Basic example
 
@@ -95,10 +95,6 @@ spec:
             env:
               - name: NCCL_IB_GID_INDEX
                 value: "3"
-              - name: LWS_WORKER_INDEX
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
             command:
               - python3
               - -m
@@ -164,10 +160,6 @@ spec:
             env:
             - name: NCCL_IB_GID_INDEX
               value: "3"
-            - name: LWS_WORKER_INDEX
-              valueFrom:
-                fieldRef:
-                  fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
             command:
               - python3
               - -m
 
@@ -60,6 +60,7 @@
 from sglang.srt.entrypoints.engine import _set_envs_and_config
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.managers.scheduler import Scheduler
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
@@ -184,6 +185,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
         req.prefix_indices = []
         req.fill_ids = req.origin_input_ids
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+        req.logprob_start_len = len(req.origin_input_ids) - 1
         reqs.append(req)
 
     return input_ids, reqs
@@ -199,6 +201,7 @@ def prepare_extend_inputs_for_correctness_test(
             i, : bench_args.cut_len
         ]
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+        req.logprob_start_len = len(req.origin_input_ids) - 1
     return reqs
 
 
@@ -220,6 +223,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
         req.prefix_indices = []
         req.fill_ids = req.origin_input_ids
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+        req.logprob_start_len = len(req.origin_input_ids) - 1
         reqs.append(req)
 
     return reqs
@@ -238,6 +242,7 @@ def extend(reqs, model_runner):
         enable_custom_logit_processor=False,
     )
     batch.prepare_for_extend()
+    _maybe_prepare_dp_attn_batch(batch, model_runner)
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
@@ -249,13 +254,28 @@ def extend(reqs, model_runner):
 def decode(input_token_ids, batch, model_runner):
+    """Run one decode step for ``batch`` and sample the next tokens.
+
+    Stores ``input_token_ids`` on ``batch.output_ids``, calls
+    ``batch.prepare_for_decode()``, applies the optional DP-attention
+    preparation, then builds a ``ForwardBatch`` from the model worker batch
+    and runs ``model_runner.forward`` followed by ``model_runner.sample``.
+
+    Returns:
+        A tuple ``(next_token_ids, logits_output.next_token_logits)``.
+    """
     batch.output_ids = input_token_ids
     batch.prepare_for_decode()
+    _maybe_prepare_dp_attn_batch(batch, model_runner)
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
     next_token_ids = model_runner.sample(logits_output, forward_batch)
     return next_token_ids, logits_output.next_token_logits
 
 
+def _maybe_prepare_dp_attn_batch(batch: ScheduleBatch, model_runner):
+    """Apply the scheduler's DP-attention batch preparation when enabled.
+
+    Does nothing unless ``model_runner.server_args.enable_dp_attention`` is
+    truthy. When it is, ``batch`` is handed to
+    ``Scheduler.prepare_dp_attn_batch_raw`` together with the runner's
+    ``dp_size``, its ``disable_cuda_graph`` setting and the CPU group of its
+    TP group; ``SpeculativeAlgorithm.NONE`` and no draft-token count are
+    passed for the speculative-decoding arguments.
+
+    Args:
+        batch: The schedule batch that is about to be turned into a model
+            worker batch.
+        model_runner: Object exposing ``server_args`` and ``tp_group``.
+
+    Returns:
+        None. The result of ``prepare_dp_attn_batch_raw`` is discarded.
+    """
+    if model_runner.server_args.enable_dp_attention:
+        Scheduler.prepare_dp_attn_batch_raw(
+            batch,
+            dp_size=model_runner.server_args.dp_size,
+            # NOTE(review): attn_tp_size is fixed at 1 and no idle-batch
+            # factory is supplied; confirm this covers every configuration
+            # the benchmark is expected to run with.
+            attn_tp_size=1,
+            tp_cpu_group=model_runner.tp_group.cpu_group,
+            get_idle_batch=None,
+            disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
+            spec_algorithm=SpeculativeAlgorithm.NONE,
+            speculative_num_draft_tokens=None,
+        )
+
+
 def correctness_test(
     server_args,
     port_args,
 
@@ -279,6 +279,7 @@ def _verify_quantization(self) -> None:
             "moe_wna16",
         ]
         compatible_quantization_methods = {
+            "modelopt_fp4": ["modelopt"],
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
             "w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
         }
@@ -485,8 +486,8 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
     "Gemma3ForConditionalGeneration",
     "Grok1VForCausalLM",
     "Grok1AForCausalLM",
-    # TODO: add multimodal support for "Llama4ForConditionalGeneration",
     "LlavaLlamaForCausalLM",
+    "Llama4ForConditionalGeneration",
     "LlavaMistralForCausalLM",
     "LlavaQwenForCausalLM",
     "LlavaVidForCausalLM",