Skip to content

Commit 13c355c

Browse files
author
wenfeng.wf
committed
Merge branch 'main' into nixl
* main: (29 commits)
  reduce moe_align_block_size_kernel small batch mode overhead (sgl-project#5086)
  Fix DeepSeek error when using DeepEP mode (sgl-project#5190)
  [metrics] Add in queue metrics (sgl-project#4444)
  fix: log warning when disable cuda graph (sgl-project#5209)
  Add H20 dtype fp8_w8a8 fused MoE kernel tuning configs for DeepSeek V3/R1 (sgl-project#5196)
  sgl-kernel use cutlass latest version for fp8 blockwise gemm (sgl-project#5207)
  update grok test (sgl-project#5171)
  model: support mllama4 (sgl-project#5144)
  [ci] fix ci test fused_moe op (sgl-project#5102)
  Support Llama4 fp8 inference (sgl-project#5194)
  Optimize topk operation in llama4 (sgl-project#5128)
  Fix ci test "test_eval_fp8_accuracy" failed (sgl-project#5185)
  [Misc] clean up vllm in sgl-kernel test (sgl-project#5189)
  Let `bench_one_batch` support `enable_dp_attention` (sgl-project#4058)
  [DeepEP] fix: import buffer error (sgl-project#5179)
  fix: use DeepEPDispatcher on CUDA (sgl-project#5180)
  feat: add DeepGEMM build warning (sgl-project#5176)
  docs: remove the use of Downward API for LWS_WORKER_INDEX (sgl-project#5110)
  [Fix] DeepEP Compatibility with Low Latency (sgl-project#5068)
  [Bugfix] Fix index out of bounds in local attention with large sequences (sgl-project#5173)
  ...

# Conflicts:
#	python/sglang/srt/disaggregation/mini_lb.py
#	python/sglang/srt/managers/scheduler.py
2 parents b94369a + f730362 commit 13c355c

83 files changed

Lines changed: 10523 additions & 1638 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/pr-test-amd.yml

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,16 +56,15 @@ jobs:
5656
docker exec -w /human-eval ci_sglang pip install -e .
5757
5858
docker exec -w / ci_sglang mkdir -p /dummy-grok
59-
mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -P dummy-grok
60-
docker cp ./dummy-grok ci_sglang:/dummy-grok/
59+
mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
60+
docker cp ./dummy-grok ci_sglang:/
6161
6262
- name: Evaluate Accuracy
6363
timeout-minutes: 20
6464
run: |
6565
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_accuracy_large.py
6666
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_eval_fp8_accuracy.py
6767
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_qwen_models.py
68-
docker exec -w /sglang-checkout -e SGLANG_IS_IN_CI=1 ci_sglang python3 -m sglang.bench_one_batch --batch-size 32 --input 1024 --output 8 --model /dummy-grok --tokenizer-path Xenova/grok-1-tokenizer --load-format dummy --tp 8 --quantization fp8
6968
7069
mla-test-1-gpu-amd:
7170
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
@@ -105,6 +104,48 @@ jobs:
105104
run: |
106105
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_mla.py
107106
107+
bench-test-2-gpu-amd:
108+
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
109+
github.event.pull_request.draft == false
110+
runs-on: linux-mi300-gpu-2
111+
steps:
112+
- name: Checkout code
113+
uses: actions/checkout@v4
114+
115+
- name: Setup docker
116+
run: |
117+
# Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
118+
if [ -f "/etc/podinfo/gha-render-devices" ]; then
119+
DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
120+
else
121+
DEVICE_FLAG="--device /dev/dri"
122+
fi
123+
docker pull lmsysorg/sglang:v0.4.5-rocm630
124+
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
125+
-v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
126+
--cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
127+
-w /sglang-checkout --name ci_sglang \
128+
lmsysorg/sglang:v0.4.5-rocm630
129+
130+
- name: Install dependencies
131+
run: |
132+
docker exec ci_sglang pip install --upgrade pip
133+
docker exec ci_sglang pip uninstall sgl-kernel -y || true
134+
docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
135+
docker exec ci_sglang pip install -e "python[dev_hip]"
136+
137+
docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
138+
docker exec -w /human-eval ci_sglang pip install -e .
139+
140+
docker exec -w / ci_sglang mkdir -p /dummy-grok
141+
mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
142+
docker cp ./dummy-grok ci_sglang:/
143+
144+
- name: Evaluate Benchmark
145+
timeout-minutes: 20
146+
run: |
147+
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_dummy_grok_models.py
148+
108149
finish:
109150
if: always()
110151
needs: [

docs/backend/native_api.ipynb

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,9 @@
371371
"source": [
372372
"## Capture expert selection distribution in MoE models\n",
373373
"\n",
374-
"SGLang Runtime supports recording the number of times an expert is selected in a MoE model run for each expert in the model. This is useful when analyzing the throughput of the model and plan for optimization."
374+
"SGLang Runtime supports recording the number of times an expert is selected in a MoE model run for each expert in the model. This is useful when analyzing the throughput of the model and plan for optimization.\n",
375+
"\n",
376+
"*Note: We only print out the first 10 lines of the csv below for better readability. Please adjust accordingly if you want to analyze the results more deeply.*"
375377
]
376378
},
377379
{
@@ -412,9 +414,13 @@
412414
"\n",
413415
"output_file = glob.glob(\"expert_distribution_*.csv\")[0]\n",
414416
"with open(output_file, \"r\") as f:\n",
415-
" print_highlight(\"Content of dumped record:\")\n",
416-
" for line in f:\n",
417-
" print_highlight(line.strip())"
417+
" print_highlight(\"\\n| Layer ID | Expert ID | Count |\")\n",
418+
" print_highlight(\"|----------|-----------|--------|\")\n",
419+
" next(f)\n",
420+
" for i, line in enumerate(f):\n",
421+
" if i < 9:\n",
422+
" layer_id, expert_id, count = line.strip().split(\",\")\n",
423+
" print_highlight(f\"| {layer_id:8} | {expert_id:9} | {count:6} |\")"
418424
]
419425
},
420426
{

docs/references/deploy_on_k8s.md

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Here we take the deployment of DeepSeek-R1 as an example.
1414

1515
1. At least two Kubernetes nodes, each with two H20 systems and eight GPUs, are required.
1616

17-
2. Make sure your K8S cluster has LWS correctly installed. If it hasn't been set up yet, please follow the [installation instructions](https://github.com/kubernetes-sigs/lws/blob/main/site/content/en/docs/installation/_index.md).
17+
2. Make sure your K8S cluster has LWS correctly installed. If it hasn't been set up yet, please follow the [installation instructions](https://github.com/kubernetes-sigs/lws/blob/main/site/content/en/docs/installation/_index.md). **Note:** For LWS versions ≤0.5.x, you must use the Downward API to obtain `LWS_WORKER_INDEX`, as native support for this feature was introduced in v0.6.0.
1818

1919
## Basic example
2020

@@ -95,10 +95,6 @@ spec:
9595
env:
9696
- name: NCCL_IB_GID_INDEX
9797
value: "3"
98-
- name: LWS_WORKER_INDEX
99-
valueFrom:
100-
fieldRef:
101-
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
10298
command:
10399
- python3
104100
- -m
@@ -164,10 +160,6 @@ spec:
164160
env:
165161
- name: NCCL_IB_GID_INDEX
166162
value: "3"
167-
- name: LWS_WORKER_INDEX
168-
valueFrom:
169-
fieldRef:
170-
fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
171163
command:
172164
- python3
173165
- -m

python/sglang/bench_one_batch.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@
6060
from sglang.srt.entrypoints.engine import _set_envs_and_config
6161
from sglang.srt.hf_transformers_utils import get_tokenizer
6262
from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
63+
from sglang.srt.managers.scheduler import Scheduler
6364
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
6465
from sglang.srt.model_executor.model_runner import ModelRunner
6566
from sglang.srt.sampling.sampling_params import SamplingParams
@@ -184,6 +185,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
184185
req.prefix_indices = []
185186
req.fill_ids = req.origin_input_ids
186187
req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
188+
req.logprob_start_len = len(req.origin_input_ids) - 1
187189
reqs.append(req)
188190

189191
return input_ids, reqs
@@ -199,6 +201,7 @@ def prepare_extend_inputs_for_correctness_test(
199201
i, : bench_args.cut_len
200202
]
201203
req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
204+
req.logprob_start_len = len(req.origin_input_ids) - 1
202205
return reqs
203206

204207

@@ -220,6 +223,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
220223
req.prefix_indices = []
221224
req.fill_ids = req.origin_input_ids
222225
req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
226+
req.logprob_start_len = len(req.origin_input_ids) - 1
223227
reqs.append(req)
224228

225229
return reqs
@@ -238,6 +242,7 @@ def extend(reqs, model_runner):
238242
enable_custom_logit_processor=False,
239243
)
240244
batch.prepare_for_extend()
245+
_maybe_prepare_dp_attn_batch(batch, model_runner)
241246
model_worker_batch = batch.get_model_worker_batch()
242247
forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
243248
logits_output = model_runner.forward(forward_batch)
@@ -249,13 +254,28 @@ def extend(reqs, model_runner):
249254
def decode(input_token_ids, batch, model_runner):
250255
batch.output_ids = input_token_ids
251256
batch.prepare_for_decode()
257+
_maybe_prepare_dp_attn_batch(batch, model_runner)
252258
model_worker_batch = batch.get_model_worker_batch()
253259
forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
254260
logits_output = model_runner.forward(forward_batch)
255261
next_token_ids = model_runner.sample(logits_output, forward_batch)
256262
return next_token_ids, logits_output.next_token_logits
257263

258264

265+
def _maybe_prepare_dp_attn_batch(batch: ScheduleBatch, model_runner):
266+
if model_runner.server_args.enable_dp_attention:
267+
Scheduler.prepare_dp_attn_batch_raw(
268+
batch,
269+
dp_size=model_runner.server_args.dp_size,
270+
attn_tp_size=1,
271+
tp_cpu_group=model_runner.tp_group.cpu_group,
272+
get_idle_batch=None,
273+
disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
274+
spec_algorithm=SpeculativeAlgorithm.NONE,
275+
speculative_num_draft_tokens=None,
276+
)
277+
278+
259279
def correctness_test(
260280
server_args,
261281
port_args,

python/sglang/srt/configs/model_config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ def _verify_quantization(self) -> None:
279279
"moe_wna16",
280280
]
281281
compatible_quantization_methods = {
282+
"modelopt_fp4": ["modelopt"],
282283
"w8a8_int8": ["compressed-tensors", "compressed_tensors"],
283284
"w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
284285
}
@@ -485,8 +486,8 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
485486
"Gemma3ForConditionalGeneration",
486487
"Grok1VForCausalLM",
487488
"Grok1AForCausalLM",
488-
# TODO: add multimodal support for "Llama4ForConditionalGeneration",
489489
"LlavaLlamaForCausalLM",
490+
"Llama4ForConditionalGeneration",
490491
"LlavaMistralForCausalLM",
491492
"LlavaQwenForCausalLM",
492493
"LlavaVidForCausalLM",

0 commit comments

Comments
 (0)