Skip to content

Commit 19dadc8

Browse files
authored
Merge branch 'main' into dp_rebased
2 parents b2bdbcf + f2a5de2 commit 19dadc8

63 files changed

Lines changed: 2954 additions & 887 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/pr-test-rust.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ jobs:
5656
5757
e2e-python:
5858
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
59-
runs-on: 2-gpu-runner
59+
runs-on: BM.A10.4
6060
timeout-minutes: 30
6161
steps:
6262
- name: Checkout code
@@ -66,6 +66,10 @@ jobs:
6666
run: |
6767
bash scripts/ci/ci_install_rust.sh
6868
69+
- name: Install SGLang dependencies
70+
run: |
71+
sudo bash scripts/ci/ci_install_dependency.sh
72+
6973
- name: Build python binding
7074
run: |
7175
source "$HOME/.cargo/env"

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,11 @@ The core features include:
5353
- **Active Community**: SGLang is open-source and backed by an active community with wide industry adoption.
5454

5555
## Getting Started
56-
- [Install SGLang](https://docs.sglang.ai/start/install.html)
57-
- [Quick Start](https://docs.sglang.ai/backend/send_request.html)
58-
- [Backend Tutorial](https://docs.sglang.ai/backend/openai_api_completions.html)
59-
- [Frontend Tutorial](https://docs.sglang.ai/frontend/frontend.html)
60-
- [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)
56+
- [Install SGLang](https://docs.sglang.ai/get_started/install.html)
57+
- [Quick Start](https://docs.sglang.ai/basic_usage/send_request.html)
58+
- [Backend Tutorial](https://docs.sglang.ai/basic_usage/openai_api_completions.html)
59+
- [Frontend Tutorial](https://docs.sglang.ai/references/frontend/frontend_tutorial.html)
60+
- [Contribution Guide](https://docs.sglang.ai/developer_guide/contribution_guide.html)
6161

6262
## Benchmark and Performance
6363
Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), [Large-scale expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/).
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import json
2+
import queue
3+
import time
4+
5+
import requests
6+
from bench_multiturn import (
7+
ReadyQueue,
8+
WorkloadGenerator,
9+
gen_payload,
10+
log_to_jsonl_file,
11+
parse_args,
12+
)
13+
from tqdm.asyncio import tqdm
14+
15+
from sglang.bench_serving import get_tokenizer
16+
17+
18+
class ContextWorkloadGenerator(WorkloadGenerator):
    """Workload generator that issues one long-context request per query.

    The dataset is a JSON file with a ``"contexts"`` collection and a
    ``"queries"`` list.  Each query names its context (``"context"``), a
    ``"question"`` and a ``"reference_answer"``.  The prompt of a request is
    the context text followed by the question, and the requested output
    length is the token count of the reference answer.
    """

    def __init__(self, args):
        """Load the dataset and queue the initial requests.

        Args:
            args: Parsed command-line namespace.  This constructor reads
                ``host``, ``port``, ``model_path``, ``distribution``,
                ``request_rate``, ``dataset_path``, ``num_clients``,
                ``num_rounds``, ``max_parallel`` and ``log_file``.
        """
        # NOTE(review): WorkloadGenerator.__init__ is not invoked here, so
        # every attribute the inherited methods rely on has to be set in this
        # constructor -- confirm against the base class when it changes.

        # Construct the base URL for requests
        self.baseurl = f"http://{args.host}:{args.port}/"
        self.url = self.baseurl + "generate"

        self.tokenizer = get_tokenizer(args.model_path)
        self.distribution = args.distribution
        self.request_rate = args.request_rate
        self.start_time = None
        self.finished_time = None

        self.sent_requests = 0
        self.completed_requests = 0

        # Use a context manager so the dataset file handle is always closed.
        with open(args.dataset_path, encoding="utf-8") as dataset_file:
            self.dataset = json.load(dataset_file)

        queries = self.dataset["queries"]
        contexts = self.dataset["contexts"]

        # One request per client, capped by the number of available queries.
        init_requests = []
        for i in range(min(args.num_clients, len(queries))):
            query = queries[i]
            prompt = contexts[query["context"]] + query["question"]
            output_len = len(self.tokenizer(query["reference_answer"])["input_ids"])
            init_requests.append((i, gen_payload(prompt, output_len)))
        self.ready_queue = ReadyQueue(init_requests=init_requests)

        self.response_queue = queue.Queue()
        # Size the progress bar from the requests actually queued rather than
        # from args.num_clients: when the dataset holds fewer queries than
        # clients, a total based on num_clients could never be reached and
        # response_handler() would never see its exit condition.
        # NOTE(review): presumes the inherited request path advances the bar
        # once per request -- confirm in WorkloadGenerator.
        self.pbar = tqdm(total=len(init_requests) * args.num_rounds)
        self.performance_metrics = {
            "ttft": [],
            "latency": [],
            "itl": [],
            "prompt_len": [],
            "cached_tokens": [],
        }

        self.max_parallel = args.max_parallel
        self.logfile = args.log_file

    def response_handler(self):
        """Drain the response queue and record per-request metrics.

        Records ``ttft``, ``itl`` and ``latency`` for every successful
        response and counts it as completed.  No follow-up request is
        queued.  The loop ends once the queue has been empty for 10 seconds
        and the progress bar has reached its total.

        Raises:
            ValueError: If a response reports ``success`` as false.
        """
        while True:
            try:
                # Block until a response is available (or the wait times out)
                client_id, response = self.response_queue.get(timeout=10)
                if not response.success:
                    raise ValueError(f"Request failed with error: {response.error}")
                self.performance_metrics["ttft"].append(response.ttft)
                self.performance_metrics["itl"].extend(response.itl)
                self.performance_metrics["latency"].append(response.latency)
                self.completed_requests += 1

            except queue.Empty:
                # Nothing arrived within the timeout: stop only if all the
                # expected requests have been accounted for.
                if self.pbar.n == self.pbar.total:
                    break
83+
84+
85+
if __name__ == "__main__":
    args = parse_args()
    # This benchmark is single-round; allow up to 128 requests in flight.
    args.num_rounds = 1
    args.max_parallel = 128
    flush_cache_url = f"http://{args.host}:{args.port}/flush_cache"

    # Sweep the request rate from highest to lowest.
    request_rates = (24, 16, 12, 8, 4, 2, 1)
    for rate in request_rates:
        args.request_rate = rate
        # POST to the flush_cache endpoint before each run, then pause briefly.
        requests.post(flush_cache_url)
        time.sleep(1)
        results = ContextWorkloadGenerator(args).run()
        log_to_jsonl_file(results, args.log_file, args.tag)

benchmark/hicache/bench_multiturn.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,9 @@ def __init__(self, args):
322322
"prompt_len": [],
323323
"cached_tokens": [],
324324
}
325+
self.num_rounds = args.num_rounds
326+
self.max_parallel = args.max_parallel
327+
self.output_length = args.output_length
325328

326329
async def handle_request(self, item):
327330
try:
@@ -336,7 +339,7 @@ async def handle_request(self, item):
336339
def request_sender(self):
337340
async def request_loop():
338341
while True:
339-
if self.sent_requests - self.completed_requests < args.max_parallel:
342+
if self.sent_requests - self.completed_requests < self.max_parallel:
340343
new_request = self.ready_queue.pop()
341344
if new_request:
342345
asyncio.create_task(self.handle_request(new_request))
@@ -382,7 +385,7 @@ def response_handler(self):
382385
self.performance_metrics["cached_tokens"].append(response.cached_tokens)
383386
self.completed_requests += 1
384387

385-
if self.client_records[client_id]["round"] < args.num_rounds:
388+
if self.client_records[client_id]["round"] < self.num_rounds:
386389
# append new request to client's history
387390
self.client_records[client_id][
388391
"history"
@@ -392,7 +395,7 @@ def response_handler(self):
392395
client_id,
393396
gen_payload(
394397
self.client_records[client_id]["history"],
395-
args.output_length,
398+
self.output_length,
396399
),
397400
)
398401
)
@@ -461,7 +464,7 @@ def run(self):
461464
f" Throughput: {performance_data['summary']['throughput']:.2f} requests per second"
462465
)
463466
print(f" Cache Hit Rate: {performance_data['summary']['cache_hit_rate']:.6f}")
464-
log_to_jsonl_file(performance_data, args.log_file, tag=args.tag)
467+
return performance_data
465468

466469

467470
if __name__ == "__main__":
@@ -482,4 +485,5 @@ def run(self):
482485
args.request_rate = rate
483486
requests.post(flush_cache_url)
484487
time.sleep(1)
485-
WorkloadGenerator(args).run()
488+
performance_data = WorkloadGenerator(args).run()
489+
log_to_jsonl_file(performance_data, args.log_file, tag=args.tag)

docker/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,10 +73,10 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
7373
&& python3 -m pip install --no-cache-dir nvidia-nccl-cu12==2.27.6 --force-reinstall --no-deps \
7474
&& python3 -m flashinfer --download-cubin \
7575
&& if [ "$CUDA_VERSION" = "12.8.1" ]; then \
76-
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.3/sgl_kernel-0.3.3+cu128-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
76+
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.4/sgl_kernel-0.3.4+cu128-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
7777
fi \
7878
&& if [ "$CUDA_VERSION" = "12.9.1" ]; then \
79-
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.3/sgl_kernel-0.3.3-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
79+
python3 -m pip install --no-cache-dir https://github.com/sgl-project/whl/releases/download/v0.3.4/sgl_kernel-0.3.4-cp39-abi3-manylinux2014_x86_64.whl --force-reinstall --no-deps ; \
8080
fi
8181

8282
# Download source files

docs/supported_models/support_new_models.md

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,8 @@ standard LLM support:
2121
in [model_config.py](https://github.com/sgl-project/sglang/blob/0ab3f437aba729b348a683ab32b35b214456efc7/python/sglang/srt/configs/model_config.py#L561)
2222
to return `True` for your model.
2323

24-
2. **Register a new chat-template**
25-
See [conversation.py](https://github.com/sgl-project/sglang/blob/86a779dbe9e815c02f71ea82574608f6eae016b5/python/sglang/srt/conversation.py)
24+
2. **Register a new chat-template**:
25+
This is only needed when your model's default chat template cannot accept images as input: register a new chat template in [conversation.py](https://github.com/sgl-project/sglang/tree/main/python/sglang/srt/conversation.py) together with its corresponding matching function.
2626

2727
3. **Multimodal Data Processor**:
2828
Define a new `Processor` class that inherits from `BaseMultimodalProcessor` and register this processor as your
@@ -35,16 +35,18 @@ standard LLM support:
3535
expanded (if necessary) and padded with multimodal-data-hashes so that SGLang can recognize different multimodal data
3636
with `RadixAttention`.
3737

38-
5. **Adapt to Vision Attention**:
38+
5. **Handle Image Feature Extraction**:
39+
Implement a `get_image_feature` function for your new model, which extracts image features from raw image data and converts them into the embeddings used by the language model.
40+
41+
6. **Adapt to Vision Attention**:
3942
Adapt the multi-headed `Attention` of ViT with SGLang’s `VisionAttention`.
4043

4144
You can refer to [Qwen2VL](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/models/qwen2_vl.py) or
4245
other mllm implementations. These models demonstrate how to correctly handle both multimodal and textual inputs.
4346

44-
You should test the new MLLM locally against Hugging Face models. See the [
45-
`mmmu`](https://github.com/sgl-project/sglang/tree/main/benchmark/mmmu) benchmark for an example.
47+
## Testing and Debugging
4648

47-
## Test the Correctness
49+
Please include all of your testing and benchmarking results in the PR description.
4850

4951
### Interactive Debugging
5052

@@ -65,14 +67,21 @@ should give the same text output and very similar prefill logits:
6567
To ensure the new model is well maintained, add it to the test suite by including it in the `ALL_OTHER_MODELS` list in
6668
the [test_generation_models.py](https://github.com/sgl-project/sglang/blob/main/test/srt/models/test_generation_models.py)
6769
file, test the new model on your local machine and report the results on demonstrative benchmarks (GSM8K, MMLU, MMMU,
68-
MMMU-Pro, etc.) in your PR.
70+
MMMU-Pro, etc.) in your PR.\
71+
For VLMs, also include a test in `test_vision_openai_server_{x}.py` (e.g. [test_vision_openai_server_a.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_vision_openai_server_a.py), [test_vision_openai_server_b.py](https://github.com/sgl-project/sglang/blob/main/test/srt/test_vision_openai_server_b.py)).
72+
6973

70-
This is the command to test a new model on your local machine:
74+
Here is an example command for testing a new model on your local machine:
7175

7276
```bash
7377
ONLY_RUN=Qwen/Qwen2-1.5B python3 -m unittest test_generation_models.TestGenerationModels.test_others
7478
```
7579

80+
### Benchmark
81+
82+
- **(Required) MMMU**: Follow the MMMU benchmark [README.md](https://github.com/sgl-project/sglang/blob/main/benchmark/mmmu/README.md) to obtain an accuracy comparison between SGLang and HF Transformers. The accuracy score from the SGLang run should not be much lower than that from the HF Transformers run. Similarly, follow https://docs.sglang.ai/developer_guide/benchmark_and_profiling.html to obtain a performance comparison: TTFT and throughput must meet or exceed the baselines (e.g., HF Transformers).
83+
- **(Optional) Other evals**: If you ran other evals, please note the results in the PR description.
84+
7685
## Port a Model from vLLM to SGLang
7786

7887
The [vLLM Models Directory](https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models) is a valuable
@@ -126,6 +135,9 @@ ModelRegistry.models.update(import_new_model_classes())
126135
launch_server(server_args)
127136
```
128137

138+
## Documentation
139+
Add the new model to the table of supported models in [generative_models.md](https://github.com/sgl-project/sglang/blob/main/docs/supported_models/generative_models.md) or [multimodal_language_models.md](https://github.com/sgl-project/sglang/blob/main/docs/supported_models/multimodal_language_models.md).
140+
129141
---
130142

131143
By following these guidelines, you can add support for new language models and multimodal large language models in

0 commit comments

Comments
 (0)