11#! /usr/bin/env bash
22# launch_service.sh — 通用推理框架服务启动脚本
3- # 支持 FastDeploy / SGLang,支持单卡/多卡 TP/PD 分离模式
3+ # 支持 FastDeploy / SGLang,支持单卡/多卡 TP/DP/EP/PD 分离模式
44set -euo pipefail
55
66# ============================================================
@@ -12,6 +12,7 @@ PORT=""
1212GPUS=" "
1313TP=1
1414DP=1
15+ EP=0
1516CONCURRENCY=32
1617MAX_MODEL_LEN=65536
1718QUANTIZATION=" none"
@@ -30,12 +31,15 @@ usage() {
3031 --framework <fd|sg> 推理框架 (fd=FastDeploy, sg=SGLang)
3132 --model <PATH> 模型权重路径
3233 --port <PORT> 服务端口
33- --gpus <DEVICES> CUDA_VISIBLE_DEVICES (如 "0" 或 "0,1,2,3")
34+ --gpus <DEVICES> CUDA_VISIBLE_DEVICES (如 "0" 或 "0,1,2,3,4,5,6,7 ")
3435 --venv <PATH> 虚拟环境路径 (.venv 目录)
3536
3637可选参数:
3738 --tp <N> tensor-parallel-size (默认: 1)
38- --dp <N> data-parallel-size, 仅 FD (默认: 1)
39+ --dp <N> data-parallel-size (默认: 1)
40+ --ep <N> expert-parallel-size, MoE 模型专用 (默认: 0, 不启用)
41+ FD: 映射为 --enable-expert-parallel (EP=TP×DP 隐式)
42+ SG: 映射为 --ep-size N
3943 --concurrency <N> max-num-seqs / max-running-requests (默认: 32)
4044 --max-model-len <N> 最大序列长度 (默认: 65536)
4145 --quantization <TYPE> 量化方式: none|block_wise_fp8|fp8|wint4|wint8 (默认: none)
@@ -50,9 +54,13 @@ usage() {
5054 bash launch_service.sh --framework fd --model /path/to/model --port 8180 \
5155 --gpus 0 --venv /path/to/FastDeploy/.venv
5256
53- # 多卡 TP=4 启动 SGLang
57+ # TP=4 + DP=2 + EP=8 启动 FastDeploy (MoE, 8卡)
58+ bash launch_service.sh --framework fd --model /path/to/model --port 8180 \
59+ --gpus 0,1,2,3,4,5,6,7 --tp 4 --dp 2 --ep 8 --venv /path/to/FastDeploy/.venv
60+
61+ # TP=4 + DP=2 + EP=8 启动 SGLang (MoE, 8卡)
5462 bash launch_service.sh --framework sg --model /path/to/model --port 8280 \
55- --gpus 0,1,2,3 --tp 4 --venv /path/to/sglang_env/.venv
63+ --gpus 0,1,2,3,4,5,6,7 --tp 4 --dp 2 --ep 8 --venv /path/to/sglang_env/.venv
5664EOF
5765 exit " ${1:- 0} "
5866}
@@ -65,6 +73,7 @@ while [[ $# -gt 0 ]]; do
6573 --gpus) GPUS=" $2 " ; shift 2 ;;
6674 --tp) TP=" $2 " ; shift 2 ;;
6775 --dp) DP=" $2 " ; shift 2 ;;
76+ --ep) EP=" $2 " ; shift 2 ;;
6877 --concurrency) CONCURRENCY=" $2 " ; shift 2 ;;
6978 --max-model-len) MAX_MODEL_LEN=" $2 " ; shift 2 ;;
7079 --quantization) QUANTIZATION=" $2 " ; shift 2 ;;
@@ -111,7 +120,7 @@ launch_fastdeploy() {
111120 echo " [INFO] 启动 FastDeploy 服务..."
112121 echo " 模型: $MODEL "
113122 echo " 端口: $PORT "
114- echo " GPU: $GPUS (TP=$TP , DP=$DP )"
123+ echo " GPU: $GPUS (TP=$TP , DP=$DP , EP= $EP )"
115124 echo " 并发: $CONCURRENCY "
116125 echo " 量化: $QUANTIZATION "
117126 echo " 日志: $LOG_FILE "
@@ -156,6 +165,11 @@ else:
156165 CMD+=" --data-parallel-size $DP "
157166 fi
158167
168+ # EP (expert parallelism) — FD 只有 flag,EP size 隐式 = TP×DP
169+ if [[ " $EP " -gt 0 ]]; then
170+ CMD+=" --enable-expert-parallel"
171+ fi
172+
159173 # 量化
160174 if [[ " $QUANTIZATION " != " none" ]]; then
161175 CMD+=" --quantization $QUANTIZATION "
@@ -184,7 +198,7 @@ launch_sglang() {
184198 echo " [INFO] 启动 SGLang 服务..."
185199 echo " 模型: $MODEL "
186200 echo " 端口: $PORT "
187- echo " GPU: $GPUS (TP=$TP )"
201+ echo " GPU: $GPUS (TP=$TP , DP= $DP , EP= $EP )"
188202 echo " 并发: $CONCURRENCY "
189203 echo " 量化: $QUANTIZATION "
190204 echo " 日志: $LOG_FILE "
@@ -208,6 +222,16 @@ launch_sglang() {
208222 CMD+=" --max-running-requests $CONCURRENCY "
209223 CMD+=" --attention-backend $ATTENTION_BACKEND "
210224
225+ # DP (data parallelism)
226+ if [[ " $DP " -gt 1 ]]; then
227+ CMD+=" --dp-size $DP "
228+ fi
229+
230+ # EP (expert parallelism) — SG 使用显式 --ep-size
231+ if [[ " $EP " -gt 0 ]]; then
232+ CMD+=" --ep-size $EP "
233+ fi
234+
211235 # 量化
212236 if [[ " $QUANTIZATION " != " none" ]]; then
213237 local SG_QUANT=" $QUANTIZATION "
0 commit comments