Skip to content

Commit 92ee7ba

Browse files
authored
[Example] add one-click runnable example for P2P NCCL XpYd (#20246)
Signed-off-by: KuntaiDu <[email protected]>
1 parent 7151f92 commit 92ee7ba

File tree

2 files changed

+245
-0
lines changed

2 files changed

+245
-0
lines changed
Lines changed: 245 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,245 @@
1+
#!/bin/bash

# =============================================================================
# vLLM Disaggregated Serving Script - P2P NCCL XpYd Architecture
# =============================================================================
# This script demonstrates disaggregated prefill and decode serving using
# P2P NCCL communication. The architecture supports various XpYd configurations:
#
#   - 1P3D: 1 Prefill server + 3 Decode servers (current default)
#   - 3P1D: 3 Prefill servers + 1 Decode server
#   - etc.
#
# Configuration can be customized via environment variables:
#   MODEL: Model to serve
#   PREFILL_GPUS: Comma-separated GPU IDs for prefill servers
#   DECODE_GPUS: Comma-separated GPU IDs for decode servers
#   PREFILL_PORTS: Comma-separated ports for prefill servers
#   DECODE_PORTS: Comma-separated ports for decode servers
#   PROXY_PORT: Proxy server port used to setup XpYd connection.
#   TIMEOUT_SECONDS: Server startup timeout
# =============================================================================

# Configuration - can be overridden via environment variables
MODEL=${MODEL:-meta-llama/Llama-3.1-8B-Instruct}
TIMEOUT_SECONDS=${TIMEOUT_SECONDS:-1200}
PROXY_PORT=${PROXY_PORT:-30001}

# Default 1P3D configuration (1 Prefill + 3 Decode)
PREFILL_GPUS=${PREFILL_GPUS:-0}
DECODE_GPUS=${DECODE_GPUS:-1,2,3}
PREFILL_PORTS=${PREFILL_PORTS:-20003}
DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}

echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
echo ""
echo "Architecture Configuration:"
echo " Model: $MODEL"
echo " Prefill GPUs: $PREFILL_GPUS, Ports: $PREFILL_PORTS"
echo " Decode GPUs: $DECODE_GPUS, Ports: $DECODE_PORTS"
echo " Proxy Port: $PROXY_PORT"
echo " Timeout: ${TIMEOUT_SECONDS}s"
echo ""

# PIDs of every background component we launch (proxy, servers), so they
# can be reaped/killed on shutdown.
PIDS=()

# Switch to the directory of the current script so relative paths (the proxy
# helper script and the per-server log files) resolve regardless of the
# caller's CWD. Fix: abort if the cd fails instead of silently continuing
# to run in the wrong directory.
cd "$(dirname "${BASH_SOURCE[0]}")" || { echo "Failed to cd to script directory" >&2; exit 1; }
48+
49+
# Verify that every helper file this script depends on is present in the
# current directory; abort with a message naming the first missing one.
check_required_files() {
    local required
    for required in "disagg_proxy_p2p_nccl_xpyd.py"; do
        [[ -f "$required" ]] && continue
        echo "Required file $required not found in $(pwd)"
        exit 1
    done
}
58+
59+
# Ensure HF_TOKEN is set and looks like a Hugging Face token (hf_ prefix);
# exit the script otherwise, since gated model downloads would fail later.
check_hf_token() {
    if [[ -z "${HF_TOKEN}" ]]; then
        echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
        echo "Example: export HF_TOKEN=your_token_here"
        exit 1
    fi
    case "$HF_TOKEN" in
        hf_*)
            echo "HF_TOKEN is set and valid."
            ;;
        *)
            echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
            exit 1
            ;;
    esac
}
71+
72+
# Ensure the machine exposes at least 2 GPUs (minimum for one prefill plus
# one decode instance). Uses nvidia-smi, so NVIDIA driver tooling must be
# installed.
check_num_gpus() {
    local num_gpus
    # Fix: if nvidia-smi itself is missing, the old pipeline counted 0 lines
    # and misleadingly reported "You need at least 2 GPUs". Name the real
    # problem instead.
    if ! command -v nvidia-smi > /dev/null 2>&1; then
        echo "nvidia-smi not found. NVIDIA drivers are required to run disaggregated prefill."
        exit 1
    fi
    # One CSV line per GPU; count them.
    num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
    if [ "$num_gpus" -lt 2 ]; then
        echo "You need at least 2 GPUs to run disaggregated prefill."
        exit 1
    else
        echo "Found $num_gpus GPUs."
    fi
}
82+
83+
# Check that the Python package named in $1 is importable by python3;
# exit with an install hint when it is not.
ensure_python_library_installed() {
    local lib=$1
    echo "Checking if $lib is installed..."
    if python3 -c "import $lib" > /dev/null 2>&1; then
        echo "$lib is installed."
    else
        echo "$lib is not installed. Please install it via pip install $lib."
        exit 1
    fi
}
92+
93+
cleanup() {
94+
echo "Stopping everything…"
95+
trap - INT TERM # prevent re-entrancy
96+
kill -- -$$ # negative PID == "this whole process-group"
97+
wait # reap children so we don't leave zombies
98+
exit 0
99+
}
100+
101+
# Poll localhost:$1 once per second until its HTTP endpoint answers, or
# give up after TIMEOUT_SECONDS (global). Returns 0 when the server is
# reachable, 1 on timeout.
wait_for_server() {
    local port=$1
    local deadline_secs=$TIMEOUT_SECONDS
    local started
    started=$(date +%s)

    echo "Waiting for server on port $port..."

    # curl succeeds as soon as the endpoint accepts the connection; the
    # response body is irrelevant here.
    until curl -s "localhost:${port}/v1/completions" > /dev/null; do
        if (( $(date +%s) - started >= deadline_secs )); then
            echo "Timeout waiting for server on port $port"
            return 1
        fi
        sleep 1
    done

    echo "Server on port $port is ready."
    return 0
}
123+
124+
# Orchestrate the full XpYd deployment: run preflight checks, start the
# proxy, the X prefill (producer) servers and Y decode (consumer) servers,
# wait until everything is serving, run a benchmark, then tear it all down.
main() {
    check_required_files
    check_hf_token
    check_num_gpus
    ensure_python_library_installed pandas
    ensure_python_library_installed datasets
    ensure_python_library_installed vllm
    ensure_python_library_installed quart

    # Any interrupt/termination tears down the whole process group.
    trap cleanup INT
    trap cleanup USR1
    trap cleanup TERM

    echo "Launching disaggregated serving components..."
    echo "Please check the log files for detailed output:"
    echo " - prefill*.log: Prefill server logs"
    echo " - decode*.log: Decode server logs"
    echo " - proxy.log: Proxy server log"

    # =========================================================================
    # Launch Proxy Server
    # =========================================================================
    echo ""
    echo "Starting proxy server on port $PROXY_PORT..."
    # Fix: capture the proxy's output in proxy.log as advertised above
    # (previously nothing was redirected, so proxy.log was never created).
    python3 disagg_proxy_p2p_nccl_xpyd.py > proxy.log 2>&1 &
    PIDS+=($!)

    # Parse the comma-separated GPU/port lists into arrays.
    IFS=',' read -ra PREFILL_GPU_ARRAY <<< "$PREFILL_GPUS"
    IFS=',' read -ra DECODE_GPU_ARRAY <<< "$DECODE_GPUS"
    IFS=',' read -ra PREFILL_PORT_ARRAY <<< "$PREFILL_PORTS"
    IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"

    # =========================================================================
    # Launch Prefill Servers (X Producers)
    # =========================================================================
    echo ""
    echo "Starting ${#PREFILL_GPU_ARRAY[@]} prefill server(s)..."
    for i in "${!PREFILL_GPU_ARRAY[@]}"; do
        local gpu_id=${PREFILL_GPU_ARRAY[$i]}
        local port=${PREFILL_PORT_ARRAY[$i]}
        # Each producer gets a distinct KV transfer port starting at 21001.
        local kv_port=$((21001 + i))

        echo " Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
        # Producers use a tiny kv_buffer_size ("1e1") since they only send KV.
        # NOTE(review): vLLM's CLI usually spells this flag
        # --disable-log-requests (plural) — confirm against the installed vLLM.
        CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
            --enforce-eager \
            --host 0.0.0.0 \
            --port $port \
            --tensor-parallel-size 1 \
            --seed 1024 \
            --dtype float16 \
            --max-model-len 10000 \
            --max-num-batched-tokens 10000 \
            --max-num-seqs 256 \
            --trust-remote-code \
            --gpu-memory-utilization 0.9 \
            --disable-log-request \
            --kv-transfer-config \
            "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > prefill$((i+1)).log 2>&1 &
        PIDS+=($!)
    done

    # =========================================================================
    # Launch Decode Servers (Y Decoders)
    # =========================================================================
    echo ""
    echo "Starting ${#DECODE_GPU_ARRAY[@]} decode server(s)..."
    for i in "${!DECODE_GPU_ARRAY[@]}"; do
        local gpu_id=${DECODE_GPU_ARRAY[$i]}
        local port=${DECODE_PORT_ARRAY[$i]}
        # Each consumer gets a distinct KV transfer port starting at 22001.
        local kv_port=$((22001 + i))

        echo " Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
        # Consumers keep a large KV buffer ("8e9") and a lower GPU memory
        # utilization (0.7) than producers.
        VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
            --enforce-eager \
            --host 0.0.0.0 \
            --port $port \
            --tensor-parallel-size 1 \
            --seed 1024 \
            --dtype float16 \
            --max-model-len 10000 \
            --max-num-batched-tokens 10000 \
            --max-num-seqs 256 \
            --trust-remote-code \
            --gpu-memory-utilization 0.7 \
            --disable-log-request \
            --kv-transfer-config \
            "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > decode$((i+1)).log 2>&1 &
        PIDS+=($!)
    done

    # =========================================================================
    # Wait for All Servers to Start
    # =========================================================================
    echo ""
    echo "Waiting for all servers to start..."
    for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
        if ! wait_for_server "$port"; then
            echo "Failed to start server on port $port"
            cleanup
            exit 1
        fi
    done

    echo ""
    echo "All servers are up. Starting benchmark..."

    # =========================================================================
    # Run Benchmark
    # =========================================================================
    # Assumes this script lives three levels below the repo root, next to
    # the benchmarks/ directory — TODO confirm if the layout changes.
    cd ../../../benchmarks/
    # NOTE(review): 10001 is presumably the proxy's HTTP serving port
    # (distinct from PROXY_PORT, the registration port) — confirm it matches
    # disagg_proxy_p2p_nccl_xpyd.py.
    python3 benchmark_serving.py --port 10001 --seed $(date +%s) \
        --model $MODEL \
        --dataset-name random --random-input-len 7500 --random-output-len 200 \
        --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log

    echo "Benchmarking done. Cleaning up..."

    cleanup
}

main "$@"

0 commit comments

Comments
 (0)