#!/bin/bash

# =============================================================================
# vLLM Disaggregated Serving Script - P2P NCCL XpYd Architecture
# =============================================================================
# This script demonstrates disaggregated prefill and decode serving using
# P2P NCCL communication. The architecture supports various XpYd configurations:
#
# - 1P3D: 1 Prefill server + 3 Decode servers (current default)
# - 3P1D: 3 Prefill servers + 1 Decode server
# - etc.
#
# Configuration can be customized via environment variables:
#   MODEL: Model to serve
#   PREFILL_GPUS: Comma-separated GPU IDs for prefill servers
#   DECODE_GPUS: Comma-separated GPU IDs for decode servers
#   PREFILL_PORTS: Comma-separated ports for prefill servers
#   DECODE_PORTS: Comma-separated ports for decode servers
#   PROXY_PORT: Proxy server port used to setup XpYd connection.
#   TIMEOUT_SECONDS: Server startup timeout
# =============================================================================

# Configuration - can be overridden via environment variables
MODEL=${MODEL:-meta-llama/Llama-3.1-8B-Instruct}
TIMEOUT_SECONDS=${TIMEOUT_SECONDS:-1200}
PROXY_PORT=${PROXY_PORT:-30001}

# Default 1P3D configuration (1 Prefill + 3 Decode)
PREFILL_GPUS=${PREFILL_GPUS:-0}
DECODE_GPUS=${DECODE_GPUS:-1,2,3}
PREFILL_PORTS=${PREFILL_PORTS:-20003}
DECODE_PORTS=${DECODE_PORTS:-20005,20007,20009}

echo "Warning: P2P NCCL disaggregated prefill XpYd support for vLLM v1 is experimental and subject to change."
echo ""
echo "Architecture Configuration:"
echo "  Model: $MODEL"
echo "  Prefill GPUs: $PREFILL_GPUS, Ports: $PREFILL_PORTS"
echo "  Decode GPUs: $DECODE_GPUS, Ports: $DECODE_PORTS"
echo "  Proxy Port: $PROXY_PORT"
echo "  Timeout: ${TIMEOUT_SECONDS}s"
echo ""

# PIDs of every background process we spawn, so cleanup can reap them.
PIDS=()

# Switch to the directory of the current script
cd "$(dirname "${BASH_SOURCE[0]}")"
48+
# Verify that every helper script this launcher depends on exists in the
# current directory; exit 1 with a diagnostic otherwise.
check_required_files() {
  local files=("disagg_proxy_p2p_nccl_xpyd.py")
  local file
  for file in "${files[@]}"; do
    if [[ ! -f "$file" ]]; then
      echo "Required file $file not found in $(pwd)"
      exit 1
    fi
  done
}
58+
# Validate that HF_TOKEN is set and looks like a real Hugging Face token
# (they always start with the "hf_" prefix); exit 1 otherwise.
check_hf_token() {
  if [[ -z "${HF_TOKEN:-}" ]]; then
    echo "HF_TOKEN is not set. Please set it to your Hugging Face token."
    echo "Example: export HF_TOKEN=your_token_here"
    exit 1
  fi
  if [[ "$HF_TOKEN" != hf_* ]]; then
    echo "HF_TOKEN is not a valid Hugging Face token. Please set it to your Hugging Face token."
    exit 1
  fi
  echo "HF_TOKEN is set and valid."
}
71+
# Require at least 2 GPUs (one for prefill, one for decode), counted via
# nvidia-smi; exit 1 if fewer are visible.
check_num_gpus() {
  local num_gpus
  num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
  if [[ "$num_gpus" -lt 2 ]]; then
    echo "You need at least 2 GPUs to run disaggregated prefill."
    exit 1
  else
    echo "Found $num_gpus GPUs."
  fi
}
82+
# Ensure the Python library named in $1 is importable by python3;
# exit 1 with an installation hint if it is not.
ensure_python_library_installed() {
  local lib=$1
  echo "Checking if $lib is installed..."
  if ! python3 -c "import $lib" >/dev/null 2>&1; then
    echo "$lib is not installed. Please install it via pip install $lib."
    exit 1
  else
    echo "$lib is installed."
  fi
}
92+
# Signal handler: tear down every server/proxy we started by killing our
# own process group, then reap the children and exit cleanly.
cleanup() {
  echo "Stopping everything…"
  trap - INT TERM        # prevent re-entrancy
  kill -- -$$            # negative PID == "this whole process-group"
  wait                   # reap children so we don't leave zombies
  exit 0
}
100+
# Poll localhost:$1/v1/completions once per second until the server answers
# or $TIMEOUT_SECONDS elapses. Returns 0 when ready, 1 on timeout.
wait_for_server() {
  local port=$1
  local timeout_seconds=$TIMEOUT_SECONDS
  local start_time now
  start_time=$(date +%s)

  echo "Waiting for server on port $port..."

  while true; do
    # Any HTTP response (even an error page) means the listener is up.
    if curl -s "localhost:${port}/v1/completions" > /dev/null; then
      echo "Server on port $port is ready."
      return 0
    fi

    now=$(date +%s)
    if (( now - start_time >= timeout_seconds )); then
      echo "Timeout waiting for server on port $port"
      return 1
    fi

    sleep 1
  done
}
123+
# Orchestrate the full run: preflight checks, launch proxy + XpYd servers,
# wait for readiness, run the serving benchmark, then tear everything down.
main() {
  check_required_files
  check_hf_token
  check_num_gpus
  ensure_python_library_installed pandas
  ensure_python_library_installed datasets
  ensure_python_library_installed vllm
  ensure_python_library_installed quart

  trap cleanup INT
  trap cleanup USR1
  trap cleanup TERM

  echo "Launching disaggregated serving components..."
  echo "Please check the log files for detailed output:"
  echo "  - prefill*.log: Prefill server logs"
  echo "  - decode*.log: Decode server logs"
  echo "  - proxy.log: Proxy server log"

  # ===========================================================================
  # Launch Proxy Server
  # ===========================================================================
  echo ""
  echo "Starting proxy server on port $PROXY_PORT..."
  python3 disagg_proxy_p2p_nccl_xpyd.py &
  PIDS+=($!)

  # Parse GPU and port arrays
  IFS=',' read -ra PREFILL_GPU_ARRAY <<< "$PREFILL_GPUS"
  IFS=',' read -ra DECODE_GPU_ARRAY <<< "$DECODE_GPUS"
  IFS=',' read -ra PREFILL_PORT_ARRAY <<< "$PREFILL_PORTS"
  IFS=',' read -ra DECODE_PORT_ARRAY <<< "$DECODE_PORTS"

  # ===========================================================================
  # Launch Prefill Servers (X Producers)
  # ===========================================================================
  echo ""
  echo "Starting ${#PREFILL_GPU_ARRAY[@]} prefill server(s)..."
  local i gpu_id port kv_port
  for i in "${!PREFILL_GPU_ARRAY[@]}"; do
    gpu_id=${PREFILL_GPU_ARRAY[$i]}
    port=${PREFILL_PORT_ARRAY[$i]}
    kv_port=$((21001 + i))  # distinct KV-transfer port per prefill server

    echo "  Prefill server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
    CUDA_VISIBLE_DEVICES=$gpu_id VLLM_USE_V1=1 vllm serve $MODEL \
        --enforce-eager \
        --host 0.0.0.0 \
        --port $port \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --dtype float16 \
        --max-model-len 10000 \
        --max-num-batched-tokens 10000 \
        --max-num-seqs 256 \
        --trust-remote-code \
        --gpu-memory-utilization 0.9 \
        --disable-log-request \
        --kv-transfer-config \
        "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_size\":\"1e1\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > prefill$((i+1)).log 2>&1 &
    PIDS+=($!)
  done

  # ===========================================================================
  # Launch Decode Servers (Y Decoders)
  # ===========================================================================
  echo ""
  echo "Starting ${#DECODE_GPU_ARRAY[@]} decode server(s)..."
  for i in "${!DECODE_GPU_ARRAY[@]}"; do
    gpu_id=${DECODE_GPU_ARRAY[$i]}
    port=${DECODE_PORT_ARRAY[$i]}
    kv_port=$((22001 + i))  # distinct KV-transfer port per decode server

    echo "  Decode server $((i+1)): GPU $gpu_id, Port $port, KV Port $kv_port"
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=$gpu_id vllm serve $MODEL \
        --enforce-eager \
        --host 0.0.0.0 \
        --port $port \
        --tensor-parallel-size 1 \
        --seed 1024 \
        --dtype float16 \
        --max-model-len 10000 \
        --max-num-batched-tokens 10000 \
        --max-num-seqs 256 \
        --trust-remote-code \
        --gpu-memory-utilization 0.7 \
        --disable-log-request \
        --kv-transfer-config \
        "{\"kv_connector\":\"P2pNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_size\":\"8e9\",\"kv_port\":\"$kv_port\",\"kv_connector_extra_config\":{\"proxy_ip\":\"0.0.0.0\",\"proxy_port\":\"$PROXY_PORT\",\"http_port\":\"$port\",\"send_type\":\"PUT_ASYNC\",\"nccl_num_channels\":\"16\"}}" > decode$((i+1)).log 2>&1 &
    PIDS+=($!)
  done

  # ===========================================================================
  # Wait for All Servers to Start
  # ===========================================================================
  echo ""
  echo "Waiting for all servers to start..."
  for port in "${PREFILL_PORT_ARRAY[@]}" "${DECODE_PORT_ARRAY[@]}"; do
    if ! wait_for_server "$port"; then
      echo "Failed to start server on port $port"
      cleanup
      exit 1
    fi
  done

  echo ""
  echo "All servers are up. Starting benchmark..."

  # ===========================================================================
  # Run Benchmark
  # ===========================================================================
  cd ../../../benchmarks/
  python3 benchmark_serving.py --port 10001 --seed "$(date +%s)" \
      --model $MODEL \
      --dataset-name random --random-input-len 7500 --random-output-len 200 \
      --num-prompts 200 --burstiness 100 --request-rate 2 | tee benchmark.log

  echo "Benchmarking done. Cleaning up..."

  cleanup
}

main