Skip to content

Commit 981f3c8

Browse files
authored
[Misc] Adding script to setup ray for multi-node vllm deployments (#12913)
1 parent 44c33f0 commit 981f3c8

File tree

1 file changed

+94
-0
lines changed

1 file changed

+94
-0
lines changed
Lines changed: 94 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,94 @@
1+
#!/bin/bash
#
# Sets up Ray for multi-node vLLM deployments.
#
# Usage:
#   <script> leader --ray_cluster_size=<N> [--ray_port=<port>]
#            [--ray_init_timeout=<seconds>] [extra `ray start` arguments...]
#   <script> worker --ray_address=<head address> [--ray_port=<port>]
#            [--ray_init_timeout=<seconds>] [extra `ray start` arguments...]
#
# Any argument that is not one of the options above is forwarded verbatim to
# `ray start`. Defaults: --ray_port=6379, --ray_init_timeout=300.
#
# Exit status: 0 on success, 1 on a usage error, a failed head start, or a
# timeout.
#
# Strict mode (`set -eu`) is deliberately not enabled: the retry loops rely on
# commands failing, and expanding an empty array under `set -u` aborts on
# older bash releases.

# Seconds to pause between two attempts in both polling loops.
readonly POLL_INTERVAL=5

# Prints the given message to stderr and terminates the script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Aborts unless a value consists of decimal digits only.
# Arguments: $1 - option name used in the error message
#            $2 - value to check
require_uint() {
  [[ "$2" =~ ^[0-9]+$ ]] \
    || die "Error: $1 must be a non-negative integer, got '$2'"
}

# Joins an existing Ray cluster as a worker node, retrying every
# POLL_INTERVAL seconds until --ray_init_timeout is used up.
# Arguments: the worker command-line options (see the usage header).
run_worker() {
  local ray_address=""
  local ray_port=6379
  local ray_init_timeout=300
  local -a start_params=()
  local elapsed

  while (( $# > 0 )); do
    case "$1" in
      --ray_address=*)
        ray_address="${1#*=}"
        ;;
      --ray_port=*)
        ray_port="${1#*=}"
        ;;
      --ray_init_timeout=*)
        ray_init_timeout="${1#*=}"
        ;;
      *)
        start_params+=("$1")
        ;;
    esac
    shift
  done

  [[ -n "$ray_address" ]] || die "Error: Missing argument --ray_address"
  require_uint "--ray_init_timeout" "$ray_init_timeout"
  # Force base 10 so a value such as "090" is not parsed as (invalid) octal.
  ray_init_timeout=$(( 10#$ray_init_timeout ))

  for (( elapsed = 0; elapsed < ray_init_timeout; elapsed += POLL_INTERVAL )); do
    # `ray start` is run with --block (see the Ray CLI reference); a non-zero
    # status is treated as "could not join yet" and triggers another attempt.
    if ray start --address="${ray_address}:${ray_port}" --block \
      "${start_params[@]}"; then
      echo "Worker: Ray runtime started with head address $ray_address:$ray_port"
      exit 0
    fi
    echo "Waiting until the ray worker is active..."
    sleep "$POLL_INTERVAL"
  done

  die "Ray worker starts timeout, head address: $ray_address:$ray_port"
}

# Starts the Ray head node and waits until the number of alive nodes equals
# --ray_cluster_size, polling every POLL_INTERVAL seconds until
# --ray_init_timeout is used up.
# Arguments: the leader command-line options (see the usage header).
run_leader() {
  local ray_cluster_size=""
  local ray_port=6379
  local ray_init_timeout=300
  local -a start_params=()
  local elapsed active_nodes

  while (( $# > 0 )); do
    case "$1" in
      --ray_port=*)
        ray_port="${1#*=}"
        ;;
      --ray_cluster_size=*)
        ray_cluster_size="${1#*=}"
        ;;
      --ray_init_timeout=*)
        ray_init_timeout="${1#*=}"
        ;;
      *)
        start_params+=("$1")
        ;;
    esac
    shift
  done

  [[ -n "$ray_cluster_size" ]] || die "Error: Missing argument --ray_cluster_size"
  require_uint "--ray_cluster_size" "$ray_cluster_size"
  require_uint "--ray_init_timeout" "$ray_init_timeout"
  # Force base 10 so values with leading zeros are not parsed as octal.
  ray_cluster_size=$(( 10#$ray_cluster_size ))
  ray_init_timeout=$(( 10#$ray_init_timeout ))

  # Start the ray daemon. Stop right away if that fails: polling for nodes
  # without a running head cannot produce a meaningful answer.
  ray start --head --port="$ray_port" "${start_params[@]}" \
    || die "Error: 'ray start --head' failed (port: $ray_port)"

  # Wait until all workers are active.
  for (( elapsed = 0; elapsed < ray_init_timeout; elapsed += POLL_INTERVAL )); do
    active_nodes=$(python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))')
    # Keep only the last line of output, and fall back to 0 when the probe
    # failed or printed something that is not a number, so the comparison
    # below never sees an empty or malformed operand.
    active_nodes="${active_nodes##*$'\n'}"
    [[ "$active_nodes" =~ ^[0-9]+$ ]] || active_nodes=0

    if (( 10#$active_nodes == ray_cluster_size )); then
      echo "All ray workers are active and the ray cluster is initialized successfully."
      exit 0
    fi
    echo "Wait for all ray workers to be active. $active_nodes/$ray_cluster_size is active"
    sleep "$POLL_INTERVAL"
  done

  die "Waiting for all ray workers to be active timed out."
}

# Dispatches to the subcommand named by the first argument and hands the
# remaining arguments over to it.
main() {
  local subcommand="${1:-}"
  if (( $# > 0 )); then
    shift
  fi

  case "$subcommand" in
    worker)
      run_worker "$@"
      ;;
    leader)
      run_leader "$@"
      ;;
    *)
      die "unknown subcommand: $subcommand"
      ;;
  esac
}

main "$@"

0 commit comments

Comments
 (0)