File tree Expand file tree Collapse file tree 1 file changed +94
-0
lines changed Expand file tree Collapse file tree 1 file changed +94
-0
#!/bin/bash
#
# Bootstrap a multi-node Ray cluster.
#
# Usage:
#   <script> leader --ray_cluster_size=<n> [--ray_port=<port>]
#                   [--ray_init_timeout=<seconds>] [extra `ray start` args...]
#   <script> worker --ray_address=<host> [--ray_port=<port>]
#                   [--ray_init_timeout=<seconds>] [extra `ray start` args...]
#
# Any argument that is not one of the options above is forwarded verbatim to
# `ray start`.
#
# Exit status: 0 on success, 1 on a usage error, a failed head start, or when
# the timeout elapses.

# Defaults; each can be overridden with the matching --ray_* option.
ray_port=6379
ray_init_timeout=300
# Arguments passed through to `ray start` untouched.
declare -a start_params=()
# Seconds to sleep between attempts; also the step of the timeout counter.
readonly poll_interval=5

#######################################
# Print a message to stderr and terminate the script with status 1.
# Arguments: $* - message text
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Abort unless a value is a plain non-negative decimal integer.
# Leading zeros are rejected so the value is never read as octal inside (( )).
# Arguments: $1 - option name (for the error message), $2 - value to check
#######################################
require_uint() {
  [[ "$2" =~ ^(0|[1-9][0-9]*)$ ]] \
    || die "Error: $1 must be a non-negative integer, got '$2'"
}

#######################################
# Join an existing Ray head node, retrying until ray_init_timeout elapses.
# Globals:   ray_port, ray_init_timeout, start_params (read and written)
# Arguments: the worker command-line options
# Exits:     0 once `ray start` returns success, 1 on usage error or timeout
#######################################
run_worker() {
  local ray_address=""
  local arg i

  for arg in "$@"; do
    case "$arg" in
      --ray_address=*)
        ray_address=${arg#*=}
        ;;
      --ray_port=*)
        ray_port=${arg#*=}
        ;;
      --ray_init_timeout=*)
        ray_init_timeout=${arg#*=}
        ;;
      *)
        start_params+=("$arg")
        ;;
    esac
  done

  [[ -n "$ray_address" ]] || die "Error: Missing argument --ray_address"
  require_uint --ray_init_timeout "$ray_init_timeout"

  for (( i = 0; i < ray_init_timeout; i += poll_interval )); do
    # Per the Ray CLI docs, --block keeps `ray start` in the foreground, so a
    # zero status is only observed after that process has returned.
    if ray start --address="${ray_address}:${ray_port}" --block "${start_params[@]}"; then
      echo "Worker: Ray runtime started with head address ${ray_address}:${ray_port}"
      exit 0
    fi
    echo "Waiting until the ray worker is active..."
    sleep "$poll_interval"
  done

  die "Ray worker starts timeout, head address: ${ray_address}:${ray_port}"
}

#######################################
# Start the Ray head node and wait until the expected number of nodes is alive.
# Globals:   ray_port, ray_init_timeout, start_params (read and written)
# Arguments: the leader command-line options
# Exits:     0 when the alive-node count equals --ray_cluster_size,
#            1 on usage error, failed head start, or timeout
#######################################
run_leader() {
  local ray_cluster_size=""
  local active_nodes arg i

  for arg in "$@"; do
    case "$arg" in
      --ray_port=*)
        ray_port=${arg#*=}
        ;;
      --ray_cluster_size=*)
        ray_cluster_size=${arg#*=}
        ;;
      --ray_init_timeout=*)
        ray_init_timeout=${arg#*=}
        ;;
      *)
        start_params+=("$arg")
        ;;
    esac
  done

  [[ -n "$ray_cluster_size" ]] || die "Error: Missing argument --ray_cluster_size"
  require_uint --ray_cluster_size "$ray_cluster_size"
  require_uint --ray_init_timeout "$ray_init_timeout"

  # Start the ray daemon. Fail fast: without a head node the polling below
  # could only ever time out.
  ray start --head --port="$ray_port" "${start_params[@]}" \
    || die "Error: failed to start the ray head node on port $ray_port"

  # Wait until all workers are active.
  for (( i = 0; i < ray_init_timeout; i += poll_interval )); do
    active_nodes=$(python3 -c 'import ray; ray.init(); print(sum(node["Alive"] for node in ray.nodes()))')
    # The probe prints nothing (or something non-numeric) when it fails; treat
    # that as "not ready yet" rather than feeding it to an integer comparison.
    if [[ "$active_nodes" =~ ^[0-9]+$ ]] && (( 10#$active_nodes == ray_cluster_size )); then
      echo "All ray workers are active and the ray cluster is initialized successfully."
      exit 0
    fi
    echo "Wait for all ray workers to be active. ${active_nodes:-0}/${ray_cluster_size} is active"
    sleep "$poll_interval"
  done

  die "Waiting for all ray workers to be active timed out."
}

#######################################
# Entry point: dispatch on the first argument and hand the rest to the handler.
# Arguments: $1 - subcommand (worker | leader), remaining - its options
#######################################
main() {
  local subcommand=${1:-}
  # A bare `shift` fails when there are no arguments at all.
  if (( $# > 0 )); then
    shift
  fi

  case "$subcommand" in
    worker)
      run_worker "$@"
      ;;
    leader)
      run_leader "$@"
      ;;
    *)
      die "unknown subcommand: $subcommand"
      ;;
  esac
}

main "$@"
You can’t perform that action at this time.
0 commit comments