-
Notifications
You must be signed in to change notification settings - Fork 146
Expand file tree
/
Copy pathenv.sh
More file actions
executable file
·149 lines (129 loc) · 6.23 KB
/
env.sh
File metadata and controls
executable file
·149 lines (129 loc) · 6.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/bin/bash
# SGLang/MoRI environment setup for multi-node disaggregated serving.
#
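# REQUIRED ENVIRONMENT VARIABLES:
#   IBDEVICES    - RDMA/InfiniBand device names (e.g., ionic_0,ionic_1,... or mlx5_0,mlx5_1,...).
#                  Normally set by the runner script (runners/launch_mi355x-amds.sh). When it is
#                  empty, this script infers it from the short hostname and exits with an error
#                  if the hostname matches no known cluster.
#
# OPTIONAL ENVIRONMENT VARIABLES:
#   MORI_RDMA_TC - RDMA traffic class (e.g., 96, 104). Set by the runner if the cluster uses QoS;
#                  otherwise detected via nicctl or from the hostname when possible.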
# Turn on command tracing for the detection steps that follow; the script
# switches it off again with `set +x` after the socket interfaces are resolved.
set -o xtrace
# Exported so that child Python processes inherit the flag.
export PYTHONDONTWRITEBYTECODE="1"
# IBDEVICES resolution
# A non-empty IBDEVICES from the runner (runners/launch_mi355x-amds.sh) or the
# environment always wins. Otherwise the device list is chosen from the short
# hostname; an unrecognised hostname is a hard error (exit 1).
if [[ -n "$IBDEVICES" ]]; then
  echo "[INFO] Using IBDEVICES=$IBDEVICES (set by runner or environment)"
else
  NODENAME=$(hostname -s)
  case "$NODENAME" in
    GPU* | smci355-ccs-aus*)
      export IBDEVICES=ionic_0,ionic_1,ionic_2,ionic_3,ionic_4,ionic_5,ionic_6,ionic_7
      ;;
    mia1*)
      export IBDEVICES=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
      ;;
    chi-mi325x*)
      # Vultr/CPE MI325X cluster: Broadcom RoCE (bnxt_re); bnxt_re6 is DOWN, so the
      # list skips it and uses bnxt_re8 instead.
      export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8
      ;;
    chi-mi300x*)
      # Vultr/CPE MI300X cluster: Broadcom RoCE (bnxt_re); all 8 devices present.
      export IBDEVICES=bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7
      ;;
    *)
      echo "ERROR: Unable to detect cluster from hostname $NODENAME and IBDEVICES not set" >&2
      exit 1
      ;;
  esac
  echo "[INFO] Auto-detected IBDEVICES=$IBDEVICES from hostname $NODENAME"
fi
# Export unconditionally: a value inherited as a plain shell variable would
# otherwise not reach child processes.
export IBDEVICES
# Auto-detect default network interface (portable across clusters)

#######################################
# Print the interface used by the first default route that names one.
# The interface is taken from the token following "dev" rather than from a
# fixed column, so gateway-less routes such as "default dev tun0 scope link"
# are handled as well as "default via <gw> dev <if> ...".
# Outputs: interface name on stdout; nothing when no default route has a
#          "dev" token (or when `ip` is unavailable).
#######################################
_sglang_default_route_iface() {
  ip route | awk '
    $1 == "default" {
      for (i = 2; i < NF; i++) {
        if ($i == "dev") { print $(i + 1); exit }
      }
    }'
}

# Resolve once so GLOO and NCCL are guaranteed to use the same interface.
_sglang_default_iface=$(_sglang_default_route_iface)
export GLOO_SOCKET_IFNAME=$_sglang_default_iface
export NCCL_SOCKET_IFNAME=$_sglang_default_iface
unset _sglang_default_iface
set +x
# NCCL is handed the same RDMA device list that was resolved into IBDEVICES.
export NCCL_IB_HCA="${IBDEVICES}"

# SGLang switches; both disaggregation timeouts use the same value.
export SGLANG_USE_AITER=1
export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200 \
  SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200

# GLM-5: uses NSA (not MLA), so fused-decode-MLA is disabled and fast loading
# is enabled. Other models leave these three variables untouched.
case "$MODEL_NAME" in
  GLM-5-FP8)
    export SGLANG_ROCM_FUSED_DECODE_MLA=0 \
      ROCM_QUICK_REDUCE_QUANTIZATION=INT4 \
      SAFETENSORS_FAST_GPU=1
    ;;
esac

# Disable allocating memory in one pass
export MORI_SHMEM_MODE=ISOLATION

# Model-dependent MoRI settings. Any model whose name contains "mxfp4" turns
# FP8 dispatch off and gets a smaller prefill dispatch budget.
# Per-role dispatch token limits (prefill uses higher throughput, decode uses lower)
case "$MODEL_NAME" in
  *mxfp4*)
    export SGLANG_MORI_FP8_DISP=False
    export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288
    ;;
  *)
    export SGLANG_MORI_FP8_DISP=True
    export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384
    ;;
esac
export SGLANG_MORI_FP4_DISP=False SGLANG_MORI_FP8_COMB=False
export MORI_MAX_DISPATCH_TOKENS_DECODE=160

# NOTE(review): the original carried the remark "set MTP size=1 when EP16" at
# this point; how it relates to the threshold below is not visible in this file.
# The threshold is always twice the decode dispatch limit.
export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$(( 2 * MORI_MAX_DISPATCH_TOKENS_DECODE ))

export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
# MoRI IO queue-pair limits and log level.
export MORI_IO_QP_MAX_SEND_WR=16384 MORI_IO_QP_MAX_CQE=32768 MORI_IO_QP_MAX_SGE=4
export MORI_APP_LOG_LEVEL=INFO
# Router logging control:
#   0 (default) keeps noisy per-request access logs out of stdout while still logging to file.
#   1 mirrors router logs to stdout via tee (useful for live debugging).
# A non-empty value supplied by the caller is kept; unset or empty becomes 0.
: "${SGLANG_ROUTER_STDOUT_LOGS:=0}"
export SGLANG_ROUTER_STDOUT_LOGS
# QoS/DSCP configuration
# Priority order: 1) Set by runner, 2) Detect via nicctl, 3) Detect from hostname

#######################################
# Choose RDMA QoS settings from a short hostname.
# Globals:   exports MORI_RDMA_TC; also exports MORI_RDMA_SL for the
#            chi-mi325x*/chi-mi300x* hosts
# Arguments: $1 - short hostname
# Outputs:   one [INFO] line on stdout when the hostname is recognised
# Returns:   0 when a known cluster matched, 1 otherwise (nothing exported)
#######################################
_mori_qos_from_hostname() {
  case "$1" in
    GPU* | smci355-ccs-aus*)
      export MORI_RDMA_TC=96
      echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $1"
      ;;
    mia1*)
      export MORI_RDMA_TC=104
      echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $1"
      ;;
    chi-mi325x* | chi-mi300x*)
      # Vultr/CPE MI325X/MI300X: Broadcom Thor 2, DSCP AF31(26)->prio 3, TC=4*26=104
      export MORI_RDMA_TC=104
      export MORI_RDMA_SL=3
      echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL from hostname $1"
      ;;
    *)
      return 1
      ;;
  esac
}

#######################################
# Derive QoS settings from `nicctl show qos`.
# nicctl is queried once; the PFC no-drop priority and the DSCP mapped to that
# priority are both parsed from the same report.
# Globals:   sets ND_PRIO, ND_DSCP and TC (deliberately not local: the
#            original script left them set in the sourcing shell);
#            exports MORI_RDMA_SL and MORI_RDMA_TC on success
# Outputs:   one [INFO] line on stdout on success
# Returns:   0 on success; 1 when the priority is missing or the DSCP is not a
#            plain decimal number (nothing exported in that case)
#######################################
_mori_qos_from_nicctl() {
  local qos_report
  qos_report=$(nicctl show qos 2>/dev/null)
  ND_PRIO=$(awk '/PFC no-drop priorities/ {print $NF; exit}' <<<"$qos_report")
  ND_DSCP=$(awk -v p="$ND_PRIO" '
    $1 == "DSCP" && $2 == ":" && $NF == p {
      print $3; exit
    }' <<<"$qos_report")
  # Only a decimal DSCP may reach the arithmetic below: values such as "24-31"
  # would otherwise be evaluated as an expression and yield a bogus class.
  [[ -n "$ND_PRIO" && "$ND_DSCP" =~ ^[0-9]+$ ]] || return 1
  # 10# forces base 10 so a zero-padded value (e.g. 08) is not read as octal.
  TC=$(( 4 * 10#$ND_DSCP ))
  export MORI_RDMA_SL=$ND_PRIO
  export MORI_RDMA_TC=$TC
  echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL"
}

#######################################
# Resolve MORI_RDMA_TC (and, where known, MORI_RDMA_SL).
# A non-empty MORI_RDMA_TC is kept as is; otherwise nicctl is tried when it is
# on PATH, then the hostname table. Failing all of these only logs a message.
# Globals:   reads MORI_RDMA_TC; sets NODENAME when hostname detection runs
# Returns:   0 always
#######################################
_mori_configure_qos() {
  if [[ -n "$MORI_RDMA_TC" ]]; then
    echo "[INFO] Using MORI_RDMA_TC=$MORI_RDMA_TC (set by runner or environment)"
    return 0
  fi

  if command -v nicctl &> /dev/null; then
    if _mori_qos_from_nicctl; then
      return 0
    fi
    echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
    # Fall back to hostname-based detection
    NODENAME=$(hostname -s)
    if ! _mori_qos_from_hostname "$NODENAME"; then
      echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
    fi
  else
    # nicctl not available, try hostname-based detection
    NODENAME=$(hostname -s)
    if ! _mori_qos_from_hostname "$NODENAME"; then
      echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
      echo " This is normal for clusters without QoS or outside Docker containers."
    fi
  fi
  return 0
}

_mori_configure_qos
# FIXME: WA for latest upstream 0305 image
# Prepend /sgl-workspace/aiter to PYTHONPATH. The ":" separator is emitted only
# when PYTHONPATH already has content: a bare trailing ":" would add an empty
# search-path entry, which Python treats as the current working directory.
export PYTHONPATH="/sgl-workspace/aiter${PYTHONPATH:+:${PYTHONPATH}}"