diff --git a/README.md b/README.md index 40b3a62..8ff80c4 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Below are blueprints of supported models along with their documentation. | [**PyTorch PEFT/FSDP fine-tuning**](scripts/pytorch_train/HF_PEFT_FSDP/README.md) | Finetuning a HF model with LoRA approach & FSDP strategy | Llama-2-70b-chat-hf | | [**Large EP microbenchmark**](scripts/large-ep-benchmark/README.md) | MoE Large Expert Paralellism with MoRI-EP & DeepEP communication microbenchmarks | no specific models | | [**vLLM disaggregated P/D inference**](scripts/vllm_dissag/README.MD) | Distributed Inference P/D disaggregation with vLLM | DeepSeek-V3, Llama-3.3-70B-Instruct-FP8-KV, Llama-3.1-405B-Instruct-FP8-KV, gpt-oss-120b | -| [**SGLang disaggregated P/D inference**](scripts/sglang_disagg/README.MD) | Distributed Inference P/D disggregation with SGLang | Qwen3-32B, Llama-3.1-8B-Instruct, Llama-3.3-70B-Instruct-FP8-KV, Llama-3.1-405B-Instruct-FP8-KV, DeepSeek-V3, Mixtral-8x7B-v0.1 | +| [**SGLang disaggregated P/D inference**](scripts/sglang_disagg/README.MD) | Distributed Inference P/D disaggregation with SGLang | Qwen3-32B, Llama-3.1-8B-Instruct, Llama-3.3-70B-Instruct-FP8-KV, Llama-3.1-405B-Instruct-FP8-KV, DeepSeek-V3, Mixtral-8x7B-v0.1 | | [**KVCache Transfer Bench**](scripts/kvcache_transfer_bench/README.md) | Inter-node Transfer Benchmark | no specific models | ## Table of Contents diff --git a/docker/vllm_disagg_inference.ubuntu.amd.Dockerfile b/docker/vllm_disagg_inference.ubuntu.amd.Dockerfile index 98634a3..882fc0d 100644 --- a/docker/vllm_disagg_inference.ubuntu.amd.Dockerfile +++ b/docker/vllm_disagg_inference.ubuntu.amd.Dockerfile @@ -1,19 +1,28 @@ -ARG BASE_IMAGE=rocm/vllm:v0.14.0_amd_dev +ARG BASE_IMAGE=rocm/vllm-dev:base_torch2.10_triton3.6_rocm7.2_torch_build_20260216 FROM ${BASE_IMAGE} +ENTRYPOINT [] + WORKDIR /root +RUN sed -i 's|http://|https://|g' /etc/apt/sources.list + ENV _ROCM_DIR=/opt/rocm ENV 
_UCX_SOURCE=https://github.com/ROCm/ucx.git -ENV _UCX_BRANCH=v1.19.x +ENV _UCX_BRANCH=da3fac2a ENV _UCX_INSTALL_DIR=/usr/local/ucx/ -ENV _RIXL_SOURCE=github.com/ROCm/RIXL.git -ENV _RIXL_BRANCH=develop +ENV _RIXL_SOURCE=https://github.com/ROCm/RIXL.git +ENV _RIXL_BRANCH=f33a5599 ENV _RIXL_INSTALL_DIR=/usr/local/RIXL/install ENV _NIXLBENCH_INSTALL_DIR=/usr/local/RIXL +ARG GFX_COMPILATION_ARCH="gfx942" +ARG NIC_COMPILATION_ARCH="cx7" +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +ARG VLLM_COMMIT=7d6917bef552d6aff70142ab9fb8af648081d4db + RUN pip3 install meson==0.64.0 RUN pip3 install "pybind11[global]" @@ -31,8 +40,9 @@ RUN set -e && apt -y install gcc make libtool autoconf librdmacm-dev rdmacm-util RUN apt install -y libgflags-dev # Install UCX -RUN git clone ${_UCX_SOURCE} -b ${_UCX_BRANCH} && \ +RUN git clone ${_UCX_SOURCE} && \ cd ucx && \ + git checkout ${_UCX_BRANCH} && \ ./autogen.sh && \ mkdir -p build && \ cd build && \ @@ -46,7 +56,7 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/ucx/lib/ ENV PATH=$PATH:/usr/local/ucx/bin/ RUN set -e && apt update && \ - apt install -y libaio-dev liburing-dev etcd etcd-server etcd-client libcpprest-dev libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc && \ + apt install -y libaio-dev liburing-dev libcpprest-dev libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc wget && \ wget https://github.com/google/googletest/archive/refs/tags/v1.14.0.tar.gz && \ tar -xzf v1.14.0.tar.gz && \ cd googletest-1.14.0 && \ @@ -57,32 +67,13 @@ RUN set -e && apt update && \ make install && \ cd ../.. 
-# Expected etcd at /usr/local/bin/etcd//etcd -RUN wget https://github.com/etcd-io/etcd/releases/download/v3.6.0-rc.5/etcd-v3.6.0-rc.5-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \ - mkdir -p /usr/local/bin/etcd && \ - tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \ - rm /tmp/etcd.tar.gz -ENV PATH=$PATH:/usr/local/bin/etcd/ - -RUN set -e && echo "Compiling etcd-cpp API" && \ - git clone https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git && \ - cd etcd-cpp-apiv3 && \ - mkdir build && cd build && \ - cmake -DCMAKE_FIND_ROOT_PATH=/usr/grpc .. && \ - make -j && \ - make install && \ - cd ../.. && \ - echo "etcd-cpp installation completed." - ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/ -ENV CMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH:/usr/local/lib/cmake/etcd-cpp-api/ ENV PATH=/root/.local/bin:${_UCX_INSTALL_DIR}/bin:$PATH ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${_RIXL_INSTALL_DIR}/lib/x86_64-linux-gnu -ENV CMAKE_PREFIX_PATH=/usr/local/lib/cmake/etcd-cpp-api/:/usr/grpc/lib/cmake/:/usr/local/lib/cmake -RUN set -e && git clone https://${_RIXL_SOURCE} -b ${_RIXL_BRANCH} && \ +RUN set -e && git clone ${_RIXL_SOURCE} && \ cd RIXL && \ - git checkout ed772c8d0d8a47c7b4e1a622b13c4f6087a4972a && \ + git checkout ${_RIXL_BRANCH} && \ meson setup build/ --prefix=${_RIXL_INSTALL_DIR} \ -Ducx_path=${_UCX_INSTALL_DIR} \ -Ddisable_gds_backend=true \ @@ -112,10 +103,6 @@ RUN set -e && echo "Compiling NixlBench" && \ ninja install && \ echo "NixlBench compilation complete" -# Only need tests/ for toy_proxy_server.py; base image already has vLLM installed -RUN git clone --depth 1 https://github.com/vllm-project/vllm.git /tmp/vllm-src && \ - cp -r /tmp/vllm-src/tests /app/vllm/tests && \ - rm -rf /tmp/vllm-src # Install Rust compiler (required for building vllm-router) RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y @@ -123,3 +110,66 @@ ENV PATH="/root/.cargo/bin:${PATH}" # Install vllm-router RUN pip install vllm-router + +WORKDIR 
/app + +# versions.txt is provided by the base image and contains MORI_REPO / MORI_BRANCH entries. +RUN pip install tqdm prettytable +RUN git clone --recursive $(grep '^MORI_REPO:' /app/versions.txt | cut -d' ' -f2) && \ + cd mori && \ + git checkout $(grep '^MORI_BRANCH:' /app/versions.txt | cut -d' ' -f2) + +RUN git clone --no-checkout --filter=blob:none https://github.com/ROCm/rocm-systems.git && cd rocm-systems && \ + git sparse-checkout set --cone projects/rocshmem && \ + git checkout develop + +WORKDIR /app/rocm-systems/projects/rocshmem +RUN echo "ROCSHMEM_REPO=\"https://github.com/ROCm/rocm-systems.git\"" >> /app/versions.txt +RUN echo "ROCSHMEM_BRANCH=\"$(git log | head -1 | awk '{print $2}' | cut -c1-8)\"" >> /app/versions.txt +RUN mkdir -p /app/rocshmem-build +WORKDIR /app/rocshmem-build +RUN /app/rocm-systems/projects/rocshmem/scripts/build_configs/all_backends -DUSE_EXTERNAL_MPI=OFF -DGPU_TARGETS=$GFX_COMPILATION_ARCH + +WORKDIR /app +RUN git clone https://github.com/ROCm/DeepEP.git +WORKDIR /app/DeepEP +RUN echo "DEEPEP_REPO=\"https://github.com/ROCm/DeepEP.git\"" >> /app/versions.txt +RUN echo "DEEPEP_BRANCH=\"$(git log | head -1 | awk '{print $2}' | cut -c1-8)\"" >> /app/versions.txt +RUN PYTORCH_ROCM_ARCH=$GFX_COMPILATION_ARCH CFLAGS="-O3 -fPIC" CXXFLAGS="-O3 -fPIC --offload-arch=$GFX_COMPILATION_ARCH" HIP_CXX_FLAGS="-O3 -fPIC" \ + python3 setup.py --variant rocm --nic $NIC_COMPILATION_ARCH build develop + +# Uninstall vLLM from the base image, then install the pinned commit from source (ROCm). +# TODO: Remove this installation details after upstream vllm is stable. 
+RUN pip uninstall -y vllm || true
+RUN pip install setuptools-scm "huggingface-hub[cli]"
+RUN pip install quart msgpack --ignore-installed blinker
+RUN rm -rf /tmp/vllm-src && \
+    git clone --recursive "${VLLM_REPO}" /tmp/vllm-src && \
+    cd /tmp/vllm-src && \
+    git checkout "${VLLM_COMMIT}" && \
+    git submodule update --init --recursive && \
+    pip install -r requirements/rocm.txt && \
+    pip install -r requirements/kv_connectors_rocm.txt && \
+    (PYTORCH_ROCM_ARCH=${GFX_COMPILATION_ARCH} python setup.py install || \
+     { echo "ERROR: vLLM build from source failed; base-image vLLM was already uninstalled, aborting image build" >&2; exit 1; }) && \
+    mkdir -p /app/vllm && \
+    cp -r tests /app/vllm/tests && \
+    cp -r examples /app/vllm/examples && \
+    cp -r benchmarks /app/vllm/benchmarks && \
+    rm -rf /tmp/vllm-src
+
+WORKDIR /app
+
+ENV ROCSHMEM_TEST_UUID=1
+ENV ROCSHMEM_HEAP_SIZE=6442450944
+
+RUN pip install --upgrade vllm-router && \
+    pip install py-spy && \
+    pip install --ignore-installed --force-reinstall flask
+
+RUN echo "UCX_REPO=${_UCX_SOURCE}" >> /app/versions.txt && \
+    echo "UCX_BRANCH=${_UCX_BRANCH}" >> /app/versions.txt && \
+    echo "RIXL_REPO=${_RIXL_SOURCE}" >> /app/versions.txt && \
+    echo "RIXL_BRANCH=${_RIXL_BRANCH}" >> /app/versions.txt
+
+RUN cat /app/versions.txt
diff --git a/scripts/sglang_disagg/README.MD b/scripts/sglang_disagg/README.MD
index 269a5e9..2fd0199 100644
--- a/scripts/sglang_disagg/README.MD
+++ b/scripts/sglang_disagg/README.MD
@@ -1,4 +1,4 @@
-## List of Models - focus SGlang Disaggerated P/D inference
+## List of Models - focus SGLang Disaggregated P/D inference
 
 Dense Models
 - Qwen3-32B (https://huggingface.co/Qwen/Qwen3-32B)
diff --git a/scripts/sglang_disagg/sglang_disagg_server.sh b/scripts/sglang_disagg/sglang_disagg_server.sh
index 7b97cf5..261dcdd 100755
--- a/scripts/sglang_disagg/sglang_disagg_server.sh
+++ b/scripts/sglang_disagg/sglang_disagg_server.sh
@@ -218,7 +218,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then
 --node-ips ${MASTER_ADDR} \
--node-ports 2322 - echo "Waiting untill proxy server closes..." + echo "Waiting until proxy server closes..." python $MOONCAKE_COOKBOOK_PATH/socket_wait.py \ --remote-ip ${MASTER_ADDR} \ --remote-port 2322 @@ -254,7 +254,7 @@ else --node-ips ${MASTER_ADDR} \ --node-ports 2322 - echo "Waiting untill proxy server closes..." + echo "Waiting until proxy server closes..." python $MOONCAKE_COOKBOOK_PATH/socket_wait.py \ --remote-ip ${MASTER_ADDR} \ --remote-port 2322 diff --git a/scripts/vllm_dissag/README.MD b/scripts/vllm_dissag/README.MD index fd98dfd..3726d15 100644 --- a/scripts/vllm_dissag/README.MD +++ b/scripts/vllm_dissag/README.MD @@ -1,4 +1,4 @@ -## List of Models - focus VLLM Dissagerated P/D inference +## List of Models - focus VLLM Disaggregated P/D inference Dense Models - deepseek-ai/DeepSeek-V3 (https://huggingface.co/deepseek-ai/DeepSeek-V3) @@ -8,11 +8,11 @@ Dense Models This repository contains scripts and documentation to launch PD Disaggregation using the Nixl framework for above models. You will find setup instructions, node assignment details and benchmarking commands. -## 📝 Prerequisites +## Prerequisites -- A Slurm cluster with required Nodes -> xP + yD + 1 (minimum size 3: xP=1 and xD=1) -- Docker container with VLLM, Nixl, etcd and NIC drivers built-in. Refer to Building the Docker image section below. -- Access to a shared filesystem for log collection( cluster specific) +- A Slurm cluster with required Nodes -> xP + yD (minimum size 2: xP=1 and yD=1) +- Docker container with VLLM, Nixl and NIC drivers built-in. Refer to Building the Docker image section below. 
+- Access to a shared filesystem for log collection (cluster specific) ## Building the Docker image @@ -25,26 +25,79 @@ docker build -t vllm_dissag_pd_image -f vllm_dissag_inference.ubuntu.amd.Docker ## Scripts and Benchmarking Run instructions - scripts/vllm_dissag/README.MD -Few files of significance: +Key files: -scripts/vllm_dissag/run_xPyD_models.slurm - slurm script to launch docker containers on all nodes using sbatch or salloc -scripts/vllm_dissag/vllm_disagg_server.sh - Script that runs inside each docker to start required proxy, prefill and decode services -scripts/vllm_dissag/benchmark_xPyD.sh - Benchmark script to run VLLM benchmarking tool for performance measurement +| File | Description | +|------|-------------| +| `run_xPyD_models.slurm` | Slurm script to launch docker containers on all nodes using sbatch | +| `vllm_disagg_server.sh` | Default PD server script (NixlConnector, no expert parallel) | +| `vllm_disagg_mori_ep.sh` | MoRI EP server script (MoRIIOConnector, expert parallel) | +| `vllm_disagg_server_deepep.sh` | DeepEP server script (NixlConnector, DeepEP all2all backends) | +| `benchmark_xPyD.sh` | Benchmark script using vLLM benchmarking tool | -## Sbatch run command (one-liner) -```bash +## Run Modes + +The `run_xPyD_models.slurm` script supports three run modes, controlled by the `RUN_MORI` and `RUN_DEEPEP` environment variables. At most one of these may be set to `1`. + +| Mode | Env Variable | Server Script | KV Connector | Models | +|------|-------------|---------------|--------------|--------| +| Default (NixlConnector) | Neither set | `vllm_disagg_server.sh` | NixlConnector | All VALID_MODELS | +| MoRI EP | `RUN_MORI=1` | `vllm_disagg_mori_ep.sh` | MoRIIOConnector | DeepSeek-R1 | +| DeepEP | `RUN_DEEPEP=1` | `vllm_disagg_server_deepep.sh` | NixlConnector | DeepSeek-V3, DeepSeek-V3-5layer, DeepSeek-R1 | + +Setting both `RUN_MORI=1` and `RUN_DEEPEP=1` will exit with an error. 
+ +## Sbatch Run Commands -# Clone the repo +### Default mode (NixlConnector) +```bash git clone https://github.com/ROCm/MAD.git -cd scripts/vllm_dissag +cd MAD/scripts/vllm_dissag -# Sbatch run command [run from the above folder] -export DOCKER_IMAGE_NAME= -export xP=; export yD=; export MODEL_NAME=Llama-3.1-8B-Instruct; sbatch -N -n --nodelist= run_xPyD_models.slurm +export DOCKER_IMAGE_NAME=<docker_image_name> +export xP=1; export yD=1; export MODEL_NAME=DeepSeek-V3 +sbatch -N 2 -n 2 --nodelist=<node_list> run_xPyD_models.slurm +``` -# num_nodes = xP + xD + 1 +### MoRI EP mode +```bash +export DOCKER_IMAGE_NAME=<docker_image_name> +export RUN_MORI=1 +export xP=1; export yD=1; export MODEL_NAME=DeepSeek-R1 +sbatch -N 2 -n 2 --nodelist=<node_list> run_xPyD_models.slurm ``` +### DeepEP mode +```bash +export DOCKER_IMAGE_NAME=<docker_image_name> +export RUN_DEEPEP=1 +export xP=1; export yD=1; export MODEL_NAME=DeepSeek-V3 +sbatch -N 2 -n 2 --nodelist=<node_list> run_xPyD_models.slurm +``` + +### DeepEP environment variables (optional) + +| Variable | Default | Description | +|----------|---------|-------------| +| `PREFILL_DEEPEP_BACKEND` | `deepep_high_throughput` | All2all backend for prefill nodes | +| `DECODE_DEEPEP_BACKEND` | `deepep_low_latency` | All2all backend for decode nodes | +| `ENABLE_DBO` | `false` | Enable Dual Batch Overlap (DBO) | +| `DBO_COMM_SMS` | (vLLM default) | DBO communication SMs override | +| `ENABLE_PROFILING` | `false` | Enable profiling | + +`num_nodes = xP + yD` (proxy co-located on prefill master node) + +## Node Topology (all modes) + +``` +Node 0 -> Prefill MASTER + Proxy (co-located) +Nodes 1..xP-1 -> Prefill CHILD (if xP > 1) +Node xP -> Decode MASTER +Nodes xP+1..end -> Decode CHILD (if yD > 1) +``` + +The proxy/router runs on the same node as the Prefill master (Node 0) to save one physical node. The proxy is CPU-only and listens on a separate port (default 18001) from the vLLM server (default 2584).
+ ## Proxy Server Options The scripts support two proxy server types via the `PROXY_TYPE` environment variable: @@ -60,7 +113,7 @@ export PROXY_TYPE=toy_proxy # Then run sbatch/srun as usual ``` -## Benchmark parser ( for CONCURRENCY logs) to tabulate different data +## Benchmark parser (for CONCURRENCY logs) to tabulate different data ```bash python3 benchmark_parser.py /dev/null echo "PST Time: $(TZ=America/Los_Angeles date '+%Y-%m-%d %H:%M:%S %Z')" | tee -a ${LOG}_CONCURRENCY.log >/dev/null - + +sleep 10 +echo "Warmup run:" | tee -a ${LOG}_CONCURRENCY.log >/dev/null +vllm bench serve \ + --model $MODEL_PATH \ + --backend vllm \ + --host 127.0.0.1 \ + --port $BENCHMARK_PORT \ + --dataset-name "random" \ + --random-input-len 1024 \ + --random-output-len 1024 \ + --random-prefix-len 0 \ + --num-prompts 16 \ + --request-rate "inf" \ + --ignore-eos \ + --max-concurrency 16 \ + 2>&1 | tee -a ${LOG}_CONCURRENCY.log >/dev/null echo "" -CON="8 16 32 64 128 256 512" -COMBINATIONS=("1024/1024" "8192/1024" "1024/8192") +CON="${BENCHMARK_CON:-8 16 32 64 128 256 512}" +IFS=' ' read -ra COMBINATIONS <<< "${BENCHMARK_COMBINATIONS:-1024/1024 8192/1024 1024/8192}" -for i in {1..1}; do +echo "Benchmarking iterations: $BENCHMARK_ITR" | tee -a ${LOG}_CONCURRENCY.log >/dev/null +for i in $(seq 1 $BENCHMARK_ITR); do echo "Running the benchserving script for iter: $i" for combo in "${COMBINATIONS[@]}"; do IFS="/" read -r isl osl <<< "$combo" diff --git a/scripts/vllm_dissag/run_xPyD_models.slurm b/scripts/vllm_dissag/run_xPyD_models.slurm index 26f02ca..ee1b215 100755 --- a/scripts/vllm_dissag/run_xPyD_models.slurm +++ b/scripts/vllm_dissag/run_xPyD_models.slurm @@ -1,7 +1,7 @@ #!/bin/bash #SBATCH --job-name=vllm-pd # Specify a custom string for your slurm batch job -#SBATCH -N 3 # Request N+1 nodes -#SBATCH -n 3 # Request N+1 total tasks +#SBATCH -N 2 # Request xP + yD nodes (proxy co-located on prefill master) +#SBATCH -n 2 # Request xP + yD total tasks #SBATCH --ntasks-per-node=1 
#SBATCH --spread-job #SBATCH --gres=gpu:8 # Request 8 GPUs and 8 NICs (use --gres if specific GPU resources are needed) @@ -24,44 +24,24 @@ VALID_MODELS=( \ "Llama-3.1-405B-Instruct-FP8-KV" \ "amd-Llama-3.3-70B-Instruct-FP8-KV" \ "DeepSeek-V3" \ + "DeepSeek-V3-5layer" \ "gpt-oss-120b" \ + "DeepSeek-R1" \ ) - -# Each model has an associated run file - Set it here -declare -A MODEL_RUNFILES=( - ["Llama-3.1-405B-Instruct-FP8-KV"]="vllm_disagg_server.sh" - ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="vllm_disagg_server.sh" - ["DeepSeek-V3"]="vllm_disagg_server.sh" - ["gpt-oss-120b"]="vllm_disagg_server.sh" +# Models allowed for vllm_disagg_mori_ep.sh when RUN_MORI=1 +MORI_EP_VALID_MODELS=( \ + "DeepSeek-R1" \ ) -# Check if MODEL_NAME exists and fetch runfile -if [[ -n "${MODEL_RUNFILES[$MODEL_NAME]}" ]]; then - RUN_FILE="${MODEL_RUNFILES[$MODEL_NAME]}" - echo "Model found: $MODEL_NAME" - echo "Runfile set: $RUN_FILE" -else - echo "Error: Model '$MODEL_NAME' not found in MODEL_RUNFILES" - echo "Available models: ${!MODEL_RUNFILES[@]}" - exit 1 -fi - -export DOCKER_IMAGE_NAME="${DOCKER_IMAGE_NAME:-rocm/vllm:v0.14.0_amd_dev}" -if test -z "${DOCKER_IMAGE_NAME}"; then - echo "Error: DOCKER_IMAGE_NAME is not set or empty." 
- exit 1 -fi - -# Set current directory to be REPO directory with all relevant scripts -NIXL_REPO_DIR=$(pwd) -LOG_PATH="${LOG_PATH:-/shared_inference/${USER}/model_blog_logs}" - -xP="${xP:-1}" #-> Number of Prefill Servers -yD="${yD:-1}" #-> Number of Decode Servers +# Models allowed for vllm_disagg_server_deepep.sh when RUN_DEEPEP=1 +DEEPEP_VALID_MODELS=( \ + "DeepSeek-V3" \ + "DeepSeek-V3-5layer" \ + "DeepSeek-R1" \ +) MODEL_NAME="${MODEL_NAME:-None}" -MODEL_DIR="${MODEL_DIR:-"/shared_inference/models_blog/"}" validate_model_name() { local is_valid_model=false @@ -87,6 +67,77 @@ validate_model_name() { validate_model_name "${MODEL_NAME}" +model_allows_mori_ep() { + local m="$1" + for x in "${MORI_EP_VALID_MODELS[@]}"; do + [[ "$m" == "$x" ]] && return 0 + done + return 1 +} + +model_allows_deepep() { + local m="$1" + for x in "${DEEPEP_VALID_MODELS[@]}"; do + [[ "$m" == "$x" ]] && return 0 + done + return 1 +} + +# --------------------------------------------------------------------------- +# Run-mode selection: at most one of RUN_MORI / RUN_DEEPEP may be "1" (neither set = default NixlConnector mode). +# --------------------------------------------------------------------------- +_run_mori="${RUN_MORI:-0}" +_run_deepep="${RUN_DEEPEP:-0}" + +if [[ "$_run_mori" == "1" && "$_run_deepep" == "1" ]]; then + echo "Error: Both RUN_MORI and RUN_DEEPEP are set to 1. Set only one."
>&2 + exit 1 +fi + +if [[ "$_run_mori" == "1" ]]; then + if model_allows_mori_ep "$MODEL_NAME"; then + RUN_FILE="vllm_disagg_mori_ep.sh" + echo "RUN_MORI=1: using $RUN_FILE for model '$MODEL_NAME'" + else + echo "Error: RUN_MORI=1 but MODEL_NAME '$MODEL_NAME' is not in MORI_EP_VALID_MODELS" + printf "MoRI EP allowed models:\n" + for m in "${MORI_EP_VALID_MODELS[@]}"; do + printf " - %s\n" "$m" + done + exit 1 + fi +elif [[ "$_run_deepep" == "1" ]]; then + if model_allows_deepep "$MODEL_NAME"; then + RUN_FILE="vllm_disagg_server_deepep.sh" + echo "RUN_DEEPEP=1: using $RUN_FILE for model '$MODEL_NAME'" + else + echo "Error: RUN_DEEPEP=1 but MODEL_NAME '$MODEL_NAME' is not in DEEPEP_VALID_MODELS" + printf "DeepEP allowed models:\n" + for m in "${DEEPEP_VALID_MODELS[@]}"; do + printf " - %s\n" "$m" + done + exit 1 + fi +else + RUN_FILE="vllm_disagg_server.sh" + echo "RUN_MORI/RUN_DEEPEP not set: using $RUN_FILE" +fi + +if [[ -z "${DOCKER_IMAGE_NAME:-}" ]]; then + echo "Error: DOCKER_IMAGE_NAME is not set. Please export DOCKER_IMAGE_NAME before running." 
+ exit 1 +fi +export DOCKER_IMAGE_NAME + +# Set current directory to be REPO directory with all relevant scripts +NIXL_REPO_DIR=$(pwd) +LOG_PATH="${LOG_PATH:-/shared_inference/${USER}/model_blog_logs}" + +xP="${xP:-1}" #-> Number of Prefill Servers +yD="${yD:-1}" #-> Number of Decode Servers + +MODEL_DIR="${MODEL_DIR:-"/shared_inference/models_blog/"}" + # ------------------------ # Model path validation and selection across all nodes @@ -163,9 +214,18 @@ echo "" # Calculate NUM_NODES based on xP and yD -NUM_NODES=$((xP + yD + 1)) -echo "Calculated NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD + 1)" -echo "Calculated NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD + 1)" +NUM_NODES=$((xP + yD)) +echo "Calculated NUM_NODES: $NUM_NODES (xP=$xP + yD=$yD, proxy co-located on prefill master)" + +# DeepEP configuration (only exported when RUN_DEEPEP=1) +if [[ "$_run_deepep" == "1" ]]; then + export PREFILL_DEEPEP_BACKEND="${PREFILL_DEEPEP_BACKEND:-deepep_high_throughput}" + export DECODE_DEEPEP_BACKEND="${DECODE_DEEPEP_BACKEND:-deepep_low_latency}" + export ENABLE_DBO="${ENABLE_DBO:-false}" + export DBO_COMM_SMS="${DBO_COMM_SMS:-}" + export ENABLE_PROFILING="${ENABLE_PROFILING:-false}" + echo "DeepEP config: PREFILL_BACKEND=$PREFILL_DEEPEP_BACKEND DECODE_BACKEND=$DECODE_DEEPEP_BACKEND DBO=$ENABLE_DBO" +fi # ------------------------ # Extract first NUM_NODES from SLURM allocation and update SLURM variables @@ -234,7 +294,7 @@ echo "Selected nodes for execution:" echo "$SELECTED_NODES" echo "" -# MAke sure xP*8 and yD*8 are multiple of by 129280 for Deepseek V3 +# NOTE(review): presumably 129280 (DeepSeek V3 vocab size) must be divisible by xP*8 and yD*8 - confirm # ------------------------ # SLURM Environment Variables @@ -267,6 +327,9 @@ echo "Selected node IPs: ${IPS[*]}" | sed 's/ /,/g' NIXL_COOKBOOK_PATH="/opt/nixl-vllm-cookbook" BENCHMARK_FILE="${BENCHMARK_FILE:-$NIXL_COOKBOOK_PATH/dissag_blog_p1/models_blog/run_xPyD_models.slurm}" +BENCHMARK_ITR="${BENCHMARK_ITR:-1}" +BENCHMARK_CON="${BENCHMARK_CON:-}"
+BENCHMARK_COMBINATIONS="${BENCHMARK_COMBINATIONS:-}" timestamp=$(date +"%Y-%m-%d_%H-%M-%S") NNODES=$NUM_NODES @@ -275,7 +338,7 @@ echo "MASTER_NODE is ${MASTER_NODE}" echo "MASTER_ADDR is ${MASTER_ADDR}" echo "MASTER_PORT is ${MASTER_PORT}" echo "NNODES is ${NNODES}" -echo "REPO Directory is ${MOONCAKE_REPO_DIR}" +echo "REPO Directory is ${NIXL_REPO_DIR}" if [ ! -d "$LOG_PATH" ]; then mkdir -p "$LOG_PATH" @@ -296,6 +359,9 @@ export yD=$yD export MODEL_NAME=$MODEL_NAME export USER_NAME=$USER_NAME export IPADDRS="$(echo "${IPS[*]}" | sed 's/ /,/g')" +export BENCHMARK_ITR=$BENCHMARK_ITR +export BENCHMARK_CON="${BENCHMARK_CON}" +export BENCHMARK_COMBINATIONS="${BENCHMARK_COMBINATIONS}" export DOCKER_CONT_NAME="container_${MODEL_NAME}_${SLURM_JOB_ID}" export RUN_FILE_FULL="$NIXL_COOKBOOK_PATH/${RUN_FILE}" @@ -306,6 +372,26 @@ SELECTED_NODELIST_SRUN=$(echo "$SELECTED_NODES" | paste -sd,) srun --nodelist="$SELECTED_NODELIST_SRUN" bash -c ' echo "Rank $SLURM_PROCID on $(hostname)"; docker ps -q | xargs --no-run-if-empty docker stop; +fuser -k 15000/tcp 2>/dev/null || true; +sleep 2; +docker pull $DOCKER_IMAGE_NAME 2>/dev/null || true; + +# --- Build host RDMA library mounts --- +# Mount the host MLNX OFED userspace libraries into the container so that +# libmlx5 / libibverbs always match the host kernel module, preventing +# mlx5dv_devx_alloc_uar failures from ABI mismatches. 
+_RDMA_MOUNTS="" +_LIBDIR=/usr/lib/x86_64-linux-gnu +for _lib in libmlx5.so libmlx5.so.1 libibverbs.so libibverbs.so.1 librdmacm.so librdmacm.so.1; do + [ -e "$_LIBDIR/$_lib" ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v $_LIBDIR/$_lib:$_LIBDIR/$_lib:ro" +done +for _vlib in $_LIBDIR/libmlx5.so.1.* $_LIBDIR/libibverbs.so.1.* $_LIBDIR/librdmacm.so.1.*; do + [ -e "$_vlib" ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v $_vlib:$_vlib:ro" +done +[ -d "$_LIBDIR/libibverbs" ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v $_LIBDIR/libibverbs:$_LIBDIR/libibverbs:ro" +[ -d /etc/libibverbs.d ] && _RDMA_MOUNTS="$_RDMA_MOUNTS -v /etc/libibverbs.d:/etc/libibverbs.d:ro" +echo "[host-rdma] mounts: $_RDMA_MOUNTS" + docker run --rm \ --device /dev/dri \ --device /dev/kfd \ @@ -321,9 +407,11 @@ docker run --rm \ -v /mnt/m2m_nobackup:/mnt/m2m_nobackup \ -v $HOME/.ssh:/root/.ssh \ --shm-size 64G \ - --ulimit nofile=131072:131072 \ + --ulimit nofile=524288:524288 \ -v ${LOG_PATH}:/run_logs \ -v $NIXL_REPO_DIR:$NIXL_COOKBOOK_PATH \ + $_RDMA_MOUNTS \ + --entrypoint /bin/bash \ -e SLURM_JOB_ID=$SLURM_JOB_ID \ -e SLURM_JOB_NODELIST=$SLURM_JOB_NODELIST \ -e NNODES=$NNODES \ @@ -337,12 +425,20 @@ docker run --rm \ -e USER_NAME=$USER_NAME \ -e MODEL_NAME=$MODEL_NAME \ -e BENCHMARK_FILE=$BENCHMARK_FILE \ + -e BENCHMARK_ITR=$BENCHMARK_ITR \ + -e BENCHMARK_CON="${BENCHMARK_CON}" \ + -e BENCHMARK_COMBINATIONS="${BENCHMARK_COMBINATIONS}" \ -e IPADDRS=$IPADDRS \ -e PROXY_TYPE=${PROXY_TYPE:-vllm_router} \ - -e ROUTER_PORT=${ROUTER_PORT:-2584} \ - -e BENCHMARK_PORT=${BENCHMARK_PORT:-2584} \ + -e ROUTER_PORT=${ROUTER_PORT:-18001} \ + -e BENCHMARK_PORT=${BENCHMARK_PORT:-18001} \ + -e PREFILL_DEEPEP_BACKEND=${PREFILL_DEEPEP_BACKEND:-} \ + -e DECODE_DEEPEP_BACKEND=${DECODE_DEEPEP_BACKEND:-} \ + -e ENABLE_DBO=${ENABLE_DBO:-} \ + -e DBO_COMM_SMS=${DBO_COMM_SMS:-} \ + -e ENABLE_PROFILING=${ENABLE_PROFILING:-} \ --name $DOCKER_CONT_NAME \ - $DOCKER_IMAGE_NAME bash -c " + $DOCKER_IMAGE_NAME -c " mkdir -p /run_logs/${SLURM_JOB_ID} 
$RUN_FILE_FULL 2>&1 | tee /run_logs/${SLURM_JOB_ID}/pd_vllm_bench_NODE${SLURM_PROCID}.log " diff --git a/scripts/vllm_dissag/start_etcd.sh b/scripts/vllm_dissag/start_etcd.sh deleted file mode 100755 index 7eed9aa..0000000 --- a/scripts/vllm_dissag/start_etcd.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -set -x -# Define the full list of cluster IPs -IPADDRS="${IPADDRS:-localhost}" - -# Automatically detect this host's IP (assuming it's the IP on the correct network) -host_ip=$(hostname -I | awk '{print $1}') - -# Convert comma-separated IP list into an array -IFS=',' read -ra ADDR <<< "$IPADDRS" - -# Determine node name based on position in list -index=0 -for ip in "${ADDR[@]}"; do - if [[ "$ip" == "$host_ip" ]]; then - break - fi - index=$((index + 1)) -done -node_name="etcd-$((index+1))" - -# Build initial cluster string -initial_cluster="" -for i in "${!ADDR[@]}"; do - peer_name="etcd-$((i+1))" - initial_cluster+="$peer_name=http://${ADDR[i]}:2380" - if [[ $i -lt $((${#ADDR[@]} - 1)) ]]; then - initial_cluster+="," - fi -done - -# Prepare etcd data directory -mkdir -p /var/lib/etcd - -rm -rf /var/lib/etcd/* - -# Run etcd with full config -/usr/local/bin/etcd//etcd \ - --name "$node_name" \ - --data-dir /var/lib/etcd \ - --initial-advertise-peer-urls http://$host_ip:2380 \ - --listen-peer-urls http://0.0.0.0:2380 \ - --listen-client-urls http://0.0.0.0:2379 \ - --advertise-client-urls http://$host_ip:2379 \ - --initial-cluster-token etcd-cluster-1 \ - --initial-cluster "$initial_cluster" \ - --initial-cluster-state new \ - 2>&1 | tee /run_logs/${SLURM_JOB_ID}/etcd_NODE${NODE_RANK}.log diff --git a/scripts/vllm_dissag/vllm_disagg_mori_ep.sh b/scripts/vllm_dissag/vllm_disagg_mori_ep.sh new file mode 100755 index 0000000..45024ac --- /dev/null +++ b/scripts/vllm_dissag/vllm_disagg_mori_ep.sh @@ -0,0 +1,297 @@ +#!/bin/bash +# VLLM Disaggregated Server Launcher - MoRI EP Configuration +# ============================================================================= 
+ +# ============================================================================= +# Environment Configuration +# ============================================================================= + +MASTER_ADDR="${MASTER_ADDR:-localhost}" +MASTER_PORT="${MASTER_PORT:-23731}" +NODE_RANK="${NODE_RANK:-0}" +NNODES="${NNODES:-1}" +MODEL_PATH=$MODEL_PATH +MODEL_NAME="${MODEL_NAME:-}" +xP="${xP:-1}" +yD="${yD:-1}" +if [ "$xP" -gt 1 ] || [ "$yD" -gt 1 ]; then + echo "Error: xP > 1 or yD > 1 is not supported yet due to MoRI IO connector issues." >&2 + exit 1 +fi +IPADDRS="${IPADDRS:-localhost}" +IFS=',' read -ra IP_ARRAY <<< "${IPADDRS}" + +echo "Listing NIXL_COOKBOOK_PATH : " +ls ${NIXL_COOKBOOK_PATH} + +# ============================================================================= +# Port Configuration +# ============================================================================= + +RPC_PORT=13345 +SERVE_PORT=20005 +KV_PORT=9711 +PROXY_PORT=10001 +PROXY_PING_PORT=36367 +LOCAL_PING_PORT=61555 +HANDSHAKE_PORT=8405 +NOTIFY_PORT=61005 + +# ============================================================================= +# Node-Specific Configuration +# ============================================================================= + +PREFILL_DP_SIZE=$((xP * 8)) +DECODE_DP_SIZE=$((yD * 8)) +DP_PARALLEL_SIZE_LOCAL=8 +PREFILL_DP_START_RANK=$(( NODE_RANK * 8 )) +PREFILL_MASTER_ADDR=$(echo "$IPADDRS" | awk -F',' '{print $1}') +DECODE_DP_START_RANK=$(( (NODE_RANK - xP) * 8 )) +DECODE_MASTER_ADDR=$(echo "$IPADDRS" | awk -F',' -v pos="$xP" '{print $(pos+1)}') + +echo "-----------------------------Printing node specific details ----------------------" +echo "IPADDRS = ${IPADDRS}" +echo "MASTER_ADDR=${MASTER_ADDR}" +echo "HOST_IP=$(hostname -I)" +echo "PREFILL_DP_SIZE=${PREFILL_DP_SIZE}" +echo "DECODE_DP_SIZE=${DECODE_DP_SIZE}" +echo "PREFILL_DP_START_RANK=${PREFILL_DP_START_RANK}" +echo "PREFILL_MASTER_ADDR=${PREFILL_MASTER_ADDR}" +echo "DECODE_DP_START_RANK=${DECODE_DP_START_RANK}" +echo 
"DECODE_MASTER_ADDR=${DECODE_MASTER_ADDR}" +host_ip=$(hostname -I | awk '{print $1}') +host_name=$(hostname) + +echo "Listing NIXL_COOKBOOK_PATH : " +ls ${NIXL_COOKBOOK_PATH} + +# ============================================================================= +# Helper Functions +# ============================================================================= + +setup_mori_env() { + export VLLM_ROCM_USE_AITER=1 + export VLLM_ROCM_USE_AITER_MOE=1 + export VLLM_LOGGING_LEVEL=INFO + export VLLM_USE_V1=1 + export VLLM_ROCM_USE_AITER_MLA=1 + export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0 + export VLLM_ALL2ALL_BACKEND=mori + export GLOO_SOCKET_IFNAME=eth0 + export VLLM_ENGINE_READY_TIMEOUT_S=3600 + export VLLM_RINGBUFFER_WARNING_INTERVAL=3600 + export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=3600 +} + +build_kv_transfer_config() { + local kv_role="$1" + echo '{"kv_connector":"MoRIIOConnector","kv_role":"'"${kv_role}"'","kv_port":"'"${KV_PORT}"'","kv_connector_extra_config":{"proxy_ip":"'"${MASTER_ADDR}"'","proxy_port":"'"${PROXY_PORT}"'","proxy_ping_port":"'"${PROXY_PING_PORT}"'","http_port":"'"${SERVE_PORT}"'","local_ping_port":"'"${LOCAL_PING_PORT}"'","handshake_port":"'"${HANDSHAKE_PORT}"'","notify_port":"'"${NOTIFY_PORT}"'"}}' +} + +# Launch a vllm serve worker and set WORKER_PID to its PID. 
+#   $1 = dp_size        data-parallel size
+#   $2 = dp_addr        data-parallel master address
+#   $3 = kv_role        "kv_producer" or "kv_consumer"
+#   $4 = log_prefix     "prefill" or "decode"
+#   $5 = role           "master" or "child"
+#   $6 = dp_start_rank  (required for child nodes)
+launch_vllm_worker() {
+    local dp_size="$1"
+    local dp_addr="$2"
+    local kv_role="$3"
+    local log_prefix="$4"
+    local role="$5"
+    local dp_start_rank="${6:-}"
+
+    setup_mori_env
+
+    local extra_args=()
+    if [[ "$role" == "master" ]]; then
+        extra_args+=(--api-server-count=8)
+    else
+        extra_args+=(--data-parallel-start-rank "${dp_start_rank}" --headless)
+    fi
+
+    local kv_config
+    kv_config=$(build_kv_transfer_config "${kv_role}")
+    # Log via plain redirection, not "| tee": after a backgrounded pipeline $! is the PID of tee, not of vllm serve.
+    vllm serve "${MODEL_PATH}" \
+        -tp 1 \
+        --data-parallel-size "${dp_size}" \
+        --data-parallel-size-local ${DP_PARALLEL_SIZE_LOCAL} \
+        --data-parallel-address "${dp_addr}" \
+        --data-parallel-rpc-port ${RPC_PORT} \
+        --enable-expert-parallel \
+        --port ${SERVE_PORT} \
+        --gpu-memory-utilization 0.8 \
+        --kv-cache-dtype fp8 \
+        --block-size 1 \
+        --no-enable-prefix-caching \
+        --all2all-backend mori \
+        --trust-remote-code \
+        --enforce-eager \
+        "${extra_args[@]}" \
+        --kv-transfer-config "${kv_config}" \
+        > /run_logs/${SLURM_JOB_ID}/${log_prefix}_NODE${NODE_RANK}.log 2>&1 &
+    # WORKER_PID (global, read by the caller) is now the vllm serve process itself, so a later kill reaches the server.
+    WORKER_PID=$!
+}
+
+wait_for_proxy_and_cleanup() {
+    local worker_pid="$1"
+    local label="$2"
+
+    echo "Waiting for proxy server to be up..."
+    python $NIXL_COOKBOOK_PATH/socket_barrier.py \
+        --node-ips ${MASTER_ADDR} \
+        --node-ports $PROXY_PORT
+
+    echo "Waiting until proxy server closes..."
+ python $NIXL_COOKBOOK_PATH/socket_wait.py \ + --remote-ip ${MASTER_ADDR} \ + --remote-port $PROXY_PORT + + echo "Killing the ${label} server" + kill "$worker_pid" 2>/dev/null || true +} + +print_node_info() { + local role_desc="$1" + echo "========= NODE INFO ====================" + echo "Node list : ${SLURM_JOB_NODELIST}" + echo "Node IPs : ${IPADDRS}" + echo "Model : ${MODEL_NAME}" + echo "${host_name}:${host_ip} is ${role_desc}." +} + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +for _pid in $(ss -tlnp sport = 2222 2>/dev/null | grep -oP "pid=\K\d+"); do + kill -9 "$_pid" 2>/dev/null +done +sleep 2 + +echo "Waiting at the container creation barrier on $host_name" +python $NIXL_COOKBOOK_PATH/socket_barrier.py \ + --local-ip ${host_ip} \ + --local-port 2222 \ + --enable-port \ + --node-ips ${IPADDRS} \ + --node-ports 2222 + +# ============================================================================= +# Node Role Assignment and Server Launch +# ============================================================================= + +if [ "$NODE_RANK" -eq 0 ]; then + # ================================================================= + # Rank 0: Prefill master + Proxy (co-located) + # ================================================================= + print_node_info "Prefill master + Proxy node (co-located)" + echo "PREFILL_DP_SIZE=${PREFILL_DP_SIZE}" + echo "PREFILL_DP_START_RANK=${PREFILL_DP_START_RANK}" + echo "PREFILL_MASTER_ADDR=${PREFILL_MASTER_ADDR}" + echo "DP_PARALLEL_SIZE_LOCAL=${DP_PARALLEL_SIZE_LOCAL}" + echo "vLLM serve port: ${SERVE_PORT} Proxy port: ${PROXY_PORT}" + + launch_vllm_worker "${PREFILL_DP_SIZE}" "${PREFILL_MASTER_ADDR}" "kv_producer" "prefill" "master" + local_worker_pid=$WORKER_PID + + echo "Waiting for prefill & decode servers to be ready..." 
+ sleep 20 + + TIMEOUT_SECONDS=4000 + SLEEP_SECONDS=10 + SEARCH_SIGNAL="Application startup complete." + + PREFILL_LOG=/run_logs/${SLURM_JOB_ID}/prefill_NODE0.log + DECODE_LOG=/run_logs/${SLURM_JOB_ID}/decode_NODE${xP}.log + + wait_log_signal_or_fail() { + local LOG_FILE="$1" + local LABEL="$2" + local ELAPSED=0 + until grep -q "${SEARCH_SIGNAL}" "${LOG_FILE}" 2>/dev/null; do + if [ "${ELAPSED}" -ge "${TIMEOUT_SECONDS}" ]; then + echo "Timeout (${TIMEOUT_SECONDS}s): '${SEARCH_SIGNAL}' not found in ${LABEL}: ${LOG_FILE}" \ + | tee -a /run_logs/${SLURM_JOB_ID}/proxy_NODE${NODE_RANK}.log + exit 1 + fi + sleep "${SLEEP_SECONDS}" + ELAPSED=$((ELAPSED + SLEEP_SECONDS)) + done + echo "Ready: ${LABEL} (${LOG_FILE})" + } + + wait_log_signal_or_fail "${PREFILL_LOG}" "prefill master" + wait_log_signal_or_fail "${DECODE_LOG}" "decode master" + + sleep 10 + python /app/vllm/examples/online_serving/disaggregated_serving/moriio_toy_proxy_server.py \ + 2>&1 | tee -a /run_logs/${SLURM_JOB_ID}/proxy_NODE${NODE_RANK}.log >/dev/null & + + proxy_pid=$! + + echo "Proxy server ready for benchmarking on ${host_name}:${host_ip}:${PROXY_PORT}" + sleep 20 + curl -X POST http://127.0.0.1:${PROXY_PORT}/v1/completions -H "Content-Type: application/json" -d '{ + "prompt": "Who is AMD CEO?", + "temperature": 0, + "max_tokens" : 10, + "top_k": 1 + }' + + sleep 20 + export BENCHMARK_PORT=${PROXY_PORT} + bash $NIXL_COOKBOOK_PATH/benchmark_xPyD.sh + + echo "Killing the proxy server.." + kill $proxy_pid + echo "Killing the prefill master server.." 
+ kill $local_worker_pid 2>/dev/null || true + +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then + # ================================================================= + # Prefill child (only active when xP > 1) + # ================================================================= + print_node_info "Prefill child node" + echo "PREFILL_DP_SIZE=${PREFILL_DP_SIZE}" + echo "PREFILL_DP_START_RANK=${PREFILL_DP_START_RANK}" + echo "PREFILL_MASTER_ADDR=${PREFILL_MASTER_ADDR}" + echo "DP_PARALLEL_SIZE_LOCAL=${DP_PARALLEL_SIZE_LOCAL}" + + launch_vllm_worker "${PREFILL_DP_SIZE}" "${PREFILL_MASTER_ADDR}" "kv_producer" "prefill" "child" "${PREFILL_DP_START_RANK}" + wait_for_proxy_and_cleanup $WORKER_PID "prefill child" + +elif [ "$NODE_RANK" -eq "$xP" ]; then + # ================================================================= + # Decode master + # ================================================================= + print_node_info "Decode master node" + echo "DECODE_DP_SIZE=${DECODE_DP_SIZE}" + echo "DECODE_DP_START_RANK=${DECODE_DP_START_RANK}" + echo "DECODE_MASTER_ADDR=${DECODE_MASTER_ADDR}" + echo "DP_PARALLEL_SIZE_LOCAL=${DP_PARALLEL_SIZE_LOCAL}" + + launch_vllm_worker "${DECODE_DP_SIZE}" "${DECODE_MASTER_ADDR}" "kv_consumer" "decode" "master" + wait_for_proxy_and_cleanup $WORKER_PID "decode master" + +else + # ================================================================= + # Decode child (rank > xP) + # ================================================================= + print_node_info "Decode child node" + echo "DECODE_DP_SIZE=${DECODE_DP_SIZE}" + echo "DECODE_DP_START_RANK=${DECODE_DP_START_RANK}" + echo "DECODE_MASTER_ADDR=${DECODE_MASTER_ADDR}" + echo "DP_PARALLEL_SIZE_LOCAL=${DP_PARALLEL_SIZE_LOCAL}" + + launch_vllm_worker "${DECODE_DP_SIZE}" "${DECODE_MASTER_ADDR}" "kv_consumer" "decode" "child" "${DECODE_DP_START_RANK}" + wait_for_proxy_and_cleanup $WORKER_PID "decode child" + +fi + +echo "Script completed successfully." 
+exit 0 diff --git a/scripts/vllm_dissag/vllm_disagg_server.sh b/scripts/vllm_dissag/vllm_disagg_server.sh index 82bc773..912a4f2 100755 --- a/scripts/vllm_dissag/vllm_disagg_server.sh +++ b/scripts/vllm_dissag/vllm_disagg_server.sh @@ -17,7 +17,7 @@ IPADDRS="${IPADDRS:-localhost}" # Proxy configuration: "vllm_router" (default) or "toy_proxy" PROXY_TYPE="${PROXY_TYPE:-vllm_router}" -ROUTER_PORT="${ROUTER_PORT:-2584}" +ROUTER_PORT="${ROUTER_PORT:-18001}" if [[ "$PROXY_TYPE" != "vllm_router" && "$PROXY_TYPE" != "toy_proxy" ]]; then echo "Error: Invalid PROXY_TYPE='$PROXY_TYPE'. Must be 'vllm_router' or 'toy_proxy'." >&2 @@ -40,20 +40,20 @@ host_ip=$(hostname -I | awk '{print $1}') host_name=$(hostname) SERVER_PORT=2584 -if [ "$PROXY_TYPE" == "vllm_router" ]; then - PROXY_PORT=$ROUTER_PORT -else - PROXY_PORT=$SERVER_PORT -fi +PROXY_PORT=$ROUTER_PORT if [[ -z "$UCX_NET_DEVICES" ]]; then - echo "Error: UCX_NET_DEVICES is empty" >&2 - exit 1 + UCX_NET_DEVICES=$(ibstat 2>/dev/null | awk ' + /^CA /{gsub(/\047/,"",$2); ca=$2} + /Rate:/{if($2+0 >= 200) devs=devs (devs?",":"") ca":1"} + END{print devs}') + export UCX_NET_DEVICES="${UCX_NET_DEVICES:-mlx5_0:1}" + echo "Auto-detected UCX_NET_DEVICES=${UCX_NET_DEVICES}" fi if [[ -z "$NCCL_SOCKET_IFNAME" ]]; then - echo "Error: NCCL_SOCKET_IFNAME is empty" >&2 - exit 1 + export NCCL_SOCKET_IFNAME="eth0" + echo "Defaulting NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME}" fi # ============================================================================= @@ -64,6 +64,7 @@ declare -A MODEL_PREFILL_CONFIGS=( ["Llama-3.1-405B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --kv-cache-dtype fp8" ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" ["DeepSeek-V3"]="--tensor-parallel-size 8 --compilation-config '{\"full_cuda_graph\": false, \"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["DeepSeek-R1"]="--tensor-parallel-size 8 --compilation-config 
'{\"full_cuda_graph\": false, \"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" # same arch as V3 ["gpt-oss-120b"]="--tensor-parallel-size 8" ) @@ -71,6 +72,7 @@ declare -A MODEL_DECODE_CONFIGS=( ["Llama-3.1-405B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --kv-cache-dtype fp8" ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="--tensor-parallel-size 8 --max-model-len 65536 --kv-cache-dtype fp8" ["DeepSeek-V3"]="--tensor-parallel-size 8 --compilation-config '{\"full_cuda_graph\": false, \"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" + ["DeepSeek-R1"]="--tensor-parallel-size 8 --compilation-config '{\"full_cuda_graph\": false, \"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1" # same arch as V3 ["gpt-oss-120b"]="--tensor-parallel-size 8" ) @@ -78,6 +80,7 @@ declare -A MODEL_ENVS=( ["amd-Llama-3.3-70B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1 " ["Llama-3.1-405B-Instruct-FP8-KV"]="VLLM_USE_V1=1 VLLM_V1_USE_PREFILL_DECODE_ATTENTION=1 AMDGCN_USE_BUFFER_OPS=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_ROPE=1 TRITON_HIP_ASYNC_COPY_BYPASS_PERMUTE=1 TRITON_HIP_USE_ASYNC_COPY=1 TRITON_HIP_USE_BLOCK_PINGPONG=1 TRITON_HIP_ASYNC_FAST_SWIZZLE=1 " ["DeepSeek-V3"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 " + ["DeepSeek-R1"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 " # same arch as V3 ["gpt-oss-120b"]="VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 
ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0 " ) @@ -130,6 +133,11 @@ fi # Container Synchronization # ============================================================================= +for _pid in $(ss -tlnp sport = 5000 2>/dev/null | grep -oP "pid=\K\d+"); do + kill -9 "$_pid" 2>/dev/null +done +sleep 2 + echo "Waiting at the container creation barrier on $host_name" python $NIXL_COOKBOOK_PATH/socket_barrier.py \ --local-ip ${host_ip} \ @@ -138,40 +146,6 @@ python $NIXL_COOKBOOK_PATH/socket_barrier.py \ --node-ips ${IPADDRS} \ --node-ports 5000 -# ============================================================================= -# ETCD Server Setup -# ============================================================================= - -echo "Proceeding to start etcd server on $host_name" - -${NIXL_COOKBOOK_PATH}/start_etcd.sh > /dev/null & -etcd_pid=$! - -echo "Waiting at etcd server barrier on $host_name" -python $NIXL_COOKBOOK_PATH/socket_barrier.py \ - --node-ips ${IPADDRS} \ - --node-ports 2379 - -echo "All etcd servers are up : $host_name" -sleep 3 - -echo "etcd endpoint health==================" -/usr/local/bin/etcd//etcdctl endpoint health -echo "======================================" - -echo "etcd member list======================" -/usr/local/bin/etcd//etcdctl member list -echo "======================================" - -echo "etcd status======================" -/usr/local/bin/etcd//etcdctl endpoint status --write-out=table -echo "======================================" - - -echo "Waiting at etcd server barrier on $host_name" -python $NIXL_COOKBOOK_PATH/socket_barrier.py --node-ips ${IPADDRS} --node-ports 2379 -# END SECTION=========================================================================== - # ============================================================================= # Cluster Topology Configuration # ============================================================================= @@ -182,14 +156,14 @@ DECODE_ARGS="" PREFILL_PORTS="" DECODE_PORTS="" -# Loop through 
for `--prefill` (IPs from index 0 to N-1) -for ((i=1; i<=$xP && i<${#IP_ARRAY[@]}; i++)); do +# Prefill IPs: index 0 .. xP-1 (rank 0 is prefill master + proxy) +for ((i=0; i&1 | tee /run_logs/${SLURM_JOB_ID}/prefill_NODE${NODE_RANK}.log >/dev/null & + + prefill_pid=$! + echo "Waiting for all prefill and decode servers to be up . . ." python $NIXL_COOKBOOK_PATH/socket_barrier.py \ - --node-ips ${PD_IPADDRS} \ + --node-ips ${IPADDRS} \ --node-ports $SERVER_PORT if [ "$PROXY_TYPE" == "vllm_router" ]; then - echo "Starting vLLM Router (Production Proxy)..." + echo "Starting vLLM Router (Production Proxy) on port ${PROXY_PORT}..." [ -f /root/.cargo/env ] && source /root/.cargo/env - + PREFILL_URLS="" DECODE_URLS="" for ip in ${PREFILL_ARGS}; do - PREFILL_URLS+="--prefill-url http://${ip}:${SERVER_PORT}/v1 " + PREFILL_URLS+="--prefill http://${ip}:${SERVER_PORT} " done for ip in ${DECODE_ARGS}; do - DECODE_URLS+="--decode-url http://${ip}:${SERVER_PORT}/v1 " + DECODE_URLS+="--decode http://${ip}:${SERVER_PORT} " done - + UCX_TLS=tcp,self,shm VLLM_USE_V1=1 \ vllm-router \ --host 0.0.0.0 \ @@ -251,22 +260,20 @@ if [ "$NODE_RANK" -eq 0 ]; then --prometheus-port 29000 \ 2>&1 | tee /run_logs/${SLURM_JOB_ID}/vllm_router_NODE${NODE_RANK}.log >/dev/null & proxy_pid=$! - PROXY_PORT=$ROUTER_PORT else - echo "Starting Toy Proxy Server..." + echo "Starting Toy Proxy Server on port ${PROXY_PORT}..." UCX_TLS=tcp,self,shm NCCL_UCX_TLS=tcp VLLM_USE_V1=1 \ python3 "/app/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \ --host 0.0.0.0 \ - --port $SERVER_PORT \ + --port $PROXY_PORT \ --prefiller-hosts ${PREFILL_ARGS} \ --prefiller-ports ${PREFILL_PORTS} \ --decoder-hosts ${DECODE_ARGS} \ --decoder-ports ${DECODE_PORTS} 2>&1 | tee /run_logs/${SLURM_JOB_ID}/proxy_NODE${NODE_RANK}.log >/dev/null & proxy_pid=$! - PROXY_PORT=$SERVER_PORT fi - + echo "Waiting for proxy server to be up . . ." 
python $NIXL_COOKBOOK_PATH/socket_barrier.py \ --node-ips ${host_ip} \ @@ -280,8 +287,13 @@ if [ "$NODE_RANK" -eq 0 ]; then echo "Killing the proxy server" kill $proxy_pid + echo "Killing the prefill server" + kill $prefill_pid 2>/dev/null || true -elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then +elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then + # ================================================================= + # Prefill child (only active when xP > 1) + # ================================================================= echo "${host_name}:${host_ip} is Prefill Node (Model: ${MODEL_NAME:-'default'})" echo "Using prefill config: $PREFILL_MODEL_CONFIG" @@ -307,7 +319,6 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then vllm serve \${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --disable-log-requests \ --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"pd-run\", \"kv_role\": \"kv_producer\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"\${host_ip}\"'\", \"kv_port\": 14600}'" if [[ -n "$PREFILL_MODEL_CONFIG" ]]; then @@ -324,7 +335,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then --node-ips ${MASTER_ADDR} \ --node-ports $PROXY_PORT - echo "Waiting untill proxy server closes..." + echo "Waiting until proxy server closes..." 
python $NIXL_COOKBOOK_PATH/socket_wait.py \ --remote-ip ${MASTER_ADDR} \ --remote-port $PROXY_PORT @@ -333,10 +344,12 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then kill $prefill_pid else + # ================================================================= + # Decode node (rank >= xP) + # ================================================================= echo "${host_name}:${host_ip} is Decode Node (Model: ${MODEL_NAME:-'default'})" echo "Using decode config: $DECODE_MODEL_CONFIG" - DECODE_CMD="LD_LIBRARY_PATH=/app/install/nixl/lib/x86_64-linux-gnu/:/app/install/ucx/lib:/opt/rocm/lib:\$LD_LIBRARY_PATH \ ${DECODE_MODEL_ENVS} \ VLLM_USE_V1=1 \ @@ -359,7 +372,6 @@ else vllm serve \${MODEL_PATH} \ --port $SERVER_PORT \ --trust-remote-code \ - --disable-log-requests \ --kv-transfer-config '{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"llama8b-run\", \"kv_role\": \"kv_consumer\", \"kv_parallel_size\": 8, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"'\"\${host_ip}\"'\", \"kv_port\": 14600}'" if [[ -n "$DECODE_MODEL_CONFIG" ]]; then @@ -376,7 +388,7 @@ else --node-ips ${MASTER_ADDR} \ --node-ports $PROXY_PORT - echo "Waiting untill proxy server closes..." + echo "Waiting until proxy server closes..." python $NIXL_COOKBOOK_PATH/socket_wait.py \ --remote-ip ${MASTER_ADDR} \ --remote-port $PROXY_PORT @@ -386,8 +398,5 @@ else fi -echo "Killing the etcd server" -kill $etcd_pid - echo "Script completed successfully" exit 0 diff --git a/scripts/vllm_dissag/vllm_disagg_server_deepep.sh b/scripts/vllm_dissag/vllm_disagg_server_deepep.sh new file mode 100755 index 0000000..605f69a --- /dev/null +++ b/scripts/vllm_dissag/vllm_disagg_server_deepep.sh @@ -0,0 +1,516 @@ +#!/bin/bash +# vLLM Disaggregated Server - DeepEP + DBO Configuration +# ============================================================================= +# Co-located proxy topology (matches run_xPyD_models.slurm). 
+# +# Node roles (by NODE_RANK): +# 0 -> Prefill MASTER + Proxy (co-located, API server + DP coordinator + router) +# 1 .. xP-1 -> Prefill CHILD (--headless, no API server) +# xP -> Decode MASTER (API server, DP coordinator) +# xP+1 .. end -> Decode CHILD (--headless, no API server) +# +# Total nodes = xP + yD + +# ============================================================================= +# Environment Configuration +# ============================================================================= + +MASTER_ADDR="${MASTER_ADDR:-localhost}" +MASTER_PORT="${MASTER_PORT:-23731}" +NODE_RANK="${NODE_RANK:-0}" +MODEL_PATH="${MODEL_PATH}" +MODEL_NAME="${MODEL_NAME:-DeepSeek-V3}" +xP="${xP:-1}" +yD="${yD:-1}" +IPADDRS="${IPADDRS:-localhost}" +IFS=',' read -ra IP_ARRAY <<< "${IPADDRS}" + +echo "Listing NIXL_COOKBOOK_PATH : " +ls "${NIXL_COOKBOOK_PATH}" + +# ============================================================================= +# Port Configuration +# ============================================================================= + +SERVER_PORT=2584 +RPC_PORT=13345 +KV_PORT=14600 +BARRIER_PORT="${BARRIER_PORT:-15000}" + +PROXY_TYPE="${PROXY_TYPE:-vllm_router}" +ROUTER_PORT="${ROUTER_PORT:-18001}" +PROXY_PORT="${ROUTER_PORT}" + +if [[ "$PROXY_TYPE" != "vllm_router" && "$PROXY_TYPE" != "toy_proxy" ]]; then + echo "Error: Invalid PROXY_TYPE='$PROXY_TYPE'. Must be 'vllm_router' or 'toy_proxy'." 
>&2 + exit 1 +fi + +# ============================================================================= +# DeepEP / DBO / Profiling Configuration +# ============================================================================= + +PREFILL_DEEPEP_BACKEND="${PREFILL_DEEPEP_BACKEND:-deepep_high_throughput}" +DECODE_DEEPEP_BACKEND="${DECODE_DEEPEP_BACKEND:-deepep_low_latency}" +ENABLE_DBO="${ENABLE_DBO:-false}" +DBO_COMM_SMS="${DBO_COMM_SMS:-}" +ENABLE_PROFILING="${ENABLE_PROFILING:-false}" + +DBO_ARGS="" +[[ "${ENABLE_DBO}" == "true" ]] && DBO_ARGS="--enable-dbo" + +# ============================================================================= +# Node-Specific Configuration +# ============================================================================= + +host_ip=$(hostname -I | awk '{print $1}') +host_name=$(hostname) + +PREFILL_DP_SIZE=$((xP * 8)) +DECODE_DP_SIZE=$((yD * 8)) +DP_SIZE_LOCAL=8 + +# Rank 0 is prefill master + proxy (co-located) +PREFILL_MASTER_ADDR=$(echo "$IPADDRS" | awk -F',' '{print $1}') +DECODE_MASTER_ADDR=$(echo "$IPADDRS" | awk -F',' -v pos="$xP" '{print $(pos+1)}') +PREFILL_DP_START_RANK=$(( NODE_RANK * DP_SIZE_LOCAL )) +DECODE_DP_START_RANK=$(( (NODE_RANK - xP) * DP_SIZE_LOCAL )) + +echo "=============================================" +echo "DeepEP Configuration for ${MODEL_NAME}" +echo " Prefill backend : ${PREFILL_DEEPEP_BACKEND}" +echo " Decode backend : ${DECODE_DEEPEP_BACKEND}" +echo " DBO enabled : ${ENABLE_DBO}" +echo " DBO COMM_SMS : ${DBO_COMM_SMS:-}" +echo " Profiling : ${ENABLE_PROFILING}" +echo " Server port : ${SERVER_PORT}" +echo " Proxy port : ${PROXY_PORT}" +echo " Prefill DP size : ${PREFILL_DP_SIZE} (xP=${xP})" +echo " Decode DP size : ${DECODE_DP_SIZE} (yD=${yD})" +echo " DP size local : ${DP_SIZE_LOCAL}" +echo " Prefill master : ${PREFILL_MASTER_ADDR}" +echo " Decode master : ${DECODE_MASTER_ADDR}" +echo " Local IP : ${host_ip}" +echo " NODE_RANK : ${NODE_RANK}" +echo "=============================================" + +# 
============================================================================= +# Helper Functions +# ============================================================================= + +setup_deepep_env() { + local backend=$1 + + export ROCSHMEM_HEAP_SIZE=7524589824 + export ROCSHMEM_MAX_NUM_CONTEXTS=256 + export HSA_NO_SCRATCH_RECLAIM=1 + + # --- Auto-detect RocSHMEM directory --- + if [[ -z "${ROCSHMEM_DIR}" ]]; then + for _d in /root/rocshmem /opt/rocshmem; do + [[ -d "$_d/lib" ]] && export ROCSHMEM_DIR="$_d" && break + done + export ROCSHMEM_DIR="${ROCSHMEM_DIR:-/root/rocshmem}" + fi + + # --- Auto-detect OMPI directory --- + if [[ -z "${OMPI_DIR}" ]]; then + for _d in /root/install/ompi /usr/lib/x86_64-linux-gnu/openmpi /opt/ompi; do + [[ -d "$_d" ]] && export OMPI_DIR="$_d" && break + done + export OMPI_DIR="${OMPI_DIR:-/root/install/ompi}" + fi + + # --- Auto-detect UCX lib directory --- + local _ucx_lib="" + for _d in /root/install/ucx/lib /usr/local/ucx/lib /opt/ucx/lib; do + [[ -f "$_d/libucp.so" ]] && _ucx_lib="$_d" && break + done + : "${_ucx_lib:=/root/install/ucx/lib}" + + # --- Auto-detect NIXL lib directory --- + local _nixl_lib="" + for _d in /opt/nixl/lib \ + /usr/local/RIXL/install/lib/x86_64-linux-gnu \ + /usr/local/lib/python3.12/dist-packages/.rixl.mesonpy.libs \ + /usr/local/nixl/lib; do + [[ -f "$_d/libnixl.so" && -d "$_d/plugins" ]] && _nixl_lib="$_d" && break + done + if [[ -z "$_nixl_lib" ]]; then + for _d in /opt/nixl/lib \ + /usr/local/RIXL/install/lib/x86_64-linux-gnu \ + /usr/local/lib/python3.12/dist-packages/.rixl.mesonpy.libs \ + /root/RIXL/build/src/core \ + /usr/local/nixl/lib; do + [[ -f "$_d/libnixl.so" ]] && _nixl_lib="$_d" && break + done + fi + + # --- Fix NIXL Python bindings if missing --- + if ! 
python3 -c "import nixl" 2>/dev/null; then + local _rixl_py + _rixl_py=$(find /root/RIXL/build -path "*/bindings/python" -type d 2>/dev/null | head -1) + if [[ -n "$_rixl_py" ]]; then + echo "[setup_deepep_env] NIXL Python bindings missing, installing from $_rixl_py ..." + pip install --no-deps -e "$_rixl_py" 2>/dev/null || true + fi + fi + + # --- Build LD_LIBRARY_PATH --- + local _extra_ld="" + [[ -n "$_nixl_lib" ]] && _extra_ld+="${_nixl_lib}:" + _extra_ld+="${_ucx_lib}:/usr/local/lib:/usr/local/lib64:/opt/rocm/lib" + export LD_LIBRARY_PATH="${_extra_ld}:${LD_LIBRARY_PATH}" + + echo "[setup_deepep_env] ROCSHMEM_DIR=$ROCSHMEM_DIR OMPI_DIR=$OMPI_DIR" + echo "[setup_deepep_env] UCX_LIB=$_ucx_lib NIXL_LIB=${_nixl_lib:-}" + + # --- vLLM runtime flags --- + export VLLM_USE_V1=1 + export VLLM_LOGGING_LEVEL=INFO + export VLLM_ALL2ALL_BACKEND="${backend}" + export VLLM_ROCM_USE_AITER=1 + export VLLM_ROCM_USE_AITER_MLA=1 + export VLLM_ROCM_USE_AITER_PAGED_ATTN=0 + export VLLM_ROCM_USE_AITER_RMSNORM=1 + export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=0 + export VLLM_USE_AITER_TRITON_SILU_MUL=0 + export VLLM_SERVER_DEV_MODE=0 + export VLLM_ROCM_USE_AITER_MOE=0 + export VLLM_ENGINE_READY_TIMEOUT_S=3600 + + export VLLM_NIXL_SIDE_CHANNEL_HOST="${host_ip}" + export VLLM_NIXL_SIDE_CHANNEL_PORT=5557 + + # --- Network / RDMA --- + export GLOO_SOCKET_IFNAME=eth0 + export NCCL_SOCKET_IFNAME=eth0 + export NCCL_IB_GID_INDEX=3 + export NCCL_CROSS_NIC=1 + export NCCL_NET_GDR_LEVEL=PHB + + export UCX_TLS=rc,sm,self,rocm_copy,rocm_ipc,tcp + if [[ -z "${UCX_NET_DEVICES}" ]]; then + local available_devs + available_devs=$(ibstat 2>/dev/null | awk ' + /^CA /{gsub(/\047/,"",$2); ca=$2} + /Rate:/{if($2+0 >= 200) devs=devs (devs?",":"") ca":1"} + END{print devs}') + export UCX_NET_DEVICES="${available_devs:-mlx5_0:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_7:1,mlx5_8:1,mlx5_9:1}" + fi + if [[ -z "${NCCL_IB_HCA}" ]]; then + local nccl_hcas + nccl_hcas=$(ibstat 2>/dev/null | awk ' + /^CA 
/{gsub(/\047/,"",$2); ca=$2} + /Rate:/{if($2+0 >= 200) devs=devs (devs?",":"") ca} + END{print devs}') + export NCCL_IB_HCA="${nccl_hcas:-mlx5_0,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_7,mlx5_8,mlx5_9}" + fi + export UCX_SOCKADDR_TLS_PRIORITY=rdmacm,tcp + export UCX_SOCKADDR_CM_ENABLE=y + export UCX_RDMA_CM_ENABLED=y + export UCX_MEMTYPE_CACHE=y + export UCX_RNDV_SCHEME=get_zcopy + export UCX_RNDV_THRESH=4k + export UCX_ROCM_IPC_MIN_ZCOPY=0 + export HSA_ENABLE_SDMA=1 + export UCX_LOG_LEVEL=info + export NIXL_LOG_LEVEL="${NIXL_LOG_LEVEL:-INFO}" + + [[ -n "${DBO_COMM_SMS}" ]] && export VLLM_DBO_COMM_SMS="${DBO_COMM_SMS}" + + # --- PR #39276: Fix NIXL engine_id collision in multi-node DP --- + local _core_py + _core_py=$(python3 -c "import vllm.v1.engine.core as m; print(m.__file__)" 2>/dev/null) + if [[ -n "$_core_py" && -f "$_core_py" ]]; then + if grep -q '_dp{local_dp_rank}' "$_core_py"; then + sed -i 's/_dp{local_dp_rank}/_dp{dp_rank}/g' "$_core_py" + echo "[setup_deepep_env] Applied PR#39276 fix: engine_id uses dp_rank (core.py)" + fi + fi + local _utils_py + _utils_py=$(python3 -c "import vllm.v1.engine.utils as m; print(m.__file__)" 2>/dev/null) + if [[ -n "$_utils_py" && -f "$_utils_py" ]]; then + if grep -q '_dp{local_index}' "$_utils_py"; then + sed -i 's/_dp{local_index}/_dp{index}/g' "$_utils_py" + echo "[setup_deepep_env] Applied PR#39276 fix: engine_id uses index (utils.py)" + fi + fi + + echo "[setup_deepep_env] UCX_NET_DEVICES=$UCX_NET_DEVICES" + echo "[setup_deepep_env] NCCL_IB_HCA=$NCCL_IB_HCA" +} + +build_kv_transfer_config() { + local kv_role="$1" + local engine_id="$2" + local dp_size="$3" + echo "{\"kv_connector\": \"NixlConnector\", \"engine_id\": \"${engine_id}\", \"kv_role\": \"${kv_role}\", \"kv_parallel_size\": ${dp_size}, \"kv_rank\": 0, \"kv_buffer_size\": 5000000000, \"kv_buffer_device\": \"cuda\", \"kv_ip\": \"${host_ip}\", \"kv_port\": ${KV_PORT}}" +} + +# Launch a vllm serve worker and set WORKER_PID to its PID. 
+# $1 = role "prefill_master" | "prefill_child" | "decode_master" | "decode_child" +# $2 = backend DeepEP all2all backend name +# $3 = dp_size data-parallel size +# $4 = dp_addr data-parallel master address +# $5 = kv_role "kv_producer" or "kv_consumer" +# $6 = engine_id "pd-prefill" or "pd-decode" +# $7 = log_prefix "prefill" or "decode" +# $8 = dp_start_rank (only for child nodes) +launch_vllm_worker() { + local role="$1" + local backend="$2" + local dp_size="$3" + local dp_addr="$4" + local kv_role="$5" + local engine_id="$6" + local log_prefix="$7" + local dp_start_rank="${8:-}" + + setup_deepep_env "${backend}" + + local extra_args=() + case "$role" in + *_master) + extra_args+=(--api-server-count=8) + extra_args+=(--data-parallel-start-rank 0) + ;; + *_child) + extra_args+=(--data-parallel-start-rank "${dp_start_rank}") + extra_args+=(--headless) + ;; + esac + + # Decode roles get cudagraph; prefill uses enforce-eager + local compile_args=() + case "$role" in + decode_*) + compile_args+=(--compilation-config '{"cudagraph_mode":"FULL_DECODE_ONLY","custom_ops":["+quant_fp8"]}') + compile_args+=(--cudagraph-capture-sizes 1 2 4 8 16 32 64 128 256) + ;; + prefill_*) + compile_args+=(--enforce-eager) + ;; + esac + + local kv_config + kv_config=$(build_kv_transfer_config "${kv_role}" "${engine_id}" "${dp_size}") + + vllm serve "${MODEL_PATH}" \ + --port "${SERVER_PORT}" \ + --trust-remote-code \ + -tp 1 \ + --data-parallel-size "${dp_size}" \ + --data-parallel-size-local "${DP_SIZE_LOCAL}" \ + --data-parallel-address "${dp_addr}" \ + --data-parallel-rpc-port "${RPC_PORT}" \ + --master-addr "${dp_addr}" \ + "${compile_args[@]}" \ + --no-enable-prefix-caching --block-size 1 \ + --gpu-memory-utilization 0.8 \ + --kv-cache-dtype fp8 \ + --enable-expert-parallel \ + --all2all-backend "${backend}" \ + ${DBO_ARGS} \ + "${extra_args[@]}" \ + --kv-transfer-config "${kv_config}" \ + 2>&1 | tee /run_logs/${SLURM_JOB_ID}/${log_prefix}_NODE${NODE_RANK}.log > /dev/null & + + 
WORKER_PID=$! +} + +wait_for_proxy_and_cleanup() { + local worker_pid="$1" + local label="$2" + + echo "Waiting for proxy server to be up..." + python "${NIXL_COOKBOOK_PATH}/socket_barrier.py" \ + --node-ips "${MASTER_ADDR}" \ + --node-ports "${PROXY_PORT}" + + echo "Waiting until proxy server closes..." + python "${NIXL_COOKBOOK_PATH}/socket_wait.py" \ + --remote-ip "${MASTER_ADDR}" \ + --remote-port "${PROXY_PORT}" + + echo "Killing the ${label} server" + kill "${worker_pid}" 2>/dev/null || true +} + +print_node_info() { + local role_desc="$1" + echo "========= NODE INFO ====================" + echo "Node list : ${SLURM_JOB_NODELIST}" + echo "Node IPs : ${IPADDRS}" + echo "Model : ${MODEL_NAME}" + echo "${host_name}:${host_ip} is ${role_desc}." + echo "=========================================" +} + +# ============================================================================= +# Container Synchronization +# ============================================================================= + +for _pid in $(ss -tlnp sport = "${BARRIER_PORT}" 2>/dev/null | grep -oP "pid=\K\d+"); do + kill -9 "$_pid" 2>/dev/null +done +sleep 2 + +echo "Waiting at the container creation barrier on ${host_name}" +python "${NIXL_COOKBOOK_PATH}/socket_barrier.py" \ + --local-ip "${host_ip}" \ + --local-port "${BARRIER_PORT}" \ + --enable-port \ + --node-ips "${IPADDRS}" \ + --node-ports "${BARRIER_PORT}" + +sleep 3 + +# ============================================================================= +# Node Role Assignment and Server Launch +# ============================================================================= + +PREFILL_MASTER_IP="${IP_ARRAY[0]}" +DECODE_MASTER_IP="${IP_ARRAY[$xP]}" +MASTER_IPS="${PREFILL_MASTER_IP},${DECODE_MASTER_IP}" + +if [ "$NODE_RANK" -eq 0 ]; then + # ================================================================= + # Rank 0: Prefill MASTER + Proxy (co-located) + # ================================================================= + print_node_info "Prefill 
master + Proxy node (co-located)"
+    echo "Prefill master IP : ${PREFILL_MASTER_IP}"
+    echo "Decode master IP : ${DECODE_MASTER_IP}"
+    echo "PREFILL_DP_SIZE=${PREFILL_DP_SIZE} PREFILL_MASTER_ADDR=${PREFILL_MASTER_ADDR}"
+    echo "vLLM serve port: ${SERVER_PORT} Proxy port: ${PROXY_PORT}"
+
+    launch_vllm_worker "prefill_master" "${PREFILL_DEEPEP_BACKEND}" \
+        "${PREFILL_DP_SIZE}" "${PREFILL_MASTER_ADDR}" \
+        "kv_producer" "pd-prefill" "prefill"
+
+    # NOTE(review): WORKER_PID is presumably set by launch_vllm_worker, which is
+    # defined outside this hunk - confirm. It is saved here so the worker can be
+    # killed once benchmarking has finished.
+    local_worker_pid="${WORKER_PID}"
+
+    echo "Waiting for prefill & decode master servers to start..."
+
+    # Polling parameters shared by wait_log_signal_or_fail below.
+    TIMEOUT_SECONDS=4000
+    SLEEP_SECONDS=10
+    SEARCH_SIGNAL="Application startup complete."
+
+    # Rank 0 is the prefill master (this node); rank xP is the decode master.
+    PREFILL_LOG="/run_logs/${SLURM_JOB_ID}/prefill_NODE0.log"
+    DECODE_LOG="/run_logs/${SLURM_JOB_ID}/decode_NODE${xP}.log"
+
+    #######################################
+    # Block until SEARCH_SIGNAL appears in a log file, or abort the script.
+    # Globals:   SEARCH_SIGNAL, TIMEOUT_SECONDS, SLEEP_SECONDS,
+    #            SLURM_JOB_ID, NODE_RANK (all read)
+    # Arguments: $1 - log file to poll (may not exist yet)
+    #            $2 - human-readable label used in messages
+    # Outputs:   "Ready: ..." on success; on timeout a message that is also
+    #            appended to this node's proxy log
+    # Returns:   0 once the signal is found; exits the script with status 1
+    #            after TIMEOUT_SECONDS of polling
+    #######################################
+    wait_log_signal_or_fail() {
+        local LOG_FILE="$1"
+        local LABEL="$2"
+        local ELAPSED=0
+        # -F matches the signal literally (its trailing '.' must not act as a
+        # regex wildcard); 2>/dev/null hides "No such file" until the log exists.
+        until grep -qF -- "${SEARCH_SIGNAL}" "${LOG_FILE}" 2>/dev/null; do
+            if [ "${ELAPSED}" -ge "${TIMEOUT_SECONDS}" ]; then
+                echo "Timeout (${TIMEOUT_SECONDS}s): '${SEARCH_SIGNAL}' not found in ${LABEL}: ${LOG_FILE}" \
+                    | tee -a "/run_logs/${SLURM_JOB_ID}/proxy_NODE${NODE_RANK}.log"
+                exit 1
+            fi
+            sleep "${SLEEP_SECONDS}"
+            ELAPSED=$((ELAPSED + SLEEP_SECONDS))
+        done
+        echo "Ready: ${LABEL} (${LOG_FILE})"
+    }
+
+    wait_log_signal_or_fail "${PREFILL_LOG}" "prefill master"
+    wait_log_signal_or_fail "${DECODE_LOG}" "decode master"
+
+    sleep 10
+
+    if [ "$PROXY_TYPE" == "vllm_router" ]; then
+        echo "Starting vLLM Router (Production Proxy) on port ${PROXY_PORT}..."
+        [ -f /root/.cargo/env ] && source /root/.cargo/env
+
+        PREFILL_URLS="--prefill http://${PREFILL_MASTER_IP}:${SERVER_PORT}"
+        DECODE_URLS="--decode http://${DECODE_MASTER_IP}:${SERVER_PORT}"
+
+        # The router must listen on PROXY_PORT: that is the port announced
+        # above, probed by the readiness barrier and used by the benchmark.
+        # Output is redirected straight to the log instead of being piped
+        # through tee, so that $! is the router's PID and not tee's.
+        # PREFILL_URLS / DECODE_URLS are left unquoted on purpose: each holds
+        # a flag plus its value and relies on word splitting.
+        UCX_TLS=tcp,self,shm VLLM_USE_V1=1 \
+        vllm-router \
+            --host 0.0.0.0 \
+            --port "${PROXY_PORT}" \
+            --vllm-pd-disaggregation \
+            $PREFILL_URLS \
+            $DECODE_URLS \
+            --policy round_robin \
+            --prefill-policy round_robin \
+            --decode-policy round_robin \
+            --intra-node-data-parallel-size 1 \
+            > "/run_logs/${SLURM_JOB_ID}/vllm_router_NODE${NODE_RANK}.log" 2>&1 &
+        proxy_pid=$!
+    else
+        echo "Starting Toy Proxy Server on port ${PROXY_PORT}..."
+
+        # Direct redirection (no tee pipeline) keeps $! pointing at the proxy
+        # process, so the kill below actually stops it.
+        UCX_TLS=tcp,self,shm NCCL_UCX_TLS=tcp VLLM_USE_V1=1 \
+        python3 "/app/vllm/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py" \
+            --host 0.0.0.0 \
+            --port "${PROXY_PORT}" \
+            --prefiller-hosts "${PREFILL_MASTER_IP}" \
+            --prefiller-ports "${SERVER_PORT}" \
+            --decoder-hosts "${DECODE_MASTER_IP}" \
+            --decoder-ports "${SERVER_PORT}" \
+            > "/run_logs/${SLURM_JOB_ID}/proxy_NODE${NODE_RANK}.log" 2>&1 &
+        proxy_pid=$!
+    fi
+
+    echo "Waiting for proxy server to be up..."
+    python "${NIXL_COOKBOOK_PATH}/socket_barrier.py" \
+        --node-ips "${host_ip}" \
+        --node-ports "${PROXY_PORT}"
+
+    echo "Proxy ready for benchmarking on ${host_name}:${host_ip}:${PROXY_PORT}"
+
+    sleep 10
+    # The benchmark script reads its target port from BENCHMARK_PORT.
+    export BENCHMARK_PORT="${PROXY_PORT}"
+    bash "${NIXL_COOKBOOK_PATH}/benchmark_xPyD.sh"
+
+    # Best-effort teardown: either process may already have exited.
+    echo "Killing proxy server"
+    kill "${proxy_pid}" 2>/dev/null || true
+    echo "Killing prefill master server"
+    kill "${local_worker_pid}" 2>/dev/null || true
+
+elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then
+    # =================================================================
+    # Prefill CHILD (--headless, no API server; only when xP > 1)
+    # =================================================================
+    print_node_info "Prefill child node"
+    echo "PREFILL_DP_SIZE=${PREFILL_DP_SIZE} PREFILL_DP_START_RANK=${PREFILL_DP_START_RANK}"
+
+    launch_vllm_worker "prefill_child" "${PREFILL_DEEPEP_BACKEND}" \
+        "${PREFILL_DP_SIZE}" "${PREFILL_MASTER_ADDR}" \
+        "kv_producer" "pd-prefill" "prefill" "${PREFILL_DP_START_RANK}"
+
+    wait_for_proxy_and_cleanup "${WORKER_PID}" "prefill child"
+
+elif [ "$NODE_RANK" -eq "$xP" ]; then
+    # =================================================================
+    # Decode MASTER (API server + DP coordinator)
+    # =================================================================
+    print_node_info "Decode master node"
+    echo "DECODE_DP_SIZE=${DECODE_DP_SIZE} DECODE_MASTER_ADDR=${DECODE_MASTER_ADDR}"
+
+    launch_vllm_worker "decode_master" "${DECODE_DEEPEP_BACKEND}" \
+        "${DECODE_DP_SIZE}" "${DECODE_MASTER_ADDR}" \
+        "kv_consumer" "pd-decode" "decode"
+
+    wait_for_proxy_and_cleanup "${WORKER_PID}" "decode master"
+
+else
+    # =================================================================
+    # Decode CHILD (--headless, no API server; rank > xP)
+    # =================================================================
+    print_node_info "Decode child node"
+    echo "DECODE_DP_SIZE=${DECODE_DP_SIZE} DECODE_DP_START_RANK=${DECODE_DP_START_RANK}"
+
+    launch_vllm_worker "decode_child" "${DECODE_DEEPEP_BACKEND}" \
+        "${DECODE_DP_SIZE}" "${DECODE_MASTER_ADDR}" \
+        "kv_consumer" "pd-decode" "decode" "${DECODE_DP_START_RANK}"
+
+    wait_for_proxy_and_cleanup "${WORKER_PID}" "decode child"
+
+fi
+
+echo "Script completed successfully"
+exit 0