2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -22,7 +22,7 @@ Below are blueprints of supported models along with their documentation.
| [**PyTorch PEFT/FSDP fine-tuning**](scripts/pytorch_train/HF_PEFT_FSDP/README.md) | Finetuning a HF model with LoRA approach & FSDP strategy | Llama-2-70b-chat-hf |
| [**Large EP microbenchmark**](scripts/large-ep-benchmark/README.md) | MoE Large Expert Parallelism with MoRI-EP & DeepEP communication microbenchmarks | no specific models |
| [**vLLM disaggregated P/D inference**](scripts/vllm_dissag/README.MD) | Distributed Inference P/D disaggregation with vLLM | DeepSeek-V3, Llama-3.3-70B-Instruct-FP8-KV, Llama-3.1-405B-Instruct-FP8-KV, gpt-oss-120b |
| [**SGLang disaggregated P/D inference**](scripts/sglang_disagg/README.MD) | Distributed Inference P/D disggregation with SGLang | Qwen3-32B, Llama-3.1-8B-Instruct, Llama-3.3-70B-Instruct-FP8-KV, Llama-3.1-405B-Instruct-FP8-KV, DeepSeek-V3, Mixtral-8x7B-v0.1 |
| [**SGLang disaggregated P/D inference**](scripts/sglang_disagg/README.MD) | Distributed Inference P/D disaggregation with SGLang | Qwen3-32B, Llama-3.1-8B-Instruct, Llama-3.3-70B-Instruct-FP8-KV, Llama-3.1-405B-Instruct-FP8-KV, DeepSeek-V3, Mixtral-8x7B-v0.1 |

## Table of Contents

112 changes: 81 additions & 31 deletions docker/vllm_disagg_inference.ubuntu.amd.Dockerfile
@@ -1,19 +1,28 @@
ARG BASE_IMAGE=rocm/vllm:v0.14.0_amd_dev
ARG BASE_IMAGE=rocm/vllm-dev:base_torch2.10_triton3.6_rocm7.2_torch_build_20260216
FROM ${BASE_IMAGE}

ENTRYPOINT []

WORKDIR /root

RUN sed -i 's|http://|https://|g' /etc/apt/sources.list

ENV _ROCM_DIR=/opt/rocm

ENV _UCX_SOURCE=https://github.com/ROCm/ucx.git
ENV _UCX_BRANCH=v1.19.x
ENV _UCX_BRANCH=da3fac2a
ENV _UCX_INSTALL_DIR=/usr/local/ucx/

ENV _RIXL_SOURCE=github.com/ROCm/RIXL.git
ENV _RIXL_BRANCH=develop
ENV _RIXL_SOURCE=https://github.com/ROCm/RIXL.git
ENV _RIXL_BRANCH=f33a5599
ENV _RIXL_INSTALL_DIR=/usr/local/RIXL/install
ENV _NIXLBENCH_INSTALL_DIR=/usr/local/RIXL

ARG GFX_COMPILATION_ARCH="gfx942"
ARG NIC_COMPILATION_ARCH="cx7"
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_COMMIT=7d6917bef552d6aff70142ab9fb8af648081d4db

RUN pip3 install meson==0.64.0
RUN pip3 install "pybind11[global]"

@@ -31,8 +40,9 @@ RUN set -e && apt -y install gcc make libtool autoconf librdmacm-dev rdmacm-util
RUN apt install -y libgflags-dev

# Install UCX
RUN git clone ${_UCX_SOURCE} -b ${_UCX_BRANCH} && \
RUN git clone ${_UCX_SOURCE} && \
cd ucx && \
git checkout ${_UCX_BRANCH} && \
./autogen.sh && \
mkdir -p build && \
cd build && \
@@ -46,7 +56,7 @@ ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/ucx/lib/
ENV PATH=$PATH:/usr/local/ucx/bin/

RUN set -e && apt update && \
apt install -y libaio-dev liburing-dev etcd etcd-server etcd-client libcpprest-dev libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc && \
apt install -y libaio-dev liburing-dev libcpprest-dev libgrpc-dev libgrpc++-dev libprotobuf-dev protobuf-compiler-grpc wget && \
wget https://github.com/google/googletest/archive/refs/tags/v1.14.0.tar.gz && \
tar -xzf v1.14.0.tar.gz && \
cd googletest-1.14.0 && \
@@ -57,32 +67,13 @@ RUN set -e && apt update && \
make install && \
cd ../..

# Expected etcd at /usr/local/bin/etcd//etcd
RUN wget https://github.com/etcd-io/etcd/releases/download/v3.6.0-rc.5/etcd-v3.6.0-rc.5-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \
mkdir -p /usr/local/bin/etcd && \
tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \
rm /tmp/etcd.tar.gz
ENV PATH=$PATH:/usr/local/bin/etcd/

RUN set -e && echo "Compiling etcd-cpp API" && \
git clone https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git && \
cd etcd-cpp-apiv3 && \
mkdir build && cd build && \
cmake -DCMAKE_FIND_ROOT_PATH=/usr/grpc .. && \
make -j && \
make install && \
cd ../.. && \
echo "etcd-cpp installation completed."

ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib/
ENV CMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH:/usr/local/lib/cmake/etcd-cpp-api/
ENV PATH=/root/.local/bin:${_UCX_INSTALL_DIR}/bin:$PATH
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${_RIXL_INSTALL_DIR}/lib/x86_64-linux-gnu
ENV CMAKE_PREFIX_PATH=/usr/local/lib/cmake/etcd-cpp-api/:/usr/grpc/lib/cmake/:/usr/local/lib/cmake

RUN set -e && git clone https://${_RIXL_SOURCE} -b ${_RIXL_BRANCH} && \
RUN set -e && git clone ${_RIXL_SOURCE} && \
cd RIXL && \
git checkout ed772c8d0d8a47c7b4e1a622b13c4f6087a4972a && \
git checkout ${_RIXL_BRANCH} && \
meson setup build/ --prefix=${_RIXL_INSTALL_DIR} \
-Ducx_path=${_UCX_INSTALL_DIR} \
-Ddisable_gds_backend=true \
@@ -112,14 +103,73 @@ RUN set -e && echo "Compiling NixlBench" && \
ninja install && \
echo "NixlBench compilation complete"

# Only need tests/ for toy_proxy_server.py; base image already has vLLM installed
RUN git clone --depth 1 https://github.com/vllm-project/vllm.git /tmp/vllm-src && \
cp -r /tmp/vllm-src/tests /app/vllm/tests && \
rm -rf /tmp/vllm-src

# Install Rust compiler (required for building vllm-router)
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

# Install vllm-router
RUN pip install vllm-router

WORKDIR /app

# versions.txt is provided by the base image and contains MORI_REPO / MORI_BRANCH entries.
RUN pip install tqdm prettytable
RUN git clone --recursive $(grep '^MORI_REPO:' /app/versions.txt | cut -d' ' -f2) && \
cd mori && \
git checkout $(grep '^MORI_BRANCH:' /app/versions.txt | cut -d' ' -f2)

RUN git clone --no-checkout --filter=blob:none https://github.com/ROCm/rocm-systems.git && cd rocm-systems && \
git sparse-checkout set --cone projects/rocshmem && \
git checkout develop

WORKDIR /app/rocm-systems/projects/rocshmem
RUN echo "ROCSHMEM_REPO=\"https://github.com/ROCm/rocm-systems.git\"" >> /app/versions.txt
RUN echo "ROCSHMEM_BRANCH=\"$(git log | head -1 | awk '{print $2}' | cut -c1-8)\"" >> /app/versions.txt
RUN mkdir -p /app/rocshmem-build
WORKDIR /app/rocshmem-build
RUN /app/rocm-systems/projects/rocshmem/scripts/build_configs/all_backends -DUSE_EXTERNAL_MPI=OFF -DGPU_TARGETS=$GFX_COMPILATION_ARCH

WORKDIR /app
RUN git clone https://github.com/ROCm/DeepEP.git
WORKDIR /app/DeepEP
RUN echo "DEEPEP_REPO=\"https://github.com/ROCm/DeepEP.git\"" >> /app/versions.txt
RUN echo "DEEPEP_BRANCH=\"$(git log | head -1 | awk '{print $2}' | cut -c1-8)\"" >> /app/versions.txt
RUN PYTORCH_ROCM_ARCH=$GFX_COMPILATION_ARCH CFLAGS="-O3 -fPIC" CXXFLAGS="-O3 -fPIC --offload-arch=$GFX_COMPILATION_ARCH" HIP_CXX_FLAGS="-O3 -fPIC" \
python3 setup.py --variant rocm --nic $NIC_COMPILATION_ARCH build develop

# Uninstall vLLM from the base image, then install the pinned commit from source (ROCm).
# TODO: Remove these installation details once upstream vLLM is stable.
RUN pip uninstall -y vllm || true
RUN pip install setuptools-scm huggingface-hub[cli]
RUN pip install quart msgpack --ignore-installed blinker
RUN rm -rf /tmp/vllm-src && \
git clone --recursive "${VLLM_REPO}" /tmp/vllm-src && \
cd /tmp/vllm-src && \
git checkout "${VLLM_COMMIT}" && \
git submodule update --init --recursive && \
pip install -r requirements/rocm.txt && \
pip install -r requirements/kv_connectors_rocm.txt && \
(PYTORCH_ROCM_ARCH=${GFX_COMPILATION_ARCH} python setup.py install || \
echo "WARNING: vLLM build from source failed; container may be broken") && \
mkdir -p /app/vllm && \
cp -r tests /app/vllm/tests && \
cp -r examples /app/vllm/examples && \
cp -r benchmarks /app/vllm/benchmarks && \
rm -rf /tmp/vllm-src

WORKDIR /app

ENV ROCSHMEM_TEST_UUID=1
ENV ROCSHMEM_HEAP_SIZE=6442450944

RUN pip install --upgrade vllm-router && \
pip install py-spy && \
pip install --ignore-installed --force-reinstall flask

RUN echo "UCX_REPO=${_UCX_SOURCE}" >> /app/versions.txt && \
echo "UCX_BRANCH=${_UCX_BRANCH}" >> /app/versions.txt && \
echo "RIXL_REPO=${_RIXL_SOURCE}" >> /app/versions.txt && \
echo "RIXL_BRANCH=${_RIXL_BRANCH}" >> /app/versions.txt

RUN cat /app/versions.txt
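The Dockerfile above repeatedly parses `versions.txt` with the same `grep`/`cut` pipeline (e.g. the `MORI_REPO` lookup). A minimal standalone sketch of that parsing, assuming the one-entry-per-line `KEY: value` format the Dockerfile's comments describe; the repo URL below is a placeholder, not the real entry:

```shell
# Sketch of the versions.txt parsing used in the Dockerfile.
# Assumes "KEY: value" entries, one per line (placeholder values).
cat > /tmp/versions.txt <<'EOF'
MORI_REPO: https://example.org/ROCm/mori.git
MORI_BRANCH: main
EOF

# Same pipeline as the Dockerfile: match the key at line start,
# then take the second space-separated field (the value).
MORI_REPO=$(grep '^MORI_REPO:' /tmp/versions.txt | cut -d' ' -f2)
MORI_BRANCH=$(grep '^MORI_BRANCH:' /tmp/versions.txt | cut -d' ' -f2)
echo "repo=$MORI_REPO branch=$MORI_BRANCH"
```

Note that `cut -d' ' -f2` keeps only the second space-separated field, so values in `versions.txt` must not contain spaces.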
2 changes: 1 addition & 1 deletion scripts/sglang_disagg/README.MD
@@ -1,4 +1,4 @@
## List of Models - focus SGlang Disaggerated P/D inference
## List of Models - focus SGLang Disaggregated P/D inference

Dense Models
- Qwen3-32B (https://huggingface.co/Qwen/Qwen3-32B)
4 changes: 2 additions & 2 deletions scripts/sglang_disagg/sglang_disagg_server.sh
@@ -218,7 +218,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -le "$xP" ]; then
--node-ips ${MASTER_ADDR} \
--node-ports 2322

echo "Waiting untill proxy server closes..."
echo "Waiting until proxy server closes..."
python $MOONCAKE_COOKBOOK_PATH/socket_wait.py \
--remote-ip ${MASTER_ADDR} \
--remote-port 2322
@@ -254,7 +254,7 @@ else
--node-ips ${MASTER_ADDR} \
--node-ports 2322

echo "Waiting untill proxy server closes..."
echo "Waiting until proxy server closes..."
python $MOONCAKE_COOKBOOK_PATH/socket_wait.py \
--remote-ip ${MASTER_ADDR} \
--remote-port 2322
88 changes: 70 additions & 18 deletions scripts/vllm_dissag/README.MD
@@ -1,4 +1,4 @@
## List of Models - focus VLLM Dissagerated P/D inference
## List of Models - focus VLLM Disaggregated P/D inference

Dense Models
- deepseek-ai/DeepSeek-V3 (https://huggingface.co/deepseek-ai/DeepSeek-V3)
@@ -8,11 +8,11 @@ Dense Models

This repository contains scripts and documentation to launch PD Disaggregation using the Nixl framework for the above models. You will find setup instructions, node assignment details, and benchmarking commands.

## 📝 Prerequisites
## Prerequisites

- A Slurm cluster with required Nodes -> xP + yD + 1 (minimum size 3: xP=1 and xD=1)
- Docker container with VLLM, Nixl, etcd and NIC drivers built-in. Refer to Building the Docker image section below.
- Access to a shared filesystem for log collection( cluster specific)
- A Slurm cluster with required Nodes -> xP + yD + 1 (minimum size 3: xP=1 and yD=1)
- Docker container with VLLM, Nixl and NIC drivers built-in. Refer to Building the Docker image section below.
- Access to a shared filesystem for log collection (cluster specific)


## Building the Docker image
@@ -25,24 +25,76 @@ docker build -t vllm_dissag_pd_image -f vllm_dissag_inference.ubuntu.amd.Docker
## Scripts and Benchmarking
Run instructions - scripts/vllm_dissag/README.MD

Few files of significance:
Key files:

scripts/vllm_dissag/run_xPyD_models.slurm - slurm script to launch docker containers on all nodes using sbatch or salloc
scripts/vllm_dissag/vllm_disagg_server.sh - Script that runs inside each docker to start required proxy, prefill and decode services
scripts/vllm_dissag/benchmark_xPyD.sh - Benchmark script to run VLLM benchmarking tool for performance measurement
| File | Description |
|------|-------------|
| `run_xPyD_models.slurm` | Slurm script to launch docker containers on all nodes using sbatch |
| `vllm_disagg_server.sh` | Default PD server script (NixlConnector, no expert parallel) |
| `vllm_disagg_mori_ep.sh` | MoRI EP server script (MoRIIOConnector, expert parallel) |
| `vllm_disagg_server_deepep.sh` | DeepEP server script (NixlConnector, DeepEP all2all backends) |
| `benchmark_xPyD.sh` | Benchmark script using vLLM benchmarking tool |

## Sbatch run command (one-liner)
```bash
## Run Modes

The `run_xPyD_models.slurm` script supports three run modes, controlled by the `RUN_MORI` and `RUN_DEEPEP` environment variables. At most one of these may be set to `1`.

| Mode | Env Variable | Server Script | KV Connector | Models |
|------|-------------|---------------|--------------|--------|
| Default (NixlConnector) | Neither set | `vllm_disagg_server.sh` | NixlConnector | All VALID_MODELS |
| MoRI EP | `RUN_MORI=1` | `vllm_disagg_mori_ep.sh` | MoRIIOConnector | DeepSeek-R1 |
| DeepEP | `RUN_DEEPEP=1` | `vllm_disagg_server_deepep.sh` | NixlConnector | DeepSeek-V3, DeepSeek-V3-5layer, DeepSeek-R1 |

If both `RUN_MORI=1` and `RUN_DEEPEP=1` are set, the script exits with an error.
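A sketch of how that mode selection can be expressed; the actual checks live in `run_xPyD_models.slurm` and may be implemented differently:

```shell
# Hypothetical mode-selection logic mirroring the table above;
# not copied from run_xPyD_models.slurm.
RUN_MORI=${RUN_MORI:-0}
RUN_DEEPEP=${RUN_DEEPEP:-0}

if [ "$RUN_MORI" = "1" ] && [ "$RUN_DEEPEP" = "1" ]; then
    echo "ERROR: RUN_MORI and RUN_DEEPEP are mutually exclusive" >&2
    exit 1
elif [ "$RUN_MORI" = "1" ]; then
    SERVER_SCRIPT=vllm_disagg_mori_ep.sh       # MoRIIOConnector, expert parallel
elif [ "$RUN_DEEPEP" = "1" ]; then
    SERVER_SCRIPT=vllm_disagg_server_deepep.sh # NixlConnector + DeepEP all2all
else
    SERVER_SCRIPT=vllm_disagg_server.sh        # default NixlConnector
fi
echo "Using server script: $SERVER_SCRIPT"
```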

## Sbatch Run Commands

# Clone the repo
### Default mode (NixlConnector)
```bash
git clone https://github.com/ROCm/MAD.git
cd scripts/vllm_dissag
cd MAD/scripts/vllm_dissag

export DOCKER_IMAGE_NAME=<DOCKER_IMAGE_NAME>
export xP=1; export yD=1; export MODEL_NAME=DeepSeek-V3
sbatch -N 3 -n 3 --nodelist=<node0,node1,node2> run_xPyD_models.slurm
```

### MoRI EP mode
```bash
export DOCKER_IMAGE_NAME=<DOCKER_IMAGE_NAME>
export RUN_MORI=1
export xP=1; export yD=1; export MODEL_NAME=DeepSeek-R1
sbatch -N 3 -n 3 --nodelist=<node0,node1,node2> run_xPyD_models.slurm
```

### DeepEP mode
```bash
export DOCKER_IMAGE_NAME=<DOCKER_IMAGE_NAME>
export RUN_DEEPEP=1
export xP=1; export yD=1; export MODEL_NAME=DeepSeek-V3
sbatch -N 3 -n 3 --nodelist=<node0,node1,node2> run_xPyD_models.slurm
```

# Sbatch run command [run from the above folder]
export DOCKER_IMAGE_NAME=<DOCKER IMAGE NAME>
export xP=<num_prefill_nodes>; export yD=<num_decode_nodes>; export MODEL_NAME=Llama-3.1-8B-Instruct; sbatch -N <num_nodes> -n <num_nodes> --nodelist=<Nodes> run_xPyD_models.slurm
### DeepEP environment variables (optional)

# num_nodes = xP + xD + 1
| Variable | Default | Description |
|----------|---------|-------------|
| `PREFILL_DEEPEP_BACKEND` | `deepep_high_throughput` | All2all backend for prefill nodes |
| `DECODE_DEEPEP_BACKEND` | `deepep_low_latency` | All2all backend for decode nodes |
| `ENABLE_DBO` | `false` | Enable DBO (dual-batch overlap) |
| `DBO_COMM_SMS` | (vLLM default) | DBO communication SMs override |
| `ENABLE_PROFILING` | `false` | Enable profiling |

`num_nodes = xP + yD + 1` (1 dedicated proxy node + xP prefill nodes + yD decode nodes)

## Node Topology (all modes)

```
Node 0 -> Proxy (dedicated, no vLLM server)
Node 1 -> Prefill MASTER
Nodes 2..xP -> Prefill CHILD (if xP > 1)
Node xP+1 -> Decode MASTER
Nodes xP+2..end -> Decode CHILD (if yD > 1)
```
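The rank-to-role mapping above can be sketched as a small shell function. This is illustrative only, under the assumption that node rank alone determines the role; the slurm script's actual logic may differ:

```shell
# Maps a node's rank to its role for xP prefill nodes, following the
# topology listed above. Decode children occupy every rank past the
# decode master, so yD is not needed as a parameter here.
role_for_rank() {
    rank=$1; xP=$2
    if   [ "$rank" -eq 0 ];           then echo "proxy"
    elif [ "$rank" -eq 1 ];           then echo "prefill-master"
    elif [ "$rank" -le "$xP" ];       then echo "prefill-child"
    elif [ "$rank" -eq $((xP + 1)) ]; then echo "decode-master"
    else                                   echo "decode-child"
    fi
}

# Example: xP=2, yD=2 -> num_nodes = 2 + 2 + 1 = 5 (ranks 0..4)
for r in 0 1 2 3 4; do
    echo "node $r -> $(role_for_rank "$r" 2)"
done
```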

## Proxy Server Options
@@ -60,7 +112,7 @@ export PROXY_TYPE=toy_proxy
# Then run sbatch/srun as usual
```

## Benchmark parser ( for CONCURRENCY logs) to tabulate different data
## Benchmark parser (for CONCURRENCY logs) to tabulate different data

```bash
python3 benchmark_parser.py <log_path>/benchmark_XXX_CONCURRENCY.log
```