From 6fc14d427f7d823d2497fc0d17a5c3c4c7ed7879 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 00:34:03 -0700 Subject: [PATCH 001/278] add idealized disagg prefill benchmark --- .../disagg_benchmarks/disagg_benchmark.sh | 129 ++++++++++++++++++ .../disagg_benchmarks/round_robin_proxy.sh | 20 +++ 2 files changed, 149 insertions(+) create mode 100644 benchmarks/disagg_benchmarks/disagg_benchmark.sh create mode 100644 benchmarks/disagg_benchmarks/round_robin_proxy.sh diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh new file mode 100644 index 000000000000..72c732704a31 --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -0,0 +1,129 @@ +#!/bin/bash + +# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV +# Query: 2048 input tokens, 128 output tokens, QPS 8, 1000 requests +# Resource: 8x H100 +# Approaches: +# 1. Chunked prefill: 1 vllm instance with tp=8 +# 2. Chunked prefill: 2 vllm instance with tp=4 +# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance +# Prefilling instance: max_output_token=1 +# Decoding instance: force the input tokens be the same across requests to bypass prefilling + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. + pkill pt_main_thread + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. 
+ echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl localhost:${port}/v1/completions; do + sleep 1 + done" && return 0 || return 1 +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + + cd "$(dirname "$0")" + + mkdir results + results_folder="./results" + model="neuralmagic/Meta-Llama-3-70B-Instruct-FP8" + dataset_name="sonnet" + dataset_path="../sonnet.txt" + num_prompts=500 + qps=8 + prefix_len=64 + input_len=2048 + output_len=128 + + + # chunked prefill with tp=8 + python3 -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8000 \ + --tp 8 \ + --enable-chunked-prefill & + wait_for_server 8000 + + python3 benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len $output_len \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename chunked_prefill_tp8.json \ + --request-rate $qps + + + # chunked prefill with tp=4 + CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + --tp 4 \ + --enable-chunked-prefill & + + CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + --tp 4 \ + --enable-chunked-prefill & + + wait_for_server 8100 + wait_for_server 8200 + # launch round robin proxy + bash ./round_robin_proxy.sh & + + python3 benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len $output_len \ + 
--sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename chunked_prefill_tp8.json \ + --request-rate $qps + + kill_gpu_processes + pkill -f round_robin_proxy.sh + + + # disaggregated prefill + + + + +} \ No newline at end of file diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.sh b/benchmarks/disagg_benchmarks/round_robin_proxy.sh new file mode 100644 index 000000000000..e996756bc89d --- /dev/null +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Define the ports to forward to +PORTS=(8100 8200) +NUM_PORTS=${#PORTS[@]} +CURRENT=0 + +# Function to handle the round-robin logic +get_next_port() { + NEXT_PORT=${PORTS[$CURRENT]} + CURRENT=$(( (CURRENT + 1) % NUM_PORTS )) + echo $NEXT_PORT +} + +# Start the proxy +while true; do + NEXT_PORT=$(get_next_port) + echo "Forwarding to port $NEXT_PORT" + socat TCP4-LISTEN:8000,reuseaddr,fork TCP4:localhost:$NEXT_PORT +done \ No newline at end of file From 69d151487d238a25989451b60dfbc533cef51edd Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 16:30:46 -0700 Subject: [PATCH 002/278] add main --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index 72c732704a31..9ae0071a9130 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -1,5 +1,8 @@ #!/bin/bash +# Requirement: 8x H100 GPUs. 
+ + # Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV # Query: 2048 input tokens, 128 output tokens, QPS 8, 1000 requests # Resource: 8x H100 @@ -123,7 +126,7 @@ main() { # disaggregated prefill +} - -} \ No newline at end of file +main "$@" \ No newline at end of file From 2bc8e7931db2358dfa9aa05c991465fb8878f8b4 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 16:31:50 -0700 Subject: [PATCH 003/278] fix typo --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index 9ae0071a9130..faace2082ee7 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -65,7 +65,7 @@ main() { python3 -m vllm.entrypoints.openai.api_server \ --model $model \ --port 8000 \ - --tp 8 \ + -tp 8 \ --enable-chunked-prefill & wait_for_server 8000 @@ -90,14 +90,14 @@ main() { -m vllm.entrypoints.openai.api_server \ --model $model \ --port 8100 \ - --tp 4 \ + -tp 4 \ --enable-chunked-prefill & CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -m vllm.entrypoints.openai.api_server \ --model $model \ --port 8200 \ - --tp 4 \ + -tp 4 \ --enable-chunked-prefill & wait_for_server 8100 From 3ea715dbf6872766c424587378b29d1e9a503c84 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 16:32:20 -0700 Subject: [PATCH 004/278] use mkdir -p to avoid error --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index faace2082ee7..8c0ed61d284a 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -49,7 +49,7 @@ main() { cd "$(dirname "$0")" - mkdir results + mkdir -p results results_folder="./results" 
model="neuralmagic/Meta-Llama-3-70B-Instruct-FP8" dataset_name="sonnet" From 3656f8aa7aa0cc2e1ede9b3743263051b4382421 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 16:38:31 -0700 Subject: [PATCH 005/278] fix bug --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index 8c0ed61d284a..ae5034cc3b61 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -69,7 +69,7 @@ main() { --enable-chunked-prefill & wait_for_server 8000 - python3 benchmark_serving.py \ + python3 ../benchmark_serving.py \ --backend vllm \ --model $model \ --dataset-name $dataset_name \ @@ -105,7 +105,7 @@ main() { # launch round robin proxy bash ./round_robin_proxy.sh & - python3 benchmark_serving.py \ + python3 ../benchmark_serving.py \ --backend vllm \ --model $model \ --dataset-name $dataset_name \ From f8cb6fcb91cfffb78d6eaf3668cd549abceebf8c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 16:45:27 -0700 Subject: [PATCH 006/278] disable log request from vllm server, and mute curl --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index ae5034cc3b61..d1d5cc721c3d 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -35,7 +35,7 @@ wait_for_server() { # return 1 if vllm server crashes local port=$1 timeout 1200 bash -c " - until curl localhost:${port}/v1/completions; do + until curl -s localhost:${port}/v1/completions > /dev/null; do sleep 1 done" && return 0 || return 1 } @@ -66,6 +66,8 @@ main() { --model $model \ --port 8000 \ -tp 8 \ + --disable-log-stats \ + --disable-log-requests \ 
--enable-chunked-prefill & wait_for_server 8000 @@ -91,6 +93,8 @@ main() { --model $model \ --port 8100 \ -tp 4 \ + --disable-log-stats \ + --disable-log-requests \ --enable-chunked-prefill & CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ @@ -98,6 +102,8 @@ main() { --model $model \ --port 8200 \ -tp 4 \ + --disable-log-stats \ + --disable-log-requests \ --enable-chunked-prefill & wait_for_server 8100 From d4b23c079e6a745a4c46a27ab4d9e098d49cca84 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 16:57:47 -0700 Subject: [PATCH 007/278] add disaggregated prefilling benchmark --- .../disagg_benchmarks/disagg_benchmark.sh | 56 ++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index d1d5cc721c3d..8f229ef26f63 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -85,6 +85,7 @@ main() { --result-dir $results_folder \ --result-filename chunked_prefill_tp8.json \ --request-rate $qps + kill_gpu_processes # chunked prefill with tp=4 @@ -125,13 +126,66 @@ main() { --result-dir $results_folder \ --result-filename chunked_prefill_tp8.json \ --request-rate $qps - kill_gpu_processes pkill -f round_robin_proxy.sh # disaggregated prefill + # prefill with tp=4 + python3 -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8000 \ + -tp 4 \ + --disable-log-stats \ + --disable-log-requests & + wait_for_server 8000 + + # set output-len to 1 so that it only do prefilling + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len 1 \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_tp4.json \ + --request-rate $qps + 
kill_gpu_processes + + # decode with tp=4, enable APC + python3 -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8000 \ + -tp 4 \ + --enable-prefix-caching \ + --disable-log-stats \ + --disable-log-requests & + wait_for_server 8000 + + # skip prefilling + # by enabling APC and force the input tokens be the same + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len $output_len \ + --sonnet-prefix-len $input_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_decode_tp4.json \ + --request-rate $qps + kill_gpu_processes + } From a9426631083ee96df1dee55f58dfd0e5261112f6 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 18:34:03 -0700 Subject: [PATCH 008/278] do not launch 2 vllm instance --- .../disagg_benchmarks/disagg_benchmark.sh | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index 8f229ef26f63..f2150fab3b63 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -98,19 +98,19 @@ main() { --disable-log-requests \ --enable-chunked-prefill & - CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8200 \ - -tp 4 \ - --disable-log-stats \ - --disable-log-requests \ - --enable-chunked-prefill & + # CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + # -m vllm.entrypoints.openai.api_server \ + # --model $model \ + # --port 8200 \ + # -tp 4 \ + # --disable-log-stats \ + # --disable-log-requests \ + # --enable-chunked-prefill & wait_for_server 8100 - wait_for_server 8200 - # launch round robin proxy - bash ./round_robin_proxy.sh & + # wait_for_server 8200 + # # launch round robin 
proxy + # bash ./round_robin_proxy.sh & python3 ../benchmark_serving.py \ --backend vllm \ @@ -121,13 +121,13 @@ main() { --sonnet-output-len $output_len \ --sonnet-prefix-len $prefix_len \ --num-prompts $num_prompts \ - --port 8000 \ + --port 8100 \ --save-result \ --result-dir $results_folder \ --result-filename chunked_prefill_tp8.json \ - --request-rate $qps + --request-rate $((qps / 2)) kill_gpu_processes - pkill -f round_robin_proxy.sh + # pkill -f round_robin_proxy.sh # disaggregated prefill From 540d36260c6b8322fa444a2b42a7ebb49e456ecf Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 18:38:55 -0700 Subject: [PATCH 009/278] reduce # of prompt to half --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index f2150fab3b63..cbc372c5f3d3 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -120,7 +120,7 @@ main() { --sonnet-input-len $input_len \ --sonnet-output-len $output_len \ --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ + --num-prompts $((num_prompts / 2)) \ --port 8100 \ --save-result \ --result-dir $results_folder \ From 4b0a7ff77f8059b98f7387c75c66e0ca151ac416 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 18:48:58 -0700 Subject: [PATCH 010/278] reduce input len by 1 --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index cbc372c5f3d3..9a53d96470b8 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -177,7 +177,7 @@ main() { --dataset-path $dataset_path \ --sonnet-input-len $input_len \ --sonnet-output-len $output_len \ - --sonnet-prefix-len 
$input_len \ + --sonnet-prefix-len $((input_len - 1)) \ --num-prompts $num_prompts \ --port 8000 \ --save-result \ From 298965614e2ffd0264a2e80afed281894f4fa297 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 19:39:09 -0700 Subject: [PATCH 011/278] adjust filename --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index 9a53d96470b8..fe172b075e0c 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -57,7 +57,7 @@ main() { num_prompts=500 qps=8 prefix_len=64 - input_len=2048 + input_len=8192 output_len=128 @@ -124,7 +124,7 @@ main() { --port 8100 \ --save-result \ --result-dir $results_folder \ - --result-filename chunked_prefill_tp8.json \ + --result-filename chunked_prefill_tp4.json \ --request-rate $((qps / 2)) kill_gpu_processes # pkill -f round_robin_proxy.sh From 69f729c0384818a2e1cac4b858c01414a7fc7978 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 19:47:06 -0700 Subject: [PATCH 012/278] create 4x sonnet --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index fe172b075e0c..725d050aac53 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -8,7 +8,7 @@ # Resource: 8x H100 # Approaches: # 1. Chunked prefill: 1 vllm instance with tp=8 -# 2. Chunked prefill: 2 vllm instance with tp=4 +# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 # 3. 
Disaggregated prefill: 1 prefilling instance and 1 decoding instance # Prefilling instance: max_output_token=1 # Decoding instance: force the input tokens be the same across requests to bypass prefilling @@ -49,11 +49,21 @@ main() { cd "$(dirname "$0")" + cd .. + # create sonnet-4x.txt + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + mkdir -p results results_folder="./results" model="neuralmagic/Meta-Llama-3-70B-Instruct-FP8" dataset_name="sonnet" - dataset_path="../sonnet.txt" + dataset_path="../sonnet_4x.txt" num_prompts=500 qps=8 prefix_len=64 From 43e1e5e1f1876524a0e65112b04807114638e7ab Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 21:34:33 -0700 Subject: [PATCH 013/278] adjust setup --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index 725d050aac53..11c9e25ca120 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -65,10 +65,10 @@ main() { dataset_name="sonnet" dataset_path="../sonnet_4x.txt" num_prompts=500 - qps=8 + qps=4 prefix_len=64 - input_len=8192 - output_len=128 + input_len=2048 + output_len=11 # chunked prefill with tp=8 From 29a7b88c4e708aebb9df903eeb2e0eb0bc260ccd Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 22:46:37 -0700 Subject: [PATCH 014/278] add benchmark --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index 11c9e25ca120..6c68011acd95 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -4,7 +4,7 @@ # Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV -# Query: 2048 
input tokens, 128 output tokens, QPS 8, 1000 requests +# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests # Resource: 8x H100 # Approaches: # 1. Chunked prefill: 1 vllm instance with tp=8 From 4d31316f4e804181cede778823643ecaced22091 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 8 Jul 2024 16:12:02 -0700 Subject: [PATCH 015/278] allow prefix input len == sonnet input len --- benchmarks/benchmark_serving.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 42867fc40edd..49dfe780812f 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -123,9 +123,9 @@ def sample_sonnet_requests( prefix_len: int, tokenizer: PreTrainedTokenizerBase, ) -> List[Tuple[str, str, int, int]]: - assert ( - input_len > prefix_len - ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'." + assert input_len >= prefix_len, ( + "'args.sonnet-input-len' must be greater than or equal to " + "'args.prefix-input-len'.") # Load the dataset. 
with open(dataset_path) as f: From 4e336fcdf68cba994a181e7b3c16188c0dfb2efe Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 8 Jul 2024 16:42:19 -0700 Subject: [PATCH 016/278] add parameter sweeping --- .../analyze_benchmark_result.py | 47 +++++++ .../disagg_benchmarks/disagg_benchmark.sh | 118 ++++++++---------- 2 files changed, 97 insertions(+), 68 deletions(-) create mode 100644 benchmarks/disagg_benchmarks/analyze_benchmark_result.py diff --git a/benchmarks/disagg_benchmarks/analyze_benchmark_result.py b/benchmarks/disagg_benchmarks/analyze_benchmark_result.py new file mode 100644 index 000000000000..0f7ea8f2b654 --- /dev/null +++ b/benchmarks/disagg_benchmarks/analyze_benchmark_result.py @@ -0,0 +1,47 @@ + +import argparse +import json +import yaml +import os +from pathlib import Path + +def load(path): + + with open(str(path), 'r') as f: + return json.loads(f.read()) + +def main(args): + + results = Path(args.results_folder) + + chunk = load(results / "chunked_prefill_tp4.json") + prefill = load(results / "disagg_prefill_tp4.json") + decode = load(results / "disagg_decode_tp4.json") + + ttft_ratio = chunk["mean_ttft_ms"] / prefill["mean_ttft_ms"] + itl_ratio = chunk["mean_itl_ms"] / decode["mean_itl_ms"] + prefill_decode_ratio = prefill["mean_ttft_ms"] / (decode["mean_itl_ms"] * args.output_len) + + with open(results / args.output_file, 'a') as f: + f.write(yaml.dump([{ + 'qps': args.qps, + 'output_len': args.output_len, + 'prefill_decode_ratio': prefill_decode_ratio, + 'ttft_ratio': ttft_ratio, + 'itl_ratio': itl_ratio, + "chunk_ttft": chunk["mean_ttft_ms"], + "chunk_itl": chunk["mean_itl_ms"], + "disagg_ttft": prefill["mean_ttft_ms"], + "disagg_itl": decode["mean_itl_ms"] + }])) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Analyze benchmark results") + parser.add_argument("--results-folder", required=True, help="Path to the results folder") + parser.add_argument("--output-len", type=int, required=True, help="Target 
output length") + parser.add_argument("--qps", type=int, required=True, help="Target QPS") + parser.add_argument("--output-file", type=str, default="chunk_vs_disagg.yaml") + + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index 6c68011acd95..7736ab2439b8 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -41,86 +41,31 @@ wait_for_server() { } -main() { - - (which wget && which curl) || (apt-get update && apt-get install -y wget curl) - (which jq) || (apt-get -y install jq) - (which socat) || (apt-get -y install socat) +benchmark() { - cd "$(dirname "$0")" + # compare chunked prefill with disaggregated prefill - cd .. - # create sonnet-4x.txt - echo "" > sonnet_4x.txt - for _ in {1..4} - do - cat sonnet.txt >> sonnet_4x.txt - done - cd disagg_benchmarks - - - mkdir -p results results_folder="./results" model="neuralmagic/Meta-Llama-3-70B-Instruct-FP8" dataset_name="sonnet" dataset_path="../sonnet_4x.txt" num_prompts=500 - qps=4 + qps=$1 prefix_len=64 input_len=2048 - output_len=11 - - - # chunked prefill with tp=8 - python3 -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8000 \ - -tp 8 \ - --disable-log-stats \ - --disable-log-requests \ - --enable-chunked-prefill & - wait_for_server 8000 - - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len $output_len \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8000 \ - --save-result \ - --result-dir $results_folder \ - --result-filename chunked_prefill_tp8.json \ - --request-rate $qps - kill_gpu_processes + output_len=$2 # chunked prefill with tp=4 CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ -m vllm.entrypoints.openai.api_server \ --model 
$model \ - --port 8100 \ + --port 8000 \ -tp 4 \ --disable-log-stats \ --disable-log-requests \ --enable-chunked-prefill & - - # CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ - # -m vllm.entrypoints.openai.api_server \ - # --model $model \ - # --port 8200 \ - # -tp 4 \ - # --disable-log-stats \ - # --disable-log-requests \ - # --enable-chunked-prefill & - - wait_for_server 8100 - # wait_for_server 8200 - # # launch round robin proxy - # bash ./round_robin_proxy.sh & + wait_for_server 8000 python3 ../benchmark_serving.py \ --backend vllm \ @@ -131,17 +76,15 @@ main() { --sonnet-output-len $output_len \ --sonnet-prefix-len $prefix_len \ --num-prompts $((num_prompts / 2)) \ - --port 8100 \ + --port 8000 \ --save-result \ --result-dir $results_folder \ --result-filename chunked_prefill_tp4.json \ --request-rate $((qps / 2)) kill_gpu_processes - # pkill -f round_robin_proxy.sh # disaggregated prefill - # prefill with tp=4 python3 -m vllm.entrypoints.openai.api_server \ --model $model \ @@ -150,7 +93,6 @@ main() { --disable-log-stats \ --disable-log-requests & wait_for_server 8000 - # set output-len to 1 so that it only do prefilling python3 ../benchmark_serving.py \ --backend vllm \ @@ -177,7 +119,6 @@ main() { --disable-log-stats \ --disable-log-requests & wait_for_server 8000 - # skip prefilling # by enabling APC and force the input tokens be the same python3 ../benchmark_serving.py \ @@ -187,7 +128,7 @@ main() { --dataset-path $dataset_path \ --sonnet-input-len $input_len \ --sonnet-output-len $output_len \ - --sonnet-prefix-len $((input_len - 1)) \ + --sonnet-prefix-len $input_len \ --num-prompts $num_prompts \ --port 8000 \ --save-result \ @@ -196,7 +137,48 @@ main() { --request-rate $qps kill_gpu_processes + python3 analyze_results.py \ + --results-folder $results_folder \ + --output-len $output_len \ + --qps $qps + +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) 
|| (apt-get -y install socat) + + cd "$(dirname "$0")" + + cd .. + # create sonnet-4x.txt + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_qps=4 + default_output_len=12 + + for target_qps in 1 2 4 8 16 + do + benchmark $target_qps $default_output_len + done + + for output_len in 5 10 20 40 80 + do + benchmark $default_qps $output_len + done + } -main "$@" \ No newline at end of file +main "$@" From 2770c61dd5f96622be6c07f42032b58265837c10 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 8 Jul 2024 16:47:10 -0700 Subject: [PATCH 017/278] aadjust firmat --- benchmarks/disagg_benchmarks/analyze_benchmark_result.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/benchmarks/disagg_benchmarks/analyze_benchmark_result.py b/benchmarks/disagg_benchmarks/analyze_benchmark_result.py index 0f7ea8f2b654..4b675c675d25 100644 --- a/benchmarks/disagg_benchmarks/analyze_benchmark_result.py +++ b/benchmarks/disagg_benchmarks/analyze_benchmark_result.py @@ -44,4 +44,5 @@ def main(args): parser.add_argument("--output-file", type=str, default="chunk_vs_disagg.yaml") args = parser.parse_args() - main(args) \ No newline at end of file + main(args) + \ No newline at end of file From 80061d2f3276aa4490ea416c3d5d2a98d18f4457 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 8 Jul 2024 16:47:48 -0700 Subject: [PATCH 018/278] rename script --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index 7736ab2439b8..bf7ef9dfa3ef 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -137,7 +137,7 @@ benchmark() { --request-rate $qps kill_gpu_processes - python3 analyze_results.py \ + python3 analyze_benchmark_results.py \ 
--results-folder $results_folder \ --output-len $output_len \ --qps $qps From 8c0a9dc0183b9110695e093c3d78069d05daaa72 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 8 Jul 2024 16:48:02 -0700 Subject: [PATCH 019/278] align naming --- .../{analyze_benchmark_result.py => analyze_benchmark_results.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename benchmarks/disagg_benchmarks/{analyze_benchmark_result.py => analyze_benchmark_results.py} (100%) diff --git a/benchmarks/disagg_benchmarks/analyze_benchmark_result.py b/benchmarks/disagg_benchmarks/analyze_benchmark_results.py similarity index 100% rename from benchmarks/disagg_benchmarks/analyze_benchmark_result.py rename to benchmarks/disagg_benchmarks/analyze_benchmark_results.py From 7d84965280d185f2eb8d4381e1936f146612121c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 8 Jul 2024 17:01:52 -0700 Subject: [PATCH 020/278] adjust qps --- benchmarks/disagg_benchmarks/disagg_benchmark.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index bf7ef9dfa3ef..c3e652b45685 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -168,14 +168,14 @@ main() { default_qps=4 default_output_len=12 - for target_qps in 1 2 4 8 16 + for target_qps in 2 4 8 16 do benchmark $target_qps $default_output_len done - for output_len in 5 10 20 40 80 + for target_output_len in 5 10 20 40 80 do - benchmark $default_qps $output_len + benchmark $default_qps $target_output_len done } From 5ac5249d5347d4cf0d384b40cfde1a55f068ce1a Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 8 Jul 2024 20:06:15 -0700 Subject: [PATCH 021/278] adjust swap range --- .../disagg_benchmarks/disagg_benchmark.sh | 10 +-- .../results/chunk_vs_disagg.yaml | 81 +++++++++++++++++++ .../visualize_benchmark_results.py | 73 +++++++++++++++++ 3 files changed, 159 
insertions(+), 5 deletions(-) create mode 100644 benchmarks/disagg_benchmarks/results/chunk_vs_disagg.yaml create mode 100644 benchmarks/disagg_benchmarks/visualize_benchmark_results.py diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index c3e652b45685..c8a7cba02a70 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -166,17 +166,17 @@ main() { mkdir results default_qps=4 - default_output_len=12 + default_output_len=150 for target_qps in 2 4 8 16 do benchmark $target_qps $default_output_len done - for target_output_len in 5 10 20 40 80 - do - benchmark $default_qps $target_output_len - done + # for target_output_len in 5 10 20 40 80 + # do + # benchmark $default_qps $target_output_len + # done } diff --git a/benchmarks/disagg_benchmarks/results/chunk_vs_disagg.yaml b/benchmarks/disagg_benchmarks/results/chunk_vs_disagg.yaml new file mode 100644 index 000000000000..cbbb8de8826d --- /dev/null +++ b/benchmarks/disagg_benchmarks/results/chunk_vs_disagg.yaml @@ -0,0 +1,81 @@ +- chunk_itl: 35.312214966863394 + chunk_ttft: 197.25125090777874 + disagg_itl: 21.324921273315947 + disagg_ttft: 152.8801853582263 + itl_ratio: 1.6559130284363517 + output_len: 12 + prefill_decode_ratio: 0.597423797407428 + qps: 2 + ttft_ratio: 1.290234247463678 +- chunk_itl: 41.504599403589964 + chunk_ttft: 229.26480463147163 + disagg_itl: 22.99003972671926 + disagg_ttft: 199.59523537009954 + itl_ratio: 1.8053296078193763 + output_len: 12 + prefill_decode_ratio: 0.723484451464895 + qps: 4 + ttft_ratio: 1.1486486849566186 +- chunk_itl: 63.580538438012205 + chunk_ttft: 379.72798639535904 + disagg_itl: 29.123107485473156 + disagg_ttft: 508.8736157938838 + itl_ratio: 2.183164638928957 + output_len: 12 + prefill_decode_ratio: 1.4560992390885088 + qps: 8 + ttft_ratio: 0.7462127620881912 +- chunk_itl: 438.7920691122612 + chunk_ttft: 4792.676218897104 + disagg_itl: 
38.97295152582228 + disagg_ttft: 10359.5165893808 + itl_ratio: 11.258887303455348 + output_len: 12 + prefill_decode_ratio: 22.151082104804797 + qps: 16 + ttft_ratio: 0.4626351217787439 +- chunk_itl: 65.75006234049798 + chunk_ttft: 219.36342687904835 + disagg_itl: 28.58696384578943 + disagg_ttft: 199.52613697946072 + itl_ratio: 2.300001591466045 + output_len: 5 + prefill_decode_ratio: 1.395923946703762 + qps: 4 + ttft_ratio: 1.099422011571495 +- chunk_itl: 45.51790158599616 + chunk_ttft: 231.06786338984966 + disagg_itl: 24.55629511550069 + disagg_ttft: 200.25028175115585 + itl_ratio: 1.8536143734998467 + output_len: 10 + prefill_decode_ratio: 0.815474324645788 + qps: 4 + ttft_ratio: 1.1538953222397448 +- chunk_itl: 32.62334335371852 + chunk_ttft: 224.4068773239851 + disagg_itl: 22.356921216845514 + disagg_ttft: 199.03663477301598 + itl_ratio: 1.4592055425385428 + output_len: 20 + prefill_decode_ratio: 0.44513426701849645 + qps: 4 + ttft_ratio: 1.1274651904153308 +- chunk_itl: 28.700303505733608 + chunk_ttft: 238.22125577926636 + disagg_itl: 21.38163771480322 + disagg_ttft: 200.1644251793623 + itl_ratio: 1.3422874285192585 + output_len: 40 + prefill_decode_ratio: 0.23403776157050613 + qps: 4 + ttft_ratio: 1.1901278439752834 +- chunk_itl: 25.861735691688956 + chunk_ttft: 237.75536592304707 + disagg_itl: 22.10882957596332 + disagg_ttft: 200.66210460662842 + itl_ratio: 1.1697469376581466 + output_len: 80 + prefill_decode_ratio: 0.11345133847835387 + qps: 4 + ttft_ratio: 1.184854342025043 \ No newline at end of file diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py new file mode 100644 index 000000000000..1d5c3536736d --- /dev/null +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -0,0 +1,73 @@ + +import matplotlib.pyplot as plt +import yaml +import pandas as pd +from tabulate import tabulate + + +def stringify(x): + return [str(i) for i in x] + + +if __name__ == 
"__main__": + + with open("results/chunk_vs_disagg.yaml", "r") as f: + data = yaml.load(f, Loader=yaml.FullLoader) + df = pd.DataFrame.from_dict(data) + + print_df = df.copy() + print_df.drop(columns=[ + "ttft_ratio", + "itl_ratio", + "prefill_decode_ratio", + ], inplace=True) + print_df.to_csv('results/chunk_vs_disagg.csv', index=False) + + df["chunk_e2e"] = df["chunk_ttft"] + df["chunk_itl"] * df["output_len"] + df["disagg_e2e"] = df["disagg_ttft"] + df["disagg_itl"] * df["output_len"] + df["e2e_ratio"] = df["chunk_e2e"] / df["disagg_e2e"] + + plt.rcParams['font.size'] = 20 + + + # qps vs performance + qps_df = df[df["output_len"] == 12].copy() + qps_df.drop(columns=[ + "chunk_itl", + "chunk_ttft", + "disagg_itl", + "disagg_ttft", + "output_len", + "prefill_decode_ratio", + ], inplace=True) + fig, ax = plt.subplots(figsize=(10, 7)) + qps_df.plot( + ax=ax, + kind="bar", + x="qps", + y=["ttft_ratio", "itl_ratio", "e2e_ratio"], + ylabel="$T_{chunked}~/~T_{disagg}$", + rot=0, + ) + ax.hlines(1, -1, 5, color='black') + fig.savefig('results/qps.png') + plt.close(fig) + + + # prefill decode ratio vs performance + tokens_df = df[df["output_len"] != 12] + fig, ax = plt.subplots(figsize=(10, 7)) + tokens_df.plot( + ax=ax, + kind="bar", + x="output_len", + xlabel="# of output tokens", + y=["ttft_ratio", "itl_ratio", "e2e_ratio", "prefill_decode_ratio"], + ylabel="$T_{chunked}~/~T_{disagg}$", + rot=0, + ) + ax.hlines(1, -1, 5, color='black') + fig.savefig('results/tokens.png') + plt.close(fig) + + \ No newline at end of file From 8f259853a4bca740988e0eaff7619c2f1863b4a3 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 8 Jul 2024 20:06:41 -0700 Subject: [PATCH 022/278] remove results --- .../results/chunk_vs_disagg.yaml | 81 ------------------- 1 file changed, 81 deletions(-) delete mode 100644 benchmarks/disagg_benchmarks/results/chunk_vs_disagg.yaml diff --git a/benchmarks/disagg_benchmarks/results/chunk_vs_disagg.yaml 
b/benchmarks/disagg_benchmarks/results/chunk_vs_disagg.yaml deleted file mode 100644 index cbbb8de8826d..000000000000 --- a/benchmarks/disagg_benchmarks/results/chunk_vs_disagg.yaml +++ /dev/null @@ -1,81 +0,0 @@ -- chunk_itl: 35.312214966863394 - chunk_ttft: 197.25125090777874 - disagg_itl: 21.324921273315947 - disagg_ttft: 152.8801853582263 - itl_ratio: 1.6559130284363517 - output_len: 12 - prefill_decode_ratio: 0.597423797407428 - qps: 2 - ttft_ratio: 1.290234247463678 -- chunk_itl: 41.504599403589964 - chunk_ttft: 229.26480463147163 - disagg_itl: 22.99003972671926 - disagg_ttft: 199.59523537009954 - itl_ratio: 1.8053296078193763 - output_len: 12 - prefill_decode_ratio: 0.723484451464895 - qps: 4 - ttft_ratio: 1.1486486849566186 -- chunk_itl: 63.580538438012205 - chunk_ttft: 379.72798639535904 - disagg_itl: 29.123107485473156 - disagg_ttft: 508.8736157938838 - itl_ratio: 2.183164638928957 - output_len: 12 - prefill_decode_ratio: 1.4560992390885088 - qps: 8 - ttft_ratio: 0.7462127620881912 -- chunk_itl: 438.7920691122612 - chunk_ttft: 4792.676218897104 - disagg_itl: 38.97295152582228 - disagg_ttft: 10359.5165893808 - itl_ratio: 11.258887303455348 - output_len: 12 - prefill_decode_ratio: 22.151082104804797 - qps: 16 - ttft_ratio: 0.4626351217787439 -- chunk_itl: 65.75006234049798 - chunk_ttft: 219.36342687904835 - disagg_itl: 28.58696384578943 - disagg_ttft: 199.52613697946072 - itl_ratio: 2.300001591466045 - output_len: 5 - prefill_decode_ratio: 1.395923946703762 - qps: 4 - ttft_ratio: 1.099422011571495 -- chunk_itl: 45.51790158599616 - chunk_ttft: 231.06786338984966 - disagg_itl: 24.55629511550069 - disagg_ttft: 200.25028175115585 - itl_ratio: 1.8536143734998467 - output_len: 10 - prefill_decode_ratio: 0.815474324645788 - qps: 4 - ttft_ratio: 1.1538953222397448 -- chunk_itl: 32.62334335371852 - chunk_ttft: 224.4068773239851 - disagg_itl: 22.356921216845514 - disagg_ttft: 199.03663477301598 - itl_ratio: 1.4592055425385428 - output_len: 20 - prefill_decode_ratio: 
0.44513426701849645 - qps: 4 - ttft_ratio: 1.1274651904153308 -- chunk_itl: 28.700303505733608 - chunk_ttft: 238.22125577926636 - disagg_itl: 21.38163771480322 - disagg_ttft: 200.1644251793623 - itl_ratio: 1.3422874285192585 - output_len: 40 - prefill_decode_ratio: 0.23403776157050613 - qps: 4 - ttft_ratio: 1.1901278439752834 -- chunk_itl: 25.861735691688956 - chunk_ttft: 237.75536592304707 - disagg_itl: 22.10882957596332 - disagg_ttft: 200.66210460662842 - itl_ratio: 1.1697469376581466 - output_len: 80 - prefill_decode_ratio: 0.11345133847835387 - qps: 4 - ttft_ratio: 1.184854342025043 \ No newline at end of file From 2363fa09f2b0ce4f18b8bd935d4f547dd01623bc Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 9 Jul 2024 16:26:44 -0700 Subject: [PATCH 023/278] adjust benchmark results so that there are 150 output tokens by default. Much more realistic --- benchmarks/disagg_benchmarks/visualize_benchmark_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index 1d5c3536736d..8686fb2abf9b 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -31,7 +31,7 @@ def stringify(x): # qps vs performance - qps_df = df[df["output_len"] == 12].copy() + qps_df = df[df["output_len"] == 150].copy() qps_df.drop(columns=[ "chunk_itl", "chunk_ttft", From 3db988c713691052f0517d877f76d100a9c015f6 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 16 Jul 2024 23:49:59 -0700 Subject: [PATCH 024/278] add example usage for disaggregated prefill --- examples/disaggregated_prefill_example.sh | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 examples/disaggregated_prefill_example.sh diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh new file mode 100644 index 000000000000..7adcc6ec35b6 
--- /dev/null +++ b/examples/disaggregated_prefill_example.sh @@ -0,0 +1,22 @@ + + +# prefilling instance +VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ + --port 8100 \ + -tp 4 \ + --disable-log-stats \ + --disable-log-requests \ + --enable-chunked-prefill & + +sleep 2 + +VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ + --port 8200 \ + -tp 4 \ + --disable-log-stats \ + --disable-log-requests \ + --enable-chunked-prefill & \ No newline at end of file From 00e46de261589b53ec9969d986b4dc8f2b726a63 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 16 Jul 2024 23:50:26 -0700 Subject: [PATCH 025/278] add environment variable for disaggregated prefill --- vllm/config.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 1ea288879680..9f9fa938465c 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -662,7 +662,15 @@ def __init__( self.ray_workers_use_nsight = ray_workers_use_nsight self.placement_group = placement_group - self.world_size = pipeline_parallel_size * self.tensor_parallel_size + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + # Disaggregated prefilling is enabled + # There will be 2 copies of vLLM + # One for prefilling and one for decoding + self.disagg_prefill_size = 2 + else: + self.disagg_prefill_size = 1 + + self.world_size = pipeline_parallel_size * tensor_parallel_size * self.disagg_prefill_size if worker_use_ray: if self.distributed_executor_backend is None: self.distributed_executor_backend = "ray" From de434d977e3181da74e44efe2be9925e99481e4c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 16 Jul 2024 23:50:59 -0700 Subject: [PATCH 026/278] add a new distributed group for disaggregated prefill NCCL communication --- 
vllm/distributed/parallel_state.py | 55 ++++++++++++++++++++++++++++-- vllm/envs.py | 5 +++ 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 66ffe6e8a9fa..eb3d32aeebb4 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -754,6 +754,14 @@ def get_pp_group() -> GroupCoordinator: "pipeline model parallel group is not initialized") return _PP + +_DISAGG: Optional[GroupCoordinator] = None + +def get_disagg_group() -> GroupCoordinator: + assert _DISAGG is not None, ( + "disaggregated prefilling parallel group is not initialized") + return _DISAGG + # kept for backward compatibility get_pipeline_model_parallel_group = get_pp_group @@ -827,6 +835,28 @@ def init_distributed_environment( else: assert _WORLD.world_size == torch.distributed.get_world_size(), ( "world group already initialized with a different world size") + + +def extend_distributed_group_with_offset( + groups: List[List[int]], + offset: int, +) -> List[List[int]]: + """ + Extend original distributed group. + The extended part will be the original distributed group plus an offset. + + Arguments: + groups: original distributed group + offset: the offset we want to apply to the duplicated group. 
+ Typically world_size // 2 + """ + + new_groups = [] + for group in groups: + new_groups.append([rank for rank in group]) + new_groups.append([rank + offset for rank in group]) + + return new_groups def initialize_model_parallel( @@ -862,15 +892,24 @@ def initialize_model_parallel( backend = backend or torch.distributed.get_backend( get_world_group().device_group) + + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + # Disaggregated prefilling is enabled + # There will be 2 copies of vLLM + # One for prefilling and one for decoding + disagg_prefill_size = 2 + else: + disagg_prefill_size = 1 + if (world_size != - tensor_model_parallel_size * pipeline_model_parallel_size): + tensor_model_parallel_size * pipeline_model_parallel_size * disagg_prefill_size): raise RuntimeError( f"world_size ({world_size}) is not equal to " f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " f"pipeline_model_parallel_size ({pipeline_model_parallel_size})") # Build the tensor model-parallel groups. - num_tensor_model_parallel_groups: int = (world_size // + num_tensor_model_parallel_groups: int = (world_size // disagg_prefill_size // tensor_model_parallel_size) global _TP assert _TP is None, ("tensor model parallel group is already initialized") @@ -880,6 +919,12 @@ def initialize_model_parallel( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) group_ranks.append(ranks) + # extend the distributed group if disaggregated prefilling is enabled + if disagg_prefill_size > 1: + group_ranks = extend_distributed_group_with_offset( + group_ranks, + world_size // disagg_prefill_size + ) _TP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend) @@ -893,6 +938,12 @@ def initialize_model_parallel( for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) group_ranks.append(ranks) + # extend the distributed group if disaggregated prefilling is enabled + if disagg_prefill_size 
> 1: + group_ranks = extend_distributed_group_with_offset( + group_ranks, + world_size // disagg_prefill_size + ) # pipeline parallel does not need custom allreduce _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, diff --git a/vllm/envs.py b/vllm/envs.py index c624510c7ea1..0c8d9ae2b8d4 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -251,6 +251,11 @@ lambda: os.getenv("VLLM_XLA_CACHE_PATH", "~/.vllm/xla_cache/"), "VLLM_FUSED_MOE_CHUNK_SIZE": lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")), + + # Specify the role of current vllm instance + # Value can be "prefill", "decode" or None. + "VLLM_DISAGG_PREFILL_ROLE": + lambda: os.getenv("VLLM_DISAGG_PREFILL_ROLE", None), } # end-env-vars-definition From f157f6b42d6348a98a4685bc2b360ea35241b19e Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 17:37:25 -0700 Subject: [PATCH 027/278] only inflate the world size inside parallel_state.py --- examples/disaggregated_prefill_example.sh | 3 ++- vllm/config.py | 10 +--------- vllm/distributed/parallel_state.py | 12 +++++++++++- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index 7adcc6ec35b6..01b30433697c 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -1,4 +1,5 @@ +export VLLM_LOGGING_LEVEL=DEBUG # prefilling instance VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ @@ -10,7 +11,7 @@ VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ --disable-log-requests \ --enable-chunked-prefill & -sleep 2 +sleep 10 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -m vllm.entrypoints.openai.api_server \ diff --git a/vllm/config.py b/vllm/config.py index 9f9fa938465c..5586132c8be6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -662,15 +662,7 @@ def __init__( self.ray_workers_use_nsight = ray_workers_use_nsight 
self.placement_group = placement_group - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - # Disaggregated prefilling is enabled - # There will be 2 copies of vLLM - # One for prefilling and one for decoding - self.disagg_prefill_size = 2 - else: - self.disagg_prefill_size = 1 - - self.world_size = pipeline_parallel_size * tensor_parallel_size * self.disagg_prefill_size + self.world_size = pipeline_parallel_size * tensor_parallel_size if worker_use_ray: if self.distributed_executor_backend is None: self.distributed_executor_backend = "ray" diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index eb3d32aeebb4..428ffb01dc21 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -808,6 +808,16 @@ def init_distributed_environment( "world_size=%d rank=%d local_rank=%d " "distributed_init_method=%s backend=%s", world_size, rank, local_rank, distributed_init_method, backend) + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + # Disaggregated prefilling is enabled + # There will be 2 copies of vLLM + # One for prefilling and one for decoding + world_size = world_size * 2 + logger.debug( + "Disaggregated prefill enabled, " + "increase world size to %d", world_size) + else: + disagg_prefill_size = 1 if not torch.distributed.is_initialized(): assert distributed_init_method is not None, ( "distributed_init_method must be provided when initializing " @@ -929,7 +939,7 @@ def initialize_model_parallel( get_world_group().local_rank, backend) # Build the pipeline model-parallel groups. 
- num_pipeline_model_parallel_groups: int = (world_size // + num_pipeline_model_parallel_groups: int = (world_size // disagg_prefill_size // pipeline_model_parallel_size) global _PP assert _PP is None, ( From de82c3cbfb716649eb4817aad5f9a9e5b13f8ce2 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 17:49:13 -0700 Subject: [PATCH 028/278] add more log information --- examples/disaggregated_prefill_example.sh | 4 +++- vllm/distributed/parallel_state.py | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index 01b30433697c..ee8706227f9e 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -20,4 +20,6 @@ VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -tp 4 \ --disable-log-stats \ --disable-log-requests \ - --enable-chunked-prefill & \ No newline at end of file + --enable-chunked-prefill & + + diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 428ffb01dc21..0bb5a66f37f2 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -861,10 +861,15 @@ def extend_distributed_group_with_offset( Typically world_size // 2 """ + logger.debug("Extend the distributed groups with offset %d", offset) + logger.debug("Before extension:\n%s", str(groups)) + new_groups = [] for group in groups: new_groups.append([rank for rank in group]) new_groups.append([rank + offset for rank in group]) + + logger.debug("After extension:\n%s", str(new_groups)) return new_groups From 69ce0e0d7c1ec7432b64b3fa49a30a8693af957e Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 17:55:40 -0700 Subject: [PATCH 029/278] specify vllm port --- examples/disaggregated_prefill_example.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index 
ee8706227f9e..ebfaa6e43000 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -1,5 +1,6 @@ export VLLM_LOGGING_LEVEL=DEBUG +export VLLM_PORT=12345 # prefilling instance VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ From e3dc2e9ca8df12682558006a2b432838af147441 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 20:47:43 -0700 Subject: [PATCH 030/278] avoid switching to unused ports in disaggregated prefilling --- vllm/distributed/parallel_state.py | 4 ++++ vllm/utils.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 0bb5a66f37f2..96bfc27610f9 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -9,6 +9,10 @@ - call `init_distributed_environment` to initialize the distributed environment. - call `initialize_model_parallel` or `ensure_model_parallel_initialized` to initialize the model parallel groups. 
+ - In disaggregated prefilling, we will modify: + - World size: 2 * tp * pp + - Rank: [0, tp * pp) for prefilling, [tp * pp, 2 * tp * pp) for decoding + - Local rank: unchanged - any code dealing with the distributed stuff diff --git a/vllm/utils.py b/vllm/utils.py index a3d15d797922..505f2a895ef9 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -364,6 +364,10 @@ def get_distributed_init_method(ip: str, port: int) -> str: def get_open_port() -> int: port = envs.VLLM_PORT if port is not None: + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + # The prefill and decode instance shares the same port + # Skip the binding check as the port may be binded by prefill + return port while True: try: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: From 18fe19c4744f88ec80f73bc159ab46f53360fd84 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 22:21:15 -0700 Subject: [PATCH 031/278] adjust parallel state to include _DISAGG distributed group --- vllm/distributed/parallel_state.py | 103 ++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 32 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 8ce8a332bdf3..aabe660cf1da 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -8,11 +8,8 @@ - call `init_distributed_environment` to initialize the distributed environment. - call `initialize_model_parallel` or `ensure_model_parallel_initialized` to - initialize the model parallel groups. - - In disaggregated prefilling, we will modify: - - World size: 2 * tp * pp - - Rank: [0, tp * pp) for prefilling, [tp * pp, 2 * tp * pp) for decoding - - Local rank: unchanged + initialize the model parallel groups and disaggregated prefilling parallel + groups. 
- any code dealing with the distributed stuff @@ -703,8 +700,8 @@ def destroy(self): self.ca_comm = None if self.mq_broadcaster is not None: self.mq_broadcaster = None - - + + _WORLD: Optional[GroupCoordinator] = None @@ -855,7 +852,7 @@ def init_distributed_environment( "world group already initialized with a different world size") -def extend_distributed_group_with_offset( +def offset_distributed_groups( groups: List[List[int]], offset: int, ) -> List[List[int]]: @@ -869,15 +866,14 @@ def extend_distributed_group_with_offset( Typically world_size // 2 """ - logger.debug("Extend the distributed groups with offset %d", offset) - logger.debug("Before extension:\n%s", str(groups)) + logger.debug("Offset distributed groups with offset %d", offset) + logger.debug("Before offset:\n%s", str(groups)) new_groups = [] for group in groups: - new_groups.append([rank for rank in group]) new_groups.append([rank + offset for rank in group]) - logger.debug("After extension:\n%s", str(new_groups)) + logger.debug("After offset:\n%s", str(new_groups)) return new_groups @@ -908,31 +904,51 @@ def initialize_model_parallel( are on the same DGX box. For example if we are using 2 DGX-1 boxes with a total of 16 GPUs, rank 0 to 7 belong to the first box and ranks 8 to 15 belong to the second box. + + Disaggregated prefilling will also initialize using this function. 
+ Why: disaggregated prefilling is similar to pipeline parallel + except that disaggregated prefilling does not partition model + Methodology: + - Only change variables in this file + - Any variable outside this file should be unchanged + Modifications: + - World size in vLLM variables (like in `ParallelConfig`): unchanged + - World size in `torch.distributed`: doubled (2 * tp * pp) + - Rank: + - [0, tp * pp) for prefilling + - [tp * pp, 2 * tp * pp) for decoding + - Parallel groups + - Unchanged for prefilling + - Offseted by tp * pp for decoding + - Add a new parallel group `_DISAGG` for disaggregated prefilling + - [0, tp * pp], [1, tp * pp + 1], .. + - Local rank: unchanged + - Thanks to PP implementation, distributed operations only rely on + local rank. This guarantees the communications inside the + prefilling instance and decoding instance are unchanged. """ # Get world size and rank. Ensure some consistencies. assert torch.distributed.is_initialized() world_size: int = torch.distributed.get_world_size() backend = backend or torch.distributed.get_backend( - get_world_group().device_group) - - + get_world_group().device_group) if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - # Disaggregated prefilling is enabled - # There will be 2 copies of vLLM - # One for prefilling and one for decoding - disagg_prefill_size = 2 - else: - disagg_prefill_size = 1 + # Keep the semantics of world_size the same (`tp * pp`) + logger.debug("Disaggregated prefilling enabled") + world_size = world_size // 2 + logger.debug("Shrink the world size from %d to %d", + world_size * 2, + world_size) if (world_size != - tensor_model_parallel_size * pipeline_model_parallel_size * disagg_prefill_size): + tensor_model_parallel_size * pipeline_model_parallel_size): raise RuntimeError( f"world_size ({world_size}) is not equal to " f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " f"pipeline_model_parallel_size ({pipeline_model_parallel_size})") # Build the tensor model-parallel 
groups. - num_tensor_model_parallel_groups: int = (world_size // disagg_prefill_size // + num_tensor_model_parallel_groups: int = (world_size // tensor_model_parallel_size) global _TP assert _TP is None, ("tensor model parallel group is already initialized") @@ -942,11 +958,12 @@ def initialize_model_parallel( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) group_ranks.append(ranks) - # extend the distributed group if disaggregated prefilling is enabled - if disagg_prefill_size > 1: - group_ranks = extend_distributed_group_with_offset( + if envs.VLLM_DISAGG_PREFILL_ROLE == "decoding": + logger.debug("Current instance is decoding instance") + logger.debug("Offset the distributed group ranks by %d", world_size) + group_ranks = offset_distributed_groups( group_ranks, - world_size // disagg_prefill_size + world_size ) # message queue broadcaster is only used in tensor model parallel group @@ -956,7 +973,7 @@ def initialize_model_parallel( use_message_queue_broadcaster=True) # Build the pipeline model-parallel groups. 
- num_pipeline_model_parallel_groups: int = (world_size // disagg_prefill_size // + num_pipeline_model_parallel_groups: int = (world_size // pipeline_model_parallel_size) global _PP assert _PP is None, ( @@ -965,17 +982,34 @@ def initialize_model_parallel( for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) group_ranks.append(ranks) - # extend the distributed group if disaggregated prefilling is enabled - if disagg_prefill_size > 1: - group_ranks = extend_distributed_group_with_offset( + if envs.VLLM_DISAGG_PREFILL_ROLE == "decoding": + logger.debug("Current instance is decoding instance") + logger.debug("Offset the distributed group ranks by %d", world_size) + group_ranks = offset_distributed_groups( group_ranks, - world_size // disagg_prefill_size + world_size ) # pipeline parallel does not need custom allreduce _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False) + + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefilling", "decoding"], ( + "VLLM_DISAGG_PREFILL_ROLE should be either prefilling or decoding") + logger.debug("Disaggregated prefilling enabled, create distributed group") + group_ranks = [] + for i in range(world_size): + # prefilling local rank: i + # decoding global rank: i + world_size + group_ranks.append([i, i + world_size]) + logger.debug("Distributed group is %s", str(group_ranks)) + _DISAGG = init_model_parallel_group( + group_ranks, + int(envs.VLLM_DISAGG_PREFILL_ROLE == "decoding"), + backend, + use_custom_allreduce=False) def ensure_model_parallel_initialized( @@ -1061,6 +1095,11 @@ def destroy_model_parallel(): _PP.destroy() _PP = None + global _DISAGG + if _DISAGG: + _DISAGG.destroy() + _DISAGG = None + def destroy_distributed_environment(): global _WORLD From 94cadb85855458c96535e774d7261585fe975e73 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 
22:37:55 -0700 Subject: [PATCH 032/278] offset global rank for decoding instances --- vllm/distributed/parallel_state.py | 32 +++++++++++++++++++----------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index aabe660cf1da..9c1247e06623 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -813,26 +813,34 @@ def init_distributed_environment( "world_size=%d rank=%d local_rank=%d " "distributed_init_method=%s backend=%s", world_size, rank, local_rank, distributed_init_method, backend) - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - # Disaggregated prefilling is enabled - # There will be 2 copies of vLLM - # One for prefilling and one for decoding - world_size = world_size * 2 - logger.debug( - "Disaggregated prefill enabled, " - "increase world size to %d", world_size) - else: - disagg_prefill_size = 1 if not torch.distributed.is_initialized(): assert distributed_init_method is not None, ( "distributed_init_method must be provided when initializing " "distributed environment") # this backend is used for WORLD + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + # Disaggregated prefilling is enabled + # world_size in vLLM is tp * pp + # for prefill, the ranks are [0, world_size) + # for decode, the ranks are [world_size, 2 * world_size) + maybe_disagg_world_size = world_size * 2 + logger.debug( + "Disaggregated prefill enabled, handle torch-related changes on world size and ranks. 
This change is only inside `vllm/distributed/parallel_state.py`) and the other files are unchanged.") + assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefilling", "decoding"], ( + "VLLM_DISAGG_PREFILL_ROLE should be either prefilling or decoding") + if envs.VLLM_DISAGG_PREFILL_ROLE == "prefilling": + maybe_disagg_rank = rank + else: + # offset global rank by tp * pp (which is world_size) + maybe_disagg_rank = rank + world_size + else: + maybe_disagg_world_size = world_size + maybe_disagg_rank = rank torch.distributed.init_process_group( backend=backend, init_method=distributed_init_method, - world_size=world_size, - rank=rank) + world_size=maybe_disagg_world_size, + rank=maybe_disagg_rank) # set the local rank # local_rank is not available in torch ProcessGroup, # see https://github.com/pytorch/pytorch/issues/122816 From ded5d92564e68d4442bdd698708c24d91b3c461f Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 22:42:04 -0700 Subject: [PATCH 033/278] adjust naming: use prefill and decode instead of prefilling and decoding --- vllm/distributed/parallel_state.py | 54 +++++++++++++++--------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 9c1247e06623..01ccbbb98917 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -8,7 +8,7 @@ - call `init_distributed_environment` to initialize the distributed environment. - call `initialize_model_parallel` or `ensure_model_parallel_initialized` to - initialize the model parallel groups and disaggregated prefilling parallel + initialize the model parallel groups and disaggregated prefill parallel groups. 
- any code dealing with the distributed stuff @@ -764,7 +764,7 @@ def get_pp_group() -> GroupCoordinator: def get_disagg_group() -> GroupCoordinator: assert _DISAGG is not None, ( - "disaggregated prefilling parallel group is not initialized") + "disaggregated prefill parallel group is not initialized") return _DISAGG @@ -819,16 +819,16 @@ def init_distributed_environment( "distributed environment") # this backend is used for WORLD if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - # Disaggregated prefilling is enabled + # Disaggregated prefill is enabled # world_size in vLLM is tp * pp # for prefill, the ranks are [0, world_size) # for decode, the ranks are [world_size, 2 * world_size) maybe_disagg_world_size = world_size * 2 logger.debug( "Disaggregated prefill enabled, handle torch-related changes on world size and ranks. This change is only inside `vllm/distributed/parallel_state.py`) and the other files are unchanged.") - assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefilling", "decoding"], ( - "VLLM_DISAGG_PREFILL_ROLE should be either prefilling or decoding") - if envs.VLLM_DISAGG_PREFILL_ROLE == "prefilling": + assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( + "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") + if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": maybe_disagg_rank = rank else: # offset global rank by tp * pp (which is world_size) @@ -913,9 +913,9 @@ def initialize_model_parallel( with a total of 16 GPUs, rank 0 to 7 belong to the first box and ranks 8 to 15 belong to the second box. - Disaggregated prefilling will also initialize using this function. - Why: disaggregated prefilling is similar to pipeline parallel - except that disaggregated prefilling does not partition model + Disaggregated prefill will also initialize using this function. 
+ Why: disaggregated prefill is similar to pipeline parallel + except that disaggregated prefill does not partition model Methodology: - Only change variables in this file - Any variable outside this file should be unchanged @@ -923,17 +923,17 @@ def initialize_model_parallel( - World size in vLLM variables (like in `ParallelConfig`): unchanged - World size in `torch.distributed`: doubled (2 * tp * pp) - Rank: - - [0, tp * pp) for prefilling - - [tp * pp, 2 * tp * pp) for decoding + - [0, tp * pp) for prefill + - [tp * pp, 2 * tp * pp) for decode - Parallel groups - - Unchanged for prefilling - - Offseted by tp * pp for decoding - - Add a new parallel group `_DISAGG` for disaggregated prefilling + - Unchanged for prefill + - Offseted by tp * pp for decode + - Add a new parallel group `_DISAGG` for disaggregated prefill - [0, tp * pp], [1, tp * pp + 1], .. - Local rank: unchanged - Thanks to PP implementation, distributed operations only rely on local rank. This guarantees the communications inside the - prefilling instance and decoding instance are unchanged. + prefill instance and decode instance are unchanged. """ # Get world size and rank. Ensure some consistencies. 
assert torch.distributed.is_initialized() @@ -942,7 +942,7 @@ def initialize_model_parallel( get_world_group().device_group) if envs.VLLM_DISAGG_PREFILL_ROLE is not None: # Keep the semantics of world_size the same (`tp * pp`) - logger.debug("Disaggregated prefilling enabled") + logger.debug("Disaggregated prefill enabled") world_size = world_size // 2 logger.debug("Shrink the world size from %d to %d", world_size * 2, @@ -966,8 +966,8 @@ def initialize_model_parallel( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) group_ranks.append(ranks) - if envs.VLLM_DISAGG_PREFILL_ROLE == "decoding": - logger.debug("Current instance is decoding instance") + if envs.VLLM_DISAGG_PREFILL_ROLE == "decode": + logger.debug("Current instance is decode instance") logger.debug("Offset the distributed group ranks by %d", world_size) group_ranks = offset_distributed_groups( group_ranks, @@ -990,8 +990,8 @@ def initialize_model_parallel( for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) group_ranks.append(ranks) - if envs.VLLM_DISAGG_PREFILL_ROLE == "decoding": - logger.debug("Current instance is decoding instance") + if envs.VLLM_DISAGG_PREFILL_ROLE == "decode": + logger.debug("Current instance is decode instance") logger.debug("Offset the distributed group ranks by %d", world_size) group_ranks = offset_distributed_groups( group_ranks, @@ -1004,18 +1004,18 @@ def initialize_model_parallel( use_custom_allreduce=False) if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefilling", "decoding"], ( - "VLLM_DISAGG_PREFILL_ROLE should be either prefilling or decoding") - logger.debug("Disaggregated prefilling enabled, create distributed group") + assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( + "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") + logger.debug("Disaggregated prefill enabled, create distributed group") 
group_ranks = [] for i in range(world_size): - # prefilling local rank: i - # decoding global rank: i + world_size + # prefill local rank: i + # decode global rank: i + world_size group_ranks.append([i, i + world_size]) logger.debug("Distributed group is %s", str(group_ranks)) _DISAGG = init_model_parallel_group( group_ranks, - int(envs.VLLM_DISAGG_PREFILL_ROLE == "decoding"), + int(envs.VLLM_DISAGG_PREFILL_ROLE == "decode"), backend, use_custom_allreduce=False) @@ -1060,7 +1060,7 @@ def model_parallel_is_initialized(): def patch_tensor_parallel_group(tp_group: GroupCoordinator): """Patch the tp group temporarily until this function ends. - This method is for draft workers of speculative decoding to run draft model + This method is for draft workers of speculative decode to run draft model with different tp degree from that of target model workers. Args: From 709ae054ba098bdcfd5ab24ffd1639c5ba82a35a Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 22:42:59 -0700 Subject: [PATCH 034/278] adjust the example: let the decode process in foreground for debugging --- examples/disaggregated_prefill_example.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index ebfaa6e43000..6835a378ff56 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -21,6 +21,6 @@ VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -tp 4 \ --disable-log-stats \ --disable-log-requests \ - --enable-chunked-prefill & + --enable-chunked-prefill From 2ab44d4aea31b45f57be26b2fac1724a17b73c62 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 22:52:06 -0700 Subject: [PATCH 035/278] adjust logger format --- vllm/distributed/parallel_state.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 
01ccbbb98917..ccf0bb465d75 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -22,6 +22,7 @@ """ import contextlib import pickle +import logging from collections import namedtuple from contextlib import contextmanager, nullcontext from dataclasses import dataclass @@ -793,6 +794,13 @@ def graph_capture(): logger = init_logger(__name__) +if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + # disaggregated prefill enabled + # indicating if the current instance is prefill or decode + class CustomAdapter(logging.LoggerAdapter): + def process(self, msg, kwargs): + return f"[{envs.VLLM_DISAGG_PREFILL_ROLE}] {msg}", kwargs + logger = CustomAdapter(logger) _ENABLE_CUSTOM_ALL_REDUCE = True @@ -942,11 +950,8 @@ def initialize_model_parallel( get_world_group().device_group) if envs.VLLM_DISAGG_PREFILL_ROLE is not None: # Keep the semantics of world_size the same (`tp * pp`) - logger.debug("Disaggregated prefill enabled") + logger.debug("Disaggregated prefill enabled, the world size obtained from torch.distributed (2 * tp * pp) should be decreased to align with vLLM world size (tp * pp)") world_size = world_size // 2 - logger.debug("Shrink the world size from %d to %d", - world_size * 2, - world_size) if (world_size != tensor_model_parallel_size * pipeline_model_parallel_size): From 22138814ffad83a1e2a5ddf12bbe2ec28126f285 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 23:03:21 -0700 Subject: [PATCH 036/278] test if the P2P cache stucks when no disaggregated prefilling --- examples/disaggregated_prefill_example.sh | 33 ++++++++++++++--------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index 6835a378ff56..7b3247dcc4a6 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -2,25 +2,34 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 -# prefilling instance 
-VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ +CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ -m vllm.entrypoints.openai.api_server \ --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ --port 8100 \ -tp 4 \ --disable-log-stats \ --disable-log-requests \ - --enable-chunked-prefill & + --enable-chunked-prefill -sleep 10 +# # prefilling instance +# VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ +# -m vllm.entrypoints.openai.api_server \ +# --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ +# --port 8100 \ +# -tp 4 \ +# --disable-log-stats \ +# --disable-log-requests \ +# --enable-chunked-prefill & -VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ - --port 8200 \ - -tp 4 \ - --disable-log-stats \ - --disable-log-requests \ - --enable-chunked-prefill +# sleep 10 + +# VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ +# -m vllm.entrypoints.openai.api_server \ +# --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ +# --port 8200 \ +# -tp 4 \ +# --disable-log-stats \ +# --disable-log-requests \ +# --enable-chunked-prefill From 544f5cb243a86baeba6c5c84ddb2727cacbd535a Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 23:07:05 -0700 Subject: [PATCH 037/278] let decode instance sleep, to avoid generating P2P cache simultaneously --- vllm/distributed/parallel_state.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index ccf0bb465d75..3f770c520ee0 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -20,6 +20,7 @@ parallelism, you can skip the model parallel initialization and destruction steps. 
""" +import time import contextlib import pickle import logging @@ -849,6 +850,8 @@ def init_distributed_environment( init_method=distributed_init_method, world_size=maybe_disagg_world_size, rank=maybe_disagg_rank) + if envs.VLLM_DISAGG_PREFILL_ROLE == "decode": + time.sleep(60) # set the local rank # local_rank is not available in torch ProcessGroup, # see https://github.com/pytorch/pytorch/issues/122816 @@ -883,13 +886,10 @@ def offset_distributed_groups( """ logger.debug("Offset distributed groups with offset %d", offset) - logger.debug("Before offset:\n%s", str(groups)) new_groups = [] for group in groups: new_groups.append([rank + offset for rank in group]) - - logger.debug("After offset:\n%s", str(new_groups)) return new_groups From 04d319a08ae484934813ee525c8d73230f49d99c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 23:09:34 -0700 Subject: [PATCH 038/278] continue disaggregated prefill debugging --- examples/disaggregated_prefill_example.sh | 34 ++++++++--------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index 7b3247dcc4a6..360ad4c69c00 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -2,34 +2,24 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 -CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ +# prefilling instance +VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ -m vllm.entrypoints.openai.api_server \ --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ --port 8100 \ -tp 4 \ --disable-log-stats \ --disable-log-requests \ - --enable-chunked-prefill + --enable-chunked-prefill & -# # prefilling instance -# VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ -# -m vllm.entrypoints.openai.api_server \ -# --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ -# --port 8100 \ -# -tp 4 \ -# --disable-log-stats \ -# 
--disable-log-requests \ -# --enable-chunked-prefill & - -# sleep 10 - -# VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -# -m vllm.entrypoints.openai.api_server \ -# --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ -# --port 8200 \ -# -tp 4 \ -# --disable-log-stats \ -# --disable-log-requests \ -# --enable-chunked-prefill +# decoding instance +VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ + --port 8200 \ + -tp 4 \ + --disable-log-stats \ + --disable-log-requests \ + --enable-chunked-prefill From 2e0f02cea3e86309e364a31ac7293dbdfc0a85cf Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 23:17:46 -0700 Subject: [PATCH 039/278] offset world group for decoding instance --- vllm/distributed/parallel_state.py | 54 ++++++++++++++++-------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 3f770c520ee0..60d999ab75aa 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -809,6 +809,29 @@ def process(self, msg, kwargs): def set_custom_all_reduce(enable: bool): global _ENABLE_CUSTOM_ALL_REDUCE _ENABLE_CUSTOM_ALL_REDUCE = enable + + +def offset_distributed_groups( + groups: List[List[int]], + offset: int, +) -> List[List[int]]: + """ + Extend original distributed group. + The extended part will be the original distributed group plus an offset. + + Arguments: + groups: original distributed group + offset: the offset we want to apply to the duplicated group. 
+ Typically world_size // 2 + """ + + logger.debug("Offset distributed groups with offset %d", offset) + + new_groups = [] + for group in groups: + new_groups.append([rank + offset for rank in group]) + + return new_groups def init_distributed_environment( @@ -850,8 +873,6 @@ def init_distributed_environment( init_method=distributed_init_method, world_size=maybe_disagg_world_size, rank=maybe_disagg_rank) - if envs.VLLM_DISAGG_PREFILL_ROLE == "decode": - time.sleep(60) # set the local rank # local_rank is not available in torch ProcessGroup, # see https://github.com/pytorch/pytorch/issues/122816 @@ -864,34 +885,17 @@ def init_distributed_environment( local_rank = rank global _WORLD if _WORLD is None: - ranks = list(range(torch.distributed.get_world_size())) + ranks = list(range(world_size)) + # offset the distributed group + if all( + [envs.VLLM_DISAGG_PREFILL_ROLE is not None], + [envs.VLLM_DISAGG_PREFILL_ROLE == "decode"]): + ranks = offset_distributed_groups(ranks, world_size) _WORLD = init_world_group(ranks, local_rank, backend) else: assert _WORLD.world_size == torch.distributed.get_world_size(), ( "world group already initialized with a different world size") - -def offset_distributed_groups( - groups: List[List[int]], - offset: int, -) -> List[List[int]]: - """ - Extend original distributed group. - The extended part will be the original distributed group plus an offset. - - Arguments: - groups: original distributed group - offset: the offset we want to apply to the duplicated group. 
- Typically world_size // 2 - """ - - logger.debug("Offset distributed groups with offset %d", offset) - - new_groups = [] - for group in groups: - new_groups.append([rank + offset for rank in group]) - - return new_groups def initialize_model_parallel( From fd5f1153c0b4a39b626026b32917fef1522f260c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 23:19:31 -0700 Subject: [PATCH 040/278] a syntax fix --- vllm/distributed/parallel_state.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 60d999ab75aa..66bf43ec563e 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -887,9 +887,9 @@ def init_distributed_environment( if _WORLD is None: ranks = list(range(world_size)) # offset the distributed group - if all( - [envs.VLLM_DISAGG_PREFILL_ROLE is not None], - [envs.VLLM_DISAGG_PREFILL_ROLE == "decode"]): + if all([ + envs.VLLM_DISAGG_PREFILL_ROLE is not None, + envs.VLLM_DISAGG_PREFILL_ROLE == "decode"]): ranks = offset_distributed_groups(ranks, world_size) _WORLD = init_world_group(ranks, local_rank, backend) else: From 8d90e6a4e902ab8868f3901aa6cff74fd68422e9 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 17 Jul 2024 23:22:04 -0700 Subject: [PATCH 041/278] bug fix --- vllm/distributed/parallel_state.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 66bf43ec563e..86d3ef5135d3 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -890,7 +890,8 @@ def init_distributed_environment( if all([ envs.VLLM_DISAGG_PREFILL_ROLE is not None, envs.VLLM_DISAGG_PREFILL_ROLE == "decode"]): - ranks = offset_distributed_groups(ranks, world_size) + ranks = list(range(world_size, 2 * world_size)) + _WORLD = init_world_group(ranks, local_rank, backend) else: assert _WORLD.world_size == 
torch.distributed.get_world_size(), ( From a9474a72b52649ffa96b2e3a64eee10bca8c54b4 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 18 Jul 2024 00:00:27 -0700 Subject: [PATCH 042/278] specify the source of get_open_port --- vllm/distributed/device_communicators/shm_broadcast.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 151b08c1b996..213e734f58c1 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -182,11 +182,11 @@ def __init__( max_chunks) self.local_socket = context.socket(PUB) - local_subscribe_port = get_open_port() + local_subscribe_port = get_open_port(is_for_dist_init = False) self.local_socket.bind(f"tcp://*:{local_subscribe_port}") self.local_sync_socket = context.socket(REP) - local_sync_port = get_open_port() + local_sync_port = get_open_port(is_for_dist_init = False) self.local_sync_socket.bind(f"tcp://*:{local_sync_port}") self.current_idx = 0 @@ -202,11 +202,11 @@ def __init__( # for remote readers, we will: # create a publish-subscribe socket to communicate large data self.remote_socket = context.socket(PUB) - remote_subscribe_port = get_open_port() + remote_subscribe_port = get_open_port(is_for_dist_init = False) self.remote_socket.bind(f"tcp://*:{remote_subscribe_port}") self.remote_sync_socket = context.socket(REP) - remote_sync_port = get_open_port() + remote_sync_port = get_open_port(is_for_dist_init = False) self.remote_sync_socket.bind(f"tcp://*:{remote_sync_port}") else: remote_subscribe_port = None From 701b0878e69245cb0a1231efac4dc43a90a654e5 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 18 Jul 2024 00:01:53 -0700 Subject: [PATCH 043/278] document why specifying the source of get_open_port --- vllm/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/utils.py 
b/vllm/utils.py index 75c3c4de02c0..f4c3809360f8 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -361,11 +361,12 @@ def get_distributed_init_method(ip: str, port: int) -> str: return f"tcp://[{ip}]:{port}" if ":" in ip else f"tcp://{ip}:{port}" -def get_open_port() -> int: +def get_open_port(is_for_dist_init: bool = True) -> int: port = envs.VLLM_PORT if port is not None: - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - # The prefill and decode instance shares the same port + if envs.VLLM_DISAGG_PREFILL_ROLE is not None and is_for_dist_init: + # When initializing distributed environment for disagg prefill + # The prefill and decode instance may share the same port # Skip the binding check as the port may be binded by prefill return port while True: From fa5d71fa5afa0d3c87ddbc07caa8d97e6d91c1ba Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 18 Jul 2024 19:02:50 -0700 Subject: [PATCH 044/278] add VLLM_TRACE_FUNCTION to track the call stack --- examples/disaggregated_prefill_example.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index 360ad4c69c00..bbcb11e863cd 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -1,6 +1,7 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 +export VLLM_TRACE_FUNCTION=1 # prefilling instance VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ From e2faedee295196ae8ce6fe2e950fbd250dfc332a Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 18 Jul 2024 19:06:27 -0700 Subject: [PATCH 045/278] fix customadapter bug --- vllm/distributed/parallel_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 86d3ef5135d3..52cadcea90ec 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -801,7 +801,7 @@ def graph_capture(): class 
CustomAdapter(logging.LoggerAdapter): def process(self, msg, kwargs): return f"[{envs.VLLM_DISAGG_PREFILL_ROLE}] {msg}", kwargs - logger = CustomAdapter(logger) + logger = CustomAdapter(logger, extra=None) _ENABLE_CUSTOM_ALL_REDUCE = True From 76b6c5e2277fcd3251c989cdce7d24f2e53788b3 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 18 Jul 2024 19:58:52 -0700 Subject: [PATCH 046/278] add parallel state logs for debugging --- vllm/distributed/parallel_state.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 52cadcea90ec..4a4ef7c50ca2 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -140,6 +140,7 @@ def __init__( ): self.rank = torch.distributed.get_rank() + logger.debug("My rank is %d", self.rank) self.local_rank = local_rank self.device_group = None self.cpu_group = None @@ -147,9 +148,11 @@ def __init__( for ranks in group_ranks: device_group = torch.distributed.new_group( ranks, backend=torch_distributed_backend) + logger.debug("device group initialized") # a group with `gloo` backend, to allow direct coordination between # processes through the CPU. 
cpu_group = torch.distributed.new_group(ranks, backend="gloo") + logger.debug("cpu group initialized") if self.rank in ranks: self.ranks = ranks self.world_size = len(ranks) @@ -180,6 +183,7 @@ def __init__( group=self.cpu_group, device=self.device, ) + logger.debug("Pynccl initialized") else: self.pynccl_comm = None @@ -954,7 +958,6 @@ def initialize_model_parallel( backend = backend or torch.distributed.get_backend( get_world_group().device_group) if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - # Keep the semantics of world_size the same (`tp * pp`) logger.debug("Disaggregated prefill enabled, the world size obtained from torch.distributed (2 * tp * pp) should be decreased to align with vLLM world size (tp * pp)") world_size = world_size // 2 @@ -978,7 +981,7 @@ def initialize_model_parallel( group_ranks.append(ranks) if envs.VLLM_DISAGG_PREFILL_ROLE == "decode": logger.debug("Current instance is decode instance") - logger.debug("Offset the distributed group ranks by %d", world_size) + logger.debug("Offset the _TP ranks by %d", world_size) group_ranks = offset_distributed_groups( group_ranks, world_size @@ -1002,7 +1005,7 @@ def initialize_model_parallel( group_ranks.append(ranks) if envs.VLLM_DISAGG_PREFILL_ROLE == "decode": logger.debug("Current instance is decode instance") - logger.debug("Offset the distributed group ranks by %d", world_size) + logger.debug("Offset the _PP ranks by %d", world_size) group_ranks = offset_distributed_groups( group_ranks, world_size @@ -1016,7 +1019,7 @@ def initialize_model_parallel( if envs.VLLM_DISAGG_PREFILL_ROLE is not None: assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") - logger.debug("Disaggregated prefill enabled, create distributed group") + logger.debug("Disaggregated prefill enabled, create _DISAGG group") group_ranks = [] for i in range(world_size): # prefill local rank: i From cb6d6a5a11d8a63e82ff663c0aa216c76b81bc2e Mon Sep 17 00:00:00 
2001 From: Kuntai Du Date: Thu, 18 Jul 2024 20:08:58 -0700 Subject: [PATCH 047/278] add sleep when initializing parallel state --- vllm/distributed/parallel_state.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 4a4ef7c50ca2..6db042bade2b 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -894,6 +894,9 @@ def init_distributed_environment( if all([ envs.VLLM_DISAGG_PREFILL_ROLE is not None, envs.VLLM_DISAGG_PREFILL_ROLE == "decode"]): + # sleep 10 seconds to avoid potential collisions + # when initializing distributed environment + time.sleep(10) ranks = list(range(world_size, 2 * world_size)) _WORLD = init_world_group(ranks, local_rank, backend) From fe8fb473aa73ed056ab8a31ab6a6c517ece1289c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 18 Jul 2024 20:26:17 -0700 Subject: [PATCH 048/278] only log when rank%4==0 --- vllm/distributed/parallel_state.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 6db042bade2b..18b63639ce08 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -140,7 +140,6 @@ def __init__( ): self.rank = torch.distributed.get_rank() - logger.debug("My rank is %d", self.rank) self.local_rank = local_rank self.device_group = None self.cpu_group = None @@ -799,13 +798,16 @@ def graph_capture(): logger = init_logger(__name__) -if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + +original_logger = logger +def logger(*args, **kwargs): # disaggregated prefill enabled # indicating if the current instance is prefill or decode - class CustomAdapter(logging.LoggerAdapter): - def process(self, msg, kwargs): - return f"[{envs.VLLM_DISAGG_PREFILL_ROLE}] {msg}", kwargs - logger = CustomAdapter(logger, extra=None) + if torch.distributed.is_initialized(): + if torch.distributed.get_rank() % 4 
== 0: + original_logger(*args, **kwargs) + else: + original_logger(*args, **kwargs) _ENABLE_CUSTOM_ALL_REDUCE = True From cc89bfbb6d5728b7eb004ecb9d994162526247be Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 18 Jul 2024 20:33:41 -0700 Subject: [PATCH 049/278] only log when rank%4==0 --- vllm/distributed/parallel_state.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 18b63639ce08..56bdef65a882 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -798,16 +798,16 @@ def graph_capture(): logger = init_logger(__name__) - -original_logger = logger -def logger(*args, **kwargs): - # disaggregated prefill enabled - # indicating if the current instance is prefill or decode - if torch.distributed.is_initialized(): - if torch.distributed.get_rank() % 4 == 0: - original_logger(*args, **kwargs) - else: - original_logger(*args, **kwargs) +class ConditionalLoggingHandler(logging.Handler): + def emit(self, record): + dist = torch.distributed + try: + if not dist.is_initialized() or (dist.is_initialized() and dist.get_rank() % 4 == 0): + msg = self.format(record) + print(msg) # You can replace this with any other logging mechanism you prefer + except Exception: + pass +logger.addHandler(handler) _ENABLE_CUSTOM_ALL_REDUCE = True From 531bdf3ca93e7005e621bac2f8127104359e2af0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 18 Jul 2024 20:34:14 -0700 Subject: [PATCH 050/278] bug fix --- vllm/distributed/parallel_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 56bdef65a882..260e7dcfd423 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -807,7 +807,7 @@ def emit(self, record): print(msg) # You can replace this with any other logging mechanism you prefer except Exception: pass 
-logger.addHandler(handler) +logger.addHandler(ConditionalLoggingHandler()) _ENABLE_CUSTOM_ALL_REDUCE = True From 1804656ca72d77a16f81411feb3571932b3b16e9 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 18 Jul 2024 20:36:11 -0700 Subject: [PATCH 051/278] also only log when rank=4 in custom all reduce --- .../device_communicators/custom_all_reduce.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index a4f30808d32e..b7d5af5a8a0a 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -1,5 +1,6 @@ from contextlib import contextmanager from typing import Any, List, Optional, Union +import logging import torch import torch.distributed as dist @@ -22,6 +23,17 @@ logger = init_logger(__name__) +class ConditionalLoggingHandler(logging.Handler): + def emit(self, record): + dist = torch.distributed + try: + if not dist.is_initialized() or (dist.is_initialized() and dist.get_rank() % 4 == 0): + msg = self.format(record) + print(msg) # You can replace this with any other logging mechanism you prefer + except Exception: + pass +logger.addHandler(ConditionalLoggingHandler()) + def _can_p2p(rank: int, world_size: int) -> bool: for i in range(world_size): From 81c8640066b05c824af6db21ea8d1407f02e5a07 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 18 Jul 2024 21:44:33 -0700 Subject: [PATCH 052/278] add debuging statement around broadcast --- vllm/distributed/device_communicators/pynccl.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 731956654567..36f1e04aec79 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -41,6 +41,7 @@ def __init__( self.rank = dist.get_rank(group) 
self.world_size = dist.get_world_size(group) + # if world_size == 1, no need to create communicator if self.world_size == 1: self.available = False @@ -70,8 +71,11 @@ def __init__( self.unique_id = ncclUniqueId() tensor = torch.ByteTensor(list(self.unique_id.internal)) ranks = dist.get_process_group_ranks(group) + logger.debug("Group: %s, group rank: %s, world size: %s, src: %s", str(group), str(self.rank), str(self.world_size), ranks[0]) + # arg `src` in `broadcast` is the global rank dist.broadcast(tensor, src=ranks[0], group=group) + logger.debug("dist broadcast succeeded") byte_list = tensor.tolist() for i, byte in enumerate(byte_list): self.unique_id.internal[i] = byte From 5ba142c287fc8910441d8c9ce84f49f06585727c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 18 Jul 2024 22:24:34 -0700 Subject: [PATCH 053/278] debug init_world_group --- vllm/distributed/parallel_state.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 260e7dcfd423..de5e68822e12 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -145,6 +145,7 @@ def __init__( self.cpu_group = None for ranks in group_ranks: + logger.debug("initializing device group") device_group = torch.distributed.new_group( ranks, backend=torch_distributed_backend) logger.debug("device group initialized") @@ -889,6 +890,11 @@ def init_distributed_environment( local_rank = envs.LOCAL_RANK else: local_rank = rank + + if all([ + envs.VLLM_DISAGG_PREFILL_ROLE is not None, + envs.VLLM_DISAGG_PREFILL_ROLE == "prefill"]): + time.sleep(1000) global _WORLD if _WORLD is None: ranks = list(range(world_size)) @@ -898,7 +904,6 @@ def init_distributed_environment( envs.VLLM_DISAGG_PREFILL_ROLE == "decode"]): # sleep 10 seconds to avoid potential collisions # when initializing distributed environment - time.sleep(10) ranks = list(range(world_size, 2 * world_size)) _WORLD = 
init_world_group(ranks, local_rank, backend) From cc939cfeed19b95485b48199be61d941c7e5eaf1 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 18 Jul 2024 22:29:23 -0700 Subject: [PATCH 054/278] put the log inside a text file --- examples/disaggregated_prefill_example.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index bbcb11e863cd..c06e90cb4432 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -11,7 +11,7 @@ VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ -tp 4 \ --disable-log-stats \ --disable-log-requests \ - --enable-chunked-prefill & + --enable-chunked-prefill > >(tee -a prefill.txt) 2> >(tee -a prefill.txt >&2) & # decoding instance VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ @@ -21,6 +21,6 @@ VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -tp 4 \ --disable-log-stats \ --disable-log-requests \ - --enable-chunked-prefill + --enable-chunked-prefill > >(tee -a decode.txt) 2> >(tee -a decode.txt >&2) & From 8ac9266a711ca9e0ceff8abcc171b9896232ee70 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 01:08:49 -0700 Subject: [PATCH 055/278] init DISAGG first --- vllm/distributed/parallel_state.py | 34 ++++++++++++++++-------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index de5e68822e12..58862574d493 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -996,6 +996,24 @@ def initialize_model_parallel( group_ranks, world_size ) + + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( + "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") + logger.debug("Disaggregated prefill enabled, create _DISAGG group") + 
group_ranks = [] + for i in range(world_size): + # prefill local rank: i + # decode global rank: i + world_size + group_ranks.append([i, i + world_size]) + logger.debug("Distributed group is %s", str(group_ranks)) + _DISAGG = init_model_parallel_group( + group_ranks, + int(envs.VLLM_DISAGG_PREFILL_ROLE == "decode"), + backend, + use_custom_allreduce=False) + + time.sleep(1000) # message queue broadcaster is only used in tensor model parallel group _TP = init_model_parallel_group(group_ranks, @@ -1025,22 +1043,6 @@ def initialize_model_parallel( get_world_group().local_rank, backend, use_custom_allreduce=False) - - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( - "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") - logger.debug("Disaggregated prefill enabled, create _DISAGG group") - group_ranks = [] - for i in range(world_size): - # prefill local rank: i - # decode global rank: i + world_size - group_ranks.append([i, i + world_size]) - logger.debug("Distributed group is %s", str(group_ranks)) - _DISAGG = init_model_parallel_group( - group_ranks, - int(envs.VLLM_DISAGG_PREFILL_ROLE == "decode"), - backend, - use_custom_allreduce=False) def ensure_model_parallel_initialized( From 58849fa61b559d9785a50696d03f810aaa0788c4 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 01:11:31 -0700 Subject: [PATCH 056/278] init DISAGG before global --- vllm/distributed/parallel_state.py | 36 ++++++++++++++++-------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 58862574d493..a83f286c3b46 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -962,6 +962,25 @@ def initialize_model_parallel( local rank. This guarantees the communications inside the prefill instance and decode instance are unchanged. 
""" + + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( + "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") + logger.debug("Disaggregated prefill enabled, create _DISAGG group") + group_ranks = [] + for i in range(world_size): + # prefill local rank: i + # decode global rank: i + world_size + group_ranks.append([i, i + world_size]) + logger.debug("Distributed group is %s", str(group_ranks)) + _DISAGG = init_model_parallel_group( + group_ranks, + int(envs.VLLM_DISAGG_PREFILL_ROLE == "decode"), + backend, + use_custom_allreduce=False) + + time.sleep(1000) + # Get world size and rank. Ensure some consistencies. assert torch.distributed.is_initialized() world_size: int = torch.distributed.get_world_size() @@ -997,23 +1016,6 @@ def initialize_model_parallel( world_size ) - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( - "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") - logger.debug("Disaggregated prefill enabled, create _DISAGG group") - group_ranks = [] - for i in range(world_size): - # prefill local rank: i - # decode global rank: i + world_size - group_ranks.append([i, i + world_size]) - logger.debug("Distributed group is %s", str(group_ranks)) - _DISAGG = init_model_parallel_group( - group_ranks, - int(envs.VLLM_DISAGG_PREFILL_ROLE == "decode"), - backend, - use_custom_allreduce=False) - - time.sleep(1000) # message queue broadcaster is only used in tensor model parallel group _TP = init_model_parallel_group(group_ranks, From 08797e263e79b74434c6114bf39edeaae5799647 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 01:14:05 -0700 Subject: [PATCH 057/278] put it behind world_size --- vllm/distributed/parallel_state.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 
a83f286c3b46..a9e4008e18b0 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -963,6 +963,18 @@ def initialize_model_parallel( prefill instance and decode instance are unchanged. """ + # Get world size and rank. Ensure some consistencies. + assert torch.distributed.is_initialized() + world_size: int = torch.distributed.get_world_size() + backend = backend or torch.distributed.get_backend( + get_world_group().device_group) + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + logger.debug("Disaggregated prefill enabled, the world size obtained from torch.distributed (2 * tp * pp) should be decreased to align with vLLM world size (tp * pp)") + world_size = world_size // 2 + + + + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") @@ -981,14 +993,9 @@ def initialize_model_parallel( time.sleep(1000) - # Get world size and rank. Ensure some consistencies. 
- assert torch.distributed.is_initialized() - world_size: int = torch.distributed.get_world_size() - backend = backend or torch.distributed.get_backend( - get_world_group().device_group) - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - logger.debug("Disaggregated prefill enabled, the world size obtained from torch.distributed (2 * tp * pp) should be decreased to align with vLLM world size (tp * pp)") - world_size = world_size // 2 + + + if (world_size != tensor_model_parallel_size * pipeline_model_parallel_size): From 4ff4cd69f61e69a89f52d5f355a7dd333f4d962e Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 01:18:16 -0700 Subject: [PATCH 058/278] add more debug information in pynccl --- vllm/distributed/parallel_state.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index a9e4008e18b0..a21a64ece37b 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -175,10 +175,11 @@ def __init__( from vllm.distributed.device_communicators.custom_all_reduce import ( CustomAllreduce) from vllm.distributed.device_communicators.pynccl import ( - PyNcclCommunicator) + PyNcclCommunicator)) self.pynccl_comm: Optional[PyNcclCommunicator] if use_pynccl and self.world_size > 1: + logger.debug("Before pynccl") self.pynccl_comm = PyNcclCommunicator( group=self.cpu_group, device=self.device, From b09e4e601cd39a5d56083d59529ab29ade210af2 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 01:18:41 -0700 Subject: [PATCH 059/278] typo fix --- vllm/distributed/parallel_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index a21a64ece37b..eba1e4af9bc1 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -175,7 +175,7 @@ def __init__( from vllm.distributed.device_communicators.custom_all_reduce import ( 
CustomAllreduce) from vllm.distributed.device_communicators.pynccl import ( - PyNcclCommunicator)) + PyNcclCommunicator) self.pynccl_comm: Optional[PyNcclCommunicator] if use_pynccl and self.world_size > 1: From 583de9753dc09f45af7d656e1a928c6715c5c8d3 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 01:20:39 -0700 Subject: [PATCH 060/278] more debug --- vllm/distributed/parallel_state.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index eba1e4af9bc1..bc9890b0aff5 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -177,6 +177,8 @@ def __init__( from vllm.distributed.device_communicators.pynccl import ( PyNcclCommunicator) + logger.debug("Oh plz") + self.pynccl_comm: Optional[PyNcclCommunicator] if use_pynccl and self.world_size > 1: logger.debug("Before pynccl") From 74bcffffd922da8a1d1d9c1b645aabf7f80dcfcf Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 01:23:08 -0700 Subject: [PATCH 061/278] more debug info --- vllm/distributed/parallel_state.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index bc9890b0aff5..e5fcde36f33f 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -145,14 +145,14 @@ def __init__( self.cpu_group = None for ranks in group_ranks: - logger.debug("initializing device group") + logger.debug("initializing device group, rank %d", self.rank) device_group = torch.distributed.new_group( ranks, backend=torch_distributed_backend) - logger.debug("device group initialized") + logger.debug("device group initialized, rank %d", self.rank) # a group with `gloo` backend, to allow direct coordination between # processes through the CPU. 
cpu_group = torch.distributed.new_group(ranks, backend="gloo") - logger.debug("cpu group initialized") + logger.debug("cpu group initialized, rank %d", self.rank) if self.rank in ranks: self.ranks = ranks self.world_size = len(ranks) @@ -162,11 +162,15 @@ def __init__( assert self.cpu_group is not None assert self.device_group is not None + + logger.debug("Here 166 , rank %d", self.rank) if torch.cuda.is_available(): self.device = torch.device(f"cuda:{local_rank}") else: self.device = torch.device("cpu") + + logger.debug("Here 173 , rank %d", self.rank) self.use_pynccl = use_pynccl self.use_custom_allreduce = use_custom_allreduce From 21758259da88fd94c77155acaaf9c24cbefa67e1 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 01:24:38 -0700 Subject: [PATCH 062/278] put every output --- examples/disaggregated_prefill_example.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index c06e90cb4432..5c9b652f9e61 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -11,7 +11,7 @@ VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ -tp 4 \ --disable-log-stats \ --disable-log-requests \ - --enable-chunked-prefill > >(tee -a prefill.txt) 2> >(tee -a prefill.txt >&2) & + --enable-chunked-prefill & # decoding instance VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ @@ -21,6 +21,6 @@ VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -tp 4 \ --disable-log-stats \ --disable-log-requests \ - --enable-chunked-prefill > >(tee -a decode.txt) 2> >(tee -a decode.txt >&2) & + --enable-chunked-prefill & From 3e0777009f9742e41aad42660ba10d14dfd097df Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 01:25:42 -0700 Subject: [PATCH 063/278] remove unnecessary sleep --- vllm/distributed/parallel_state.py | 5 ----- 1 file changed, 5 deletions(-) 
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index e5fcde36f33f..01dfd37a653d 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -898,10 +898,6 @@ def init_distributed_environment( else: local_rank = rank - if all([ - envs.VLLM_DISAGG_PREFILL_ROLE is not None, - envs.VLLM_DISAGG_PREFILL_ROLE == "prefill"]): - time.sleep(1000) global _WORLD if _WORLD is None: ranks = list(range(world_size)) @@ -909,7 +905,6 @@ def init_distributed_environment( if all([ envs.VLLM_DISAGG_PREFILL_ROLE is not None, envs.VLLM_DISAGG_PREFILL_ROLE == "decode"]): - # sleep 10 seconds to avoid potential collisions # when initializing distributed environment ranks = list(range(world_size, 2 * world_size)) From a22e5cdfa59704de07fcf99fd558a2539eb7ca68 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 01:37:20 -0700 Subject: [PATCH 064/278] add sucess statement --- vllm/distributed/parallel_state.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 01dfd37a653d..f06091d04ab6 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -993,6 +993,7 @@ def initialize_model_parallel( backend, use_custom_allreduce=False) + logger.debug("Success") time.sleep(1000) From 2c0c27dc6ef45b3ad172f75c4bbfe2efc9140e69 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 01:45:10 -0700 Subject: [PATCH 065/278] add debug statement --- vllm/distributed/parallel_state.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f06091d04ab6..637318f36f2c 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -145,14 +145,17 @@ def __init__( self.cpu_group = None for ranks in group_ranks: - logger.debug("initializing device group, rank %d", self.rank) + if self.rank in 
ranks: + logger.debug("initializing device group, rank %d", self.rank) device_group = torch.distributed.new_group( ranks, backend=torch_distributed_backend) - logger.debug("device group initialized, rank %d", self.rank) + if self.rank in ranks: + logger.debug("device group initialized, rank %d", self.rank) # a group with `gloo` backend, to allow direct coordination between # processes through the CPU. cpu_group = torch.distributed.new_group(ranks, backend="gloo") - logger.debug("cpu group initialized, rank %d", self.rank) + if self.rank in ranks: + logger.debug("cpu group initialized, rank %d", self.rank) if self.rank in ranks: self.ranks = ranks self.world_size = len(ranks) From a783787ae4cf931b7d235c1776e1cf6e75816329 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 10:56:08 -0700 Subject: [PATCH 066/278] log rank in success message --- vllm/distributed/parallel_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 637318f36f2c..be3021b10d21 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -996,7 +996,7 @@ def initialize_model_parallel( backend, use_custom_allreduce=False) - logger.debug("Success") + logger.debug("Success, rank %d", torch.distributed.get_rank()) time.sleep(1000) From 79f0b06500e40d6c79d2c6b24e9e72658364dd3d Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 10:58:26 -0700 Subject: [PATCH 067/278] sleep based on rank to avoid message overlapping --- vllm/distributed/parallel_state.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index be3021b10d21..a874ea8ca605 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -976,6 +976,8 @@ def initialize_model_parallel( if envs.VLLM_DISAGG_PREFILL_ROLE is not None: logger.debug("Disaggregated prefill enabled, the world size 
obtained from torch.distributed (2 * tp * pp) should be decreased to align with vLLM world size (tp * pp)") world_size = world_size // 2 + + time.sleep(torch.distributed.get_rank()) From b17f20f34dbe59f00bab812c32186b1521746616 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 16:07:41 -0700 Subject: [PATCH 068/278] increase torch debug level --- examples/disaggregated_prefill_example.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index 5c9b652f9e61..cfc183675917 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -2,6 +2,7 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 export VLLM_TRACE_FUNCTION=1 +export TORCH_DISTRIBUTED_DEBUG=DETAIL # prefilling instance VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ From 025f20941aa8b59453781cc86126a5b39cdd2044 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 16:11:05 -0700 Subject: [PATCH 069/278] sleep --- vllm/distributed/parallel_state.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index a874ea8ca605..534d8bbcedce 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -900,6 +900,8 @@ def init_distributed_environment( local_rank = envs.LOCAL_RANK else: local_rank = rank + + global _WORLD if _WORLD is None: @@ -915,6 +917,11 @@ def init_distributed_environment( else: assert _WORLD.world_size == torch.distributed.get_world_size(), ( "world group already initialized with a different world size") + + + time.sleep(torch.distributed.get_rank()) + logger.debug("Success initialized _WORLD for rank %d", torch.distributed.get_rank()) + time.sleep(100) From 32292f1272c06d5fec7c2393694056b1cdd93c60 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 16:17:21 -0700 Subject: [PATCH 070/278] set 
gloo debugging level to trace --- examples/disaggregated_prefill_example.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index cfc183675917..653bb72c4dce 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -3,6 +3,7 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 export VLLM_TRACE_FUNCTION=1 export TORCH_DISTRIBUTED_DEBUG=DETAIL +export GLOO_LOGGING_LEVEL=TRACE # prefilling instance VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ From 389fb24c5b1922a8a8ca0ef9dd646eacb5f4f947 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 16:20:40 -0700 Subject: [PATCH 071/278] reduce debugging commands --- vllm/distributed/parallel_state.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 534d8bbcedce..35844996b617 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -145,12 +145,12 @@ def __init__( self.cpu_group = None for ranks in group_ranks: - if self.rank in ranks: - logger.debug("initializing device group, rank %d", self.rank) device_group = torch.distributed.new_group( ranks, backend=torch_distributed_backend) if self.rank in ranks: - logger.debug("device group initialized, rank %d", self.rank) + import time + time.sleep(self.rank) + logger.debug("initializing cpu group, rank %d", self.rank) # a group with `gloo` backend, to allow direct coordination between # processes through the CPU. 
cpu_group = torch.distributed.new_group(ranks, backend="gloo") @@ -166,14 +166,12 @@ def __init__( assert self.cpu_group is not None assert self.device_group is not None - logger.debug("Here 166 , rank %d", self.rank) if torch.cuda.is_available(): self.device = torch.device(f"cuda:{local_rank}") else: self.device = torch.device("cpu") - logger.debug("Here 173 , rank %d", self.rank) self.use_pynccl = use_pynccl self.use_custom_allreduce = use_custom_allreduce @@ -184,16 +182,13 @@ def __init__( from vllm.distributed.device_communicators.pynccl import ( PyNcclCommunicator) - logger.debug("Oh plz") self.pynccl_comm: Optional[PyNcclCommunicator] if use_pynccl and self.world_size > 1: - logger.debug("Before pynccl") self.pynccl_comm = PyNcclCommunicator( group=self.cpu_group, device=self.device, ) - logger.debug("Pynccl initialized") else: self.pynccl_comm = None @@ -919,7 +914,6 @@ def init_distributed_environment( "world group already initialized with a different world size") - time.sleep(torch.distributed.get_rank()) logger.debug("Success initialized _WORLD for rank %d", torch.distributed.get_rank()) time.sleep(100) From 1b38b298b26afdf8e3a3fcec1804ab9f91d4ef0e Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 16:24:22 -0700 Subject: [PATCH 072/278] avoid initializing NCCL first --- vllm/distributed/parallel_state.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 35844996b617..92ad807392b7 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -145,8 +145,9 @@ def __init__( self.cpu_group = None for ranks in group_ranks: - device_group = torch.distributed.new_group( - ranks, backend=torch_distributed_backend) + # device_group = torch.distributed.new_group( + # ranks, backend=torch_distributed_backend) + device_group = 233 if self.rank in ranks: import time time.sleep(self.rank) From 
bb8c08a42c3bbd4939a8349fde78d620fa6a1e18 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 16:31:49 -0700 Subject: [PATCH 073/278] check --- vllm/distributed/parallel_state.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 92ad807392b7..b6a2a13acc6d 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -897,6 +897,10 @@ def init_distributed_environment( else: local_rank = rank + + logger.debug("My rank is %d", torch.distributed.get_rank()) + time.sleep(20) + global _WORLD From 25a7cf332a2bf505deb078e44ffbcf85e1436add Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 16:44:09 -0700 Subject: [PATCH 074/278] locate the hanging line --- vllm/distributed/parallel_state.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index b6a2a13acc6d..5680a7d6cf58 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -899,7 +899,11 @@ def init_distributed_environment( logger.debug("My rank is %d", torch.distributed.get_rank()) - time.sleep(20) + + cpu_group = torch.distributed.new_group(list(range(8)), backend="gloo") + + logger.debug("CPU group initialized") + time.sleep(1000) @@ -919,8 +923,6 @@ def init_distributed_environment( "world group already initialized with a different world size") - logger.debug("Success initialized _WORLD for rank %d", torch.distributed.get_rank()) - time.sleep(100) From 999bd729d83920a0fa391193b94123dc85f75483 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 16:45:40 -0700 Subject: [PATCH 075/278] add rank to CPU group --- vllm/distributed/parallel_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 5680a7d6cf58..cfcf5b6c5c49 100644 --- 
a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -902,7 +902,7 @@ def init_distributed_environment( cpu_group = torch.distributed.new_group(list(range(8)), backend="gloo") - logger.debug("CPU group initialized") + logger.debug("CPU group initialized, rank %d", torch.distributed.get_rank()) time.sleep(1000) From 3428ea667a8b41ba3b9f168e408af753f8f4e82a Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 16:49:59 -0700 Subject: [PATCH 076/278] narrow case --- vllm/distributed/parallel_state.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index cfcf5b6c5c49..f44381deb37c 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -147,16 +147,16 @@ def __init__( for ranks in group_ranks: # device_group = torch.distributed.new_group( # ranks, backend=torch_distributed_backend) - device_group = 233 if self.rank in ranks: import time time.sleep(self.rank) - logger.debug("initializing cpu group, rank %d", self.rank) + logger.debug("initializing cpu group, rank %d, group %s", self.rank, ranks) # a group with `gloo` backend, to allow direct coordination between # processes through the CPU. 
cpu_group = torch.distributed.new_group(ranks, backend="gloo") if self.rank in ranks: - logger.debug("cpu group initialized, rank %d", self.rank) + logger.debug("cpu group initialized, rank %d, group %s", self.rank, ranks) + time.sleep(1000) if self.rank in ranks: self.ranks = ranks self.world_size = len(ranks) @@ -898,12 +898,6 @@ def init_distributed_environment( local_rank = rank - logger.debug("My rank is %d", torch.distributed.get_rank()) - - cpu_group = torch.distributed.new_group(list(range(8)), backend="gloo") - - logger.debug("CPU group initialized, rank %d", torch.distributed.get_rank()) - time.sleep(1000) From 91e3ed2dbdfc89b262d950d0c522426f150b8469 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 19 Jul 2024 17:42:39 -0700 Subject: [PATCH 077/278] bug fix: need to align the distributed groups between prefill and decode instances --- examples/disaggregated_prefill_example.sh | 6 +- vllm/distributed/parallel_state.py | 173 +++++++++------------- 2 files changed, 73 insertions(+), 106 deletions(-) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index 653bb72c4dce..c0009ae0b17d 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -1,9 +1,9 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 -export VLLM_TRACE_FUNCTION=1 -export TORCH_DISTRIBUTED_DEBUG=DETAIL -export GLOO_LOGGING_LEVEL=TRACE +# export VLLM_TRACE_FUNCTION=1 +# export TORCH_DISTRIBUTED_DEBUG=DETAIL +# export GLOO_LOGGING_LEVEL=TRACE # prefilling instance VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f44381deb37c..7be0fefc3d33 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -145,18 +145,11 @@ def __init__( self.cpu_group = None for ranks in group_ranks: - # device_group = torch.distributed.new_group( - # ranks, 
backend=torch_distributed_backend) - if self.rank in ranks: - import time - time.sleep(self.rank) - logger.debug("initializing cpu group, rank %d, group %s", self.rank, ranks) + device_group = torch.distributed.new_group( + ranks, backend=torch_distributed_backend) # a group with `gloo` backend, to allow direct coordination between # processes through the CPU. cpu_group = torch.distributed.new_group(ranks, backend="gloo") - if self.rank in ranks: - logger.debug("cpu group initialized, rank %d, group %s", self.rank, ranks) - time.sleep(1000) if self.rank in ranks: self.ranks = ranks self.world_size = len(ranks) @@ -722,10 +715,10 @@ def get_world_group() -> GroupCoordinator: return _WORLD -def init_world_group(ranks: List[int], local_rank: int, +def init_world_group(ranks: List[List[int]], local_rank: int, backend: str) -> GroupCoordinator: return GroupCoordinator( - group_ranks=[ranks], + group_ranks=ranks, local_rank=local_rank, torch_distributed_backend=backend, use_pynccl=False, @@ -824,27 +817,33 @@ def set_custom_all_reduce(enable: bool): _ENABLE_CUSTOM_ALL_REDUCE = enable -def offset_distributed_groups( +def include_decoding_groups_if_disagg_enabled( groups: List[List[int]], - offset: int, + world_size: int, ) -> List[List[int]]: """ - Extend original distributed group. - The extended part will be the original distributed group plus an offset. + Include the distributed group for decode + Only for disaggregated prefill + Example: + Original group: [ [0,1], [2,3] ], world_size = 4 + Extended: [ [0,1], [2,3], [4,5], [6,7] ] Arguments: groups: original distributed group - offset: the offset we want to apply to the duplicated group. 
- Typically world_size // 2 + world_size: the vLLM world size, which is half of torch.distributed.get_world_size() """ - - logger.debug("Offset distributed groups with offset %d", offset) - - new_groups = [] - for group in groups: - new_groups.append([rank + offset for rank in group]) - - return new_groups + + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( + "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") + new_groups = [] + for group in groups: + new_groups.append([rank for rank in group]) + for group in groups: + new_groups.append([rank + world_size for rank in group]) + return new_groups + else: + return groups def init_distributed_environment( @@ -863,24 +862,21 @@ def init_distributed_environment( "distributed_init_method must be provided when initializing " "distributed environment") # this backend is used for WORLD + maybe_disagg_world_size = world_size + maybe_disagg_rank = rank if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - # Disaggregated prefill is enabled - # world_size in vLLM is tp * pp - # for prefill, the ranks are [0, world_size) - # for decode, the ranks are [world_size, 2 * world_size) maybe_disagg_world_size = world_size * 2 logger.debug( - "Disaggregated prefill enabled, handle torch-related changes on world size and ranks. 
This change is only inside `vllm/distributed/parallel_state.py`) and the other files are unchanged.") + "Disaggregated prefill enabled.") assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": + # for prefill, the ranks are [0, world_size) maybe_disagg_rank = rank else: # offset global rank by tp * pp (which is world_size) maybe_disagg_rank = rank + world_size - else: - maybe_disagg_world_size = world_size - maybe_disagg_rank = rank + torch.distributed.init_process_group( backend=backend, init_method=distributed_init_method, @@ -896,22 +892,18 @@ def init_distributed_environment( local_rank = envs.LOCAL_RANK else: local_rank = rank - - - global _WORLD if _WORLD is None: - ranks = list(range(world_size)) + ranks = [[i for i in range(world_size)]] # offset the distributed group - if all([ - envs.VLLM_DISAGG_PREFILL_ROLE is not None, - envs.VLLM_DISAGG_PREFILL_ROLE == "decode"]): - # when initializing distributed environment - ranks = list(range(world_size, 2 * world_size)) + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + ranks = include_decoding_groups_if_disagg_enabled(ranks, world_size) _WORLD = init_world_group(ranks, local_rank, backend) + logger.debug("_WORLD initialized for rank %d", torch.distributed.get_rank()) + time.sleep(5) else: assert _WORLD.world_size == torch.distributed.get_world_size(), ( "world group already initialized with a different world size") @@ -947,27 +939,21 @@ def initialize_model_parallel( with a total of 16 GPUs, rank 0 to 7 belong to the first box and ranks 8 to 15 belong to the second box. - Disaggregated prefill will also initialize using this function. 
- Why: disaggregated prefill is similar to pipeline parallel - except that disaggregated prefill does not partition model - Methodology: - - Only change variables in this file - - Any variable outside this file should be unchanged - Modifications: - - World size in vLLM variables (like in `ParallelConfig`): unchanged - - World size in `torch.distributed`: doubled (2 * tp * pp) - - Rank: + + Disaggregated prefill will also initialize its process group using this function. + Changes: + - vLLM world size: unchanged (tp * pp) + - torch.distributed.get_world_size(): + - 2 * tp * pp + - Why: torch.distributed package sees 2 vLLM instances (prefill and decode) + - Global rank: - [0, tp * pp) for prefill - [tp * pp, 2 * tp * pp) for decode - Parallel groups - - Unchanged for prefill - - Offseted by tp * pp for decode + - Extend _WORLD, _TP and _PP using `include_decoding_groups_if_disagg_enabled` - Add a new parallel group `_DISAGG` for disaggregated prefill - - [0, tp * pp], [1, tp * pp + 1], .. + - [ [0, tp * pp], [1, tp * pp + 1], .. ] - Local rank: unchanged - - Thanks to PP implementation, distributed operations only rely on - local rank. This guarantees the communications inside the - prefill instance and decode instance are unchanged. """ # Get world size and rank. Ensure some consistencies. @@ -976,37 +962,11 @@ def initialize_model_parallel( backend = backend or torch.distributed.get_backend( get_world_group().device_group) if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - logger.debug("Disaggregated prefill enabled, the world size obtained from torch.distributed (2 * tp * pp) should be decreased to align with vLLM world size (tp * pp)") + # Disaggregated prefill enabled + # The world_size for this vLLM instance is tp * pp, but torch.distributed contains 2 vLLM instances, its world size is 2 * tp * pp + # Adjust the world_size to match. 
world_size = world_size // 2 - time.sleep(torch.distributed.get_rank()) - - - - - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( - "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") - logger.debug("Disaggregated prefill enabled, create _DISAGG group") - group_ranks = [] - for i in range(world_size): - # prefill local rank: i - # decode global rank: i + world_size - group_ranks.append([i, i + world_size]) - logger.debug("Distributed group is %s", str(group_ranks)) - _DISAGG = init_model_parallel_group( - group_ranks, - int(envs.VLLM_DISAGG_PREFILL_ROLE == "decode"), - backend, - use_custom_allreduce=False) - - logger.debug("Success, rank %d", torch.distributed.get_rank()) - time.sleep(1000) - - - - - if (world_size != tensor_model_parallel_size * pipeline_model_parallel_size): raise RuntimeError( @@ -1025,20 +985,14 @@ def initialize_model_parallel( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) group_ranks.append(ranks) - if envs.VLLM_DISAGG_PREFILL_ROLE == "decode": - logger.debug("Current instance is decode instance") - logger.debug("Offset the _TP ranks by %d", world_size) - group_ranks = offset_distributed_groups( - group_ranks, - world_size - ) - - + group_ranks = include_decoding_groups_if_disagg_enabled(group_ranks, world_size) # message queue broadcaster is only used in tensor model parallel group _TP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_message_queue_broadcaster=True) + logger.debug("_TP initialized for rank %d", torch.distributed.get_rank()) + time.sleep(5) # Build the pipeline model-parallel groups. 
num_pipeline_model_parallel_groups: int = (world_size // @@ -1050,18 +1004,31 @@ def initialize_model_parallel( for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) group_ranks.append(ranks) - if envs.VLLM_DISAGG_PREFILL_ROLE == "decode": - logger.debug("Current instance is decode instance") - logger.debug("Offset the _PP ranks by %d", world_size) - group_ranks = offset_distributed_groups( - group_ranks, - world_size - ) + group_ranks = include_decoding_groups_if_disagg_enabled(group_ranks, world_size) # pipeline parallel does not need custom allreduce _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False) + logger.debug("_PP initialized for rank %d", torch.distributed.get_rank()) + time.sleep(5) + + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + global _DISAGG + logger.debug("Disaggregated prefill enabled, create _DISAGG group") + group_ranks = [] + for i in range(world_size): + # prefill local rank: i + # decode global rank: i + world_size + group_ranks.append([i, i + world_size]) + logger.debug("Distributed group is %s", str(group_ranks)) + _DISAGG = init_model_parallel_group( + group_ranks, + int(envs.VLLM_DISAGG_PREFILL_ROLE == "decode"), + backend, + use_custom_allreduce=False) + logger.debug("_DISAGG initialized for rank %d", torch.distributed.get_rank()) + time.sleep(5) def ensure_model_parallel_initialized( From 3dd2275cea617f85925ac854a0674a2f2d83d4e8 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 18:56:01 -0700 Subject: [PATCH 078/278] add disaggregated prefilling for flashinfer --- vllm/attention/backends/flashinfer.py | 55 +++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index daff76051a95..0a76962bdd91 100644 --- a/vllm/attention/backends/flashinfer.py +++ 
b/vllm/attention/backends/flashinfer.py @@ -23,11 +23,18 @@ from vllm.sequence import SequenceGroupMetadata from vllm.utils import get_kv_cache_torch_dtype, make_tensor_with_pad +# This group is used for KV cache transfer in disaggregated prefilling +from vllm.distributed import get_disagg_group + +# To identify if the VLLM_DISAGG_PREFILL_ROLE is set or no +import vllm.envs as envs + if TYPE_CHECKING: from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPUBuilder) + class FlashInferBackend(AttentionBackend): @staticmethod @@ -479,6 +486,20 @@ def forward( if attn_metadata.num_decode_tokens > 0: assert attn_metadata.num_prefill_tokens == 0, ( "Chunked prefill is not supported with flashinfer yet.") + + prefill_meta = attn_metadata.prefill_metadata + + if all([ + kv_cache is not None, # we are not in profile run + prefill_meta is not None, # during prefill stage + envs.VLLM_DISAGG_PREFILL_ROLE is not None, # disagg prefill enabled + ]): + if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": + get_disagg_group().send(key) + get_disagg_group().send(value) + else: + key = get_disagg_group().recv(key.shape, key.dtype) + value = get_disagg_group().recv(value.shape, value.dtype) if kv_cache is not None: # Use the same reshape and cache kernel as flash attention. @@ -493,7 +514,7 @@ def forward( query = query.contiguous( ) # Flashinfer requires query to be contiguous - if prefill_meta := attn_metadata.prefill_metadata: + if prefill_meta is not None: # We will use flash attention for prefill # when kv_cache is not provided. 
# This happens when vllm runs the profiling to @@ -515,11 +536,26 @@ def forward( else: assert prefill_meta is not None assert prefill_meta.prefill_wrapper is not None - output = prefill_meta.prefill_wrapper.forward( - query, - kv_cache, - logits_soft_cap=attn_metadata.logits_soft_cap, - causal=True) + + if not all([ + envs.VLLM_DISAGG_PREFILL_ROLE is not None, + envs.VLLM_DISAGG_PREFILL_ROLE == "decode", + ]): # Only skip prefill for disagg decode instance + output = prefill_meta.prefill_wrapper.forward( + query, + kv_cache, + logits_soft_cap=attn_metadata.logits_soft_cap, + causal=True) + output = output.view(num_tokens, hidden_size) + + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + # communication for disaggregated prefill. + if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": + get_disagg_group().send(output) + else: + # Kuntai: This assume that output has the same dtype as key + # Is this assumption true? + output = get_disagg_group().recv([num_tokens, hidden_size], key.dtype) else: assert attn_metadata.decode_metadata is not None assert attn_metadata.decode_metadata.decode_wrapper is not None @@ -528,4 +564,9 @@ def forward( kv_cache, sm_scale=self.scale, logits_soft_cap=attn_metadata.logits_soft_cap) - return output.view(num_tokens, hidden_size) + output = output.view(num_tokens, hidden_size) + + + + + return output From 2b13f3ca1803444262542d49315bd01a78092c8b Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 18:56:38 -0700 Subject: [PATCH 079/278] adjust comments --- vllm/distributed/parallel_state.py | 12 +++++------- vllm/envs.py | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 7be0fefc3d33..5188b71d95b9 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -764,7 +764,11 @@ def get_pp_group() -> GroupCoordinator: "pipeline model parallel group is not initialized") return _PP - + +# kept for backward 
compatibility +get_pipeline_model_parallel_group = get_pp_group + + _DISAGG: Optional[GroupCoordinator] = None def get_disagg_group() -> GroupCoordinator: @@ -773,9 +777,6 @@ def get_disagg_group() -> GroupCoordinator: return _DISAGG -# kept for backward compatibility -get_pipeline_model_parallel_group = get_pp_group - @contextmanager def graph_capture(): @@ -992,7 +993,6 @@ def initialize_model_parallel( backend, use_message_queue_broadcaster=True) logger.debug("_TP initialized for rank %d", torch.distributed.get_rank()) - time.sleep(5) # Build the pipeline model-parallel groups. num_pipeline_model_parallel_groups: int = (world_size // @@ -1011,7 +1011,6 @@ def initialize_model_parallel( backend, use_custom_allreduce=False) logger.debug("_PP initialized for rank %d", torch.distributed.get_rank()) - time.sleep(5) if envs.VLLM_DISAGG_PREFILL_ROLE is not None: global _DISAGG @@ -1028,7 +1027,6 @@ def initialize_model_parallel( backend, use_custom_allreduce=False) logger.debug("_DISAGG initialized for rank %d", torch.distributed.get_rank()) - time.sleep(5) def ensure_model_parallel_initialized( diff --git a/vllm/envs.py b/vllm/envs.py index 0952b299a4a6..d409bba2292f 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -297,7 +297,7 @@ def get_default_config_root(): lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")), # Specify the role of current vllm instance - # Value can be "prefill", "decode" or None. + # Value can be "prefill", "decode". 
"VLLM_DISAGG_PREFILL_ROLE": lambda: os.getenv("VLLM_DISAGG_PREFILL_ROLE", None), From 8c3f209f35b30df74dc5373c68ffe7be7dd1dc43 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 19:30:57 -0700 Subject: [PATCH 080/278] add logging for send and recv --- vllm/distributed/parallel_state.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 5188b71d95b9..7889b4829648 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -666,6 +666,8 @@ def barrier(self): def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: """Sends a tensor to the destination rank in a non-blocking way""" """NOTE: `dst` is the local rank of the destination rank.""" + + logger.debug("Sending tensor: ", tensor.shape, tensor.dtype, dst) if dst is None: dst = (self.rank_in_group + 1) % self.world_size @@ -681,6 +683,8 @@ def recv(self, src: Optional[int] = None) -> torch.Tensor: """Receives a tensor from the src rank.""" """NOTE: `src` is the local rank of the destination rank.""" + + logger.debug("Recving tensor: ", size, dtype, src) if src is None: src = (self.rank_in_group - 1) % self.world_size From c6a5e5759c1b843db2a4223b626c345b37d5ef9f Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 19:46:23 -0700 Subject: [PATCH 081/278] turn off chunked prefill to use flashinfer kernel --- examples/disaggregated_prefill_example.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index c0009ae0b17d..47ed5434a0ce 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -13,7 +13,7 @@ VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ -tp 4 \ --disable-log-stats \ --disable-log-requests \ - --enable-chunked-prefill & + --enable-prefix-caching & # decoding instance 
VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ @@ -23,6 +23,6 @@ VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -tp 4 \ --disable-log-stats \ --disable-log-requests \ - --enable-chunked-prefill & + --enable-prefix-caching & From b3c47f3c5db69636db77ccf01617e1d0c0c55513 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 19:49:50 -0700 Subject: [PATCH 082/278] confirm which backend is being used --- vllm/attention/backends/flash_attn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index b8a64205b362..ddf99802fd99 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -473,6 +473,7 @@ def forward( # Reshape the input keys and values and store them in the cache. # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. + print("I am in flash attn") ops.reshape_and_cache_flash( key, value, From f05540c65d46b176c84b1dd645c989921042ea01 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 19:54:33 -0700 Subject: [PATCH 083/278] remove debugging from parallel_state, its too much... 
--- vllm/distributed/parallel_state.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 7889b4829648..5188b71d95b9 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -666,8 +666,6 @@ def barrier(self): def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: """Sends a tensor to the destination rank in a non-blocking way""" """NOTE: `dst` is the local rank of the destination rank.""" - - logger.debug("Sending tensor: ", tensor.shape, tensor.dtype, dst) if dst is None: dst = (self.rank_in_group + 1) % self.world_size @@ -683,8 +681,6 @@ def recv(self, src: Optional[int] = None) -> torch.Tensor: """Receives a tensor from the src rank.""" """NOTE: `src` is the local rank of the destination rank.""" - - logger.debug("Recving tensor: ", size, dtype, src) if src is None: src = (self.rank_in_group - 1) % self.world_size From eb96fe739cf16a41dae1ff0966d9af2488590a8d Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 19:58:00 -0700 Subject: [PATCH 084/278] add disagg prefill for flash attn backend --- vllm/attention/backends/flash_attn.py | 68 +++++++++++++++++++++------ 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index ddf99802fd99..69ae57364fdb 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -16,10 +16,18 @@ from vllm.sequence import SequenceGroupMetadata from vllm.utils import make_tensor_with_pad +# This group is used for KV cache transfer in disaggregated prefilling +from vllm.distributed import get_disagg_group + +# To identify if the VLLM_DISAGG_PREFILL_ROLE is set or no +import vllm.envs as envs +from vllm.logger import init_logger + if TYPE_CHECKING: from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPUBuilder) +logger = init_logger(__name__) class 
FlashAttentionBackend(AttentionBackend): @@ -466,6 +474,21 @@ def forward( key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) + if all([ + kv_cache is not None, # we are not in profile run + prefill_meta is not None, # during prefill stage + envs.VLLM_DISAGG_PREFILL_ROLE is not None, # disagg prefill enabled + ]): + if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": + logger.debug("Sending key & value, ", key.shape, key.dtype, value.shape, value.dtype) + get_disagg_group().send(key) + get_disagg_group().send(value) + else: + logger.debug("Recving key & value, ", key.shape, key.dtype, value.shape, value.dtype) + key = get_disagg_group().recv(key.shape, key.dtype) + value = get_disagg_group().recv(value.shape, value.dtype) + + if kv_cache is not None: key_cache = kv_cache[0] value_cache = kv_cache[1] @@ -473,7 +496,6 @@ def forward( # Reshape the input keys and values and store them in the cache. # If kv_cache is not provided, the new key and value tensors are # not cached. This happens during the initial memory profiling run. 
- print("I am in flash attn") ops.reshape_and_cache_flash( key, value, @@ -525,19 +547,37 @@ def forward( # prefix-enabled attention assert prefill_meta.seq_lens is not None max_seq_len = max(prefill_meta.seq_lens) - output[:num_prefill_tokens] = flash_attn_varlen_func( - q=query, - k=key_cache, - v=value_cache, - cu_seqlens_q=prefill_meta.query_start_loc, - max_seqlen_q=prefill_meta.max_query_len, - cu_seqlens_k=prefill_meta.seq_start_loc, - max_seqlen_k=max_seq_len, - softmax_scale=self.scale, - causal=True, - alibi_slopes=self.alibi_slopes, - block_table=prefill_meta.block_tables, - ) + + if not all([ + envs.VLLM_DISAGG_PREFILL_ROLE is not None, + envs.VLLM_DISAGG_PREFILL_ROLE == "decode", + ]): # Only skip prefill for disagg decode instance + output[:num_prefill_tokens] = flash_attn_varlen_func( + q=query, + k=key_cache, + v=value_cache, + cu_seqlens_q=prefill_meta.query_start_loc, + max_seqlen_q=prefill_meta.max_query_len, + cu_seqlens_k=prefill_meta.seq_start_loc, + max_seqlen_k=max_seq_len, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + block_table=prefill_meta.block_tables, + ) + output.view(num_tokens, hidden_size) + + if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + # communication for disaggregated prefill. + if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": + logger.info("Sending output, " , output.shape, output.dtype) + get_disagg_group().send(output) + else: + logger.info("Recv output, " , output.shape, output.dtype) + # Kuntai: This assume that output has the same dtype as key + # Is this assumption true? + output = get_disagg_group().recv([num_tokens, hidden_size], key.dtype) + if decode_meta := attn_metadata.decode_metadata: # Decoding run. 
From 09d5588b06285a43912d8ca5372a67a04c30ec4e Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 20:00:12 -0700 Subject: [PATCH 085/278] edit flash attn to assign prefill_meta first --- vllm/attention/backends/flash_attn.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 69ae57364fdb..00846efac521 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -474,6 +474,8 @@ def forward( key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) + prefill_meta = attn_metadata.prefill_metadata + if all([ kv_cache is not None, # we are not in profile run prefill_meta is not None, # during prefill stage @@ -521,7 +523,7 @@ def forward( assert query.shape[0] == num_prefill_tokens assert decode_query.shape[0] == num_decode_tokens - if prefill_meta := attn_metadata.prefill_metadata: + if prefill_meta is not None: # Prompt run. 
if (kv_cache is None or prefill_meta.block_tables is None or prefill_meta.block_tables.numel() == 0): From 43077e7d3dad51139c694211f9ef8f7110c0a96c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 20:04:02 -0700 Subject: [PATCH 086/278] use print instead of attn --- vllm/attention/backends/flash_attn.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 00846efac521..9ca3909fa4e8 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -482,11 +482,11 @@ def forward( envs.VLLM_DISAGG_PREFILL_ROLE is not None, # disagg prefill enabled ]): if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": - logger.debug("Sending key & value, ", key.shape, key.dtype, value.shape, value.dtype) + print("Sending key & value, ", key.shape, key.dtype, value.shape, value.dtype) get_disagg_group().send(key) get_disagg_group().send(value) else: - logger.debug("Recving key & value, ", key.shape, key.dtype, value.shape, value.dtype) + print("Recving key & value, ", key.shape, key.dtype, value.shape, value.dtype) key = get_disagg_group().recv(key.shape, key.dtype) value = get_disagg_group().recv(value.shape, value.dtype) @@ -572,10 +572,10 @@ def forward( if envs.VLLM_DISAGG_PREFILL_ROLE is not None: # communication for disaggregated prefill. if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": - logger.info("Sending output, " , output.shape, output.dtype) + print("Sending output, " , output.shape, output.dtype) get_disagg_group().send(output) else: - logger.info("Recv output, " , output.shape, output.dtype) + print("Recv output, " , output.shape, output.dtype) # Kuntai: This assume that output has the same dtype as key # Is this assumption true? 
output = get_disagg_group().recv([num_tokens, hidden_size], key.dtype) From f7167379656fa6bf5b7f1483b2cfc63a2fd6a300 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 20:05:12 -0700 Subject: [PATCH 087/278] make data contiguous --- vllm/attention/backends/flash_attn.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 9ca3909fa4e8..3892cec9aa19 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -482,6 +482,8 @@ def forward( envs.VLLM_DISAGG_PREFILL_ROLE is not None, # disagg prefill enabled ]): if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": + key = key.contiguous() + value = value.contiguous() print("Sending key & value, ", key.shape, key.dtype, value.shape, value.dtype) get_disagg_group().send(key) get_disagg_group().send(value) @@ -567,7 +569,7 @@ def forward( alibi_slopes=self.alibi_slopes, block_table=prefill_meta.block_tables, ) - output.view(num_tokens, hidden_size) + output = output.view(num_tokens, hidden_size).contiguous() if envs.VLLM_DISAGG_PREFILL_ROLE is not None: # communication for disaggregated prefill. 
From 0d072519caad47881f9e3d04da336af2fd629ed3 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 20:09:59 -0700 Subject: [PATCH 088/278] add more debug message --- vllm/attention/backends/flash_attn.py | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 3892cec9aa19..c30d323de7f8 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -474,25 +474,6 @@ def forward( key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) - prefill_meta = attn_metadata.prefill_metadata - - if all([ - kv_cache is not None, # we are not in profile run - prefill_meta is not None, # during prefill stage - envs.VLLM_DISAGG_PREFILL_ROLE is not None, # disagg prefill enabled - ]): - if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": - key = key.contiguous() - value = value.contiguous() - print("Sending key & value, ", key.shape, key.dtype, value.shape, value.dtype) - get_disagg_group().send(key) - get_disagg_group().send(value) - else: - print("Recving key & value, ", key.shape, key.dtype, value.shape, value.dtype) - key = get_disagg_group().recv(key.shape, key.dtype) - value = get_disagg_group().recv(value.shape, value.dtype) - - if kv_cache is not None: key_cache = kv_cache[0] value_cache = kv_cache[1] @@ -525,7 +506,7 @@ def forward( assert query.shape[0] == num_prefill_tokens assert decode_query.shape[0] == num_decode_tokens - if prefill_meta is not None: + if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. 
if (kv_cache is None or prefill_meta.block_tables is None or prefill_meta.block_tables.numel() == 0): @@ -556,6 +537,7 @@ def forward( envs.VLLM_DISAGG_PREFILL_ROLE is not None, envs.VLLM_DISAGG_PREFILL_ROLE == "decode", ]): # Only skip prefill for disagg decode instance + logger.debug("Do prefill") output[:num_prefill_tokens] = flash_attn_varlen_func( q=query, k=key_cache, From 2177737e44f0d855bc8c9d0d4e808463ad28122e Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 20:44:16 -0700 Subject: [PATCH 089/278] turn on logging --- examples/disaggregated_prefill_example.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index 47ed5434a0ce..601420fabd75 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -11,8 +11,6 @@ VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ --port 8100 \ -tp 4 \ - --disable-log-stats \ - --disable-log-requests \ --enable-prefix-caching & # decoding instance @@ -21,8 +19,6 @@ VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ --port 8200 \ -tp 4 \ - --disable-log-stats \ - --disable-log-requests \ --enable-prefix-caching & From a293bd08948bf7e2c028e01a598c8233527b4bee Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 20:49:17 -0700 Subject: [PATCH 090/278] more debug prints in flash_attn --- vllm/attention/backends/flash_attn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index c30d323de7f8..91b080c254a5 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -513,6 +513,7 @@ def forward( # normal attention # When block_tables are not filled, it means q and k are the # prompt, and they have the same 
length. + print("profile run ", end="") out = flash_attn_varlen_func( q=query, k=key, @@ -532,6 +533,7 @@ def forward( # prefix-enabled attention assert prefill_meta.seq_lens is not None max_seq_len = max(prefill_meta.seq_lens) + print("non promt_run") if not all([ envs.VLLM_DISAGG_PREFILL_ROLE is not None, From cc7f646a511771f505f4e52ae99765c7d27d6f63 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 21:01:08 -0700 Subject: [PATCH 091/278] remove enforce eager --- examples/disaggregated_prefill_example.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index 601420fabd75..1df1aba0e4a6 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -12,6 +12,7 @@ VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ --port 8100 \ -tp 4 \ --enable-prefix-caching & + # decoding instance VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ From 68f3d16511afc575038c3cc0c9936ca8209ec92e Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 21:01:25 -0700 Subject: [PATCH 092/278] adjust printing order in flash attn --- vllm/attention/backends/flash_attn.py | 69 +++++++++++++-------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 91b080c254a5..1594bbb76d04 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -505,15 +505,19 @@ def forward( assert query.shape[0] == num_prefill_tokens assert decode_query.shape[0] == num_decode_tokens - - if prefill_meta := attn_metadata.prefill_metadata: + + prefill_meta = attn_metadata.prefill_metadata + if (prefill_meta is not None) and ( + (envs.VLLM_DISAGG_PREFILL_ROLE is None) + or + (envs.VLLM_DISAGG_PREFILL_ROLE == "prefill") + ): # during prefilling, and this instance is not disagg decode instance 
# Prompt run. if (kv_cache is None or prefill_meta.block_tables is None or prefill_meta.block_tables.numel() == 0): # normal attention # When block_tables are not filled, it means q and k are the # prompt, and they have the same length. - print("profile run ", end="") out = flash_attn_varlen_func( q=query, k=key, @@ -533,38 +537,33 @@ def forward( # prefix-enabled attention assert prefill_meta.seq_lens is not None max_seq_len = max(prefill_meta.seq_lens) - print("non promt_run") - - if not all([ - envs.VLLM_DISAGG_PREFILL_ROLE is not None, - envs.VLLM_DISAGG_PREFILL_ROLE == "decode", - ]): # Only skip prefill for disagg decode instance - logger.debug("Do prefill") - output[:num_prefill_tokens] = flash_attn_varlen_func( - q=query, - k=key_cache, - v=value_cache, - cu_seqlens_q=prefill_meta.query_start_loc, - max_seqlen_q=prefill_meta.max_query_len, - cu_seqlens_k=prefill_meta.seq_start_loc, - max_seqlen_k=max_seq_len, - softmax_scale=self.scale, - causal=True, - alibi_slopes=self.alibi_slopes, - block_table=prefill_meta.block_tables, - ) - output = output.view(num_tokens, hidden_size).contiguous() - - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - # communication for disaggregated prefill. - if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": - print("Sending output, " , output.shape, output.dtype) - get_disagg_group().send(output) - else: - print("Recv output, " , output.shape, output.dtype) - # Kuntai: This assume that output has the same dtype as key - # Is this assumption true? 
- output = get_disagg_group().recv([num_tokens, hidden_size], key.dtype) + + output[:num_prefill_tokens] = flash_attn_varlen_func( + q=query, + k=key_cache, + v=value_cache, + cu_seqlens_q=prefill_meta.query_start_loc, + max_seqlen_q=prefill_meta.max_query_len, + cu_seqlens_k=prefill_meta.seq_start_loc, + max_seqlen_k=max_seq_len, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + block_table=prefill_meta.block_tables, + ) + + if (prefill_meta is not None) and \ + (envs.VLLM_DISAGG_PREFILL_ROLE is not None): + # communication for disaggregated prefill. + if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": + output = output.view(num_tokens, hidden_size).contiguous() + print("Sending output, " , output.shape, output.dtype) + get_disagg_group().send(output) + else: + print("Recv output, " , output.shape, output.dtype) + # Kuntai: This assume that output has the same dtype as key + # Is this assumption true? + output = get_disagg_group().recv([num_tokens, hidden_size], key.dtype) if decode_meta := attn_metadata.decode_metadata: From 21a61b9e027f074ab3d707e0212a5d08bd417162 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 21:05:01 -0700 Subject: [PATCH 093/278] avoid sending & receiving output tensor during profile run --- vllm/attention/backends/flash_attn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 1594bbb76d04..3df3f4ac402c 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -552,7 +552,7 @@ def forward( block_table=prefill_meta.block_tables, ) - if (prefill_meta is not None) and \ + if (prefill_meta is not None) and (kv_cache is not None) and \ (envs.VLLM_DISAGG_PREFILL_ROLE is not None): # communication for disaggregated prefill. 
if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": From 691cad78164f73e127de7dec0c515e513289fd44 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 21:17:14 -0700 Subject: [PATCH 094/278] also log the device --- vllm/attention/backends/flash_attn.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 3df3f4ac402c..ffb8f954e28c 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -557,14 +557,15 @@ def forward( # communication for disaggregated prefill. if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": output = output.view(num_tokens, hidden_size).contiguous() - print("Sending output, " , output.shape, output.dtype) + print("Sending output, " , output.shape, output.dtype, output.device) get_disagg_group().send(output) else: - print("Recv output, " , output.shape, output.dtype) + print("Recv output, " , output.shape, output.dtype, output.device) # Kuntai: This assume that output has the same dtype as key # Is this assumption true? output = get_disagg_group().recv([num_tokens, hidden_size], key.dtype) - + import time + time.sleep(10) if decode_meta := attn_metadata.decode_metadata: # Decoding run. From c057f1935949067d800342764b942291532bd9ac Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 21:51:28 -0700 Subject: [PATCH 095/278] adjust implementation --- vllm/attention/backends/flash_attn.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index ffb8f954e28c..6928e05530c8 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -556,14 +556,13 @@ def forward( (envs.VLLM_DISAGG_PREFILL_ROLE is not None): # communication for disaggregated prefill. 
if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": - output = output.view(num_tokens, hidden_size).contiguous() - print("Sending output, " , output.shape, output.dtype, output.device) - get_disagg_group().send(output) + out = output[:num_prefill_tokens].contiguous() + print("Sending out, " , out.shape, out.dtype, out.device) + get_disagg_group().send(out) else: - print("Recv output, " , output.shape, output.dtype, output.device) - # Kuntai: This assume that output has the same dtype as key - # Is this assumption true? - output = get_disagg_group().recv([num_tokens, hidden_size], key.dtype) + + print("Recv out, " , output[:num_prefill_tokens].shape, output.dtype, output.device) + output[:num_prefill_tokens] = get_disagg_group().recv(output[:num_prefill_tokens].shape, output.dtype) import time time.sleep(10) From 82b73bbe0e8fc3693492449d28f9d9ca5270f7fc Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 21:54:53 -0700 Subject: [PATCH 096/278] finish adjustment --- vllm/attention/backends/flash_attn.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 6928e05530c8..0563e33b6c3c 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -563,9 +563,7 @@ def forward( print("Recv out, " , output[:num_prefill_tokens].shape, output.dtype, output.device) output[:num_prefill_tokens] = get_disagg_group().recv(output[:num_prefill_tokens].shape, output.dtype) - import time - time.sleep(10) - + if decode_meta := attn_metadata.decode_metadata: # Decoding run. 
output[num_prefill_tokens:] = flash_attn_with_kvcache( From 6db1d48d3cc5cfe33db5d69035db504ab17fe442 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 23:53:59 -0700 Subject: [PATCH 097/278] fall back to original flashinfer --- vllm/attention/backends/flashinfer.py | 123 +++++++++----------------- 1 file changed, 41 insertions(+), 82 deletions(-) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 0a76962bdd91..8271efe330c9 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -20,19 +20,11 @@ from vllm.attention.backends.utils import (PAD_SLOT_ID, compute_slot_mapping, compute_slot_mapping_start_idx, is_block_tables_empty) -from vllm.sequence import SequenceGroupMetadata +from vllm.attention.ops.paged_attn import PagedAttention from vllm.utils import get_kv_cache_torch_dtype, make_tensor_with_pad -# This group is used for KV cache transfer in disaggregated prefilling -from vllm.distributed import get_disagg_group - -# To identify if the VLLM_DISAGG_PREFILL_ROLE is set or no -import vllm.envs as envs - if TYPE_CHECKING: - from vllm.worker.model_runner import (GPUModelRunnerBase, - ModelInputForGPUBuilder) - + from vllm.worker.model_runner import ModelInputForGPUBuilder class FlashInferBackend(AttentionBackend): @@ -68,14 +60,14 @@ def swap_blocks( dst_kv_cache: torch.Tensor, src_to_dst: torch.Tensor, ) -> None: - raise NotImplementedError + PagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst) @staticmethod def copy_blocks( kv_caches: List[torch.Tensor], src_to_dists: torch.Tensor, ) -> None: - raise NotImplementedError + PagedAttention.copy_blocks(kv_caches, src_to_dists) @staticmethod def get_supported_head_sizes() -> List[int]: @@ -222,6 +214,9 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): self.num_prefill_tokens = 0 self.num_decode_tokens = 0 + self.input_builder = input_builder + self.runner = input_builder.runner + 
self.sliding_window = input_builder.sliding_window self.block_size = input_builder.block_size self.use_v2_block_manager = ( @@ -244,26 +239,24 @@ def __init__(self, input_builder: "ModelInputForGPUBuilder"): # paged_kv_last_page_len is the length of the last page of each request self.paged_kv_last_page_len: List[int] = [] - def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata, - token_lens: List[int], seq_lens: List[int], - curr_seq_lens: List[int], query_lens: List[int], - context_lens: List[int], - curr_sliding_window_blocks: List[int], - prefix_cache_hit: bool, chunked_prefill_enabled: bool): + def _add_seq_group( + self, inter_data: "ModelInputForGPUBuilder.InterDataForSeqGroup", + chunked_prefill_enabled: bool): """Add a sequence group to the metadata. Specifically update/append 1. context length. 2. block table. 3. slot mapping. """ - is_prompt = seq_group_metadata.is_prompt - block_tables = seq_group_metadata.block_tables - computed_block_nums = seq_group_metadata.computed_block_nums + is_prompt = inter_data.is_prompt + block_tables = inter_data.block_tables + computed_block_nums = inter_data.computed_block_nums for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, curr_sliding_window_block) in zip( - seq_group_metadata.seq_data.keys(), token_lens, seq_lens, - curr_seq_lens, query_lens, context_lens, - curr_sliding_window_blocks): + inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], + inter_data.orig_seq_lens, inter_data.seq_lens, + inter_data.query_lens, inter_data.context_lens, + inter_data.curr_sliding_window_blocks): self.context_lens.append(context_len) if is_prompt: self.num_prefills += 1 @@ -281,7 +274,7 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata, # only allowing multiple of block_size chunk size. # NOTE: This only works for oooooooxxx style attention. 
block_table = [] - if prefix_cache_hit: + if inter_data.prefix_cache_hit: block_table = computed_block_nums elif ((chunked_prefill_enabled or not is_prompt) and block_tables is not None): @@ -296,8 +289,7 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata, self.use_v2_block_manager) compute_slot_mapping(is_profile_run, self.slot_mapping, seq_id, seq_len, context_len, start_idx, - self.block_size, - seq_group_metadata.block_tables) + self.block_size, inter_data.block_tables) # It is not necessary to add paged_kv_indices, paged_kv_indptr, # and paged_kv_last_page_len for profile run because we will @@ -323,9 +315,13 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata, last_page_len = self.block_size self.paged_kv_last_page_len.append(last_page_len) - def build(self, runner: "GPUModelRunnerBase", seq_lens, query_lens, + def build(self, seq_lens: List[int], query_lens: List[int], cuda_graph_pad_size: int, batch_size: int): - device = runner.device + for inter_data in self.input_builder.inter_data_list: + self._add_seq_group(inter_data, + self.input_builder.chunked_prefill_enabled) + + device = self.runner.device use_captured_graph = cuda_graph_pad_size != -1 max_query_len = max(query_lens) @@ -339,7 +335,7 @@ def build(self, runner: "GPUModelRunnerBase", seq_lens, query_lens, # The shape of graph_block_tables is # [max batch size, max context len // block size]. 
- input_block_tables = runner.graph_block_tables[:batch_size] + input_block_tables = self.runner.graph_block_tables[:batch_size] for i, block_table in enumerate(self.block_tables): if block_table: input_block_tables[i, :len(block_table)] = block_table @@ -350,11 +346,8 @@ def build(self, runner: "GPUModelRunnerBase", seq_lens, query_lens, cuda_graph_pad_size) self.paged_kv_last_page_len.extend([0] * cuda_graph_pad_size) else: - max_block_table_len = max( - len(block_table) for block_table in self.block_tables) block_tables = make_tensor_with_pad( self.block_tables, - max_len=max_block_table_len, pad=0, dtype=torch.int, device=device, @@ -386,7 +379,7 @@ def build(self, runner: "GPUModelRunnerBase", seq_lens, query_lens, dtype=torch.long, device=device) - logits_soft_cap = getattr(runner.model_config.hf_config, + logits_soft_cap = getattr(self.runner.model_config.hf_config, "attn_logit_softcapping", None) if len(self.paged_kv_indptr) > 0: @@ -403,8 +396,8 @@ def build(self, runner: "GPUModelRunnerBase", seq_lens, query_lens, paged_kv_indptr_tensor = None paged_kv_last_page_len_tensor = None - kv_cache_dtype = get_kv_cache_torch_dtype(runner.kv_cache_dtype, - runner.model_config.dtype) + kv_cache_dtype = get_kv_cache_torch_dtype( + self.runner.kv_cache_dtype, self.runner.model_config.dtype) return FlashInferMetadata( num_prefills=self.num_prefills, slot_mapping=slot_mapping_tensor, @@ -415,11 +408,11 @@ def build(self, runner: "GPUModelRunnerBase", seq_lens, query_lens, paged_kv_indptr=paged_kv_indptr_tensor, paged_kv_indices=paged_kv_indices_tensor, paged_kv_last_page_len=paged_kv_last_page_len_tensor, - num_qo_heads=runner.model_config.get_num_attention_heads( - runner.parallel_config), - num_kv_heads=runner.model_config.get_num_kv_heads( - runner.parallel_config), - head_dim=runner.model_config.get_head_size(), + num_qo_heads=self.runner.model_config.get_num_attention_heads( + self.runner.parallel_config), + num_kv_heads=self.runner.model_config.get_num_kv_heads( 
+ self.runner.parallel_config), + head_dim=self.runner.model_config.get_head_size(), page_size=self.block_size, seq_start_loc=seq_start_loc, query_start_loc=query_start_loc, @@ -486,20 +479,6 @@ def forward( if attn_metadata.num_decode_tokens > 0: assert attn_metadata.num_prefill_tokens == 0, ( "Chunked prefill is not supported with flashinfer yet.") - - prefill_meta = attn_metadata.prefill_metadata - - if all([ - kv_cache is not None, # we are not in profile run - prefill_meta is not None, # during prefill stage - envs.VLLM_DISAGG_PREFILL_ROLE is not None, # disagg prefill enabled - ]): - if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": - get_disagg_group().send(key) - get_disagg_group().send(value) - else: - key = get_disagg_group().recv(key.shape, key.dtype) - value = get_disagg_group().recv(value.shape, value.dtype) if kv_cache is not None: # Use the same reshape and cache kernel as flash attention. @@ -514,7 +493,7 @@ def forward( query = query.contiguous( ) # Flashinfer requires query to be contiguous - if prefill_meta is not None: + if prefill_meta := attn_metadata.prefill_metadata: # We will use flash attention for prefill # when kv_cache is not provided. # This happens when vllm runs the profiling to @@ -536,26 +515,11 @@ def forward( else: assert prefill_meta is not None assert prefill_meta.prefill_wrapper is not None - - if not all([ - envs.VLLM_DISAGG_PREFILL_ROLE is not None, - envs.VLLM_DISAGG_PREFILL_ROLE == "decode", - ]): # Only skip prefill for disagg decode instance - output = prefill_meta.prefill_wrapper.forward( - query, - kv_cache, - logits_soft_cap=attn_metadata.logits_soft_cap, - causal=True) - output = output.view(num_tokens, hidden_size) - - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - # communication for disaggregated prefill. - if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": - get_disagg_group().send(output) - else: - # Kuntai: This assume that output has the same dtype as key - # Is this assumption true? 
- output = get_disagg_group().recv([num_tokens, hidden_size], key.dtype) + output = prefill_meta.prefill_wrapper.forward( + query, + kv_cache, + logits_soft_cap=attn_metadata.logits_soft_cap, + causal=True) else: assert attn_metadata.decode_metadata is not None assert attn_metadata.decode_metadata.decode_wrapper is not None @@ -564,9 +528,4 @@ def forward( kv_cache, sm_scale=self.scale, logits_soft_cap=attn_metadata.logits_soft_cap) - output = output.view(num_tokens, hidden_size) - - - - - return output + return output.view(num_tokens, hidden_size) \ No newline at end of file From dbaade746704b674fccc41bf76596a5a3234e528 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 23:56:18 -0700 Subject: [PATCH 098/278] add space --- vllm/attention/backends/flashinfer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 8271efe330c9..f7e467e121e7 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -528,4 +528,5 @@ def forward( kv_cache, sm_scale=self.scale, logits_soft_cap=attn_metadata.logits_soft_cap) - return output.view(num_tokens, hidden_size) \ No newline at end of file + return output.view(num_tokens, hidden_size) + \ No newline at end of file From f572db8590fd3f16a2f8e62f2a4ba108ad04f161 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 23:57:34 -0700 Subject: [PATCH 099/278] clean config.py --- vllm/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index a2ca527173ca..c87974d0df16 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -661,7 +661,7 @@ def __init__( self.ray_workers_use_nsight = ray_workers_use_nsight self.placement_group = placement_group - self.world_size = pipeline_parallel_size * tensor_parallel_size + self.world_size = pipeline_parallel_size * self.tensor_parallel_size if worker_use_ray: if 
self.distributed_executor_backend is None: self.distributed_executor_backend = "ray" From 9ebf3ad8d8f4397cff3bc7c061a170531c65ccf5 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 23:58:56 -0700 Subject: [PATCH 100/278] keep flashattn implementation --- vllm/attention/backends/flash_attn.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 0563e33b6c3c..db16a6561270 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -554,14 +554,17 @@ def forward( if (prefill_meta is not None) and (kv_cache is not None) and \ (envs.VLLM_DISAGG_PREFILL_ROLE is not None): - # communication for disaggregated prefill. + # transfer the output if + # 1). during prefilling + # 2). disaggregated prefill enabled + # 3). not in the profile run (kv_cache is not None) + # no need to transfer kv cache, as it is already the input of this function if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": out = output[:num_prefill_tokens].contiguous() - print("Sending out, " , out.shape, out.dtype, out.device) + logger.debug("Send output, " , out.shape, out.dtype, out.device) get_disagg_group().send(out) else: - - print("Recv out, " , output[:num_prefill_tokens].shape, output.dtype, output.device) + logger.debug("Recv output, " , output[:num_prefill_tokens].shape, output.dtype, output.device) output[:num_prefill_tokens] = get_disagg_group().recv(output[:num_prefill_tokens].shape, output.dtype) if decode_meta := attn_metadata.decode_metadata: From 67b1c2eac7234e6d608a5a3063129811092803a1 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 22 Jul 2024 23:59:21 -0700 Subject: [PATCH 101/278] commit changes that will be merged --- vllm/core/block/prefix_caching_block.py | 5 +++++ vllm/model_executor/models/llama.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/core/block/prefix_caching_block.py 
b/vllm/core/block/prefix_caching_block.py index f272e23ee608..2664657e3c26 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -11,6 +11,10 @@ from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor from vllm.utils import cdiv +import vllm.envs as envs +from vllm.distributed import get_disagg_group + + PrefixHash = int # By default, we init our block access time as _DEFAULT_LAST_ACCESSED_TIME @@ -163,6 +167,7 @@ def allocate_immutable_block(self, # No cached block => Allocate a new block block = self.allocate_mutable_block(prev_block) block.append_token_ids(token_ids) + return block def allocate_immutable_blocks( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 4c434e54cf74..5cc27dbbc34e 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -137,7 +137,7 @@ def __init__( quant_config=quant_config, ) - self.rotary_emb = get_rope( + self.rotry_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, From 3abca470ba30ea48655034b2e568e6682c2ea428 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 00:02:28 -0700 Subject: [PATCH 102/278] revert custom allreduce changes --- .../device_communicators/custom_all_reduce.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index b7d5af5a8a0a..a4f30808d32e 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -1,6 +1,5 @@ from contextlib import contextmanager from typing import Any, List, Optional, Union -import logging import torch import torch.distributed as dist @@ -23,17 +22,6 @@ logger = init_logger(__name__) -class ConditionalLoggingHandler(logging.Handler): - def emit(self, record): - dist = torch.distributed 
- try: - if not dist.is_initialized() or (dist.is_initialized() and dist.get_rank() % 4 == 0): - msg = self.format(record) - print(msg) # You can replace this with any other logging mechanism you prefer - except Exception: - pass -logger.addHandler(ConditionalLoggingHandler()) - def _can_p2p(rank: int, world_size: int) -> bool: for i in range(world_size): From 0ce251b90d3421538ee73973fa49f4a46bf1427f Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 00:03:17 -0700 Subject: [PATCH 103/278] remove debug logs from the file --- vllm/distributed/device_communicators/pynccl.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 36f1e04aec79..f159d43a47c9 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -71,11 +71,8 @@ def __init__( self.unique_id = ncclUniqueId() tensor = torch.ByteTensor(list(self.unique_id.internal)) ranks = dist.get_process_group_ranks(group) - logger.debug("Group: %s, group rank: %s, world size: %s, src: %s", str(group), str(self.rank), str(self.world_size), ranks[0]) - # arg `src` in `broadcast` is the global rank dist.broadcast(tensor, src=ranks[0], group=group) - logger.debug("dist broadcast succeeded") byte_list = tensor.tolist() for i, byte in enumerate(byte_list): self.unique_id.internal[i] = byte From 1f3ac2bcf96299e3414794156d80c329440227f4 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 00:04:29 -0700 Subject: [PATCH 104/278] revert changes to prefix_caching_block --- unnecessary --- vllm/core/block/prefix_caching_block.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py index a7eacadd03b0..d102ad404559 100644 --- a/vllm/core/block/prefix_caching_block.py +++ b/vllm/core/block/prefix_caching_block.py @@ -11,10 +11,6 @@ from vllm.core.evictor_v2 import 
EvictionPolicy, Evictor, make_evictor from vllm.utils import cdiv -import vllm.envs as envs -from vllm.distributed import get_disagg_group - - PrefixHash = int # By default, we init our block access time as _DEFAULT_LAST_ACCESSED_TIME @@ -167,7 +163,6 @@ def allocate_immutable_block(self, # No cached block => Allocate a new block block = self.allocate_mutable_block(prev_block) block.append_token_ids(token_ids) - return block def allocate_immutable_blocks( From c93bf33810c7e6d45b7b7c206e3b8d7421f8ce9e Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 00:05:14 -0700 Subject: [PATCH 105/278] revert changes --- vllm/distributed/device_communicators/pynccl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index f159d43a47c9..731956654567 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -41,7 +41,6 @@ def __init__( self.rank = dist.get_rank(group) self.world_size = dist.get_world_size(group) - # if world_size == 1, no need to create communicator if self.world_size == 1: self.available = False From 8dcaf43df0ee8da38e70dd2359cb7da58ddedcbd Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 00:06:42 -0700 Subject: [PATCH 106/278] fix typos --- vllm/model_executor/models/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 88b6fc62e963..2052c443a888 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -148,7 +148,7 @@ def __init__( prefix=f"{prefix}.o_proj", ) - self.rotry_emb = get_rope( + self.rotary_emb = get_rope( self.head_dim, rotary_dim=self.head_dim, max_position=max_position_embeddings, From 4d83813d7934e8edc88f656ef079e9be74a0e560 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 00:15:08 -0700 Subject: [PATCH 107/278] 
add example usage to disaggregated prefill --- examples/disaggregated_prefill_example.sh | 46 +++++++++++++++++++++-- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index 1df1aba0e4a6..571bc7072a8e 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -1,9 +1,19 @@ +#!/bin/bash +# This file demonstrates the example usage of disaggregated prefilling +# We will launch 2 vllm instances (1 for prefill and 1 for decode), +# and then transfer the KV cache between them. export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 -# export VLLM_TRACE_FUNCTION=1 -# export TORCH_DISTRIBUTED_DEBUG=DETAIL -# export GLOO_LOGGING_LEVEL=TRACE + +# a function that waits vLLM server to start +wait_for_server() { + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} # prefilling instance VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ @@ -12,7 +22,6 @@ VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ --port 8100 \ -tp 4 \ --enable-prefix-caching & - # decoding instance VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ @@ -23,3 +32,32 @@ VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ --enable-prefix-caching & +wait_for_server 8100 +wait_for_server 8200 + +# sending an example request +# in disaggregated prefilling, there are two steps of sending a request: +# 1. send the request to prefill instance, with max_tokens set to 1 +# 2. 
send the request again to decode instance, no modification + +# send to prefill instance +curl http://localhost:8100/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "neuralmagic/Meta-Llama-3-70B-Instruct-FP8", +"prompt": "San Francisco is a", +"max_tokens": 1, +"temperature": 0 +}' & + +# send to decode instance +curl http://localhost:8200/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "neuralmagic/Meta-Llama-3-70B-Instruct-FP8", +"prompt": "San Francisco is a", +"max_tokens": 5, +"temperature": 0 +}' + + From 11c3ace8525e83e2a6adcc0c44f6784873720350 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 00:17:54 -0700 Subject: [PATCH 108/278] can only use print instead of log.debug... --- vllm/attention/backends/flash_attn.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 992bd7c2366b..49b8d38c02ba 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -20,13 +20,10 @@ # To identify if the VLLM_DISAGG_PREFILL_ROLE is set or no import vllm.envs as envs -from vllm.logger import init_logger if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUBuilder -logger = init_logger(__name__) - class FlashAttentionBackend(AttentionBackend): @staticmethod @@ -559,10 +556,10 @@ def forward( # no need to transfer kv cache, as it is already the input of this function if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": out = output[:num_prefill_tokens].contiguous() - logger.debug("Send output, " , out.shape, out.dtype, out.device) + print("Send output, " , out.shape, out.dtype, out.device) get_disagg_group().send(out) else: - logger.debug("Recv output, " , output[:num_prefill_tokens].shape, output.dtype, output.device) + print("Recv output, " , output[:num_prefill_tokens].shape, output.dtype, output.device) output[:num_prefill_tokens] = 
get_disagg_group().recv(output[:num_prefill_tokens].shape, output.dtype) if decode_meta := attn_metadata.decode_metadata: From 0bd0cc9defae39d657303a030f07a64ebe6a69ca Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 00:18:51 -0700 Subject: [PATCH 109/278] kill vllm instance after run --- examples/disaggregated_prefill_example.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/disaggregated_prefill_example.sh b/examples/disaggregated_prefill_example.sh index 571bc7072a8e..fabba139ae26 100644 --- a/examples/disaggregated_prefill_example.sh +++ b/examples/disaggregated_prefill_example.sh @@ -60,4 +60,5 @@ curl http://localhost:8200/v1/completions \ "temperature": 0 }' - +# gracefully kill all vllm instances +ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 From 39973bb7cbc121f381a2709f6517e817a5266cd0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 21:27:53 -0700 Subject: [PATCH 110/278] add proxy server for disaggregated prefilling --- .../disagg_prefill_example.sh} | 0 .../disagg_prefill/disagg_proxy_server.py | 62 +++++++++++++++++++ 2 files changed, 62 insertions(+) rename examples/{disaggregated_prefill_example.sh => disagg_prefill/disagg_prefill_example.sh} (100%) create mode 100644 examples/disagg_prefill/disagg_proxy_server.py diff --git a/examples/disaggregated_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh similarity index 100% rename from examples/disaggregated_prefill_example.sh rename to examples/disagg_prefill/disagg_prefill_example.sh diff --git a/examples/disagg_prefill/disagg_proxy_server.py b/examples/disagg_prefill/disagg_proxy_server.py new file mode 100644 index 000000000000..eb45778d4ad2 --- /dev/null +++ b/examples/disagg_prefill/disagg_proxy_server.py @@ -0,0 +1,62 @@ +import http.server +import socketserver +import requests +import json +import argparse + +class ProxyHTTPRequestHandler(http.server.BaseHTTPRequestHandler): + def __init__(self, 
*args, **kwargs): + self.prefill_port = kwargs.pop('prefill_port', 8100) + self.decode_port = kwargs.pop('decode_port', 8200) + super().__init__(*args, **kwargs) + + def do_POST(self): + # Read the content length to get the data size + content_length = int(self.headers['Content-Length']) + post_data = self.rfile.read(content_length) + + # Parse the JSON payload + data = json.loads(post_data) + + # Change the max_tokens to 1 for the request to prefill_port + data_prefill = data.copy() + data_prefill["max_tokens"] = 1 + post_data_prefill = json.dumps(data_prefill) + + # Forward the request to prefill_port with modified max_tokens + response_prefill = requests.post(f"http://localhost:{self.prefill_port}/v1/completions", + headers={"Content-Type": "application/json"}, + data=post_data_prefill) + + # Check if the response from prefill_port is successful + if response_prefill.status_code == 200: + # Forward the original request to decode_port + response_decode = requests.post(f"http://localhost:{self.decode_port}/v1/completions", + headers={"Content-Type": "application/json"}, + data=post_data) + + # Send the response back to the client + self.send_response(response_decode.status_code) + self.send_header('Content-type', 'application/json') + self.end_headers() + self.wfile.write(response_decode.content) + else: + # Send an error response back to the client + self.send_response(response_prefill.status_code) + self.send_header('Content-type', 'application/json') + self.end_headers() + self.wfile.write(response_prefill.content) + +def run_server(port_8000, prefill_port, decode_port): + handler = lambda *args, **kwargs: ProxyHTTPRequestHandler(*args, prefill_port=prefill_port, decode_port=decode_port, **kwargs) + with socketserver.TCPServer(("", port_8000), handler) as httpd: + print(f"Serving at port {port_8000}") + httpd.serve_forever() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Proxy server") + parser.add_argument('prefill_port', 
type=int, help='Port to forward the first request to (with max_tokens=1)') + parser.add_argument('decode_port', type=int, help='Port to forward the second request to') + args = parser.parse_args() + + run_server(8000, args.prefill_port, args.decode_port) From 13a6d12be2def55976a3a78b514d916bcf222e7f Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 21:49:56 -0700 Subject: [PATCH 111/278] update disagg proxy server --- .../disagg_prefill/disagg_prefill_example.sh | 2 - .../disagg_prefill/disagg_proxy_server.py | 102 +++++++++--------- 2 files changed, 48 insertions(+), 56 deletions(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index fabba139ae26..a505d8b3ca85 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -60,5 +60,3 @@ curl http://localhost:8200/v1/completions \ "temperature": 0 }' -# gracefully kill all vllm instances -ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 diff --git a/examples/disagg_prefill/disagg_proxy_server.py b/examples/disagg_prefill/disagg_proxy_server.py index eb45778d4ad2..6ec2737367c8 100644 --- a/examples/disagg_prefill/disagg_proxy_server.py +++ b/examples/disagg_prefill/disagg_proxy_server.py @@ -1,62 +1,56 @@ -import http.server -import socketserver -import requests -import json import argparse +import aiohttp +import asyncio +from aiohttp import web +import json -class ProxyHTTPRequestHandler(http.server.BaseHTTPRequestHandler): - def __init__(self, *args, **kwargs): - self.prefill_port = kwargs.pop('prefill_port', 8100) - self.decode_port = kwargs.pop('decode_port', 8200) - super().__init__(*args, **kwargs) +async def handle_post(request): + prefill_port = request.app['prefill_port'] + decode_port = request.app['decode_port'] + + # Read and parse the request payload + try: + payload = await request.json() + except Exception as e: + return 
web.json_response({'error': str(e)}, status=400) + + # Modify max_tokens for prefill request + payload_prefill = payload.copy() + payload_prefill["max_tokens"] = 1 - def do_POST(self): - # Read the content length to get the data size - content_length = int(self.headers['Content-Length']) - post_data = self.rfile.read(content_length) - - # Parse the JSON payload - data = json.loads(post_data) - - # Change the max_tokens to 1 for the request to prefill_port - data_prefill = data.copy() - data_prefill["max_tokens"] = 1 - post_data_prefill = json.dumps(data_prefill) - - # Forward the request to prefill_port with modified max_tokens - response_prefill = requests.post(f"http://localhost:{self.prefill_port}/v1/completions", - headers={"Content-Type": "application/json"}, - data=post_data_prefill) - - # Check if the response from prefill_port is successful - if response_prefill.status_code == 200: - # Forward the original request to decode_port - response_decode = requests.post(f"http://localhost:{self.decode_port}/v1/completions", - headers={"Content-Type": "application/json"}, - data=post_data) - - # Send the response back to the client - self.send_response(response_decode.status_code) - self.send_header('Content-type', 'application/json') - self.end_headers() - self.wfile.write(response_decode.content) - else: - # Send an error response back to the client - self.send_response(response_prefill.status_code) - self.send_header('Content-type', 'application/json') - self.end_headers() - self.wfile.write(response_prefill.content) + async with aiohttp.ClientSession() as session: + # Forward request to prefill port + async with session.post(f"http://localhost:{prefill_port}/v1/completions", json=payload_prefill) as response_prefill: + if response_prefill.status != 200: + return web.json_response(await response_prefill.json(), status=response_prefill.status) -def run_server(port_8000, prefill_port, decode_port): - handler = lambda *args, **kwargs: ProxyHTTPRequestHandler(*args, 
prefill_port=prefill_port, decode_port=decode_port, **kwargs) - with socketserver.TCPServer(("", port_8000), handler) as httpd: - print(f"Serving at port {port_8000}") - httpd.serve_forever() + # Forward original request to decode port + async with session.post(f"http://localhost:{decode_port}/v1/completions", json=payload) as response_decode: + if 'stream' in payload and payload['stream']: + # If streaming, set up a streaming response + response = web.StreamResponse(status=response_decode.status, reason=response_decode.reason, headers=response_decode.headers) + await response.prepare(request) + + async for data, _ in response_decode.content.iter_chunks(): + await response.write(data) + await response.write_eof() + return response + else: + # Return non-streaming response as JSON + return web.json_response(await response_decode.json(), status=response_decode.status) -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Proxy server") +async def init_app(prefill_port, decode_port): + app = web.Application() + app['prefill_port'] = prefill_port + app['decode_port'] = decode_port + app.router.add_post('/v1/completions', handle_post) + return app + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Async Proxy server") parser.add_argument('prefill_port', type=int, help='Port to forward the first request to (with max_tokens=1)') parser.add_argument('decode_port', type=int, help='Port to forward the second request to') args = parser.parse_args() - - run_server(8000, args.prefill_port, args.decode_port) + + app = asyncio.run(init_app(args.prefill_port, args.decode_port)) + web.run_app(app, port=8000) From 81cad25d3dfca617d2be4ce299af1948b01fbc9d Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 22:00:37 -0700 Subject: [PATCH 112/278] add debug message for proxy server --- .../disagg_prefill/disagg_proxy_server.py | 37 ++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git 
a/examples/disagg_prefill/disagg_proxy_server.py b/examples/disagg_prefill/disagg_proxy_server.py index 6ec2737367c8..6655ff6379b2 100644 --- a/examples/disagg_prefill/disagg_proxy_server.py +++ b/examples/disagg_prefill/disagg_proxy_server.py @@ -3,29 +3,48 @@ import asyncio from aiohttp import web import json +import logging + +# Configure logging +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger(__name__) async def handle_post(request): prefill_port = request.app['prefill_port'] decode_port = request.app['decode_port'] + logger.debug(f"Received request to {request.path} with method {request.method}") + # Read and parse the request payload try: payload = await request.json() + logger.debug(f"Request payload: {json.dumps(payload, indent=2)}") except Exception as e: + logger.error(f"Error parsing request payload: {str(e)}") return web.json_response({'error': str(e)}, status=400) # Modify max_tokens for prefill request payload_prefill = payload.copy() payload_prefill["max_tokens"] = 1 + logger.debug(f"Modified prefill payload: {json.dumps(payload_prefill, indent=2)}") async with aiohttp.ClientSession() as session: # Forward request to prefill port async with session.post(f"http://localhost:{prefill_port}/v1/completions", json=payload_prefill) as response_prefill: + try: + response_prefill_data = await response_prefill.json() + logger.debug(f"Prefill response data: {json.dumps(response_prefill_data, indent=2)}") + except aiohttp.ContentTypeError: + response_prefill_data = await response_prefill.text() + logger.debug(f"Prefill response text: {response_prefill_data}") + if response_prefill.status != 200: - return web.json_response(await response_prefill.json(), status=response_prefill.status) + logger.error(f"Prefill request failed with status {response_prefill.status}") + return web.json_response(response_prefill_data, status=response_prefill.status) # Forward original request to decode port async with 
session.post(f"http://localhost:{decode_port}/v1/completions", json=payload) as response_decode: + logger.debug(f"Forwarding request to decode port {decode_port}") if 'stream' in payload and payload['stream']: # If streaming, set up a streaming response response = web.StreamResponse(status=response_decode.status, reason=response_decode.reason, headers=response_decode.headers) @@ -33,11 +52,19 @@ async def handle_post(request): async for data, _ in response_decode.content.iter_chunks(): await response.write(data) + logger.debug(f"Streaming chunk: {data}") await response.write_eof() + logger.debug("Finished streaming response") return response else: - # Return non-streaming response as JSON - return web.json_response(await response_decode.json(), status=response_decode.status) + # Handle non-streaming response + try: + response_decode_data = await response_decode.json() + logger.debug(f"Decode response data: {json.dumps(response_decode_data, indent=2)}") + except aiohttp.ContentTypeError: + response_decode_data = await response_decode.text() + logger.debug(f"Decode response text: {response_decode_data}") + return web.json_response(response_decode_data, status=response_decode.status) async def init_app(prefill_port, decode_port): app = web.Application() @@ -49,8 +76,8 @@ async def init_app(prefill_port, decode_port): if __name__ == '__main__': parser = argparse.ArgumentParser(description="Async Proxy server") parser.add_argument('prefill_port', type=int, help='Port to forward the first request to (with max_tokens=1)') - parser.add_argument('decode_port', type=int, help='Port to forward the second request to') + parser.add_argument 'decode_port', type=int, help='Port to forward the second request to') args = parser.parse_args() app = asyncio.run(init_app(args.prefill_port, args.decode_port)) - web.run_app(app, port=8000) + web.run_app(app, port=8000) \ No newline at end of file From 198931befb8ff7fe6611189a5d22000a7c794114 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: 
Tue, 23 Jul 2024 22:01:35 -0700 Subject: [PATCH 113/278] fix bug --- examples/disagg_prefill/disagg_proxy_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/disagg_prefill/disagg_proxy_server.py b/examples/disagg_prefill/disagg_proxy_server.py index 6655ff6379b2..f87a081941fc 100644 --- a/examples/disagg_prefill/disagg_proxy_server.py +++ b/examples/disagg_prefill/disagg_proxy_server.py @@ -76,7 +76,7 @@ async def init_app(prefill_port, decode_port): if __name__ == '__main__': parser = argparse.ArgumentParser(description="Async Proxy server") parser.add_argument('prefill_port', type=int, help='Port to forward the first request to (with max_tokens=1)') - parser.add_argument 'decode_port', type=int, help='Port to forward the second request to') + parser.add_argument('decode_port', type=int, help='Port to forward the second request to') args = parser.parse_args() app = asyncio.run(init_app(args.prefill_port, args.decode_port)) From 7412767d5574640ec4ef406fe25c05844b54d045 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 22:12:48 -0700 Subject: [PATCH 114/278] increase nccl buff size --- examples/disagg_prefill/disagg_prefill_example.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index a505d8b3ca85..cf1646b6b307 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -5,6 +5,7 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 +export NCCL_BUFFSIZE=2147483648 # a function that waits vLLM server to start wait_for_server() { From bd6f41b5c9dee9762fbe0fbbc8842224d8cf2099 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 22:15:25 -0700 Subject: [PATCH 115/278] increase nccl buffer size --- examples/disagg_prefill/disagg_prefill_example.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index cf1646b6b307..88e3920ac6e7 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -5,7 +5,7 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 -export NCCL_BUFFSIZE=2147483648 +export NCCL_BUFFSIZE=1073741824 # a function that waits vLLM server to start wait_for_server() { From 20f9de1155792b6b6e87331701e8682e8350f3b7 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 22:19:30 -0700 Subject: [PATCH 116/278] add debug flag --- examples/disagg_prefill/disagg_prefill_example.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index 88e3920ac6e7..3c1e33ca161d 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -6,6 +6,7 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 export NCCL_BUFFSIZE=1073741824 +export NCCL_DEBUG=INFO # a function that waits vLLM server to start wait_for_server() { From 11850d57a739c94b1e01a3b93baff1b2e27180a1 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 22:22:21 -0700 Subject: [PATCH 117/278] reduce gpu memory usage --- examples/disagg_prefill/disagg_prefill_example.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index 3c1e33ca161d..b6521e22336f 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -23,7 +23,8 @@ VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ --port 8100 \ -tp 4 \ - --enable-prefix-caching & + --enable-prefix-caching \ + --gpu-memory-utilization 0.8 & # decoding instance 
VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ @@ -31,7 +32,7 @@ VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ --port 8200 \ -tp 4 \ - --enable-prefix-caching & + --enable-prefix-caching 0.8 & wait_for_server 8100 From d6ad9bdf53834306a0cf9694df93fd2e0bf7eb71 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 22:24:18 -0700 Subject: [PATCH 118/278] fix syntax bug --- examples/disagg_prefill/disagg_prefill_example.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index b6521e22336f..28160fd67ff1 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -32,7 +32,8 @@ VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ --port 8200 \ -tp 4 \ - --enable-prefix-caching 0.8 & + --enable-prefix-caching \ + --gpu-memory-utilization 0.8 & wait_for_server 8100 From 57dd656fde6d775b8510441a5d84684fe997ea83 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 22:34:30 -0700 Subject: [PATCH 119/278] temporarily lift up nccl buffer size for send and recv --- examples/disagg_prefill/disagg_prefill_example.sh | 1 - vllm/distributed/parallel_state.py | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index 28160fd67ff1..d9ea28070053 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -5,7 +5,6 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 -export NCCL_BUFFSIZE=1073741824 export NCCL_DEBUG=INFO # a function that waits vLLM server to start diff --git a/vllm/distributed/parallel_state.py 
b/vllm/distributed/parallel_state.py index 5188b71d95b9..1eab00db49d4 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -835,6 +835,9 @@ def include_decoding_groups_if_disagg_enabled( """ if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + # dirty fix: temporarily lift up NCCL buffer size to 1GB + import os + os.environ["NCCL_BUFFSIZE"] = "1073741824" assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") new_groups = [] From 9379fbbe80268b4dd7b20cf83c6d343d880469bb Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 22:37:27 -0700 Subject: [PATCH 120/278] reduce nccl buffer size and see if bug fixed --- vllm/distributed/parallel_state.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 1eab00db49d4..15ffa8817b6a 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -836,8 +836,8 @@ def include_decoding_groups_if_disagg_enabled( if envs.VLLM_DISAGG_PREFILL_ROLE is not None: # dirty fix: temporarily lift up NCCL buffer size to 1GB - import os - os.environ["NCCL_BUFFSIZE"] = "1073741824" + # import os + # os.environ["NCCL_BUFFSIZE"] = "1073741824" assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") new_groups = [] From c23d8419e5fbc531106577f669e1fdaf709188b0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 22:38:23 -0700 Subject: [PATCH 121/278] fix --- vllm/distributed/parallel_state.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 15ffa8817b6a..99a7cc932baa 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -835,9 +835,6 @@ def include_decoding_groups_if_disagg_enabled( """ if 
envs.VLLM_DISAGG_PREFILL_ROLE is not None: - # dirty fix: temporarily lift up NCCL buffer size to 1GB - # import os - # os.environ["NCCL_BUFFSIZE"] = "1073741824" assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") new_groups = [] @@ -1016,6 +1013,9 @@ def initialize_model_parallel( logger.debug("_PP initialized for rank %d", torch.distributed.get_rank()) if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + # dirty fix: temporarily lift up NCCL buffer size to 1GB + import os + os.environ["NCCL_BUFFSIZE"] = "1073741824" global _DISAGG logger.debug("Disaggregated prefill enabled, create _DISAGG group") group_ranks = [] From 7fc62b4162a2055933e7f1e609e80af9b0775e50 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 22:47:17 -0700 Subject: [PATCH 122/278] add debug info -- see which layer the prefill instance got stuck --- vllm/attention/backends/flash_attn.py | 6 ++++-- vllm/model_executor/models/llama.py | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 49b8d38c02ba..2d363af2590a 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -556,10 +556,12 @@ def forward( # no need to transfer kv cache, as it is already the input of this function if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": out = output[:num_prefill_tokens].contiguous() - print("Send output, " , out.shape, out.dtype, out.device) + if torch.distributed.get_rank() % 4 == 0: + print("Send output, " , out.shape, out.dtype, out.device) get_disagg_group().send(out) else: - print("Recv output, " , output[:num_prefill_tokens].shape, output.dtype, output.device) + if torch.distributed.get_rank() % 4 == 0: + print("Recv output, " , output[:num_prefill_tokens].shape, output.dtype, output.device) output[:num_prefill_tokens] = get_disagg_group().recv(output[:num_prefill_tokens].shape, 
output.dtype) if decode_meta := attn_metadata.decode_metadata: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2052c443a888..9e5fc0223554 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -318,6 +318,8 @@ def forward( residual = intermediate_tensors["residual"] for i in range(self.start_layer, self.end_layer): + if torch.distributed.get_rank() % 4 == 0: + print(f"Layer {i}") layer = self.layers[i] hidden_states, residual = layer( positions, From e54236690ad86b2c1a23ebb86010ec322f6b1038 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 22:48:48 -0700 Subject: [PATCH 123/278] remove nccl debug -- it is too loud --- examples/disagg_prefill/disagg_prefill_example.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index d9ea28070053..7b58fe6f64ce 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -5,7 +5,7 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 -export NCCL_DEBUG=INFO +# export NCCL_DEBUG=INFO # a function that waits vLLM server to start wait_for_server() { From e9f7dc2004d636535184f512d5e29d562548c619 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 23:51:23 -0700 Subject: [PATCH 124/278] change buffer size only for disagg communicator --- examples/disagg_prefill/disagg_prefill_example.sh | 2 +- vllm/distributed/parallel_state.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index 7b58fe6f64ce..d9ea28070053 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -5,7 +5,7 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 -# export NCCL_DEBUG=INFO 
+export NCCL_DEBUG=INFO # a function that waits vLLM server to start wait_for_server() { diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 99a7cc932baa..a58be5ed14e1 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1016,6 +1016,8 @@ def initialize_model_parallel( # dirty fix: temporarily lift up NCCL buffer size to 1GB import os os.environ["NCCL_BUFFSIZE"] = "1073741824" + import time + time.sleep(20) global _DISAGG logger.debug("Disaggregated prefill enabled, create _DISAGG group") group_ranks = [] From 18ded4ca14633f7b8b248fc4b7896003472a3068 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 23 Jul 2024 23:55:37 -0700 Subject: [PATCH 125/278] disable nccl debug --- examples/disagg_prefill/disagg_prefill_example.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index d9ea28070053..7b58fe6f64ce 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -5,7 +5,7 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 -export NCCL_DEBUG=INFO +# export NCCL_DEBUG=INFO # a function that waits vLLM server to start wait_for_server() { From e814f82d7a44e967f3dbf57189622c82151c192a Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 24 Jul 2024 00:13:24 -0700 Subject: [PATCH 126/278] use isend and irecv --- vllm/distributed/parallel_state.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index a58be5ed14e1..06b4cce3e813 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -675,6 +675,16 @@ def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: else: torch.distributed.send(tensor, self.ranks[dst], self.device_group) + def isend(self, tensor: 
torch.Tensor, dst: Optional[int] = None) -> None: + """Sends a tensor to the destination rank in a non-blocking way""" + """NOTE: `dst` is the local rank of the destination rank.""" + """NOTE: this function leverage pytorch's isend, to bypass PyNccl buffer limit""" + if dst is None: + dst = (self.rank_in_group + 1) % self.world_size + + torch.distributed.isend(tensor, self.ranks[dst], self.device_group) + + def recv(self, size: torch.Size, dtype: torch.dtype, @@ -692,6 +702,19 @@ def recv(self, torch.distributed.recv(tensor, self.ranks[src], self.device_group) return tensor + def irecv_wait(self, + size: torch.Size, + dtype: torch.dtype, + src: Optional[int] = None) -> torch.Tensor: + """Receives a tensor from the src rank asynchronously.""" + """NOTE: `src` is the local rank of the destination rank.""" + if src is None: + src = (self.rank_in_group - 1) % self.world_size + + tensor = torch.empty(size, dtype=dtype, device=self.device) + torch.distributed.irecv(tensor, self.ranks[src], self.device_group).wait() + return tensor + def destroy(self): if self.device_group is not None: torch.distributed.destroy_process_group(self.device_group) From a3399b336d1e348c894b97175caa4444893543b0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 24 Jul 2024 00:22:19 -0700 Subject: [PATCH 127/278] try to increase the buffer size --- examples/disagg_prefill/disagg_prefill_example.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index 7b58fe6f64ce..8a1f7a48e7e3 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -6,6 +6,7 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 # export NCCL_DEBUG=INFO +export NCCL_BUFFSIZE=536870912 # a function that waits vLLM server to start wait_for_server() { From e4e60d91969db33ceb31a632c96549cd169e60d7 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Wed, 31 Jul 2024 
00:15:23 +0000 Subject: [PATCH 128/278] bug fix, now disaggregated prefill should work as expected --- .../disagg_prefill/disagg_prefill_example.sh | 4 +- vllm/attention/backends/flash_attn.py | 42 +++------ vllm/distributed/parallel_state.py | 79 +++++++++++++---- vllm/model_executor/models/llama.py | 6 +- vllm/worker/model_runner.py | 87 ++++++++++++++++--- 5 files changed, 153 insertions(+), 65 deletions(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index 8a1f7a48e7e3..0ebd3e7e97da 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -6,7 +6,7 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 # export NCCL_DEBUG=INFO -export NCCL_BUFFSIZE=536870912 +# export NCCL_BUFFSIZE=536870912 # a function that waits vLLM server to start wait_for_server() { @@ -52,7 +52,7 @@ curl http://localhost:8100/v1/completions \ "prompt": "San Francisco is a", "max_tokens": 1, "temperature": 0 -}' & +}' # send to decode instance curl http://localhost:8200/v1/completions \ diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index ebe5bf348489..330addd3b52f 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -15,15 +15,13 @@ is_block_tables_empty) from vllm.utils import make_tensor_with_pad -# This group is used for KV cache transfer in disaggregated prefilling from vllm.distributed import get_disagg_group - -# To identify if the VLLM_DISAGG_PREFILL_ROLE is set or no import vllm.envs as envs if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUBuilder + class FlashAttentionBackend(AttentionBackend): @staticmethod @@ -495,6 +493,15 @@ def forward( v_scale, ) + # send out the KV cache when current vllm is prefill instance + # the corresponding receive code is in vllm/worker/model_runner.py + if all([ + envs.VLLM_DISAGG_PREFILL_ROLE == 
"prefill", + attn_metadata.prefill_metadata is not None]): + + get_disagg_group().push(key) + get_disagg_group().push(value) + num_prefill_tokens = attn_metadata.num_prefill_tokens num_decode_tokens = attn_metadata.num_decode_tokens assert key.shape[0] == num_prefill_tokens + num_decode_tokens @@ -510,13 +517,8 @@ def forward( assert query.shape[0] == num_prefill_tokens assert decode_query.shape[0] == num_decode_tokens - - prefill_meta = attn_metadata.prefill_metadata - if (prefill_meta is not None) and ( - (envs.VLLM_DISAGG_PREFILL_ROLE is None) - or - (envs.VLLM_DISAGG_PREFILL_ROLE == "prefill") - ): # during prefilling, and this instance is not disagg decode instance + + if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. if (kv_cache is None or prefill_meta.block_tables is None or prefill_meta.block_tables.numel() == 0): @@ -542,7 +544,6 @@ def forward( # prefix-enabled attention assert prefill_meta.seq_lens is not None max_seq_len = max(prefill_meta.seq_lens) - output[:num_prefill_tokens] = flash_attn_varlen_func( q=query, k=key_cache, @@ -557,23 +558,6 @@ def forward( block_table=prefill_meta.block_tables, ) - if (prefill_meta is not None) and (kv_cache is not None) and \ - (envs.VLLM_DISAGG_PREFILL_ROLE is not None): - # transfer the output if - # 1). during prefilling - # 2). disaggregated prefill enabled - # 3). 
not in the profile run (kv_cache is not None) - # no need to transfer kv cache, as it is already the input of this function - if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": - out = output[:num_prefill_tokens].contiguous() - if torch.distributed.get_rank() % 4 == 0: - print("Send output, " , out.shape, out.dtype, out.device) - get_disagg_group().send(out) - else: - if torch.distributed.get_rank() % 4 == 0: - print("Recv output, " , output[:num_prefill_tokens].shape, output.dtype, output.device) - output[:num_prefill_tokens] = get_disagg_group().recv(output[:num_prefill_tokens].shape, output.dtype) - if decode_meta := attn_metadata.decode_metadata: # Decoding run. output[num_prefill_tokens:] = flash_attn_with_kvcache( @@ -588,4 +572,4 @@ def forward( ).squeeze(1) # Reshape the output tensor. - return output.view(num_tokens, hidden_size) + return output.view(num_tokens, hidden_size) \ No newline at end of file diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 23d72fb7ea92..83cdb381490b 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -30,6 +30,8 @@ from multiprocessing import shared_memory from typing import Any, Dict, List, Optional, Tuple, Union from unittest.mock import patch +from concurrent.futures import ThreadPoolExecutor +import queue import torch import torch.distributed @@ -160,8 +162,8 @@ def __init__( assert self.cpu_group is not None assert self.device_group is not None - + if torch.cuda.is_available(): self.device = torch.device(f"cuda:{local_rank}") else: @@ -211,6 +213,12 @@ def __init__( self.mq_broadcaster = MessageQueue.create_from_process_group( self.cpu_group, 1 << 22, 6) + + # use a threadpool to buffer send request in disaggregated prefill + self.send_buffer = None + # use a list to cache send items. 
+ self.send_queue = queue.Queue() + @property def first_rank(self): """Return the global rank of the first process in the group""" @@ -696,6 +704,7 @@ def barrier(self): def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: """Sends a tensor to the destination rank in a non-blocking way""" """NOTE: `dst` is the local rank of the destination rank.""" + if dst is None: dst = (self.rank_in_group + 1) % self.world_size @@ -705,16 +714,6 @@ def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: else: torch.distributed.send(tensor, self.ranks[dst], self.device_group) - def isend(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: - """Sends a tensor to the destination rank in a non-blocking way""" - """NOTE: `dst` is the local rank of the destination rank.""" - """NOTE: this function leverage pytorch's isend, to bypass PyNccl buffer limit""" - if dst is None: - dst = (self.rank_in_group + 1) % self.world_size - - torch.distributed.isend(tensor, self.ranks[dst], self.device_group) - - def recv(self, size: torch.Size, dtype: torch.dtype, @@ -731,18 +730,60 @@ def recv(self, else: torch.distributed.recv(tensor, self.ranks[src], self.device_group) return tensor + + + def push(self, + tensor: torch.Tensor, + dst: Optional[int] = None, + enable_verification: bool = False) -> None: + """Push the KV cache send request into the send buffer""" + """NOTE: `dst` is the local rank of the destination rank.""" - def irecv_wait(self, + if self.send_buffer is None: + self.send_buffer = ThreadPoolExecutor(max_workers=1) + + if enable_verification: + # Send tensor, together with metadatas + # We will use this metadata to perform some sanity check + # But this transfer is VERY slow. 
+ # So this is a good option for debugging but not for produciton + self.send_buffer.submit( + self.send_tensor_dict, + # tensor needs to be cloned, if not the mean doesn't match + {"tensor": tensor.clone(), "mean": tensor.mean()}, + dst + ) + else: + # only send tensor, use NCCL if available + # very fast but error-prone + self.send_buffer.submit( + self.send, + # tensor needs to be cloned, if not the mean doesn't match + tensor.clone(), + dst + ) + + + def fetch(self, size: torch.Size, dtype: torch.dtype, - src: Optional[int] = None) -> torch.Tensor: - """Receives a tensor from the src rank asynchronously.""" + src: Optional[int] = None, + enable_verification: bool = False) -> torch.Tensor: + """Receives a tensor from the src rank (blocking).""" + """This API should be used together with `push`""" """NOTE: `src` is the local rank of the destination rank.""" - if src is None: - src = (self.rank_in_group - 1) % self.world_size - tensor = torch.empty(size, dtype=dtype, device=self.device) - torch.distributed.irecv(tensor, self.ranks[src], self.device_group).wait() + if enable_verification: + # receive tensor and perform verifications + result = self.recv_tensor_dict(src) + tensor = result["tensor"] + mean = result["mean"] + assert tensor.shape == size + assert tensor.dtype == dtype + assert tensor.mean() == mean + else: + tensor = self.recv(size, dtype, src) + return tensor def destroy(self): @@ -1083,7 +1124,7 @@ def initialize_model_parallel( logger.debug("Distributed group is %s", str(group_ranks)) _DISAGG = init_model_parallel_group( group_ranks, - int(envs.VLLM_DISAGG_PREFILL_ROLE == "decode"), + get_world_group().local_rank, backend, use_custom_allreduce=False) logger.debug("_DISAGG initialized for rank %d", torch.distributed.get_rank()) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index ab884110b71c..2052c443a888 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -318,8 
+318,6 @@ def forward( residual = intermediate_tensors["residual"] for i in range(self.start_layer, self.end_layer): - if torch.distributed.get_rank() % 4 == 0: - print(f"Layer {i}") layer = self.layers[i] hidden_states, residual = layer( positions, @@ -420,11 +418,9 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, - input_embeds: Optional[torch.Tensor] = None ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors, - input_embeds) + attn_metadata, intermediate_tensors) return model_output def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 86d26b4a84c3..8493c0e6fc7d 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -27,7 +27,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, MultiModalConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig) -from vllm.distributed import get_pp_group +from vllm.distributed import get_pp_group, get_disagg_group from vllm.distributed.parallel_state import graph_capture from vllm.inputs import INPUT_REGISTRY from vllm.logger import init_logger @@ -59,6 +59,9 @@ _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict) +import vllm.envs as envs +from vllm import _custom_ops as ops + if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionBackend @@ -1351,19 +1354,82 @@ def execute_model( "finished_requests_ids": model_input.finished_requests_ids, "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, } if self.has_seqlen_agnostic else {} - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - 
intermediate_tensors=intermediate_tensors, - **multi_modal_kwargs, - **seqlen_agnostic_kwargs) + + + # call `model_executable` + # and handle KV cache transfer for disaggregated prefilling + if any([ + prefill_meta is None, + envs.VLLM_DISAGG_PREFILL_ROLE != "decode", + kv_caches is None, + kv_caches[0] is None]): + + # model forwarding + # during forwarding the KV cache will be sent in prefill instance + # see vllm/attention/backends/flash_attn.py for sending impl + hidden_or_intermediate_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **multi_modal_kwargs, + **seqlen_agnostic_kwargs) + + + if all([ + prefill_meta is not None, + envs.VLLM_DISAGG_PREFILL_ROLE == "prefill", + kv_caches is not None, + kv_caches[0] is not None,]): + # send hidden state if disaggregated prefilling enabled + + get_disagg_group().push(hidden_or_intermediate_states) + + else: + # receive KV cache from disaggregated prefill instance + for i in range(model_executable.model.start_layer, + model_executable.model.end_layer): + + # get kv cache + kv_cache = kv_caches[i - model_executable.model.start_layer] + # get corresponding layer + layer = model_executable.model.layers[i] + + # get kv cache shape (after sliced by tp) + _, _, num_head, head_size = kv_cache[0].shape + num_tokens = model_input.input_tokens.shape[0] + key = get_disagg_group().fetch( + torch.Size([num_tokens, num_head, head_size]), + kv_cache[0].dtype + ) + value = get_disagg_group().fetch( + torch.Size([num_tokens, num_head, head_size]), + kv_cache[0].dtype + ) + + key_cache, value_cache = kv_cache[0], kv_cache[1] + ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + model_input.attn_metadata.slot_mapping.flatten(), + layer.self_attn.attn.kv_cache_dtype, + layer.self_attn.attn._k_scale, + layer.self_attn.attn._v_scale, + ) + + 
hidden_or_intermediate_states = get_disagg_group().fetch( + torch.Size([num_tokens, model_executable.config.hidden_size]), + kv_cache[0].dtype + ) + # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: return hidden_or_intermediate_states - + logits = self.model.compute_logits(hidden_or_intermediate_states, model_input.sampling_metadata) @@ -1376,6 +1442,7 @@ def execute_model( sampling_metadata=model_input.sampling_metadata, ) + if self.return_hidden_states: # we only need to pass hidden states of most recent token assert model_input.sampling_metadata is not None From 87fbfae154dfc21dd9b751ab01bc6f5a6c6c61d6 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Wed, 31 Jul 2024 06:35:46 +0000 Subject: [PATCH 129/278] add proxy server --- .../disagg_benchmarks/disagg_benchmark.sh | 180 +++++++++++------- .../disagg_prefill_proxy_server.py | 49 +++++ .../disagg_prefill/disagg_prefill_example.sh | 4 +- .../disagg_prefill/disagg_proxy_server.py | 83 -------- 4 files changed, 163 insertions(+), 153 deletions(-) create mode 100644 benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py delete mode 100644 examples/disagg_prefill/disagg_proxy_server.py diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index c8a7cba02a70..58a99de7e4a9 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -49,78 +49,121 @@ benchmark() { model="neuralmagic/Meta-Llama-3-70B-Instruct-FP8" dataset_name="sonnet" dataset_path="../sonnet_4x.txt" - num_prompts=500 + num_prompts=50 qps=$1 prefix_len=64 input_len=2048 output_len=$2 + # # chunked prefill with tp=4 + # CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + # -m vllm.entrypoints.openai.api_server \ + # --model $model \ + # --port 8000 \ + # -tp 4 \ + # --disable-log-stats \ + # --disable-log-requests \ + # --enable-chunked-prefill & + # wait_for_server 8000 + + # python3 ../benchmark_serving.py \ 
+ # --backend vllm \ + # --model $model \ + # --dataset-name $dataset_name \ + # --dataset-path $dataset_path \ + # --sonnet-input-len $input_len \ + # --sonnet-output-len $output_len \ + # --sonnet-prefix-len $prefix_len \ + # --num-prompts $((num_prompts / 2)) \ + # --port 8000 \ + # --save-result \ + # --result-dir $results_folder \ + # --result-filename chunked_prefill_tp4.json \ + # --request-rate $((qps / 2)) + # kill_gpu_processes + + + # # disaggregated prefill + # # prefill with tp=4 + # python3 -m vllm.entrypoints.openai.api_server \ + # --model $model \ + # --port 8000 \ + # -tp 4 \ + # --disable-log-stats \ + # --disable-log-requests & + # wait_for_server 8000 + # # set output-len to 1 so that it only do prefilling + # python3 ../benchmark_serving.py \ + # --backend vllm \ + # --model $model \ + # --dataset-name $dataset_name \ + # --dataset-path $dataset_path \ + # --sonnet-input-len $input_len \ + # --sonnet-output-len 1 \ + # --sonnet-prefix-len $prefix_len \ + # --num-prompts $num_prompts \ + # --port 8000 \ + # --save-result \ + # --result-dir $results_folder \ + # --result-filename disagg_prefill_tp4.json \ + # --request-rate $qps + # kill_gpu_processes + + # # decode with tp=4, enable APC + # python3 -m vllm.entrypoints.openai.api_server \ + # --model $model \ + # --port 8000 \ + # -tp 4 \ + # --enable-prefix-caching \ + # --disable-log-stats \ + # --disable-log-requests & + # wait_for_server 8000 + # # skip prefilling + # # by enabling APC and force the input tokens be the same + # python3 ../benchmark_serving.py \ + # --backend vllm \ + # --model $model \ + # --dataset-name $dataset_name \ + # --dataset-path $dataset_path \ + # --sonnet-input-len $input_len \ + # --sonnet-output-len $output_len \ + # --sonnet-prefix-len $input_len \ + # --num-prompts $num_prompts \ + # --port 8000 \ + # --save-result \ + # --result-dir $results_folder \ + # --result-filename disagg_decode_tp4.json \ + # --request-rate $qps + # kill_gpu_processes + + + # chunked 
prefill with tp=4 - CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + export VLLM_PORT=12345 + VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + -tp 4 \ + --disable-log-stats \ + --disable-log-requests \ + --enable-chunked-prefill & + VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -m vllm.entrypoints.openai.api_server \ --model $model \ - --port 8000 \ + --port 8200 \ -tp 4 \ --disable-log-stats \ --disable-log-requests \ --enable-chunked-prefill & - wait_for_server 8000 - - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len $output_len \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $((num_prompts / 2)) \ - --port 8000 \ - --save-result \ - --result-dir $results_folder \ - --result-filename chunked_prefill_tp4.json \ - --request-rate $((qps / 2)) - kill_gpu_processes - + wait_for_server 8100 + wait_for_server 8200 - # disaggregated prefill - # prefill with tp=4 - python3 -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8000 \ - -tp 4 \ - --disable-log-stats \ - --disable-log-requests & - wait_for_server 8000 - # set output-len to 1 so that it only do prefilling - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len 1 \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8000 \ - --save-result \ - --result-dir $results_folder \ - --result-filename disagg_prefill_tp4.json \ - --request-rate $qps - kill_gpu_processes + # launch a proxy server that listen from port 8000 + python3 disagg_prefill_proxy_server.py & + sleep 5 - # decode with tp=4, enable APC - python3 -m vllm.entrypoints.openai.api_server \ - --model $model \ - 
--port 8000 \ - -tp 4 \ - --enable-prefix-caching \ - --disable-log-stats \ - --disable-log-requests & - wait_for_server 8000 - # skip prefilling - # by enabling APC and force the input tokens be the same python3 ../benchmark_serving.py \ --backend vllm \ --model $model \ @@ -128,19 +171,19 @@ benchmark() { --dataset-path $dataset_path \ --sonnet-input-len $input_len \ --sonnet-output-len $output_len \ - --sonnet-prefix-len $input_len \ + --sonnet-prefix-len $prefix_len \ --num-prompts $num_prompts \ --port 8000 \ --save-result \ --result-dir $results_folder \ - --result-filename disagg_decode_tp4.json \ + --result-filename disagg_prefill_2xtp4.json \ --request-rate $qps kill_gpu_processes - python3 analyze_benchmark_results.py \ - --results-folder $results_folder \ - --output-len $output_len \ - --qps $qps + # python3 analyze_benchmark_results.py \ + # --results-folder $results_folder \ + # --output-len $output_len \ + # --qps $qps } @@ -151,6 +194,8 @@ main() { (which jq) || (apt-get -y install jq) (which socat) || (apt-get -y install socat) + pip install quart httpx + cd "$(dirname "$0")" cd .. 
@@ -168,10 +213,11 @@ main() { default_qps=4 default_output_len=150 - for target_qps in 2 4 8 16 - do - benchmark $target_qps $default_output_len - done + # for target_qps in 2 4 8 16 + # do + # benchmark $target_qps $default_output_len + # done + benchmark 1 150 # for target_output_len in 5 10 20 40 80 # do diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py new file mode 100644 index 000000000000..ea21e94c5f64 --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -0,0 +1,49 @@ +from quart import Quart, request, jsonify, Response +import httpx + +app = Quart(__name__) + +async def forward_request(url, data): + async with httpx.AsyncClient() as client: + async with client.stream('POST', url, json=data) as response: + if response.status_code == 200: + # Check if the response is streaming + if 'transfer-encoding' in response.headers and response.headers['transfer-encoding'] == 'chunked': + # Stream the response + async def stream_response(): + async for chunk in response.aiter_bytes(): + yield chunk + return Response(stream_response(), status=200, content_type=response.headers.get('content-type')) + else: + # Return the full response + response_data = await response.aread() + return Response(response_data, status=200, content_type=response.headers.get('content-type')) + else: + error_data = await response.aread() + return jsonify({'error': error_data.decode()}), response.status_code + +@app.route('/v1/completions', methods=['POST']) +async def handle_request(): + # Get the original request data + original_request_data = await request.get_json() + + # Modify the max_tokens to 1 for the request to port 8100 + modified_request_data_8100 = original_request_data.copy() + modified_request_data_8100['max_tokens'] = 1 + + # Forward the request to port 8100 + response_8100 = await forward_request('http://localhost:8100/v1/completions', modified_request_data_8100) + 
+ if response_8100.status_code == 200: + # If the request to port 8100 is successful, forward the original request to port 8200 + response_8200 = await forward_request('http://localhost:8200/v1/completions', original_request_data) + + if response_8200.status_code == 200: + return response_8200 + else: + return jsonify({'error': 'Failed to get response from port 8200'}), response_8200.status_code + else: + return jsonify({'error': 'Failed to get response from port 8100'}), response_8100.status_code + +if __name__ == '__main__': + app.run(port=8000) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index 0ebd3e7e97da..576a3ef4975f 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -5,8 +5,6 @@ export VLLM_LOGGING_LEVEL=DEBUG export VLLM_PORT=12345 -# export NCCL_DEBUG=INFO -# export NCCL_BUFFSIZE=536870912 # a function that waits vLLM server to start wait_for_server() { @@ -35,7 +33,7 @@ VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ --enable-prefix-caching \ --gpu-memory-utilization 0.8 & - +# wait until prefill and decode instances are ready wait_for_server 8100 wait_for_server 8200 diff --git a/examples/disagg_prefill/disagg_proxy_server.py b/examples/disagg_prefill/disagg_proxy_server.py deleted file mode 100644 index f87a081941fc..000000000000 --- a/examples/disagg_prefill/disagg_proxy_server.py +++ /dev/null @@ -1,83 +0,0 @@ -import argparse -import aiohttp -import asyncio -from aiohttp import web -import json -import logging - -# Configure logging -logging.basicConfig(level=logging.DEBUG) -logger = logging.getLogger(__name__) - -async def handle_post(request): - prefill_port = request.app['prefill_port'] - decode_port = request.app['decode_port'] - - logger.debug(f"Received request to {request.path} with method {request.method}") - - # Read and parse the request payload - try: - payload = await 
request.json() - logger.debug(f"Request payload: {json.dumps(payload, indent=2)}") - except Exception as e: - logger.error(f"Error parsing request payload: {str(e)}") - return web.json_response({'error': str(e)}, status=400) - - # Modify max_tokens for prefill request - payload_prefill = payload.copy() - payload_prefill["max_tokens"] = 1 - logger.debug(f"Modified prefill payload: {json.dumps(payload_prefill, indent=2)}") - - async with aiohttp.ClientSession() as session: - # Forward request to prefill port - async with session.post(f"http://localhost:{prefill_port}/v1/completions", json=payload_prefill) as response_prefill: - try: - response_prefill_data = await response_prefill.json() - logger.debug(f"Prefill response data: {json.dumps(response_prefill_data, indent=2)}") - except aiohttp.ContentTypeError: - response_prefill_data = await response_prefill.text() - logger.debug(f"Prefill response text: {response_prefill_data}") - - if response_prefill.status != 200: - logger.error(f"Prefill request failed with status {response_prefill.status}") - return web.json_response(response_prefill_data, status=response_prefill.status) - - # Forward original request to decode port - async with session.post(f"http://localhost:{decode_port}/v1/completions", json=payload) as response_decode: - logger.debug(f"Forwarding request to decode port {decode_port}") - if 'stream' in payload and payload['stream']: - # If streaming, set up a streaming response - response = web.StreamResponse(status=response_decode.status, reason=response_decode.reason, headers=response_decode.headers) - await response.prepare(request) - - async for data, _ in response_decode.content.iter_chunks(): - await response.write(data) - logger.debug(f"Streaming chunk: {data}") - await response.write_eof() - logger.debug("Finished streaming response") - return response - else: - # Handle non-streaming response - try: - response_decode_data = await response_decode.json() - logger.debug(f"Decode response data: 
{json.dumps(response_decode_data, indent=2)}") - except aiohttp.ContentTypeError: - response_decode_data = await response_decode.text() - logger.debug(f"Decode response text: {response_decode_data}") - return web.json_response(response_decode_data, status=response_decode.status) - -async def init_app(prefill_port, decode_port): - app = web.Application() - app['prefill_port'] = prefill_port - app['decode_port'] = decode_port - app.router.add_post('/v1/completions', handle_post) - return app - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description="Async Proxy server") - parser.add_argument('prefill_port', type=int, help='Port to forward the first request to (with max_tokens=1)') - parser.add_argument('decode_port', type=int, help='Port to forward the second request to') - args = parser.parse_args() - - app = asyncio.run(init_app(args.prefill_port, args.decode_port)) - web.run_app(app, port=8000) \ No newline at end of file From fa664c0dae92f0b1e4b88fea2ce96035488f3825 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 1 Aug 2024 10:07:04 +0000 Subject: [PATCH 130/278] startr slow -- using pp=1 and tp=1 --- .../disagg_prefill/disagg_prefill_example.sh | 74 ++++++++++++------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index 576a3ef4975f..8cfe528ffb58 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -4,7 +4,9 @@ # and then transfer the KV cache between them. 
export VLLM_LOGGING_LEVEL=DEBUG -export VLLM_PORT=12345 +export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') +# export NCCL_DEBUG=INFO +export NCCL_BUFFSIZE=67108864 # a function that waits vLLM server to start wait_for_server() { @@ -16,22 +18,24 @@ wait_for_server() { } # prefilling instance -VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ +VLLM_LOGGING_LEVEL=DEBUG VLLM_HOST_IP=$(hostname -I | awk '{print $1}') VLLM_PORT=2345 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 python3 \ -m vllm.entrypoints.openai.api_server \ - --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ + --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8100 \ - -tp 4 \ + -tp 1 \ --enable-prefix-caching \ - --gpu-memory-utilization 0.8 & + --gpu-memory-utilization 0.8 \ + --max-model-len 10000 & # decoding instance -VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ +VLLM_LOGGING_LEVEL=DEBUG VLLM_HOST_IP=$(hostname -I | awk '{print $1}') VLLM_PORT=2345 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ - --model neuralmagic/Meta-Llama-3-70B-Instruct-FP8 \ + --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8200 \ - -tp 4 \ + -tp 1 \ --enable-prefix-caching \ - --gpu-memory-utilization 0.8 & + --gpu-memory-utilization 0.8 \ + --max-model-len 10000 & # wait until prefill and decode instances are ready wait_for_server 8100 @@ -42,23 +46,39 @@ wait_for_server 8200 # 1. send the request to prefill instance, with max_tokens set to 1 # 2. 
send the request again to decode instance, no modification -# send to prefill instance -curl http://localhost:8100/v1/completions \ --H "Content-Type: application/json" \ --d '{ -"model": "neuralmagic/Meta-Llama-3-70B-Instruct-FP8", -"prompt": "San Francisco is a", -"max_tokens": 1, -"temperature": 0 -}' -# send to decode instance -curl http://localhost:8200/v1/completions \ --H "Content-Type: application/json" \ --d '{ -"model": "neuralmagic/Meta-Llama-3-70B-Instruct-FP8", -"prompt": "San Francisco is a", -"max_tokens": 5, -"temperature": 0 -}' +for i in {0..0} +do + # send to prefill instance + curl -m 5 http://localhost:8100/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": "'$i' San Francisco is a", + "max_tokens": 1, + "temperature": 0 + }' + curl -m 5 http://localhost:8100/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": "'$i' San Francisco is a", + "max_tokens": 1, + "temperature": 0 + }' + + # # send to decode instance + # curl -m 60 http://localhost:8200/v1/completions \ + # -H "Content-Type: application/json" \ + # -d '{ + # "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + # "prompt": "'$i' San Francisco is a", + # "max_tokens": 5, + # "temperature": 0 + # }' + +done + +# kill command: +# ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 \ No newline at end of file From 6bf7583f02c01d1d195b87117cb02fbe6f922a4a Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 1 Aug 2024 10:07:25 +0000 Subject: [PATCH 131/278] adjust the API --- vllm/distributed/parallel_state.py | 109 ++++++++++++++++++----------- 1 file changed, 69 insertions(+), 40 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 83cdb381490b..e8c7bbe59e05 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -24,7 +24,7 @@ import 
contextlib import pickle import logging -from collections import namedtuple +from collections import namedtuple, defaultdict from contextlib import contextmanager, nullcontext from dataclasses import dataclass from multiprocessing import shared_memory @@ -215,9 +215,8 @@ def __init__( # use a threadpool to buffer send request in disaggregated prefill - self.send_buffer = None - # use a list to cache send items. - self.send_queue = queue.Queue() + self.input_hash_to_kv_sending_requests = {} + self.kv_sending_thread = ThreadPoolExecutor(max_workers=1) @property def first_rank(self): @@ -705,8 +704,11 @@ def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: """Sends a tensor to the destination rank in a non-blocking way""" """NOTE: `dst` is the local rank of the destination rank.""" + if dst is None: dst = (self.rank_in_group + 1) % self.world_size + + print('Sending %.3f MB to %d' % (tensor.element_size() * tensor.numel() / 1024 / 1024, self.ranks[dst]), end=' ', flush=True) pynccl_comm = self.pynccl_comm if pynccl_comm is not None and not pynccl_comm.disabled: @@ -714,6 +716,8 @@ def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: else: torch.distributed.send(tensor, self.ranks[dst], self.device_group) + print(' End sending ', end=' ', flush=True) + def recv(self, size: torch.Size, dtype: torch.dtype, @@ -722,6 +726,8 @@ def recv(self, """NOTE: `src` is the local rank of the destination rank.""" if src is None: src = (self.rank_in_group - 1) % self.world_size + + print('Start receiving from %d',self.ranks[src]) tensor = torch.empty(size, dtype=dtype, device=self.device) pynccl_comm = self.pynccl_comm @@ -729,46 +735,74 @@ def recv(self, pynccl_comm.recv(tensor, src) else: torch.distributed.recv(tensor, self.ranks[src], self.device_group) + + print('End receiving') return tensor - def push(self, - tensor: torch.Tensor, - dst: Optional[int] = None, - enable_verification: bool = False) -> None: + def kv_cache_send(self, + 
input_hash: int, + tensor: torch.Tensor, + dst: Optional[int] = None, + enable_verification: bool = False) -> None: """Push the KV cache send request into the send buffer""" """NOTE: `dst` is the local rank of the destination rank.""" - if self.send_buffer is None: - self.send_buffer = ThreadPoolExecutor(max_workers=1) + print('Pushing %.3f MB' % (tensor.element_size() * tensor.numel() / 1024 / 1024), end=' ', flush=True) + if enable_verification: # Send tensor, together with metadatas # We will use this metadata to perform some sanity check # But this transfer is VERY slow. # So this is a good option for debugging but not for produciton - self.send_buffer.submit( + self.input_hash_to_kv_sending_requests[input_hash].append([ self.send_tensor_dict, # tensor needs to be cloned, if not the mean doesn't match {"tensor": tensor.clone(), "mean": tensor.mean()}, dst - ) + ]) else: # only send tensor, use NCCL if available # very fast but error-prone - self.send_buffer.submit( + self.input_hash_to_kv_sending_requests[input_hash].append([ self.send, - # tensor needs to be cloned, if not the mean doesn't match + # tensor needs to be cloned, if not the tensor may be freed tensor.clone(), dst - ) + ]) + + + def sending_kv_from_input_hash(self): + + # receive the input hash that the decode instance requires + input_hash_tensor = self.recv(torch.Size([1]), torch.long) + input_hash = input_hash_tensor.item() + + # execute corresponding send jobs in request queue + for request in input_hash_to_kv_sending_requests[input_hash]: + request[0](*request[1:]) + # free GPU memory occupied by sending + del input_hash_to_kv_sending_requests[input_hash] + + + def kv_cache_send_ready(self): + + self.kv_sending_thread.submit([self.sending_kv_from_input_hash]) + + + def kv_cache_recv_start(self, input_hash: int): + + input_hash_tensor = torch.tensor([input_hash]).long().to(self.device) + # notify the kv cache sender with the input hash id + torch.distributed.send(input_hash_tensor) - def 
fetch(self, - size: torch.Size, - dtype: torch.dtype, - src: Optional[int] = None, - enable_verification: bool = False) -> torch.Tensor: + def kv_cache_recv(self, + size: torch.Size, + dtype: torch.dtype, + src: Optional[int] = None, + enable_verification: bool = False) -> torch.Tensor: """Receives a tensor from the src rank (blocking).""" """This API should be used together with `push`""" """NOTE: `src` is the local rank of the destination rank.""" @@ -895,16 +929,6 @@ def graph_capture(): logger = init_logger(__name__) -class ConditionalLoggingHandler(logging.Handler): - def emit(self, record): - dist = torch.distributed - try: - if not dist.is_initialized() or (dist.is_initialized() and dist.get_rank() % 4 == 0): - msg = self.format(record) - print(msg) # You can replace this with any other logging mechanism you prefer - except Exception: - pass -logger.addHandler(ConditionalLoggingHandler()) _ENABLE_CUSTOM_ALL_REDUCE = True @@ -973,12 +997,15 @@ def init_distributed_environment( else: # offset global rank by tp * pp (which is world_size) maybe_disagg_rank = rank + world_size + + logger.debug(f"Before: world size {maybe_disagg_world_size}, rank {maybe_disagg_rank}") torch.distributed.init_process_group( backend=backend, init_method=distributed_init_method, world_size=maybe_disagg_world_size, rank=maybe_disagg_rank) + logger.debug("torch.distributed initialized") # set the local rank # local_rank is not available in torch ProcessGroup, # see https://github.com/pytorch/pytorch/issues/122816 @@ -1109,11 +1136,6 @@ def initialize_model_parallel( logger.debug("_PP initialized for rank %d", torch.distributed.get_rank()) if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - # dirty fix: temporarily lift up NCCL buffer size to 1GB - import os - os.environ["NCCL_BUFFSIZE"] = "1073741824" - import time - time.sleep(20) global _DISAGG logger.debug("Disaggregated prefill enabled, create _DISAGG group") group_ranks = [] @@ -1122,11 +1144,18 @@ def 
initialize_model_parallel( # decode global rank: i + world_size group_ranks.append([i, i + world_size]) logger.debug("Distributed group is %s", str(group_ranks)) - _DISAGG = init_model_parallel_group( - group_ranks, - get_world_group().local_rank, - backend, - use_custom_allreduce=False) + _DISAGG = init_model_parallel_group(group_ranks, + get_world_group().local_rank, + backend, + use_custom_allreduce=False) + # follow by a warmup, to warmup nccl + # necessary, as NCCL may not be warmed up when tp and pp are both 1. + temp_tensor = torch.tensor([1.]).to(_DISAGG.device) + if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": + _DISAGG.send(temp_tensor) + else: + recv_tensor = _DISAGG.recv(temp_tensor.shape, temp_tensor.dtype) + assert torch.allclose(temp_tensor, recv_tensor) logger.debug("_DISAGG initialized for rank %d", torch.distributed.get_rank()) From 6aad5cc4991aa5b25eed4d480f7fd99297f7e7d8 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 2 Aug 2024 06:05:15 +0000 Subject: [PATCH 132/278] support batch size >1 --- .../disagg_benchmarks/disagg_benchmark.sh | 66 ++++--- .../disagg_prefill_proxy_server.py | 80 ++++---- vllm/distributed/parallel_state.py | 159 +++++++++------- vllm/worker/model_runner.py | 171 ++++++++++++++---- 4 files changed, 313 insertions(+), 163 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index 58a99de7e4a9..96e3cf35d49a 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -43,16 +43,20 @@ wait_for_server() { benchmark() { + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + export VLLM_PORT=12345 + # compare chunked prefill with disaggregated prefill results_folder="./results" - model="neuralmagic/Meta-Llama-3-70B-Instruct-FP8" + model="meta-llama/Meta-Llama-3.1-70B-Instruct" dataset_name="sonnet" dataset_path="../sonnet_4x.txt" - num_prompts=50 + 
num_prompts=100 qps=$1 - prefix_len=64 - input_len=2048 + prefix_len=50 + input_len="100" output_len=$2 @@ -138,31 +142,47 @@ benchmark() { # kill_gpu_processes - - # chunked prefill with tp=4 - export VLLM_PORT=12345 - VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8100 \ - -tp 4 \ - --disable-log-stats \ - --disable-log-requests \ - --enable-chunked-prefill & - VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ +# large model +VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ -m vllm.entrypoints.openai.api_server \ --model $model \ - --port 8200 \ + --port 8100 \ -tp 4 \ - --disable-log-stats \ - --disable-log-requests \ - --enable-chunked-prefill & + --max-model-len 10000 \ + --gpu-memory-utilization 0.8 & +VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + -tp 4 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.8 & + +# # Small Model +# # prefilling instance +# VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 python3 \ +# -m vllm.entrypoints.openai.api_server \ +# --model $model \ +# --port 8100 \ +# -tp 1 \ +# --gpu-memory-utilization 0.8 \ +# --max-model-len 10000 & + +# # decoding instance +# VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ +# -m vllm.entrypoints.openai.api_server \ +# --model $model \ +# --port 8200 \ +# -tp 1 \ +# --gpu-memory-utilization 0.8 \ +# --max-model-len 10000 & + wait_for_server 8100 wait_for_server 8200 # launch a proxy server that listen from port 8000 python3 disagg_prefill_proxy_server.py & - sleep 5 + sleep 1 python3 ../benchmark_serving.py \ --backend vllm \ @@ -210,14 +230,14 @@ main() { rm -rf results mkdir results - default_qps=4 + default_qps=10 default_output_len=150 # for target_qps in 2 4 8 16 # do # benchmark $target_qps $default_output_len # done - 
benchmark 1 150 + benchmark $default_qps $default_output_len # for target_output_len in 5 10 20 40 80 # do diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index ea21e94c5f64..9028d9be86ec 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,49 +1,51 @@ -from quart import Quart, request, jsonify, Response +from quart import Quart, request, Response, jsonify, make_response +import aiohttp +import sys import httpx +import traceback +import os + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) app = Quart(__name__) async def forward_request(url, data): - async with httpx.AsyncClient() as client: - async with client.stream('POST', url, json=data) as response: - if response.status_code == 200: - # Check if the response is streaming - if 'transfer-encoding' in response.headers and response.headers['transfer-encoding'] == 'chunked': - # Stream the response - async def stream_response(): - async for chunk in response.aiter_bytes(): - yield chunk - return Response(stream_response(), status=200, content_type=response.headers.get('content-type')) - else: - # Return the full response - response_data = await response.aread() - return Response(response_data, status=200, content_type=response.headers.get('content-type')) - else: - error_data = await response.aread() - return jsonify({'error': error_data.decode()}), response.status_code - + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" + } + async with session.post(url=url, json=data, + headers=headers) as response: + if response.status == 200: + async for chunk_bytes in response.content: + yield chunk_bytes + @app.route('/v1/completions', methods=['POST']) async def handle_request(): - # Get the original request data - original_request_data = 
await request.get_json() - - # Modify the max_tokens to 1 for the request to port 8100 - modified_request_data_8100 = original_request_data.copy() - modified_request_data_8100['max_tokens'] = 1 - - # Forward the request to port 8100 - response_8100 = await forward_request('http://localhost:8100/v1/completions', modified_request_data_8100) - - if response_8100.status_code == 200: - # If the request to port 8100 is successful, forward the original request to port 8200 - response_8200 = await forward_request('http://localhost:8200/v1/completions', original_request_data) - - if response_8200.status_code == 200: - return response_8200 - else: - return jsonify({'error': 'Failed to get response from port 8200'}), response_8200.status_code - else: - return jsonify({'error': 'Failed to get response from port 8100'}), response_8100.status_code + + try: + original_request_data = await request.get_json() + + prefill_request = original_request_data.copy() + prefill_request['max_tokens'] = 1 + + # finish prefill + async for data in forward_request('http://localhost:8100/v1/completions', prefill_request): + continue + + print(f"Request {prefill_request} prefill done. 
proceeding to decode.") + + # return decode + generator = forward_request('http://localhost:8200/v1/completions', original_request_data) + response = await make_response(generator) + response.timeout = None + + return response + + except Exception as e: + exc_info = sys.exc_info() + print(e) + print("".join(traceback.format_exception(*exc_info))) if __name__ == '__main__': app.run(port=8000) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index e8c7bbe59e05..b4f668bd537e 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -215,8 +215,8 @@ def __init__( # use a threadpool to buffer send request in disaggregated prefill - self.input_hash_to_kv_sending_requests = {} - self.kv_sending_thread = ThreadPoolExecutor(max_workers=1) + self.input_hash_to_kv_sending_requests = defaultdict(list) + self.kv_sending_thread = None @property def first_rank(self): @@ -707,8 +707,6 @@ def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: if dst is None: dst = (self.rank_in_group + 1) % self.world_size - - print('Sending %.3f MB to %d' % (tensor.element_size() * tensor.numel() / 1024 / 1024, self.ranks[dst]), end=' ', flush=True) pynccl_comm = self.pynccl_comm if pynccl_comm is not None and not pynccl_comm.disabled: @@ -716,7 +714,6 @@ def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: else: torch.distributed.send(tensor, self.ranks[dst], self.device_group) - print(' End sending ', end=' ', flush=True) def recv(self, size: torch.Size, @@ -727,7 +724,6 @@ def recv(self, if src is None: src = (self.rank_in_group - 1) % self.world_size - print('Start receiving from %d',self.ranks[src]) tensor = torch.empty(size, dtype=dtype, device=self.device) pynccl_comm = self.pynccl_comm @@ -736,90 +732,129 @@ def recv(self, else: torch.distributed.recv(tensor, self.ranks[src], self.device_group) - print('End receiving') return tensor + + + def debug_send(self, tensor: torch.Tensor, 
dst: Optional[int] = None) -> None: + """Sends a tensor to the destination rank in a non-blocking way""" + """Will send several metadata. Useful for debugging.""" + """NOTE: `dst` is the local rank of the destination rank.""" + + + self.send_tensor_dict( + { + "tensor": tensor, + "mean": tensor.float().mean(), + "shape": tensor.shape + }, + dst + ) + + def debug_recv(self, + size: torch.Size, + dtype: torch.dtype, + src: Optional[int] = None) -> torch.Tensor: + """Receives a tensor from the src rank.""" + """NOTE: `src` is the local rank of the destination rank.""" + + result = self.recv_tensor_dict(src) + tensor = result["tensor"] + assert torch.allclose(result["mean"], tensor.float().mean()) + assert result["shape"] == tensor.shape + assert result["shape"] == size, f"The shape sent by sender is {result['shape']} but trying to receive {size}" + return tensor + + + def kv_cache_send(self, input_hash: int, tensor: torch.Tensor, dst: Optional[int] = None, - enable_verification: bool = False) -> None: + enable_verification: bool = True) -> None: """Push the KV cache send request into the send buffer""" """NOTE: `dst` is the local rank of the destination rank.""" - print('Pushing %.3f MB' % (tensor.element_size() * tensor.numel() / 1024 / 1024), end=' ', flush=True) - - if enable_verification: - # Send tensor, together with metadatas - # We will use this metadata to perform some sanity check - # But this transfer is VERY slow. 
- # So this is a good option for debugging but not for produciton - self.input_hash_to_kv_sending_requests[input_hash].append([ - self.send_tensor_dict, - # tensor needs to be cloned, if not the mean doesn't match - {"tensor": tensor.clone(), "mean": tensor.mean()}, - dst - ]) + send_func = self.debug_send else: - # only send tensor, use NCCL if available - # very fast but error-prone - self.input_hash_to_kv_sending_requests[input_hash].append([ - self.send, - # tensor needs to be cloned, if not the tensor may be freed - tensor.clone(), - dst - ]) - - - def sending_kv_from_input_hash(self): - - # receive the input hash that the decode instance requires - input_hash_tensor = self.recv(torch.Size([1]), torch.long) - input_hash = input_hash_tensor.item() - - # execute corresponding send jobs in request queue - for request in input_hash_to_kv_sending_requests[input_hash]: - request[0](*request[1:]) - # free GPU memory occupied by sending - del input_hash_to_kv_sending_requests[input_hash] - + send_func = self.send - def kv_cache_send_ready(self): - - self.kv_sending_thread.submit([self.sending_kv_from_input_hash]) + self.input_hash_to_kv_sending_requests[input_hash].append([ + send_func, + # tensor needs to be cloned, if not the tensor may be freed + tensor.clone(), + dst + ]) - - def kv_cache_recv_start(self, input_hash: int): - - input_hash_tensor = torch.tensor([input_hash]).long().to(self.device) - # notify the kv cache sender with the input hash id - torch.distributed.send(input_hash_tensor) - def kv_cache_recv(self, size: torch.Size, dtype: torch.dtype, src: Optional[int] = None, - enable_verification: bool = False) -> torch.Tensor: + enable_verification: bool = True) -> torch.Tensor: """Receives a tensor from the src rank (blocking).""" """This API should be used together with `push`""" """NOTE: `src` is the local rank of the destination rank.""" if enable_verification: - # receive tensor and perform verifications - result = self.recv_tensor_dict(src) - tensor = 
result["tensor"] - mean = result["mean"] - assert tensor.shape == size - assert tensor.dtype == dtype - assert tensor.mean() == mean + recv_func = self.debug_recv else: - tensor = self.recv(size, dtype, src) + recv_func = self.recv + + tensor = recv_func(size, dtype, src) return tensor + + def recv_input_hash_and_send_kv(self): + + try: + + # receive the input hash that the decode instance requires + logger.debug('Waiting for input hash ...') + # FIXME(Kuntai): debug_recv guarantees correctness but hurts perf + input_hash_tensor = self.debug_recv(torch.Size([1]), torch.long) + input_hash = input_hash_tensor.item() + logger.debug('Receiving input hash %d', input_hash) + assert input_hash in self.input_hash_to_kv_sending_requests, \ + f"The KV cache of {input_hash} does not exist." + logger.debug('Input hash %d exists, start sending', input_hash) + + # execute corresponding kv cache sending jobs in request queue + for idx, request in enumerate( + self.input_hash_to_kv_sending_requests[input_hash]): + request[0](*request[1:]) + logger.debug('Finish input hash %d, free memory...' % input_hash) + # free GPU memory occupied by sending + del self.input_hash_to_kv_sending_requests[input_hash] + + except Exception: + import sys + import traceback + + + def kv_cache_send_finish(self): + + if self.kv_sending_thread is None: + self.kv_sending_thread = ThreadPoolExecutor(max_workers=1) + + job = self.kv_sending_thread.submit(self.recv_input_hash_and_send_kv) + logger.debug(f'Submit job {job} into kv cache sending thread') + + + def kv_cache_recv_start(self, input_hash: int): + + logger.debug('Requesting KV cache transfer for input hash %d', input_hash) + + input_hash_tensor = torch.tensor([input_hash]).long().to(self.device) + # notify the kv cache sender with the input hash id + # FIXME(Kuntai): debug_send guarantees correctness but hurts perf. 
+ self.debug_send(input_hash_tensor) + + + def destroy(self): if self.device_group is not None: torch.distributed.destroy_process_group(self.device_group) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 8493c0e6fc7d..349ec6514817 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1363,7 +1363,7 @@ def execute_model( envs.VLLM_DISAGG_PREFILL_ROLE != "decode", kv_caches is None, kv_caches[0] is None]): - + # model forwarding # during forwarding the KV cache will be sent in prefill instance # see vllm/attention/backends/flash_attn.py for sending impl @@ -1375,7 +1375,7 @@ def execute_model( intermediate_tensors=intermediate_tensors, **multi_modal_kwargs, **seqlen_agnostic_kwargs) - + if all([ prefill_meta is not None, @@ -1383,47 +1383,140 @@ def execute_model( kv_caches is not None, kv_caches[0] is not None,]): # send hidden state if disaggregated prefilling enabled - - get_disagg_group().push(hidden_or_intermediate_states) - else: - # receive KV cache from disaggregated prefill instance - for i in range(model_executable.model.start_layer, - model_executable.model.end_layer): + _input_tokens_list = model_input.input_tokens.tolist() + seq_lens = model_input.seq_lens + query_lens = model_input.query_lens + slot_mapping = model_input.attn_metadata.slot_mapping.flatten() + + # failed = False + # reason = "" + + # if sum(query_lens) != sum(seq_lens): + # logger.error("Query len sum is %d but seq len sum is %d", sum(query_lens), sum(seq_lens)) + # failed=True + # if sum(query_lens) != len(_input_tokens_list): + # logger.error("Input tokens len is %d, doesn't match with query lens sum %d", + # sum(query_lens), + # len(_input_tokens_list)) + # failed=True + # if slot_mapping.shape[0] != len(_input_tokens_list): + # logger.error("Slot mapping shape is %s, mismatch with input shape %s", + # slot_mapping.shape, + # len(_input_tokens_list)) + # failed=True + # if failed: + # import subprocess + # subprocess.run("ps -e | 
grep pt_main_thread | awk '{print $1}' | xargs kill -9", shell=True) - # get kv cache - kv_cache = kv_caches[i - model_executable.model.start_layer] - # get corresponding layer - layer = model_executable.model.layers[i] - # get kv cache shape (after sliced by tp) - _, _, num_head, head_size = kv_cache[0].shape - num_tokens = model_input.input_tokens.shape[0] - key = get_disagg_group().fetch( - torch.Size([num_tokens, num_head, head_size]), - kv_cache[0].dtype - ) - value = get_disagg_group().fetch( - torch.Size([num_tokens, num_head, head_size]), - kv_cache[0].dtype - ) + # query_lens contains new KV caches that are added to vLLM. + # so we will send them to decode instance + # FIXME(Kuntai): This assume that all requests are prefill. + if query_lens is not None: + for idx, qlen in enumerate(query_lens): + + + start_pos = sum(query_lens[:idx]) + end_pos = start_pos + qlen + input_hash = hash(tuple(_input_tokens_list[start_pos:end_pos])) + + for i in range(model_executable.model.start_layer, + model_executable.model.end_layer): + kv_cache = kv_caches[i - model_executable.model.start_layer] + + _, _, num_heads, head_size = kv_cache[0].shape + + key_cache = kv_cache[0].reshape(-1, num_heads, head_size) + value_cache = kv_cache[1].reshape(-1, num_heads, head_size) + + current_slot_mapping = slot_mapping[start_pos:end_pos] + + get_disagg_group().kv_cache_send( + input_hash, + key_cache[current_slot_mapping]) + get_disagg_group().kv_cache_send( + input_hash, + value_cache[current_slot_mapping]) + + + get_disagg_group().kv_cache_send( + input_hash, + hidden_or_intermediate_states[start_pos:end_pos]) + get_disagg_group().kv_cache_send_finish() - key_cache, value_cache = kv_cache[0], kv_cache[1] - ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - model_input.attn_metadata.slot_mapping.flatten(), - layer.self_attn.attn.kv_cache_dtype, - layer.self_attn.attn._k_scale, - layer.self_attn.attn._v_scale, - ) - - hidden_or_intermediate_states = 
get_disagg_group().fetch( - torch.Size([num_tokens, model_executable.config.hidden_size]), - kv_cache[0].dtype - ) + else: + + # This is disagg decode instance, during prefill state + # Need to receive KV from the prefill instance + # FIXME(Kuntai): This impl assumes that all requests are prefill. + + _input_tokens_list = model_input.input_tokens.tolist() + query_lens = model_input.query_lens + seq_lens = model_input.seq_lens + slot_mapping = model_input.attn_metadata.slot_mapping.flatten() + + hidden_or_intermediate_states_for_one_req = [] + + # enumerate different requests + logger.debug("My query lens is %s, seq len is %s, rank is %s", + str(query_lens), + str(seq_lens), + torch.distributed.get_rank()) + if query_lens is not None: + for idx, qlen in enumerate(query_lens): + + start_pos = sum(query_lens[:idx]) + end_pos = start_pos + qlen + input_hash = hash(tuple(_input_tokens_list[start_pos:end_pos])) + num_tokens = qlen + + # notify the prefill instance to start sending KVs associated with input_hash + get_disagg_group().kv_cache_recv_start(input_hash) + + # receive KV cache from disaggregated prefill instance + for i in range(model_executable.model.start_layer, + model_executable.model.end_layer): + + # get kv cache + kv_cache = kv_caches[i - model_executable.model.start_layer] + # get corresponding layer + layer = model_executable.model.layers[i] + + # get kv cache shape (after sliced by tp) + _, _, num_heads, head_size = kv_cache[0].shape + key = get_disagg_group().kv_cache_recv( + torch.Size([num_tokens, num_heads, head_size]), + kv_cache[0].dtype + ) + value = get_disagg_group().kv_cache_recv( + torch.Size([num_tokens, num_heads, head_size]), + kv_cache[0].dtype + ) + + key_cache, value_cache = kv_cache[0], kv_cache[1] + ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + slot_mapping[start_pos:end_pos], + layer.self_attn.attn.kv_cache_dtype, + layer.self_attn.attn._k_scale, + layer.self_attn.attn._v_scale, + ) + + + 
hidden_or_intermediate_states_for_one_req.append( + get_disagg_group().kv_cache_recv( + torch.Size([num_tokens, model_executable.config.hidden_size]), + kv_cache[0].dtype + ) + ) + + # concatenate hidden states from different requests + hidden_or_intermediate_states = torch.cat(hidden_or_intermediate_states_for_one_req, dim=0) + # Compute the logits in the last pipeline stage. From e9342867c66ec03382c7be27b3d4b51f1bfcda9e Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 2 Aug 2024 06:22:31 +0000 Subject: [PATCH 133/278] update model runner --- vllm/worker/model_runner.py | 152 +++++++++++++++++++----------------- 1 file changed, 79 insertions(+), 73 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 349ec6514817..ed552d6104ec 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -27,7 +27,7 @@ from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, ModelConfig, MultiModalConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig) -from vllm.distributed import get_pp_group, get_disagg_group +from vllm.distributed import get_tp_group, get_pp_group, get_disagg_group from vllm.distributed.parallel_state import graph_capture from vllm.inputs import INPUT_REGISTRY from vllm.logger import init_logger @@ -1387,6 +1387,8 @@ def execute_model( _input_tokens_list = model_input.input_tokens.tolist() seq_lens = model_input.seq_lens query_lens = model_input.query_lens + seq_lens = get_tp_group().broadcast_object(seq_lens) + query_lens = get_tp_group().broadcast_object(query_lens) slot_mapping = model_input.attn_metadata.slot_mapping.flatten() # failed = False @@ -1413,37 +1415,38 @@ def execute_model( # query_lens contains new KV caches that are added to vLLM. # so we will send them to decode instance # FIXME(Kuntai): This assume that all requests are prefill. 
- if query_lens is not None: - for idx, qlen in enumerate(query_lens): + for idx, qlen in enumerate(query_lens): - start_pos = sum(query_lens[:idx]) - end_pos = start_pos + qlen - input_hash = hash(tuple(_input_tokens_list[start_pos:end_pos])) + start_pos = sum(query_lens[:idx]) + end_pos = start_pos + qlen + input_hash = hash(tuple(_input_tokens_list[start_pos:end_pos])) + + for i in range(model_executable.model.start_layer, + model_executable.model.end_layer): + kv_cache = kv_caches[i - model_executable.model.start_layer] - for i in range(model_executable.model.start_layer, - model_executable.model.end_layer): - kv_cache = kv_caches[i - model_executable.model.start_layer] - - _, _, num_heads, head_size = kv_cache[0].shape - - key_cache = kv_cache[0].reshape(-1, num_heads, head_size) - value_cache = kv_cache[1].reshape(-1, num_heads, head_size) + _, _, num_heads, head_size = kv_cache[0].shape + + key_cache = kv_cache[0].reshape(-1, num_heads, head_size) + value_cache = kv_cache[1].reshape(-1, num_heads, head_size) - current_slot_mapping = slot_mapping[start_pos:end_pos] + current_slot_mapping = slot_mapping[start_pos:end_pos] - get_disagg_group().kv_cache_send( - input_hash, - key_cache[current_slot_mapping]) - get_disagg_group().kv_cache_send( - input_hash, - value_cache[current_slot_mapping]) + get_disagg_group().kv_cache_send( + input_hash, + key_cache[current_slot_mapping]) + get_disagg_group().kv_cache_send( + input_hash, + value_cache[current_slot_mapping]) - get_disagg_group().kv_cache_send( - input_hash, - hidden_or_intermediate_states[start_pos:end_pos]) - get_disagg_group().kv_cache_send_finish() + get_disagg_group().kv_cache_send( + input_hash, + hidden_or_intermediate_states[start_pos:end_pos]) + get_disagg_group().kv_cache_send_finish() + + logger.error("\033[92mKV send DONE for rank %d\033[0m", torch.distributed.get_rank()) else: @@ -1452,8 +1455,10 @@ def execute_model( # FIXME(Kuntai): This impl assumes that all requests are prefill. 
_input_tokens_list = model_input.input_tokens.tolist() - query_lens = model_input.query_lens seq_lens = model_input.seq_lens + query_lens = model_input.query_lens + seq_lens = get_tp_group().broadcast_object(seq_lens) + query_lens = get_tp_group().broadcast_object(query_lens) slot_mapping = model_input.attn_metadata.slot_mapping.flatten() hidden_or_intermediate_states_for_one_req = [] @@ -1463,59 +1468,60 @@ def execute_model( str(query_lens), str(seq_lens), torch.distributed.get_rank()) - if query_lens is not None: - for idx, qlen in enumerate(query_lens): + for idx, qlen in enumerate(query_lens): - start_pos = sum(query_lens[:idx]) - end_pos = start_pos + qlen - input_hash = hash(tuple(_input_tokens_list[start_pos:end_pos])) - num_tokens = qlen - - # notify the prefill instance to start sending KVs associated with input_hash - get_disagg_group().kv_cache_recv_start(input_hash) + start_pos = sum(query_lens[:idx]) + end_pos = start_pos + qlen + input_hash = hash(tuple(_input_tokens_list[start_pos:end_pos])) + num_tokens = qlen + + # notify the prefill instance to start sending KVs associated with input_hash + get_disagg_group().kv_cache_recv_start(input_hash) - # receive KV cache from disaggregated prefill instance - for i in range(model_executable.model.start_layer, - model_executable.model.end_layer): - - # get kv cache - kv_cache = kv_caches[i - model_executable.model.start_layer] - # get corresponding layer - layer = model_executable.model.layers[i] - - # get kv cache shape (after sliced by tp) - _, _, num_heads, head_size = kv_cache[0].shape - key = get_disagg_group().kv_cache_recv( - torch.Size([num_tokens, num_heads, head_size]), - kv_cache[0].dtype - ) - value = get_disagg_group().kv_cache_recv( - torch.Size([num_tokens, num_heads, head_size]), - kv_cache[0].dtype - ) - - key_cache, value_cache = kv_cache[0], kv_cache[1] - ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - slot_mapping[start_pos:end_pos], - 
layer.self_attn.attn.kv_cache_dtype, - layer.self_attn.attn._k_scale, - layer.self_attn.attn._v_scale, - ) + # receive KV cache from disaggregated prefill instance + for i in range(model_executable.model.start_layer, + model_executable.model.end_layer): + + # get kv cache + kv_cache = kv_caches[i - model_executable.model.start_layer] + # get corresponding layer + layer = model_executable.model.layers[i] + + # get kv cache shape (after sliced by tp) + _, _, num_heads, head_size = kv_cache[0].shape + key = get_disagg_group().kv_cache_recv( + torch.Size([num_tokens, num_heads, head_size]), + kv_cache[0].dtype + ) + value = get_disagg_group().kv_cache_recv( + torch.Size([num_tokens, num_heads, head_size]), + kv_cache[0].dtype + ) + + key_cache, value_cache = kv_cache[0], kv_cache[1] + ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + slot_mapping[start_pos:end_pos], + layer.self_attn.attn.kv_cache_dtype, + layer.self_attn.attn._k_scale, + layer.self_attn.attn._v_scale, + ) - hidden_or_intermediate_states_for_one_req.append( - get_disagg_group().kv_cache_recv( - torch.Size([num_tokens, model_executable.config.hidden_size]), - kv_cache[0].dtype - ) + hidden_or_intermediate_states_for_one_req.append( + get_disagg_group().kv_cache_recv( + torch.Size([num_tokens, model_executable.config.hidden_size]), + kv_cache[0].dtype ) + ) + + # concatenate hidden states from different requests + hidden_or_intermediate_states = torch.cat(hidden_or_intermediate_states_for_one_req, dim=0) - # concatenate hidden states from different requests - hidden_or_intermediate_states = torch.cat(hidden_or_intermediate_states_for_one_req, dim=0) + logger.error("\033[92mKV receive DONE for rank %d\033[0m", torch.distributed.get_rank()) From b68435ac66dafca1e2f36d75a7ffaf8891f43d86 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 4 Aug 2024 03:56:38 +0000 Subject: [PATCH 134/278] move group coordinator to a separate file, move disagg implementations to a separate file --- 
tests/distributed/test_parallel_state.py | 2 +- vllm/distributed/distributed_kv.py | 169 ++++++ vllm/distributed/group_coordinator.py | 714 +++++++++++++++++++++++ 3 files changed, 884 insertions(+), 1 deletion(-) create mode 100644 vllm/distributed/distributed_kv.py create mode 100644 vllm/distributed/group_coordinator.py diff --git a/tests/distributed/test_parallel_state.py b/tests/distributed/test_parallel_state.py index 3adcf6b61046..cbb01239521a 100644 --- a/tests/distributed/test_parallel_state.py +++ b/tests/distributed/test_parallel_state.py @@ -3,7 +3,7 @@ import pytest import torch -from vllm.distributed.parallel_state import (_split_tensor_dict, +from vllm.distributed.group_coordinator import (_split_tensor_dict, _update_nested_dict) diff --git a/vllm/distributed/distributed_kv.py b/vllm/distributed/distributed_kv.py new file mode 100644 index 000000000000..1ed6c03872e6 --- /dev/null +++ b/vllm/distributed/distributed_kv.py @@ -0,0 +1,169 @@ +"""vLLM distributed KV cache transfer API. +These APIs are used in `vllm/worker/model_runner.py`. +""" +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch.distributed import Backend, ProcessGroup + +import vllm.envs as envs +from vllm.distributed.group_coordinator import GroupCoordinator + + +assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode"], \ + "VLLM_DISAGG_PREFILL_ROLE can only be prefill or decode." + +IS_DISTRIBUTED_KV_INSTANCE = (envs.VLLM_DISAGG_PREFILL_ROLE is not None) +IS_KV_PREFILL_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "prefill") +IS_KV_DECODE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "decode") + + +class DistributedKVCoordinator(GroupCoordinator): + """ + A class designated for distributed KV transfer + + Target use cases: + 1. Disaggregated prefill + 2. 
Remote KV cache storage + """ + + def __init__( + self, + group_ranks: List[List[int]], + local_rank: int, + torch_distributed_backend: Union[str, Backend], + use_pynccl: bool = True, + use_custom_allreduce: bool = False, + use_tpu_communicator: bool = True, + use_message_queue_broadcaster: bool = False, + use_cpu_verfication: bool = True, + ): + + super().__init__( + group_ranks, + local_rank, + torch_distributed_backend, + use_pynccl, + use_custom_allreduce, + use_tpu_communicator, + use_message_queue_broadcaster, + ) + + # if turned on, will use CPU-based communication to perform a series of sanity check. + # but it adds ~5ms delay, so please turn it off in performance-demanding usecases (e.g. disaggregated prefill) + self.use_cpu_verfication = use_cpu_verfication + + # use a threadpool to buffer send request in disaggregated prefill + self.input_hash_to_kv_sending_requests = defaultdict(list) + self.kv_sending_thread = None + + def debug_send(self, + tensor: torch.Tensor, + dst: Optional[int] = None) -> None: + """Sends a tensor to the destination rank in a non-blocking way""" + """Will send several metadata. 
Useful for debugging.""" + """NOTE: `dst` is the local rank of the destination rank.""" + + self.send_tensor_dict( + { + "tensor": tensor, + "mean": tensor.float().mean(), + "shape": tensor.shape + }, dst) + + def debug_recv(self, + size: torch.Size, + dtype: torch.dtype, + src: Optional[int] = None) -> torch.Tensor: + """Receives a tensor from the src rank.""" + """NOTE: `src` is the local rank of the destination rank.""" + + result = self.recv_tensor_dict(src) + tensor = result["tensor"] + assert torch.allclose(result["mean"], tensor.float().mean()) + assert result["shape"] == tensor.shape + assert result[ + "shape"] == size, f"The shape sent by sender is {result['shape']} but trying to receive {size}" + return tensor + + def kv_cache_send(self, + input_hash: int, + tensor: torch.Tensor, + dst: Optional[int] = None, + enable_verification: bool = True) -> None: + """Push the KV cache send request into the send buffer""" + """NOTE: `dst` is the local rank of the destination rank.""" + + if enable_verification: + send_func = self.debug_send + else: + send_func = self.send + + self.input_hash_to_kv_sending_requests[input_hash].append([ + send_func, + # tensor needs to be cloned, if not the tensor may be freed + tensor.clone(), + dst + ]) + + def kv_cache_recv(self, + size: torch.Size, + dtype: torch.dtype, + src: Optional[int] = None, + enable_verification: bool = True) -> torch.Tensor: + """Receives a tensor from the src rank (blocking).""" + """This API should be used together with `push`""" + """NOTE: `src` is the local rank of the destination rank.""" + + if enable_verification: + recv_func = self.debug_recv + else: + recv_func = self.recv + + tensor = recv_func(size, dtype, src) + + return tensor + + def recv_input_hash_and_send_kv(self): + + try: + + # receive the input hash that the decode instance requires + logger.debug('Waiting for input hash ...') + # FIXME(Kuntai): debug_recv guarantees correctness but hurts perf + input_hash_tensor = 
self.debug_recv(torch.Size([1]), torch.long) + input_hash = input_hash_tensor.item() + logger.debug('Receiving input hash %d', input_hash) + assert input_hash in self.input_hash_to_kv_sending_requests, \ + f"The KV cache of {input_hash} does not exist." + logger.debug('Input hash %d exists, start sending', input_hash) + + # execute corresponding kv cache sending jobs in request queue + for idx, request in enumerate( + self.input_hash_to_kv_sending_requests[input_hash]): + request[0](*request[1:]) + logger.debug('Finish input hash %d, free memory...' % input_hash) + # free GPU memory occupied by sending + del self.input_hash_to_kv_sending_requests[input_hash] + + except Exception: + import sys + import traceback + + def kv_cache_send_finish(self): + + if self.kv_sending_thread is None: + self.kv_sending_thread = ThreadPoolExecutor(max_workers=1) + + job = self.kv_sending_thread.submit(self.recv_input_hash_and_send_kv) + logger.debug(f'Submit job {job} into kv cache sending thread') + + def kv_cache_recv_start(self, input_hash: int): + + logger.debug('Requesting KV cache transfer for input hash %d', + input_hash) + + input_hash_tensor = torch.tensor([input_hash]).long().to(self.device) + # notify the kv cache sender with the input hash id + # FIXME(Kuntai): debug_send guarantees correctness but hurts perf. + self.debug_send(input_hash_tensor) diff --git a/vllm/distributed/group_coordinator.py b/vllm/distributed/group_coordinator.py new file mode 100644 index 000000000000..1202ed652a0b --- /dev/null +++ b/vllm/distributed/group_coordinator.py @@ -0,0 +1,714 @@ +"""vLLM PyTorch ProcessGroup wrapper for a group of processes. + PyTorch ProcessGroup is bound to one specific communication backend, + e.g. NCCL, Gloo, MPI, etc. + GroupCoordinator takes charge of all the communication operations among + the processes in the group. It can route the communication to + a specific implementation (e.g. 
switch allreduce implementation + based on the tensor size and cuda graph mode). +""" + +from dataclasses import dataclass +from contextlib import contextmanager, nullcontext +from collections import namedtuple +from typing import Any, Dict, List, Optional, Tuple, Union +import pickle + +import torch +from torch.distributed import Backend, ProcessGroup + + +@dataclass +class GraphCaptureContext: + stream: torch.cuda.Stream + + +TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"]) + + +def _split_tensor_dict( + tensor_dict: Dict[str, Union[torch.Tensor, Any]], + prefix: str = "") -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]: + """Split the tensor dictionary into two parts: + 1. A list of (key, value) pairs. If the value is a tensor, it is replaced + by its metadata. + 2. A list of tensors. + + If the Tensor is nested under `tensor_dict["key1"]["key2"]`, the key of its + metadata will be "key1%key2". + """ + metadata_list: List[Tuple[str, Any]] = [] + tensor_list = [] + for key, value in tensor_dict.items(): + assert "%" not in key, ( + "Avoid having '%' in key " + "as it is used as a separator for nested entries.") + if isinstance(value, torch.Tensor): + # Note: we cannot use `value.device` here, + # because it contains not only the device type but also the device + # index (e.g. "cuda:0"). We only need the device type. + # receiving side will set the device index. 
+ device = value.device.type + metadata_list.append( + (prefix + key, TensorMetadata(device, value.dtype, + value.size()))) + tensor_list.append(value) + elif isinstance(value, dict): + if len(value) == 0: + metadata_list.append((prefix + key, value)) + inner_metadata_list, inner_tensor_list = _split_tensor_dict( + value, prefix + key + "%") + metadata_list.extend(inner_metadata_list) + tensor_list.extend(inner_tensor_list) + else: + metadata_list.append((prefix + key, value)) + return metadata_list, tensor_list + + +def _update_nested_dict(nested_dict, flattened_key, value): + key_splits = flattened_key.split("%") + cur_dict = nested_dict + for k in key_splits[:-1]: + if k not in cur_dict: + cur_dict[k] = {} + cur_dict = cur_dict[k] + cur_dict[key_splits[-1]] = value + + +class GroupCoordinator: + """ + PyTorch ProcessGroup wrapper for a group of processes. + PyTorch ProcessGroup is bound to one specific communication backend, + e.g. NCCL, Gloo, MPI, etc. + GroupCoordinator takes charge of all the communication operations among + the processes in the group. It can route the communication to + a specific implementation (e.g. switch allreduce implementation + based on the tensor size and cuda graph mode). 
+ """ + + # available attributes: + rank: int # global rank + ranks: List[int] # global ranks in the group + world_size: int # size of the group + # difference between `local_rank` and `rank_in_group`: + # if we have a group of size 4 across two nodes: + # Process | Node | Rank | Local Rank | Rank in Group + # 0 | 0 | 0 | 0 | 0 + # 1 | 0 | 1 | 1 | 1 + # 2 | 1 | 2 | 0 | 2 + # 3 | 1 | 3 | 1 | 3 + local_rank: int # local rank used to assign devices + rank_in_group: int # rank inside the group + cpu_group: ProcessGroup # group for CPU communication + device_group: ProcessGroup # group for device communication + use_pynccl: bool # a hint of whether to use PyNccl + use_custom_allreduce: bool # a hint of whether to use CustomAllreduce + # communicators are only created for world size > 1 + pynccl_comm: Optional[Any] # PyNccl communicator + ca_comm: Optional[Any] # Custom allreduce communicator + mq_broadcaster: Optional[Any] # shared memory broadcaster + + def __init__( + self, + group_ranks: List[List[int]], + local_rank: int, + torch_distributed_backend: Union[str, Backend], + use_pynccl: bool, + use_custom_allreduce: bool, + use_tpu_communicator: bool, + use_message_queue_broadcaster: bool = False, + ): + + self.rank = torch.distributed.get_rank() + self.local_rank = local_rank + self.device_group = None + self.cpu_group = None + + for ranks in group_ranks: + device_group = torch.distributed.new_group( + ranks, backend=torch_distributed_backend) + # a group with `gloo` backend, to allow direct coordination between + # processes through the CPU. 
+ cpu_group = torch.distributed.new_group(ranks, backend="gloo") + if self.rank in ranks: + self.ranks = ranks + self.world_size = len(ranks) + self.rank_in_group = ranks.index(self.rank) + self.device_group = device_group + self.cpu_group = cpu_group + + assert self.cpu_group is not None + assert self.device_group is not None + + if torch.cuda.is_available(): + self.device = torch.device(f"cuda:{local_rank}") + else: + self.device = torch.device("cpu") + + self.use_pynccl = use_pynccl + self.use_custom_allreduce = use_custom_allreduce + self.use_tpu_communicator = use_tpu_communicator + + # lazy import to avoid documentation build error + from vllm.distributed.device_communicators.custom_all_reduce import ( + CustomAllreduce) + from vllm.distributed.device_communicators.pynccl import ( + PyNcclCommunicator) + + self.pynccl_comm: Optional[PyNcclCommunicator] + if use_pynccl and self.world_size > 1: + self.pynccl_comm = PyNcclCommunicator( + group=self.cpu_group, + device=self.device, + ) + else: + self.pynccl_comm = None + + self.ca_comm: Optional[CustomAllreduce] + if use_custom_allreduce and self.world_size > 1: + # Initialize a custom fast all-reduce implementation. 
+ self.ca_comm = CustomAllreduce( + group=self.cpu_group, + device=self.device, + ) + else: + self.ca_comm = None + + from vllm.distributed.device_communicators.tpu_communicator import ( + TpuCommunicator) + self.tpu_communicator: Optional[TpuCommunicator] + if use_tpu_communicator and self.world_size > 1: + self.tpu_communicator = TpuCommunicator(group=self.cpu_group) + + from vllm.distributed.device_communicators.shm_broadcast import ( + MessageQueue) + self.mq_broadcaster: Optional[MessageQueue] = None + if use_message_queue_broadcaster and self.world_size > 1: + self.mq_broadcaster = MessageQueue.create_from_process_group( + self.cpu_group, 1 << 22, 6) + + @property + def first_rank(self): + """Return the global rank of the first process in the group""" + return self.ranks[0] + + @property + def last_rank(self): + """Return the global rank of the last process in the group""" + return self.ranks[-1] + + @property + def is_first_rank(self): + """Return whether the caller is the first process in the group""" + return self.rank == self.first_rank + + @property + def is_last_rank(self): + """Return whether the caller is the last process in the group""" + return self.rank == self.last_rank + + @property + def next_rank(self): + """Return the global rank of the process that follows the caller""" + rank_in_group = self.rank_in_group + world_size = self.world_size + return self.ranks[(rank_in_group + 1) % world_size] + + @property + def prev_rank(self): + """Return the global rank of the process that precedes the caller""" + rank_in_group = self.rank_in_group + world_size = self.world_size + return self.ranks[(rank_in_group - 1) % world_size] + + @contextmanager + def graph_capture( + self, graph_capture_context: Optional[GraphCaptureContext] = None): + if graph_capture_context is None: + stream = torch.cuda.Stream() + graph_capture_context = GraphCaptureContext(stream) + else: + stream = graph_capture_context.stream + + ca_comm = self.ca_comm + maybe_ca_context = 
nullcontext( + ) if ca_comm is None else ca_comm.capture() + + # ensure all initialization operations complete before attempting to + # capture the graph on another stream + curr_stream = torch.cuda.current_stream() + if curr_stream != stream: + stream.wait_stream(curr_stream) + + with torch.cuda.stream(stream), maybe_ca_context: + # In graph mode, we have to be very careful about the collective + # operations. The current status is: + # allreduce \ Mode | Eager | Graph | + # -------------------------------------------- + # custom allreduce | enabled | enabled | + # PyNccl | disabled| enabled | + # torch.distributed | enabled | disabled| + # + # Note that custom allreduce will have a runtime check, if the + # tensor size is too large, it will fallback to the next + # available option. + # In summary: When using CUDA graph, we use + # either custom all-reduce kernel or pynccl. When not using + # CUDA graph, we use either custom all-reduce kernel or + # PyTorch NCCL. We always prioritize using custom all-reduce + # kernel but fall back to PyTorch or pynccl if it is + # disabled or not supported. + pynccl_comm = self.pynccl_comm + maybe_pynccl_context: Any + if not pynccl_comm: + maybe_pynccl_context = nullcontext() + else: + maybe_pynccl_context = pynccl_comm.change_state( + enable=True, stream=torch.cuda.current_stream()) + with maybe_pynccl_context: + yield graph_capture_context + + def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: + """ + NOTE: This operation will be applied in-place or out-of-place. + Always assume this function modifies its input, but use the return + value as the output. + """ + ca_comm = self.ca_comm + + # Bypass the function if we are using only 1 GPU. + if self.world_size == 1: + return input_ + + # For TPUs, use TPU communicator. 
+ tpu_comm = self.tpu_communicator + if tpu_comm is not None and not tpu_comm.disabled: + return tpu_comm.all_reduce(input_) + + if ca_comm is not None: + out = ca_comm.custom_all_reduce(input_) + if out is not None: + return out + pynccl_comm = self.pynccl_comm + if (pynccl_comm is not None and not pynccl_comm.disabled): + pynccl_comm.all_reduce(input_) + elif input_.is_cpu: + import intel_extension_for_pytorch as ipex + ipex.distributed.all_reduce(input_, group=self.device_group) + else: + torch.distributed.all_reduce(input_, group=self.device_group) + return input_ + + def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: + world_size = self.world_size + # Bypass the function if we are using only 1 GPU. + if world_size == 1: + return input_ + assert -input_.dim() <= dim < input_.dim(), ( + f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") + + # For TPUs, use TPU communicator. + tpu_comm = self.tpu_communicator + if tpu_comm is not None and not tpu_comm.disabled: + return tpu_comm.all_gather(input_, dim) + + if dim < 0: + # Convert negative dim to positive. + dim += input_.dim() + input_size = input_.size() + # Allocate output tensor. + output_tensor = torch.empty((world_size, ) + input_size, + dtype=input_.dtype, + device=input_.device) + # All-gather. + torch.distributed.all_gather_into_tensor(output_tensor, + input_, + group=self.device_group) + # Reshape + output_tensor = output_tensor.movedim(0, dim) + output_tensor = output_tensor.reshape(input_size[:dim] + + (world_size * + input_size[dim], ) + + input_size[dim + 1:]) + return output_tensor + + def gather(self, + input_: torch.Tensor, + dst: int = 0, + dim: int = -1) -> torch.Tensor: + """ + NOTE: We assume that the input tensor is on the same device across + all the ranks. + NOTE: `dst` is the local rank of the destination rank. + """ + world_size = self.world_size + # Bypass the function if we are using only 1 GPU. 
+ if world_size == 1: + return input_ + assert -input_.dim() <= dim < input_.dim(), ( + f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") + if dim < 0: + # Convert negative dim to positive. + dim += input_.dim() + # Allocate output tensor. + if self.rank_in_group == dst: + gather_list = [torch.empty_like(input_) for _ in range(world_size)] + else: + gather_list = None + # Gather. + torch.distributed.gather(input_, + gather_list, + dst=self.ranks[dst], + group=self.device_group) + if self.rank_in_group == dst: + output_tensor = torch.cat(gather_list, dim=dim) + else: + output_tensor = None + return output_tensor + + def broadcast(self, input_: torch.Tensor, src: int = 0): + """Broadcast the input tensor. + NOTE: `src` is the local rank of the source rank. + """ + assert src < self.world_size, f"Invalid src rank ({src})" + + # Bypass the function if we are using only 1 GPU. + if self.world_size == 1: + return input_ + # Broadcast. + torch.distributed.broadcast(input_, + src=self.ranks[src], + group=self.device_group) + return input_ + + def broadcast_object(self, obj: Optional[Any] = None, src: int = 0): + """Broadcast the input object. + NOTE: `src` is the local rank of the source rank. + """ + assert src < self.world_size, f"Invalid src rank ({src})" + + # Bypass the function if we are using only 1 GPU. + if self.world_size == 1: + return obj + if self.mq_broadcaster is not None: + assert src == 0, "Message queue broadcaster only supports src=0" + return self.mq_broadcaster.broadcast_object(obj) + if self.rank_in_group == src: + torch.distributed.broadcast_object_list([obj], + src=self.ranks[src], + group=self.cpu_group) + return obj + else: + recv = [None] + torch.distributed.broadcast_object_list(recv, + src=self.ranks[src], + group=self.cpu_group) + return recv[0] + + def broadcast_object_list(self, + obj_list: List[Any], + src: int = 0, + group: Optional[ProcessGroup] = None): + """Broadcast the input object list. 
+ NOTE: `src` is the local rank of the source rank. + """ + assert src < self.world_size, f"Invalid src rank ({src})" + + # Bypass the function if we are using only 1 GPU. + if self.world_size == 1: + return obj_list + # Broadcast. + torch.distributed.broadcast_object_list(obj_list, + src=self.ranks[src], + group=self.device_group) + return obj_list + + def send_object(self, obj: Any, dst: int) -> None: + """Send the input object list to the destination rank.""" + """NOTE: `dst` is the local rank of the destination rank.""" + + assert dst < self.world_size, f"Invalid dst rank ({dst})" + + assert dst != self.rank_in_group, ( + "Invalid destination rank. Destination rank is the same " + "as the current rank.") + + # Serialize object to tensor and get the size as well + object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8) + + size_tensor = torch.tensor([object_tensor.numel()], + dtype=torch.long, + device="cpu") + + # Send object size + + torch.distributed.send(size_tensor, + dst=self.ranks[dst], + group=self.cpu_group) + + # Send object + torch.distributed.send(object_tensor, + dst=self.ranks[dst], + group=self.cpu_group) + + return None + + def recv_object(self, src: int) -> Any: + """Receive the input object list from the source rank.""" + """NOTE: `src` is the local rank of the source rank.""" + + assert src < self.world_size, f"Invalid src rank ({src})" + + assert src != self.rank_in_group, ( + "Invalid source rank. Source rank is the same as the current rank." + ) + + size_tensor = torch.empty(1, dtype=torch.long, device="cpu") + + # Receive object size + rank_size = torch.distributed.recv(size_tensor, + src=self.ranks[src], + group=self.cpu_group) + + # Tensor to receive serialized objects into. 
+ object_tensor = torch.empty( # type: ignore[call-overload] + size_tensor.item(), # type: ignore[arg-type] + dtype=torch.uint8, + device="cpu") + + rank_object = torch.distributed.recv(object_tensor, + src=self.ranks[src], + group=self.cpu_group) + + assert rank_object == rank_size, ( + "Received object sender rank does not match the size sender rank.") + + obj = pickle.loads(object_tensor.numpy().tobytes()) + + return obj + + def broadcast_tensor_dict( + self, + tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None, + src: int = 0, + group: Optional[ProcessGroup] = None, + metadata_group: Optional[ProcessGroup] = None + ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: + """Broadcast the input tensor dictionary. + NOTE: `src` is the local rank of the source rank. + """ + # Bypass the function if we are using only 1 GPU. + if (not torch.distributed.is_initialized() or self.world_size == 1): + return tensor_dict + + group = self.device_group + metadata_group = self.cpu_group + assert src < self.world_size, f"Invalid src rank ({src})" + + rank_in_group = self.rank_in_group + if rank_in_group == src: + metadata_list: List[Tuple[Any, Any]] = [] + assert isinstance( + tensor_dict, + dict), (f"Expecting a dictionary, got {type(tensor_dict)}") + metadata_list, tensor_list = _split_tensor_dict(tensor_dict) + # `metadata_list` lives in CPU memory. + # `broadcast_object_list` has serialization & deserialization, + # all happening on CPU. Therefore, we can use the CPU group. + self.broadcast_object(metadata_list, src=src) + async_handles = [] + for tensor in tensor_list: + if tensor.numel() == 0: + # Skip broadcasting empty tensors. 
+ continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + handle = torch.distributed.broadcast(tensor, + src=self.ranks[src], + group=metadata_group, + async_op=True) + else: + # use group for GPU tensors + handle = torch.distributed.broadcast(tensor, + src=self.ranks[src], + group=group, + async_op=True) + async_handles.append(handle) + for async_handle in async_handles: + async_handle.wait() + + else: + metadata_list = self.broadcast_object(None, src=src) + tensor_dict = {} + async_handles = [] + for key, value in metadata_list: + if isinstance(value, TensorMetadata): + tensor = torch.empty(value.size, + dtype=value.dtype, + device=value.device) + if tensor.numel() == 0: + # Skip broadcasting empty tensors. + _update_nested_dict(tensor_dict, key, tensor) + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + handle = torch.distributed.broadcast( + tensor, + src=self.ranks[src], + group=metadata_group, + async_op=True) + else: + # use group for GPU tensors + handle = torch.distributed.broadcast( + tensor, + src=self.ranks[src], + group=group, + async_op=True) + async_handles.append(handle) + _update_nested_dict(tensor_dict, key, tensor) + else: + _update_nested_dict(tensor_dict, key, value) + for async_handle in async_handles: + async_handle.wait() + return tensor_dict + + def send_tensor_dict( + self, + tensor_dict: Dict[str, Union[torch.Tensor, Any]], + dst: Optional[int] = None + ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: + """Send the input tensor dictionary. + NOTE: `dst` is the local rank of the source rank. + """ + # Bypass the function if we are using only 1 GPU. 
+ if not torch.distributed.is_initialized() or self.world_size == 1: + return tensor_dict + + group = self.device_group + metadata_group = self.cpu_group + + if dst is None: + dst = (self.rank_in_group + 1) % self.world_size + assert dst < self.world_size, f"Invalid dst rank ({dst})" + + metadata_list: List[Tuple[Any, Any]] = [] + assert isinstance( + tensor_dict, + dict), f"Expecting a dictionary, got {type(tensor_dict)}" + metadata_list, tensor_list = _split_tensor_dict(tensor_dict) + # `metadata_list` lives in CPU memory. + # `send_object_list` has serialization & deserialization, + # all happening on CPU. Therefore, we can use the CPU group. + self.send_object(metadata_list, dst=dst) + for tensor in tensor_list: + if tensor.numel() == 0: + # Skip sending empty tensors. + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + torch.distributed.send(tensor, + dst=self.ranks[dst], + group=metadata_group) + else: + # use group for GPU tensors + torch.distributed.send(tensor, + dst=self.ranks[dst], + group=group) + return None + + def recv_tensor_dict( + self, + src: Optional[int] = None + ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: + """Recv the input tensor dictionary. + NOTE: `src` is the local rank of the source rank. + """ + # Bypass the function if we are using only 1 GPU. + if not torch.distributed.is_initialized() or self.world_size == 1: + return None + + group = self.device_group + metadata_group = self.cpu_group + + if src is None: + src = (self.rank_in_group - 1) % self.world_size + assert src < self.world_size, f"Invalid src rank ({src})" + + recv_metadata_list = self.recv_object(src=src) + tensor_dict: Dict[str, Any] = {} + for key, value in recv_metadata_list: + if isinstance(value, TensorMetadata): + tensor = torch.empty(value.size, + dtype=value.dtype, + device=value.device) + if tensor.numel() == 0: + # Skip broadcasting empty tensors. 
+ _update_nested_dict(tensor_dict, key, tensor) + continue + if tensor.is_cpu: + # use metadata_group for CPU tensors + torch.distributed.recv(tensor, + src=self.ranks[src], + group=metadata_group) + else: + # use group for GPU tensors + torch.distributed.recv(tensor, + src=self.ranks[src], + group=group) + _update_nested_dict(tensor_dict, key, tensor) + else: + _update_nested_dict(tensor_dict, key, value) + return tensor_dict + + def barrier(self): + """Barrier synchronization among the group. + NOTE: don't use `device_group` here! `barrier` in NCCL is + terrible because it is internally a broadcast operation with + secretly created GPU tensors. It is easy to mess up the current + device. Use the CPU group instead. + """ + torch.distributed.barrier(group=self.cpu_group) + + def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: + """Sends a tensor to the destination rank in a non-blocking way""" + """NOTE: `dst` is the local rank of the destination rank.""" + + if dst is None: + dst = (self.rank_in_group + 1) % self.world_size + + pynccl_comm = self.pynccl_comm + if pynccl_comm is not None and not pynccl_comm.disabled: + pynccl_comm.send(tensor, dst) + else: + torch.distributed.send(tensor, self.ranks[dst], self.device_group) + + def recv(self, + size: torch.Size, + dtype: torch.dtype, + src: Optional[int] = None) -> torch.Tensor: + """Receives a tensor from the src rank.""" + """NOTE: `src` is the local rank of the destination rank.""" + if src is None: + src = (self.rank_in_group - 1) % self.world_size + + tensor = torch.empty(size, dtype=dtype, device=self.device) + pynccl_comm = self.pynccl_comm + if pynccl_comm is not None and not pynccl_comm.disabled: + pynccl_comm.recv(tensor, src) + else: + torch.distributed.recv(tensor, self.ranks[src], self.device_group) + + return tensor + + def destroy(self): + if self.device_group is not None: + torch.distributed.destroy_process_group(self.device_group) + self.device_group = None + if self.cpu_group 
is not None: + torch.distributed.destroy_process_group(self.cpu_group) + self.cpu_group = None + if self.pynccl_comm is not None: + self.pynccl_comm = None + if self.ca_comm is not None: + self.ca_comm = None + if self.mq_broadcaster is not None: + self.mq_broadcaster = None From e54f7a3b9413d6ec72b9c2a153987e7f4402f4bd Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 4 Aug 2024 04:06:54 +0000 Subject: [PATCH 135/278] no need to send during attention --- vllm/attention/backends/flash_attn.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 330addd3b52f..9aaac2886510 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -493,15 +493,6 @@ def forward( v_scale, ) - # send out the KV cache when current vllm is prefill instance - # the corresponding receive code is in vllm/worker/model_runner.py - if all([ - envs.VLLM_DISAGG_PREFILL_ROLE == "prefill", - attn_metadata.prefill_metadata is not None]): - - get_disagg_group().push(key) - get_disagg_group().push(value) - num_prefill_tokens = attn_metadata.num_prefill_tokens num_decode_tokens = attn_metadata.num_decode_tokens assert key.shape[0] == num_prefill_tokens + num_decode_tokens @@ -572,4 +563,4 @@ def forward( ).squeeze(1) # Reshape the output tensor. 
- return output.view(num_tokens, hidden_size) \ No newline at end of file + return output.view(num_tokens, hidden_size) From 23c99496a4f9ee0236454161c89d742a0476bfcb Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 4 Aug 2024 07:44:00 +0000 Subject: [PATCH 136/278] debug tp --- vllm/distributed/group_coordinator.py | 11 +- vllm/distributed/parallel_state.py | 914 ++------------------------ vllm/worker/model_runner.py | 175 +---- 3 files changed, 78 insertions(+), 1022 deletions(-) diff --git a/vllm/distributed/group_coordinator.py b/vllm/distributed/group_coordinator.py index 1202ed652a0b..411e59ce2440 100644 --- a/vllm/distributed/group_coordinator.py +++ b/vllm/distributed/group_coordinator.py @@ -637,9 +637,14 @@ def recv_tensor_dict( tensor_dict: Dict[str, Any] = {} for key, value in recv_metadata_list: if isinstance(value, TensorMetadata): + target_device = value.device + if 'cuda' in value.device: + # receiving a cuda tensor + # need to allocate buffer on LOCAL cuda device + target_device = self.device tensor = torch.empty(value.size, dtype=value.dtype, - device=value.device) + device=target_device) if tensor.numel() == 0: # Skip broadcasting empty tensors. 
_update_nested_dict(tensor_dict, key, tensor) @@ -652,8 +657,8 @@ def recv_tensor_dict( else: # use group for GPU tensors torch.distributed.recv(tensor, - src=self.ranks[src], - group=group) + src=self.ranks[src], + group=group) _update_nested_dict(tensor_dict, key, tensor) else: _update_nested_dict(tensor_dict, key, value) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index b4f668bd537e..d2f8d7ea2a21 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -24,13 +24,12 @@ import contextlib import pickle import logging -from collections import namedtuple, defaultdict +from collections import namedtuple from contextlib import contextmanager, nullcontext from dataclasses import dataclass from multiprocessing import shared_memory from typing import Any, Dict, List, Optional, Tuple, Union from unittest.mock import patch -from concurrent.futures import ThreadPoolExecutor import queue import torch @@ -39,837 +38,9 @@ import vllm.envs as envs from vllm.logger import init_logger +from vllm.distributed.group_coordinator import GroupCoordinator +import vllm.distributed.distributed_kv as dist_kv - -@dataclass -class GraphCaptureContext: - stream: torch.cuda.Stream - - -TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"]) - - -def _split_tensor_dict( - tensor_dict: Dict[str, Union[torch.Tensor, Any]], - prefix: str = "") -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]: - """Split the tensor dictionary into two parts: - 1. A list of (key, value) pairs. If the value is a tensor, it is replaced - by its metadata. - 2. A list of tensors. - - If the Tensor is nested under `tensor_dict["key1"]["key2"]`, the key of its - metadata will be "key1%key2". 
- """ - metadata_list: List[Tuple[str, Any]] = [] - tensor_list = [] - for key, value in tensor_dict.items(): - assert "%" not in key, ( - "Avoid having '%' in key " - "as it is used as a separator for nested entries.") - if isinstance(value, torch.Tensor): - # Note: we cannot use `value.device` here, - # because it contains not only the device type but also the device - # index (e.g. "cuda:0"). We only need the device type. - # receiving side will set the device index. - device = value.device.type - metadata_list.append( - (prefix + key, TensorMetadata(device, value.dtype, - value.size()))) - tensor_list.append(value) - elif isinstance(value, dict): - if len(value) == 0: - metadata_list.append((prefix + key, value)) - inner_metadata_list, inner_tensor_list = _split_tensor_dict( - value, prefix + key + "%") - metadata_list.extend(inner_metadata_list) - tensor_list.extend(inner_tensor_list) - else: - metadata_list.append((prefix + key, value)) - return metadata_list, tensor_list - - -def _update_nested_dict(nested_dict, flattened_key, value): - key_splits = flattened_key.split("%") - cur_dict = nested_dict - for k in key_splits[:-1]: - if k not in cur_dict: - cur_dict[k] = {} - cur_dict = cur_dict[k] - cur_dict[key_splits[-1]] = value - - -class GroupCoordinator: - """ - PyTorch ProcessGroup wrapper for a group of processes. - PyTorch ProcessGroup is bound to one specific communication backend, - e.g. NCCL, Gloo, MPI, etc. - GroupCoordinator takes charge of all the communication operations among - the processes in the group. It can route the communication to - a specific implementation (e.g. switch allreduce implementation - based on the tensor size and cuda graph mode). 
- """ - - # available attributes: - rank: int # global rank - ranks: List[int] # global ranks in the group - world_size: int # size of the group - # difference between `local_rank` and `rank_in_group`: - # if we have a group of size 4 across two nodes: - # Process | Node | Rank | Local Rank | Rank in Group - # 0 | 0 | 0 | 0 | 0 - # 1 | 0 | 1 | 1 | 1 - # 2 | 1 | 2 | 0 | 2 - # 3 | 1 | 3 | 1 | 3 - local_rank: int # local rank used to assign devices - rank_in_group: int # rank inside the group - cpu_group: ProcessGroup # group for CPU communication - device_group: ProcessGroup # group for device communication - use_pynccl: bool # a hint of whether to use PyNccl - use_custom_allreduce: bool # a hint of whether to use CustomAllreduce - # communicators are only created for world size > 1 - pynccl_comm: Optional[Any] # PyNccl communicator - ca_comm: Optional[Any] # Custom allreduce communicator - mq_broadcaster: Optional[Any] # shared memory broadcaster - - def __init__( - self, - group_ranks: List[List[int]], - local_rank: int, - torch_distributed_backend: Union[str, Backend], - use_pynccl: bool, - use_custom_allreduce: bool, - use_tpu_communicator: bool, - use_message_queue_broadcaster: bool = False, - ): - - self.rank = torch.distributed.get_rank() - self.local_rank = local_rank - self.device_group = None - self.cpu_group = None - - for ranks in group_ranks: - device_group = torch.distributed.new_group( - ranks, backend=torch_distributed_backend) - # a group with `gloo` backend, to allow direct coordination between - # processes through the CPU. 
- cpu_group = torch.distributed.new_group(ranks, backend="gloo") - if self.rank in ranks: - self.ranks = ranks - self.world_size = len(ranks) - self.rank_in_group = ranks.index(self.rank) - self.device_group = device_group - self.cpu_group = cpu_group - - assert self.cpu_group is not None - assert self.device_group is not None - - - if torch.cuda.is_available(): - self.device = torch.device(f"cuda:{local_rank}") - else: - self.device = torch.device("cpu") - - - self.use_pynccl = use_pynccl - self.use_custom_allreduce = use_custom_allreduce - self.use_tpu_communicator = use_tpu_communicator - - # lazy import to avoid documentation build error - from vllm.distributed.device_communicators.custom_all_reduce import ( - CustomAllreduce) - from vllm.distributed.device_communicators.pynccl import ( - PyNcclCommunicator) - - - self.pynccl_comm: Optional[PyNcclCommunicator] - if use_pynccl and self.world_size > 1: - self.pynccl_comm = PyNcclCommunicator( - group=self.cpu_group, - device=self.device, - ) - else: - self.pynccl_comm = None - - self.ca_comm: Optional[CustomAllreduce] - if use_custom_allreduce and self.world_size > 1: - # Initialize a custom fast all-reduce implementation. 
- self.ca_comm = CustomAllreduce( - group=self.cpu_group, - device=self.device, - ) - else: - self.ca_comm = None - - from vllm.distributed.device_communicators.tpu_communicator import ( - TpuCommunicator) - self.tpu_communicator: Optional[TpuCommunicator] - if use_tpu_communicator and self.world_size > 1: - self.tpu_communicator = TpuCommunicator(group=self.cpu_group) - - from vllm.distributed.device_communicators.shm_broadcast import ( - MessageQueue) - self.mq_broadcaster: Optional[MessageQueue] = None - if use_message_queue_broadcaster and self.world_size > 1: - self.mq_broadcaster = MessageQueue.create_from_process_group( - self.cpu_group, 1 << 22, 6) - - - # use a threadpool to buffer send request in disaggregated prefill - self.input_hash_to_kv_sending_requests = defaultdict(list) - self.kv_sending_thread = None - - @property - def first_rank(self): - """Return the global rank of the first process in the group""" - return self.ranks[0] - - @property - def last_rank(self): - """Return the global rank of the last process in the group""" - return self.ranks[-1] - - @property - def is_first_rank(self): - """Return whether the caller is the first process in the group""" - return self.rank == self.first_rank - - @property - def is_last_rank(self): - """Return whether the caller is the last process in the group""" - return self.rank == self.last_rank - - @property - def next_rank(self): - """Return the global rank of the process that follows the caller""" - rank_in_group = self.rank_in_group - world_size = self.world_size - return self.ranks[(rank_in_group + 1) % world_size] - - @property - def prev_rank(self): - """Return the global rank of the process that precedes the caller""" - rank_in_group = self.rank_in_group - world_size = self.world_size - return self.ranks[(rank_in_group - 1) % world_size] - - @contextmanager - def graph_capture( - self, graph_capture_context: Optional[GraphCaptureContext] = None): - if graph_capture_context is None: - stream = 
torch.cuda.Stream() - graph_capture_context = GraphCaptureContext(stream) - else: - stream = graph_capture_context.stream - - ca_comm = self.ca_comm - maybe_ca_context = nullcontext( - ) if ca_comm is None else ca_comm.capture() - - # ensure all initialization operations complete before attempting to - # capture the graph on another stream - curr_stream = torch.cuda.current_stream() - if curr_stream != stream: - stream.wait_stream(curr_stream) - - with torch.cuda.stream(stream), maybe_ca_context: - # In graph mode, we have to be very careful about the collective - # operations. The current status is: - # allreduce \ Mode | Eager | Graph | - # -------------------------------------------- - # custom allreduce | enabled | enabled | - # PyNccl | disabled| enabled | - # torch.distributed | enabled | disabled| - # - # Note that custom allreduce will have a runtime check, if the - # tensor size is too large, it will fallback to the next - # available option. - # In summary: When using CUDA graph, we use - # either custom all-reduce kernel or pynccl. When not using - # CUDA graph, we use either custom all-reduce kernel or - # PyTorch NCCL. We always prioritize using custom all-reduce - # kernel but fall back to PyTorch or pynccl if it is - # disabled or not supported. - pynccl_comm = self.pynccl_comm - maybe_pynccl_context: Any - if not pynccl_comm: - maybe_pynccl_context = nullcontext() - else: - maybe_pynccl_context = pynccl_comm.change_state( - enable=True, stream=torch.cuda.current_stream()) - with maybe_pynccl_context: - yield graph_capture_context - - def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: - """ - NOTE: This operation will be applied in-place or out-of-place. - Always assume this function modifies its input, but use the return - value as the output. - """ - ca_comm = self.ca_comm - - # Bypass the function if we are using only 1 GPU. - if self.world_size == 1: - return input_ - - # For TPUs, use TPU communicator. 
- tpu_comm = self.tpu_communicator - if tpu_comm is not None and not tpu_comm.disabled: - return tpu_comm.all_reduce(input_) - - if ca_comm is not None: - out = ca_comm.custom_all_reduce(input_) - if out is not None: - return out - pynccl_comm = self.pynccl_comm - if (pynccl_comm is not None and not pynccl_comm.disabled): - pynccl_comm.all_reduce(input_) - elif input_.is_cpu: - import intel_extension_for_pytorch as ipex - ipex.distributed.all_reduce(input_, group=self.device_group) - else: - torch.distributed.all_reduce(input_, group=self.device_group) - return input_ - - def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor: - world_size = self.world_size - # Bypass the function if we are using only 1 GPU. - if world_size == 1: - return input_ - assert -input_.dim() <= dim < input_.dim(), ( - f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") - - # For TPUs, use TPU communicator. - tpu_comm = self.tpu_communicator - if tpu_comm is not None and not tpu_comm.disabled: - return tpu_comm.all_gather(input_, dim) - - if dim < 0: - # Convert negative dim to positive. - dim += input_.dim() - input_size = input_.size() - # Allocate output tensor. - output_tensor = torch.empty((world_size, ) + input_size, - dtype=input_.dtype, - device=input_.device) - # All-gather. - torch.distributed.all_gather_into_tensor(output_tensor, - input_, - group=self.device_group) - # Reshape - output_tensor = output_tensor.movedim(0, dim) - output_tensor = output_tensor.reshape(input_size[:dim] + - (world_size * - input_size[dim], ) + - input_size[dim + 1:]) - return output_tensor - - def gather(self, - input_: torch.Tensor, - dst: int = 0, - dim: int = -1) -> torch.Tensor: - """ - NOTE: We assume that the input tensor is on the same device across - all the ranks. - NOTE: `dst` is the local rank of the destination rank. - """ - world_size = self.world_size - # Bypass the function if we are using only 1 GPU. 
- if world_size == 1: - return input_ - assert -input_.dim() <= dim < input_.dim(), ( - f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") - if dim < 0: - # Convert negative dim to positive. - dim += input_.dim() - # Allocate output tensor. - if self.rank_in_group == dst: - gather_list = [torch.empty_like(input_) for _ in range(world_size)] - else: - gather_list = None - # Gather. - torch.distributed.gather(input_, - gather_list, - dst=self.ranks[dst], - group=self.device_group) - if self.rank_in_group == dst: - output_tensor = torch.cat(gather_list, dim=dim) - else: - output_tensor = None - return output_tensor - - def broadcast(self, input_: torch.Tensor, src: int = 0): - """Broadcast the input tensor. - NOTE: `src` is the local rank of the source rank. - """ - assert src < self.world_size, f"Invalid src rank ({src})" - - # Bypass the function if we are using only 1 GPU. - if self.world_size == 1: - return input_ - # Broadcast. - torch.distributed.broadcast(input_, - src=self.ranks[src], - group=self.device_group) - return input_ - - def broadcast_object(self, obj: Optional[Any] = None, src: int = 0): - """Broadcast the input object. - NOTE: `src` is the local rank of the source rank. - """ - assert src < self.world_size, f"Invalid src rank ({src})" - - # Bypass the function if we are using only 1 GPU. - if self.world_size == 1: - return obj - if self.mq_broadcaster is not None: - assert src == 0, "Message queue broadcaster only supports src=0" - return self.mq_broadcaster.broadcast_object(obj) - if self.rank_in_group == src: - torch.distributed.broadcast_object_list([obj], - src=self.ranks[src], - group=self.cpu_group) - return obj - else: - recv = [None] - torch.distributed.broadcast_object_list(recv, - src=self.ranks[src], - group=self.cpu_group) - return recv[0] - - def broadcast_object_list(self, - obj_list: List[Any], - src: int = 0, - group: Optional[ProcessGroup] = None): - """Broadcast the input object list. 
- NOTE: `src` is the local rank of the source rank. - """ - assert src < self.world_size, f"Invalid src rank ({src})" - - # Bypass the function if we are using only 1 GPU. - if self.world_size == 1: - return obj_list - # Broadcast. - torch.distributed.broadcast_object_list(obj_list, - src=self.ranks[src], - group=self.device_group) - return obj_list - - def send_object(self, obj: Any, dst: int) -> None: - """Send the input object list to the destination rank.""" - """NOTE: `dst` is the local rank of the destination rank.""" - - assert dst < self.world_size, f"Invalid dst rank ({dst})" - - assert dst != self.rank_in_group, ( - "Invalid destination rank. Destination rank is the same " - "as the current rank.") - - # Serialize object to tensor and get the size as well - object_tensor = torch.frombuffer(pickle.dumps(obj), dtype=torch.uint8) - - size_tensor = torch.tensor([object_tensor.numel()], - dtype=torch.long, - device="cpu") - - # Send object size - - torch.distributed.send(size_tensor, - dst=self.ranks[dst], - group=self.cpu_group) - - # Send object - torch.distributed.send(object_tensor, - dst=self.ranks[dst], - group=self.cpu_group) - - return None - - def recv_object(self, src: int) -> Any: - """Receive the input object list from the source rank.""" - """NOTE: `src` is the local rank of the source rank.""" - - assert src < self.world_size, f"Invalid src rank ({src})" - - assert src != self.rank_in_group, ( - "Invalid source rank. Source rank is the same as the current rank." - ) - - size_tensor = torch.empty(1, dtype=torch.long, device="cpu") - - # Receive object size - rank_size = torch.distributed.recv(size_tensor, - src=self.ranks[src], - group=self.cpu_group) - - # Tensor to receive serialized objects into. 
- object_tensor = torch.empty( # type: ignore[call-overload] - size_tensor.item(), # type: ignore[arg-type] - dtype=torch.uint8, - device="cpu") - - rank_object = torch.distributed.recv(object_tensor, - src=self.ranks[src], - group=self.cpu_group) - - assert rank_object == rank_size, ( - "Received object sender rank does not match the size sender rank.") - - obj = pickle.loads(object_tensor.numpy().tobytes()) - - return obj - - def broadcast_tensor_dict( - self, - tensor_dict: Optional[Dict[str, Union[torch.Tensor, Any]]] = None, - src: int = 0, - group: Optional[ProcessGroup] = None, - metadata_group: Optional[ProcessGroup] = None - ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: - """Broadcast the input tensor dictionary. - NOTE: `src` is the local rank of the source rank. - """ - # Bypass the function if we are using only 1 GPU. - if (not torch.distributed.is_initialized() or self.world_size == 1): - return tensor_dict - - group = self.device_group - metadata_group = self.cpu_group - assert src < self.world_size, f"Invalid src rank ({src})" - - rank_in_group = self.rank_in_group - if rank_in_group == src: - metadata_list: List[Tuple[Any, Any]] = [] - assert isinstance( - tensor_dict, - dict), (f"Expecting a dictionary, got {type(tensor_dict)}") - metadata_list, tensor_list = _split_tensor_dict(tensor_dict) - # `metadata_list` lives in CPU memory. - # `broadcast_object_list` has serialization & deserialization, - # all happening on CPU. Therefore, we can use the CPU group. - self.broadcast_object(metadata_list, src=src) - async_handles = [] - for tensor in tensor_list: - if tensor.numel() == 0: - # Skip broadcasting empty tensors. 
- continue - if tensor.is_cpu: - # use metadata_group for CPU tensors - handle = torch.distributed.broadcast(tensor, - src=self.ranks[src], - group=metadata_group, - async_op=True) - else: - # use group for GPU tensors - handle = torch.distributed.broadcast(tensor, - src=self.ranks[src], - group=group, - async_op=True) - async_handles.append(handle) - for async_handle in async_handles: - async_handle.wait() - - else: - metadata_list = self.broadcast_object(None, src=src) - tensor_dict = {} - async_handles = [] - for key, value in metadata_list: - if isinstance(value, TensorMetadata): - tensor = torch.empty(value.size, - dtype=value.dtype, - device=value.device) - if tensor.numel() == 0: - # Skip broadcasting empty tensors. - _update_nested_dict(tensor_dict, key, tensor) - continue - if tensor.is_cpu: - # use metadata_group for CPU tensors - handle = torch.distributed.broadcast( - tensor, - src=self.ranks[src], - group=metadata_group, - async_op=True) - else: - # use group for GPU tensors - handle = torch.distributed.broadcast( - tensor, - src=self.ranks[src], - group=group, - async_op=True) - async_handles.append(handle) - _update_nested_dict(tensor_dict, key, tensor) - else: - _update_nested_dict(tensor_dict, key, value) - for async_handle in async_handles: - async_handle.wait() - return tensor_dict - - def send_tensor_dict( - self, - tensor_dict: Dict[str, Union[torch.Tensor, Any]], - dst: Optional[int] = None - ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: - """Send the input tensor dictionary. - NOTE: `dst` is the local rank of the source rank. - """ - # Bypass the function if we are using only 1 GPU. 
- if not torch.distributed.is_initialized() or self.world_size == 1: - return tensor_dict - - group = self.device_group - metadata_group = self.cpu_group - - if dst is None: - dst = (self.rank_in_group + 1) % self.world_size - assert dst < self.world_size, f"Invalid dst rank ({dst})" - - metadata_list: List[Tuple[Any, Any]] = [] - assert isinstance( - tensor_dict, - dict), f"Expecting a dictionary, got {type(tensor_dict)}" - metadata_list, tensor_list = _split_tensor_dict(tensor_dict) - # `metadata_list` lives in CPU memory. - # `send_object_list` has serialization & deserialization, - # all happening on CPU. Therefore, we can use the CPU group. - self.send_object(metadata_list, dst=dst) - for tensor in tensor_list: - if tensor.numel() == 0: - # Skip sending empty tensors. - continue - if tensor.is_cpu: - # use metadata_group for CPU tensors - torch.distributed.send(tensor, - dst=self.ranks[dst], - group=metadata_group) - else: - # use group for GPU tensors - torch.distributed.send(tensor, - dst=self.ranks[dst], - group=group) - return None - - def recv_tensor_dict( - self, - src: Optional[int] = None - ) -> Optional[Dict[str, Union[torch.Tensor, Any]]]: - """Recv the input tensor dictionary. - NOTE: `src` is the local rank of the source rank. - """ - # Bypass the function if we are using only 1 GPU. - if not torch.distributed.is_initialized() or self.world_size == 1: - return None - - group = self.device_group - metadata_group = self.cpu_group - - if src is None: - src = (self.rank_in_group - 1) % self.world_size - assert src < self.world_size, f"Invalid src rank ({src})" - - recv_metadata_list = self.recv_object(src=src) - tensor_dict: Dict[str, Any] = {} - for key, value in recv_metadata_list: - if isinstance(value, TensorMetadata): - tensor = torch.empty(value.size, - dtype=value.dtype, - device=value.device) - if tensor.numel() == 0: - # Skip broadcasting empty tensors. 
- _update_nested_dict(tensor_dict, key, tensor) - continue - if tensor.is_cpu: - # use metadata_group for CPU tensors - torch.distributed.recv(tensor, - src=self.ranks[src], - group=metadata_group) - else: - # use group for GPU tensors - torch.distributed.recv(tensor, - src=self.ranks[src], - group=group) - _update_nested_dict(tensor_dict, key, tensor) - else: - _update_nested_dict(tensor_dict, key, value) - return tensor_dict - - def barrier(self): - """Barrier synchronization among the group. - NOTE: don't use `device_group` here! `barrier` in NCCL is - terrible because it is internally a broadcast operation with - secretly created GPU tensors. It is easy to mess up the current - device. Use the CPU group instead. - """ - torch.distributed.barrier(group=self.cpu_group) - - def send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: - """Sends a tensor to the destination rank in a non-blocking way""" - """NOTE: `dst` is the local rank of the destination rank.""" - - - if dst is None: - dst = (self.rank_in_group + 1) % self.world_size - - pynccl_comm = self.pynccl_comm - if pynccl_comm is not None and not pynccl_comm.disabled: - pynccl_comm.send(tensor, dst) - else: - torch.distributed.send(tensor, self.ranks[dst], self.device_group) - - - def recv(self, - size: torch.Size, - dtype: torch.dtype, - src: Optional[int] = None) -> torch.Tensor: - """Receives a tensor from the src rank.""" - """NOTE: `src` is the local rank of the destination rank.""" - if src is None: - src = (self.rank_in_group - 1) % self.world_size - - - tensor = torch.empty(size, dtype=dtype, device=self.device) - pynccl_comm = self.pynccl_comm - if pynccl_comm is not None and not pynccl_comm.disabled: - pynccl_comm.recv(tensor, src) - else: - torch.distributed.recv(tensor, self.ranks[src], self.device_group) - - return tensor - - - def debug_send(self, tensor: torch.Tensor, dst: Optional[int] = None) -> None: - """Sends a tensor to the destination rank in a non-blocking way""" - 
"""Will send several metadata. Useful for debugging.""" - """NOTE: `dst` is the local rank of the destination rank.""" - - - self.send_tensor_dict( - { - "tensor": tensor, - "mean": tensor.float().mean(), - "shape": tensor.shape - }, - dst - ) - - def debug_recv(self, - size: torch.Size, - dtype: torch.dtype, - src: Optional[int] = None) -> torch.Tensor: - """Receives a tensor from the src rank.""" - """NOTE: `src` is the local rank of the destination rank.""" - - result = self.recv_tensor_dict(src) - tensor = result["tensor"] - assert torch.allclose(result["mean"], tensor.float().mean()) - assert result["shape"] == tensor.shape - assert result["shape"] == size, f"The shape sent by sender is {result['shape']} but trying to receive {size}" - return tensor - - - - - - def kv_cache_send(self, - input_hash: int, - tensor: torch.Tensor, - dst: Optional[int] = None, - enable_verification: bool = True) -> None: - """Push the KV cache send request into the send buffer""" - """NOTE: `dst` is the local rank of the destination rank.""" - - if enable_verification: - send_func = self.debug_send - else: - send_func = self.send - - self.input_hash_to_kv_sending_requests[input_hash].append([ - send_func, - # tensor needs to be cloned, if not the tensor may be freed - tensor.clone(), - dst - ]) - - - def kv_cache_recv(self, - size: torch.Size, - dtype: torch.dtype, - src: Optional[int] = None, - enable_verification: bool = True) -> torch.Tensor: - """Receives a tensor from the src rank (blocking).""" - """This API should be used together with `push`""" - """NOTE: `src` is the local rank of the destination rank.""" - - if enable_verification: - recv_func = self.debug_recv - else: - recv_func = self.recv - - tensor = recv_func(size, dtype, src) - - return tensor - - - def recv_input_hash_and_send_kv(self): - - try: - - # receive the input hash that the decode instance requires - logger.debug('Waiting for input hash ...') - # FIXME(Kuntai): debug_recv guarantees correctness but hurts 
perf - input_hash_tensor = self.debug_recv(torch.Size([1]), torch.long) - input_hash = input_hash_tensor.item() - logger.debug('Receiving input hash %d', input_hash) - assert input_hash in self.input_hash_to_kv_sending_requests, \ - f"The KV cache of {input_hash} does not exist." - logger.debug('Input hash %d exists, start sending', input_hash) - - # execute corresponding kv cache sending jobs in request queue - for idx, request in enumerate( - self.input_hash_to_kv_sending_requests[input_hash]): - request[0](*request[1:]) - logger.debug('Finish input hash %d, free memory...' % input_hash) - # free GPU memory occupied by sending - del self.input_hash_to_kv_sending_requests[input_hash] - - except Exception: - import sys - import traceback - - - def kv_cache_send_finish(self): - - if self.kv_sending_thread is None: - self.kv_sending_thread = ThreadPoolExecutor(max_workers=1) - - job = self.kv_sending_thread.submit(self.recv_input_hash_and_send_kv) - logger.debug(f'Submit job {job} into kv cache sending thread') - - - def kv_cache_recv_start(self, input_hash: int): - - logger.debug('Requesting KV cache transfer for input hash %d', input_hash) - - input_hash_tensor = torch.tensor([input_hash]).long().to(self.device) - # notify the kv cache sender with the input hash id - # FIXME(Kuntai): debug_send guarantees correctness but hurts perf. 
- self.debug_send(input_hash_tensor) - - - - def destroy(self): - if self.device_group is not None: - torch.distributed.destroy_process_group(self.device_group) - self.device_group = None - if self.cpu_group is not None: - torch.distributed.destroy_process_group(self.cpu_group) - self.cpu_group = None - if self.pynccl_comm is not None: - self.pynccl_comm = None - if self.ca_comm is not None: - self.ca_comm = None - if self.mq_broadcaster is not None: - self.mq_broadcaster = None - - _WORLD: Optional[GroupCoordinator] = None @@ -933,16 +104,15 @@ def get_pp_group() -> GroupCoordinator: # kept for backward compatibility get_pipeline_model_parallel_group = get_pp_group +_DISAGG: Optional[dist_kv.DistributedKVCoordinator] = None -_DISAGG: Optional[GroupCoordinator] = None -def get_disagg_group() -> GroupCoordinator: +def get_disagg_group() -> dist_kv.DistributedKVCoordinator: assert _DISAGG is not None, ( "disaggregated prefill parallel group is not initialized") return _DISAGG - @contextmanager def graph_capture(): """ @@ -971,7 +141,7 @@ def graph_capture(): def set_custom_all_reduce(enable: bool): global _ENABLE_CUSTOM_ALL_REDUCE _ENABLE_CUSTOM_ALL_REDUCE = enable - + def include_decoding_groups_if_disagg_enabled( groups: List[List[int]], @@ -989,9 +159,7 @@ def include_decoding_groups_if_disagg_enabled( world_size: the vLLM world size, which is half of torch.distributed.get_world_size() """ - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( - "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: new_groups = [] for group in groups: new_groups.append([rank for rank in group]) @@ -1020,21 +188,21 @@ def init_distributed_environment( # this backend is used for WORLD maybe_disagg_world_size = world_size maybe_disagg_rank = rank - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: maybe_disagg_world_size = world_size * 2 - 
logger.debug( - "Disaggregated prefill enabled.") - assert envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"], ( - "VLLM_DISAGG_PREFILL_ROLE should be either prefill or decode") - if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": + logger.debug("Disaggregated prefill enabled.") + if dist_kv.IS_KV_PREFILL_INSTANCE: # for prefill, the ranks are [0, world_size) maybe_disagg_rank = rank else: + # this is decode instance. # offset global rank by tp * pp (which is world_size) maybe_disagg_rank = rank + world_size - logger.debug(f"Before: world size {maybe_disagg_world_size}, rank {maybe_disagg_rank}") - + logger.debug( + f"Before: world size {maybe_disagg_world_size}, rank {maybe_disagg_rank}" + ) + torch.distributed.init_process_group( backend=backend, init_method=distributed_init_method, @@ -1051,25 +219,23 @@ def init_distributed_environment( local_rank = envs.LOCAL_RANK else: local_rank = rank - - + global _WORLD if _WORLD is None: ranks = [[i for i in range(world_size)]] # offset the distributed group - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: - ranks = include_decoding_groups_if_disagg_enabled(ranks, world_size) - + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: + ranks = include_decoding_groups_if_disagg_enabled( + ranks, world_size) + _WORLD = init_world_group(ranks, local_rank, backend) - logger.debug("_WORLD initialized for rank %d", torch.distributed.get_rank()) + logger.debug("_WORLD initialized for rank %d", + torch.distributed.get_rank()) time.sleep(5) else: assert _WORLD.world_size == torch.distributed.get_world_size(), ( "world group already initialized with a different world size") - - - def initialize_model_parallel( tensor_model_parallel_size: int = 1, @@ -1114,20 +280,20 @@ def initialize_model_parallel( - [ [0, tp * pp], [1, tp * pp + 1], .. ] - Local rank: unchanged """ - + # Get world size and rank. Ensure some consistencies. 
assert torch.distributed.is_initialized() world_size: int = torch.distributed.get_world_size() backend = backend or torch.distributed.get_backend( - get_world_group().device_group) - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + get_world_group().device_group) + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: # Disaggregated prefill enabled # The world_size for this vLLM instance is tp * pp, but torch.distributed contains 2 vLLM instances, its world size is 2 * tp * pp # Adjust the world_size to match. world_size = world_size // 2 - if (world_size != - tensor_model_parallel_size * pipeline_model_parallel_size): + if (world_size + != tensor_model_parallel_size * pipeline_model_parallel_size): raise RuntimeError( f"world_size ({world_size}) is not equal to " f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " @@ -1144,7 +310,8 @@ def initialize_model_parallel( range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size)) group_ranks.append(ranks) - group_ranks = include_decoding_groups_if_disagg_enabled(group_ranks, world_size) + group_ranks = include_decoding_groups_if_disagg_enabled( + group_ranks, world_size) # message queue broadcaster is only used in tensor model parallel group _TP = init_model_parallel_group(group_ranks, get_world_group().local_rank, @@ -1153,7 +320,7 @@ def initialize_model_parallel( logger.debug("_TP initialized for rank %d", torch.distributed.get_rank()) # Build the pipeline model-parallel groups. 
- num_pipeline_model_parallel_groups: int = (world_size // + num_pipeline_model_parallel_groups: int = (world_size // pipeline_model_parallel_size) global _PP assert _PP is None, ( @@ -1162,15 +329,16 @@ def initialize_model_parallel( for i in range(num_pipeline_model_parallel_groups): ranks = list(range(i, world_size, num_pipeline_model_parallel_groups)) group_ranks.append(ranks) - group_ranks = include_decoding_groups_if_disagg_enabled(group_ranks, world_size) + group_ranks = include_decoding_groups_if_disagg_enabled( + group_ranks, world_size) # pipeline parallel does not need custom allreduce _PP = init_model_parallel_group(group_ranks, get_world_group().local_rank, backend, use_custom_allreduce=False) logger.debug("_PP initialized for rank %d", torch.distributed.get_rank()) - - if envs.VLLM_DISAGG_PREFILL_ROLE is not None: + + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: global _DISAGG logger.debug("Disaggregated prefill enabled, create _DISAGG group") group_ranks = [] @@ -1179,19 +347,21 @@ def initialize_model_parallel( # decode global rank: i + world_size group_ranks.append([i, i + world_size]) logger.debug("Distributed group is %s", str(group_ranks)) - _DISAGG = init_model_parallel_group(group_ranks, - get_world_group().local_rank, - backend, - use_custom_allreduce=False) + _DISAGG = dist_kv.DistributedKVCoordinator( + group_ranks=group_ranks, + local_rank=get_world_group().local_rank, + torch_distributed_backend=backend, + ) # follow by a warmup, to warmup nccl # necessary, as NCCL may not be warmed up when tp and pp are both 1. 
temp_tensor = torch.tensor([1.]).to(_DISAGG.device) - if envs.VLLM_DISAGG_PREFILL_ROLE == "prefill": + if dist_kv.IS_KV_PREFILL_INSTANCE: _DISAGG.send(temp_tensor) else: recv_tensor = _DISAGG.recv(temp_tensor.shape, temp_tensor.dtype) assert torch.allclose(temp_tensor, recv_tensor) - logger.debug("_DISAGG initialized for rank %d", torch.distributed.get_rank()) + logger.debug("_DISAGG initialized for rank %d", + torch.distributed.get_rank()) def ensure_model_parallel_initialized( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ed552d6104ec..3e097438449c 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -12,6 +12,8 @@ import torch.distributed import torch.nn as nn +import vllm.distributed.distributed_kv as dist_kv + try: from flashinfer import BatchDecodeWithPagedKVCacheWrapper from flashinfer.decode import CUDAGraphBatchDecodeWithPagedKVCacheWrapper @@ -1355,14 +1357,17 @@ def execute_model( "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, } if self.has_seqlen_agnostic else {} - - # call `model_executable` - # and handle KV cache transfer for disaggregated prefilling + # check if the current run is profiling + is_profile_run = (kv_caches is None) or (kv_caches[0] is None) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + + # check if we can skip prefilling + # We can only skip during prefill phase in disaggregated decode instance if any([ - prefill_meta is None, - envs.VLLM_DISAGG_PREFILL_ROLE != "decode", - kv_caches is None, - kv_caches[0] is None]): + not is_prefill_run, + not dist_kv.IS_KV_DECODE_INSTANCE, + is_profile_run]): # model forwarding # during forwarding the KV cache will be sent in prefill instance @@ -1378,151 +1383,27 @@ def execute_model( if all([ - prefill_meta is not None, - envs.VLLM_DISAGG_PREFILL_ROLE == "prefill", - kv_caches is not None, - kv_caches[0] is not None,]): - # send hidden state if disaggregated prefilling enabled - - 
_input_tokens_list = model_input.input_tokens.tolist() - seq_lens = model_input.seq_lens - query_lens = model_input.query_lens - seq_lens = get_tp_group().broadcast_object(seq_lens) - query_lens = get_tp_group().broadcast_object(query_lens) - slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - - # failed = False - # reason = "" - - # if sum(query_lens) != sum(seq_lens): - # logger.error("Query len sum is %d but seq len sum is %d", sum(query_lens), sum(seq_lens)) - # failed=True - # if sum(query_lens) != len(_input_tokens_list): - # logger.error("Input tokens len is %d, doesn't match with query lens sum %d", - # sum(query_lens), - # len(_input_tokens_list)) - # failed=True - # if slot_mapping.shape[0] != len(_input_tokens_list): - # logger.error("Slot mapping shape is %s, mismatch with input shape %s", - # slot_mapping.shape, - # len(_input_tokens_list)) - # failed=True - # if failed: - # import subprocess - # subprocess.run("ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9", shell=True) - + is_prefill_run, + dist_kv.IS_KV_PREFILL_INSTANCE, + not is_profile_run]): - # query_lens contains new KV caches that are added to vLLM. - # so we will send them to decode instance - # FIXME(Kuntai): This assume that all requests are prefill. 
- for idx, qlen in enumerate(query_lens): - - - start_pos = sum(query_lens[:idx]) - end_pos = start_pos + qlen - input_hash = hash(tuple(_input_tokens_list[start_pos:end_pos])) - - for i in range(model_executable.model.start_layer, - model_executable.model.end_layer): - kv_cache = kv_caches[i - model_executable.model.start_layer] - - _, _, num_heads, head_size = kv_cache[0].shape - - key_cache = kv_cache[0].reshape(-1, num_heads, head_size) - value_cache = kv_cache[1].reshape(-1, num_heads, head_size) - - current_slot_mapping = slot_mapping[start_pos:end_pos] - - get_disagg_group().kv_cache_send( - input_hash, - key_cache[current_slot_mapping]) - get_disagg_group().kv_cache_send( - input_hash, - value_cache[current_slot_mapping]) - - - get_disagg_group().kv_cache_send( - input_hash, - hidden_or_intermediate_states[start_pos:end_pos]) - get_disagg_group().kv_cache_send_finish() - - logger.error("\033[92mKV send DONE for rank %d\033[0m", torch.distributed.get_rank()) + # transfer KV cache and hidden state + dist_kv.buffer_kv_caches_send_and_listen_for_input_hash( + model_executable, + model_input, + kv_caches, + hidden_or_intermediate_states, + ) else: - # This is disagg decode instance, during prefill state - # Need to receive KV from the prefill instance - # FIXME(Kuntai): This impl assumes that all requests are prefill. 
- - _input_tokens_list = model_input.input_tokens.tolist() - seq_lens = model_input.seq_lens - query_lens = model_input.query_lens - seq_lens = get_tp_group().broadcast_object(seq_lens) - query_lens = get_tp_group().broadcast_object(query_lens) - slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - - hidden_or_intermediate_states_for_one_req = [] - - # enumerate different requests - logger.debug("My query lens is %s, seq len is %s, rank is %s", - str(query_lens), - str(seq_lens), - torch.distributed.get_rank()) - for idx, qlen in enumerate(query_lens): - - start_pos = sum(query_lens[:idx]) - end_pos = start_pos + qlen - input_hash = hash(tuple(_input_tokens_list[start_pos:end_pos])) - num_tokens = qlen - - # notify the prefill instance to start sending KVs associated with input_hash - get_disagg_group().kv_cache_recv_start(input_hash) - - # receive KV cache from disaggregated prefill instance - for i in range(model_executable.model.start_layer, - model_executable.model.end_layer): - - # get kv cache - kv_cache = kv_caches[i - model_executable.model.start_layer] - # get corresponding layer - layer = model_executable.model.layers[i] - - # get kv cache shape (after sliced by tp) - _, _, num_heads, head_size = kv_cache[0].shape - key = get_disagg_group().kv_cache_recv( - torch.Size([num_tokens, num_heads, head_size]), - kv_cache[0].dtype - ) - value = get_disagg_group().kv_cache_recv( - torch.Size([num_tokens, num_heads, head_size]), - kv_cache[0].dtype - ) - - key_cache, value_cache = kv_cache[0], kv_cache[1] - ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - slot_mapping[start_pos:end_pos], - layer.self_attn.attn.kv_cache_dtype, - layer.self_attn.attn._k_scale, - layer.self_attn.attn._v_scale, - ) - - - hidden_or_intermediate_states_for_one_req.append( - get_disagg_group().kv_cache_recv( - torch.Size([num_tokens, model_executable.config.hidden_size]), - kv_cache[0].dtype - ) + # skip prefill, receive KV cache and hidden state + 
hidden_or_intermediate_states = \ + dist_kv.send_input_hash_and_do_kv_caches_recv( + model_executable, + model_input, + kv_caches, ) - - # concatenate hidden states from different requests - hidden_or_intermediate_states = torch.cat(hidden_or_intermediate_states_for_one_req, dim=0) - - logger.error("\033[92mKV receive DONE for rank %d\033[0m", torch.distributed.get_rank()) - # Compute the logits in the last pipeline stage. From 06a526a08084e4c85b8dab40f1bd38c43daee8e9 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Mon, 5 Aug 2024 05:03:54 +0000 Subject: [PATCH 137/278] Fix several bugs: tensor device placement, misc performance optimizations, handle the case where 2 request has identical input, refactor the code --- .../disagg_benchmarks/disagg_benchmark.sh | 38 ++- .../device_communicators/shm_broadcast.py | 4 +- vllm/distributed/distributed_kv.py | 221 ++++++++++-------- vllm/distributed/group_coordinator.py | 12 +- vllm/envs.py | 2 +- vllm/executor/gpu_executor.py | 3 +- vllm/executor/multiproc_gpu_executor.py | 3 +- vllm/executor/ray_gpu_executor.py | 6 +- vllm/utils.py | 12 +- 9 files changed, 174 insertions(+), 127 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh index 96e3cf35d49a..fe5afd9fa513 100644 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_benchmark.sh @@ -53,10 +53,10 @@ benchmark() { model="meta-llama/Meta-Llama-3.1-70B-Instruct" dataset_name="sonnet" dataset_path="../sonnet_4x.txt" - num_prompts=100 + num_prompts=50 qps=$1 prefix_len=50 - input_len="100" + input_len=2048 output_len=$2 @@ -143,19 +143,19 @@ benchmark() { # large model -VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ +VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ -m vllm.entrypoints.openai.api_server \ --model $model \ --port 8100 \ -tp 4 \ - --max-model-len 10000 \ + --max-model-len 30000 \ 
--gpu-memory-utilization 0.8 & -VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ +VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -m vllm.entrypoints.openai.api_server \ --model $model \ --port 8200 \ -tp 4 \ - --max-model-len 10000 \ + --max-model-len 30000 \ --gpu-memory-utilization 0.8 & # # Small Model @@ -181,8 +181,8 @@ VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ wait_for_server 8200 # launch a proxy server that listen from port 8000 - python3 disagg_prefill_proxy_server.py & - sleep 1 + # python3 disagg_prefill_proxy_server.py & + # sleep 1 python3 ../benchmark_serving.py \ --backend vllm \ @@ -193,7 +193,23 @@ VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ --sonnet-output-len $output_len \ --sonnet-prefix-len $prefix_len \ --num-prompts $num_prompts \ - --port 8000 \ + --port 8100 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_2xtp4.json \ + --request-rate $qps + + + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len $output_len \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8200 \ --save-result \ --result-dir $results_folder \ --result-filename disagg_prefill_2xtp4.json \ @@ -230,8 +246,8 @@ main() { rm -rf results mkdir results - default_qps=10 - default_output_len=150 + default_qps=1 + default_output_len=1 # for target_qps in 2 4 8 16 # do diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 878996f6904d..d4847542688c 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -195,7 +195,7 @@ def __init__( # message. 
otherwise, we will only receive the first subscription # see http://api.zeromq.org/3-3:zmq-setsockopt for more details self.local_socket.setsockopt(XPUB_VERBOSE, True) - local_subscribe_port = get_open_port(is_for_dist_init = False) + local_subscribe_port = get_open_port() self.local_socket.bind(f"tcp://*:{local_subscribe_port}") self.current_idx = 0 @@ -211,7 +211,7 @@ def __init__( # create a publish-subscribe socket to communicate large data self.remote_socket = context.socket(XPUB) self.remote_socket.setsockopt(XPUB_VERBOSE, True) - remote_subscribe_port = get_open_port(is_for_dist_init = False) + remote_subscribe_port = get_open_port() self.remote_socket.bind(f"tcp://*:{remote_subscribe_port}") else: diff --git a/vllm/distributed/distributed_kv.py b/vllm/distributed/distributed_kv.py index 4d8c1b7d3756..6c73fc2dd581 100644 --- a/vllm/distributed/distributed_kv.py +++ b/vllm/distributed/distributed_kv.py @@ -2,7 +2,7 @@ These APIs are used in `vllm/worker/model_runner.py`. """ from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING -from collections import defaultdict +from collections import defaultdict, deque from concurrent.futures import ThreadPoolExecutor import torch @@ -43,7 +43,7 @@ def __init__( use_custom_allreduce: bool = False, use_tpu_communicator: bool = True, use_message_queue_broadcaster: bool = False, - use_cpu_comm_for_sanity_check: bool = True, + use_cpu_comm_for_sanity_check: bool = False, ): super().__init__( @@ -61,12 +61,14 @@ def __init__( self.use_cpu_comm_for_sanity_check = use_cpu_comm_for_sanity_check # use a threadpool to buffer send request in disaggregated prefill - self.input_hash_to_kv_sending_requests = defaultdict(list) + self.input_hash_to_kv_sending_requests = defaultdict(deque) self.kv_sending_thread = None - - + self.target_rank_for_send = self.ranks[(self.rank_in_group + 1) % + self.world_size] + self.target_rank_for_recv = self.ranks[(self.rank_in_group - 1) % + self.world_size] + 
torch.set_default_device(self.device) - def debug_send(self, tensor: torch.Tensor, @@ -133,63 +135,87 @@ def kv_cache_recv(self, return tensor + def send_input_hash(self, input_hash: int) -> None: + + # KV cache send go through CPU, and the original `send` only use GPU. + # So create a new group for sending input hash. + input_hash_tensor = torch.tensor([input_hash], device="cpu").long() + torch.distributed.isend(input_hash_tensor, self.target_rank_for_send, + self.cpu_group) + + def recv_input_hash(self) -> int: + input_hash_tensor = torch.tensor([0], device="cpu").long() + torch.distributed.irecv(input_hash_tensor, self.target_rank_for_recv, + self.cpu_group).wait() + return input_hash_tensor.item() + def recv_input_hash_and_send_kv(self): try: # receive the input hash that the decode instance requires - logger.debug('Rank %d: Waiting for input hash from rank %d, my hashes are %s', - torch.distributed.get_rank(), - self.ranks[(self.rank_in_group - 1) % self.world_size], - list(self.input_hash_to_kv_sending_requests.keys())) - # FIXME(Kuntai): debug_recv guarantees correctness but hurts perf - input_hash_tensor = self.debug_recv(torch.Size([1]), torch.long) - input_hash = input_hash_tensor.item() - logger.debug('Successfully received input hash %d', input_hash) + logger.debug( + '[rank%d]: Waiting for input hash from rank %d', + torch.distributed.get_rank(), + self.target_rank_for_recv, + ) + input_hash = self.recv_input_hash() + logger.debug( + 'Successfully received input hash %d', + input_hash) assert input_hash in self.input_hash_to_kv_sending_requests, \ - f"The KV cache of {input_hash} does not exist." + f"The KV cache of {input_hash} does not exist. 
"\ + f"Existing input hash: {list(self.input_hash_to_kv_sending_requests.keys())}" logger.debug('Input hash %d exists, start sending', input_hash) # execute corresponding kv cache sending jobs in request queue - for idx, request in enumerate( - self.input_hash_to_kv_sending_requests[input_hash]): + while True: + request = self.input_hash_to_kv_sending_requests[ + input_hash].popleft() + # An empty request: the KV cahe of one request are all sent + if request == []: + break request[0](*request[1:]) - logger.debug('Finish input hash %d, free memory...' % input_hash) - # free GPU memory occupied by sending - del self.input_hash_to_kv_sending_requests[input_hash] + if len(self.input_hash_to_kv_sending_requests[input_hash]) == 0: + logger.debug('Finish input hash %d, free GPU memory...', + input_hash) + del self.input_hash_to_kv_sending_requests[input_hash] + else: + logger.debug( + 'The buffer for input hash %d is not empty, meaning that '\ + 'there are two jobs with identical input. Free GPU '\ + 'memory for one of the request.', + input_hash) except Exception as e: - import sys + # This function is executed in ThreadPoolExecutor + # and it will block all exceptions by default + # so log the potential error message here. 
import traceback - exc_info = traceback.format_exc() import time + exc_info = traceback.format_exc() + # avoid the output of different rank overlaps time.sleep(torch.distributed.get_rank()) logger.error("An error occured: %s, stack trace: %s", e, exc_info) - - def kv_cache_send_finish(self): + def kv_cache_send_finish(self, input_hash: int): if self.kv_sending_thread is None: self.kv_sending_thread = ThreadPoolExecutor(max_workers=1) + # append an empty job to signal that this is the end of a request + self.input_hash_to_kv_sending_requests[input_hash].append([]) job = self.kv_sending_thread.submit(self.recv_input_hash_and_send_kv) logger.debug(f'Submit job {job} into kv cache sending thread') def kv_cache_recv_start(self, input_hash: int): - logger.debug('Rank %d: Sending input hash %d to rank %d', - torch.distributed.get_rank(), - input_hash, self.ranks[(self.rank_in_group + 1) % self.world_size]) + logger.debug('[rank%d]: Sending input hash %d to rank %d', + torch.distributed.get_rank(), input_hash, + self.ranks[(self.rank_in_group + 1) % self.world_size]) - input_hash_tensor = torch.tensor([input_hash]).long().to(self.device) - logger.error("Rank %d: input hash tensor on device %s", - torch.distributed.get_rank(), - input_hash_tensor.device) # notify the kv cache sender with the input hash id - # FIXME(Kuntai): debug_send guarantees correctness but hurts perf. 
- self.debug_send(input_hash_tensor) - - + self.send_input_hash(input_hash) def buffer_kv_caches_send_and_listen_for_input_hash( @@ -198,17 +224,20 @@ def buffer_kv_caches_send_and_listen_for_input_hash( kv_caches: List[torch.Tensor], hidden_or_intermediate_states: torch.Tensor, ) -> None: - - _input_tokens_list = model_input.input_tokens.tolist() - seq_lens = model_input.seq_lens - query_lens = model_input.query_lens - seq_lens = ps.get_tp_group().broadcast_object(seq_lens) - query_lens = ps.get_tp_group().broadcast_object(query_lens) + + input_tokens_tuple = tuple(model_input.input_tokens.tolist()) + seq_query_obj = { + "seq_lens": model_input.seq_lens, + "query_lens": model_input.query_lens, + } + seq_query_obj = ps.get_tp_group().broadcast_object(seq_query_obj) + seq_lens = seq_query_obj["seq_lens"] + query_lens = seq_query_obj["query_lens"] slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - - logger.info("KV cache shape is %s", kv_caches[0].shape) - + logger.debug("My query lens is %s, seq len is %s, rank is %s", + str(query_lens), str(seq_lens), torch.distributed.get_rank()) + # failed = False # reason = "" @@ -226,102 +255,93 @@ def buffer_kv_caches_send_and_listen_for_input_hash( # len(_input_tokens_list)) # failed=True # if failed: - # import subprocess + # import subprocess # subprocess.run("ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9", shell=True) - - + # query_lens contains new KV caches that are added to vLLM. # so we will send them to decode instance - # FIXME(Kuntai): This assume that all requests are prefill. + # FIXME(Kuntai): This assume that all requests are prefill. 
for idx, qlen in enumerate(query_lens): - start_pos = sum(query_lens[:idx]) end_pos = start_pos + qlen - input_hash = hash(tuple(_input_tokens_list[start_pos:end_pos])) - + input_hash = hash(input_tokens_tuple[start_pos:end_pos]) + for i in range(model_executable.model.start_layer, - model_executable.model.end_layer): + model_executable.model.end_layer): kv_cache = kv_caches[i - model_executable.model.start_layer] - + _, _, num_heads, head_size = kv_cache[0].shape - + key_cache = kv_cache[0].reshape(-1, num_heads, head_size) value_cache = kv_cache[1].reshape(-1, num_heads, head_size) current_slot_mapping = slot_mapping[start_pos:end_pos] ps.get_disagg_group().kv_cache_send( - input_hash, - key_cache[current_slot_mapping]) + input_hash, key_cache[current_slot_mapping]) ps.get_disagg_group().kv_cache_send( - input_hash, - value_cache[current_slot_mapping]) - + input_hash, value_cache[current_slot_mapping]) ps.get_disagg_group().kv_cache_send( - input_hash, - hidden_or_intermediate_states[start_pos:end_pos]) - ps.get_disagg_group().kv_cache_send_finish() + input_hash, hidden_or_intermediate_states[start_pos:end_pos]) + ps.get_disagg_group().kv_cache_send_finish(input_hash) + + logger.error("\033[92mKV send DONE for rank %d\033[0m", + torch.distributed.get_rank()) - logger.error("\033[92mKV send DONE for rank %d\033[0m", torch.distributed.get_rank()) - - def send_input_hash_and_do_kv_caches_recv( - model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor] -) -> torch.Tensor: - + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor]) -> torch.Tensor: + # This is disagg decode instance, during prefill state # Need to receive KV from the prefill instance - # FIXME(Kuntai): This impl assumes that all requests are prefill. 
- - _input_tokens_list = model_input.input_tokens.tolist() - seq_lens = model_input.seq_lens - query_lens = model_input.query_lens - seq_lens = ps.get_tp_group().broadcast_object(seq_lens) - query_lens = ps.get_tp_group().broadcast_object(query_lens) + # FIXME(Kuntai): This impl assumes that all requests are prefill. + input_tokens_tuple = tuple(model_input.input_tokens.tolist()) + seq_query_obj = { + "seq_lens": model_input.seq_lens, + "query_lens": model_input.query_lens, + } + seq_query_obj = ps.get_tp_group().broadcast_object(seq_query_obj) + seq_lens = seq_query_obj["seq_lens"] + query_lens = seq_query_obj["query_lens"] slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - + hidden_or_intermediate_states_for_one_req = [] - + # enumerate different requests - logger.debug("My query lens is %s, seq len is %s, rank is %s", - str(query_lens), - str(seq_lens), - torch.distributed.get_rank()) + logger.debug("My query lens is %s, seq len is %s, rank is %s", + str(query_lens), str(seq_lens), torch.distributed.get_rank()) for idx, qlen in enumerate(query_lens): start_pos = sum(query_lens[:idx]) end_pos = start_pos + qlen - input_hash = hash(tuple(_input_tokens_list[start_pos:end_pos])) + input_hash = hash(input_tokens_tuple[start_pos:end_pos]) num_tokens = qlen - - # notify the prefill instance to start sending KVs associated with input_hash + + # notify the prefill instance to start sending KVs associated with input_hash ps.get_disagg_group().kv_cache_recv_start(input_hash) # receive KV cache from disaggregated prefill instance for i in range(model_executable.model.start_layer, - model_executable.model.end_layer): - + model_executable.model.end_layer): + # get kv cache kv_cache = kv_caches[i - model_executable.model.start_layer] # get corresponding layer layer = model_executable.model.layers[i] - + # get kv cache shape (after sliced by tp) _, _, num_heads, head_size = kv_cache[0].shape key = ps.get_disagg_group().kv_cache_recv( torch.Size([num_tokens, 
num_heads, head_size]), - kv_cache[0].dtype - ) + kv_cache[0].dtype) value = ps.get_disagg_group().kv_cache_recv( torch.Size([num_tokens, num_heads, head_size]), - kv_cache[0].dtype - ) - + kv_cache[0].dtype) + key_cache, value_cache = kv_cache[0], kv_cache[1] ops.reshape_and_cache_flash( key, @@ -334,16 +354,15 @@ def send_input_hash_and_do_kv_caches_recv( layer.self_attn.attn._v_scale, ) - hidden_or_intermediate_states_for_one_req.append( ps.get_disagg_group().kv_cache_recv( torch.Size([num_tokens, model_executable.config.hidden_size]), - kv_cache[0].dtype - ) - ) + kv_cache[0].dtype)) # concatenate hidden states from different requests - hidden_or_intermediate_states = torch.cat(hidden_or_intermediate_states_for_one_req, dim=0) + hidden_or_intermediate_states = torch.cat( + hidden_or_intermediate_states_for_one_req, dim=0) - logger.error("\033[92mKV receive DONE for rank %d\033[0m", torch.distributed.get_rank()) - return hidden_or_intermediate_states \ No newline at end of file + logger.error("\033[92mKV receive DONE for rank %d\033[0m", + torch.distributed.get_rank()) + return hidden_or_intermediate_states diff --git a/vllm/distributed/group_coordinator.py b/vllm/distributed/group_coordinator.py index 03b6a9e3b056..bfa3c7f3c17c 100644 --- a/vllm/distributed/group_coordinator.py +++ b/vllm/distributed/group_coordinator.py @@ -631,9 +631,12 @@ def recv_tensor_dict( tensor_dict: Dict[str, Any] = {} for key, value in recv_metadata_list: if isinstance(value, TensorMetadata): + target_device = value.device + if 'cuda' in target_device: + target_device = self.device tensor = torch.empty(value.size, dtype=value.dtype, - device=value.device) + device=target_device) if tensor.numel() == 0: # Skip broadcasting empty tensors. 
tensor_dict[key] = tensor @@ -653,11 +656,12 @@ def recv_tensor_dict( torch.distributed.recv(tensor, src=self.ranks[src], group=metadata_group) + else: # use group for GPU tensors torch.distributed.recv(tensor, - src=self.ranks[src], - group=group) + src=self.ranks[src], + group=group) if use_all_gather: # do the allgather tensor = all_gather_group.all_gather( # type: ignore @@ -719,4 +723,4 @@ def destroy(self): if self.ca_comm is not None: self.ca_comm = None if self.mq_broadcaster is not None: - self.mq_broadcaster = None \ No newline at end of file + self.mq_broadcaster = None diff --git a/vllm/envs.py b/vllm/envs.py index 4c9e43cfe140..07a7b647f6bc 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -145,7 +145,7 @@ def get_default_config_root(): # used when the frontend api server is running in multi-processing mode, # to communicate with the backend engine process over ZMQ. 'VLLM_RPC_PORT': - lambda: int(os.getenv('VLLM_PORT', '5570')), + lambda: int(os.getenv('VLLM_RPC_PORT', '5570')), # If true, will load models from ModelScope instead of Hugging Face Hub. 
# note that the value is true or false, not numbers diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 3e77af0e2032..300e9a33eba5 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -1,5 +1,6 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union +import vllm.distributed.distributed_kv as dist_kv from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -43,7 +44,7 @@ def _get_worker_kwargs( """Return worker init args for a given rank.""" if distributed_init_method is None: distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port()) + get_ip(), get_open_port(force=dist_kv.IS_DISTRIBUTED_KV_INSTANCE)) return dict( model_config=self.model_config, parallel_config=self.parallel_config, diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 08a35a074b37..ba222f8b5e40 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -8,6 +8,7 @@ import torch +import vllm.distributed.distributed_kv as dist_kv from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.gpu_executor import create_worker @@ -82,7 +83,7 @@ def _init_executor(self) -> None: # Since it only works for single node, we can use the loopback address # 127.0.0.1 for communication. 
distributed_init_method = get_distributed_init_method( - "127.0.0.1", get_open_port()) + "127.0.0.1", get_open_port(force=dist_kv.IS_DISTRIBUTED_KV_INSTANCE)) self.workers: List[ProcessWorkerWrapper] = [] # This is the list of workers that are rank 0 of each TP group EXCEPT diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 4a6825c01fcf..17f4d3633886 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple import vllm.envs as envs +import vllm.distributed.distributed_kv as dist_kv from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.ray_utils import RayWorkerWrapper, ray @@ -226,8 +227,11 @@ def sort_by_driver_then_worker_ip(worker): # solves this issue, as it always works for communication inside # the node. driver_ip = "127.0.0.1" + # force vLLM to use the port specified by envs.VLLM_PORT + # this port will be binded by prefill instance + # but the decode instance must use that port to init torch.distributed distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) + driver_ip, get_open_port(force=dist_kv.IS_DISTRIBUTED_KV_INSTANCE)) # Initialize the actual workers inside worker wrapper. 
init_worker_all_kwargs = [ diff --git a/vllm/utils.py b/vllm/utils.py index 0d4c22ab761b..fa5452335264 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -388,18 +388,20 @@ def get_distributed_init_method(ip: str, port: int) -> str: return f"tcp://[{ip}]:{port}" if ":" in ip else f"tcp://{ip}:{port}" -def get_open_port(port: Optional[int] = None, is_for_dist_init: bool = True) -> int: +def get_open_port(port: Optional[int] = None, force: bool = False) -> int: if port is None: # Default behavior here is to return a port for multi-gpu communication port = envs.VLLM_PORT if port is not None: - if envs.VLLM_DISAGG_PREFILL_ROLE is not None and is_for_dist_init: - # When initializing distributed environment for disagg prefill - # The prefill and decode instance may share the same port - # Skip the binding check as the port may be binded by prefill + if force and port is not None: + # force vLLM to use envs.VLLM_PORT for torch.distributed init + # This is because this port will binded by prefill instance + # But both prefill and decode instance need to use this port to + # initialize torch.distributed return port while True: try: + logger.error('Trying port %d', port) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", port)) return port From 34e6bb324587a02000c4ea38a21b48ffce08ebc5 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Mon, 5 Aug 2024 05:06:00 +0000 Subject: [PATCH 138/278] remove useless comments --- vllm/distributed/distributed_kv.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/vllm/distributed/distributed_kv.py b/vllm/distributed/distributed_kv.py index 6c73fc2dd581..77c153d775c6 100644 --- a/vllm/distributed/distributed_kv.py +++ b/vllm/distributed/distributed_kv.py @@ -237,27 +237,7 @@ def buffer_kv_caches_send_and_listen_for_input_hash( logger.debug("My query lens is %s, seq len is %s, rank is %s", str(query_lens), str(seq_lens), torch.distributed.get_rank()) - - # failed = False - # reason = 
"" - - # if sum(query_lens) != sum(seq_lens): - # logger.error("Query len sum is %d but seq len sum is %d", sum(query_lens), sum(seq_lens)) - # failed=True - # if sum(query_lens) != len(_input_tokens_list): - # logger.error("Input tokens len is %d, doesn't match with query lens sum %d", - # sum(query_lens), - # len(_input_tokens_list)) - # failed=True - # if slot_mapping.shape[0] != len(_input_tokens_list): - # logger.error("Slot mapping shape is %s, mismatch with input shape %s", - # slot_mapping.shape, - # len(_input_tokens_list)) - # failed=True - # if failed: - # import subprocess - # subprocess.run("ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9", shell=True) - + # query_lens contains new KV caches that are added to vLLM. # so we will send them to decode instance # FIXME(Kuntai): This assume that all requests are prefill. From 55bf3bfb29a89c3870df492ad2ed3542506bac58 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Mon, 5 Aug 2024 05:11:02 +0000 Subject: [PATCH 139/278] update disaggregated prefill example --- .../disagg_prefill/disagg_prefill_example.sh | 61 +++++++------------ 1 file changed, 23 insertions(+), 38 deletions(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index 8cfe528ffb58..a2a6dc0932fd 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -3,10 +3,8 @@ # We will launch 2 vllm instances (1 for prefill and 1 for decode), # and then transfer the KV cache between them. 
-export VLLM_LOGGING_LEVEL=DEBUG export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') -# export NCCL_DEBUG=INFO -export NCCL_BUFFSIZE=67108864 +export VLLM_PORT=12345 # a function that waits vLLM server to start wait_for_server() { @@ -18,7 +16,7 @@ wait_for_server() { } # prefilling instance -VLLM_LOGGING_LEVEL=DEBUG VLLM_HOST_IP=$(hostname -I | awk '{print $1}') VLLM_PORT=2345 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 python3 \ +VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 python3 \ -m vllm.entrypoints.openai.api_server \ --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8100 \ @@ -28,7 +26,7 @@ VLLM_LOGGING_LEVEL=DEBUG VLLM_HOST_IP=$(hostname -I | awk '{print $1}') VLLM_POR --max-model-len 10000 & # decoding instance -VLLM_LOGGING_LEVEL=DEBUG VLLM_HOST_IP=$(hostname -I | awk '{print $1}') VLLM_PORT=2345 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ +VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8200 \ @@ -46,39 +44,26 @@ wait_for_server 8200 # 1. send the request to prefill instance, with max_tokens set to 1 # 2. 
send the request again to decode instance, no modification +# send to prefill instance, let it only do prefill by setting max_token=1 +curl -m 5 http://localhost:8100/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", +"prompt": "'$i' San Francisco is a", +"max_tokens": 1, +"temperature": 0 +}' -for i in {0..0} -do - # send to prefill instance - curl -m 5 http://localhost:8100/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "prompt": "'$i' San Francisco is a", - "max_tokens": 1, - "temperature": 0 - }' +# send to decode instance +curl -m 5 http://localhost:8100/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", +"prompt": "'$i' San Francisco is a", +"max_tokens": 50, +"temperature": 0 +}' - curl -m 5 http://localhost:8100/v1/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "prompt": "'$i' San Francisco is a", - "max_tokens": 1, - "temperature": 0 - }' - # # send to decode instance - # curl -m 60 http://localhost:8200/v1/completions \ - # -H "Content-Type: application/json" \ - # -d '{ - # "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - # "prompt": "'$i' San Francisco is a", - # "max_tokens": 5, - # "temperature": 0 - # }' - -done - -# kill command: -# ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 \ No newline at end of file +# clean up +ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 \ No newline at end of file From b525510419103f967c9433bd4553fa80f81f4708 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 6 Aug 2024 19:01:22 +0000 Subject: [PATCH 140/278] add disaggregated prefill overhead benchmark --- .../disagg_overhead_benchmark.sh | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh 
diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh new file mode 100644 index 000000000000..12f5150cadda --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -0,0 +1,148 @@ +#!/bin/bash + +# Requirement: 8x H100 GPUs. + + +# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV +# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests +# Resource: 8x H100 +# Approaches: +# 1. Chunked prefill: 1 vllm instance with tp=8 +# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 +# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance +# Prefilling instance: max_output_token=1 +# Decoding instance: force the input tokens be the same across requests to bypass prefilling + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. + pkill pt_main_thread + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. 
+ echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +benchmark() { + + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + export VLLM_PORT=12345 + + # compare chunked prefill with disaggregated prefill + + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-70B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=50 + qps=$1 + prefix_len=50 + input_len=2048 + output_len=$2 + + # large model + VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + -tp 4 \ + --max-model-len 30000 \ + --gpu-memory-utilization 0.8 & + VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + -tp 4 \ + --max-model-len 30000 \ + --gpu-memory-utilization 0.8 & + + wait_for_server 8100 + wait_for_server 8200 + + # let the prefill instance finish prefill + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len $output_len \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8100 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_2xtp4.json \ + --request-rate $qps + + + # send the request to decode. + # The TTFT of this command will be the overhead of disagg prefill impl. 
+ python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len $output_len \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8200 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_2xtp4.json \ + --request-rate $qps + kill_gpu_processes + +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + + pip install quart httpx + + cd "$(dirname "$0")" + + cd .. + # create sonnet-4x.txt + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_qps=1 + default_output_len=1 + benchmark $default_qps $default_output_len + +} + + +main "$@" From ee6a6ec5c12d5a1014850f8068b2857ed7540b8b Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Wed, 7 Aug 2024 02:56:58 +0000 Subject: [PATCH 141/278] change disagg prefill proxy server to support non-streaming case --- .../disagg_prefill_proxy_server.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 9028d9be86ec..eb2f2a7149a3 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,7 +1,6 @@ from quart import Quart, request, Response, jsonify, make_response import aiohttp import sys -import httpx import traceback import os @@ -14,15 +13,17 @@ async def forward_request(url, data): headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" } - async with session.post(url=url, json=data, - headers=headers) as response: + async with session.post(url=url, 
json=data, headers=headers) as response: if response.status == 200: - async for chunk_bytes in response.content: - yield chunk_bytes - + if response.headers.get('Transfer-Encoding') == 'chunked': + async for chunk_bytes in response.content.iter_chunked(1024): + yield chunk_bytes + else: + content = await response.read() + yield content + @app.route('/v1/completions', methods=['POST']) async def handle_request(): - try: original_request_data = await request.get_json() @@ -30,7 +31,7 @@ async def handle_request(): prefill_request['max_tokens'] = 1 # finish prefill - async for data in forward_request('http://localhost:8100/v1/completions', prefill_request): + async for _ in forward_request('http://localhost:8100/v1/completions', prefill_request): continue print(f"Request {prefill_request} prefill done. proceeding to decode.") From f3cc91ddb496978cbd625f68ffa1a099780f0400 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Wed, 7 Aug 2024 03:01:59 +0000 Subject: [PATCH 142/278] avoid detokenizing the first token in prefill instance -- for shorter latency --- benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index eb2f2a7149a3..bf5c136b9a2d 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -28,7 +28,10 @@ async def handle_request(): original_request_data = await request.get_json() prefill_request = original_request_data.copy() + # change max_tokens = 1 to let it only do prefill prefill_request['max_tokens'] = 1 + # avoid sampling overhead by setting detokenize = False + prefill_request['detokenize'] = False # finish prefill async for _ in forward_request('http://localhost:8100/v1/completions', prefill_request): From 058226540bdf16e53c8cb4d79528b3e64ea2f785 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: 
Wed, 7 Aug 2024 23:06:18 +0000 Subject: [PATCH 143/278] add failure test cases --- try switching to another machine --- .../disagg_benchmarks/disagg_benchmark.sh | 266 ------------------ .../disagg_performance_benchmark.sh | 172 +++++++++++ .../disagg_prefill_proxy_server.py | 7 +- 3 files changed, 175 insertions(+), 270 deletions(-) delete mode 100644 benchmarks/disagg_benchmarks/disagg_benchmark.sh create mode 100644 benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh diff --git a/benchmarks/disagg_benchmarks/disagg_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_benchmark.sh deleted file mode 100644 index fe5afd9fa513..000000000000 --- a/benchmarks/disagg_benchmarks/disagg_benchmark.sh +++ /dev/null @@ -1,266 +0,0 @@ -#!/bin/bash - -# Requirement: 8x H100 GPUs. - - -# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV -# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests -# Resource: 8x H100 -# Approaches: -# 1. Chunked prefill: 1 vllm instance with tp=8 -# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 -# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance -# Prefilling instance: max_output_token=1 -# Decoding instance: force the input tokens be the same across requests to bypass prefilling - -set -ex - -kill_gpu_processes() { - # kill all processes on GPU. - pkill pt_main_thread - sleep 10 - - # remove vllm config file - rm -rf ~/.config/vllm - - # Print the GPU memory usage - # so that we know if all GPU processes are killed. - gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) - # The memory usage should be 0 MB. 
- echo "GPU 0 Memory Usage: $gpu_memory_usage MB" -} - -wait_for_server() { - # wait for vllm server to start - # return 1 if vllm server crashes - local port=$1 - timeout 1200 bash -c " - until curl -s localhost:${port}/v1/completions > /dev/null; do - sleep 1 - done" && return 0 || return 1 -} - - -benchmark() { - - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') - export VLLM_PORT=12345 - - # compare chunked prefill with disaggregated prefill - - results_folder="./results" - model="meta-llama/Meta-Llama-3.1-70B-Instruct" - dataset_name="sonnet" - dataset_path="../sonnet_4x.txt" - num_prompts=50 - qps=$1 - prefix_len=50 - input_len=2048 - output_len=$2 - - - # # chunked prefill with tp=4 - # CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ - # -m vllm.entrypoints.openai.api_server \ - # --model $model \ - # --port 8000 \ - # -tp 4 \ - # --disable-log-stats \ - # --disable-log-requests \ - # --enable-chunked-prefill & - # wait_for_server 8000 - - # python3 ../benchmark_serving.py \ - # --backend vllm \ - # --model $model \ - # --dataset-name $dataset_name \ - # --dataset-path $dataset_path \ - # --sonnet-input-len $input_len \ - # --sonnet-output-len $output_len \ - # --sonnet-prefix-len $prefix_len \ - # --num-prompts $((num_prompts / 2)) \ - # --port 8000 \ - # --save-result \ - # --result-dir $results_folder \ - # --result-filename chunked_prefill_tp4.json \ - # --request-rate $((qps / 2)) - # kill_gpu_processes - - - # # disaggregated prefill - # # prefill with tp=4 - # python3 -m vllm.entrypoints.openai.api_server \ - # --model $model \ - # --port 8000 \ - # -tp 4 \ - # --disable-log-stats \ - # --disable-log-requests & - # wait_for_server 8000 - # # set output-len to 1 so that it only do prefilling - # python3 ../benchmark_serving.py \ - # --backend vllm \ - # --model $model \ - # --dataset-name $dataset_name \ - # --dataset-path $dataset_path \ - # --sonnet-input-len $input_len \ - # --sonnet-output-len 1 \ - # 
--sonnet-prefix-len $prefix_len \ - # --num-prompts $num_prompts \ - # --port 8000 \ - # --save-result \ - # --result-dir $results_folder \ - # --result-filename disagg_prefill_tp4.json \ - # --request-rate $qps - # kill_gpu_processes - - # # decode with tp=4, enable APC - # python3 -m vllm.entrypoints.openai.api_server \ - # --model $model \ - # --port 8000 \ - # -tp 4 \ - # --enable-prefix-caching \ - # --disable-log-stats \ - # --disable-log-requests & - # wait_for_server 8000 - # # skip prefilling - # # by enabling APC and force the input tokens be the same - # python3 ../benchmark_serving.py \ - # --backend vllm \ - # --model $model \ - # --dataset-name $dataset_name \ - # --dataset-path $dataset_path \ - # --sonnet-input-len $input_len \ - # --sonnet-output-len $output_len \ - # --sonnet-prefix-len $input_len \ - # --num-prompts $num_prompts \ - # --port 8000 \ - # --save-result \ - # --result-dir $results_folder \ - # --result-filename disagg_decode_tp4.json \ - # --request-rate $qps - # kill_gpu_processes - - -# large model -VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8100 \ - -tp 4 \ - --max-model-len 30000 \ - --gpu-memory-utilization 0.8 & -VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8200 \ - -tp 4 \ - --max-model-len 30000 \ - --gpu-memory-utilization 0.8 & - -# # Small Model -# # prefilling instance -# VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 python3 \ -# -m vllm.entrypoints.openai.api_server \ -# --model $model \ -# --port 8100 \ -# -tp 1 \ -# --gpu-memory-utilization 0.8 \ -# --max-model-len 10000 & - -# # decoding instance -# VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ -# -m vllm.entrypoints.openai.api_server \ -# --model $model \ -# --port 8200 \ -# -tp 1 \ -# --gpu-memory-utilization 
0.8 \ -# --max-model-len 10000 & - - wait_for_server 8100 - wait_for_server 8200 - - # launch a proxy server that listen from port 8000 - # python3 disagg_prefill_proxy_server.py & - # sleep 1 - - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len $output_len \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8100 \ - --save-result \ - --result-dir $results_folder \ - --result-filename disagg_prefill_2xtp4.json \ - --request-rate $qps - - - python3 ../benchmark_serving.py \ - --backend vllm \ - --model $model \ - --dataset-name $dataset_name \ - --dataset-path $dataset_path \ - --sonnet-input-len $input_len \ - --sonnet-output-len $output_len \ - --sonnet-prefix-len $prefix_len \ - --num-prompts $num_prompts \ - --port 8200 \ - --save-result \ - --result-dir $results_folder \ - --result-filename disagg_prefill_2xtp4.json \ - --request-rate $qps - kill_gpu_processes - - # python3 analyze_benchmark_results.py \ - # --results-folder $results_folder \ - # --output-len $output_len \ - # --qps $qps - -} - - -main() { - - (which wget && which curl) || (apt-get update && apt-get install -y wget curl) - (which jq) || (apt-get -y install jq) - (which socat) || (apt-get -y install socat) - - pip install quart httpx - - cd "$(dirname "$0")" - - cd .. 
- # create sonnet-4x.txt - echo "" > sonnet_4x.txt - for _ in {1..4} - do - cat sonnet.txt >> sonnet_4x.txt - done - cd disagg_benchmarks - - rm -rf results - mkdir results - - default_qps=1 - default_output_len=1 - - # for target_qps in 2 4 8 16 - # do - # benchmark $target_qps $default_output_len - # done - benchmark $default_qps $default_output_len - - # for target_output_len in 5 10 20 40 80 - # do - # benchmark $default_qps $target_output_len - # done - -} - - -main "$@" diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh new file mode 100644 index 000000000000..38c0fcdaefc8 --- /dev/null +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -0,0 +1,172 @@ +#!/bin/bash + +# Requirement: 8x H100 GPUs. + + +# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV +# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests +# Resource: 8x H100 +# Approaches: +# 1. Chunked prefill: 1 vllm instance with tp=8 +# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 +# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance +# Prefilling instance: max_output_token=1 +# Decoding instance: force the input tokens be the same across requests to bypass prefilling + +set -ex + +kill_gpu_processes() { + # kill all processes on GPU. + pkill pt_main_thread + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. 
+ echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + + +benchmark() { + + export VLLM_LOGGING_LEVEL=DEBUG + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + export VLLM_PORT=12345 + # export VLLM_TRACE_FUNCTION=1 + + # compare chunked prefill with disaggregated prefill + + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-70B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=10 + qps=$1 + prefix_len=50 + input_len=2048 + output_len=$2 + + + # baseline: chunked prefill + # CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + # -m vllm.entrypoints.openai.api_server \ + # --model $model \ + # --port 8000 \ + # -tp 4 \ + # --disable-log-stats \ + # --disable-log-requests \ + # --enable-chunked-prefill & + # wait_for_server 8000 + + # python3 ../benchmark_serving.py \ + # --backend vllm \ + # --model $model \ + # --dataset-name $dataset_name \ + # --dataset-path $dataset_path \ + # --sonnet-input-len $input_len \ + # --sonnet-output-len $output_len \ + # --sonnet-prefix-len $prefix_len \ + # --num-prompts $((num_prompts / 2)) \ + # --port 8000 \ + # --save-result \ + # --result-dir $results_folder \ + # --result-filename chunked_prefill_tp4.json \ + # --request-rate $((qps / 2)) + # kill_gpu_processes + + + + # disagg prefill + VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + -tp 4 \ + --max-model-len 30000 \ + --disable-log-stats \ + --disable-log-requests \ + --gpu-memory-utilization 0.8 & + VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8200 \ + -tp 4 \ + 
--max-model-len 30000 \ + --disable-log-stats \ + --disable-log-requests \ + --gpu-memory-utilization 0.8 & + + wait_for_server 8100 + wait_for_server 8200 + + # launch a proxy server that listen from port 8000 + python3 disagg_prefill_proxy_server.py & + sleep 1 + + python3 ../benchmark_serving.py \ + --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --sonnet-input-len $input_len \ + --sonnet-output-len $output_len \ + --sonnet-prefix-len $prefix_len \ + --num-prompts $num_prompts \ + --port 8000 \ + --save-result \ + --result-dir $results_folder \ + --result-filename disagg_prefill_2xtp4.json \ + --request-rate $qps + + + kill_gpu_processes + + +} + + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get -y install jq) + (which socat) || (apt-get -y install socat) + + pip install quart httpx + + cd "$(dirname "$0")" + + cd .. + # create sonnet-4x.txt so that we can sample 2048 tokens for input + echo "" > sonnet_4x.txt + for _ in {1..4} + do + cat sonnet.txt >> sonnet_4x.txt + done + cd disagg_benchmarks + + rm -rf results + mkdir results + + default_qps=10 + default_output_len=10 + + benchmark $default_qps $default_output_len + +} + + +main "$@" diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index bf5c136b9a2d..8d9f699d4532 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -15,7 +15,8 @@ async def forward_request(url, data): } async with session.post(url=url, json=data, headers=headers) as response: if response.status == 200: - if response.headers.get('Transfer-Encoding') == 'chunked': + # if response.headers.get('Transfer-Encoding') == 'chunked': + if True: async for chunk_bytes in response.content.iter_chunked(1024): yield chunk_bytes else: @@ -30,9 +31,7 @@ async def 
handle_request(): prefill_request = original_request_data.copy() # change max_tokens = 1 to let it only do prefill prefill_request['max_tokens'] = 1 - # avoid sampling overhead by setting detokenize = False - prefill_request['detokenize'] = False - + # finish prefill async for _ in forward_request('http://localhost:8100/v1/completions', prefill_request): continue From 89d4ca42168a11b73b69ae3a0969ad4a8fbef310 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Wed, 7 Aug 2024 23:08:49 +0000 Subject: [PATCH 144/278] update --- vllm/worker/model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 120b6406692e..5c285fd0b933 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1370,6 +1370,8 @@ def execute_model( is_profile_run = (kv_caches is None) or (kv_caches[0] is None) # check if the current run is prefill is_prefill_run = prefill_meta is not None + + logger.debug('Into disagg prefill') # check if we can skip prefilling # We can only skip during prefill phase in disaggregated decode instance @@ -1414,6 +1416,8 @@ def execute_model( model_input, kv_caches, ) + + logger.debug("Out from disagg prefill.") # Compute the logits in the last pipeline stage. 
From 9f4dba236da05c3af67337c32a21c1bbc531c659 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 8 Aug 2024 06:12:25 +0000 Subject: [PATCH 145/278] remove debugging information --- vllm/worker/model_runner.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 5c285fd0b933..cc18f4b3cd62 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1371,8 +1371,6 @@ def execute_model( # check if the current run is prefill is_prefill_run = prefill_meta is not None - logger.debug('Into disagg prefill') - # check if we can skip prefilling # We can only skip during prefill phase in disaggregated decode instance if any([ @@ -1417,8 +1415,6 @@ def execute_model( kv_caches, ) - logger.debug("Out from disagg prefill.") - # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: From aa55883e755153e7a98c32fa4ad48679743d2829 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 9 Aug 2024 03:30:39 +0000 Subject: [PATCH 146/278] avoid broadcast by finding seqlen inside the attn metadata --- vllm/distributed/distributed_kv.py | 61 ++++++++++++++---------------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/vllm/distributed/distributed_kv.py b/vllm/distributed/distributed_kv.py index 77c153d775c6..3d6dfc93acfe 100644 --- a/vllm/distributed/distributed_kv.py +++ b/vllm/distributed/distributed_kv.py @@ -21,6 +21,9 @@ IS_KV_PREFILL_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "prefill") IS_KV_DECODE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "decode") +# a magic number +DISTRIBUTED_KV_GLOO_TAG = 24857323 + logger = init_logger(__name__) @@ -140,13 +143,17 @@ def send_input_hash(self, input_hash: int) -> None: # KV cache send go through CPU, and the original `send` only use GPU. # So create a new group for sending input hash. 
input_hash_tensor = torch.tensor([input_hash], device="cpu").long() - torch.distributed.isend(input_hash_tensor, self.target_rank_for_send, - self.cpu_group) + torch.distributed.isend(input_hash_tensor, + self.target_rank_for_send, + self.cpu_group, + tag=DISTRIBUTED_KV_GLOO_TAG) def recv_input_hash(self) -> int: input_hash_tensor = torch.tensor([0], device="cpu").long() - torch.distributed.irecv(input_hash_tensor, self.target_rank_for_recv, - self.cpu_group).wait() + torch.distributed.irecv(input_hash_tensor, + self.target_rank_for_recv, + self.cpu_group, + tag=DISTRIBUTED_KV_GLOO_TAG).wait() return input_hash_tensor.item() def recv_input_hash_and_send_kv(self): @@ -155,9 +162,10 @@ def recv_input_hash_and_send_kv(self): # receive the input hash that the decode instance requires logger.debug( - '[rank%d]: Waiting for input hash from rank %d', + '[rank%d]: Waiting for input hash from rank %d, my keys are %s', torch.distributed.get_rank(), self.target_rank_for_recv, + list(self.input_hash_to_kv_sending_requests.keys()), ) input_hash = self.recv_input_hash() logger.debug( @@ -226,25 +234,18 @@ def buffer_kv_caches_send_and_listen_for_input_hash( ) -> None: input_tokens_tuple = tuple(model_input.input_tokens.tolist()) - seq_query_obj = { - "seq_lens": model_input.seq_lens, - "query_lens": model_input.query_lens, - } - seq_query_obj = ps.get_tp_group().broadcast_object(seq_query_obj) - seq_lens = seq_query_obj["seq_lens"] - query_lens = seq_query_obj["query_lens"] + seq_lens = model_input.attn_metadata.seq_lens slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - logger.debug("My query lens is %s, seq len is %s, rank is %s", - str(query_lens), str(seq_lens), torch.distributed.get_rank()) + logger.debug("My seq len is %s, rank is %s", str(seq_lens), torch.distributed.get_rank()) # query_lens contains new KV caches that are added to vLLM. # so we will send them to decode instance # FIXME(Kuntai): This assume that all requests are prefill. 
- for idx, qlen in enumerate(query_lens): + for idx, slen in enumerate(seq_lens): - start_pos = sum(query_lens[:idx]) - end_pos = start_pos + qlen + start_pos = sum(seq_lens[:idx]) + end_pos = start_pos + slen input_hash = hash(input_tokens_tuple[start_pos:end_pos]) for i in range(model_executable.model.start_layer, @@ -267,7 +268,7 @@ def buffer_kv_caches_send_and_listen_for_input_hash( input_hash, hidden_or_intermediate_states[start_pos:end_pos]) ps.get_disagg_group().kv_cache_send_finish(input_hash) - logger.error("\033[92mKV send DONE for rank %d\033[0m", + logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) @@ -278,28 +279,22 @@ def send_input_hash_and_do_kv_caches_recv( # This is disagg decode instance, during prefill state # Need to receive KV from the prefill instance - # FIXME(Kuntai): This impl assumes that all requests are prefill. input_tokens_tuple = tuple(model_input.input_tokens.tolist()) - seq_query_obj = { - "seq_lens": model_input.seq_lens, - "query_lens": model_input.query_lens, - } - seq_query_obj = ps.get_tp_group().broadcast_object(seq_query_obj) - seq_lens = seq_query_obj["seq_lens"] - query_lens = seq_query_obj["query_lens"] + seq_lens = model_input.attn_metadata.seq_lens slot_mapping = model_input.attn_metadata.slot_mapping.flatten() + logger.debug("My seq len is %s, rank is %s", str(seq_lens), torch.distributed.get_rank()) + hidden_or_intermediate_states_for_one_req = [] # enumerate different requests - logger.debug("My query lens is %s, seq len is %s, rank is %s", - str(query_lens), str(seq_lens), torch.distributed.get_rank()) - for idx, qlen in enumerate(query_lens): + # FIXME(Kuntai): This impl assumes that all requests are prefill. 
+ for idx, slen in enumerate(seq_lens): - start_pos = sum(query_lens[:idx]) - end_pos = start_pos + qlen + start_pos = sum(seq_lens[:idx]) + end_pos = start_pos + slen input_hash = hash(input_tokens_tuple[start_pos:end_pos]) - num_tokens = qlen + num_tokens = slen # notify the prefill instance to start sending KVs associated with input_hash ps.get_disagg_group().kv_cache_recv_start(input_hash) @@ -343,6 +338,6 @@ def send_input_hash_and_do_kv_caches_recv( hidden_or_intermediate_states = torch.cat( hidden_or_intermediate_states_for_one_req, dim=0) - logger.error("\033[92mKV receive DONE for rank %d\033[0m", + logger.error("[rank%d]: KV recv DONE.", torch.distributed.get_rank()) return hidden_or_intermediate_states From 95df02349644a8807b4c646b75d0164fb4b93c99 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 9 Aug 2024 06:33:58 +0000 Subject: [PATCH 147/278] update examples --- .../disagg_prefill/disagg_prefill_example.sh | 38 ++++++------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index a2a6dc0932fd..f57f5fd86d89 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -20,50 +20,34 @@ VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 pytho -m vllm.entrypoints.openai.api_server \ --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8100 \ - -tp 1 \ - --enable-prefix-caching \ - --gpu-memory-utilization 0.8 \ - --max-model-len 10000 & + --max-model-len 10000 \ + --gpu-memory-utilization 0.8 & # decoding instance VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8200 \ - -tp 1 \ - --enable-prefix-caching \ - --gpu-memory-utilization 0.8 \ - --max-model-len 10000 & + --max-model-len 10000 \ + --gpu-memory-utilization 0.8 & 
# wait until prefill and decode instances are ready wait_for_server 8100 wait_for_server 8200 -# sending an example request -# in disaggregated prefilling, there are two steps of sending a request: -# 1. send the request to prefill instance, with max_tokens set to 1 -# 2. send the request again to decode instance, no modification +# launch a proxy server that opens the service at port 8000 +python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & +sleep 1 -# send to prefill instance, let it only do prefill by setting max_token=1 -curl -m 5 http://localhost:8100/v1/completions \ +# serve an example request +curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", -"prompt": "'$i' San Francisco is a", -"max_tokens": 1, +"prompt": "San Francisco is a", +"max_tokens": 10, "temperature": 0 }' -# send to decode instance -curl -m 5 http://localhost:8100/v1/completions \ --H "Content-Type: application/json" \ --d '{ -"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", -"prompt": "'$i' San Francisco is a", -"max_tokens": 50, -"temperature": 0 -}' - - # clean up ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 \ No newline at end of file From d92223ad176dae9264555fbacd3fc63056ebedd8 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 9 Aug 2024 06:34:25 +0000 Subject: [PATCH 148/278] support pipeline parallel --- vllm/distributed/distributed_kv.py | 132 ++++++++++++++++++++--------- 1 file changed, 90 insertions(+), 42 deletions(-) diff --git a/vllm/distributed/distributed_kv.py b/vllm/distributed/distributed_kv.py index 3d6dfc93acfe..5404ea91e2bc 100644 --- a/vllm/distributed/distributed_kv.py +++ b/vllm/distributed/distributed_kv.py @@ -4,6 +4,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING from collections import defaultdict, deque from concurrent.futures import ThreadPoolExecutor +from threading import Lock +from copy import 
deepcopy import torch from torch.distributed import Backend, ProcessGroup @@ -13,15 +15,16 @@ from vllm.logger import init_logger import vllm.distributed.parallel_state as ps from vllm import _custom_ops as ops +from vllm.sequence import IntermediateTensors assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode"], \ "VLLM_DISAGG_PREFILL_ROLE can only be prefill or decode." -IS_DISTRIBUTED_KV_INSTANCE = (envs.VLLM_DISAGG_PREFILL_ROLE is not None) +IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE is not None) IS_KV_PREFILL_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "prefill") IS_KV_DECODE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "decode") -# a magic number +# add a tag when sending/recving input hash DISTRIBUTED_KV_GLOO_TAG = 24857323 logger = init_logger(__name__) @@ -66,6 +69,7 @@ def __init__( # use a threadpool to buffer send request in disaggregated prefill self.input_hash_to_kv_sending_requests = defaultdict(deque) self.kv_sending_thread = None + self.input_hash_to_kv_sending_requests_lock = Lock() self.target_rank_for_send = self.ranks[(self.rank_in_group + 1) % self.world_size] self.target_rank_for_recv = self.ranks[(self.rank_in_group - 1) % @@ -104,7 +108,8 @@ def debug_recv(self, def kv_cache_send(self, input_hash: int, - tensor: torch.Tensor, + tensor: Union[torch.Tensor, IntermediateTensors], + is_hidden: bool = False, dst: Optional[int] = None) -> None: """Push the KV cache send request into the send buffer""" """NOTE: `dst` is the local rank of the destination rank.""" @@ -114,17 +119,35 @@ def kv_cache_send(self, else: send_func = self.send - self.input_hash_to_kv_sending_requests[input_hash].append([ - send_func, - # tensor needs to be cloned, if not the tensor may be freed - tensor.clone(), - dst - ]) - - def kv_cache_recv(self, - size: torch.Size, - dtype: torch.dtype, - src: Optional[int] = None) -> torch.Tensor: + if is_hidden and not ps.get_pp_group().is_last_rank: + + assert isinstance(tensor, 
IntermediateTensors) + + output = deepcopy(tensor.tensors) + for key in output: + output[key] = output[key].contiguous() + + self.input_hash_to_kv_sending_requests[input_hash].append( + [self.send_tensor_dict, output, dst]) + + else: + + assert isinstance(tensor, torch.Tensor) + + self.input_hash_to_kv_sending_requests[input_hash].append([ + send_func, + # tensor needs to be cloned, if not the tensor may be freed + tensor.clone(), + dst + ]) + + def kv_cache_recv( + self, + size: torch.Size, + dtype: torch.dtype, + is_hidden: bool = False, + src: Optional[int] = None + ) -> Union[torch.Tensor, IntermediateTensors]: """Receives a tensor from the src rank (blocking).""" """This API should be used together with `push`""" """NOTE: `src` is the local rank of the destination rank.""" @@ -134,7 +157,10 @@ def kv_cache_recv(self, else: recv_func = self.recv - tensor = recv_func(size, dtype, src) + if is_hidden and not ps.get_pp_group().is_last_rank: + tensor = IntermediateTensors(self.recv_tensor_dict(src)) + else: + tensor = recv_func(size, dtype, src) return tensor @@ -143,34 +169,33 @@ def send_input_hash(self, input_hash: int) -> None: # KV cache send go through CPU, and the original `send` only use GPU. # So create a new group for sending input hash. 
input_hash_tensor = torch.tensor([input_hash], device="cpu").long() - torch.distributed.isend(input_hash_tensor, - self.target_rank_for_send, - self.cpu_group, - tag=DISTRIBUTED_KV_GLOO_TAG) + torch.distributed.send(input_hash_tensor, + self.target_rank_for_send, + self.cpu_group, + tag=DISTRIBUTED_KV_GLOO_TAG) def recv_input_hash(self) -> int: input_hash_tensor = torch.tensor([0], device="cpu").long() - torch.distributed.irecv(input_hash_tensor, - self.target_rank_for_recv, - self.cpu_group, - tag=DISTRIBUTED_KV_GLOO_TAG).wait() + torch.distributed.recv(input_hash_tensor, + self.target_rank_for_recv, + self.cpu_group, + tag=DISTRIBUTED_KV_GLOO_TAG) return input_hash_tensor.item() def recv_input_hash_and_send_kv(self): try: - - # receive the input hash that the decode instance requires logger.debug( '[rank%d]: Waiting for input hash from rank %d, my keys are %s', torch.distributed.get_rank(), self.target_rank_for_recv, list(self.input_hash_to_kv_sending_requests.keys()), ) + # block the ThreadPoolExecutor, until a new input hash is received input_hash = self.recv_input_hash() - logger.debug( - 'Successfully received input hash %d', - input_hash) + + self.input_hash_to_kv_sending_requests_lock.acquire() + logger.debug('Successfully received input hash %d', input_hash) assert input_hash in self.input_hash_to_kv_sending_requests, \ f"The KV cache of {input_hash} does not exist. 
"\ f"Existing input hash: {list(self.input_hash_to_kv_sending_requests.keys())}" @@ -184,6 +209,7 @@ def recv_input_hash_and_send_kv(self): if request == []: break request[0](*request[1:]) + if len(self.input_hash_to_kv_sending_requests[input_hash]) == 0: logger.debug('Finish input hash %d, free GPU memory...', input_hash) @@ -195,6 +221,8 @@ def recv_input_hash_and_send_kv(self): 'memory for one of the request.', input_hash) + self.input_hash_to_kv_sending_requests_lock.release() + except Exception as e: # This function is executed in ThreadPoolExecutor # and it will block all exceptions by default @@ -237,8 +265,11 @@ def buffer_kv_caches_send_and_listen_for_input_hash( seq_lens = model_input.attn_metadata.seq_lens slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - logger.debug("My seq len is %s, rank is %s", str(seq_lens), torch.distributed.get_rank()) - + logger.debug("My seq len is %s, rank is %s", str(seq_lens), + torch.distributed.get_rank()) + + ps.get_disagg_group().input_hash_to_kv_sending_requests_lock.acquire() + # query_lens contains new KV caches that are added to vLLM. # so we will send them to decode instance # FIXME(Kuntai): This assume that all requests are prefill. 
@@ -265,11 +296,14 @@ def buffer_kv_caches_send_and_listen_for_input_hash( input_hash, value_cache[current_slot_mapping]) ps.get_disagg_group().kv_cache_send( - input_hash, hidden_or_intermediate_states[start_pos:end_pos]) + input_hash, + hidden_or_intermediate_states[start_pos:end_pos], + is_hidden=True) ps.get_disagg_group().kv_cache_send_finish(input_hash) - logger.debug("[rank%d]: KV send DONE.", - torch.distributed.get_rank()) + ps.get_disagg_group().input_hash_to_kv_sending_requests_lock.release() + + logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) def send_input_hash_and_do_kv_caches_recv( @@ -283,7 +317,8 @@ def send_input_hash_and_do_kv_caches_recv( seq_lens = model_input.attn_metadata.seq_lens slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - logger.debug("My seq len is %s, rank is %s", str(seq_lens), torch.distributed.get_rank()) + logger.debug("My seq len is %s, rank is %s", str(seq_lens), + torch.distributed.get_rank()) hidden_or_intermediate_states_for_one_req = [] @@ -330,14 +365,27 @@ def send_input_hash_and_do_kv_caches_recv( ) hidden_or_intermediate_states_for_one_req.append( - ps.get_disagg_group().kv_cache_recv( - torch.Size([num_tokens, model_executable.config.hidden_size]), - kv_cache[0].dtype)) + ps.get_disagg_group().kv_cache_recv(torch.Size( + [num_tokens, model_executable.config.hidden_size]), + kv_cache[0].dtype, + is_hidden=True)) # concatenate hidden states from different requests - hidden_or_intermediate_states = torch.cat( - hidden_or_intermediate_states_for_one_req, dim=0) - - logger.error("[rank%d]: KV recv DONE.", - torch.distributed.get_rank()) + if isinstance(hidden_or_intermediate_states_for_one_req[0], torch.Tensor): + hidden_or_intermediate_states = torch.cat( + hidden_or_intermediate_states_for_one_req, dim=0) + else: + # concat the IntermediateTensors + keys = list(hidden_or_intermediate_states_for_one_req[0].tensors.keys()) + result_its = {} + + for key in keys: + result_its[key] = [] + 
for its in hidden_or_intermediate_states_for_one_req: + result_its[key].append(its[key]) + result_its[key] = torch.cat(result_its[key], dim=0) + + hidden_or_intermediate_states = IntermediateTensors(result_its) + + logger.error("[rank%d]: KV recv DONE.", torch.distributed.get_rank()) return hidden_or_intermediate_states From a8c202c130f622c2ba37cf10869eb287f96e2ac1 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sat, 10 Aug 2024 09:19:49 +0000 Subject: [PATCH 149/278] update benchmark --- compare chunked prefill w.r.t. disagg prefill --- .../disagg_performance_benchmark.sh | 136 +++++++++--------- 1 file changed, 68 insertions(+), 68 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index 38c0fcdaefc8..dde9a80b59b3 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -17,17 +17,12 @@ set -ex kill_gpu_processes() { # kill all processes on GPU. - pkill pt_main_thread - sleep 10 - - # remove vllm config file - rm -rf ~/.config/vllm - - # Print the GPU memory usage - # so that we know if all GPU processes are killed. - gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) - # The memory usage should be 0 MB. 
- echo "GPU 0 Memory Usage: $gpu_memory_usage MB" + pkill -f pt_main_thread + pkill -f python3 + pkill -f round_robin_proxy.sh + ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 + for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done + sleep 1 } wait_for_server() { @@ -41,57 +36,40 @@ wait_for_server() { } -benchmark() { - - export VLLM_LOGGING_LEVEL=DEBUG - export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') - export VLLM_PORT=12345 - # export VLLM_TRACE_FUNCTION=1 - - # compare chunked prefill with disaggregated prefill - - results_folder="./results" +launch_chunked_prefill() { model="meta-llama/Meta-Llama-3.1-70B-Instruct" - dataset_name="sonnet" - dataset_path="../sonnet_4x.txt" - num_prompts=10 - qps=$1 - prefix_len=50 - input_len=2048 - output_len=$2 - - - # baseline: chunked prefill - # CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ - # -m vllm.entrypoints.openai.api_server \ - # --model $model \ - # --port 8000 \ - # -tp 4 \ - # --disable-log-stats \ - # --disable-log-requests \ - # --enable-chunked-prefill & - # wait_for_server 8000 - - # python3 ../benchmark_serving.py \ - # --backend vllm \ - # --model $model \ - # --dataset-name $dataset_name \ - # --dataset-path $dataset_path \ - # --sonnet-input-len $input_len \ - # --sonnet-output-len $output_len \ - # --sonnet-prefix-len $prefix_len \ - # --num-prompts $((num_prompts / 2)) \ - # --port 8000 \ - # --save-result \ - # --result-dir $results_folder \ - # --result-filename chunked_prefill_tp4.json \ - # --request-rate $((qps / 2)) - # kill_gpu_processes - + # disagg prefill + VLLM_RPC_PORT=5570 CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + --port 8100 \ + -tp 4 \ + --max-model-len 30000 \ + --disable-log-stats \ + --disable-log-requests \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.8 & + VLLM_RPC_PORT=5580 CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model $model \ + 
--port 8200 \ + -tp 4 \ + --max-model-len 30000 \ + --disable-log-stats \ + --disable-log-requests \ + --enable-chunked-prefill \ + --gpu-memory-utilization 0.8 & + wait_for_server 8100 + wait_for_server 8200 + bash round_robin_proxy.sh & + sleep 1 +} +launch_disagg_prefill() { + model="meta-llama/Meta-Llama-3.1-70B-Instruct" # disagg prefill - VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + VLLM_PORT=12345 VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ -m vllm.entrypoints.openai.api_server \ --model $model \ --port 8100 \ @@ -100,7 +78,7 @@ benchmark() { --disable-log-stats \ --disable-log-requests \ --gpu-memory-utilization 0.8 & - VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + VLLM_PORT=12345 VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -m vllm.entrypoints.openai.api_server \ --model $model \ --port 8200 \ @@ -109,13 +87,24 @@ benchmark() { --disable-log-stats \ --disable-log-requests \ --gpu-memory-utilization 0.8 & - wait_for_server 8100 wait_for_server 8200 - - # launch a proxy server that listen from port 8000 python3 disagg_prefill_proxy_server.py & sleep 1 +} + + +benchmark() { + results_folder="./results" + model="meta-llama/Meta-Llama-3.1-70B-Instruct" + dataset_name="sonnet" + dataset_path="../sonnet_4x.txt" + num_prompts=400 + qps=$1 + prefix_len=50 + input_len=2048 + output_len=$2 + tag=$3 python3 ../benchmark_serving.py \ --backend vllm \ @@ -129,12 +118,10 @@ benchmark() { --port 8000 \ --save-result \ --result-dir $results_folder \ - --result-filename disagg_prefill_2xtp4.json \ + --result-filename $tag-qps-$qps.json \ --request-rate $qps - - kill_gpu_processes - + sleep 2 } @@ -162,9 +149,22 @@ main() { mkdir results default_qps=10 - default_output_len=10 + default_output_len=150 - benchmark $default_qps $default_output_len + export VLLM_LOGGING_LEVEL=DEBUG + export 
VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + + launch_chunked_prefill + for qps in 2 4 6 8; do + benchmark $qps $default_output_len chunked_prefill + done + kill_gpu_processes + + launch_disagg_prefill + for qps in 2 4 6 8; do + benchmark $qps $default_output_len disagg_prefill + done + kill_gpu_processes } From 310f3a3214a86f8411a60d9d59c07d4686244c94 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sat, 10 Aug 2024 09:20:16 +0000 Subject: [PATCH 150/278] mute round_robin_proxy -- too loud --- benchmarks/disagg_benchmarks/round_robin_proxy.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.sh b/benchmarks/disagg_benchmarks/round_robin_proxy.sh index e996756bc89d..375bf9e42237 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.sh +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.sh @@ -15,6 +15,5 @@ get_next_port() { # Start the proxy while true; do NEXT_PORT=$(get_next_port) - echo "Forwarding to port $NEXT_PORT" - socat TCP4-LISTEN:8000,reuseaddr,fork TCP4:localhost:$NEXT_PORT + socat TCP4-LISTEN:8000,reuseaddr,fork TCP4:localhost:$NEXT_PORT 2>/dev/null done \ No newline at end of file From 118aab18bda3f07df1af7643bc6208657825287b Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sat, 10 Aug 2024 09:21:54 +0000 Subject: [PATCH 151/278] bug fix: racing conditions, and rare cases where input hash is not cached --- .../disagg_prefill_proxy_server.py | 9 +- vllm/distributed/distributed_kv.py | 200 +++++++++++++----- vllm/worker/model_runner.py | 67 +++--- 3 files changed, 180 insertions(+), 96 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 8d9f699d4532..5750df7735ad 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -36,7 +36,7 @@ async def handle_request(): async for _ in 
forward_request('http://localhost:8100/v1/completions', prefill_request): continue - print(f"Request {prefill_request} prefill done. proceeding to decode.") + print(f"Prefill done. proceeding to decode.") # return decode generator = forward_request('http://localhost:8200/v1/completions', original_request_data) @@ -46,9 +46,10 @@ async def handle_request(): return response except Exception as e: - exc_info = sys.exc_info() - print(e) - print("".join(traceback.format_exception(*exc_info))) + pass + # exc_info = sys.exc_info() + # print(e) + # print("".join(traceback.format_exception(*exc_info))) if __name__ == '__main__': app.run(port=8000) diff --git a/vllm/distributed/distributed_kv.py b/vllm/distributed/distributed_kv.py index 5404ea91e2bc..fd28a82c294a 100644 --- a/vllm/distributed/distributed_kv.py +++ b/vllm/distributed/distributed_kv.py @@ -1,11 +1,22 @@ """vLLM distributed KV cache transfer API. These APIs are used in `vllm/worker/model_runner.py`. + +Currently supporting TP and PP. 
+ +Workflow: +- In prefill instance, KV cache sender *buffers* the KV cache send requests +- In decode instance + - KV cache receiver sends the hash of input tokens to sender + - KV cache sender executes send request + - KV cache receiver receives the KV cache """ from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING from collections import defaultdict, deque from concurrent.futures import ThreadPoolExecutor from threading import Lock from copy import deepcopy +import time +import threading import torch from torch.distributed import Backend, ProcessGroup @@ -29,6 +40,24 @@ logger = init_logger(__name__) +import logging + + +class RankFilter(logging.Filter): + + def filter(self, record): + # Only log if rank is 4 + rank = 1 + try: + rank = torch.distributed.get_rank() + except Exception: + pass + return rank % 4 == 0 + + +for handler in logger.handlers: + handler.addFilter(RankFilter()) + class DistributedKVCoordinator(GroupCoordinator): """ @@ -45,7 +74,11 @@ def __init__( group_ranks: List[List[int]], local_rank: int, torch_distributed_backend: Union[str, Backend], - use_pynccl: bool = True, + # DO NOT use pynccl here + # Pynccl send is non-blocking + # and it's possible that the memory is freed before the data being sent + # which may happen at high qps + use_pynccl: bool = False, use_custom_allreduce: bool = False, use_tpu_communicator: bool = True, use_message_queue_broadcaster: bool = False, @@ -136,7 +169,7 @@ def kv_cache_send(self, self.input_hash_to_kv_sending_requests[input_hash].append([ send_func, - # tensor needs to be cloned, if not the tensor may be freed + # use clone to make sure the tensor is contiguous tensor.clone(), dst ]) @@ -164,7 +197,11 @@ def kv_cache_recv( return tensor - def send_input_hash(self, input_hash: int) -> None: + def send_input_hash(self, input_hash: int) -> int: + + logger.debug('[rank%d]: Sending input hash %d to rank %d', + torch.distributed.get_rank(), input_hash, + self.target_rank_for_send) # KV cache 
send go through CPU, and the original `send` only use GPU. # So create a new group for sending input hash. @@ -173,33 +210,64 @@ def send_input_hash(self, input_hash: int) -> None: self.target_rank_for_send, self.cpu_group, tag=DISTRIBUTED_KV_GLOO_TAG) + return_tensor = torch.tensor([0], device="cpu").long() + torch.distributed.recv(return_tensor, + self.target_rank_for_recv, + self.cpu_group, + tag=DISTRIBUTED_KV_GLOO_TAG) + return return_tensor.item() - def recv_input_hash(self) -> int: + def recv_input_hash(self) -> Optional[int]: + ''' + Receive an input hash, and check if it is already cached + ''' input_hash_tensor = torch.tensor([0], device="cpu").long() torch.distributed.recv(input_hash_tensor, self.target_rank_for_recv, self.cpu_group, tag=DISTRIBUTED_KV_GLOO_TAG) - return input_hash_tensor.item() - - def recv_input_hash_and_send_kv(self): - - try: + input_hash = input_hash_tensor.item() + # a new input hash comes in, see if it is already cached + self.input_hash_to_kv_sending_requests_lock.acquire() + logger.debug('Successfully received input hash %d', input_hash) + if input_hash not in self.input_hash_to_kv_sending_requests: + logger.warning( + f"The KV cache of {input_hash} does not exist. 
"\ + f"Existing input hash: {list(self.input_hash_to_kv_sending_requests.keys())}") + + # 0 for fail + x = torch.tensor([0], device="cpu").long() + torch.distributed.send(x, + self.target_rank_for_send, + self.cpu_group, + tag=DISTRIBUTED_KV_GLOO_TAG) + return None + else: + logger.debug('Input hash %d exists, start sending', input_hash) + + # 1 for success + x = torch.tensor([1], device="cpu").long() + torch.distributed.send(x, + self.target_rank_for_send, + self.cpu_group, + tag=DISTRIBUTED_KV_GLOO_TAG) + return input_hash + + def kv_cache_send_loop(self): + + while True: logger.debug( '[rank%d]: Waiting for input hash from rank %d, my keys are %s', torch.distributed.get_rank(), self.target_rank_for_recv, list(self.input_hash_to_kv_sending_requests.keys()), ) - # block the ThreadPoolExecutor, until a new input hash is received + # wait for a new input hash + # this function will acquire the lock input_hash = self.recv_input_hash() - - self.input_hash_to_kv_sending_requests_lock.acquire() - logger.debug('Successfully received input hash %d', input_hash) - assert input_hash in self.input_hash_to_kv_sending_requests, \ - f"The KV cache of {input_hash} does not exist. "\ - f"Existing input hash: {list(self.input_hash_to_kv_sending_requests.keys())}" - logger.debug('Input hash %d exists, start sending', input_hash) + if input_hash is None: + self.input_hash_to_kv_sending_requests_lock.release() + continue # execute corresponding kv cache sending jobs in request queue while True: @@ -208,6 +276,7 @@ def recv_input_hash_and_send_kv(self): # An empty request: the KV cahe of one request are all sent if request == []: break + request[0](*request[1:]) if len(self.input_hash_to_kv_sending_requests[input_hash]) == 0: @@ -217,56 +286,57 @@ def recv_input_hash_and_send_kv(self): else: logger.debug( 'The buffer for input hash %d is not empty, meaning that '\ - 'there are two jobs with identical input. 
Free GPU '\ - 'memory for one of the request.', + 'there are two jobs with identical input.', input_hash) self.input_hash_to_kv_sending_requests_lock.release() - except Exception as e: - # This function is executed in ThreadPoolExecutor - # and it will block all exceptions by default - # so log the potential error message here. - import traceback - import time - exc_info = traceback.format_exc() - # avoid the output of different rank overlaps - time.sleep(torch.distributed.get_rank()) - logger.error("An error occured: %s, stack trace: %s", e, exc_info) - def kv_cache_send_finish(self, input_hash: int): + def kv_cache_send_ready(self, input_hash: int): if self.kv_sending_thread is None: - self.kv_sending_thread = ThreadPoolExecutor(max_workers=1) - - # append an empty job to signal that this is the end of a request + self.kv_sending_thread = threading.Thread( + target=self.kv_cache_send_loop) + self.kv_sending_thread.start() + + # append an empty list to separate requests + # as there might be identical requests, that has the same input hash self.input_hash_to_kv_sending_requests[input_hash].append([]) - job = self.kv_sending_thread.submit(self.recv_input_hash_and_send_kv) - logger.debug(f'Submit job {job} into kv cache sending thread') + logger.debug(f'Buffered input hash {input_hash}') def kv_cache_recv_start(self, input_hash: int): - - logger.debug('[rank%d]: Sending input hash %d to rank %d', - torch.distributed.get_rank(), input_hash, - self.ranks[(self.rank_in_group + 1) % self.world_size]) - # notify the kv cache sender with the input hash id - self.send_input_hash(input_hash) + return self.send_input_hash(input_hash) + + def block_if_buffer_full(self): + + # block vLLM if the KV cache sending buffer is full + # TODO: allow using other policies to handle buffer full + while True: + self.input_hash_to_kv_sending_requests_lock.acquire() + if len(self.input_hash_to_kv_sending_requests.keys()) > 55: + self.input_hash_to_kv_sending_requests_lock.release() + 
time.sleep(0.1) + else: + self.input_hash_to_kv_sending_requests_lock.release() + break -def buffer_kv_caches_send_and_listen_for_input_hash( +def send_kv_caches_and_hidden_states( model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", kv_caches: List[torch.Tensor], - hidden_or_intermediate_states: torch.Tensor, + hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: input_tokens_tuple = tuple(model_input.input_tokens.tolist()) seq_lens = model_input.attn_metadata.seq_lens slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - logger.debug("My seq len is %s, rank is %s", str(seq_lens), - torch.distributed.get_rank()) + # Assumption: current batch is all-prefill requests + assert torch.allclose(model_input.attn_metadata.query_start_loc, + model_input.attn_metadata.seq_start_loc) + assert torch.all(model_input.attn_metadata.context_lens_tensor == 0) ps.get_disagg_group().input_hash_to_kv_sending_requests_lock.acquire() @@ -299,17 +369,22 @@ def buffer_kv_caches_send_and_listen_for_input_hash( input_hash, hidden_or_intermediate_states[start_pos:end_pos], is_hidden=True) - ps.get_disagg_group().kv_cache_send_finish(input_hash) + ps.get_disagg_group().kv_cache_send_ready(input_hash) ps.get_disagg_group().input_hash_to_kv_sending_requests_lock.release() + ps.get_disagg_group().block_if_buffer_full() + logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) -def send_input_hash_and_do_kv_caches_recv( - model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor]) -> torch.Tensor: +def recv_kv_caches_and_hidden_states( + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor] +) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool]: + + bypass_model_exec = True # This is disagg decode instance, during prefill state # Need to receive KV from the prefill 
instance @@ -317,8 +392,10 @@ def send_input_hash_and_do_kv_caches_recv( seq_lens = model_input.attn_metadata.seq_lens slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - logger.debug("My seq len is %s, rank is %s", str(seq_lens), - torch.distributed.get_rank()) + # Assumption: current batch is all-prefill requests + assert torch.allclose(model_input.attn_metadata.query_start_loc, + model_input.attn_metadata.seq_start_loc) + assert torch.all(model_input.attn_metadata.context_lens_tensor == 0) hidden_or_intermediate_states_for_one_req = [] @@ -332,7 +409,13 @@ def send_input_hash_and_do_kv_caches_recv( num_tokens = slen # notify the prefill instance to start sending KVs associated with input_hash - ps.get_disagg_group().kv_cache_recv_start(input_hash) + contain = ps.get_disagg_group().kv_cache_recv_start(input_hash) + + # fail to find input_hash in prefill instance + # this can occur but idk why... + if contain == 0: + bypass_model_exec = False + continue # receive KV cache from disaggregated prefill instance for i in range(model_executable.model.start_layer, @@ -376,16 +459,17 @@ def send_input_hash_and_do_kv_caches_recv( hidden_or_intermediate_states_for_one_req, dim=0) else: # concat the IntermediateTensors - keys = list(hidden_or_intermediate_states_for_one_req[0].tensors.keys()) + keys = list( + hidden_or_intermediate_states_for_one_req[0].tensors.keys()) result_its = {} - + for key in keys: result_its[key] = [] for its in hidden_or_intermediate_states_for_one_req: result_its[key].append(its[key]) result_its[key] = torch.cat(result_its[key], dim=0) - + hidden_or_intermediate_states = IntermediateTensors(result_its) - logger.error("[rank%d]: KV recv DONE.", torch.distributed.get_rank()) - return hidden_or_intermediate_states + logger.debug("[rank%d]: KV recv DONE.", torch.distributed.get_rank()) + return hidden_or_intermediate_states, bypass_model_exec diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 
cc18f4b3cd62..4d8105bde2c4 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1371,16 +1371,26 @@ def execute_model( # check if the current run is prefill is_prefill_run = prefill_meta is not None - # check if we can skip prefilling - # We can only skip during prefill phase in disaggregated decode instance - if any([ - not is_prefill_run, - not dist_kv.IS_KV_DECODE_INSTANCE, - is_profile_run]): - - # model forwarding - # during forwarding the KV cache will be sent in prefill instance - # see vllm/attention/backends/flash_attn.py for sending impl + # for disaggregated prefilling: allow bypassing model execution + bypass_model_exec = False + + # Recv kv cache for disaggregated prefill + # Skip model execution if all required KV cache are received + if all([ + is_prefill_run, + dist_kv.IS_KV_DECODE_INSTANCE, + not is_profile_run]): + + hidden_or_intermediate_states, bypass = \ + dist_kv.recv_kv_caches_and_hidden_states( + model_executable, + model_input, + kv_caches, + ) + if bypass: + bypass_model_exec = True + + if not bypass_model_exec: hidden_or_intermediate_states = model_executable( input_ids=model_input.input_tokens, positions=model_input.input_positions, @@ -1388,32 +1398,21 @@ def execute_model( attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, **MultiModalInputs.as_kwargs(multi_modal_kwargs, - device=self.device), + device=self.device), **seqlen_agnostic_kwargs) + + # Send KV cache for disaggregated prefill + if all([ + is_prefill_run, + dist_kv.IS_KV_PREFILL_INSTANCE, + not is_profile_run]): - - if all([ - is_prefill_run, - dist_kv.IS_KV_PREFILL_INSTANCE, - not is_profile_run]): - - # transfer KV cache and hidden state - dist_kv.buffer_kv_caches_send_and_listen_for_input_hash( - model_executable, - model_input, - kv_caches, - hidden_or_intermediate_states, - ) - - else: - - # skip prefill, receive KV cache and hidden state - hidden_or_intermediate_states = \ - 
dist_kv.send_input_hash_and_do_kv_caches_recv( - model_executable, - model_input, - kv_caches, - ) + dist_kv.send_kv_caches_and_hidden_states( + model_executable, + model_input, + kv_caches, + hidden_or_intermediate_states, + ) # Compute the logits in the last pipeline stage. From 96d38b49b1f3811e06728491bb2d33ac71298c12 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 11 Aug 2024 05:29:31 +0000 Subject: [PATCH 152/278] add visualization script --- .../visualize_benchmark_results.py | 90 +++++++------------ 1 file changed, 32 insertions(+), 58 deletions(-) diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index 8686fb2abf9b..192f26a1e3cd 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -2,72 +2,46 @@ import matplotlib.pyplot as plt import yaml import pandas as pd -from tabulate import tabulate +import json -def stringify(x): - return [str(i) for i in x] - if __name__ == "__main__": - - with open("results/chunk_vs_disagg.yaml", "r") as f: - data = yaml.load(f, Loader=yaml.FullLoader) - df = pd.DataFrame.from_dict(data) - - print_df = df.copy() - print_df.drop(columns=[ - "ttft_ratio", - "itl_ratio", - "prefill_decode_ratio", - ], inplace=True) - print_df.to_csv('results/chunk_vs_disagg.csv', index=False) - df["chunk_e2e"] = df["chunk_ttft"] + df["chunk_itl"] * df["output_len"] - df["disagg_e2e"] = df["disagg_ttft"] + df["disagg_itl"] * df["output_len"] - df["e2e_ratio"] = df["chunk_e2e"] / df["disagg_e2e"] + data = [] + for name in ['disagg_prefill', 'chunked_prefill']: + for qps in [2,4,6,8]: + with open(f"results/{name}-qps-{qps}.json", "r") as f: + x = json.load(f) + x['name'] = name + x['qps'] = qps + data.append(x) + + df = pd.DataFrame.from_dict(data) + dis_df = df[df['name'] == 'disagg_prefill'] + chu_df = df[df['name'] == 'chunked_prefill'] + plt.style.use('bmh') 
plt.rcParams['font.size'] = 20 - - - # qps vs performance - qps_df = df[df["output_len"] == 150].copy() - qps_df.drop(columns=[ - "chunk_itl", - "chunk_ttft", - "disagg_itl", - "disagg_ttft", - "output_len", - "prefill_decode_ratio", - ], inplace=True) - fig, ax = plt.subplots(figsize=(10, 7)) - qps_df.plot( - ax=ax, - kind="bar", - x="qps", - y=["ttft_ratio", "itl_ratio", "e2e_ratio"], - ylabel="$T_{chunked}~/~T_{disagg}$", - rot=0, - ) - ax.hlines(1, -1, 5, color='black') - fig.savefig('results/qps.png') - plt.close(fig) - # prefill decode ratio vs performance - tokens_df = df[df["output_len"] != 12] - fig, ax = plt.subplots(figsize=(10, 7)) - tokens_df.plot( - ax=ax, - kind="bar", - x="output_len", - xlabel="# of output tokens", - y=["ttft_ratio", "itl_ratio", "e2e_ratio", "prefill_decode_ratio"], - ylabel="$T_{chunked}~/~T_{disagg}$", - rot=0, - ) - ax.hlines(1, -1, 5, color='black') - fig.savefig('results/tokens.png') - plt.close(fig) + for key in ['mean_ttft_ms', + 'median_ttft_ms', + 'p99_ttft_ms', + 'mean_itl_ms', + 'median_itl_ms', + 'p99_itl_ms']: + + fig, ax = plt.subplots(figsize=(11, 7)) + plt.plot(dis_df['qps'], dis_df[key], label='disagg_prefill', marker='o', linewidth=4) + plt.plot(chu_df['qps'], chu_df[key], label='chunked_prefill', marker='o', linewidth=4) + ax.legend() + + ax.set_xlabel('QPS') + ax.set_ylabel(key) + ax.set_ylim(bottom=0) + fig.savefig(f'results/{key}.png') + plt.close(fig) + \ No newline at end of file From 3fc0c5cbca1d53c23a39bab8b26bfe56fe95b684 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 11 Aug 2024 05:30:04 +0000 Subject: [PATCH 153/278] fix bug: when KV transfer fails, do not return hidden state --- vllm/distributed/distributed_kv.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/distributed_kv.py b/vllm/distributed/distributed_kv.py index fd28a82c294a..9005a325d6bf 100644 --- a/vllm/distributed/distributed_kv.py +++ b/vllm/distributed/distributed_kv.py @@ -314,7 +314,7 @@ def 
block_if_buffer_full(self): # TODO: allow using other policies to handle buffer full while True: self.input_hash_to_kv_sending_requests_lock.acquire() - if len(self.input_hash_to_kv_sending_requests.keys()) > 55: + if len(self.input_hash_to_kv_sending_requests.keys()) > 40: self.input_hash_to_kv_sending_requests_lock.release() time.sleep(0.1) else: @@ -452,6 +452,11 @@ def recv_kv_caches_and_hidden_states( [num_tokens, model_executable.config.hidden_size]), kv_cache[0].dtype, is_hidden=True)) + + if not bypass_model_exec: + # Some of the KV cache is not retrieved + # so we need to recompute the hidden state + return [], bypass_model_exec # concatenate hidden states from different requests if isinstance(hidden_or_intermediate_states_for_one_req[0], torch.Tensor): From f9aadd87de1bb284d42692b8968b76dfaa775606 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 26 Aug 2024 10:55:10 -0700 Subject: [PATCH 154/278] add new abstractions --- vllm/distributed/kv_transfer/__init__.py | 0 .../kv_transfer/kv_database/__init__.py | 0 .../kv_transfer/kv_database/base.py | 16 + .../kv_transfer/kv_pipe/__init__.py | 0 vllm/distributed/kv_transfer/kv_pipe/base.py | 13 + .../kv_pipe/torch_distributed_pipe.py | 88 +++++ .../kv_transfer/kv_serde/__init__.py | 0 vllm/distributed/kv_transfer/kv_serde/base.py | 13 + vllm/distributed/kv_transfer/vllm_adapter.py | 341 ++++++++++++++++++ 9 files changed, 471 insertions(+) create mode 100644 vllm/distributed/kv_transfer/__init__.py create mode 100644 vllm/distributed/kv_transfer/kv_database/__init__.py create mode 100644 vllm/distributed/kv_transfer/kv_database/base.py create mode 100644 vllm/distributed/kv_transfer/kv_pipe/__init__.py create mode 100644 vllm/distributed/kv_transfer/kv_pipe/base.py create mode 100644 vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py create mode 100644 vllm/distributed/kv_transfer/kv_serde/__init__.py create mode 100644 vllm/distributed/kv_transfer/kv_serde/base.py create mode 100644 
vllm/distributed/kv_transfer/vllm_adapter.py diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/distributed/kv_transfer/kv_database/__init__.py b/vllm/distributed/kv_transfer/kv_database/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/distributed/kv_transfer/kv_database/base.py b/vllm/distributed/kv_transfer/kv_database/base.py new file mode 100644 index 000000000000..ae17650754bf --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_database/base.py @@ -0,0 +1,16 @@ + +from abc import ABC, abstractmethod +from typing import Optional +import torch + + +class KV_Database(ABC): + + @abstractmethod + def insert(self, input_tokens, kv, roi): + raise NotImplementedError + + @abstractmethod + def drop_select(self, input_tokens, roi) -> Optional[torch.Tensor]: + raise NotImplementedError + \ No newline at end of file diff --git a/vllm/distributed/kv_transfer/kv_pipe/__init__.py b/vllm/distributed/kv_transfer/kv_pipe/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py new file mode 100644 index 000000000000..625656adc266 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_pipe/base.py @@ -0,0 +1,13 @@ + +from abc import ABC, abstractmethod + + +class KVPipeBase(ABC): + + @abstractmethod + def send_tensor(self, tensor): + raise NotImplementedError + + @abstractmethod + def recv_tensor(self): + raise NotImplementedError \ No newline at end of file diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py new file mode 100644 index 000000000000..97fab4817198 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -0,0 +1,88 @@ + +from vllm.distributed.group_coordinator import GroupCoordinator +from 
vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase +from torch.distributed import Backend, ProcessGroup +import torch +from typing import List, Union, Optional + +class TorchDistributedPipe(KVPipeBase, GroupCoordinator): + +class DistributedKVCoordinator(GroupCoordinator): + """ + A class designated for distributed KV transfer + + Target use cases: + 1. Disaggregated prefill + 2. Remote KV cache storage + + """ + + def __init__( + self, + group_ranks: List[List[int]], + local_rank: int, + torch_distributed_backend: Union[str, Backend], + # DO NOT use pynccl here + # Pynccl send is non-blocking + # and it's possible that the memory is freed before the data being sent + # which may happen at high qps + use_pynccl: bool = False, + use_custom_allreduce: bool = False, + use_tpu_communicator: bool = True, + use_message_queue_broadcaster: bool = False, + blocking_send_recv: bool = False, + ): + + super().__init__( + group_ranks, + local_rank, + torch_distributed_backend, + use_pynccl, + use_custom_allreduce, + use_tpu_communicator, + use_message_queue_broadcaster, + ) + + # if turned on, will use CPU-based communication to perform a series of sanity check. + # but it adds ~5ms delay, so please turn it off in performance-demanding usecases (e.g. disaggregated prefill) + self.blocking_send_recv = blocking_send_recv + self.target_rank_for_send = self.ranks[(self.rank_in_group + 1) % + self.world_size] + self.target_rank_for_recv = self.ranks[(self.rank_in_group - 1) % + self.world_size] + torch.set_default_device(self.device) + + def send_tensor(self, + tensor: torch.Tensor) -> None: + """ + Sends a tensor to the destination rank in a non-blocking way. 
+ Flow: send tensor dim -- send tensor shape -- send tensor data + """ + + dim_tensor = torch.tensor([len(tensor.shape)], dtype=torch.int).to(self.device, non_blocking=True) + shape_tensor = torch.tensor(tensor.shape, dtype=torch.int).to(self.device, non_blocking=True) + + torch.distributed.isend(dim_tensor, self.target_rank_for_send, self.device_group) + torch.distributed.isend(shape_tensor, self.target_rank_for_send, self.device_group) + torch.distributed.isend(tensor, self.target_rank_for_send, self.device_group) + + def recv_tensor(self) -> torch.Tensor: + """Receives a tensor from the src rank. Blocking.""" + + # FIXME(Kuntai): this incurs frequent data moving between CPU and GPU + # can be optimized by pre-allocating tensors on GPU. + dim_tensor = torch.tensor([0], dtype=torch.int).to(self.device) + torch.distributed.irecv(dim_tensor, self.target_rank_for_recv, self.device_group) + dim = dim_tensor.item() + shape_tensor = torch.zeros(dim, dtype=torch.int).to(self.device) + torch.distributed.irecv(shape_tensor, self.target_rank_for_recv, self.device_group) + return_tensor = torch.zeros(shape_tensor, dtype=torch.float32).to(self.device) + torch.distributed.irecv(return_tensor, self.target_rank_for_recv, self.device_group) + + result = self.recv_tensor_dict(src) + tensor = result["tensor"] + assert torch.allclose(result["mean"], tensor.float().mean()) + assert result["shape"] == tensor.shape + assert result[ + "shape"] == size, f"The shape sent by sender is {result['shape']} but trying to receive {size}" + return tensor diff --git a/vllm/distributed/kv_transfer/kv_serde/__init__.py b/vllm/distributed/kv_transfer/kv_serde/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/vllm/distributed/kv_transfer/kv_serde/base.py b/vllm/distributed/kv_transfer/kv_serde/base.py new file mode 100644 index 000000000000..64168553ff15 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_serde/base.py @@ -0,0 +1,13 @@ + +import torch +from abc import ABC, 
abstractmethod + +class KV_serde(ABC): + + @abstractmethod + def serialize(self, tensor: torch.Tensor) -> torch.Tensor: + raise NotImplementedError + + @abstractmethod + def deserialize(self, data: torch.Tensor) -> torch.Tensor: + raise NotImplementedError \ No newline at end of file diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py new file mode 100644 index 000000000000..683185df1ab5 --- /dev/null +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -0,0 +1,341 @@ +"""vLLM distributed KV cache transfer API. +These APIs are used in `vllm/worker/model_runner.py`. + +Currently supporting TP and PP. + +Workflow: +- In prefill instance, KV cache sender *buffers* the KV cache send requests +- In decode instance + - KV cache receiver sends the hash of input tokens to sender + - KV cache sender executes send request + - KV cache receiver receives the KV cache +""" +from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING +from collections import defaultdict, deque +from concurrent.futures import ThreadPoolExecutor +from threading import Lock +from copy import deepcopy +import time +import threading + +import torch +from torch.distributed import Backend, ProcessGroup + +import vllm.envs as envs +from vllm.distributed.group_coordinator import GroupCoordinator +from vllm.logger import init_logger +import vllm.distributed.parallel_state as ps +from vllm import _custom_ops as ops +from vllm.sequence import IntermediateTensors + +assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode"], \ + "VLLM_DISAGG_PREFILL_ROLE can only be prefill or decode." 
+ +IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE is not None) +IS_KV_PREFILL_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "prefill") +IS_KV_DECODE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "decode") + +# add a tag when sending/recving input hash +DISTRIBUTED_KV_GLOO_TAG = 24857323 + +logger = init_logger(__name__) + +import logging + + +class RankFilter(logging.Filter): + + def filter(self, record): + # Only log if rank is 4 + rank = 1 + try: + rank = torch.distributed.get_rank() + except Exception: + pass + return rank % 4 == 0 + + +for handler in logger.handlers: + handler.addFilter(RankFilter()) + + +class DistributedKVCoordinator(GroupCoordinator): + """ + A class designated for distributed KV transfer + + Target use cases: + 1. Disaggregated prefill + 2. Remote KV cache storage + + """ + + def __init__( + self, + group_ranks: List[List[int]], + local_rank: int, + torch_distributed_backend: Union[str, Backend], + # DO NOT use pynccl here + # Pynccl send is non-blocking + # and it's possible that the memory is freed before the data being sent + # which may happen at high qps + use_pynccl: bool = False, + use_custom_allreduce: bool = False, + use_tpu_communicator: bool = True, + use_message_queue_broadcaster: bool = False, + use_cpu_comm_for_sanity_check: bool = False, + ): + + super().__init__( + group_ranks, + local_rank, + torch_distributed_backend, + use_pynccl, + use_custom_allreduce, + use_tpu_communicator, + use_message_queue_broadcaster, + ) + + # if turned on, will use CPU-based communication to perform a series of sanity check. + # but it adds ~5ms delay, so please turn it off in performance-demanding usecases (e.g. 
disaggregated prefill) + self.use_cpu_comm_for_sanity_check = use_cpu_comm_for_sanity_check + + # use a threadpool to buffer send request in disaggregated prefill + self.input_hash_to_kv_sending_requests = defaultdict(deque) + self.kv_sending_thread = None + self.input_hash_to_kv_sending_requests_lock = Lock() + self.target_rank_for_send = self.ranks[(self.rank_in_group + 1) % + self.world_size] + self.target_rank_for_recv = self.ranks[(self.rank_in_group - 1) % + self.world_size] + + torch.set_default_device(self.device) + + def debug_send(self, + tensor: torch.Tensor, + dst: Optional[int] = None) -> None: + """Sends a tensor to the destination rank in a non-blocking way""" + """Will send several metadata. Useful for debugging.""" + """NOTE: `dst` is the local rank of the destination rank.""" + + self.send_tensor_dict( + { + "tensor": tensor, + "mean": tensor.float().mean(), + "shape": tensor.shape + }, dst) + + def debug_recv(self, + size: torch.Size, + dtype: torch.dtype, + src: Optional[int] = None) -> torch.Tensor: + """Receives a tensor from the src rank.""" + """NOTE: `src` is the local rank of the destination rank.""" + + result = self.recv_tensor_dict(src) + tensor = result["tensor"] + assert torch.allclose(result["mean"], tensor.float().mean()) + assert result["shape"] == tensor.shape + assert result[ + "shape"] == size, f"The shape sent by sender is {result['shape']} but trying to receive {size}" + return tensor + + def kv_cache_send(self, + input_hash: int, + tensor: Union[torch.Tensor, IntermediateTensors], + is_hidden: bool = False, + dst: Optional[int] = None) -> None: + """Push the KV cache send request into the send buffer""" + """NOTE: `dst` is the local rank of the destination rank.""" + + if self.use_cpu_comm_for_sanity_check: + send_func = self.debug_send + else: + send_func = self.send + + if is_hidden and not ps.get_pp_group().is_last_rank: + + assert isinstance(tensor, IntermediateTensors) + + output = deepcopy(tensor.tensors) + for key in 
output: + output[key] = output[key].contiguous() + + self.input_hash_to_kv_sending_requests[input_hash].append( + [self.send_tensor_dict, output, dst]) + + else: + + assert isinstance(tensor, torch.Tensor) + + self.input_hash_to_kv_sending_requests[input_hash].append([ + send_func, + # use clone to make sure the tensor is contiguous + tensor.clone(), + dst + ]) + + def kv_cache_recv( + self, + size: torch.Size, + dtype: torch.dtype, + is_hidden: bool = False, + src: Optional[int] = None + ) -> Union[torch.Tensor, IntermediateTensors]: + """Receives a tensor from the src rank (blocking).""" + """This API should be used together with `push`""" + """NOTE: `src` is the local rank of the destination rank.""" + + if self.use_cpu_comm_for_sanity_check: + recv_func = self.debug_recv + else: + recv_func = self.recv + + if is_hidden and not ps.get_pp_group().is_last_rank: + tensor = IntermediateTensors(self.recv_tensor_dict(src)) + else: + tensor = recv_func(size, dtype, src) + + return tensor + + def send_input_hash(self, input_hash: int) -> int: + + logger.debug('[rank%d]: Sending input hash %d to rank %d', + torch.distributed.get_rank(), input_hash, + self.target_rank_for_send) + + # KV cache send go through CPU, and the original `send` only use GPU. + # So create a new group for sending input hash. 
+ input_hash_tensor = torch.tensor([input_hash], device="cpu").long() + torch.distributed.send(input_hash_tensor, + self.target_rank_for_send, + self.cpu_group, + tag=DISTRIBUTED_KV_GLOO_TAG) + return_tensor = torch.tensor([0], device="cpu").long() + torch.distributed.recv(return_tensor, + self.target_rank_for_recv, + self.cpu_group, + tag=DISTRIBUTED_KV_GLOO_TAG) + return return_tensor.item() + + def recv_input_hash(self) -> Optional[int]: + ''' + Receive an input hash, and check if it is already cached + ''' + input_hash_tensor = torch.tensor([0], device="cpu").long() + torch.distributed.recv(input_hash_tensor, + self.target_rank_for_recv, + self.cpu_group, + tag=DISTRIBUTED_KV_GLOO_TAG) + input_hash = input_hash_tensor.item() + # a new input hash comes in, see if it is already cached + self.input_hash_to_kv_sending_requests_lock.acquire() + logger.debug('Successfully received input hash %d', input_hash) + if input_hash not in self.input_hash_to_kv_sending_requests: + logger.warning( + f"The KV cache of {input_hash} does not exist. 
"\ + f"Existing input hash: {list(self.input_hash_to_kv_sending_requests.keys())}") + + # 0 for fail + x = torch.tensor([0], device="cpu").long() + torch.distributed.send(x, + self.target_rank_for_send, + self.cpu_group, + tag=DISTRIBUTED_KV_GLOO_TAG) + return None + else: + logger.debug('Input hash %d exists, start sending', input_hash) + + # 1 for success + x = torch.tensor([1], device="cpu").long() + torch.distributed.send(x, + self.target_rank_for_send, + self.cpu_group, + tag=DISTRIBUTED_KV_GLOO_TAG) + return input_hash + + def kv_cache_send_loop(self): + + while True: + logger.debug( + '[rank%d]: Waiting for input hash from rank %d, my keys are %s', + torch.distributed.get_rank(), + self.target_rank_for_recv, + list(self.input_hash_to_kv_sending_requests.keys()), + ) + # wait for a new input hash + # this function will acquire the lock + input_hash = self.recv_input_hash() + if input_hash is None: + self.input_hash_to_kv_sending_requests_lock.release() + continue + + # execute corresponding kv cache sending jobs in request queue + while True: + request = self.input_hash_to_kv_sending_requests[ + input_hash].popleft() + # An empty request: the KV cahe of one request are all sent + if request == []: + break + + request[0](*request[1:]) + + if len(self.input_hash_to_kv_sending_requests[input_hash]) == 0: + logger.debug('Finish input hash %d, free GPU memory...', + input_hash) + del self.input_hash_to_kv_sending_requests[input_hash] + else: + logger.debug( + 'The buffer for input hash %d is not empty, meaning that '\ + 'there are two jobs with identical input.', + input_hash) + + self.input_hash_to_kv_sending_requests_lock.release() + + + def kv_cache_send_ready(self, input_hash: int): + + if self.kv_sending_thread is None: + self.kv_sending_thread = threading.Thread( + target=self.kv_cache_send_loop) + self.kv_sending_thread.start() + + # append an empty list to separate requests + # as there might be identical requests, that has the same input hash + 
self.input_hash_to_kv_sending_requests[input_hash].append([]) + logger.debug(f'Buffered input hash {input_hash}') + + def kv_cache_recv_start(self, input_hash: int): + # notify the kv cache sender with the input hash id + return self.send_input_hash(input_hash) + + def block_if_buffer_full(self): + + # block vLLM if the KV cache sending buffer is full + # TODO: allow using other policies to handle buffer full + while True: + self.input_hash_to_kv_sending_requests_lock.acquire() + if len(self.input_hash_to_kv_sending_requests.keys()) > 40: + self.input_hash_to_kv_sending_requests_lock.release() + time.sleep(0.1) + else: + self.input_hash_to_kv_sending_requests_lock.release() + break + + +def buffer_kv( + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor], + hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], +) -> None: + + pass + + +def recv_kv( + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor] +) -> Tuple[List[torch.Tensor], Union[torch.Tensor, IntermediateTensors]]: + + pass \ No newline at end of file From db66a1e383bff0c2c0d7656289f6d7ac46b1f6ea Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 27 Aug 2024 22:15:53 -0700 Subject: [PATCH 155/278] major revision: add 3-layer abstractions. Transport, lookup buffer, and adapter. 
--- .../__init__.py | 0 .../{kv_database => kv_lookup_buffer}/base.py | 6 +- .../simple_kv_lookup_buffer.py | 122 ++++++ .../kv_pipe/torch_distributed_pipe.py | 91 ++-- .../kv_transfer/kv_serde/__init__.py | 0 vllm/distributed/kv_transfer/kv_serde/base.py | 13 - vllm/distributed/kv_transfer/vllm_adapter.py | 396 ++++++------------ 7 files changed, 311 insertions(+), 317 deletions(-) rename vllm/distributed/kv_transfer/{kv_database => kv_lookup_buffer}/__init__.py (100%) rename vllm/distributed/kv_transfer/{kv_database => kv_lookup_buffer}/base.py (65%) create mode 100644 vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py delete mode 100644 vllm/distributed/kv_transfer/kv_serde/__init__.py delete mode 100644 vllm/distributed/kv_transfer/kv_serde/base.py diff --git a/vllm/distributed/kv_transfer/kv_database/__init__.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py similarity index 100% rename from vllm/distributed/kv_transfer/kv_database/__init__.py rename to vllm/distributed/kv_transfer/kv_lookup_buffer/__init__.py diff --git a/vllm/distributed/kv_transfer/kv_database/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py similarity index 65% rename from vllm/distributed/kv_transfer/kv_database/base.py rename to vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index ae17650754bf..5ac8fbb24444 100644 --- a/vllm/distributed/kv_transfer/kv_database/base.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -4,10 +4,12 @@ import torch -class KV_Database(ABC): +class KVLookupBufferBase(ABC): @abstractmethod - def insert(self, input_tokens, kv, roi): + def insert(self, + input_tokens: torch.Tensor, + kv: torch.Tensor, roi) -> None: raise NotImplementedError @abstractmethod diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py new file mode 100644 index 000000000000..c43a41575aee --- /dev/null +++ 
b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py @@ -0,0 +1,122 @@ + +from vllm.distributed.kv_transfer.kv_lookup_buffer.base import \ + KVLookupBufferBase +from typing import Dict, Tuple, List, Optional +import threading +import torch +from collections import deque + +class SimpleKVLookupBuffer(KVLookupBufferBase): + + def __init__(self, pipe): + + self.tokens_roi_kv_buffer = deque() + + self.buffer_size = 0 + self.buffer_lock = threading.Lock() + self.pipe = pipe + self.request_handling_thread = None + + + def _matches(self, tokens_roi_sender, tokens_roi_recver): + + # tokens_roi_sender: tokens and roi of the producer (in the buffer) + # tokens_roi_recver: tokens and roi of the consumer (query) + + tokens_sender = tokens_roi_sender[0] + tokens_recver = tokens_roi_recver[0] + roi_sender = tokens_roi_sender[1] + roi_recver = tokens_roi_recver[1] + + if tokens_recver is None: + # consumer sends an empty request + # semantics: DROP SELECT * LIMIT 1 + # so any of the data in the buffer can be drop-selected + return True + + + if tokens_sender == tokens_recver[:tokens_sender.shape[0]]: + # drastically simplified + # accept a match as long as + + return True + + + def _send_tensor_and_dec_size(self, tensor: Optional[torch.Tensor]) -> None: + + assert tensor is not None, "Use self.pipe.send(None) instead" + self.buffer_size -= tensor.element_size() * tensor.numel() + tensor = tensor.clone() + self.pipe.send_tensor(tensor) + + def _add_to_buffer(self, input_tokens, roi, kv): + + self.buffer_size += input_tokens.element_size() * input_tokens.numel() + self.buffer_size += roi.element_size() * roi.numel() + self.buffer_size += kv.element_size() * kv.numel() + self.tokens_roi_kv_buffer.append((input_tokens, roi, kv)) + + + + + + def drop_select_handler(self): + + while True: + input_tokens = self.pipe.recv_tensor() + roi = self.pipe.recv_tensor() + tokens_roi = [input_tokens, roi] + + matched_idx = None + + # perform input tokens and roi matching + 
with self.buffer_lock: + + for idx, tokens_roi_kv in enumerate(self.tokens_roi_kv_buffer): + if self._matches(tokens_roi_kv, tokens_roi): + matched_idx = idx + break + + if matched_idx is not None: + # need to clone the tensor + # in case the tensor is freed before sending finishes + matched_item = self.tokens_roi_kv_buffer[matched_idx] + self._send_tensor_and_dec_size(matched_item[0].clone()) + self._send_tensor_and_dec_size(matched_item[1].clone()) + self._send_tensor_and_dec_size(matched_item[2].clone()) + del self.tokens_roi_kv_buffer[matched_idx] + + else: + # no match, just send None + self.pipe.send_tensor(None) + self.pipe.send_tensor(None) + self.pipe.send_tensor(None) + + + def drop_select(self, input_tokens, roi): + + assert self.request_handling_thread is None, \ + "drop_select should be called by the receiver" + + self.pipe.send_tensor(input_tokens.clone()) + self.pipe.send_tensor(roi.clone()) + + input_tokens = self.pipe.recv_tensor() + roi = self.pipe.recv_tensor() + kv = self.pipe.recv_tensor() + + return [input_tokens, roi, kv] + + + def insert(self, input_tokens, roi, kv) -> None: + + # when calling the insert, the current process is a sender + # need to launch the request handler and start listening to request. 
+ if self.request_handling_thread is None: + self.request_handling_thread = threading.Thread( + target=self.drop_select_handler) + self.request_handling_thread.start() + + with self.buffer_lock: + self._add_to_buffer(input_tokens, roi, kv) + \ No newline at end of file diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index 97fab4817198..4a16f5908679 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -4,19 +4,19 @@ from torch.distributed import Backend, ProcessGroup import torch from typing import List, Union, Optional +import threading +from concurrent.futures import ThreadPoolExecutor +import time +import threading + + +class BrokenPipeException(Exception): + def __init__(self, message): + self.message = message + super().__init__(self.message) class TorchDistributedPipe(KVPipeBase, GroupCoordinator): -class DistributedKVCoordinator(GroupCoordinator): - """ - A class designated for distributed KV transfer - - Target use cases: - 1. Disaggregated prefill - 2. Remote KV cache storage - - """ - def __init__( self, group_ranks: List[List[int]], @@ -30,7 +30,6 @@ def __init__( use_custom_allreduce: bool = False, use_tpu_communicator: bool = True, use_message_queue_broadcaster: bool = False, - blocking_send_recv: bool = False, ): super().__init__( @@ -45,44 +44,64 @@ def __init__( # if turned on, will use CPU-based communication to perform a series of sanity check. # but it adds ~5ms delay, so please turn it off in performance-demanding usecases (e.g. 
disaggregated prefill) - self.blocking_send_recv = blocking_send_recv self.target_rank_for_send = self.ranks[(self.rank_in_group + 1) % self.world_size] self.target_rank_for_recv = self.ranks[(self.rank_in_group - 1) % self.world_size] torch.set_default_device(self.device) + self.kv_sending_thread = None + self.buffer_size = 0 + self.buffer_size_lock = threading.lock() + + self.nan_tensor = torch.tensor(['nan']) + self.broken = False + + + def send_tensor_wrapper(self, tensor: torch.Tensor) -> None: + """Wrapper for send_tensor_dict""" + tensor_size = tensor['tensor'].element_size() * tensor['tensor'].numel() + self.send_tensor_dict({'tensor': tensor}, self.target_rank_for_send) + + with self.buffer_size_lock: + self.buffer_size = self.buffer_size - tensor_size + + def block_if_full(self): + + while self.buffer_size > 1e9: + time.sleep(0.05) + def send_tensor(self, - tensor: torch.Tensor) -> None: + tensor: Optional[torch.Tensor]) -> None: """ Sends a tensor to the destination rank in a non-blocking way. 
Flow: send tensor dim -- send tensor shape -- send tensor data """ - dim_tensor = torch.tensor([len(tensor.shape)], dtype=torch.int).to(self.device, non_blocking=True) - shape_tensor = torch.tensor(tensor.shape, dtype=torch.int).to(self.device, non_blocking=True) + if self.kv_sending_thread is None: + self.kv_sending_thread = ThreadPoolExecutor(max_workers=1) + + if tensor is None: + tensor = self.nan_tensor + tensor_size = 0 + else: + tensor_size = tensor.element_size() * tensor.numel() + + self.block_if_full() + + with self.buffer_size_lock: + self.buffer_size = self.buffer_size + tensor_size + + self.kv_sending_thread.submit(self.send_tensor_wrapper, tensor) + - torch.distributed.isend(dim_tensor, self.target_rank_for_send, self.device_group) - torch.distributed.isend(shape_tensor, self.target_rank_for_send, self.device_group) - torch.distributed.isend(tensor, self.target_rank_for_send, self.device_group) - def recv_tensor(self) -> torch.Tensor: + def recv_tensor(self) -> Optional[torch.Tensor]: """Receives a tensor from the src rank. Blocking.""" - # FIXME(Kuntai): this incurs frequent data moving between CPU and GPU - # can be optimized by pre-allocating tensors on GPU. 
- dim_tensor = torch.tensor([0], dtype=torch.int).to(self.device) - torch.distributed.irecv(dim_tensor, self.target_rank_for_recv, self.device_group) - dim = dim_tensor.item() - shape_tensor = torch.zeros(dim, dtype=torch.int).to(self.device) - torch.distributed.irecv(shape_tensor, self.target_rank_for_recv, self.device_group) - return_tensor = torch.zeros(shape_tensor, dtype=torch.float32).to(self.device) - torch.distributed.irecv(return_tensor, self.target_rank_for_recv, self.device_group) - - result = self.recv_tensor_dict(src) - tensor = result["tensor"] - assert torch.allclose(result["mean"], tensor.float().mean()) - assert result["shape"] == tensor.shape - assert result[ - "shape"] == size, f"The shape sent by sender is {result['shape']} but trying to receive {size}" - return tensor + tensor = self.recv_tensor_dict(self.target_rank_for_recv)['tensor'] + if tensor.isnan().item(): + return None + else: + return tensor + \ No newline at end of file diff --git a/vllm/distributed/kv_transfer/kv_serde/__init__.py b/vllm/distributed/kv_transfer/kv_serde/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/vllm/distributed/kv_transfer/kv_serde/base.py b/vllm/distributed/kv_transfer/kv_serde/base.py deleted file mode 100644 index 64168553ff15..000000000000 --- a/vllm/distributed/kv_transfer/kv_serde/base.py +++ /dev/null @@ -1,13 +0,0 @@ - -import torch -from abc import ABC, abstractmethod - -class KV_serde(ABC): - - @abstractmethod - def serialize(self, tensor: torch.Tensor) -> torch.Tensor: - raise NotImplementedError - - @abstractmethod - def deserialize(self, data: torch.Tensor) -> torch.Tensor: - raise NotImplementedError \ No newline at end of file diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 683185df1ab5..07358567e783 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -27,14 +27,20 @@ import 
vllm.distributed.parallel_state as ps from vllm import _custom_ops as ops from vllm.sequence import IntermediateTensors +from vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe import TorchDistributedPipe +from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer import SimpleKVLookupBuffer -assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode"], \ +assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode", "lmc"], \ "VLLM_DISAGG_PREFILL_ROLE can only be prefill or decode." -IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE is not None) +IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"]) IS_KV_PREFILL_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "prefill") IS_KV_DECODE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "decode") +'''Jiayi starts here''' +IS_LMC_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "lmc") +'''Jiayi ends here''' + # add a tag when sending/recving input hash DISTRIBUTED_KV_GLOO_TAG = 24857323 @@ -43,30 +49,13 @@ import logging -class RankFilter(logging.Filter): - - def filter(self, record): - # Only log if rank is 4 - rank = 1 - try: - rank = torch.distributed.get_rank() - except Exception: - pass - return rank % 4 == 0 - - -for handler in logger.handlers: - handler.addFilter(RankFilter()) - - -class DistributedKVCoordinator(GroupCoordinator): +class KV_transfer_agent: """ A class designated for distributed KV transfer Target use cases: 1. Disaggregated prefill 2. 
Remote KV cache storage - """ def __init__( @@ -74,18 +63,14 @@ def __init__( group_ranks: List[List[int]], local_rank: int, torch_distributed_backend: Union[str, Backend], - # DO NOT use pynccl here - # Pynccl send is non-blocking - # and it's possible that the memory is freed before the data being sent - # which may happen at high qps use_pynccl: bool = False, use_custom_allreduce: bool = False, use_tpu_communicator: bool = True, - use_message_queue_broadcaster: bool = False, - use_cpu_comm_for_sanity_check: bool = False, + use_message_queue_broadcaster: bool = False ): - - super().__init__( + + # init pipe + self.pipe = TorchDistributedPipe( group_ranks, local_rank, torch_distributed_backend, @@ -94,248 +79,127 @@ def __init__( use_tpu_communicator, use_message_queue_broadcaster, ) + # init lookup buffer + self.buffer = SimpleKVLookupBuffer(self.pipe) - # if turned on, will use CPU-based communication to perform a series of sanity check. - # but it adds ~5ms delay, so please turn it off in performance-demanding usecases (e.g. disaggregated prefill) - self.use_cpu_comm_for_sanity_check = use_cpu_comm_for_sanity_check - - # use a threadpool to buffer send request in disaggregated prefill - self.input_hash_to_kv_sending_requests = defaultdict(deque) - self.kv_sending_thread = None - self.input_hash_to_kv_sending_requests_lock = Lock() - self.target_rank_for_send = self.ranks[(self.rank_in_group + 1) % - self.world_size] - self.target_rank_for_recv = self.ranks[(self.rank_in_group - 1) % - self.world_size] - - torch.set_default_device(self.device) - - def debug_send(self, - tensor: torch.Tensor, - dst: Optional[int] = None) -> None: - """Sends a tensor to the destination rank in a non-blocking way""" - """Will send several metadata. 
Useful for debugging.""" - """NOTE: `dst` is the local rank of the destination rank.""" - - self.send_tensor_dict( - { - "tensor": tensor, - "mean": tensor.float().mean(), - "shape": tensor.shape - }, dst) - - def debug_recv(self, - size: torch.Size, - dtype: torch.dtype, - src: Optional[int] = None) -> torch.Tensor: - """Receives a tensor from the src rank.""" - """NOTE: `src` is the local rank of the destination rank.""" - - result = self.recv_tensor_dict(src) - tensor = result["tensor"] - assert torch.allclose(result["mean"], tensor.float().mean()) - assert result["shape"] == tensor.shape - assert result[ - "shape"] == size, f"The shape sent by sender is {result['shape']} but trying to receive {size}" - return tensor - - def kv_cache_send(self, - input_hash: int, - tensor: Union[torch.Tensor, IntermediateTensors], - is_hidden: bool = False, - dst: Optional[int] = None) -> None: - """Push the KV cache send request into the send buffer""" - """NOTE: `dst` is the local rank of the destination rank.""" - - if self.use_cpu_comm_for_sanity_check: - send_func = self.debug_send - else: - send_func = self.send - - if is_hidden and not ps.get_pp_group().is_last_rank: - - assert isinstance(tensor, IntermediateTensors) - - output = deepcopy(tensor.tensors) - for key in output: - output[key] = output[key].contiguous() - - self.input_hash_to_kv_sending_requests[input_hash].append( - [self.send_tensor_dict, output, dst]) - - else: - - assert isinstance(tensor, torch.Tensor) - - self.input_hash_to_kv_sending_requests[input_hash].append([ - send_func, - # use clone to make sure the tensor is contiguous - tensor.clone(), - dst - ]) - - def kv_cache_recv( - self, - size: torch.Size, - dtype: torch.dtype, - is_hidden: bool = False, - src: Optional[int] = None - ) -> Union[torch.Tensor, IntermediateTensors]: - """Receives a tensor from the src rank (blocking).""" - """This API should be used together with `push`""" - """NOTE: `src` is the local rank of the destination rank.""" - - 
if self.use_cpu_comm_for_sanity_check: - recv_func = self.debug_recv - else: - recv_func = self.recv - - if is_hidden and not ps.get_pp_group().is_last_rank: - tensor = IntermediateTensors(self.recv_tensor_dict(src)) - else: - tensor = recv_func(size, dtype, src) - - return tensor - - def send_input_hash(self, input_hash: int) -> int: - - logger.debug('[rank%d]: Sending input hash %d to rank %d', - torch.distributed.get_rank(), input_hash, - self.target_rank_for_send) - - # KV cache send go through CPU, and the original `send` only use GPU. - # So create a new group for sending input hash. - input_hash_tensor = torch.tensor([input_hash], device="cpu").long() - torch.distributed.send(input_hash_tensor, - self.target_rank_for_send, - self.cpu_group, - tag=DISTRIBUTED_KV_GLOO_TAG) - return_tensor = torch.tensor([0], device="cpu").long() - torch.distributed.recv(return_tensor, - self.target_rank_for_recv, - self.cpu_group, - tag=DISTRIBUTED_KV_GLOO_TAG) - return return_tensor.item() - - def recv_input_hash(self) -> Optional[int]: - ''' - Receive an input hash, and check if it is already cached - ''' - input_hash_tensor = torch.tensor([0], device="cpu").long() - torch.distributed.recv(input_hash_tensor, - self.target_rank_for_recv, - self.cpu_group, - tag=DISTRIBUTED_KV_GLOO_TAG) - input_hash = input_hash_tensor.item() - # a new input hash comes in, see if it is already cached - self.input_hash_to_kv_sending_requests_lock.acquire() - logger.debug('Successfully received input hash %d', input_hash) - if input_hash not in self.input_hash_to_kv_sending_requests: - logger.warning( - f"The KV cache of {input_hash} does not exist. 
"\ - f"Existing input hash: {list(self.input_hash_to_kv_sending_requests.keys())}") + def send_kv_caches_and_hidden_states( + self, + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor], + hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], + ) -> None: + + #input_tokens_tuple = tuple(model_input.input_tokens.tolist()) + input_tokens_tensor = model_input.input_tokens + seq_lens = model_input.attn_metadata.seq_lens + slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten() + + # query_lens contains new KV caches that are added to vLLM. + # so we will send them to decode instance + # FIXME(Kuntai): This assume that all requests are prefill. + for idx, slen in enumerate(seq_lens): + logger.debug(f"sending request {idx}") + start_pos = sum(seq_lens[:idx]) + end_pos = start_pos + slen - # 0 for fail - x = torch.tensor([0], device="cpu").long() - torch.distributed.send(x, - self.target_rank_for_send, - self.cpu_group, - tag=DISTRIBUTED_KV_GLOO_TAG) - return None - else: - logger.debug('Input hash %d exists, start sending', input_hash) + keys, values = [], [] - # 1 for success - x = torch.tensor([1], device="cpu").long() - torch.distributed.send(x, - self.target_rank_for_send, - self.cpu_group, - tag=DISTRIBUTED_KV_GLOO_TAG) - return input_hash - - def kv_cache_send_loop(self): - - while True: - logger.debug( - '[rank%d]: Waiting for input hash from rank %d, my keys are %s', - torch.distributed.get_rank(), - self.target_rank_for_recv, - list(self.input_hash_to_kv_sending_requests.keys()), + + for l in range(model_executable.model.start_layer, + model_executable.model.end_layer): + logger.debug(f"sending layer {l}") + kv_cache = kv_caches[l - model_executable.model.start_layer] + + _, _, num_heads, head_size = kv_cache[0].shape + + key_cache = kv_cache[0].reshape(-1, num_heads, head_size) + value_cache = kv_cache[1].reshape(-1, num_heads, head_size) + + 
current_slot_mapping = slot_mapping_flat[start_pos:end_pos] + + keys.append(key_cache[current_slot_mapping].unsqeeze(0)) + values.append(value_cache[current_slot_mapping].unsqueeze(0)) + + keys = torch.cat(keys, dim=0) + values = torch.cat(values, dim=0) + self.buffer.insert( + input_tokens_tensor[start_pos:end_pos], + None, + keys, + values, + hidden_or_intermediate_states[start_pos:end_pos] ) - # wait for a new input hash - # this function will acquire the lock - input_hash = self.recv_input_hash() - if input_hash is None: - self.input_hash_to_kv_sending_requests_lock.release() - continue - - # execute corresponding kv cache sending jobs in request queue - while True: - request = self.input_hash_to_kv_sending_requests[ - input_hash].popleft() - # An empty request: the KV cahe of one request are all sent - if request == []: - break - - request[0](*request[1:]) - - if len(self.input_hash_to_kv_sending_requests[input_hash]) == 0: - logger.debug('Finish input hash %d, free GPU memory...', - input_hash) - del self.input_hash_to_kv_sending_requests[input_hash] - else: - logger.debug( - 'The buffer for input hash %d is not empty, meaning that '\ - 'there are two jobs with identical input.', - input_hash) - - self.input_hash_to_kv_sending_requests_lock.release() - - - def kv_cache_send_ready(self, input_hash: int): - - if self.kv_sending_thread is None: - self.kv_sending_thread = threading.Thread( - target=self.kv_cache_send_loop) - self.kv_sending_thread.start() - - # append an empty list to separate requests - # as there might be identical requests, that has the same input hash - self.input_hash_to_kv_sending_requests[input_hash].append([]) - logger.debug(f'Buffered input hash {input_hash}') - - def kv_cache_recv_start(self, input_hash: int): - # notify the kv cache sender with the input hash id - return self.send_input_hash(input_hash) + - def block_if_buffer_full(self): - - # block vLLM if the KV cache sending buffer is full - # TODO: allow using other policies to 
handle buffer full - while True: - self.input_hash_to_kv_sending_requests_lock.acquire() - if len(self.input_hash_to_kv_sending_requests.keys()) > 40: - self.input_hash_to_kv_sending_requests_lock.release() - time.sleep(0.1) - else: - self.input_hash_to_kv_sending_requests_lock.release() - break - - -def buffer_kv( - model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor], - hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], -) -> None: - - pass + logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) -def recv_kv( - model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor] -) -> Tuple[List[torch.Tensor], Union[torch.Tensor, IntermediateTensors]]: - - pass \ No newline at end of file + def recv_kv_caches_and_hidden_states( + self, + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool]: + + bypass_model_exec = True + + # This is disagg decode instance, during prefill state + # Need to receive KV from the prefill instance + input_tokens_tuple = tuple(model_input.input_tokens.tolist()) + seq_lens = model_input.attn_metadata.seq_lens + slot_mapping = model_input.attn_metadata.slot_mapping.flatten() + + hidden_or_intermediate_states_for_one_req = [] + + # enumerate different requests + # FIXME(Kuntai): This impl assumes that all requests are prefill. + for idx, slen in enumerate(seq_lens): + + start_pos = sum(seq_lens[:idx]) + end_pos = start_pos + slen + current_input_tokens = input_tokens_tuple[start_pos:end_pos] + num_tokens = slen + + ret = self.buffer.drop_select(current_input_tokens, None) + if ret[0] is None: + # didn't find any match. 
+ self.bypass_model_exec = False + continue + + _, _, keys, values, hidden = ret + + # receive KV cache from disaggregated prefill instance + for i in range(model_executable.model.start_layer, + model_executable.model.end_layer): + + # get kv cache + kv_cache = kv_caches[i - model_executable.model.start_layer] + # get corresponding layer + layer = model_executable.model.layers[i] + + key_cache, value_cache = kv_cache[0], kv_cache[1] + ops.reshape_and_cache_flash( + keys[i], + values[i], + key_cache, + value_cache, + slot_mapping[start_pos:end_pos], + layer.self_attn.attn.kv_cache_dtype, + layer.self_attn.attn._k_scale, + layer.self_attn.attn._v_scale, + ) + + hidden_or_intermediate_states_for_one_req.append(hidden) + + if not bypass_model_exec: + # Some of the KV cache is not retrieved + # so we need to recompute the hidden state + return [], bypass_model_exec + + # concatenate hidden states from different requests + hidden_or_intermediate_states = torch.cat( + hidden_or_intermediate_states_for_one_req, dim=0) + + logger.debug("[rank%d]: KV recv DONE.", torch.distributed.get_rank()) + return hidden_or_intermediate_states, bypass_model_exec \ No newline at end of file From e04430c574f89590778f3d2aea55518c1e648a50 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 27 Aug 2024 22:20:06 -0700 Subject: [PATCH 156/278] add kv transfer test --- tests/random_send_recv.py | 0 tests/test_send_recv.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/random_send_recv.py create mode 100644 tests/test_send_recv.sh diff --git a/tests/random_send_recv.py b/tests/random_send_recv.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/test_send_recv.sh b/tests/test_send_recv.sh new file mode 100644 index 000000000000..e69de29bb2d1 From 30f9bb670d8e4110de5ec906c3695b438fbe390d Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Wed, 28 Aug 2024 07:25:58 +0000 Subject: [PATCH 157/278] add test cases for pipe --- 
tests/kv_transfer/test_send_recv.py | 81 +++++++++++++++++++++++++++++ tests/kv_transfer/test_send_recv.sh | 3 ++ 2 files changed, 84 insertions(+) create mode 100644 tests/kv_transfer/test_send_recv.py create mode 100644 tests/kv_transfer/test_send_recv.sh diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py new file mode 100644 index 000000000000..c8be888ef6aa --- /dev/null +++ b/tests/kv_transfer/test_send_recv.py @@ -0,0 +1,81 @@ + +import vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe as tdp +import torch +import os +import random +from tqdm import tqdm + +my_rank = int(os.environ['RANK']) + + +torch.distributed.init_process_group( + init_method="tcp://127.0.0.1:23456", + world_size=2, + rank=my_rank) + +print("initialized! My rank is %d" % my_rank) + + +pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "nccl") + +print('My device is ', pipe.device, ' default: ', torch.cuda.current_device()) +print(pipe.target_rank_for_send, pipe.target_rank_for_recv) + +# test run + +if my_rank == 0: + x = torch.tensor([1]).to(pipe.device) + pipe.send(x, 1) + + +else: + y = torch.tensor([0]).to(pipe.device) + y = pipe.recv(y.shape, y.dtype) + + assert y.item() == 1 + +# if my_rank == 0: +# x = torch.tensor([1]).to(pipe.device) +# torch.distributed.send(x, dst=1, group=pipe.device_group) +# else: +# x = torch.tensor([0]).to(pipe.device) +# torch.distributed.recv(x, src=0, group=pipe.device_group) +# assert x.item() == 1 + +print(my_rank, 'Test run successed! 
') + +if my_rank == 0: + # send a tensor 1000 times + for i in range(3): + + mean = random.randint(10, 100) + std = random.randint(10, 100) + size = [random.randint(10, 100), random.randint(10, 100)] + x = torch.normal(mean, std, size=size).to(pipe.device) + + if i % 10 == 0: + pipe.send_tensor(None) + pipe.send_tensor(None) + pipe.send_tensor(None) + else: + pipe.send_tensor(x) + pipe.send_tensor(x.mean()) + pipe.send_tensor(x.std()) + +else: + # recv a tensor 1000 times + for i in tqdm(range(2)): + + x = pipe.recv_tensor() + mean = pipe.recv_tensor() + std = pipe.recv_tensor() + + if x is None: + assert mean is None, std is None + else: + assert x.mean() == mean + assert x.std() == std + + + + \ No newline at end of file diff --git a/tests/kv_transfer/test_send_recv.sh b/tests/kv_transfer/test_send_recv.sh new file mode 100644 index 000000000000..2a478871bd0e --- /dev/null +++ b/tests/kv_transfer/test_send_recv.sh @@ -0,0 +1,3 @@ + +RANK=0 python3 test_send_recv.py & +RANK=1 python3 test_send_recv.py & From bbce62ea8cf0f207461f0776839a838623465325 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Wed, 28 Aug 2024 07:26:20 +0000 Subject: [PATCH 158/278] bug fix --- .../kv_transfer/kv_pipe/torch_distributed_pipe.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index 4a16f5908679..dd540ead9440 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -10,6 +10,11 @@ import threading +# if the tensor is only one-element and only contains this number +# this means that the sended object is None. 
+NONE_INT = -150886311 + + class BrokenPipeException(Exception): def __init__(self, message): self.message = message @@ -52,13 +57,14 @@ def __init__( self.kv_sending_thread = None self.buffer_size = 0 - self.buffer_size_lock = threading.lock() + self.buffer_size_lock = threading.Lock() - self.nan_tensor = torch.tensor(['nan']) + self.none_tensor = torch.tensor([NONE_INT]).to(self.device) self.broken = False def send_tensor_wrapper(self, tensor: torch.Tensor) -> None: + print('Sending ', tensor) """Wrapper for send_tensor_dict""" tensor_size = tensor['tensor'].element_size() * tensor['tensor'].numel() self.send_tensor_dict({'tensor': tensor}, self.target_rank_for_send) @@ -82,7 +88,7 @@ def send_tensor(self, self.kv_sending_thread = ThreadPoolExecutor(max_workers=1) if tensor is None: - tensor = self.nan_tensor + tensor = self.none_tensor tensor_size = 0 else: tensor_size = tensor.element_size() * tensor.numel() @@ -100,7 +106,7 @@ def recv_tensor(self) -> Optional[torch.Tensor]: """Receives a tensor from the src rank. 
Blocking.""" tensor = self.recv_tensor_dict(self.target_rank_for_recv)['tensor'] - if tensor.isnan().item(): + if tensor.numel() == 1 and tensor.item() == 150886311: return None else: return tensor From 927800d9aa0b61f17edbc333b25bafd197ccf4d5 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 29 Aug 2024 07:02:04 +0000 Subject: [PATCH 159/278] finalize send-recv test --- tests/kv_transfer/test_send_recv.py | 154 +++++++++++------- .../kv_pipe/torch_distributed_pipe.py | 5 +- 2 files changed, 97 insertions(+), 62 deletions(-) diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index c8be888ef6aa..5d59c38c4f27 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -4,78 +4,114 @@ import os import random from tqdm import tqdm +import time -my_rank = int(os.environ['RANK']) +def test_run(my_rank, pipe): + # test run + if my_rank == 0: + x = torch.tensor([1]).to(pipe.device) + pipe.send_tensor(x) + else: + y = pipe.recv_tensor() + assert y.item() == 1 -torch.distributed.init_process_group( - init_method="tcp://127.0.0.1:23456", - world_size=2, - rank=my_rank) -print("initialized! 
My rank is %d" % my_rank) - - -pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "nccl") - -print('My device is ', pipe.device, ' default: ', torch.cuda.current_device()) -print(pipe.target_rank_for_send, pipe.target_rank_for_recv) - -# test run - -if my_rank == 0: - x = torch.tensor([1]).to(pipe.device) - pipe.send(x, 1) +def stress_test(my_rank, pipe): + torch.distributed.barrier() -else: - y = torch.tensor([0]).to(pipe.device) - y = pipe.recv(y.shape, y.dtype) + tensors = [] - assert y.item() == 1 - -# if my_rank == 0: -# x = torch.tensor([1]).to(pipe.device) -# torch.distributed.send(x, dst=1, group=pipe.device_group) -# else: -# x = torch.tensor([0]).to(pipe.device) -# torch.distributed.recv(x, src=0, group=pipe.device_group) -# assert x.item() == 1 + if my_rank == 0: + for i in tqdm(range(2000)): + mean = random.randint(1, 10) + std = random.randint(1, 10) + size = [random.randint(900, 1000), random.randint(900, 1000)] + x = torch.normal(mean * 1.0, std * 1.0, size=size).to(pipe.device) + + # 5% probability of sending a None + if random.randint(1, 100) < 5: + tensors.append(None) + tensors.append(None) + tensors.append(None) + else: + tensors.append(x) + tensors.append(x.mean()) + tensors.append(x.std()) + + torch.distributed.barrier() + + for i in tqdm(range(2000)): + if my_rank == 0: + pipe.send_tensor(tensors[3*i]) + pipe.send_tensor(tensors[3*i+1]) + pipe.send_tensor(tensors[3*i+2]) + else: + x = pipe.recv_tensor() + mean = pipe.recv_tensor() + std = pipe.recv_tensor() + if x is None: + assert mean is None + assert std is None + else: + assert x.mean() == mean + assert x.std() == std -print(my_rank, 'Test run successed! 
') + torch.distributed.barrier() -if my_rank == 0: - # send a tensor 1000 times - for i in range(3): + print("Stress test passed.") + + + +def latency_test(my_rank, pipe, nelement, ntensor): + + latencies = [] + + torch.distributed.barrier() + + for i in tqdm(range(1000)): - mean = random.randint(10, 100) - std = random.randint(10, 100) - size = [random.randint(10, 100), random.randint(10, 100)] - x = torch.normal(mean, std, size=size).to(pipe.device) + tensors = [] - if i % 10 == 0: - pipe.send_tensor(None) - pipe.send_tensor(None) - pipe.send_tensor(None) - else: - pipe.send_tensor(x) - pipe.send_tensor(x.mean()) - pipe.send_tensor(x.std()) - -else: - # recv a tensor 1000 times - for i in tqdm(range(2)): + if my_rank == 0: + # create tensor + tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)] - x = pipe.recv_tensor() - mean = pipe.recv_tensor() - std = pipe.recv_tensor() + torch.distributed.barrier() - if x is None: - assert mean is None, std is None + if my_rank == 0: + t = torch.tensor(time.time(), dtype=torch.float64).to(pipe.device) + for tensor in tensors: + pipe.send_tensor(tensor) + pipe.send_tensor(t) else: - assert x.mean() == mean - assert x.std() == std + for _ in range(ntensor): + pipe.recv_tensor() + t = pipe.recv_tensor() + latencies.append(time.time() - t.item()) + + torch.distributed.barrier() + + print('Latency test passed.') + print('Latency:', torch.tensor(latencies).mean().item() * 1000, 'ms') - - \ No newline at end of file +if __name__ == "__main__": + + my_rank = int(os.environ['RANK']) + + + torch.distributed.init_process_group( + init_method="tcp://127.0.0.1:23456", + world_size=2, + rank=my_rank) + + print("initialized! 
My rank is %d" % my_rank) + + + pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "nccl") + + test_run(my_rank, pipe) + stress_test(my_rank, pipe) + latency_test(my_rank, pipe, 1024*8*128, 80) diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index dd540ead9440..d3663ac7667d 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -64,9 +64,8 @@ def __init__( def send_tensor_wrapper(self, tensor: torch.Tensor) -> None: - print('Sending ', tensor) """Wrapper for send_tensor_dict""" - tensor_size = tensor['tensor'].element_size() * tensor['tensor'].numel() + tensor_size = tensor.element_size() * tensor.numel() self.send_tensor_dict({'tensor': tensor}, self.target_rank_for_send) with self.buffer_size_lock: @@ -106,7 +105,7 @@ def recv_tensor(self) -> Optional[torch.Tensor]: """Receives a tensor from the src rank. 
Blocking.""" tensor = self.recv_tensor_dict(self.target_rank_for_recv)['tensor'] - if tensor.numel() == 1 and tensor.item() == 150886311: + if tensor.numel() == 1 and tensor.item() == NONE_INT: return None else: return tensor From 6680ea780d57fed1cb162c7efef3e677992d3bd2 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 29 Aug 2024 07:48:27 +0000 Subject: [PATCH 160/278] update test case so that there are both send and recv --- tests/kv_transfer/test_send_recv.py | 33 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 5d59c38c4f27..5a2a72cf177f 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -23,27 +23,26 @@ def stress_test(my_rank, pipe): tensors = [] - if my_rank == 0: - for i in tqdm(range(2000)): - mean = random.randint(1, 10) - std = random.randint(1, 10) - size = [random.randint(900, 1000), random.randint(900, 1000)] - x = torch.normal(mean * 1.0, std * 1.0, size=size).to(pipe.device) - - # 5% probability of sending a None - if random.randint(1, 100) < 5: - tensors.append(None) - tensors.append(None) - tensors.append(None) - else: - tensors.append(x) - tensors.append(x.mean()) - tensors.append(x.std()) + for i in tqdm(range(2000)): + mean = random.randint(1, 10) + std = random.randint(1, 10) + size = [random.randint(900, 1000), random.randint(900, 1000)] + x = torch.normal(mean * 1.0, std * 1.0, size=size).to(pipe.device) + + # 5% probability of sending a None + if random.randint(1, 100) < 5: + tensors.append(None) + tensors.append(None) + tensors.append(None) + else: + tensors.append(x) + tensors.append(x.mean()) + tensors.append(x.std()) torch.distributed.barrier() for i in tqdm(range(2000)): - if my_rank == 0: + if my_rank == int((i % 10) > 3): pipe.send_tensor(tensors[3*i]) pipe.send_tensor(tensors[3*i+1]) pipe.send_tensor(tensors[3*i+2]) From dfbfe80ba5a26a02011d651b0b78c8c38f3a3b86 Mon 
Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 29 Aug 2024 08:08:27 +0000 Subject: [PATCH 161/278] update kv lookup buffer --- I am TOOOOOOO sleepy --- .../simple_kv_lookup_buffer.py | 46 ++++++++++--------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py index c43a41575aee..9864e5f2d42b 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py @@ -10,7 +10,7 @@ class SimpleKVLookupBuffer(KVLookupBufferBase): def __init__(self, pipe): - self.tokens_roi_kv_buffer = deque() + self.buffer = deque() self.buffer_size = 0 self.buffer_lock = threading.Lock() @@ -35,11 +35,13 @@ def _matches(self, tokens_roi_sender, tokens_roi_recver): return True - if tokens_sender == tokens_recver[:tokens_sender.shape[0]]: + min_length = min(len(tokens_sender), len(tokens_receiver)) + if tokens_sender[:min_length] == tokens_recver[:min_length]: # drastically simplified - # accept a match as long as - + # common prefix matching return True + + return None def _send_tensor_and_dec_size(self, tensor: Optional[torch.Tensor]) -> None: @@ -49,15 +51,14 @@ def _send_tensor_and_dec_size(self, tensor: Optional[torch.Tensor]) -> None: tensor = tensor.clone() self.pipe.send_tensor(tensor) - def _add_to_buffer(self, input_tokens, roi, kv): + def _add_to_buffer(self, input_tokens, roi, key, value, hidden): self.buffer_size += input_tokens.element_size() * input_tokens.numel() self.buffer_size += roi.element_size() * roi.numel() - self.buffer_size += kv.element_size() * kv.numel() - self.tokens_roi_kv_buffer.append((input_tokens, roi, kv)) - - - + self.buffer_size += key.element_size() * key.numel() + self.buffer_size += value.element_size() * value.numel() + self.buffer_size += hidden.element_size() * hidden.numel() + 
self.buffer.append([input_tokens, roi, kv, hidden]) def drop_select_handler(self): @@ -81,17 +82,15 @@ def drop_select_handler(self): # need to clone the tensor # in case the tensor is freed before sending finishes matched_item = self.tokens_roi_kv_buffer[matched_idx] - self._send_tensor_and_dec_size(matched_item[0].clone()) - self._send_tensor_and_dec_size(matched_item[1].clone()) - self._send_tensor_and_dec_size(matched_item[2].clone()) + for tensor in matched_item: + self._send_tensor_and_dec_size(tensor.clone()) del self.tokens_roi_kv_buffer[matched_idx] else: # no match, just send None - self.pipe.send_tensor(None) - self.pipe.send_tensor(None) - self.pipe.send_tensor(None) - + for _ in range(5): + self.pipe.send_tensor(None) + def drop_select(self, input_tokens, roi): @@ -103,12 +102,17 @@ def drop_select(self, input_tokens, roi): input_tokens = self.pipe.recv_tensor() roi = self.pipe.recv_tensor() - kv = self.pipe.recv_tensor() + key = self.pipe.recv_tensor() + value = self.pipe.recv_tensor() + hidden = self.pipe.recv_tensor() - return [input_tokens, roi, kv] + return [input_tokens, roi, key, value, hidden] - def insert(self, input_tokens, roi, kv) -> None: + def insert(self, input_tokens, roi, key, value, hidden) -> None: + + with self.buffer_lock: + self._add_to_buffer(input_tokens, roi, key, value, hidden) # when calling the insert, the current process is a sender # need to launch the request handler and start listening to request. 
@@ -117,6 +121,4 @@ def insert(self, input_tokens, roi, kv) -> None: target=self.drop_select_handler) self.request_handling_thread.start() - with self.buffer_lock: - self._add_to_buffer(input_tokens, roi, kv) \ No newline at end of file From b566b18e4101190787077f7a889f10c2229e951a Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Wed, 4 Sep 2024 06:15:39 +0000 Subject: [PATCH 162/278] add lookup buffer test --- tests/kv_transfer/test_lookup_buffer.py | 132 ++++++++++++++++++++++++ tests/kv_transfer/test_lookup_buffer.sh | 3 + 2 files changed, 135 insertions(+) create mode 100644 tests/kv_transfer/test_lookup_buffer.py create mode 100644 tests/kv_transfer/test_lookup_buffer.sh diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py new file mode 100644 index 000000000000..477eafbfafc2 --- /dev/null +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -0,0 +1,132 @@ + +import vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe as tdp +import vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer as sklb +import torch +import os +import random +from tqdm import tqdm +import time + + +def test_run(my_rank, buffer): + # test run + tokens = torch.tensor([1,2,3]).to(buffer.pipe.device) + + if my_rank == 0: + key = 2.0 * torch.ones([5, 6]).to(buffer.pipe.device) + value = 3.0 * torch.ones([5, 6]).to(buffer.pipe.device) + + placeholder = torch.tensor([1]).to(buffer.pipe.device) + + buffer.insert(tokens, placeholder, key, value, placeholder) + + else: + placeholder = torch.tensor([1]).to(buffer.pipe.device) + tok, roi, key, value, hidden = buffer.drop_select(tokens, placeholder) + assert torch.allclose(tokens, tok) + assert torch.allclose(key, 2.0 * torch.ones([5, 6])) + assert torch.allclose(value, 3.0 * torch.ones([5, 6])) + + torch.distributed.barrier() + + if my_rank == 0: + assert buffer.buffer_size == 0 + assert len(buffer.buffer) == 0 + + +def stress_test(my_rank, pipe): + + torch.distributed.barrier() + + tensors 
= [] + + for i in tqdm(range(2000)): + mean = random.randint(1, 10) + std = random.randint(1, 10) + size = [random.randint(900, 1000), random.randint(900, 1000)] + x = torch.normal(mean * 1.0, std * 1.0, size=size).to(pipe.device) + + # 5% probability of sending a None + if random.randint(1, 100) < 5: + tensors.append(None) + tensors.append(None) + tensors.append(None) + else: + tensors.append(x) + tensors.append(x.mean()) + tensors.append(x.std()) + + torch.distributed.barrier() + + for i in tqdm(range(2000)): + if my_rank == int((i % 10) > 3): + pipe.send_tensor(tensors[3*i]) + pipe.send_tensor(tensors[3*i+1]) + pipe.send_tensor(tensors[3*i+2]) + else: + x = pipe.recv_tensor() + mean = pipe.recv_tensor() + std = pipe.recv_tensor() + if x is None: + assert mean is None + assert std is None + else: + assert x.mean() == mean + assert x.std() == std + + torch.distributed.barrier() + + print("Stress test passed.") + + + +def latency_test(my_rank, pipe, nelement, ntensor): + + latencies = [] + + torch.distributed.barrier() + + for i in tqdm(range(1000)): + + tensors = [] + + if my_rank == 0: + # create tensor + tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)] + + torch.distributed.barrier() + + if my_rank == 0: + t = torch.tensor(time.time(), dtype=torch.float64).to(pipe.device) + for tensor in tensors: + pipe.send_tensor(tensor) + pipe.send_tensor(t) + else: + for _ in range(ntensor): + pipe.recv_tensor() + t = pipe.recv_tensor() + latencies.append(time.time() - t.item()) + + torch.distributed.barrier() + + print('Latency test passed.') + print('Latency:', torch.tensor(latencies).mean().item() * 1000, 'ms') + + +if __name__ == "__main__": + + my_rank = int(os.environ['RANK']) + + + torch.distributed.init_process_group( + init_method="tcp://127.0.0.1:23456", + world_size=2, + rank=my_rank) + + print("initialized! 
My rank is %d" % my_rank) + + + pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "nccl") + buffer = sklb.SimpleKVLookupBuffer(pipe) + + test_run(my_rank, buffer) diff --git a/tests/kv_transfer/test_lookup_buffer.sh b/tests/kv_transfer/test_lookup_buffer.sh new file mode 100644 index 000000000000..336b540e7054 --- /dev/null +++ b/tests/kv_transfer/test_lookup_buffer.sh @@ -0,0 +1,3 @@ + +RANK=0 python3 test_lookup_buffer.py & +RANK=1 python3 test_lookup_buffer.py & From fc2c972bdf1beb203c7711f40a803795b2e1502d Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Wed, 4 Sep 2024 06:16:10 +0000 Subject: [PATCH 163/278] update lookup buffer --- .../simple_kv_lookup_buffer.py | 69 ++++++++++++++----- 1 file changed, 50 insertions(+), 19 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py index 9864e5f2d42b..a5b0ee4c3c72 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py @@ -35,30 +35,51 @@ def _matches(self, tokens_roi_sender, tokens_roi_recver): return True - min_length = min(len(tokens_sender), len(tokens_receiver)) - if tokens_sender[:min_length] == tokens_recver[:min_length]: + min_length = min(len(tokens_sender), len(tokens_recver)) + if torch.allclose(tokens_sender[:min_length], tokens_recver[:min_length]): # drastically simplified # common prefix matching - return True + print("min length is ", min_length) + return min_length - return None + return 0 def _send_tensor_and_dec_size(self, tensor: Optional[torch.Tensor]) -> None: assert tensor is not None, "Use self.pipe.send(None) instead" self.buffer_size -= tensor.element_size() * tensor.numel() - tensor = tensor.clone() self.pipe.send_tensor(tensor) + + def _get_element_size(self, data): + + if data == [] or data is None: + return 0 + if isinstance(data, torch.Tensor): + return 
data.element_size() * data.numel() + + assert False, "Unknown data type %s" % type(data) def _add_to_buffer(self, input_tokens, roi, key, value, hidden): + + if isinstance(input_tokens, torch.Tensor): + input_tokens = input_tokens.clone() + if isinstance(roi, torch.Tensor): + roi = roi.clone() + if isinstance(key, torch.Tensor): + key = key.clone() + if isinstance(value, torch.Tensor): + value = value.clone() + if isinstance(hidden, torch.Tensor): + hidden = hidden.clone() + - self.buffer_size += input_tokens.element_size() * input_tokens.numel() - self.buffer_size += roi.element_size() * roi.numel() - self.buffer_size += key.element_size() * key.numel() - self.buffer_size += value.element_size() * value.numel() - self.buffer_size += hidden.element_size() * hidden.numel() - self.buffer.append([input_tokens, roi, kv, hidden]) + buffer_item = [input_tokens, roi, key, value, hidden] + + with self.buffer_lock: + for data in buffer_item: + self.buffer_size += self._get_element_size(data) + self.buffer.append(buffer_item) def drop_select_handler(self): @@ -66,25 +87,29 @@ def drop_select_handler(self): while True: input_tokens = self.pipe.recv_tensor() roi = self.pipe.recv_tensor() - tokens_roi = [input_tokens, roi] + tokens_roi_recver = [input_tokens, roi] matched_idx = None # perform input tokens and roi matching with self.buffer_lock: - for idx, tokens_roi_kv in enumerate(self.tokens_roi_kv_buffer): - if self._matches(tokens_roi_kv, tokens_roi): + for idx, tokens_roi_sender in enumerate(self.buffer): + if self._matches(tokens_roi_sender, tokens_roi_recver) > 0: matched_idx = idx break + + + print("Got a match ", matched_idx) if matched_idx is not None: # need to clone the tensor # in case the tensor is freed before sending finishes - matched_item = self.tokens_roi_kv_buffer[matched_idx] + matched_item = self.buffer[matched_idx] + print(matched_item) for tensor in matched_item: - self._send_tensor_and_dec_size(tensor.clone()) - del 
self.tokens_roi_kv_buffer[matched_idx] + self._send_tensor_and_dec_size(tensor) + del self.buffer[matched_idx] else: # no match, just send None @@ -96,9 +121,15 @@ def drop_select(self, input_tokens, roi): assert self.request_handling_thread is None, \ "drop_select should be called by the receiver" + + + if isinstance(input_tokens, torch.Tensor): + input_tokens = input_tokens.clone() + if isinstance(roi, torch.Tensor): + roi = roi.clone() - self.pipe.send_tensor(input_tokens.clone()) - self.pipe.send_tensor(roi.clone()) + self.pipe.send_tensor(input_tokens) + self.pipe.send_tensor(roi) input_tokens = self.pipe.recv_tensor() roi = self.pipe.recv_tensor() From b2c765c9444b3687aa8baeb098e2b6adc6089fc9 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 6 Sep 2024 06:37:09 +0000 Subject: [PATCH 164/278] finish lookup buffer test --- tests/kv_transfer/test_lookup_buffer.py | 142 +++++++++--------- .../simple_kv_lookup_buffer.py | 82 ++++++---- 2 files changed, 120 insertions(+), 104 deletions(-) diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 477eafbfafc2..aa98a7804ecd 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -9,24 +9,32 @@ def test_run(my_rank, buffer): - # test run - tokens = torch.tensor([1,2,3]).to(buffer.pipe.device) + # buffer should be empty in the beginning + if my_rank == 0: + assert buffer.buffer_size == 0 + assert len(buffer.buffer) == 0 + + + # insert + tokens = torch.tensor([1,2,3]).to(buffer.pipe.device) + roi = (tokens > 0) if my_rank == 0: key = 2.0 * torch.ones([5, 6]).to(buffer.pipe.device) value = 3.0 * torch.ones([5, 6]).to(buffer.pipe.device) placeholder = torch.tensor([1]).to(buffer.pipe.device) - buffer.insert(tokens, placeholder, key, value, placeholder) + buffer.insert(tokens, roi, key, value, placeholder) + torch.distributed.barrier() - else: - placeholder = torch.tensor([1]).to(buffer.pipe.device) - tok, roi, key, value, hidden = 
buffer.drop_select(tokens, placeholder) + # drop_select + if my_rank == 1: + tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi) assert torch.allclose(tokens, tok) + assert torch.allclose(roi, roi_) assert torch.allclose(key, 2.0 * torch.ones([5, 6])) assert torch.allclose(value, 3.0 * torch.ones([5, 6])) - torch.distributed.barrier() if my_rank == 0: @@ -34,84 +42,72 @@ def test_run(my_rank, buffer): assert len(buffer.buffer) == 0 -def stress_test(my_rank, pipe): +def stress_test(my_rank, buf): torch.distributed.barrier() + torch.manual_seed(100) + + device = buf.pipe.device - tensors = [] + reqs = [ + ( + torch.rand(100).to(device), # tokens + torch.ones(100).bool().to(device), # roi + torch.rand(100).to(device), # key + torch.rand(100).to(device), # value + torch.rand(100).to(device), # hidden + ) for i in range(200)] + + random.seed(my_rank) + random.shuffle(reqs) - for i in tqdm(range(2000)): - mean = random.randint(1, 10) - std = random.randint(1, 10) - size = [random.randint(900, 1000), random.randint(900, 1000)] - x = torch.normal(mean * 1.0, std * 1.0, size=size).to(pipe.device) - - # 5% probability of sending a None - if random.randint(1, 100) < 5: - tensors.append(None) - tensors.append(None) - tensors.append(None) - else: - tensors.append(x) - tensors.append(x.mean()) - tensors.append(x.std()) - torch.distributed.barrier() - for i in tqdm(range(2000)): - if my_rank == int((i % 10) > 3): - pipe.send_tensor(tensors[3*i]) - pipe.send_tensor(tensors[3*i+1]) - pipe.send_tensor(tensors[3*i+2]) + n = 0 + + # the buffer size can only store 100 reqs + # so the sender will occasionally block.needs to wait for the receiver. 
+ for req in tqdm(reqs): + if my_rank == 0: + buf.insert(*req) else: - x = pipe.recv_tensor() - mean = pipe.recv_tensor() - std = pipe.recv_tensor() - if x is None: - assert mean is None - assert std is None + tok, roi, k, v, h = req + tok_, roi_, k_, v_, h_ = buf.drop_select(tok, roi) + + if tok_ is None: + assert roi_ is None + assert k_ is None + assert v_ is None + assert h_ is None + n += 1 else: - assert x.mean() == mean - assert x.std() == std - + assert torch.allclose(tok, tok_) + assert torch.allclose(roi, roi_) + assert torch.allclose(k, k_) + assert torch.allclose(v, v_) + assert torch.allclose(h, h_) + print('Rand %d done' % my_rank) torch.distributed.barrier() - - print("Stress test passed.") + if my_rank == 0: + x = torch.tensor([0]) + torch.distributed.recv(x, 1) + # the # of None received is the kv that are not selected + assert x.item() == len(buf.buffer) + # and the size of the buffer should be 2000 * buffer len + print(buf.buffer_size) + assert buf.buffer_size == 1700 * len(buf.buffer) + else: + torch.distributed.send(torch.tensor([n]), 0) + + -def latency_test(my_rank, pipe, nelement, ntensor): + + - latencies = [] - torch.distributed.barrier() - for i in tqdm(range(1000)): - - tensors = [] - - if my_rank == 0: - # create tensor - tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)] - - torch.distributed.barrier() - - if my_rank == 0: - t = torch.tensor(time.time(), dtype=torch.float64).to(pipe.device) - for tensor in tensors: - pipe.send_tensor(tensor) - pipe.send_tensor(t) - else: - for _ in range(ntensor): - pipe.recv_tensor() - t = pipe.recv_tensor() - latencies.append(time.time() - t.item()) - - torch.distributed.barrier() - - print('Latency test passed.') - print('Latency:', torch.tensor(latencies).mean().item() * 1000, 'ms') - if __name__ == "__main__": @@ -127,6 +123,10 @@ def latency_test(my_rank, pipe, nelement, ntensor): pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "nccl") - buffer = 
sklb.SimpleKVLookupBuffer(pipe) + buffer = sklb.SimpleKVLookupBuffer(pipe, 170000) test_run(my_rank, buffer) + + stress_test(my_rank, buffer) + + print('Done') diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py index a5b0ee4c3c72..84566789f696 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py @@ -5,14 +5,16 @@ import threading import torch from collections import deque +import time class SimpleKVLookupBuffer(KVLookupBufferBase): - def __init__(self, pipe): + def __init__(self, pipe, buffer_size_thresh): self.buffer = deque() self.buffer_size = 0 + self.buffer_size_threshold = buffer_size_thresh self.buffer_lock = threading.Lock() self.pipe = pipe self.request_handling_thread = None @@ -33,13 +35,17 @@ def _matches(self, tokens_roi_sender, tokens_roi_recver): # semantics: DROP SELECT * LIMIT 1 # so any of the data in the buffer can be drop-selected return True + + + # I am assuming that roi is a mask on tokens + tokens_sender = tokens_sender[roi_sender] + tokens_recver = tokens_recver[roi_recver] min_length = min(len(tokens_sender), len(tokens_recver)) if torch.allclose(tokens_sender[:min_length], tokens_recver[:min_length]): # drastically simplified # common prefix matching - print("min length is ", min_length) return min_length return 0 @@ -75,7 +81,7 @@ def _add_to_buffer(self, input_tokens, roi, key, value, hidden): buffer_item = [input_tokens, roi, key, value, hidden] - + with self.buffer_lock: for data in buffer_item: self.buffer_size += self._get_element_size(data) @@ -83,38 +89,42 @@ def _add_to_buffer(self, input_tokens, roi, key, value, hidden): def drop_select_handler(self): + + try: - while True: - input_tokens = self.pipe.recv_tensor() - roi = self.pipe.recv_tensor() - tokens_roi_recver = [input_tokens, roi] - - matched_idx = None - 
- # perform input tokens and roi matching - with self.buffer_lock: + while True: + input_tokens = self.pipe.recv_tensor() + roi = self.pipe.recv_tensor() + tokens_roi_recver = [input_tokens, roi] - for idx, tokens_roi_sender in enumerate(self.buffer): - if self._matches(tokens_roi_sender, tokens_roi_recver) > 0: - matched_idx = idx - break + matched_length = 0 + + # perform input tokens and roi matching + with self.buffer_lock: + for _ in range(len(self.buffer)): + + temp_length = self._matches(self.buffer[0], tokens_roi_recver) + if temp_length > 0: + matched_length = temp_length + break + # rotate the element we just accessed to the end + self.buffer.rotate(-1) - print("Got a match ", matched_idx) - - if matched_idx is not None: - # need to clone the tensor - # in case the tensor is freed before sending finishes - matched_item = self.buffer[matched_idx] - print(matched_item) - for tensor in matched_item: - self._send_tensor_and_dec_size(tensor) - del self.buffer[matched_idx] - - else: - # no match, just send None - for _ in range(5): - self.pipe.send_tensor(None) + if matched_length > 0: + # need to clone the tensor + # in case the tensor is freed before sending finishes + matched_item = self.buffer.popleft() + for tensor in matched_item: + self._send_tensor_and_dec_size(tensor) + + else: + # no match, just send None + for _ in range(5): + self.pipe.send_tensor(None) + except RuntimeError as e: + if 'Connection closed by peer' not in str(e): + raise e def drop_select(self, input_tokens, roi): @@ -138,12 +148,18 @@ def drop_select(self, input_tokens, roi): hidden = self.pipe.recv_tensor() return [input_tokens, roi, key, value, hidden] + + + def full_handler(self): + time.sleep(0.001) def insert(self, input_tokens, roi, key, value, hidden) -> None: + + while self.buffer_size > self.buffer_size_threshold: + self.full_handler() - with self.buffer_lock: - self._add_to_buffer(input_tokens, roi, key, value, hidden) + self._add_to_buffer(input_tokens, roi, key, value, 
hidden) # when calling the insert, the current process is a sender # need to launch the request handler and start listening to request. From 8aef9dcf1e701b96f2956e9e432ef94f6ace7563 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sat, 7 Sep 2024 22:27:15 +0000 Subject: [PATCH 165/278] update parallel state to use the new class method --- vllm/distributed/distributed_kv.py | 480 ----------------------------- vllm/distributed/parallel_state.py | 12 +- 2 files changed, 2 insertions(+), 490 deletions(-) delete mode 100644 vllm/distributed/distributed_kv.py diff --git a/vllm/distributed/distributed_kv.py b/vllm/distributed/distributed_kv.py deleted file mode 100644 index 9005a325d6bf..000000000000 --- a/vllm/distributed/distributed_kv.py +++ /dev/null @@ -1,480 +0,0 @@ -"""vLLM distributed KV cache transfer API. -These APIs are used in `vllm/worker/model_runner.py`. - -Currently supporting TP and PP. - -Workflow: -- In prefill instance, KV cache sender *buffers* the KV cache send requests -- In decode instance - - KV cache receiver sends the hash of input tokens to sender - - KV cache sender executes send request - - KV cache receiver receives the KV cache -""" -from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING -from collections import defaultdict, deque -from concurrent.futures import ThreadPoolExecutor -from threading import Lock -from copy import deepcopy -import time -import threading - -import torch -from torch.distributed import Backend, ProcessGroup - -import vllm.envs as envs -from vllm.distributed.group_coordinator import GroupCoordinator -from vllm.logger import init_logger -import vllm.distributed.parallel_state as ps -from vllm import _custom_ops as ops -from vllm.sequence import IntermediateTensors - -assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode"], \ - "VLLM_DISAGG_PREFILL_ROLE can only be prefill or decode." 
- -IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE is not None) -IS_KV_PREFILL_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "prefill") -IS_KV_DECODE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "decode") - -# add a tag when sending/recving input hash -DISTRIBUTED_KV_GLOO_TAG = 24857323 - -logger = init_logger(__name__) - -import logging - - -class RankFilter(logging.Filter): - - def filter(self, record): - # Only log if rank is 4 - rank = 1 - try: - rank = torch.distributed.get_rank() - except Exception: - pass - return rank % 4 == 0 - - -for handler in logger.handlers: - handler.addFilter(RankFilter()) - - -class DistributedKVCoordinator(GroupCoordinator): - """ - A class designated for distributed KV transfer - - Target use cases: - 1. Disaggregated prefill - 2. Remote KV cache storage - - """ - - def __init__( - self, - group_ranks: List[List[int]], - local_rank: int, - torch_distributed_backend: Union[str, Backend], - # DO NOT use pynccl here - # Pynccl send is non-blocking - # and it's possible that the memory is freed before the data being sent - # which may happen at high qps - use_pynccl: bool = False, - use_custom_allreduce: bool = False, - use_tpu_communicator: bool = True, - use_message_queue_broadcaster: bool = False, - use_cpu_comm_for_sanity_check: bool = False, - ): - - super().__init__( - group_ranks, - local_rank, - torch_distributed_backend, - use_pynccl, - use_custom_allreduce, - use_tpu_communicator, - use_message_queue_broadcaster, - ) - - # if turned on, will use CPU-based communication to perform a series of sanity check. - # but it adds ~5ms delay, so please turn it off in performance-demanding usecases (e.g. 
disaggregated prefill) - self.use_cpu_comm_for_sanity_check = use_cpu_comm_for_sanity_check - - # use a threadpool to buffer send request in disaggregated prefill - self.input_hash_to_kv_sending_requests = defaultdict(deque) - self.kv_sending_thread = None - self.input_hash_to_kv_sending_requests_lock = Lock() - self.target_rank_for_send = self.ranks[(self.rank_in_group + 1) % - self.world_size] - self.target_rank_for_recv = self.ranks[(self.rank_in_group - 1) % - self.world_size] - - torch.set_default_device(self.device) - - def debug_send(self, - tensor: torch.Tensor, - dst: Optional[int] = None) -> None: - """Sends a tensor to the destination rank in a non-blocking way""" - """Will send several metadata. Useful for debugging.""" - """NOTE: `dst` is the local rank of the destination rank.""" - - self.send_tensor_dict( - { - "tensor": tensor, - "mean": tensor.float().mean(), - "shape": tensor.shape - }, dst) - - def debug_recv(self, - size: torch.Size, - dtype: torch.dtype, - src: Optional[int] = None) -> torch.Tensor: - """Receives a tensor from the src rank.""" - """NOTE: `src` is the local rank of the destination rank.""" - - result = self.recv_tensor_dict(src) - tensor = result["tensor"] - assert torch.allclose(result["mean"], tensor.float().mean()) - assert result["shape"] == tensor.shape - assert result[ - "shape"] == size, f"The shape sent by sender is {result['shape']} but trying to receive {size}" - return tensor - - def kv_cache_send(self, - input_hash: int, - tensor: Union[torch.Tensor, IntermediateTensors], - is_hidden: bool = False, - dst: Optional[int] = None) -> None: - """Push the KV cache send request into the send buffer""" - """NOTE: `dst` is the local rank of the destination rank.""" - - if self.use_cpu_comm_for_sanity_check: - send_func = self.debug_send - else: - send_func = self.send - - if is_hidden and not ps.get_pp_group().is_last_rank: - - assert isinstance(tensor, IntermediateTensors) - - output = deepcopy(tensor.tensors) - for key in 
output: - output[key] = output[key].contiguous() - - self.input_hash_to_kv_sending_requests[input_hash].append( - [self.send_tensor_dict, output, dst]) - - else: - - assert isinstance(tensor, torch.Tensor) - - self.input_hash_to_kv_sending_requests[input_hash].append([ - send_func, - # use clone to make sure the tensor is contiguous - tensor.clone(), - dst - ]) - - def kv_cache_recv( - self, - size: torch.Size, - dtype: torch.dtype, - is_hidden: bool = False, - src: Optional[int] = None - ) -> Union[torch.Tensor, IntermediateTensors]: - """Receives a tensor from the src rank (blocking).""" - """This API should be used together with `push`""" - """NOTE: `src` is the local rank of the destination rank.""" - - if self.use_cpu_comm_for_sanity_check: - recv_func = self.debug_recv - else: - recv_func = self.recv - - if is_hidden and not ps.get_pp_group().is_last_rank: - tensor = IntermediateTensors(self.recv_tensor_dict(src)) - else: - tensor = recv_func(size, dtype, src) - - return tensor - - def send_input_hash(self, input_hash: int) -> int: - - logger.debug('[rank%d]: Sending input hash %d to rank %d', - torch.distributed.get_rank(), input_hash, - self.target_rank_for_send) - - # KV cache send go through CPU, and the original `send` only use GPU. - # So create a new group for sending input hash. 
- input_hash_tensor = torch.tensor([input_hash], device="cpu").long() - torch.distributed.send(input_hash_tensor, - self.target_rank_for_send, - self.cpu_group, - tag=DISTRIBUTED_KV_GLOO_TAG) - return_tensor = torch.tensor([0], device="cpu").long() - torch.distributed.recv(return_tensor, - self.target_rank_for_recv, - self.cpu_group, - tag=DISTRIBUTED_KV_GLOO_TAG) - return return_tensor.item() - - def recv_input_hash(self) -> Optional[int]: - ''' - Receive an input hash, and check if it is already cached - ''' - input_hash_tensor = torch.tensor([0], device="cpu").long() - torch.distributed.recv(input_hash_tensor, - self.target_rank_for_recv, - self.cpu_group, - tag=DISTRIBUTED_KV_GLOO_TAG) - input_hash = input_hash_tensor.item() - # a new input hash comes in, see if it is already cached - self.input_hash_to_kv_sending_requests_lock.acquire() - logger.debug('Successfully received input hash %d', input_hash) - if input_hash not in self.input_hash_to_kv_sending_requests: - logger.warning( - f"The KV cache of {input_hash} does not exist. 
"\ - f"Existing input hash: {list(self.input_hash_to_kv_sending_requests.keys())}") - - # 0 for fail - x = torch.tensor([0], device="cpu").long() - torch.distributed.send(x, - self.target_rank_for_send, - self.cpu_group, - tag=DISTRIBUTED_KV_GLOO_TAG) - return None - else: - logger.debug('Input hash %d exists, start sending', input_hash) - - # 1 for success - x = torch.tensor([1], device="cpu").long() - torch.distributed.send(x, - self.target_rank_for_send, - self.cpu_group, - tag=DISTRIBUTED_KV_GLOO_TAG) - return input_hash - - def kv_cache_send_loop(self): - - while True: - logger.debug( - '[rank%d]: Waiting for input hash from rank %d, my keys are %s', - torch.distributed.get_rank(), - self.target_rank_for_recv, - list(self.input_hash_to_kv_sending_requests.keys()), - ) - # wait for a new input hash - # this function will acquire the lock - input_hash = self.recv_input_hash() - if input_hash is None: - self.input_hash_to_kv_sending_requests_lock.release() - continue - - # execute corresponding kv cache sending jobs in request queue - while True: - request = self.input_hash_to_kv_sending_requests[ - input_hash].popleft() - # An empty request: the KV cahe of one request are all sent - if request == []: - break - - request[0](*request[1:]) - - if len(self.input_hash_to_kv_sending_requests[input_hash]) == 0: - logger.debug('Finish input hash %d, free GPU memory...', - input_hash) - del self.input_hash_to_kv_sending_requests[input_hash] - else: - logger.debug( - 'The buffer for input hash %d is not empty, meaning that '\ - 'there are two jobs with identical input.', - input_hash) - - self.input_hash_to_kv_sending_requests_lock.release() - - - def kv_cache_send_ready(self, input_hash: int): - - if self.kv_sending_thread is None: - self.kv_sending_thread = threading.Thread( - target=self.kv_cache_send_loop) - self.kv_sending_thread.start() - - # append an empty list to separate requests - # as there might be identical requests, that has the same input hash - 
self.input_hash_to_kv_sending_requests[input_hash].append([]) - logger.debug(f'Buffered input hash {input_hash}') - - def kv_cache_recv_start(self, input_hash: int): - # notify the kv cache sender with the input hash id - return self.send_input_hash(input_hash) - - def block_if_buffer_full(self): - - # block vLLM if the KV cache sending buffer is full - # TODO: allow using other policies to handle buffer full - while True: - self.input_hash_to_kv_sending_requests_lock.acquire() - if len(self.input_hash_to_kv_sending_requests.keys()) > 40: - self.input_hash_to_kv_sending_requests_lock.release() - time.sleep(0.1) - else: - self.input_hash_to_kv_sending_requests_lock.release() - break - - -def send_kv_caches_and_hidden_states( - model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor], - hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], -) -> None: - - input_tokens_tuple = tuple(model_input.input_tokens.tolist()) - seq_lens = model_input.attn_metadata.seq_lens - slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - - # Assumption: current batch is all-prefill requests - assert torch.allclose(model_input.attn_metadata.query_start_loc, - model_input.attn_metadata.seq_start_loc) - assert torch.all(model_input.attn_metadata.context_lens_tensor == 0) - - ps.get_disagg_group().input_hash_to_kv_sending_requests_lock.acquire() - - # query_lens contains new KV caches that are added to vLLM. - # so we will send them to decode instance - # FIXME(Kuntai): This assume that all requests are prefill. 
- for idx, slen in enumerate(seq_lens): - - start_pos = sum(seq_lens[:idx]) - end_pos = start_pos + slen - input_hash = hash(input_tokens_tuple[start_pos:end_pos]) - - for i in range(model_executable.model.start_layer, - model_executable.model.end_layer): - kv_cache = kv_caches[i - model_executable.model.start_layer] - - _, _, num_heads, head_size = kv_cache[0].shape - - key_cache = kv_cache[0].reshape(-1, num_heads, head_size) - value_cache = kv_cache[1].reshape(-1, num_heads, head_size) - - current_slot_mapping = slot_mapping[start_pos:end_pos] - - ps.get_disagg_group().kv_cache_send( - input_hash, key_cache[current_slot_mapping]) - ps.get_disagg_group().kv_cache_send( - input_hash, value_cache[current_slot_mapping]) - - ps.get_disagg_group().kv_cache_send( - input_hash, - hidden_or_intermediate_states[start_pos:end_pos], - is_hidden=True) - ps.get_disagg_group().kv_cache_send_ready(input_hash) - - ps.get_disagg_group().input_hash_to_kv_sending_requests_lock.release() - - ps.get_disagg_group().block_if_buffer_full() - - logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) - - -def recv_kv_caches_and_hidden_states( - model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor] -) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool]: - - bypass_model_exec = True - - # This is disagg decode instance, during prefill state - # Need to receive KV from the prefill instance - input_tokens_tuple = tuple(model_input.input_tokens.tolist()) - seq_lens = model_input.attn_metadata.seq_lens - slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - - # Assumption: current batch is all-prefill requests - assert torch.allclose(model_input.attn_metadata.query_start_loc, - model_input.attn_metadata.seq_start_loc) - assert torch.all(model_input.attn_metadata.context_lens_tensor == 0) - - hidden_or_intermediate_states_for_one_req = [] - - # enumerate different requests - # FIXME(Kuntai): This 
impl assumes that all requests are prefill. - for idx, slen in enumerate(seq_lens): - - start_pos = sum(seq_lens[:idx]) - end_pos = start_pos + slen - input_hash = hash(input_tokens_tuple[start_pos:end_pos]) - num_tokens = slen - - # notify the prefill instance to start sending KVs associated with input_hash - contain = ps.get_disagg_group().kv_cache_recv_start(input_hash) - - # fail to find input_hash in prefill instance - # this can occur but idk why... - if contain == 0: - bypass_model_exec = False - continue - - # receive KV cache from disaggregated prefill instance - for i in range(model_executable.model.start_layer, - model_executable.model.end_layer): - - # get kv cache - kv_cache = kv_caches[i - model_executable.model.start_layer] - # get corresponding layer - layer = model_executable.model.layers[i] - - # get kv cache shape (after sliced by tp) - _, _, num_heads, head_size = kv_cache[0].shape - key = ps.get_disagg_group().kv_cache_recv( - torch.Size([num_tokens, num_heads, head_size]), - kv_cache[0].dtype) - value = ps.get_disagg_group().kv_cache_recv( - torch.Size([num_tokens, num_heads, head_size]), - kv_cache[0].dtype) - - key_cache, value_cache = kv_cache[0], kv_cache[1] - ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - slot_mapping[start_pos:end_pos], - layer.self_attn.attn.kv_cache_dtype, - layer.self_attn.attn._k_scale, - layer.self_attn.attn._v_scale, - ) - - hidden_or_intermediate_states_for_one_req.append( - ps.get_disagg_group().kv_cache_recv(torch.Size( - [num_tokens, model_executable.config.hidden_size]), - kv_cache[0].dtype, - is_hidden=True)) - - if not bypass_model_exec: - # Some of the KV cache is not retrieved - # so we need to recompute the hidden state - return [], bypass_model_exec - - # concatenate hidden states from different requests - if isinstance(hidden_or_intermediate_states_for_one_req[0], torch.Tensor): - hidden_or_intermediate_states = torch.cat( - hidden_or_intermediate_states_for_one_req, dim=0) - 
else: - # concat the IntermediateTensors - keys = list( - hidden_or_intermediate_states_for_one_req[0].tensors.keys()) - result_its = {} - - for key in keys: - result_its[key] = [] - for its in hidden_or_intermediate_states_for_one_req: - result_its[key].append(its[key]) - result_its[key] = torch.cat(result_its[key], dim=0) - - hidden_or_intermediate_states = IntermediateTensors(result_its) - - logger.debug("[rank%d]: KV recv DONE.", torch.distributed.get_rank()) - return hidden_or_intermediate_states, bypass_model_exec diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 86e26b46c9e9..22bc109833c1 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -39,7 +39,7 @@ import vllm.envs as envs from vllm.logger import init_logger from vllm.distributed.group_coordinator import GroupCoordinator -import vllm.distributed.distributed_kv as dist_kv +import vllm.distributed.kv_transfer.vllm_adapter as dist_kv @@ -350,19 +350,11 @@ def initialize_model_parallel( # decode global rank: i + world_size group_ranks.append([i, i + world_size]) logger.debug("Distributed group is %s", str(group_ranks)) - _DISAGG = dist_kv.DistributedKVCoordinator( + _DISAGG = dist_kv.KV_transfer_agent( group_ranks=group_ranks, local_rank=get_world_group().local_rank, torch_distributed_backend=backend, ) - # follow by a warmup, to warmup nccl - # necessary, as NCCL may not be warmed up when tp and pp are both 1. 
- temp_tensor = torch.tensor([1.]).to(_DISAGG.device) - if dist_kv.IS_KV_PREFILL_INSTANCE: - _DISAGG.send(temp_tensor) - else: - recv_tensor = _DISAGG.recv(temp_tensor.shape, temp_tensor.dtype) - assert torch.allclose(temp_tensor, recv_tensor) logger.debug("_DISAGG initialized for rank %d", torch.distributed.get_rank()) From 1b6125d0af553379ed6bd5f9f64a86cb6c425a55 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 8 Sep 2024 01:13:29 +0000 Subject: [PATCH 166/278] move the implementatio to worker_base.py --- vllm/distributed/group_coordinator.py | 5 -- vllm/distributed/kv_transfer/vllm_adapter.py | 38 ++++----- vllm/distributed/parallel_state.py | 4 +- vllm/executor/gpu_executor.py | 2 +- vllm/executor/multiproc_gpu_executor.py | 2 +- vllm/executor/ray_gpu_executor.py | 2 +- vllm/worker/model_runner.py | 72 +++++----------- vllm/worker/worker_base.py | 89 ++++++++++++++++++-- 8 files changed, 129 insertions(+), 85 deletions(-) diff --git a/vllm/distributed/group_coordinator.py b/vllm/distributed/group_coordinator.py index bfa3c7f3c17c..e043beeb9969 100644 --- a/vllm/distributed/group_coordinator.py +++ b/vllm/distributed/group_coordinator.py @@ -387,7 +387,6 @@ def broadcast_object_list(self, """Broadcast the input object list. NOTE: `src` is the local rank of the source rank. """ - assert src < self.world_size, f"Invalid src rank ({src})" # Bypass the function if we are using only 1 GPU. if self.world_size == 1: @@ -402,7 +401,6 @@ def send_object(self, obj: Any, dst: int) -> None: """Send the input object list to the destination rank.""" """NOTE: `dst` is the local rank of the destination rank.""" - assert dst < self.world_size, f"Invalid dst rank ({dst})" assert dst != self.rank_in_group, ( "Invalid destination rank. 
Destination rank is the same " @@ -432,7 +430,6 @@ def recv_object(self, src: int) -> Any: """Receive the input object list from the source rank.""" """NOTE: `src` is the local rank of the source rank.""" - assert src < self.world_size, f"Invalid src rank ({src})" assert src != self.rank_in_group, ( "Invalid source rank. Source rank is the same as the current rank." @@ -570,7 +567,6 @@ def send_tensor_dict( if dst is None: dst = (self.rank_in_group + 1) % self.world_size - assert dst < self.world_size, f"Invalid dst rank ({dst})" metadata_list: List[Tuple[Any, Any]] = [] assert isinstance( @@ -625,7 +621,6 @@ def recv_tensor_dict( if src is None: src = (self.rank_in_group - 1) % self.world_size - assert src < self.world_size, f"Invalid src rank ({src})" recv_metadata_list = self.recv_object(src=src) tensor_dict: Dict[str, Any] = {} diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 07358567e783..c5747330d1ac 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -30,19 +30,19 @@ from vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe import TorchDistributedPipe from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer import SimpleKVLookupBuffer -assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode", "lmc"], \ - "VLLM_DISAGG_PREFILL_ROLE can only be prefill or decode." +assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode", "lmcache"], \ + "VLLM_DISAGG_PREFILL_ROLE can only be prefill, decode or lmcache." + +# currently the connections are hard-coded. 
+# we only handle 2 cases: +# - prefill vLLM --> decode vLLM +# - vLLM --> LMCache IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"]) IS_KV_PREFILL_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "prefill") IS_KV_DECODE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "decode") +IS_LMCACHE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "lmcache") -'''Jiayi starts here''' -IS_LMC_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "lmc") -'''Jiayi ends here''' - -# add a tag when sending/recving input hash -DISTRIBUTED_KV_GLOO_TAG = 24857323 logger = init_logger(__name__) @@ -80,7 +80,7 @@ def __init__( use_message_queue_broadcaster, ) # init lookup buffer - self.buffer = SimpleKVLookupBuffer(self.pipe) + self.buffer = SimpleKVLookupBuffer(self.pipe, 1000**3) def send_kv_caches_and_hidden_states( self, @@ -90,7 +90,6 @@ def send_kv_caches_and_hidden_states( hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: - #input_tokens_tuple = tuple(model_input.input_tokens.tolist()) input_tokens_tensor = model_input.input_tokens seq_lens = model_input.attn_metadata.seq_lens slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten() @@ -99,16 +98,15 @@ def send_kv_caches_and_hidden_states( # so we will send them to decode instance # FIXME(Kuntai): This assume that all requests are prefill. 
for idx, slen in enumerate(seq_lens): - logger.debug(f"sending request {idx}") start_pos = sum(seq_lens[:idx]) end_pos = start_pos + slen + current_tokens = input_tokens_tensor[start_pos:end_pos] keys, values = [], [] for l in range(model_executable.model.start_layer, model_executable.model.end_layer): - logger.debug(f"sending layer {l}") kv_cache = kv_caches[l - model_executable.model.start_layer] _, _, num_heads, head_size = kv_cache[0].shape @@ -118,14 +116,14 @@ def send_kv_caches_and_hidden_states( current_slot_mapping = slot_mapping_flat[start_pos:end_pos] - keys.append(key_cache[current_slot_mapping].unsqeeze(0)) + keys.append(key_cache[current_slot_mapping].unsqueeze(0)) values.append(value_cache[current_slot_mapping].unsqueeze(0)) keys = torch.cat(keys, dim=0) values = torch.cat(values, dim=0) self.buffer.insert( - input_tokens_tensor[start_pos:end_pos], - None, + current_tokens, + torch.ones_like(current_tokens, dtype=bool), keys, values, hidden_or_intermediate_states[start_pos:end_pos] @@ -146,7 +144,7 @@ def recv_kv_caches_and_hidden_states( # This is disagg decode instance, during prefill state # Need to receive KV from the prefill instance - input_tokens_tuple = tuple(model_input.input_tokens.tolist()) + input_tokens_tensor = model_input.input_tokens seq_lens = model_input.attn_metadata.seq_lens slot_mapping = model_input.attn_metadata.slot_mapping.flatten() @@ -158,10 +156,12 @@ def recv_kv_caches_and_hidden_states( start_pos = sum(seq_lens[:idx]) end_pos = start_pos + slen - current_input_tokens = input_tokens_tuple[start_pos:end_pos] + current_tokens = input_tokens_tensor[start_pos:end_pos] num_tokens = slen - ret = self.buffer.drop_select(current_input_tokens, None) + ret = self.buffer.drop_select( + current_tokens, + torch.ones_like(current_tokens, dtype=bool)) if ret[0] is None: # didn't find any match. 
self.bypass_model_exec = False @@ -202,4 +202,4 @@ def recv_kv_caches_and_hidden_states( hidden_or_intermediate_states_for_one_req, dim=0) logger.debug("[rank%d]: KV recv DONE.", torch.distributed.get_rank()) - return hidden_or_intermediate_states, bypass_model_exec \ No newline at end of file + return hidden_or_intermediate_states, bypass_model_exec, model_input \ No newline at end of file diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 22bc109833c1..c48b113de970 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -107,10 +107,10 @@ def get_pp_group() -> GroupCoordinator: # kept for backward compatibility get_pipeline_model_parallel_group = get_pp_group -_DISAGG: Optional[dist_kv.DistributedKVCoordinator] = None +_DISAGG: Optional[dist_kv.KV_transfer_agent] = None -def get_disagg_group() -> dist_kv.DistributedKVCoordinator: +def get_disagg_group() -> dist_kv.KV_transfer_agent: assert _DISAGG is not None, ( "disaggregated prefill parallel group is not initialized") return _DISAGG diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 300e9a33eba5..c52c21600d03 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -1,6 +1,6 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union -import vllm.distributed.distributed_kv as dist_kv +import vllm.distributed.kv_transfer.vllm_adapter as dist_kv from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.logger import init_logger from vllm.lora.request import LoRARequest diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index ba222f8b5e40..7857eab2e551 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -8,7 +8,7 @@ import torch -import vllm.distributed.distributed_kv as dist_kv +import vllm.distributed.kv_transfer.vllm_adapter as dist_kv from 
vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.gpu_executor import create_worker diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index 17f4d3633886..0d14d65d26ca 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple import vllm.envs as envs -import vllm.distributed.distributed_kv as dist_kv +import vllm.distributed.kv_transfer.vllm_adapter as dist_kv from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.ray_utils import RayWorkerWrapper, ray diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 4d8105bde2c4..6c2982f6f781 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -12,7 +12,6 @@ import torch.distributed import torch.nn as nn -import vllm.distributed.distributed_kv as dist_kv try: from flashinfer import BatchDecodeWithPagedKVCacheWrapper @@ -1366,56 +1365,25 @@ def execute_model( "request_ids_to_seq_ids": model_input.request_ids_to_seq_ids, } if self.has_seqlen_agnostic else {} - # check if the current run is profiling - is_profile_run = (kv_caches is None) or (kv_caches[0] is None) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None + hidden_or_intermediate_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalInputs.as_kwargs(multi_modal_kwargs, + device=self.device), + **seqlen_agnostic_kwargs) - # for disaggregated prefilling: allow bypassing model execution - bypass_model_exec = False - - # Recv kv cache for disaggregated prefill - # Skip model execution if all required KV cache 
are received - if all([ - is_prefill_run, - dist_kv.IS_KV_DECODE_INSTANCE, - not is_profile_run]): - - hidden_or_intermediate_states, bypass = \ - dist_kv.recv_kv_caches_and_hidden_states( - model_executable, - model_input, - kv_caches, - ) - if bypass: - bypass_model_exec = True - - if not bypass_model_exec: - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) - - # Send KV cache for disaggregated prefill - if all([ - is_prefill_run, - dist_kv.IS_KV_PREFILL_INSTANCE, - not is_profile_run]): - - dist_kv.send_kv_caches_and_hidden_states( - model_executable, - model_input, - kv_caches, - hidden_or_intermediate_states, - ) - + return hidden_or_intermediate_states - # Compute the logits in the last pipeline stage. 
+ @torch.inference_mode() + def postprocess_model( + self, + model_input, + hidden_or_intermediate_states, + + ): if not get_pp_group().is_last_rank: return hidden_or_intermediate_states @@ -1431,7 +1399,7 @@ def execute_model( sampling_metadata=model_input.sampling_metadata, ) - + decode_meta = model_input.attn_metadata.decode_metadata if self.return_hidden_states: # we only need to pass hidden states of most recent token assert model_input.sampling_metadata is not None @@ -1447,7 +1415,9 @@ def execute_model( output.hidden_states = hidden_states return [output] - + + + class CUDAGraphRunner: diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index e56440693b89..6fd94312483e 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -16,6 +16,9 @@ update_environment_variables) from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase +import vllm.distributed.kv_transfer.vllm_adapter as dist_kv +import vllm.distributed.parallel_state as ps + logger = init_logger(__name__) @@ -212,6 +215,7 @@ def execute_worker(self, worker_input: WorkerInput) -> None: Process an execution request. """ raise NotImplementedError + def execute_model( self, @@ -269,11 +273,52 @@ def execute_model( intermediate_tensors = IntermediateTensors( get_pp_group().recv_tensor_dict( all_gather_group=get_tp_group())) - - output = self.model_runner.execute_model( - model_input, self.kv_cache[worker_input.virtual_engine] - if self.kv_cache is not None else None, intermediate_tensors, - num_steps) + + + # for disaggregated prefilling: allow bypassing model execution + bypass_model_exec = False + + + # receive KV cache. + # NOTE(kuntai): + # If only a part of KV cache is received, we will adjust model_input + # to avoid prefill on the part of KV caches that are already received. + # This will not happen for disaggregated prefill, but will happen + # when connecting to a KV cache database (like LMCache). 
+ if self.need_recv_kv(model_input, worker_input): + hidden_or_intermediate_states, bypass_model_exec, model_input = \ + ps.get_disagg_group().recv_kv_caches_and_hidden_states( + # model is used to know which layer the current worker + # is working on, so that we can receive KV for only those + # layers. + self.model_runner.model, + model_input, + self.kv_cache[worker_input.virtual_engine], + ) + + if not bypass_model_exec: + hidden_or_intermediate_states = self.model_runner.execute_model( + model_input, self.kv_cache[worker_input.virtual_engine] + if self.kv_cache is not None else None, intermediate_tensors, + num_steps) + + # sending out KV cache + if self.need_send_kv(model_input, worker_input): + ps.get_disagg_group().send_kv_caches_and_hidden_states( + # model is used to know which layer the current worker + # is working on, so that we can send KV for only those + # layers. + self.model_runner.model, + model_input, + self.kv_cache[worker_input.virtual_engine], + hidden_or_intermediate_states, + ) + + # Get model output based on hidden state. 
+ output = self.model_runner.postprocess_model( + model_input, + hidden_or_intermediate_states, + ) if not get_pp_group().is_last_rank: # output is IntermediateTensors @@ -284,6 +329,40 @@ def execute_model( # output is List[SamplerOutput] return output + def need_recv_kv(self, model_input, worker_input) -> bool: + + kv_caches = self.kv_cache[worker_input.virtual_engine] + prefill_meta = model_input.attn_metadata.prefill_metadata + + # check if the current run is profiling + is_profile_run = (kv_caches is None) or (kv_caches[0] is None) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + # for disaggregated prefilling: allow bypassing model execution + + return all([ + is_prefill_run, + dist_kv.IS_KV_DECODE_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE, + not is_profile_run]) + + + def need_send_kv(self, model_input, worker_input) -> bool: + + kv_caches = self.kv_cache[worker_input.virtual_engine] + prefill_meta = model_input.attn_metadata.prefill_metadata + model_executable = self.model_runner.model + + # check if the current run is profiling + is_profile_run = (kv_caches is None) or (kv_caches[0] is None) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + + return all([ + is_prefill_run, + dist_kv.IS_KV_PREFILL_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE, + not is_profile_run]) + + def _execute_model_spmd( self, execute_model_req: ExecuteModelRequest, From c4102ef057210f4a5a8ac99cb125cf6c95043167 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 8 Sep 2024 08:14:15 +0000 Subject: [PATCH 167/278] update test --- tests/kv_transfer/test_send_recv.py | 36 ++++++++++++++++++----------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 5a2a72cf177f..95a7528f0f7a 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -9,12 +9,15 @@ def test_run(my_rank, pipe): # test run + x = 
torch.tensor([1]).to(pipe.device) + y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device) if my_rank == 0: - x = torch.tensor([1]).to(pipe.device) + pipe.send_tensor(x) + pipe.send_tensor(y) else: - y = pipe.recv_tensor() - assert y.item() == 1 + assert torch.allclose(x, pipe.recv_tensor()) + assert torch.allclose(y, pipe.recv_tensor()) def stress_test(my_rank, pipe): @@ -23,21 +26,24 @@ def stress_test(my_rank, pipe): tensors = [] + for i in tqdm(range(2000)): - mean = random.randint(1, 10) - std = random.randint(1, 10) - size = [random.randint(900, 1000), random.randint(900, 1000)] - x = torch.normal(mean * 1.0, std * 1.0, size=size).to(pipe.device) + mean = torch.rand(1).item() + std = torch.rand(1).item() + size = torch.randint(900, 1000, (2,)) + x = torch.normal(mean * 1.0, std * 1.0, size=size.tolist()).to(pipe.device) # 5% probability of sending a None - if random.randint(1, 100) < 5: + if torch.rand(1).item() < 0.05: tensors.append(None) tensors.append(None) tensors.append(None) else: tensors.append(x) - tensors.append(x.mean()) - tensors.append(x.std()) + tensors.append(x.mean().unsqueeze(0)) + tensors.append(x.std().unsqueeze(0)) + + torch.distributed.barrier() @@ -54,8 +60,9 @@ def stress_test(my_rank, pipe): assert mean is None assert std is None else: - assert x.mean() == mean - assert x.std() == std + assert torch.allclose(x, tensors[3*i]) + assert x.mean() == mean[0] + assert x.std() == std[0] torch.distributed.barrier() @@ -80,7 +87,7 @@ def latency_test(my_rank, pipe, nelement, ntensor): torch.distributed.barrier() if my_rank == 0: - t = torch.tensor(time.time(), dtype=torch.float64).to(pipe.device) + t = torch.tensor([time.time()], dtype=torch.float64).to(pipe.device) for tensor in tensors: pipe.send_tensor(tensor) pipe.send_tensor(t) @@ -110,7 +117,8 @@ def latency_test(my_rank, pipe, nelement, ntensor): pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "nccl") - + + torch.manual_seed(0) test_run(my_rank, pipe) stress_test(my_rank, pipe) 
latency_test(my_rank, pipe, 1024*8*128, 80) From a576532438e54bc545b86dc19d4292a32539492e Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 8 Sep 2024 08:14:39 +0000 Subject: [PATCH 168/278] update a new implementation for distributed pipe. Much less CPU communication --- .../simple_kv_lookup_buffer.py | 5 + .../kv_pipe/torch_distributed_pipe.py | 251 +++++++++++++++--- vllm/distributed/kv_transfer/vllm_adapter.py | 10 +- 3 files changed, 225 insertions(+), 41 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py index 84566789f696..407ac7c9bcfc 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py @@ -7,6 +7,10 @@ from collections import deque import time +from vllm.logger import init_logger + +logger = init_logger(__name__) + class SimpleKVLookupBuffer(KVLookupBufferBase): def __init__(self, pipe, buffer_size_thresh): @@ -157,6 +161,7 @@ def full_handler(self): def insert(self, input_tokens, roi, key, value, hidden) -> None: while self.buffer_size > self.buffer_size_threshold: + logger.debug("KV transfer buffer is full. 
Handling...") self.full_handler() self._add_to_buffer(input_tokens, roi, key, value, hidden) diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index d3663ac7667d..1bf54badac25 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -3,16 +3,76 @@ from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase from torch.distributed import Backend, ProcessGroup import torch -from typing import List, Union, Optional +from typing import Any, Dict, List, Optional, Tuple, Union import threading from concurrent.futures import ThreadPoolExecutor import time import threading +from collections import namedtuple +from typing import Dict, Any, Tuple, List +import pickle + +from vllm.logger import init_logger + + +logger = init_logger(__name__) + + +# auxilary function to send tensordict +TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"]) + +def _split_tensor_dict( + tensor_dict: Dict[str, Union[torch.Tensor, Any]] +) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]: + """Split the tensor dictionary into two parts: + 1. A list of (key, value) pairs. If the value is a tensor, it is replaced + by its metadata. + 2. A list of tensors. + """ + metadata_list: List[Tuple[str, Any]] = [] + tensor_list: List[torch.Tensor] = [] + for key, value in tensor_dict.items(): + if isinstance(value, torch.Tensor): + # Note: we cannot use `value.device` here, + # because it contains not only the device type but also the device + # index (e.g. "cuda:0"). We only need the device type. + # receiving side will set the device index. 
+ device = value.device.type + metadata_list.append( + (key, TensorMetadata(device, value.dtype, value.size()))) + tensor_list.append(value) + else: + metadata_list.append((key, value)) + return metadata_list, tensor_list # if the tensor is only one-element and only contains this number # this means that the sended object is None. NONE_INT = -150886311 +FLOAT16_INT = -543205003776624 +INT64_INT = -375623078607432 +BOOL_INT = -28035262008646 +BFLOAT16_INT = -452084912267662 +FLOAT32_INT = -1049557997456592 +FLOAT64_INT = -452201007054137 + +DTYPE2INT = { + torch.float16: FLOAT16_INT, + torch.int64: INT64_INT, + torch.bool: BOOL_INT, + torch.bfloat16: BFLOAT16_INT, + torch.float32: FLOAT32_INT, + torch.float64: FLOAT64_INT, +} + +INT2DTYPE = { + FLOAT16_INT: torch.float16, + INT64_INT: torch.int64, + BOOL_INT: torch.bool, + BFLOAT16_INT: torch.bfloat16, + FLOAT32_INT: torch.float32, + FLOAT64_INT: torch.float64, +} class BrokenPipeException(Exception): @@ -20,32 +80,41 @@ def __init__(self, message): self.message = message super().__init__(self.message) -class TorchDistributedPipe(KVPipeBase, GroupCoordinator): +class TorchDistributedPipe(KVPipeBase): def __init__( self, group_ranks: List[List[int]], local_rank: int, - torch_distributed_backend: Union[str, Backend], - # DO NOT use pynccl here - # Pynccl send is non-blocking - # and it's possible that the memory is freed before the data being sent - # which may happen at high qps - use_pynccl: bool = False, - use_custom_allreduce: bool = False, - use_tpu_communicator: bool = True, - use_message_queue_broadcaster: bool = False, + torch_distributed_backend: Union[str, Backend] ): - super().__init__( - group_ranks, - local_rank, - torch_distributed_backend, - use_pynccl, - use_custom_allreduce, - use_tpu_communicator, - use_message_queue_broadcaster, - ) + self.rank = torch.distributed.get_rank() + self.local_rank = local_rank + self.device_group = None + self.cpu_group = None + + for ranks in group_ranks: + device_group 
= torch.distributed.new_group( + ranks, backend=torch_distributed_backend) + # a group with `gloo` backend, to allow direct coordination between + # processes through the CPU. + cpu_group = torch.distributed.new_group(ranks, backend="gloo") + if self.rank in ranks: + self.ranks = ranks + self.world_size = len(ranks) + self.rank_in_group = ranks.index(self.rank) + self.device_group = device_group + self.cpu_group = cpu_group + + assert self.cpu_group is not None + assert self.device_group is not None + assert self.rank_in_group <= 1 + + if torch.cuda.is_available(): + self.device = torch.device(f"cuda:{local_rank}") + else: + self.device = torch.device("cpu") # if turned on, will use CPU-based communication to perform a series of sanity check. # but it adds ~5ms delay, so please turn it off in performance-demanding usecases (e.g. disaggregated prefill) @@ -61,19 +130,131 @@ def __init__( self.none_tensor = torch.tensor([NONE_INT]).to(self.device) self.broken = False + + # create a dummy tensor + # this tensor is used + self.dummy_cpu_tensor_for_send = torch.tensor([1],device='cpu') + self.dummy_cpu_tensor_for_recv = torch.tensor([1],device='cpu') + + self.dtype_tensor_for_recv = torch.tensor([0]).to(self.device) + self.numdim_tensor_for_recv = torch.tensor([-1]).to(self.device) + self.dims_tensor_for_recv = torch.ones([100], dtype=int).to(self.device) + + def quick_send(self, tensor, prep): + + group = self.device_group + + # NCCL is NOT fully duplex + # need to explicitly sync using CPU + # to guarantee that there is only 1-directional data happening now + torch.distributed.send( + self.dummy_cpu_tensor_for_send, + dst=self.target_rank_for_send, + group=self.cpu_group + ) + + torch.distributed.send( + prep['dtype'], + dst=self.target_rank_for_send, + group=group + ) + torch.distributed.send( + prep['numdim'], + dst=self.target_rank_for_send, + group=group + ) + torch.distributed.send( + prep['dims'], + dst=self.target_rank_for_send, + group=group + ) + 
torch.distributed.send( + tensor, + dst=self.target_rank_for_send, + group=group + ) + + + def quick_recv(self): + + # receive is sequential, so we can reuse the GPU buffer + group = self.device_group + + # NCCL is NOT fully duplex + # need to explicitly sync using CPU + # to guarantee that there is only 1-directional data happening now + torch.distributed.recv( + self.dummy_cpu_tensor_for_recv, + src=self.target_rank_for_recv, + group=self.cpu_group + ) + + torch.distributed.recv( + self.dtype_tensor_for_recv, + src=self.target_rank_for_recv, + group=group + ) + torch.distributed.recv( + self.numdim_tensor_for_recv, + src=self.target_rank_for_recv, + group=group + ) + + numdim = self.numdim_tensor_for_recv.item() + torch.distributed.recv( + self.dims_tensor_for_recv[:numdim], + src=self.target_rank_for_recv, + group=group + ) + + dtype = INT2DTYPE[self.dtype_tensor_for_recv.item()] + shape = self.dims_tensor_for_recv[:numdim].tolist() + + buffer = torch.zeros(shape, dtype=dtype).to(self.device) - def send_tensor_wrapper(self, tensor: torch.Tensor) -> None: - """Wrapper for send_tensor_dict""" - tensor_size = tensor.element_size() * tensor.numel() - self.send_tensor_dict({'tensor': tensor}, self.target_rank_for_send) + torch.distributed.recv( + buffer, + src=self.target_rank_for_recv, + group=group + ) + + return buffer - with self.buffer_size_lock: - self.buffer_size = self.buffer_size - tensor_size + + + def prep_send(self, tensor): + + # prepare a series of tensor before send + dtype_tensor = torch.tensor([DTYPE2INT[tensor.dtype]]).to(self.device, non_blocking=True) + numdim_tensor = torch.tensor(len(tensor.shape)).to(self.device, non_blocking=True) + dims_tensor = torch.tensor(tensor.shape).to(self.device, non_blocking=True) + + return { + 'dtype': dtype_tensor, + 'numdim': numdim_tensor, + 'dims': dims_tensor + } + + + def send_tensor_wrapper(self, tensor, prep) -> None: + + try: + """Wrapper for send_tensor_dict""" + tensor_size = tensor.element_size() * 
tensor.numel() + # self.send_tensor_dict({'tensor': tensor}) + self.quick_send(tensor, prep) + + with self.buffer_size_lock: + self.buffer_size = self.buffer_size - tensor_size + except Exception as e: + logger.error("Encountering exception in KV sending thread") + logger.error("%s", e) def block_if_full(self): while self.buffer_size > 1e9: + logger.debug("KV cache transfer pipe is full. Waiting...") time.sleep(0.05) def send_tensor(self, @@ -92,21 +273,27 @@ def send_tensor(self, else: tensor_size = tensor.element_size() * tensor.numel() + assert 0 < len(tensor.shape) < 100, "Send tensor does not support tensor with 0 dim or >=100 dim. Got %d" % len(tensor.shape) + self.block_if_full() with self.buffer_size_lock: self.buffer_size = self.buffer_size + tensor_size - self.kv_sending_thread.submit(self.send_tensor_wrapper, tensor) - - - + # self.kv_sending_thread.submit(self.send_tensor_wrapper, tensor) + prep = self.prep_send(tensor) + self.kv_sending_thread.submit( + self.send_tensor_wrapper, + tensor, prep) + def recv_tensor(self) -> Optional[torch.Tensor]: """Receives a tensor from the src rank. 
Blocking.""" - tensor = self.recv_tensor_dict(self.target_rank_for_recv)['tensor'] + tensor = self.quick_recv() if tensor.numel() == 1 and tensor.item() == NONE_INT: return None else: return tensor - \ No newline at end of file + + + diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index c5747330d1ac..d13d132f5dfe 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -63,10 +63,6 @@ def __init__( group_ranks: List[List[int]], local_rank: int, torch_distributed_backend: Union[str, Backend], - use_pynccl: bool = False, - use_custom_allreduce: bool = False, - use_tpu_communicator: bool = True, - use_message_queue_broadcaster: bool = False ): # init pipe @@ -74,13 +70,9 @@ def __init__( group_ranks, local_rank, torch_distributed_backend, - use_pynccl, - use_custom_allreduce, - use_tpu_communicator, - use_message_queue_broadcaster, ) # init lookup buffer - self.buffer = SimpleKVLookupBuffer(self.pipe, 1000**3) + self.buffer = SimpleKVLookupBuffer(self.pipe, 1000**3 * 10) def send_kv_caches_and_hidden_states( self, From 24a231eae92dab83565d17275d49e37557ecd2dc Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 8 Sep 2024 08:31:26 +0000 Subject: [PATCH 169/278] update tensor sending and receiving. Use CPU to transfer metadata instead. 
--- .../kv_pipe/torch_distributed_pipe.py | 130 +++--------------- 1 file changed, 20 insertions(+), 110 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index 1bf54badac25..760a9662d515 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -18,34 +18,6 @@ logger = init_logger(__name__) -# auxilary function to send tensordict -TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"]) - -def _split_tensor_dict( - tensor_dict: Dict[str, Union[torch.Tensor, Any]] -) -> Tuple[List[Tuple[str, Any]], List[torch.Tensor]]: - """Split the tensor dictionary into two parts: - 1. A list of (key, value) pairs. If the value is a tensor, it is replaced - by its metadata. - 2. A list of tensors. - """ - metadata_list: List[Tuple[str, Any]] = [] - tensor_list: List[torch.Tensor] = [] - for key, value in tensor_dict.items(): - if isinstance(value, torch.Tensor): - # Note: we cannot use `value.device` here, - # because it contains not only the device type but also the device - # index (e.g. "cuda:0"). We only need the device type. - # receiving side will set the device index. - device = value.device.type - metadata_list.append( - (key, TensorMetadata(device, value.dtype, value.size()))) - tensor_list.append(value) - else: - metadata_list.append((key, value)) - return metadata_list, tensor_list - - # if the tensor is only one-element and only contains this number # this means that the sended object is None. 
NONE_INT = -150886311 @@ -131,119 +103,58 @@ def __init__( self.none_tensor = torch.tensor([NONE_INT]).to(self.device) self.broken = False - # create a dummy tensor - # this tensor is used - self.dummy_cpu_tensor_for_send = torch.tensor([1],device='cpu') - self.dummy_cpu_tensor_for_recv = torch.tensor([1],device='cpu') - - self.dtype_tensor_for_recv = torch.tensor([0]).to(self.device) - self.numdim_tensor_for_recv = torch.tensor([-1]).to(self.device) - self.dims_tensor_for_recv = torch.ones([100], dtype=int).to(self.device) - - def quick_send(self, tensor, prep): + def quick_send(self, tensor): group = self.device_group # NCCL is NOT fully duplex - # need to explicitly sync using CPU - # to guarantee that there is only 1-directional data happening now - torch.distributed.send( - self.dummy_cpu_tensor_for_send, + # so CPU communication is ALWAYS necessary + torch.distributed.send_object_list( + [tensor.dtype, tensor.shape, str(tensor.device)], dst=self.target_rank_for_send, group=self.cpu_group ) - torch.distributed.send( - prep['dtype'], - dst=self.target_rank_for_send, - group=group - ) - torch.distributed.send( - prep['numdim'], - dst=self.target_rank_for_send, - group=group - ) - torch.distributed.send( - prep['dims'], - dst=self.target_rank_for_send, - group=group - ) torch.distributed.send( tensor, dst=self.target_rank_for_send, - group=group + group=self.device_group ) def quick_recv(self): - # receive is sequential, so we can reuse the GPU buffer - group = self.device_group - # NCCL is NOT fully duplex - # need to explicitly sync using CPU - # to guarantee that there is only 1-directional data happening now - torch.distributed.recv( - self.dummy_cpu_tensor_for_recv, + # so CPU communication is necessary + metadata = [None, None, None] + torch.distributed.recv_object_list( + metadata, src=self.target_rank_for_recv, group=self.cpu_group ) - torch.distributed.recv( - self.dtype_tensor_for_recv, - src=self.target_rank_for_recv, - group=group - ) - 
torch.distributed.recv( - self.numdim_tensor_for_recv, - src=self.target_rank_for_recv, - group=group - ) - - numdim = self.numdim_tensor_for_recv.item() - torch.distributed.recv( - self.dims_tensor_for_recv[:numdim], - src=self.target_rank_for_recv, - group=group - ) - - dtype = INT2DTYPE[self.dtype_tensor_for_recv.item()] - shape = self.dims_tensor_for_recv[:numdim].tolist() - - buffer = torch.zeros(shape, dtype=dtype).to(self.device) + dtype, shape, device = metadata + if 'cuda' in device: + device = self.device + else: + device = 'cpu' + buffer = torch.zeros(shape, dtype=dtype).to(device, non_blocking=True) torch.distributed.recv( buffer, src=self.target_rank_for_recv, - group=group + group=self.device_group ) - return buffer - - - def prep_send(self, tensor): - - # prepare a series of tensor before send - dtype_tensor = torch.tensor([DTYPE2INT[tensor.dtype]]).to(self.device, non_blocking=True) - numdim_tensor = torch.tensor(len(tensor.shape)).to(self.device, non_blocking=True) - dims_tensor = torch.tensor(tensor.shape).to(self.device, non_blocking=True) - - return { - 'dtype': dtype_tensor, - 'numdim': numdim_tensor, - 'dims': dims_tensor - } - def send_tensor_wrapper(self, tensor, prep) -> None: + def send_tensor_wrapper(self, tensor) -> None: try: - """Wrapper for send_tensor_dict""" tensor_size = tensor.element_size() * tensor.numel() - # self.send_tensor_dict({'tensor': tensor}) - self.quick_send(tensor, prep) + self.quick_send(tensor) with self.buffer_size_lock: self.buffer_size = self.buffer_size - tensor_size @@ -280,11 +191,10 @@ def send_tensor(self, with self.buffer_size_lock: self.buffer_size = self.buffer_size + tensor_size - # self.kv_sending_thread.submit(self.send_tensor_wrapper, tensor) - prep = self.prep_send(tensor) + # prepare the metadata before sending the tensor. self.kv_sending_thread.submit( self.send_tensor_wrapper, - tensor, prep) + tensor) def recv_tensor(self) -> Optional[torch.Tensor]: """Receives a tensor from the src rank. 
Blocking.""" From dca877ab7def76ada27324d4729eb8a3832a24e8 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 8 Sep 2024 08:52:07 +0000 Subject: [PATCH 170/278] update benchmark: use small model for quick iteration --- .../disagg_overhead_benchmark.sh | 43 +++++++++++++------ 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index 12f5150cadda..d264f1815643 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -50,29 +50,44 @@ benchmark() { # compare chunked prefill with disaggregated prefill results_folder="./results" - model="meta-llama/Meta-Llama-3.1-70B-Instruct" + model="meta-llama/Meta-Llama-3.1-8B-Instruct" dataset_name="sonnet" dataset_path="../sonnet_4x.txt" - num_prompts=50 + num_prompts=20 qps=$1 prefix_len=50 input_len=2048 output_len=$2 # large model - VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ - -m vllm.entrypoints.openai.api_server \ - --model $model \ - --port 8100 \ - -tp 4 \ - --max-model-len 30000 \ - --gpu-memory-utilization 0.8 & - VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + # VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + # -m vllm.entrypoints.openai.api_server \ + # --model $model \ + # --port 8100 \ + # -tp 4 \ + # --max-model-len 30000 \ + # --gpu-memory-utilization 0.8 & + # VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + # -m vllm.entrypoints.openai.api_server \ + # --model $model \ + # --port 8200 \ + # -tp 4 \ + # --max-model-len 30000 \ + # --gpu-memory-utilization 0.8 & + + VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 python3 \ -m vllm.entrypoints.openai.api_server \ - --model $model \ + --model 
meta-llama/Meta-Llama-3.1-8B-Instruct \ + --port 8100 \ + --max-model-len 10000 \ + --gpu-memory-utilization 0.8 & + +# decoding instance +VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ + -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8200 \ - -tp 4 \ - --max-model-len 30000 \ + --max-model-len 10000 \ --gpu-memory-utilization 0.8 & wait_for_server 8100 @@ -92,7 +107,7 @@ benchmark() { --save-result \ --result-dir $results_folder \ --result-filename disagg_prefill_2xtp4.json \ - --request-rate $qps + --request-rate "inf" # send the request to decode. From 9f81f41f813333bf077f792ee509f0ba7fe6eac8 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 8 Sep 2024 08:57:40 +0000 Subject: [PATCH 171/278] update implementation --- benchmarks/sonnet_4x.txt | 2070 +++++++++++++++++ .../kv_pipe/torch_distributed_pipe.py | 4 +- vllm/worker/worker_base.py | 1 + 3 files changed, 2072 insertions(+), 3 deletions(-) create mode 100644 benchmarks/sonnet_4x.txt diff --git a/benchmarks/sonnet_4x.txt b/benchmarks/sonnet_4x.txt new file mode 100644 index 000000000000..02f39a9fb14f --- /dev/null +++ b/benchmarks/sonnet_4x.txt @@ -0,0 +1,2070 @@ + +FROM fairest creatures we desire increase, +That thereby beauty's rose might never die, +But as the riper should by time decease, +His tender heir might bear his memory: +But thou, contracted to thine own bright eyes, +Feed'st thy light'st flame with self-substantial fuel, +Making a famine where abundance lies, +Thyself thy foe, to thy sweet self too cruel. +Thou that art now the world's fresh ornament +And only herald to the gaudy spring, +Within thine own bud buriest thy content +And, tender churl, makest waste in niggarding. +Pity the world, or else this glutton be, +To eat the world's due, by the grave and thee. 
+When forty winters shall beseige thy brow, +And dig deep trenches in thy beauty's field, +Thy youth's proud livery, so gazed on now, +Will be a tatter'd weed, of small worth held: +Then being ask'd where all thy beauty lies, +Where all the treasure of thy lusty days, +To say, within thine own deep-sunken eyes, +Were an all-eating shame and thriftless praise. +How much more praise deserved thy beauty's use, +If thou couldst answer 'This fair child of mine +Shall sum my count and make my old excuse,' +Proving his beauty by succession thine! +This were to be new made when thou art old, +And see thy blood warm when thou feel'st it cold. +Look in thy glass, and tell the face thou viewest +Now is the time that face should form another; +Whose fresh repair if now thou not renewest, +Thou dost beguile the world, unbless some mother. +For where is she so fair whose unear'd womb +Disdains the tillage of thy husbandry? +Or who is he so fond will be the tomb +Of his self-love, to stop posterity? +Thou art thy mother's glass, and she in thee +Calls back the lovely April of her prime: +So thou through windows of thine age shall see +Despite of wrinkles this thy golden time. +But if thou live, remember'd not to be, +Die single, and thine image dies with thee. +Unthrifty loveliness, why dost thou spend +Upon thyself thy beauty's legacy? +Nature's bequest gives nothing but doth lend, +And being frank she lends to those are free. +Then, beauteous niggard, why dost thou abuse +The bounteous largess given thee to give? +Profitless usurer, why dost thou use +So great a sum of sums, yet canst not live? +For having traffic with thyself alone, +Thou of thyself thy sweet self dost deceive. +Then how, when nature calls thee to be gone, +What acceptable audit canst thou leave? +Thy unused beauty must be tomb'd with thee, +Which, used, lives th' executor to be. 
+Those hours, that with gentle work did frame +The lovely gaze where every eye doth dwell, +Will play the tyrants to the very same +And that unfair which fairly doth excel: +For never-resting time leads summer on +To hideous winter and confounds him there; +Sap cheque'd with frost and lusty leaves quite gone, +Beauty o'ersnow'd and bareness every where: +Then, were not summer's distillation left, +A liquid prisoner pent in walls of glass, +Beauty's effect with beauty were bereft, +Nor it nor no remembrance what it was: +But flowers distill'd though they with winter meet, +Leese but their show; their substance still lives sweet. +Then let not winter's ragged hand deface +In thee thy summer, ere thou be distill'd: +Make sweet some vial; treasure thou some place +With beauty's treasure, ere it be self-kill'd. +That use is not forbidden usury, +Which happies those that pay the willing loan; +That's for thyself to breed another thee, +Or ten times happier, be it ten for one; +Ten times thyself were happier than thou art, +If ten of thine ten times refigured thee: +Then what could death do, if thou shouldst depart, +Leaving thee living in posterity? +Be not self-will'd, for thou art much too fair +To be death's conquest and make worms thine heir. +Lo! in the orient when the gracious light +Lifts up his burning head, each under eye +Doth homage to his new-appearing sight, +Serving with looks his sacred majesty; +And having climb'd the steep-up heavenly hill, +Resembling strong youth in his middle age, +yet mortal looks adore his beauty still, +Attending on his golden pilgrimage; +But when from highmost pitch, with weary car, +Like feeble age, he reeleth from the day, +The eyes, 'fore duteous, now converted are +From his low tract and look another way: +So thou, thyself out-going in thy noon, +Unlook'd on diest, unless thou get a son. +Music to hear, why hear'st thou music sadly? +Sweets with sweets war not, joy delights in joy. 
+Why lovest thou that which thou receivest not gladly, +Or else receivest with pleasure thine annoy? +If the true concord of well-tuned sounds, +By unions married, do offend thine ear, +They do but sweetly chide thee, who confounds +In singleness the parts that thou shouldst bear. +Mark how one string, sweet husband to another, +Strikes each in each by mutual ordering, +Resembling sire and child and happy mother +Who all in one, one pleasing note do sing: +Whose speechless song, being many, seeming one, +Sings this to thee: 'thou single wilt prove none.' +Is it for fear to wet a widow's eye +That thou consumest thyself in single life? +Ah! if thou issueless shalt hap to die. +The world will wail thee, like a makeless wife; +The world will be thy widow and still weep +That thou no form of thee hast left behind, +When every private widow well may keep +By children's eyes her husband's shape in mind. +Look, what an unthrift in the world doth spend +Shifts but his place, for still the world enjoys it; +But beauty's waste hath in the world an end, +And kept unused, the user so destroys it. +No love toward others in that bosom sits +That on himself such murderous shame commits. +For shame! deny that thou bear'st love to any, +Who for thyself art so unprovident. +Grant, if thou wilt, thou art beloved of many, +But that thou none lovest is most evident; +For thou art so possess'd with murderous hate +That 'gainst thyself thou stick'st not to conspire. +Seeking that beauteous roof to ruinate +Which to repair should be thy chief desire. +O, change thy thought, that I may change my mind! +Shall hate be fairer lodged than gentle love? +Be, as thy presence is, gracious and kind, +Or to thyself at least kind-hearted prove: +Make thee another self, for love of me, +That beauty still may live in thine or thee. 
+As fast as thou shalt wane, so fast thou growest +In one of thine, from that which thou departest; +And that fresh blood which youngly thou bestowest +Thou mayst call thine when thou from youth convertest. +Herein lives wisdom, beauty and increase: +Without this, folly, age and cold decay: +If all were minded so, the times should cease +And threescore year would make the world away. +Let those whom Nature hath not made for store, +Harsh featureless and rude, barrenly perish: +Look, whom she best endow'd she gave the more; +Which bounteous gift thou shouldst in bounty cherish: +She carved thee for her seal, and meant thereby +Thou shouldst print more, not let that copy die. +When I do count the clock that tells the time, +And see the brave day sunk in hideous night; +When I behold the violet past prime, +And sable curls all silver'd o'er with white; +When lofty trees I see barren of leaves +Which erst from heat did canopy the herd, +And summer's green all girded up in sheaves +Borne on the bier with white and bristly beard, +Then of thy beauty do I question make, +That thou among the wastes of time must go, +Since sweets and beauties do themselves forsake +And die as fast as they see others grow; +And nothing 'gainst Time's scythe can make defence +Save breed, to brave him when he takes thee hence. +O, that you were yourself! but, love, you are +No longer yours than you yourself here live: +Against this coming end you should prepare, +And your sweet semblance to some other give. +So should that beauty which you hold in lease +Find no determination: then you were +Yourself again after yourself's decease, +When your sweet issue your sweet form should bear. +Who lets so fair a house fall to decay, +Which husbandry in honour might uphold +Against the stormy gusts of winter's day +And barren rage of death's eternal cold? +O, none but unthrifts! Dear my love, you know +You had a father: let your son say so. 
+Not from the stars do I my judgment pluck; +And yet methinks I have astronomy, +But not to tell of good or evil luck, +Of plagues, of dearths, or seasons' quality; +Nor can I fortune to brief minutes tell, +Pointing to each his thunder, rain and wind, +Or say with princes if it shall go well, +By oft predict that I in heaven find: +But from thine eyes my knowledge I derive, +And, constant stars, in them I read such art +As truth and beauty shall together thrive, +If from thyself to store thou wouldst convert; +Or else of thee this I prognosticate: +Thy end is truth's and beauty's doom and date. +When I consider every thing that grows +Holds in perfection but a little moment, +That this huge stage presenteth nought but shows +Whereon the stars in secret influence comment; +When I perceive that men as plants increase, +Cheered and cheque'd even by the self-same sky, +Vaunt in their youthful sap, at height decrease, +And wear their brave state out of memory; +Then the conceit of this inconstant stay +Sets you most rich in youth before my sight, +Where wasteful Time debateth with Decay, +To change your day of youth to sullied night; +And all in war with Time for love of you, +As he takes from you, I engraft you new. +But wherefore do not you a mightier way +Make war upon this bloody tyrant, Time? +And fortify yourself in your decay +With means more blessed than my barren rhyme? +Now stand you on the top of happy hours, +And many maiden gardens yet unset +With virtuous wish would bear your living flowers, +Much liker than your painted counterfeit: +So should the lines of life that life repair, +Which this, Time's pencil, or my pupil pen, +Neither in inward worth nor outward fair, +Can make you live yourself in eyes of men. +To give away yourself keeps yourself still, +And you must live, drawn by your own sweet skill. +Who will believe my verse in time to come, +If it were fill'd with your most high deserts? 
+Though yet, heaven knows, it is but as a tomb +Which hides your life and shows not half your parts. +If I could write the beauty of your eyes +And in fresh numbers number all your graces, +The age to come would say 'This poet lies: +Such heavenly touches ne'er touch'd earthly faces.' +So should my papers yellow'd with their age +Be scorn'd like old men of less truth than tongue, +And your true rights be term'd a poet's rage +And stretched metre of an antique song: +But were some child of yours alive that time, +You should live twice; in it and in my rhyme. +Shall I compare thee to a summer's day? +Thou art more lovely and more temperate: +Rough winds do shake the darling buds of May, +And summer's lease hath all too short a date: +Sometime too hot the eye of heaven shines, +And often is his gold complexion dimm'd; +And every fair from fair sometime declines, +By chance or nature's changing course untrimm'd; +But thy eternal summer shall not fade +Nor lose possession of that fair thou owest; +Nor shall Death brag thou wander'st in his shade, +When in eternal lines to time thou growest: +So long as men can breathe or eyes can see, +So long lives this and this gives life to thee. +Devouring Time, blunt thou the lion's paws, +And make the earth devour her own sweet brood; +Pluck the keen teeth from the fierce tiger's jaws, +And burn the long-lived phoenix in her blood; +Make glad and sorry seasons as thou fleets, +And do whate'er thou wilt, swift-footed Time, +To the wide world and all her fading sweets; +But I forbid thee one most heinous crime: +O, carve not with thy hours my love's fair brow, +Nor draw no lines there with thine antique pen; +Him in thy course untainted do allow +For beauty's pattern to succeeding men. +Yet, do thy worst, old Time: despite thy wrong, +My love shall in my verse ever live young. 
+A woman's face with Nature's own hand painted +Hast thou, the master-mistress of my passion; +A woman's gentle heart, but not acquainted +With shifting change, as is false women's fashion; +An eye more bright than theirs, less false in rolling, +Gilding the object whereupon it gazeth; +A man in hue, all 'hues' in his controlling, +Much steals men's eyes and women's souls amazeth. +And for a woman wert thou first created; +Till Nature, as she wrought thee, fell a-doting, +And by addition me of thee defeated, +By adding one thing to my purpose nothing. +But since she prick'd thee out for women's pleasure, +Mine be thy love and thy love's use their treasure. +So is it not with me as with that Muse +Stirr'd by a painted beauty to his verse, +Who heaven itself for ornament doth use +And every fair with his fair doth rehearse +Making a couplement of proud compare, +With sun and moon, with earth and sea's rich gems, +With April's first-born flowers, and all things rare +That heaven's air in this huge rondure hems. +O' let me, true in love, but truly write, +And then believe me, my love is as fair +As any mother's child, though not so bright +As those gold candles fix'd in heaven's air: +Let them say more than like of hearsay well; +I will not praise that purpose not to sell. +My glass shall not persuade me I am old, +So long as youth and thou are of one date; +But when in thee time's furrows I behold, +Then look I death my days should expiate. +For all that beauty that doth cover thee +Is but the seemly raiment of my heart, +Which in thy breast doth live, as thine in me: +How can I then be elder than thou art? +O, therefore, love, be of thyself so wary +As I, not for myself, but for thee will; +Bearing thy heart, which I will keep so chary +As tender nurse her babe from faring ill. +Presume not on thy heart when mine is slain; +Thou gavest me thine, not to give back again. 
+As an unperfect actor on the stage +Who with his fear is put besides his part, +Or some fierce thing replete with too much rage, +Whose strength's abundance weakens his own heart. +So I, for fear of trust, forget to say +The perfect ceremony of love's rite, +And in mine own love's strength seem to decay, +O'ercharged with burden of mine own love's might. +O, let my books be then the eloquence +And dumb presagers of my speaking breast, +Who plead for love and look for recompense +More than that tongue that more hath more express'd. +O, learn to read what silent love hath writ: +To hear with eyes belongs to love's fine wit. +Mine eye hath play'd the painter and hath stell'd +Thy beauty's form in table of my heart; +My body is the frame wherein 'tis held, +And perspective it is the painter's art. +For through the painter must you see his skill, +To find where your true image pictured lies; +Which in my bosom's shop is hanging still, +That hath his windows glazed with thine eyes. +Now see what good turns eyes for eyes have done: +Mine eyes have drawn thy shape, and thine for me +Are windows to my breast, where-through the sun +Delights to peep, to gaze therein on thee; +Yet eyes this cunning want to grace their art; +They draw but what they see, know not the heart. +Let those who are in favour with their stars +Of public honour and proud titles boast, +Whilst I, whom fortune of such triumph bars, +Unlook'd for joy in that I honour most. +Great princes' favourites their fair leaves spread +But as the marigold at the sun's eye, +And in themselves their pride lies buried, +For at a frown they in their glory die. +The painful warrior famoused for fight, +After a thousand victories once foil'd, +Is from the book of honour razed quite, +And all the rest forgot for which he toil'd: +Then happy I, that love and am beloved +Where I may not remove nor be removed. 
+Lord of my love, to whom in vassalage +Thy merit hath my duty strongly knit, +To thee I send this written embassage, +To witness duty, not to show my wit: +Duty so great, which wit so poor as mine +May make seem bare, in wanting words to show it, +But that I hope some good conceit of thine +In thy soul's thought, all naked, will bestow it; +Till whatsoever star that guides my moving +Points on me graciously with fair aspect +And puts apparel on my tatter'd loving, +To show me worthy of thy sweet respect: +Then may I dare to boast how I do love thee; +Till then not show my head where thou mayst prove me. +Weary with toil, I haste me to my bed, +The dear repose for limbs with travel tired; +But then begins a journey in my head, +To work my mind, when body's work's expired: +For then my thoughts, from far where I abide, +Intend a zealous pilgrimage to thee, +And keep my drooping eyelids open wide, +Looking on darkness which the blind do see +Save that my soul's imaginary sight +Presents thy shadow to my sightless view, +Which, like a jewel hung in ghastly night, +Makes black night beauteous and her old face new. +Lo! thus, by day my limbs, by night my mind, +For thee and for myself no quiet find. +How can I then return in happy plight, +That am debarr'd the benefit of rest? +When day's oppression is not eased by night, +But day by night, and night by day, oppress'd? +And each, though enemies to either's reign, +Do in consent shake hands to torture me; +The one by toil, the other to complain +How far I toil, still farther off from thee. +I tell the day, to please them thou art bright +And dost him grace when clouds do blot the heaven: +So flatter I the swart-complexion'd night, +When sparkling stars twire not thou gild'st the even. +But day doth daily draw my sorrows longer +And night doth nightly make grief's strength seem stronger. 
+When, in disgrace with fortune and men's eyes,
+I all alone beweep my outcast state
+And trouble deaf heaven with my bootless cries
+And look upon myself and curse my fate,
+Wishing me like to one more rich in hope,
+Featured like him, like him with friends possess'd,
+Desiring this man's art and that man's scope,
+With what I most enjoy contented least;
+Yet in these thoughts myself almost despising,
+Haply I think on thee, and then my state,
+Like to the lark at break of day arising
+From sullen earth, sings hymns at heaven's gate;
+For thy sweet love remember'd such wealth brings
+That then I scorn to change my state with kings.
+When to the sessions of sweet silent thought
+I summon up remembrance of things past,
+I sigh the lack of many a thing I sought,
+And with old woes new wail my dear time's waste:
+Then can I drown an eye, unused to flow,
+For precious friends hid in death's dateless night,
+And weep afresh love's long since cancell'd woe,
+And moan the expense of many a vanish'd sight:
+Then can I grieve at grievances foregone,
+And heavily from woe to woe tell o'er
+The sad account of fore-bemoaned moan,
+Which I new pay as if not paid before.
+But if the while I think on thee, dear friend,
+All losses are restored and sorrows end.
+Thy bosom is endeared with all hearts,
+Which I by lacking have supposed dead,
+And there reigns love and all love's loving parts,
+And all those friends which I thought buried.
+How many a holy and obsequious tear
+Hath dear religious love stol'n from mine eye
+As interest of the dead, which now appear
+But things removed that hidden in thee lie!
+Thou art the grave where buried love doth live,
+Hung with the trophies of my lovers gone,
+Who all their parts of me to thee did give;
+That due of many now is thine alone:
+Their images I loved I view in thee,
+And thou, all they, hast all the all of me. 
+If thou survive my well-contented day, +When that churl Death my bones with dust shall cover, +And shalt by fortune once more re-survey +These poor rude lines of thy deceased lover, +Compare them with the bettering of the time, +And though they be outstripp'd by every pen, +Reserve them for my love, not for their rhyme, +Exceeded by the height of happier men. +O, then vouchsafe me but this loving thought: +'Had my friend's Muse grown with this growing age, +A dearer birth than this his love had brought, +To march in ranks of better equipage: +But since he died and poets better prove, +Theirs for their style I'll read, his for his love.' +Full many a glorious morning have I seen +Flatter the mountain-tops with sovereign eye, +Kissing with golden face the meadows green, +Gilding pale streams with heavenly alchemy; +Anon permit the basest clouds to ride +With ugly rack on his celestial face, +And from the forlorn world his visage hide, +Stealing unseen to west with this disgrace: +Even so my sun one early morn did shine +With all triumphant splendor on my brow; +But out, alack! he was but one hour mine; +The region cloud hath mask'd him from me now. +Yet him for this my love no whit disdaineth; +Suns of the world may stain when heaven's sun staineth. +Why didst thou promise such a beauteous day, +And make me travel forth without my cloak, +To let base clouds o'ertake me in my way, +Hiding thy bravery in their rotten smoke? +'Tis not enough that through the cloud thou break, +To dry the rain on my storm-beaten face, +For no man well of such a salve can speak +That heals the wound and cures not the disgrace: +Nor can thy shame give physic to my grief; +Though thou repent, yet I have still the loss: +The offender's sorrow lends but weak relief +To him that bears the strong offence's cross. +Ah! but those tears are pearl which thy love sheds, +And they are rich and ransom all ill deeds. 
+No more be grieved at that which thou hast done: +Roses have thorns, and silver fountains mud; +Clouds and eclipses stain both moon and sun, +And loathsome canker lives in sweetest bud. +All men make faults, and even I in this, +Authorizing thy trespass with compare, +Myself corrupting, salving thy amiss, +Excusing thy sins more than thy sins are; +For to thy sensual fault I bring in sense-- +Thy adverse party is thy advocate-- +And 'gainst myself a lawful plea commence: +Such civil war is in my love and hate +That I an accessary needs must be +To that sweet thief which sourly robs from me. +Let me confess that we two must be twain, +Although our undivided loves are one: +So shall those blots that do with me remain +Without thy help by me be borne alone. +In our two loves there is but one respect, +Though in our lives a separable spite, +Which though it alter not love's sole effect, +Yet doth it steal sweet hours from love's delight. +I may not evermore acknowledge thee, +Lest my bewailed guilt should do thee shame, +Nor thou with public kindness honour me, +Unless thou take that honour from thy name: +But do not so; I love thee in such sort +As, thou being mine, mine is thy good report. +As a decrepit father takes delight +To see his active child do deeds of youth, +So I, made lame by fortune's dearest spite, +Take all my comfort of thy worth and truth. +For whether beauty, birth, or wealth, or wit, +Or any of these all, or all, or more, +Entitled in thy parts do crowned sit, +I make my love engrafted to this store: +So then I am not lame, poor, nor despised, +Whilst that this shadow doth such substance give +That I in thy abundance am sufficed +And by a part of all thy glory live. 
+Look, what is best, that best I wish in thee: +This wish I have; then ten times happy me!FROM fairest creatures we desire increase, +That thereby beauty's rose might never die, +But as the riper should by time decease, +His tender heir might bear his memory: +But thou, contracted to thine own bright eyes, +Feed'st thy light'st flame with self-substantial fuel, +Making a famine where abundance lies, +Thyself thy foe, to thy sweet self too cruel. +Thou that art now the world's fresh ornament +And only herald to the gaudy spring, +Within thine own bud buriest thy content +And, tender churl, makest waste in niggarding. +Pity the world, or else this glutton be, +To eat the world's due, by the grave and thee. +When forty winters shall beseige thy brow, +And dig deep trenches in thy beauty's field, +Thy youth's proud livery, so gazed on now, +Will be a tatter'd weed, of small worth held: +Then being ask'd where all thy beauty lies, +Where all the treasure of thy lusty days, +To say, within thine own deep-sunken eyes, +Were an all-eating shame and thriftless praise. +How much more praise deserved thy beauty's use, +If thou couldst answer 'This fair child of mine +Shall sum my count and make my old excuse,' +Proving his beauty by succession thine! +This were to be new made when thou art old, +And see thy blood warm when thou feel'st it cold. +Look in thy glass, and tell the face thou viewest +Now is the time that face should form another; +Whose fresh repair if now thou not renewest, +Thou dost beguile the world, unbless some mother. +For where is she so fair whose unear'd womb +Disdains the tillage of thy husbandry? +Or who is he so fond will be the tomb +Of his self-love, to stop posterity? +Thou art thy mother's glass, and she in thee +Calls back the lovely April of her prime: +So thou through windows of thine age shall see +Despite of wrinkles this thy golden time. +But if thou live, remember'd not to be, +Die single, and thine image dies with thee. 
+Unthrifty loveliness, why dost thou spend +Upon thyself thy beauty's legacy? +Nature's bequest gives nothing but doth lend, +And being frank she lends to those are free. +Then, beauteous niggard, why dost thou abuse +The bounteous largess given thee to give? +Profitless usurer, why dost thou use +So great a sum of sums, yet canst not live? +For having traffic with thyself alone, +Thou of thyself thy sweet self dost deceive. +Then how, when nature calls thee to be gone, +What acceptable audit canst thou leave? +Thy unused beauty must be tomb'd with thee, +Which, used, lives th' executor to be. +Those hours, that with gentle work did frame +The lovely gaze where every eye doth dwell, +Will play the tyrants to the very same +And that unfair which fairly doth excel: +For never-resting time leads summer on +To hideous winter and confounds him there; +Sap cheque'd with frost and lusty leaves quite gone, +Beauty o'ersnow'd and bareness every where: +Then, were not summer's distillation left, +A liquid prisoner pent in walls of glass, +Beauty's effect with beauty were bereft, +Nor it nor no remembrance what it was: +But flowers distill'd though they with winter meet, +Leese but their show; their substance still lives sweet. +Then let not winter's ragged hand deface +In thee thy summer, ere thou be distill'd: +Make sweet some vial; treasure thou some place +With beauty's treasure, ere it be self-kill'd. +That use is not forbidden usury, +Which happies those that pay the willing loan; +That's for thyself to breed another thee, +Or ten times happier, be it ten for one; +Ten times thyself were happier than thou art, +If ten of thine ten times refigured thee: +Then what could death do, if thou shouldst depart, +Leaving thee living in posterity? +Be not self-will'd, for thou art much too fair +To be death's conquest and make worms thine heir. +Lo! 
in the orient when the gracious light
+Lifts up his burning head, each under eye
+Doth homage to his new-appearing sight,
+Serving with looks his sacred majesty;
+And having climb'd the steep-up heavenly hill,
+Resembling strong youth in his middle age,
+Yet mortal looks adore his beauty still,
+Attending on his golden pilgrimage;
+But when from highmost pitch, with weary car,
+Like feeble age, he reeleth from the day,
+The eyes, 'fore duteous, now converted are
+From his low tract and look another way:
+So thou, thyself out-going in thy noon,
+Unlook'd on diest, unless thou get a son.
+Music to hear, why hear'st thou music sadly?
+Sweets with sweets war not, joy delights in joy.
+Why lovest thou that which thou receivest not gladly,
+Or else receivest with pleasure thine annoy?
+If the true concord of well-tuned sounds,
+By unions married, do offend thine ear,
+They do but sweetly chide thee, who confounds
+In singleness the parts that thou shouldst bear.
+Mark how one string, sweet husband to another,
+Strikes each in each by mutual ordering,
+Resembling sire and child and happy mother
+Who all in one, one pleasing note do sing:
+Whose speechless song, being many, seeming one,
+Sings this to thee: 'thou single wilt prove none.'
+Is it for fear to wet a widow's eye
+That thou consumest thyself in single life?
+Ah! if thou issueless shalt hap to die,
+The world will wail thee, like a makeless wife;
+The world will be thy widow and still weep
+That thou no form of thee hast left behind,
+When every private widow well may keep
+By children's eyes her husband's shape in mind.
+Look, what an unthrift in the world doth spend
+Shifts but his place, for still the world enjoys it;
+But beauty's waste hath in the world an end,
+And kept unused, the user so destroys it.
+No love toward others in that bosom sits
+That on himself such murderous shame commits.
+For shame! deny that thou bear'st love to any,
+Who for thyself art so unprovident. 
+Grant, if thou wilt, thou art beloved of many, +But that thou none lovest is most evident; +For thou art so possess'd with murderous hate +That 'gainst thyself thou stick'st not to conspire. +Seeking that beauteous roof to ruinate +Which to repair should be thy chief desire. +O, change thy thought, that I may change my mind! +Shall hate be fairer lodged than gentle love? +Be, as thy presence is, gracious and kind, +Or to thyself at least kind-hearted prove: +Make thee another self, for love of me, +That beauty still may live in thine or thee. +As fast as thou shalt wane, so fast thou growest +In one of thine, from that which thou departest; +And that fresh blood which youngly thou bestowest +Thou mayst call thine when thou from youth convertest. +Herein lives wisdom, beauty and increase: +Without this, folly, age and cold decay: +If all were minded so, the times should cease +And threescore year would make the world away. +Let those whom Nature hath not made for store, +Harsh featureless and rude, barrenly perish: +Look, whom she best endow'd she gave the more; +Which bounteous gift thou shouldst in bounty cherish: +She carved thee for her seal, and meant thereby +Thou shouldst print more, not let that copy die. +When I do count the clock that tells the time, +And see the brave day sunk in hideous night; +When I behold the violet past prime, +And sable curls all silver'd o'er with white; +When lofty trees I see barren of leaves +Which erst from heat did canopy the herd, +And summer's green all girded up in sheaves +Borne on the bier with white and bristly beard, +Then of thy beauty do I question make, +That thou among the wastes of time must go, +Since sweets and beauties do themselves forsake +And die as fast as they see others grow; +And nothing 'gainst Time's scythe can make defence +Save breed, to brave him when he takes thee hence. +O, that you were yourself! 
but, love, you are +No longer yours than you yourself here live: +Against this coming end you should prepare, +And your sweet semblance to some other give. +So should that beauty which you hold in lease +Find no determination: then you were +Yourself again after yourself's decease, +When your sweet issue your sweet form should bear. +Who lets so fair a house fall to decay, +Which husbandry in honour might uphold +Against the stormy gusts of winter's day +And barren rage of death's eternal cold? +O, none but unthrifts! Dear my love, you know +You had a father: let your son say so. +Not from the stars do I my judgment pluck; +And yet methinks I have astronomy, +But not to tell of good or evil luck, +Of plagues, of dearths, or seasons' quality; +Nor can I fortune to brief minutes tell, +Pointing to each his thunder, rain and wind, +Or say with princes if it shall go well, +By oft predict that I in heaven find: +But from thine eyes my knowledge I derive, +And, constant stars, in them I read such art +As truth and beauty shall together thrive, +If from thyself to store thou wouldst convert; +Or else of thee this I prognosticate: +Thy end is truth's and beauty's doom and date. +When I consider every thing that grows +Holds in perfection but a little moment, +That this huge stage presenteth nought but shows +Whereon the stars in secret influence comment; +When I perceive that men as plants increase, +Cheered and cheque'd even by the self-same sky, +Vaunt in their youthful sap, at height decrease, +And wear their brave state out of memory; +Then the conceit of this inconstant stay +Sets you most rich in youth before my sight, +Where wasteful Time debateth with Decay, +To change your day of youth to sullied night; +And all in war with Time for love of you, +As he takes from you, I engraft you new. +But wherefore do not you a mightier way +Make war upon this bloody tyrant, Time? +And fortify yourself in your decay +With means more blessed than my barren rhyme? 
+Now stand you on the top of happy hours, +And many maiden gardens yet unset +With virtuous wish would bear your living flowers, +Much liker than your painted counterfeit: +So should the lines of life that life repair, +Which this, Time's pencil, or my pupil pen, +Neither in inward worth nor outward fair, +Can make you live yourself in eyes of men. +To give away yourself keeps yourself still, +And you must live, drawn by your own sweet skill. +Who will believe my verse in time to come, +If it were fill'd with your most high deserts? +Though yet, heaven knows, it is but as a tomb +Which hides your life and shows not half your parts. +If I could write the beauty of your eyes +And in fresh numbers number all your graces, +The age to come would say 'This poet lies: +Such heavenly touches ne'er touch'd earthly faces.' +So should my papers yellow'd with their age +Be scorn'd like old men of less truth than tongue, +And your true rights be term'd a poet's rage +And stretched metre of an antique song: +But were some child of yours alive that time, +You should live twice; in it and in my rhyme. +Shall I compare thee to a summer's day? +Thou art more lovely and more temperate: +Rough winds do shake the darling buds of May, +And summer's lease hath all too short a date: +Sometime too hot the eye of heaven shines, +And often is his gold complexion dimm'd; +And every fair from fair sometime declines, +By chance or nature's changing course untrimm'd; +But thy eternal summer shall not fade +Nor lose possession of that fair thou owest; +Nor shall Death brag thou wander'st in his shade, +When in eternal lines to time thou growest: +So long as men can breathe or eyes can see, +So long lives this and this gives life to thee. 
+Devouring Time, blunt thou the lion's paws, +And make the earth devour her own sweet brood; +Pluck the keen teeth from the fierce tiger's jaws, +And burn the long-lived phoenix in her blood; +Make glad and sorry seasons as thou fleets, +And do whate'er thou wilt, swift-footed Time, +To the wide world and all her fading sweets; +But I forbid thee one most heinous crime: +O, carve not with thy hours my love's fair brow, +Nor draw no lines there with thine antique pen; +Him in thy course untainted do allow +For beauty's pattern to succeeding men. +Yet, do thy worst, old Time: despite thy wrong, +My love shall in my verse ever live young. +A woman's face with Nature's own hand painted +Hast thou, the master-mistress of my passion; +A woman's gentle heart, but not acquainted +With shifting change, as is false women's fashion; +An eye more bright than theirs, less false in rolling, +Gilding the object whereupon it gazeth; +A man in hue, all 'hues' in his controlling, +Much steals men's eyes and women's souls amazeth. +And for a woman wert thou first created; +Till Nature, as she wrought thee, fell a-doting, +And by addition me of thee defeated, +By adding one thing to my purpose nothing. +But since she prick'd thee out for women's pleasure, +Mine be thy love and thy love's use their treasure. +So is it not with me as with that Muse +Stirr'd by a painted beauty to his verse, +Who heaven itself for ornament doth use +And every fair with his fair doth rehearse +Making a couplement of proud compare, +With sun and moon, with earth and sea's rich gems, +With April's first-born flowers, and all things rare +That heaven's air in this huge rondure hems. +O' let me, true in love, but truly write, +And then believe me, my love is as fair +As any mother's child, though not so bright +As those gold candles fix'd in heaven's air: +Let them say more than like of hearsay well; +I will not praise that purpose not to sell. 
+My glass shall not persuade me I am old, +So long as youth and thou are of one date; +But when in thee time's furrows I behold, +Then look I death my days should expiate. +For all that beauty that doth cover thee +Is but the seemly raiment of my heart, +Which in thy breast doth live, as thine in me: +How can I then be elder than thou art? +O, therefore, love, be of thyself so wary +As I, not for myself, but for thee will; +Bearing thy heart, which I will keep so chary +As tender nurse her babe from faring ill. +Presume not on thy heart when mine is slain; +Thou gavest me thine, not to give back again. +As an unperfect actor on the stage +Who with his fear is put besides his part, +Or some fierce thing replete with too much rage, +Whose strength's abundance weakens his own heart. +So I, for fear of trust, forget to say +The perfect ceremony of love's rite, +And in mine own love's strength seem to decay, +O'ercharged with burden of mine own love's might. +O, let my books be then the eloquence +And dumb presagers of my speaking breast, +Who plead for love and look for recompense +More than that tongue that more hath more express'd. +O, learn to read what silent love hath writ: +To hear with eyes belongs to love's fine wit. +Mine eye hath play'd the painter and hath stell'd +Thy beauty's form in table of my heart; +My body is the frame wherein 'tis held, +And perspective it is the painter's art. +For through the painter must you see his skill, +To find where your true image pictured lies; +Which in my bosom's shop is hanging still, +That hath his windows glazed with thine eyes. +Now see what good turns eyes for eyes have done: +Mine eyes have drawn thy shape, and thine for me +Are windows to my breast, where-through the sun +Delights to peep, to gaze therein on thee; +Yet eyes this cunning want to grace their art; +They draw but what they see, know not the heart. 
+Let those who are in favour with their stars +Of public honour and proud titles boast, +Whilst I, whom fortune of such triumph bars, +Unlook'd for joy in that I honour most. +Great princes' favourites their fair leaves spread +But as the marigold at the sun's eye, +And in themselves their pride lies buried, +For at a frown they in their glory die. +The painful warrior famoused for fight, +After a thousand victories once foil'd, +Is from the book of honour razed quite, +And all the rest forgot for which he toil'd: +Then happy I, that love and am beloved +Where I may not remove nor be removed. +Lord of my love, to whom in vassalage +Thy merit hath my duty strongly knit, +To thee I send this written embassage, +To witness duty, not to show my wit: +Duty so great, which wit so poor as mine +May make seem bare, in wanting words to show it, +But that I hope some good conceit of thine +In thy soul's thought, all naked, will bestow it; +Till whatsoever star that guides my moving +Points on me graciously with fair aspect +And puts apparel on my tatter'd loving, +To show me worthy of thy sweet respect: +Then may I dare to boast how I do love thee; +Till then not show my head where thou mayst prove me. +Weary with toil, I haste me to my bed, +The dear repose for limbs with travel tired; +But then begins a journey in my head, +To work my mind, when body's work's expired: +For then my thoughts, from far where I abide, +Intend a zealous pilgrimage to thee, +And keep my drooping eyelids open wide, +Looking on darkness which the blind do see +Save that my soul's imaginary sight +Presents thy shadow to my sightless view, +Which, like a jewel hung in ghastly night, +Makes black night beauteous and her old face new. +Lo! thus, by day my limbs, by night my mind, +For thee and for myself no quiet find. +How can I then return in happy plight, +That am debarr'd the benefit of rest? +When day's oppression is not eased by night, +But day by night, and night by day, oppress'd? 
+And each, though enemies to either's reign,
+Do in consent shake hands to torture me;
+The one by toil, the other to complain
+How far I toil, still farther off from thee.
+I tell the day, to please them thou art bright
+And dost him grace when clouds do blot the heaven:
+So flatter I the swart-complexion'd night,
+When sparkling stars twire not thou gild'st the even.
+But day doth daily draw my sorrows longer
+And night doth nightly make grief's strength seem stronger.
+When, in disgrace with fortune and men's eyes,
+I all alone beweep my outcast state
+And trouble deaf heaven with my bootless cries
+And look upon myself and curse my fate,
+Wishing me like to one more rich in hope,
+Featured like him, like him with friends possess'd,
+Desiring this man's art and that man's scope,
+With what I most enjoy contented least;
+Yet in these thoughts myself almost despising,
+Haply I think on thee, and then my state,
+Like to the lark at break of day arising
+From sullen earth, sings hymns at heaven's gate;
+For thy sweet love remember'd such wealth brings
+That then I scorn to change my state with kings.
+When to the sessions of sweet silent thought
+I summon up remembrance of things past,
+I sigh the lack of many a thing I sought,
+And with old woes new wail my dear time's waste:
+Then can I drown an eye, unused to flow,
+For precious friends hid in death's dateless night,
+And weep afresh love's long since cancell'd woe,
+And moan the expense of many a vanish'd sight:
+Then can I grieve at grievances foregone,
+And heavily from woe to woe tell o'er
+The sad account of fore-bemoaned moan,
+Which I new pay as if not paid before.
+But if the while I think on thee, dear friend,
+All losses are restored and sorrows end.
+Thy bosom is endeared with all hearts,
+Which I by lacking have supposed dead,
+And there reigns love and all love's loving parts,
+And all those friends which I thought buried. 
+How many a holy and obsequious tear +Hath dear religious love stol'n from mine eye +As interest of the dead, which now appear +But things removed that hidden in thee lie! +Thou art the grave where buried love doth live, +Hung with the trophies of my lovers gone, +Who all their parts of me to thee did give; +That due of many now is thine alone: +Their images I loved I view in thee, +And thou, all they, hast all the all of me. +If thou survive my well-contented day, +When that churl Death my bones with dust shall cover, +And shalt by fortune once more re-survey +These poor rude lines of thy deceased lover, +Compare them with the bettering of the time, +And though they be outstripp'd by every pen, +Reserve them for my love, not for their rhyme, +Exceeded by the height of happier men. +O, then vouchsafe me but this loving thought: +'Had my friend's Muse grown with this growing age, +A dearer birth than this his love had brought, +To march in ranks of better equipage: +But since he died and poets better prove, +Theirs for their style I'll read, his for his love.' +Full many a glorious morning have I seen +Flatter the mountain-tops with sovereign eye, +Kissing with golden face the meadows green, +Gilding pale streams with heavenly alchemy; +Anon permit the basest clouds to ride +With ugly rack on his celestial face, +And from the forlorn world his visage hide, +Stealing unseen to west with this disgrace: +Even so my sun one early morn did shine +With all triumphant splendor on my brow; +But out, alack! he was but one hour mine; +The region cloud hath mask'd him from me now. +Yet him for this my love no whit disdaineth; +Suns of the world may stain when heaven's sun staineth. +Why didst thou promise such a beauteous day, +And make me travel forth without my cloak, +To let base clouds o'ertake me in my way, +Hiding thy bravery in their rotten smoke? 
+'Tis not enough that through the cloud thou break, +To dry the rain on my storm-beaten face, +For no man well of such a salve can speak +That heals the wound and cures not the disgrace: +Nor can thy shame give physic to my grief; +Though thou repent, yet I have still the loss: +The offender's sorrow lends but weak relief +To him that bears the strong offence's cross. +Ah! but those tears are pearl which thy love sheds, +And they are rich and ransom all ill deeds. +No more be grieved at that which thou hast done: +Roses have thorns, and silver fountains mud; +Clouds and eclipses stain both moon and sun, +And loathsome canker lives in sweetest bud. +All men make faults, and even I in this, +Authorizing thy trespass with compare, +Myself corrupting, salving thy amiss, +Excusing thy sins more than thy sins are; +For to thy sensual fault I bring in sense-- +Thy adverse party is thy advocate-- +And 'gainst myself a lawful plea commence: +Such civil war is in my love and hate +That I an accessary needs must be +To that sweet thief which sourly robs from me. +Let me confess that we two must be twain, +Although our undivided loves are one: +So shall those blots that do with me remain +Without thy help by me be borne alone. +In our two loves there is but one respect, +Though in our lives a separable spite, +Which though it alter not love's sole effect, +Yet doth it steal sweet hours from love's delight. +I may not evermore acknowledge thee, +Lest my bewailed guilt should do thee shame, +Nor thou with public kindness honour me, +Unless thou take that honour from thy name: +But do not so; I love thee in such sort +As, thou being mine, mine is thy good report. +As a decrepit father takes delight +To see his active child do deeds of youth, +So I, made lame by fortune's dearest spite, +Take all my comfort of thy worth and truth. 
+For whether beauty, birth, or wealth, or wit,
+Or any of these all, or all, or more,
+Entitled in thy parts do crowned sit,
+I make my love engrafted to this store:
+So then I am not lame, poor, nor despised,
+Whilst that this shadow doth such substance give
+That I in thy abundance am sufficed
+And by a part of all thy glory live.
+Look, what is best, that best I wish in thee:
+This wish I have; then ten times happy me! FROM fairest creatures we desire increase,
+That thereby beauty's rose might never die,
+But as the riper should by time decease,
+His tender heir might bear his memory:
+But thou, contracted to thine own bright eyes,
+Feed'st thy light'st flame with self-substantial fuel,
+Making a famine where abundance lies,
+Thyself thy foe, to thy sweet self too cruel.
+Thou that art now the world's fresh ornament
+And only herald to the gaudy spring,
+Within thine own bud buriest thy content
+And, tender churl, makest waste in niggarding.
+Pity the world, or else this glutton be,
+To eat the world's due, by the grave and thee.
+When forty winters shall beseige thy brow,
+And dig deep trenches in thy beauty's field,
+Thy youth's proud livery, so gazed on now,
+Will be a tatter'd weed, of small worth held:
+Then being ask'd where all thy beauty lies,
+Where all the treasure of thy lusty days,
+To say, within thine own deep-sunken eyes,
+Were an all-eating shame and thriftless praise.
+How much more praise deserved thy beauty's use,
+If thou couldst answer 'This fair child of mine
+Shall sum my count and make my old excuse,'
+Proving his beauty by succession thine!
+This were to be new made when thou art old,
+And see thy blood warm when thou feel'st it cold.
+Look in thy glass, and tell the face thou viewest
+Now is the time that face should form another;
+Whose fresh repair if now thou not renewest,
+Thou dost beguile the world, unbless some mother.
+For where is she so fair whose unear'd womb
+Disdains the tillage of thy husbandry? 
+Or who is he so fond will be the tomb +Of his self-love, to stop posterity? +Thou art thy mother's glass, and she in thee +Calls back the lovely April of her prime: +So thou through windows of thine age shall see +Despite of wrinkles this thy golden time. +But if thou live, remember'd not to be, +Die single, and thine image dies with thee. +Unthrifty loveliness, why dost thou spend +Upon thyself thy beauty's legacy? +Nature's bequest gives nothing but doth lend, +And being frank she lends to those are free. +Then, beauteous niggard, why dost thou abuse +The bounteous largess given thee to give? +Profitless usurer, why dost thou use +So great a sum of sums, yet canst not live? +For having traffic with thyself alone, +Thou of thyself thy sweet self dost deceive. +Then how, when nature calls thee to be gone, +What acceptable audit canst thou leave? +Thy unused beauty must be tomb'd with thee, +Which, used, lives th' executor to be. +Those hours, that with gentle work did frame +The lovely gaze where every eye doth dwell, +Will play the tyrants to the very same +And that unfair which fairly doth excel: +For never-resting time leads summer on +To hideous winter and confounds him there; +Sap cheque'd with frost and lusty leaves quite gone, +Beauty o'ersnow'd and bareness every where: +Then, were not summer's distillation left, +A liquid prisoner pent in walls of glass, +Beauty's effect with beauty were bereft, +Nor it nor no remembrance what it was: +But flowers distill'd though they with winter meet, +Leese but their show; their substance still lives sweet. +Then let not winter's ragged hand deface +In thee thy summer, ere thou be distill'd: +Make sweet some vial; treasure thou some place +With beauty's treasure, ere it be self-kill'd. 
+That use is not forbidden usury, +Which happies those that pay the willing loan; +That's for thyself to breed another thee, +Or ten times happier, be it ten for one; +Ten times thyself were happier than thou art, +If ten of thine ten times refigured thee: +Then what could death do, if thou shouldst depart, +Leaving thee living in posterity? +Be not self-will'd, for thou art much too fair +To be death's conquest and make worms thine heir. +Lo! in the orient when the gracious light +Lifts up his burning head, each under eye +Doth homage to his new-appearing sight, +Serving with looks his sacred majesty; +And having climb'd the steep-up heavenly hill, +Resembling strong youth in his middle age, +yet mortal looks adore his beauty still, +Attending on his golden pilgrimage; +But when from highmost pitch, with weary car, +Like feeble age, he reeleth from the day, +The eyes, 'fore duteous, now converted are +From his low tract and look another way: +So thou, thyself out-going in thy noon, +Unlook'd on diest, unless thou get a son. +Music to hear, why hear'st thou music sadly? +Sweets with sweets war not, joy delights in joy. +Why lovest thou that which thou receivest not gladly, +Or else receivest with pleasure thine annoy? +If the true concord of well-tuned sounds, +By unions married, do offend thine ear, +They do but sweetly chide thee, who confounds +In singleness the parts that thou shouldst bear. +Mark how one string, sweet husband to another, +Strikes each in each by mutual ordering, +Resembling sire and child and happy mother +Who all in one, one pleasing note do sing: +Whose speechless song, being many, seeming one, +Sings this to thee: 'thou single wilt prove none.' +Is it for fear to wet a widow's eye +That thou consumest thyself in single life? +Ah! if thou issueless shalt hap to die. 
+The world will wail thee, like a makeless wife; +The world will be thy widow and still weep +That thou no form of thee hast left behind, +When every private widow well may keep +By children's eyes her husband's shape in mind. +Look, what an unthrift in the world doth spend +Shifts but his place, for still the world enjoys it; +But beauty's waste hath in the world an end, +And kept unused, the user so destroys it. +No love toward others in that bosom sits +That on himself such murderous shame commits. +For shame! deny that thou bear'st love to any, +Who for thyself art so unprovident. +Grant, if thou wilt, thou art beloved of many, +But that thou none lovest is most evident; +For thou art so possess'd with murderous hate +That 'gainst thyself thou stick'st not to conspire. +Seeking that beauteous roof to ruinate +Which to repair should be thy chief desire. +O, change thy thought, that I may change my mind! +Shall hate be fairer lodged than gentle love? +Be, as thy presence is, gracious and kind, +Or to thyself at least kind-hearted prove: +Make thee another self, for love of me, +That beauty still may live in thine or thee. +As fast as thou shalt wane, so fast thou growest +In one of thine, from that which thou departest; +And that fresh blood which youngly thou bestowest +Thou mayst call thine when thou from youth convertest. +Herein lives wisdom, beauty and increase: +Without this, folly, age and cold decay: +If all were minded so, the times should cease +And threescore year would make the world away. +Let those whom Nature hath not made for store, +Harsh featureless and rude, barrenly perish: +Look, whom she best endow'd she gave the more; +Which bounteous gift thou shouldst in bounty cherish: +She carved thee for her seal, and meant thereby +Thou shouldst print more, not let that copy die. 
+When I do count the clock that tells the time, +And see the brave day sunk in hideous night; +When I behold the violet past prime, +And sable curls all silver'd o'er with white; +When lofty trees I see barren of leaves +Which erst from heat did canopy the herd, +And summer's green all girded up in sheaves +Borne on the bier with white and bristly beard, +Then of thy beauty do I question make, +That thou among the wastes of time must go, +Since sweets and beauties do themselves forsake +And die as fast as they see others grow; +And nothing 'gainst Time's scythe can make defence +Save breed, to brave him when he takes thee hence. +O, that you were yourself! but, love, you are +No longer yours than you yourself here live: +Against this coming end you should prepare, +And your sweet semblance to some other give. +So should that beauty which you hold in lease +Find no determination: then you were +Yourself again after yourself's decease, +When your sweet issue your sweet form should bear. +Who lets so fair a house fall to decay, +Which husbandry in honour might uphold +Against the stormy gusts of winter's day +And barren rage of death's eternal cold? +O, none but unthrifts! Dear my love, you know +You had a father: let your son say so. +Not from the stars do I my judgment pluck; +And yet methinks I have astronomy, +But not to tell of good or evil luck, +Of plagues, of dearths, or seasons' quality; +Nor can I fortune to brief minutes tell, +Pointing to each his thunder, rain and wind, +Or say with princes if it shall go well, +By oft predict that I in heaven find: +But from thine eyes my knowledge I derive, +And, constant stars, in them I read such art +As truth and beauty shall together thrive, +If from thyself to store thou wouldst convert; +Or else of thee this I prognosticate: +Thy end is truth's and beauty's doom and date. 
+When I consider every thing that grows +Holds in perfection but a little moment, +That this huge stage presenteth nought but shows +Whereon the stars in secret influence comment; +When I perceive that men as plants increase, +Cheered and cheque'd even by the self-same sky, +Vaunt in their youthful sap, at height decrease, +And wear their brave state out of memory; +Then the conceit of this inconstant stay +Sets you most rich in youth before my sight, +Where wasteful Time debateth with Decay, +To change your day of youth to sullied night; +And all in war with Time for love of you, +As he takes from you, I engraft you new. +But wherefore do not you a mightier way +Make war upon this bloody tyrant, Time? +And fortify yourself in your decay +With means more blessed than my barren rhyme? +Now stand you on the top of happy hours, +And many maiden gardens yet unset +With virtuous wish would bear your living flowers, +Much liker than your painted counterfeit: +So should the lines of life that life repair, +Which this, Time's pencil, or my pupil pen, +Neither in inward worth nor outward fair, +Can make you live yourself in eyes of men. +To give away yourself keeps yourself still, +And you must live, drawn by your own sweet skill. +Who will believe my verse in time to come, +If it were fill'd with your most high deserts? +Though yet, heaven knows, it is but as a tomb +Which hides your life and shows not half your parts. +If I could write the beauty of your eyes +And in fresh numbers number all your graces, +The age to come would say 'This poet lies: +Such heavenly touches ne'er touch'd earthly faces.' +So should my papers yellow'd with their age +Be scorn'd like old men of less truth than tongue, +And your true rights be term'd a poet's rage +And stretched metre of an antique song: +But were some child of yours alive that time, +You should live twice; in it and in my rhyme. +Shall I compare thee to a summer's day? 
+Thou art more lovely and more temperate: +Rough winds do shake the darling buds of May, +And summer's lease hath all too short a date: +Sometime too hot the eye of heaven shines, +And often is his gold complexion dimm'd; +And every fair from fair sometime declines, +By chance or nature's changing course untrimm'd; +But thy eternal summer shall not fade +Nor lose possession of that fair thou owest; +Nor shall Death brag thou wander'st in his shade, +When in eternal lines to time thou growest: +So long as men can breathe or eyes can see, +So long lives this and this gives life to thee. +Devouring Time, blunt thou the lion's paws, +And make the earth devour her own sweet brood; +Pluck the keen teeth from the fierce tiger's jaws, +And burn the long-lived phoenix in her blood; +Make glad and sorry seasons as thou fleets, +And do whate'er thou wilt, swift-footed Time, +To the wide world and all her fading sweets; +But I forbid thee one most heinous crime: +O, carve not with thy hours my love's fair brow, +Nor draw no lines there with thine antique pen; +Him in thy course untainted do allow +For beauty's pattern to succeeding men. +Yet, do thy worst, old Time: despite thy wrong, +My love shall in my verse ever live young. +A woman's face with Nature's own hand painted +Hast thou, the master-mistress of my passion; +A woman's gentle heart, but not acquainted +With shifting change, as is false women's fashion; +An eye more bright than theirs, less false in rolling, +Gilding the object whereupon it gazeth; +A man in hue, all 'hues' in his controlling, +Much steals men's eyes and women's souls amazeth. +And for a woman wert thou first created; +Till Nature, as she wrought thee, fell a-doting, +And by addition me of thee defeated, +By adding one thing to my purpose nothing. +But since she prick'd thee out for women's pleasure, +Mine be thy love and thy love's use their treasure. 
+So is it not with me as with that Muse +Stirr'd by a painted beauty to his verse, +Who heaven itself for ornament doth use +And every fair with his fair doth rehearse +Making a couplement of proud compare, +With sun and moon, with earth and sea's rich gems, +With April's first-born flowers, and all things rare +That heaven's air in this huge rondure hems. +O' let me, true in love, but truly write, +And then believe me, my love is as fair +As any mother's child, though not so bright +As those gold candles fix'd in heaven's air: +Let them say more than like of hearsay well; +I will not praise that purpose not to sell. +My glass shall not persuade me I am old, +So long as youth and thou are of one date; +But when in thee time's furrows I behold, +Then look I death my days should expiate. +For all that beauty that doth cover thee +Is but the seemly raiment of my heart, +Which in thy breast doth live, as thine in me: +How can I then be elder than thou art? +O, therefore, love, be of thyself so wary +As I, not for myself, but for thee will; +Bearing thy heart, which I will keep so chary +As tender nurse her babe from faring ill. +Presume not on thy heart when mine is slain; +Thou gavest me thine, not to give back again. +As an unperfect actor on the stage +Who with his fear is put besides his part, +Or some fierce thing replete with too much rage, +Whose strength's abundance weakens his own heart. +So I, for fear of trust, forget to say +The perfect ceremony of love's rite, +And in mine own love's strength seem to decay, +O'ercharged with burden of mine own love's might. +O, let my books be then the eloquence +And dumb presagers of my speaking breast, +Who plead for love and look for recompense +More than that tongue that more hath more express'd. +O, learn to read what silent love hath writ: +To hear with eyes belongs to love's fine wit. 
+Mine eye hath play'd the painter and hath stell'd +Thy beauty's form in table of my heart; +My body is the frame wherein 'tis held, +And perspective it is the painter's art. +For through the painter must you see his skill, +To find where your true image pictured lies; +Which in my bosom's shop is hanging still, +That hath his windows glazed with thine eyes. +Now see what good turns eyes for eyes have done: +Mine eyes have drawn thy shape, and thine for me +Are windows to my breast, where-through the sun +Delights to peep, to gaze therein on thee; +Yet eyes this cunning want to grace their art; +They draw but what they see, know not the heart. +Let those who are in favour with their stars +Of public honour and proud titles boast, +Whilst I, whom fortune of such triumph bars, +Unlook'd for joy in that I honour most. +Great princes' favourites their fair leaves spread +But as the marigold at the sun's eye, +And in themselves their pride lies buried, +For at a frown they in their glory die. +The painful warrior famoused for fight, +After a thousand victories once foil'd, +Is from the book of honour razed quite, +And all the rest forgot for which he toil'd: +Then happy I, that love and am beloved +Where I may not remove nor be removed. +Lord of my love, to whom in vassalage +Thy merit hath my duty strongly knit, +To thee I send this written embassage, +To witness duty, not to show my wit: +Duty so great, which wit so poor as mine +May make seem bare, in wanting words to show it, +But that I hope some good conceit of thine +In thy soul's thought, all naked, will bestow it; +Till whatsoever star that guides my moving +Points on me graciously with fair aspect +And puts apparel on my tatter'd loving, +To show me worthy of thy sweet respect: +Then may I dare to boast how I do love thee; +Till then not show my head where thou mayst prove me. 
+Weary with toil, I haste me to my bed,
+The dear repose for limbs with travel tired;
+But then begins a journey in my head,
+To work my mind, when body's work's expired:
+For then my thoughts, from far where I abide,
+Intend a zealous pilgrimage to thee,
+And keep my drooping eyelids open wide,
+Looking on darkness which the blind do see
+Save that my soul's imaginary sight
+Presents thy shadow to my sightless view,
+Which, like a jewel hung in ghastly night,
+Makes black night beauteous and her old face new.
+Lo! thus, by day my limbs, by night my mind,
+For thee and for myself no quiet find.
+How can I then return in happy plight,
+That am debarr'd the benefit of rest?
+When day's oppression is not eased by night,
+But day by night, and night by day, oppress'd?
+And each, though enemies to either's reign,
+Do in consent shake hands to torture me;
+The one by toil, the other to complain
+How far I toil, still farther off from thee.
+I tell the day, to please them thou art bright
+And dost him grace when clouds do blot the heaven:
+So flatter I the swart-complexion'd night,
+When sparkling stars twire not thou gild'st the even.
+But day doth daily draw my sorrows longer
+And night doth nightly make grief's strength seem stronger.
+When, in disgrace with fortune and men's eyes,
+I all alone beweep my outcast state
+And trouble deaf heaven with my bootless cries
+And look upon myself and curse my fate,
+Wishing me like to one more rich in hope,
+Featured like him, like him with friends possess'd,
+Desiring this man's art and that man's scope,
+With what I most enjoy contented least;
+Yet in these thoughts myself almost despising,
+Haply I think on thee, and then my state,
+Like to the lark at break of day arising
+From sullen earth, sings hymns at heaven's gate;
+For thy sweet love remember'd such wealth brings
+That then I scorn to change my state with kings. 
+When to the sessions of sweet silent thought +I summon up remembrance of things past, +I sigh the lack of many a thing I sought, +And with old woes new wail my dear time's waste: +Then can I drown an eye, unused to flow, +For precious friends hid in death's dateless night, +And weep afresh love's long since cancell'd woe, +And moan the expense of many a vanish'd sight: +Then can I grieve at grievances foregone, +And heavily from woe to woe tell o'er +The sad account of fore-bemoaned moan, +Which I new pay as if not paid before. +But if the while I think on thee, dear friend, +All losses are restored and sorrows end. +Thy bosom is endeared with all hearts, +Which I by lacking have supposed dead, +And there reigns love and all love's loving parts, +And all those friends which I thought buried. +How many a holy and obsequious tear +Hath dear religious love stol'n from mine eye +As interest of the dead, which now appear +But things removed that hidden in thee lie! +Thou art the grave where buried love doth live, +Hung with the trophies of my lovers gone, +Who all their parts of me to thee did give; +That due of many now is thine alone: +Their images I loved I view in thee, +And thou, all they, hast all the all of me. +If thou survive my well-contented day, +When that churl Death my bones with dust shall cover, +And shalt by fortune once more re-survey +These poor rude lines of thy deceased lover, +Compare them with the bettering of the time, +And though they be outstripp'd by every pen, +Reserve them for my love, not for their rhyme, +Exceeded by the height of happier men. +O, then vouchsafe me but this loving thought: +'Had my friend's Muse grown with this growing age, +A dearer birth than this his love had brought, +To march in ranks of better equipage: +But since he died and poets better prove, +Theirs for their style I'll read, his for his love.' 
+Full many a glorious morning have I seen +Flatter the mountain-tops with sovereign eye, +Kissing with golden face the meadows green, +Gilding pale streams with heavenly alchemy; +Anon permit the basest clouds to ride +With ugly rack on his celestial face, +And from the forlorn world his visage hide, +Stealing unseen to west with this disgrace: +Even so my sun one early morn did shine +With all triumphant splendor on my brow; +But out, alack! he was but one hour mine; +The region cloud hath mask'd him from me now. +Yet him for this my love no whit disdaineth; +Suns of the world may stain when heaven's sun staineth. +Why didst thou promise such a beauteous day, +And make me travel forth without my cloak, +To let base clouds o'ertake me in my way, +Hiding thy bravery in their rotten smoke? +'Tis not enough that through the cloud thou break, +To dry the rain on my storm-beaten face, +For no man well of such a salve can speak +That heals the wound and cures not the disgrace: +Nor can thy shame give physic to my grief; +Though thou repent, yet I have still the loss: +The offender's sorrow lends but weak relief +To him that bears the strong offence's cross. +Ah! but those tears are pearl which thy love sheds, +And they are rich and ransom all ill deeds. +No more be grieved at that which thou hast done: +Roses have thorns, and silver fountains mud; +Clouds and eclipses stain both moon and sun, +And loathsome canker lives in sweetest bud. +All men make faults, and even I in this, +Authorizing thy trespass with compare, +Myself corrupting, salving thy amiss, +Excusing thy sins more than thy sins are; +For to thy sensual fault I bring in sense-- +Thy adverse party is thy advocate-- +And 'gainst myself a lawful plea commence: +Such civil war is in my love and hate +That I an accessary needs must be +To that sweet thief which sourly robs from me. 
+Let me confess that we two must be twain,
+Although our undivided loves are one:
+So shall those blots that do with me remain
+Without thy help by me be borne alone.
+In our two loves there is but one respect,
+Though in our lives a separable spite,
+Which though it alter not love's sole effect,
+Yet doth it steal sweet hours from love's delight.
+I may not evermore acknowledge thee,
+Lest my bewailed guilt should do thee shame,
+Nor thou with public kindness honour me,
+Unless thou take that honour from thy name:
+But do not so; I love thee in such sort
+As, thou being mine, mine is thy good report.
+As a decrepit father takes delight
+To see his active child do deeds of youth,
+So I, made lame by fortune's dearest spite,
+Take all my comfort of thy worth and truth.
+For whether beauty, birth, or wealth, or wit,
+Or any of these all, or all, or more,
+Entitled in thy parts do crowned sit,
+I make my love engrafted to this store:
+So then I am not lame, poor, nor despised,
+Whilst that this shadow doth such substance give
+That I in thy abundance am sufficed
+And by a part of all thy glory live.
+Look, what is best, that best I wish in thee:
+This wish I have; then ten times happy me! FROM fairest creatures we desire increase,
+That thereby beauty's rose might never die,
+But as the riper should by time decease,
+His tender heir might bear his memory:
+But thou, contracted to thine own bright eyes,
+Feed'st thy light'st flame with self-substantial fuel,
+Making a famine where abundance lies,
+Thyself thy foe, to thy sweet self too cruel.
+Thou that art now the world's fresh ornament
+And only herald to the gaudy spring,
+Within thine own bud buriest thy content
+And, tender churl, makest waste in niggarding.
+Pity the world, or else this glutton be,
+To eat the world's due, by the grave and thee. 
+When forty winters shall beseige thy brow, +And dig deep trenches in thy beauty's field, +Thy youth's proud livery, so gazed on now, +Will be a tatter'd weed, of small worth held: +Then being ask'd where all thy beauty lies, +Where all the treasure of thy lusty days, +To say, within thine own deep-sunken eyes, +Were an all-eating shame and thriftless praise. +How much more praise deserved thy beauty's use, +If thou couldst answer 'This fair child of mine +Shall sum my count and make my old excuse,' +Proving his beauty by succession thine! +This were to be new made when thou art old, +And see thy blood warm when thou feel'st it cold. +Look in thy glass, and tell the face thou viewest +Now is the time that face should form another; +Whose fresh repair if now thou not renewest, +Thou dost beguile the world, unbless some mother. +For where is she so fair whose unear'd womb +Disdains the tillage of thy husbandry? +Or who is he so fond will be the tomb +Of his self-love, to stop posterity? +Thou art thy mother's glass, and she in thee +Calls back the lovely April of her prime: +So thou through windows of thine age shall see +Despite of wrinkles this thy golden time. +But if thou live, remember'd not to be, +Die single, and thine image dies with thee. +Unthrifty loveliness, why dost thou spend +Upon thyself thy beauty's legacy? +Nature's bequest gives nothing but doth lend, +And being frank she lends to those are free. +Then, beauteous niggard, why dost thou abuse +The bounteous largess given thee to give? +Profitless usurer, why dost thou use +So great a sum of sums, yet canst not live? +For having traffic with thyself alone, +Thou of thyself thy sweet self dost deceive. +Then how, when nature calls thee to be gone, +What acceptable audit canst thou leave? +Thy unused beauty must be tomb'd with thee, +Which, used, lives th' executor to be. 
+Those hours, that with gentle work did frame +The lovely gaze where every eye doth dwell, +Will play the tyrants to the very same +And that unfair which fairly doth excel: +For never-resting time leads summer on +To hideous winter and confounds him there; +Sap cheque'd with frost and lusty leaves quite gone, +Beauty o'ersnow'd and bareness every where: +Then, were not summer's distillation left, +A liquid prisoner pent in walls of glass, +Beauty's effect with beauty were bereft, +Nor it nor no remembrance what it was: +But flowers distill'd though they with winter meet, +Leese but their show; their substance still lives sweet. +Then let not winter's ragged hand deface +In thee thy summer, ere thou be distill'd: +Make sweet some vial; treasure thou some place +With beauty's treasure, ere it be self-kill'd. +That use is not forbidden usury, +Which happies those that pay the willing loan; +That's for thyself to breed another thee, +Or ten times happier, be it ten for one; +Ten times thyself were happier than thou art, +If ten of thine ten times refigured thee: +Then what could death do, if thou shouldst depart, +Leaving thee living in posterity? +Be not self-will'd, for thou art much too fair +To be death's conquest and make worms thine heir. +Lo! in the orient when the gracious light +Lifts up his burning head, each under eye +Doth homage to his new-appearing sight, +Serving with looks his sacred majesty; +And having climb'd the steep-up heavenly hill, +Resembling strong youth in his middle age, +yet mortal looks adore his beauty still, +Attending on his golden pilgrimage; +But when from highmost pitch, with weary car, +Like feeble age, he reeleth from the day, +The eyes, 'fore duteous, now converted are +From his low tract and look another way: +So thou, thyself out-going in thy noon, +Unlook'd on diest, unless thou get a son. +Music to hear, why hear'st thou music sadly? +Sweets with sweets war not, joy delights in joy. 
+Why lovest thou that which thou receivest not gladly, +Or else receivest with pleasure thine annoy? +If the true concord of well-tuned sounds, +By unions married, do offend thine ear, +They do but sweetly chide thee, who confounds +In singleness the parts that thou shouldst bear. +Mark how one string, sweet husband to another, +Strikes each in each by mutual ordering, +Resembling sire and child and happy mother +Who all in one, one pleasing note do sing: +Whose speechless song, being many, seeming one, +Sings this to thee: 'thou single wilt prove none.' +Is it for fear to wet a widow's eye +That thou consumest thyself in single life? +Ah! if thou issueless shalt hap to die. +The world will wail thee, like a makeless wife; +The world will be thy widow and still weep +That thou no form of thee hast left behind, +When every private widow well may keep +By children's eyes her husband's shape in mind. +Look, what an unthrift in the world doth spend +Shifts but his place, for still the world enjoys it; +But beauty's waste hath in the world an end, +And kept unused, the user so destroys it. +No love toward others in that bosom sits +That on himself such murderous shame commits. +For shame! deny that thou bear'st love to any, +Who for thyself art so unprovident. +Grant, if thou wilt, thou art beloved of many, +But that thou none lovest is most evident; +For thou art so possess'd with murderous hate +That 'gainst thyself thou stick'st not to conspire. +Seeking that beauteous roof to ruinate +Which to repair should be thy chief desire. +O, change thy thought, that I may change my mind! +Shall hate be fairer lodged than gentle love? +Be, as thy presence is, gracious and kind, +Or to thyself at least kind-hearted prove: +Make thee another self, for love of me, +That beauty still may live in thine or thee. 
+As fast as thou shalt wane, so fast thou growest +In one of thine, from that which thou departest; +And that fresh blood which youngly thou bestowest +Thou mayst call thine when thou from youth convertest. +Herein lives wisdom, beauty and increase: +Without this, folly, age and cold decay: +If all were minded so, the times should cease +And threescore year would make the world away. +Let those whom Nature hath not made for store, +Harsh featureless and rude, barrenly perish: +Look, whom she best endow'd she gave the more; +Which bounteous gift thou shouldst in bounty cherish: +She carved thee for her seal, and meant thereby +Thou shouldst print more, not let that copy die. +When I do count the clock that tells the time, +And see the brave day sunk in hideous night; +When I behold the violet past prime, +And sable curls all silver'd o'er with white; +When lofty trees I see barren of leaves +Which erst from heat did canopy the herd, +And summer's green all girded up in sheaves +Borne on the bier with white and bristly beard, +Then of thy beauty do I question make, +That thou among the wastes of time must go, +Since sweets and beauties do themselves forsake +And die as fast as they see others grow; +And nothing 'gainst Time's scythe can make defence +Save breed, to brave him when he takes thee hence. +O, that you were yourself! but, love, you are +No longer yours than you yourself here live: +Against this coming end you should prepare, +And your sweet semblance to some other give. +So should that beauty which you hold in lease +Find no determination: then you were +Yourself again after yourself's decease, +When your sweet issue your sweet form should bear. +Who lets so fair a house fall to decay, +Which husbandry in honour might uphold +Against the stormy gusts of winter's day +And barren rage of death's eternal cold? +O, none but unthrifts! Dear my love, you know +You had a father: let your son say so. 
+Not from the stars do I my judgment pluck; +And yet methinks I have astronomy, +But not to tell of good or evil luck, +Of plagues, of dearths, or seasons' quality; +Nor can I fortune to brief minutes tell, +Pointing to each his thunder, rain and wind, +Or say with princes if it shall go well, +By oft predict that I in heaven find: +But from thine eyes my knowledge I derive, +And, constant stars, in them I read such art +As truth and beauty shall together thrive, +If from thyself to store thou wouldst convert; +Or else of thee this I prognosticate: +Thy end is truth's and beauty's doom and date. +When I consider every thing that grows +Holds in perfection but a little moment, +That this huge stage presenteth nought but shows +Whereon the stars in secret influence comment; +When I perceive that men as plants increase, +Cheered and cheque'd even by the self-same sky, +Vaunt in their youthful sap, at height decrease, +And wear their brave state out of memory; +Then the conceit of this inconstant stay +Sets you most rich in youth before my sight, +Where wasteful Time debateth with Decay, +To change your day of youth to sullied night; +And all in war with Time for love of you, +As he takes from you, I engraft you new. +But wherefore do not you a mightier way +Make war upon this bloody tyrant, Time? +And fortify yourself in your decay +With means more blessed than my barren rhyme? +Now stand you on the top of happy hours, +And many maiden gardens yet unset +With virtuous wish would bear your living flowers, +Much liker than your painted counterfeit: +So should the lines of life that life repair, +Which this, Time's pencil, or my pupil pen, +Neither in inward worth nor outward fair, +Can make you live yourself in eyes of men. +To give away yourself keeps yourself still, +And you must live, drawn by your own sweet skill. +Who will believe my verse in time to come, +If it were fill'd with your most high deserts? 
+Though yet, heaven knows, it is but as a tomb +Which hides your life and shows not half your parts. +If I could write the beauty of your eyes +And in fresh numbers number all your graces, +The age to come would say 'This poet lies: +Such heavenly touches ne'er touch'd earthly faces.' +So should my papers yellow'd with their age +Be scorn'd like old men of less truth than tongue, +And your true rights be term'd a poet's rage +And stretched metre of an antique song: +But were some child of yours alive that time, +You should live twice; in it and in my rhyme. +Shall I compare thee to a summer's day? +Thou art more lovely and more temperate: +Rough winds do shake the darling buds of May, +And summer's lease hath all too short a date: +Sometime too hot the eye of heaven shines, +And often is his gold complexion dimm'd; +And every fair from fair sometime declines, +By chance or nature's changing course untrimm'd; +But thy eternal summer shall not fade +Nor lose possession of that fair thou owest; +Nor shall Death brag thou wander'st in his shade, +When in eternal lines to time thou growest: +So long as men can breathe or eyes can see, +So long lives this and this gives life to thee. +Devouring Time, blunt thou the lion's paws, +And make the earth devour her own sweet brood; +Pluck the keen teeth from the fierce tiger's jaws, +And burn the long-lived phoenix in her blood; +Make glad and sorry seasons as thou fleets, +And do whate'er thou wilt, swift-footed Time, +To the wide world and all her fading sweets; +But I forbid thee one most heinous crime: +O, carve not with thy hours my love's fair brow, +Nor draw no lines there with thine antique pen; +Him in thy course untainted do allow +For beauty's pattern to succeeding men. +Yet, do thy worst, old Time: despite thy wrong, +My love shall in my verse ever live young. 
+A woman's face with Nature's own hand painted +Hast thou, the master-mistress of my passion; +A woman's gentle heart, but not acquainted +With shifting change, as is false women's fashion; +An eye more bright than theirs, less false in rolling, +Gilding the object whereupon it gazeth; +A man in hue, all 'hues' in his controlling, +Much steals men's eyes and women's souls amazeth. +And for a woman wert thou first created; +Till Nature, as she wrought thee, fell a-doting, +And by addition me of thee defeated, +By adding one thing to my purpose nothing. +But since she prick'd thee out for women's pleasure, +Mine be thy love and thy love's use their treasure. +So is it not with me as with that Muse +Stirr'd by a painted beauty to his verse, +Who heaven itself for ornament doth use +And every fair with his fair doth rehearse +Making a couplement of proud compare, +With sun and moon, with earth and sea's rich gems, +With April's first-born flowers, and all things rare +That heaven's air in this huge rondure hems. +O' let me, true in love, but truly write, +And then believe me, my love is as fair +As any mother's child, though not so bright +As those gold candles fix'd in heaven's air: +Let them say more than like of hearsay well; +I will not praise that purpose not to sell. +My glass shall not persuade me I am old, +So long as youth and thou are of one date; +But when in thee time's furrows I behold, +Then look I death my days should expiate. +For all that beauty that doth cover thee +Is but the seemly raiment of my heart, +Which in thy breast doth live, as thine in me: +How can I then be elder than thou art? +O, therefore, love, be of thyself so wary +As I, not for myself, but for thee will; +Bearing thy heart, which I will keep so chary +As tender nurse her babe from faring ill. +Presume not on thy heart when mine is slain; +Thou gavest me thine, not to give back again. 
+As an unperfect actor on the stage +Who with his fear is put besides his part, +Or some fierce thing replete with too much rage, +Whose strength's abundance weakens his own heart. +So I, for fear of trust, forget to say +The perfect ceremony of love's rite, +And in mine own love's strength seem to decay, +O'ercharged with burden of mine own love's might. +O, let my books be then the eloquence +And dumb presagers of my speaking breast, +Who plead for love and look for recompense +More than that tongue that more hath more express'd. +O, learn to read what silent love hath writ: +To hear with eyes belongs to love's fine wit. +Mine eye hath play'd the painter and hath stell'd +Thy beauty's form in table of my heart; +My body is the frame wherein 'tis held, +And perspective it is the painter's art. +For through the painter must you see his skill, +To find where your true image pictured lies; +Which in my bosom's shop is hanging still, +That hath his windows glazed with thine eyes. +Now see what good turns eyes for eyes have done: +Mine eyes have drawn thy shape, and thine for me +Are windows to my breast, where-through the sun +Delights to peep, to gaze therein on thee; +Yet eyes this cunning want to grace their art; +They draw but what they see, know not the heart. +Let those who are in favour with their stars +Of public honour and proud titles boast, +Whilst I, whom fortune of such triumph bars, +Unlook'd for joy in that I honour most. +Great princes' favourites their fair leaves spread +But as the marigold at the sun's eye, +And in themselves their pride lies buried, +For at a frown they in their glory die. +The painful warrior famoused for fight, +After a thousand victories once foil'd, +Is from the book of honour razed quite, +And all the rest forgot for which he toil'd: +Then happy I, that love and am beloved +Where I may not remove nor be removed. 
+Lord of my love, to whom in vassalage +Thy merit hath my duty strongly knit, +To thee I send this written embassage, +To witness duty, not to show my wit: +Duty so great, which wit so poor as mine +May make seem bare, in wanting words to show it, +But that I hope some good conceit of thine +In thy soul's thought, all naked, will bestow it; +Till whatsoever star that guides my moving +Points on me graciously with fair aspect +And puts apparel on my tatter'd loving, +To show me worthy of thy sweet respect: +Then may I dare to boast how I do love thee; +Till then not show my head where thou mayst prove me. +Weary with toil, I haste me to my bed, +The dear repose for limbs with travel tired; +But then begins a journey in my head, +To work my mind, when body's work's expired: +For then my thoughts, from far where I abide, +Intend a zealous pilgrimage to thee, +And keep my drooping eyelids open wide, +Looking on darkness which the blind do see +Save that my soul's imaginary sight +Presents thy shadow to my sightless view, +Which, like a jewel hung in ghastly night, +Makes black night beauteous and her old face new. +Lo! thus, by day my limbs, by night my mind, +For thee and for myself no quiet find. +How can I then return in happy plight, +That am debarr'd the benefit of rest? +When day's oppression is not eased by night, +But day by night, and night by day, oppress'd? +And each, though enemies to either's reign, +Do in consent shake hands to torture me; +The one by toil, the other to complain +How far I toil, still farther off from thee. +I tell the day, to please them thou art bright +And dost him grace when clouds do blot the heaven: +So flatter I the swart-complexion'd night, +When sparkling stars twire not thou gild'st the even. +But day doth daily draw my sorrows longer +And night doth nightly make grief's strength seem stronger. 
+When, in disgrace with fortune and men's eyes, +I all alone beweep my outcast state +And trouble deal heaven with my bootless cries +And look upon myself and curse my fate, +Wishing me like to one more rich in hope, +Featured like him, like him with friends possess'd, +Desiring this man's art and that man's scope, +With what I most enjoy contented least; +Yet in these thoughts myself almost despising, +Haply I think on thee, and then my state, +Like to the lark at break of day arising +From sullen earth, sings hymns at heaven's gate; +For thy sweet love remember'd such wealth brings +That then I scorn to change my state with kings. +When to the sessions of sweet silent thought +I summon up remembrance of things past, +I sigh the lack of many a thing I sought, +And with old woes new wail my dear time's waste: +Then can I drown an eye, unused to flow, +For precious friends hid in death's dateless night, +And weep afresh love's long since cancell'd woe, +And moan the expense of many a vanish'd sight: +Then can I grieve at grievances foregone, +And heavily from woe to woe tell o'er +The sad account of fore-bemoaned moan, +Which I new pay as if not paid before. +But if the while I think on thee, dear friend, +All losses are restored and sorrows end. +Thy bosom is endeared with all hearts, +Which I by lacking have supposed dead, +And there reigns love and all love's loving parts, +And all those friends which I thought buried. +How many a holy and obsequious tear +Hath dear religious love stol'n from mine eye +As interest of the dead, which now appear +But things removed that hidden in thee lie! +Thou art the grave where buried love doth live, +Hung with the trophies of my lovers gone, +Who all their parts of me to thee did give; +That due of many now is thine alone: +Their images I loved I view in thee, +And thou, all they, hast all the all of me. 
+If thou survive my well-contented day, +When that churl Death my bones with dust shall cover, +And shalt by fortune once more re-survey +These poor rude lines of thy deceased lover, +Compare them with the bettering of the time, +And though they be outstripp'd by every pen, +Reserve them for my love, not for their rhyme, +Exceeded by the height of happier men. +O, then vouchsafe me but this loving thought: +'Had my friend's Muse grown with this growing age, +A dearer birth than this his love had brought, +To march in ranks of better equipage: +But since he died and poets better prove, +Theirs for their style I'll read, his for his love.' +Full many a glorious morning have I seen +Flatter the mountain-tops with sovereign eye, +Kissing with golden face the meadows green, +Gilding pale streams with heavenly alchemy; +Anon permit the basest clouds to ride +With ugly rack on his celestial face, +And from the forlorn world his visage hide, +Stealing unseen to west with this disgrace: +Even so my sun one early morn did shine +With all triumphant splendor on my brow; +But out, alack! he was but one hour mine; +The region cloud hath mask'd him from me now. +Yet him for this my love no whit disdaineth; +Suns of the world may stain when heaven's sun staineth. +Why didst thou promise such a beauteous day, +And make me travel forth without my cloak, +To let base clouds o'ertake me in my way, +Hiding thy bravery in their rotten smoke? +'Tis not enough that through the cloud thou break, +To dry the rain on my storm-beaten face, +For no man well of such a salve can speak +That heals the wound and cures not the disgrace: +Nor can thy shame give physic to my grief; +Though thou repent, yet I have still the loss: +The offender's sorrow lends but weak relief +To him that bears the strong offence's cross. +Ah! but those tears are pearl which thy love sheds, +And they are rich and ransom all ill deeds. 
+No more be grieved at that which thou hast done: +Roses have thorns, and silver fountains mud; +Clouds and eclipses stain both moon and sun, +And loathsome canker lives in sweetest bud. +All men make faults, and even I in this, +Authorizing thy trespass with compare, +Myself corrupting, salving thy amiss, +Excusing thy sins more than thy sins are; +For to thy sensual fault I bring in sense-- +Thy adverse party is thy advocate-- +And 'gainst myself a lawful plea commence: +Such civil war is in my love and hate +That I an accessary needs must be +To that sweet thief which sourly robs from me. +Let me confess that we two must be twain, +Although our undivided loves are one: +So shall those blots that do with me remain +Without thy help by me be borne alone. +In our two loves there is but one respect, +Though in our lives a separable spite, +Which though it alter not love's sole effect, +Yet doth it steal sweet hours from love's delight. +I may not evermore acknowledge thee, +Lest my bewailed guilt should do thee shame, +Nor thou with public kindness honour me, +Unless thou take that honour from thy name: +But do not so; I love thee in such sort +As, thou being mine, mine is thy good report. +As a decrepit father takes delight +To see his active child do deeds of youth, +So I, made lame by fortune's dearest spite, +Take all my comfort of thy worth and truth. +For whether beauty, birth, or wealth, or wit, +Or any of these all, or all, or more, +Entitled in thy parts do crowned sit, +I make my love engrafted to this store: +So then I am not lame, poor, nor despised, +Whilst that this shadow doth such substance give +That I in thy abundance am sufficed +And by a part of all thy glory live. +Look, what is best, that best I wish in thee: +This wish I have; then ten times happy me! 
\ No newline at end of file diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index 760a9662d515..53d97fe29529 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -139,7 +139,7 @@ def quick_recv(self): device = self.device else: device = 'cpu' - buffer = torch.zeros(shape, dtype=dtype).to(device, non_blocking=True) + buffer = torch.zeros(shape, dtype=dtype).to(device) torch.distributed.recv( buffer, @@ -184,8 +184,6 @@ def send_tensor(self, else: tensor_size = tensor.element_size() * tensor.numel() - assert 0 < len(tensor.shape) < 100, "Send tensor does not support tensor with 0 dim or >=100 dim. Got %d" % len(tensor.shape) - self.block_if_full() with self.buffer_size_lock: diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 6fd94312483e..1b60b444709e 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -295,6 +295,7 @@ def execute_model( model_input, self.kv_cache[worker_input.virtual_engine], ) + assert bypass_model_exec if not bypass_model_exec: hidden_or_intermediate_states = self.model_runner.execute_model( From bb865884e086a7f3617e36e8b4428565e80a2ecb Mon Sep 17 00:00:00 2001 From: ApostaC Date: Tue, 10 Sep 2024 18:32:53 -0500 Subject: [PATCH 172/278] [Add] optimized implementation for KV transfer pipe --- .../kv_pipe/torch_distributed_pipe.py | 247 ++++++++++++------ 1 file changed, 166 insertions(+), 81 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index 53d97fe29529..994afc82ffa4 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -1,20 +1,12 @@ - -from vllm.distributed.group_coordinator import GroupCoordinator -from 
vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase -from torch.distributed import Backend, ProcessGroup +from torch.distributed import Backend import torch -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import List, Optional, Union import threading from concurrent.futures import ThreadPoolExecutor import time -import threading -from collections import namedtuple -from typing import Dict, Any, Tuple, List -import pickle from vllm.logger import init_logger - logger = init_logger(__name__) @@ -52,34 +44,32 @@ def __init__(self, message): self.message = message super().__init__(self.message) -class TorchDistributedPipe(KVPipeBase): - + +class TorchDistributedPipe: + METADATA_LENGTH = 16 + MAX_TENSOR_DIMENSIONS = 14 + METADATA_DTYPE = torch.int64 + def __init__( self, group_ranks: List[List[int]], local_rank: int, - torch_distributed_backend: Union[str, Backend] + torch_distributed_backend: Union[str, Backend], ): - self.rank = torch.distributed.get_rank() self.local_rank = local_rank self.device_group = None - self.cpu_group = None for ranks in group_ranks: device_group = torch.distributed.new_group( - ranks, backend=torch_distributed_backend) - # a group with `gloo` backend, to allow direct coordination between - # processes through the CPU. - cpu_group = torch.distributed.new_group(ranks, backend="gloo") + ranks, backend=torch_distributed_backend + ) if self.rank in ranks: self.ranks = ranks self.world_size = len(ranks) self.rank_in_group = ranks.index(self.rank) self.device_group = device_group - self.cpu_group = cpu_group - assert self.cpu_group is not None assert self.device_group is not None assert self.rank_in_group <= 1 @@ -88,95 +78,167 @@ def __init__( else: self.device = torch.device("cpu") - # if turned on, will use CPU-based communication to perform a series of sanity check. - # but it adds ~5ms delay, so please turn it off in performance-demanding usecases (e.g. 
disaggregated prefill) - self.target_rank_for_send = self.ranks[(self.rank_in_group + 1) % - self.world_size] - self.target_rank_for_recv = self.ranks[(self.rank_in_group - 1) % - self.world_size] + self.target_rank_for_send = self.ranks[ + (self.rank_in_group + 1) % self.world_size + ] + self.target_rank_for_recv = self.ranks[ + (self.rank_in_group - 1) % self.world_size + ] + + # FIXME: why we need this? torch.set_default_device(self.device) - self.kv_sending_thread = None + self.transport_thread = None self.buffer_size = 0 self.buffer_size_lock = threading.Lock() - self.none_tensor = torch.tensor([NONE_INT]).to(self.device) - self.broken = False + self.none_tensor = torch.tensor([NONE_INT], device=self.device) + + # On-device tensors to be reused for recv + self.rcv_metadata_buffer = torch.zeros( + self.METADATA_LENGTH, dtype=self.METADATA_DTYPE, device=self.device + ) + + def _make_metadata(self, tensor: torch.Tensor) -> torch.Tensor: + """ + Create the metadata on based on the input tensor, and move it to GPU. + The metadata's length is `TorchDistributedPipe.METADATA_LENGTH`. - - def quick_send(self, tensor): + Currently, the metadata is a int64 tensor and it includes dtype, number + of dimensions, and the shape information of the input tensor. 
- group = self.device_group - # NCCL is NOT fully duplex - # so CPU communication is ALWAYS necessary - torch.distributed.send_object_list( - [tensor.dtype, tensor.shape, str(tensor.device)], - dst=self.target_rank_for_send, - group=self.cpu_group + The information follows the layout below: + - metadata[0] -- dtype + - metadata[1] -- number of dimensions + - metadata[2 : 2+ndims] -- the shape of the input tensor + + Parameters: + - tensor: the input tensor + + Returns: + - metadata: the metadata tensor, on self.device + """ + buffer = torch.empty(self.METADATA_LENGTH, dtype=self.METADATA_DTYPE) + buffer[0] = DTYPE2INT[tensor.dtype] + ndims = len(tensor.shape) + buffer[1] = len(tensor.shape) + buffer[2 : 2 + ndims] = torch.tensor( + tensor.shape, dtype=self.METADATA_DTYPE ) + return buffer.to(self.device) + + def _prepare_recv_buffer( + self, d_metadata_buffer: torch.Tensor + ) -> torch.Tensor: + """ + Create a buffer to receive the tensor based on the metadata. + + Parameters: + - d_metadata_buffer: the metadata tensor on self.device + + Returns: + - buffer: the buffer tensor to receive the tensor, on self.device + """ + h_buffer = d_metadata_buffer.cpu().numpy() + dtype = INT2DTYPE[h_buffer[0]] + ndims = h_buffer[1] + shape = tuple(h_buffer[2 : 2 + ndims]) + return torch.empty(shape, dtype=dtype, device=self.device) + def _send_metadata(self, d_metadata_buffer: torch.Tensor): + """ + Send the metadata buffer to the target rank. + """ torch.distributed.send( - tensor, + d_metadata_buffer, dst=self.target_rank_for_send, - group=self.device_group + group=self.device_group, ) + def _recv_metadata(self) -> torch.Tensor: + """ + Receive the metadata buffer from the target rank. 
- def quick_recv(self): + Returns: + - metadata_buffer: the metadata buffer tensor, on self.device - # NCCL is NOT fully duplex - # so CPU communication is necessary - metadata = [None, None, None] - torch.distributed.recv_object_list( - metadata, + Note: + The current implementation uses the assumption that there is no + race conditions during sending/receiving. Therefore, the metadata + buffer can be reused + """ + torch.distributed.recv( + self.rcv_metadata_buffer, src=self.target_rank_for_recv, - group=self.cpu_group + group=self.device_group, ) - - dtype, shape, device = metadata - if 'cuda' in device: - device = self.device - else: - device = 'cpu' - buffer = torch.zeros(shape, dtype=dtype).to(device) - + return self.rcv_metadata_buffer + + def _send_impl(self, tensor): + """ + The actual implementation of sending the tensor to the target rank. + This function will first send the metadata, and then send the tensor. + + Parameters: + - tensor: the input tensor to be sent + """ + + metadata = self._make_metadata(tensor) + self._send_metadata(metadata) + + torch.distributed.send( + tensor, dst=self.target_rank_for_send, group=self.device_group + ) + + def _recv_impl(self) -> torch.Tensor: + """ + The actual implementation of receiving the tensor from the target rank. + This function will first receive the metadata, then receive the tensor. + + This function will block if there is no tensor to receive. 
+ + Returns: + - buffer: the received tensor, on self.device + """ + d_metadata = self._recv_metadata() + buffer = self._prepare_recv_buffer(d_metadata) + torch.distributed.recv( - buffer, - src=self.target_rank_for_recv, - group=self.device_group + buffer, src=self.target_rank_for_recv, group=self.device_group ) - return buffer - - - def send_tensor_wrapper(self, tensor) -> None: + return buffer + def send_tensor_wrapper(self, tensor): try: + """Wrapper for send_tensor_dict""" tensor_size = tensor.element_size() * tensor.numel() - self.quick_send(tensor) - + self._send_impl(tensor) + with self.buffer_size_lock: self.buffer_size = self.buffer_size - tensor_size except Exception as e: logger.error("Encountering exception in KV sending thread") logger.error("%s", e) - + def block_if_full(self): - + """ + Block the current thread if the buffer size is larger than 1e9. + """ + # TODO: replace this 1e9 with a configurable parameter or a constant while self.buffer_size > 1e9: logger.debug("KV cache transfer pipe is full. Waiting...") time.sleep(0.05) - def send_tensor(self, - tensor: Optional[torch.Tensor]) -> None: + def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: """ Sends a tensor to the destination rank in a non-blocking way. 
Flow: send tensor dim -- send tensor shape -- send tensor data """ - - if self.kv_sending_thread is None: - self.kv_sending_thread = ThreadPoolExecutor(max_workers=1) + + if self.transport_thread is None: + self.transport_thread = ThreadPoolExecutor(max_workers=1) if tensor is None: tensor = self.none_tensor @@ -184,24 +246,47 @@ def send_tensor(self, else: tensor_size = tensor.element_size() * tensor.numel() + assert ( + 0 < len(tensor.shape) < self.MAX_TENSOR_DIMENSIONS + ), f"Only support dimensions within 1-{self.MAX_TENSOR_DIMENSIONS}" + self.block_if_full() with self.buffer_size_lock: + # print("Remaining size:", self.buffer_size) self.buffer_size = self.buffer_size + tensor_size - + # prepare the metadata before sending the tensor. - self.kv_sending_thread.submit( - self.send_tensor_wrapper, - tensor) - + self.transport_thread.submit( + self.send_tensor_wrapper, + tensor, + ) + def recv_tensor(self) -> Optional[torch.Tensor]: """Receives a tensor from the src rank. Blocking.""" - - tensor = self.quick_recv() + + if self.transport_thread is None: + self.transport_thread = ThreadPoolExecutor(max_workers=1) + + future = self.transport_thread.submit(self._recv_impl) + + try: + tensor = future.result() + except Exception as e: + logger.error("Encountering exception in KV receiving thread") + logger.error("%s", e) + if tensor.numel() == 1 and tensor.item() == NONE_INT: return None else: return tensor - - + def close(self): + """ + Close the pipe and release the resources. 
+ """ + if ( + hasattr(self, "transport_thread") + and self.transport_thread is not None + ): + self.transport_thread.shutdown() From ffb792bd7bfde2b0d85d0b963b146e9827a1400f Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 13 Sep 2024 01:37:27 +0000 Subject: [PATCH 173/278] [Fix] the implementation of KV lookup buffer --- tests/kv_transfer/test_lookup_buffer.py | 46 +-- tests/kv_transfer/test_send_recv.py | 22 +- .../kv_transfer/kv_lookup_buffer/base.py | 8 +- .../simple_kv_lookup_buffer.py | 59 +++- vllm/distributed/kv_transfer/kv_pipe/base.py | 6 +- .../kv_pipe/torch_distributed_pipe.py | 22 +- .../kv_pipe/torch_distributed_pipe.py.bkup | 300 ++++++++++++++++++ 7 files changed, 415 insertions(+), 48 deletions(-) create mode 100644 vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py.bkup diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index aa98a7804ecd..5041bf026483 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -7,8 +7,9 @@ from tqdm import tqdm import time +# TODO: the test depends on a lot of fields in the current implementation. 
We should have standard interface instead direct field access -def test_run(my_rank, buffer): +def test_run(my_rank, buffer, device): # buffer should be empty in the beginning if my_rank == 0: @@ -17,15 +18,19 @@ def test_run(my_rank, buffer): # insert - tokens = torch.tensor([1,2,3]).to(buffer.pipe.device) + tokens = torch.tensor([1,2,3]).to(device) roi = (tokens > 0) if my_rank == 0: - key = 2.0 * torch.ones([5, 6]).to(buffer.pipe.device) - value = 3.0 * torch.ones([5, 6]).to(buffer.pipe.device) + key = 2.0 * torch.ones([5, 6]).to(device) + value = 3.0 * torch.ones([5, 6]).to(device) - placeholder = torch.tensor([1]).to(buffer.pipe.device) + placeholder = torch.tensor([1]).to(device) buffer.insert(tokens, roi, key, value, placeholder) + + #for i in range(2000): + # print("Here:", i) + # time.sleep(0.01) torch.distributed.barrier() # drop_select @@ -33,22 +38,21 @@ def test_run(my_rank, buffer): tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi) assert torch.allclose(tokens, tok) assert torch.allclose(roi, roi_) - assert torch.allclose(key, 2.0 * torch.ones([5, 6])) - assert torch.allclose(value, 3.0 * torch.ones([5, 6])) + assert torch.allclose(key, 2.0 * torch.ones([5, 6], device = device)) + assert torch.allclose(value, 3.0 * torch.ones([5, 6], device = device)) torch.distributed.barrier() if my_rank == 0: assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 + + print("Test run passed!") - -def stress_test(my_rank, buf): +def stress_test(my_rank, buf, device): torch.distributed.barrier() torch.manual_seed(100) - device = buf.pipe.device - reqs = [ ( torch.rand(100).to(device), # tokens @@ -56,7 +60,7 @@ def stress_test(my_rank, buf): torch.rand(100).to(device), # key torch.rand(100).to(device), # value torch.rand(100).to(device), # hidden - ) for i in range(200)] + ) for i in tqdm(range(200))] random.seed(my_rank) random.shuffle(reqs) @@ -86,7 +90,7 @@ def stress_test(my_rank, buf): assert torch.allclose(k, k_) assert torch.allclose(v, v_) 
assert torch.allclose(h, h_) - print('Rand %d done' % my_rank) + print('Rank %d done' % my_rank) torch.distributed.barrier() @@ -101,13 +105,9 @@ def stress_test(my_rank, buf): else: torch.distributed.send(torch.tensor([n]), 0) + print("Passed stress test!") - - - - - if __name__ == "__main__": @@ -123,10 +123,14 @@ def stress_test(my_rank, buf): pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "nccl") - buffer = sklb.SimpleKVLookupBuffer(pipe, 170000) + cpu_pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "gloo") + buffer = sklb.SimpleKVLookupBuffer(cpu_pipe, pipe, 170000) - test_run(my_rank, buffer) + test_run(my_rank, buffer, pipe.device) - stress_test(my_rank, buffer) + stress_test(my_rank, buffer, pipe.device) + buffer.close() + pipe.close() + cpu_pipe.close() print('Done') diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 95a7528f0f7a..4bf757d7c849 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -12,12 +12,28 @@ def test_run(my_rank, pipe): x = torch.tensor([1]).to(pipe.device) y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device) if my_rank == 0: - pipe.send_tensor(x) + print("sent tensor x") pipe.send_tensor(y) + print("sent tensor y") + x2 = pipe.recv_tensor() + print("received x2 = ", x2) + y2 = pipe.recv_tensor() + print("received y2 = ", x2) + else: - assert torch.allclose(x, pipe.recv_tensor()) - assert torch.allclose(y, pipe.recv_tensor()) + x2 = pipe.recv_tensor() + print("received x2 = ", x2) + y2 = pipe.recv_tensor() + print("received y2 = ", x2) + pipe.send_tensor(x) + print("sent tensor x") + pipe.send_tensor(y) + print("sent tensor y") + + assert torch.allclose(x, x2) + assert torch.allclose(y, y2) + def stress_test(my_rank, pipe): diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index 5ac8fbb24444..733bc82bf53f 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ 
b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -15,4 +15,10 @@ def insert(self, @abstractmethod def drop_select(self, input_tokens, roi) -> Optional[torch.Tensor]: raise NotImplementedError - \ No newline at end of file + + @abstractmethod + def close(self): + """ + Close the buffer, release resources. + """ + raise NotImplementedError diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py index 407ac7c9bcfc..df52dd65692e 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py @@ -1,6 +1,7 @@ from vllm.distributed.kv_transfer.kv_lookup_buffer.base import \ KVLookupBufferBase +from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase from typing import Dict, Tuple, List, Optional import threading import torch @@ -13,16 +14,24 @@ class SimpleKVLookupBuffer(KVLookupBufferBase): - def __init__(self, pipe, buffer_size_thresh): + def __init__(self, signal_pipe, data_pipe, buffer_size_thresh): + """ + signal_pipe: on CPU -- avoid recv() stops the python intepreter + data_pipe: on GPU + """ self.buffer = deque() self.buffer_size = 0 self.buffer_size_threshold = buffer_size_thresh self.buffer_lock = threading.Lock() - self.pipe = pipe + self.signal_pipe = signal_pipe + self.data_pipe = data_pipe self.request_handling_thread = None + self.normal_signal = torch.tensor([0]) + self.end_signal = None + def _matches(self, tokens_roi_sender, tokens_roi_recver): @@ -57,9 +66,9 @@ def _matches(self, tokens_roi_sender, tokens_roi_recver): def _send_tensor_and_dec_size(self, tensor: Optional[torch.Tensor]) -> None: - assert tensor is not None, "Use self.pipe.send(None) instead" + assert tensor is not None, "Use self.data_pipe.send(None) instead" self.buffer_size -= tensor.element_size() * tensor.numel() - self.pipe.send_tensor(tensor) + 
self.data_pipe.send_tensor(tensor) def _get_element_size(self, data): @@ -91,14 +100,22 @@ def _add_to_buffer(self, input_tokens, roi, key, value, hidden): self.buffer_size += self._get_element_size(data) self.buffer.append(buffer_item) + def _is_end_signal(self, signal): + return signal is None def drop_select_handler(self): try: while True: - input_tokens = self.pipe.recv_tensor() - roi = self.pipe.recv_tensor() + signal = self.signal_pipe.recv_tensor() + if self._is_end_signal(signal): + logger.info("Received end signal!") + break + + input_tokens = self.data_pipe.recv_tensor() + + roi = self.data_pipe.recv_tensor() tokens_roi_recver = [input_tokens, roi] matched_length = 0 @@ -125,10 +142,13 @@ def drop_select_handler(self): else: # no match, just send None for _ in range(5): - self.pipe.send_tensor(None) + self.data_pipe.send_tensor(None) + except RuntimeError as e: if 'Connection closed by peer' not in str(e): raise e + + logger.debug("closing drop_select_handler") def drop_select(self, input_tokens, roi): @@ -142,14 +162,15 @@ def drop_select(self, input_tokens, roi): if isinstance(roi, torch.Tensor): roi = roi.clone() - self.pipe.send_tensor(input_tokens) - self.pipe.send_tensor(roi) + self.signal_pipe.send_tensor(self.normal_signal) + self.data_pipe.send_tensor(input_tokens) + self.data_pipe.send_tensor(roi) - input_tokens = self.pipe.recv_tensor() - roi = self.pipe.recv_tensor() - key = self.pipe.recv_tensor() - value = self.pipe.recv_tensor() - hidden = self.pipe.recv_tensor() + input_tokens = self.data_pipe.recv_tensor() + roi = self.data_pipe.recv_tensor() + key = self.data_pipe.recv_tensor() + value = self.data_pipe.recv_tensor() + hidden = self.data_pipe.recv_tensor() return [input_tokens, roi, key, value, hidden] @@ -173,4 +194,12 @@ def insert(self, input_tokens, roi, key, value, hidden) -> None: target=self.drop_select_handler) self.request_handling_thread.start() - \ No newline at end of file + + def close(self): + + if hasattr(self, 
"request_handling_thread") and self.request_handling_thread is not None: + self.request_handling_thread.join() + + else: + # TODO: have a explicit close signal and have a explicit way to check if it's requester + self.signal_pipe.send_tensor(self.end_signal) diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py index 625656adc266..7662a5893ceb 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/base.py +++ b/vllm/distributed/kv_transfer/kv_pipe/base.py @@ -10,4 +10,8 @@ def send_tensor(self, tensor): @abstractmethod def recv_tensor(self): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError + + @abstractmethod + def close(self): + raise NotImplementedError diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index 994afc82ffa4..caa9e6aabd93 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -50,6 +50,7 @@ class TorchDistributedPipe: MAX_TENSOR_DIMENSIONS = 14 METADATA_DTYPE = torch.int64 + def __init__( self, group_ranks: List[List[int]], @@ -73,10 +74,7 @@ def __init__( assert self.device_group is not None assert self.rank_in_group <= 1 - if torch.cuda.is_available(): - self.device = torch.device(f"cuda:{local_rank}") - else: - self.device = torch.device("cpu") + self.device = self._select_device(torch_distributed_backend) self.target_rank_for_send = self.ranks[ (self.rank_in_group + 1) % self.world_size @@ -99,6 +97,12 @@ def __init__( self.METADATA_LENGTH, dtype=self.METADATA_DTYPE, device=self.device ) + def _select_device(self, backend: Union[str, Backend]): + if torch.cuda.is_available() and backend == Backend.NCCL: + return torch.device(f"cuda:{self.local_rank}") + else: + return "cpu" + def _make_metadata(self, tensor: torch.Tensor) -> torch.Tensor: """ Create the metadata on based on the input 
tensor, and move it to GPU. @@ -168,11 +172,12 @@ def _recv_metadata(self) -> torch.Tensor: race conditions during sending/receiving. Therefore, the metadata buffer can be reused """ - torch.distributed.recv( + task = torch.distributed.recv( self.rcv_metadata_buffer, src=self.target_rank_for_recv, group=self.device_group, ) + return self.rcv_metadata_buffer def _send_impl(self, tensor): @@ -256,15 +261,16 @@ def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: # print("Remaining size:", self.buffer_size) self.buffer_size = self.buffer_size + tensor_size - # prepare the metadata before sending the tensor. + + #self.send_tensor_wrapper(tensor) self.transport_thread.submit( self.send_tensor_wrapper, tensor, ) + def recv_tensor(self) -> Optional[torch.Tensor]: """Receives a tensor from the src rank. Blocking.""" - if self.transport_thread is None: self.transport_thread = ThreadPoolExecutor(max_workers=1) @@ -276,6 +282,8 @@ def recv_tensor(self) -> Optional[torch.Tensor]: logger.error("Encountering exception in KV receiving thread") logger.error("%s", e) + #tensor = self._recv_impl() + if tensor.numel() == 1 and tensor.item() == NONE_INT: return None else: diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py.bkup b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py.bkup new file mode 100644 index 000000000000..489052285475 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py.bkup @@ -0,0 +1,300 @@ +from torch.distributed import Backend +import torch +from typing import List, Optional, Union +import threading +from concurrent.futures import ThreadPoolExecutor +import time + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +# if the tensor is only one-element and only contains this number +# this means that the sended object is None. 
+NONE_INT = -150886311 +FLOAT16_INT = -543205003776624 +INT64_INT = -375623078607432 +BOOL_INT = -28035262008646 +BFLOAT16_INT = -452084912267662 +FLOAT32_INT = -1049557997456592 +FLOAT64_INT = -452201007054137 + +DTYPE2INT = { + torch.float16: FLOAT16_INT, + torch.int64: INT64_INT, + torch.bool: BOOL_INT, + torch.bfloat16: BFLOAT16_INT, + torch.float32: FLOAT32_INT, + torch.float64: FLOAT64_INT, +} + +INT2DTYPE = { + FLOAT16_INT: torch.float16, + INT64_INT: torch.int64, + BOOL_INT: torch.bool, + BFLOAT16_INT: torch.bfloat16, + FLOAT32_INT: torch.float32, + FLOAT64_INT: torch.float64, +} + + +class BrokenPipeException(Exception): + def __init__(self, message): + self.message = message + super().__init__(self.message) + + +class TorchDistributedPipe: + METADATA_LENGTH = 16 + MAX_TENSOR_DIMENSIONS = 14 + METADATA_DTYPE = torch.int64 + + def __init__( + self, + group_ranks: List[List[int]], + local_rank: int, + torch_distributed_backend: Union[str, Backend], + ): + self.rank = torch.distributed.get_rank() + self.local_rank = local_rank + self.device_group = None + + for ranks in group_ranks: + device_group = torch.distributed.new_group( + ranks, backend=torch_distributed_backend + ) + if self.rank in ranks: + self.ranks = ranks + self.world_size = len(ranks) + self.rank_in_group = ranks.index(self.rank) + self.device_group = device_group + + assert self.device_group is not None + assert self.rank_in_group <= 1 + + if torch.cuda.is_available(): + self.device = torch.device(f"cuda:{local_rank}") + else: + self.device = torch.device("cpu") + + self.target_rank_for_send = self.ranks[ + (self.rank_in_group + 1) % self.world_size + ] + self.target_rank_for_recv = self.ranks[ + (self.rank_in_group - 1) % self.world_size + ] + + # FIXME: why we need this? 
+ torch.set_default_device(self.device) + + self.transport_thread = None + self.buffer_size = 0 + self.buffer_size_lock = threading.Lock() + + self.none_tensor = torch.tensor([NONE_INT], device=self.device) + + # On-device tensors to be reused for recv + self.rcv_metadata_buffer = torch.zeros( + self.METADATA_LENGTH, dtype=self.METADATA_DTYPE, device=self.device + ) + + self.pending_recv = None + + def _make_metadata(self, tensor: torch.Tensor) -> torch.Tensor: + """ + Create the metadata on based on the input tensor, and move it to GPU. + The metadata's length is `TorchDistributedPipe.METADATA_LENGTH`. + + Currently, the metadata is a int64 tensor and it includes dtype, number + of dimensions, and the shape information of the input tensor. + + + The information follows the layout below: + - metadata[0] -- dtype + - metadata[1] -- number of dimensions + - metadata[2 : 2+ndims] -- the shape of the input tensor + + Parameters: + - tensor: the input tensor + + Returns: + - metadata: the metadata tensor, on self.device + """ + buffer = torch.empty(self.METADATA_LENGTH, dtype=self.METADATA_DTYPE) + buffer[0] = DTYPE2INT[tensor.dtype] + ndims = len(tensor.shape) + buffer[1] = len(tensor.shape) + buffer[2 : 2 + ndims] = torch.tensor( + tensor.shape, dtype=self.METADATA_DTYPE + ) + return buffer.to(self.device) + + def _prepare_recv_buffer( + self, d_metadata_buffer: torch.Tensor + ) -> torch.Tensor: + """ + Create a buffer to receive the tensor based on the metadata. + + Parameters: + - d_metadata_buffer: the metadata tensor on self.device + + Returns: + - buffer: the buffer tensor to receive the tensor, on self.device + """ + h_buffer = d_metadata_buffer.cpu().numpy() + dtype = INT2DTYPE[h_buffer[0]] + ndims = h_buffer[1] + shape = tuple(h_buffer[2 : 2 + ndims]) + return torch.empty(shape, dtype=dtype, device=self.device) + + def _send_metadata(self, d_metadata_buffer: torch.Tensor): + """ + Send the metadata buffer to the target rank. 
+ """ + torch.distributed.send( + d_metadata_buffer, + dst=self.target_rank_for_send, + group=self.device_group, + ) + + def _recv_metadata(self) -> torch.Tensor: + """ + Receive the metadata buffer from the target rank. + + Returns: + - metadata_buffer: the metadata buffer tensor, on self.device + + Note: + The current implementation uses the assumption that there is no + race conditions during sending/receiving. Therefore, the metadata + buffer can be reused + """ + torch.distributed.recv( + self.rcv_metadata_buffer, + src=self.target_rank_for_recv, + group=self.device_group, + ) + return self.rcv_metadata_buffer + + def _send_impl(self, tensor): + """ + The actual implementation of sending the tensor to the target rank. + This function will first send the metadata, and then send the tensor. + + Parameters: + - tensor: the input tensor to be sent + """ + + metadata = self._make_metadata(tensor) + self._send_metadata(metadata) + + torch.distributed.send( + tensor, dst=self.target_rank_for_send, group=self.device_group + ) + + def _recv_impl(self) -> torch.Tensor: + """ + The actual implementation of receiving the tensor from the target rank. + This function will first receive the metadata, then receive the tensor. + + This function will block if there is no tensor to receive. 
+ + Returns: + - buffer: the received tensor, on self.device + """ + d_metadata = self._recv_metadata() + buffer = self._prepare_recv_buffer(d_metadata) + + torch.distributed.recv( + buffer, src=self.target_rank_for_recv, group=self.device_group + ) + + return buffer + + def send_tensor_wrapper(self, tensor): + try: + """Wrapper for send_tensor_dict""" + tensor_size = tensor.element_size() * tensor.numel() + self._send_impl(tensor) + + with self.buffer_size_lock: + self.buffer_size = self.buffer_size - tensor_size + except Exception as e: + logger.error("Encountering exception in KV sending thread") + logger.error("%s", e) + + def block_if_full(self): + """ + Block the current thread if the buffer size is larger than 1e9. + """ + # TODO: replace this 1e9 with a configurable parameter or a constant + while self.buffer_size > 1e9: + logger.debug("KV cache transfer pipe is full. Waiting...") + time.sleep(0.05) + + def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: + """ + Sends a tensor to the destination rank in a non-blocking way. + Flow: send tensor dim -- send tensor shape -- send tensor data + """ + + if self.transport_thread is None: + self.transport_thread = ThreadPoolExecutor(max_workers=1) + + if tensor is None: + tensor = self.none_tensor + tensor_size = 0 + else: + tensor_size = tensor.element_size() * tensor.numel() + + assert ( + 0 < len(tensor.shape) < self.MAX_TENSOR_DIMENSIONS + ), f"Only support dimensions within 1-{self.MAX_TENSOR_DIMENSIONS}" + + self.block_if_full() + + with self.buffer_size_lock: + # print("Remaining size:", self.buffer_size) + self.buffer_size = self.buffer_size + tensor_size + + # prepare the metadata before sending the tensor. + self.transport_thread.submit( + self.send_tensor_wrapper, + tensor, + ) + + def recv_tensor(self, timeout: float = None) -> Optional[torch.Tensor]: + """Receives a tensor from the src rank. 
Blocking.""" + + if self.transport_thread is None: + self.transport_thread = ThreadPoolExecutor(max_workers=1) + + if self.pending_recv is None: + self.pending_recv = self.transport_thread.submit(self._recv_impl) + + try: + tensor = self.pending_recv.result(timeout=timeout) + self.pending_recv = None + + except TimeoutError as e: + raise e + + except Exception as e: + logger.error("Encountering exception in KV receiving thread") + logger.error("%s", e) + + if tensor.numel() == 1 and tensor.item() == NONE_INT: + return None + else: + return tensor + + def close(self): + """ + Close the pipe and release the resources. + """ + if ( + hasattr(self, "transport_thread") + and self.transport_thread is not None + ): + self.transport_thread.shutdown() From d7d32c1e378ee8cf519eb9f538fd3fcd717034f2 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 13 Sep 2024 01:41:08 +0000 Subject: [PATCH 174/278] remove unused file --- .../kv_pipe/torch_distributed_pipe.py.bkup | 300 ------------------ 1 file changed, 300 deletions(-) delete mode 100644 vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py.bkup diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py.bkup b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py.bkup deleted file mode 100644 index 489052285475..000000000000 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py.bkup +++ /dev/null @@ -1,300 +0,0 @@ -from torch.distributed import Backend -import torch -from typing import List, Optional, Union -import threading -from concurrent.futures import ThreadPoolExecutor -import time - -from vllm.logger import init_logger - -logger = init_logger(__name__) - - -# if the tensor is only one-element and only contains this number -# this means that the sended object is None. 
-NONE_INT = -150886311 -FLOAT16_INT = -543205003776624 -INT64_INT = -375623078607432 -BOOL_INT = -28035262008646 -BFLOAT16_INT = -452084912267662 -FLOAT32_INT = -1049557997456592 -FLOAT64_INT = -452201007054137 - -DTYPE2INT = { - torch.float16: FLOAT16_INT, - torch.int64: INT64_INT, - torch.bool: BOOL_INT, - torch.bfloat16: BFLOAT16_INT, - torch.float32: FLOAT32_INT, - torch.float64: FLOAT64_INT, -} - -INT2DTYPE = { - FLOAT16_INT: torch.float16, - INT64_INT: torch.int64, - BOOL_INT: torch.bool, - BFLOAT16_INT: torch.bfloat16, - FLOAT32_INT: torch.float32, - FLOAT64_INT: torch.float64, -} - - -class BrokenPipeException(Exception): - def __init__(self, message): - self.message = message - super().__init__(self.message) - - -class TorchDistributedPipe: - METADATA_LENGTH = 16 - MAX_TENSOR_DIMENSIONS = 14 - METADATA_DTYPE = torch.int64 - - def __init__( - self, - group_ranks: List[List[int]], - local_rank: int, - torch_distributed_backend: Union[str, Backend], - ): - self.rank = torch.distributed.get_rank() - self.local_rank = local_rank - self.device_group = None - - for ranks in group_ranks: - device_group = torch.distributed.new_group( - ranks, backend=torch_distributed_backend - ) - if self.rank in ranks: - self.ranks = ranks - self.world_size = len(ranks) - self.rank_in_group = ranks.index(self.rank) - self.device_group = device_group - - assert self.device_group is not None - assert self.rank_in_group <= 1 - - if torch.cuda.is_available(): - self.device = torch.device(f"cuda:{local_rank}") - else: - self.device = torch.device("cpu") - - self.target_rank_for_send = self.ranks[ - (self.rank_in_group + 1) % self.world_size - ] - self.target_rank_for_recv = self.ranks[ - (self.rank_in_group - 1) % self.world_size - ] - - # FIXME: why we need this? 
- torch.set_default_device(self.device) - - self.transport_thread = None - self.buffer_size = 0 - self.buffer_size_lock = threading.Lock() - - self.none_tensor = torch.tensor([NONE_INT], device=self.device) - - # On-device tensors to be reused for recv - self.rcv_metadata_buffer = torch.zeros( - self.METADATA_LENGTH, dtype=self.METADATA_DTYPE, device=self.device - ) - - self.pending_recv = None - - def _make_metadata(self, tensor: torch.Tensor) -> torch.Tensor: - """ - Create the metadata on based on the input tensor, and move it to GPU. - The metadata's length is `TorchDistributedPipe.METADATA_LENGTH`. - - Currently, the metadata is a int64 tensor and it includes dtype, number - of dimensions, and the shape information of the input tensor. - - - The information follows the layout below: - - metadata[0] -- dtype - - metadata[1] -- number of dimensions - - metadata[2 : 2+ndims] -- the shape of the input tensor - - Parameters: - - tensor: the input tensor - - Returns: - - metadata: the metadata tensor, on self.device - """ - buffer = torch.empty(self.METADATA_LENGTH, dtype=self.METADATA_DTYPE) - buffer[0] = DTYPE2INT[tensor.dtype] - ndims = len(tensor.shape) - buffer[1] = len(tensor.shape) - buffer[2 : 2 + ndims] = torch.tensor( - tensor.shape, dtype=self.METADATA_DTYPE - ) - return buffer.to(self.device) - - def _prepare_recv_buffer( - self, d_metadata_buffer: torch.Tensor - ) -> torch.Tensor: - """ - Create a buffer to receive the tensor based on the metadata. - - Parameters: - - d_metadata_buffer: the metadata tensor on self.device - - Returns: - - buffer: the buffer tensor to receive the tensor, on self.device - """ - h_buffer = d_metadata_buffer.cpu().numpy() - dtype = INT2DTYPE[h_buffer[0]] - ndims = h_buffer[1] - shape = tuple(h_buffer[2 : 2 + ndims]) - return torch.empty(shape, dtype=dtype, device=self.device) - - def _send_metadata(self, d_metadata_buffer: torch.Tensor): - """ - Send the metadata buffer to the target rank. 
- """ - torch.distributed.send( - d_metadata_buffer, - dst=self.target_rank_for_send, - group=self.device_group, - ) - - def _recv_metadata(self) -> torch.Tensor: - """ - Receive the metadata buffer from the target rank. - - Returns: - - metadata_buffer: the metadata buffer tensor, on self.device - - Note: - The current implementation uses the assumption that there is no - race conditions during sending/receiving. Therefore, the metadata - buffer can be reused - """ - torch.distributed.recv( - self.rcv_metadata_buffer, - src=self.target_rank_for_recv, - group=self.device_group, - ) - return self.rcv_metadata_buffer - - def _send_impl(self, tensor): - """ - The actual implementation of sending the tensor to the target rank. - This function will first send the metadata, and then send the tensor. - - Parameters: - - tensor: the input tensor to be sent - """ - - metadata = self._make_metadata(tensor) - self._send_metadata(metadata) - - torch.distributed.send( - tensor, dst=self.target_rank_for_send, group=self.device_group - ) - - def _recv_impl(self) -> torch.Tensor: - """ - The actual implementation of receiving the tensor from the target rank. - This function will first receive the metadata, then receive the tensor. - - This function will block if there is no tensor to receive. 
- - Returns: - - buffer: the received tensor, on self.device - """ - d_metadata = self._recv_metadata() - buffer = self._prepare_recv_buffer(d_metadata) - - torch.distributed.recv( - buffer, src=self.target_rank_for_recv, group=self.device_group - ) - - return buffer - - def send_tensor_wrapper(self, tensor): - try: - """Wrapper for send_tensor_dict""" - tensor_size = tensor.element_size() * tensor.numel() - self._send_impl(tensor) - - with self.buffer_size_lock: - self.buffer_size = self.buffer_size - tensor_size - except Exception as e: - logger.error("Encountering exception in KV sending thread") - logger.error("%s", e) - - def block_if_full(self): - """ - Block the current thread if the buffer size is larger than 1e9. - """ - # TODO: replace this 1e9 with a configurable parameter or a constant - while self.buffer_size > 1e9: - logger.debug("KV cache transfer pipe is full. Waiting...") - time.sleep(0.05) - - def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: - """ - Sends a tensor to the destination rank in a non-blocking way. - Flow: send tensor dim -- send tensor shape -- send tensor data - """ - - if self.transport_thread is None: - self.transport_thread = ThreadPoolExecutor(max_workers=1) - - if tensor is None: - tensor = self.none_tensor - tensor_size = 0 - else: - tensor_size = tensor.element_size() * tensor.numel() - - assert ( - 0 < len(tensor.shape) < self.MAX_TENSOR_DIMENSIONS - ), f"Only support dimensions within 1-{self.MAX_TENSOR_DIMENSIONS}" - - self.block_if_full() - - with self.buffer_size_lock: - # print("Remaining size:", self.buffer_size) - self.buffer_size = self.buffer_size + tensor_size - - # prepare the metadata before sending the tensor. - self.transport_thread.submit( - self.send_tensor_wrapper, - tensor, - ) - - def recv_tensor(self, timeout: float = None) -> Optional[torch.Tensor]: - """Receives a tensor from the src rank. 
Blocking.""" - - if self.transport_thread is None: - self.transport_thread = ThreadPoolExecutor(max_workers=1) - - if self.pending_recv is None: - self.pending_recv = self.transport_thread.submit(self._recv_impl) - - try: - tensor = self.pending_recv.result(timeout=timeout) - self.pending_recv = None - - except TimeoutError as e: - raise e - - except Exception as e: - logger.error("Encountering exception in KV receiving thread") - logger.error("%s", e) - - if tensor.numel() == 1 and tensor.item() == NONE_INT: - return None - else: - return tensor - - def close(self): - """ - Close the pipe and release the resources. - """ - if ( - hasattr(self, "transport_thread") - and self.transport_thread is not None - ): - self.transport_thread.shutdown() From 417ccb35c6cd391942c29344ef0056333e716e0e Mon Sep 17 00:00:00 2001 From: YaoJiayi <120040070@link.cuhk.edu.cn> Date: Fri, 13 Sep 2024 04:25:08 -0500 Subject: [PATCH 175/278] update vllm adapter --- vllm/distributed/kv_transfer/vllm_adapter.py | 154 ++++++++++++++++++- 1 file changed, 146 insertions(+), 8 deletions(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index d13d132f5dfe..98d861fe9a12 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -30,6 +30,9 @@ from vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe import TorchDistributedPipe from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer import SimpleKVLookupBuffer +from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata +from copy import deepcopy + assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode", "lmcache"], \ "VLLM_DISAGG_PREFILL_ROLE can only be prefill, decode or lmcache." 
@@ -77,7 +80,7 @@ def __init__( def send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", + model_input: ModelInputForGPUWithSamplingMetadata, kv_caches: List[torch.Tensor], hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: @@ -128,9 +131,9 @@ def send_kv_caches_and_hidden_states( def recv_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", + model_input: ModelInputForGPUWithSamplingMetadata, kv_caches: List[torch.Tensor] - ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool]: + ) -> List[Union[torch.Tensor, IntermediateTensors], bool, ModelInputForGPUWithSamplingMetadata]: bypass_model_exec = True @@ -142,6 +145,10 @@ def recv_kv_caches_and_hidden_states( hidden_or_intermediate_states_for_one_req = [] + input_tokens_list = [] + num_computed_tokens_list = [] + start_pos_list = [] + # enumerate different requests # FIXME(Kuntai): This impl assumes that all requests are prefill. for idx, slen in enumerate(seq_lens): @@ -151,16 +158,27 @@ def recv_kv_caches_and_hidden_states( current_tokens = input_tokens_tensor[start_pos:end_pos] num_tokens = slen + input_tokens_list.append(current_tokens) + start_pos_list.append(start_pos) + ret = self.buffer.drop_select( current_tokens, torch.ones_like(current_tokens, dtype=bool)) if ret[0] is None: # didn't find any match. 
self.bypass_model_exec = False + num_computed_tokens_list.append(0) continue - _, _, keys, values, hidden = ret - + # TODO(Jiayi): change the logic here (need roi) + _, roi, keys, values, hidden = ret + + # Jiayi: currently assume roi is a prefix + num_computed_tokens = len(roi) + num_computed_tokens_list.append(num_computed_tokens) + is_complete = (num_computed_tokens == num_tokens) + end_pos = start_pos + num_computed_tokens + # receive KV cache from disaggregated prefill instance for i in range(model_executable.model.start_layer, model_executable.model.end_layer): @@ -184,14 +202,134 @@ def recv_kv_caches_and_hidden_states( hidden_or_intermediate_states_for_one_req.append(hidden) + # FIXME(Jiayi): we need to support only skip m out of n reqs in a batch + # same for prefix caching if not bypass_model_exec: # Some of the KV cache is not retrieved # so we need to recompute the hidden state - return [], bypass_model_exec - + logger.debug("[rank%d]: KV EMPTY recv DONE.", torch.distributed.get_rank()) + return None, bypass_model_exec, None + + if not is_complete: + rebuilt_model_input = self.adpat_model_input( + model_input, + input_tokens_list, + num_computed_tokens_list, + start_pos_list, + slot_mapping, + device=kv_cache[0].device, + ) + logger.debug("[rank%d]: KV PARTIAL recv DONE.", torch.distributed.get_rank()) + return None, bypass_model_exec, rebuilt_model_input + # concatenate hidden states from different requests hidden_or_intermediate_states = torch.cat( hidden_or_intermediate_states_for_one_req, dim=0) logger.debug("[rank%d]: KV recv DONE.", torch.distributed.get_rank()) - return hidden_or_intermediate_states, bypass_model_exec, model_input \ No newline at end of file + return hidden_or_intermediate_states, bypass_model_exec, model_input + + + def adpat_model_input( + self, + model_input: ModelInputForGPUWithSamplingMetadata, + input_tokens_list: List[torch.Tensor], + num_computed_tokens_list: List[int], + start_pos_list: List[int], + slot_mapping_flat: 
torch.Tensor, + device: torch.device, + ) -> ModelInputForGPUWithSamplingMetadata: + rebuilt_input_tokens = [] + rebuilt_input_positions= [] + rebuilt_query_lens = [] + + rebuilt_num_prefills = 0 + rebuilt_num_prefill_tokens = 0 + rebuilt_slot_mapping = [] + rebuilt_max_query_len = 0 + + rebuilt_block_tables = [] + + rebuilt_query_start_loc = [0] + rebuilt_context_lens_tensor = [] + rebuilt_selected_token_indices = [] + + for idx in range(len(input_tokens_list)): + token_tensor = input_tokens_list[idx] + num_token = len(token_tensor) + num_computed_token = num_computed_tokens_list[idx] + start_pos = start_pos_list[idx] + + rebuilt_input_tokens.append(token_tensor[num_computed_token:]) + # TODO(Jiayi): please check the correctness of next line + rebuilt_input_positions.append(model_input.input_positions[start_pos+num_computed_token:start_pos+num_token]) + q_len = num_token - num_computed_token + rebuilt_query_lens.append(q_len) + + # Attn metadata-related + rebuilt_num_prefills += 1 + rebuilt_num_prefill_tokens += q_len + rebuilt_slot_mapping.append(slot_mapping_flat[start_pos+num_computed_token:start_pos+num_token]) + rebuilt_max_query_len = max(q_len, rebuilt_max_query_len) + # TODO(Jiayi): remove hard-code (block_size=16) + blk_size = 16 + temp_block_table = [i//blk_size for i in range(start_pos, start_pos+num_token, blk_size)] + rebuilt_block_tables.append(temp_block_table) + rebuilt_query_start_loc.append(q_len) #start with 0 + rebuilt_context_lens_tensor.append(num_computed_token) + + # Sampling metadata related + #seq_groups (use rebuilt query lens) + rebuilt_selected_token_indices.append(start_pos+q_len-1) + + + # rebuilt attn_metadata + rebuilt_attn_metadata = deepcopy(model_input.attn_metadata) + rebuilt_attn_metadata.num_prefills = rebuilt_num_prefills + rebuilt_attn_metadata.num_prefill_tokens = rebuilt_num_prefill_tokens + rebuilt_attn_metadata.slot_mapping = torch.cat(rebuilt_slot_mapping).to(device) + rebuilt_attn_metadata.max_query_len = 
rebuilt_max_query_len + + rebuilt_attn_metadata.block_tables = torch.tensor( + rebuilt_block_tables, + dtype=model_input.attn_metadata.block_tables.dtype + ).to(device) + + rebuilt_attn_metadata.query_start_loc = torch.tensor( + rebuilt_query_start_loc, + dtype=model_input.attn_metadata.query_start_loc.dtype).to(device) + rebuilt_attn_metadata.context_lens_tensor = torch.tensor( + rebuilt_context_lens_tensor, + dtype=model_input.attn_metadata.context_lens_tensor.dtype, + ).to(device) + + rebuilt_attn_metadata._cached_prefill_metadata = None + + # rebuilt sampling_metadata + rebuilt_sampling_metadata = deepcopy(model_input.sampling_metadata) + for idx, q_len in enumerate(rebuilt_query_lens): + rebuilt_sampling_metadata.seq_groups[idx].query_len = q_len + rebuilt_sampling_metadata.selected_token_indices = torch.tensor( + rebuilt_selected_token_indices, + dtype=model_input.sampling_metadata.selected_token_indices.dtype, + ).to(device) + + rebuilt_model_input = ModelInputForGPUWithSamplingMetadata( + input_tokens = torch.cat(rebuilt_input_tokens).to(device), + input_positions = torch.cat(rebuilt_input_positions).to(device), + seq_lens = model_input.seq_lens, + query_lens = rebuilt_query_lens, + lora_mapping = model_input.lora_mapping, + lora_requests = model_input.lora_requests, + attn_metadata = rebuilt_attn_metadata, + prompt_adapter_mapping = model_input.prompt_adapter_mapping, + prompt_adapter_requests = model_input.prompt_adapter_requests, + multi_modal_kwargs = model_input.multi_modal_kwargs, + request_ids_to_seq_ids = model_input.request_ids_to_seq_ids, + finished_requests_ids = model_input.finished_requests_ids, + virtual_engine = model_input.virtual_engine, + sampling_metadata = rebuilt_sampling_metadata, + is_prompt = model_input.is_prompt, + ) + + return rebuilt_model_input \ No newline at end of file From 0176ebb18a1698e07b0e9e635060001a0f6b419a Mon Sep 17 00:00:00 2001 From: YaoJiayi <120040070@link.cuhk.edu.cn> Date: Fri, 13 Sep 2024 04:31:33 -0500 
Subject: [PATCH 176/278] update worker_base --- vllm/distributed/kv_transfer/vllm_adapter.py | 3 +++ vllm/worker/worker_base.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 98d861fe9a12..e3e9a2b3187e 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -68,12 +68,15 @@ def __init__( torch_distributed_backend: Union[str, Backend], ): + # FIXME(Jiayi): we need two pipes + # one or send and one for recv # init pipe self.pipe = TorchDistributedPipe( group_ranks, local_rank, torch_distributed_backend, ) + # FIXME(Jiayi): buffer initializtion should be updated accordingly # init lookup buffer self.buffer = SimpleKVLookupBuffer(self.pipe, 1000**3 * 10) diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 1b60b444709e..2fc5d3200b13 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -295,7 +295,7 @@ def execute_model( model_input, self.kv_cache[worker_input.virtual_engine], ) - assert bypass_model_exec + #assert bypass_model_exec if not bypass_model_exec: hidden_or_intermediate_states = self.model_runner.execute_model( From 84fd0b826dc93f15861feaf5ea73fa142f571985 Mon Sep 17 00:00:00 2001 From: YaoJiayi <120040070@link.cuhk.edu.cn> Date: Fri, 13 Sep 2024 09:47:42 -0500 Subject: [PATCH 177/278] update comm initialization --- vllm/distributed/kv_transfer/vllm_adapter.py | 36 +++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index e3e9a2b3187e..981c23891a6f 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -68,15 +68,33 @@ def __init__( torch_distributed_backend: Union[str, Backend], ): - # FIXME(Jiayi): we need two pipes - # one or send and one for recv - # init pipe - 
self.pipe = TorchDistributedPipe( - group_ranks, - local_rank, - torch_distributed_backend, - ) - # FIXME(Jiayi): buffer initializtion should be updated accordingly + # init two pipes: one or send and one for recv + if IS_KV_PREFILL_INSTANCE or IS_LMCACHE_INSTANCE: + self.send_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + torch_distributed_backend, + ) + self.recv_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + torch_distributed_backend, + ) + elif IS_KV_DECODE_INSTANCE: + self.recv_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + torch_distributed_backend, + ) + self.send_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + torch_distributed_backend, + ) + + + # FIXME(Jiayi): buffer initializtion should be adapted accordingly + # Signal pipe needs to be initialized on both vllm and lmc side # init lookup buffer self.buffer = SimpleKVLookupBuffer(self.pipe, 1000**3 * 10) From 826ca70daa1cd7ae991c516287823f22893544c3 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 13 Sep 2024 22:44:23 +0000 Subject: [PATCH 178/278] update --- .../kv_lookup_buffer/simple_kv_lookup_buffer.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py index df52dd65692e..c06093413d7e 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py @@ -50,15 +50,14 @@ def _matches(self, tokens_roi_sender, tokens_roi_recver): return True - # I am assuming that roi is a mask on tokens + # Assuming that roi is a mask on tokens tokens_sender = tokens_sender[roi_sender] tokens_recver = tokens_recver[roi_recver] + # simple common prefix matching min_length = min(len(tokens_sender), len(tokens_recver)) if torch.allclose(tokens_sender[:min_length], tokens_recver[:min_length]): - # drastically 
simplified - # common prefix matching return min_length return 0 @@ -148,7 +147,7 @@ def drop_select_handler(self): if 'Connection closed by peer' not in str(e): raise e - logger.debug("closing drop_select_handler") + logger.debug("Closing drop_select_handler") def drop_select(self, input_tokens, roi): From 3425ab64e8afae1b4aca4d69628b69d584033a6f Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 13 Sep 2024 22:56:33 +0000 Subject: [PATCH 179/278] update documentation --- tests/kv_transfer/test_lookup_buffer.py | 2 +- .../kv_transfer/kv_pipe/torch_distributed_pipe.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 5041bf026483..5ccbc5b0f586 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -70,7 +70,7 @@ def stress_test(my_rank, buf, device): n = 0 # the buffer size can only store 100 reqs - # so the sender will occasionally block.needs to wait for the receiver. + # so the sender will occasionally block to wait for the receiver. for req in tqdm(reqs): if my_rank == 0: buf.insert(*req) diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index caa9e6aabd93..c77da1e6b75b 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -10,9 +10,11 @@ logger = init_logger(__name__) -# if the tensor is only one-element and only contains this number +# if the tensor is only one-element and only contains NONE_INT # this means that the sended object is None. 
NONE_INT = -150886311 + +# Mapping tensor dtype to a int, used for tensor metadata transmission FLOAT16_INT = -543205003776624 INT64_INT = -375623078607432 BOOL_INT = -28035262008646 @@ -258,11 +260,9 @@ def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: self.block_if_full() with self.buffer_size_lock: - # print("Remaining size:", self.buffer_size) self.buffer_size = self.buffer_size + tensor_size - #self.send_tensor_wrapper(tensor) self.transport_thread.submit( self.send_tensor_wrapper, tensor, From 9f3a3a50bbe2bace6c2a17d139a68d92eecd8939 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 13 Sep 2024 23:04:17 +0000 Subject: [PATCH 180/278] adjust vllm adapter: now we separate CPU and device into different pipes --- vllm/distributed/kv_transfer/vllm_adapter.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index d13d132f5dfe..ba2c06c24b2d 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -66,13 +66,19 @@ def __init__( ): # init pipe - self.pipe = TorchDistributedPipe( + self.device_pipe = TorchDistributedPipe( group_ranks, local_rank, torch_distributed_backend, ) + self.cpu_pipe = TorchDistributedPipe( + group_ranks, + local_ranks, + "gloo" + ) # init lookup buffer - self.buffer = SimpleKVLookupBuffer(self.pipe, 1000**3 * 10) + # TODO: replace this 1e9 with a configurable parameter or a constant + self.buffer = SimpleKVLookupBuffer(self.cpu_pipe, self.device_pipe, 1e9) def send_kv_caches_and_hidden_states( self, From ce79d596fc68dbbb4e57921490a0a6a41cdc6715 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 13 Sep 2024 23:20:33 +0000 Subject: [PATCH 181/278] build 2 pipes in vLLM adapter --- .../kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py | 2 +- vllm/distributed/kv_transfer/vllm_adapter.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git 
a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py index c06093413d7e..de2e7cf5d5d5 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py @@ -181,7 +181,7 @@ def full_handler(self): def insert(self, input_tokens, roi, key, value, hidden) -> None: while self.buffer_size > self.buffer_size_threshold: - logger.debug("KV transfer buffer is full. Handling...") + # logger.debug("KV transfer buffer is full. Handling...") self.full_handler() self._add_to_buffer(input_tokens, roi, key, value, hidden) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index ba2c06c24b2d..d85ffe79abdf 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -73,12 +73,12 @@ def __init__( ) self.cpu_pipe = TorchDistributedPipe( group_ranks, - local_ranks, + local_rank, "gloo" ) # init lookup buffer # TODO: replace this 1e9 with a configurable parameter or a constant - self.buffer = SimpleKVLookupBuffer(self.cpu_pipe, self.device_pipe, 1e9) + self.buffer = SimpleKVLookupBuffer(self.cpu_pipe, self.device_pipe, 1e9 * 10) def send_kv_caches_and_hidden_states( self, From 34dfdde0face48494c69f1e5d1b20cd6ca96ab27 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Fri, 13 Sep 2024 23:39:27 +0000 Subject: [PATCH 182/278] documentation chagne --- vllm/distributed/kv_transfer/vllm_adapter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index d85ffe79abdf..42f451f6bc8f 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -1,14 +1,14 @@ """vLLM distributed KV cache transfer API. 
These APIs are used in `vllm/worker/model_runner.py`. -Currently supporting TP and PP. +Currently supporting TP and PP, but TP and PP must be the same. Workflow: -- In prefill instance, KV cache sender *buffers* the KV cache send requests +- In prefill instance, vLLM `insert` that buffers the KV cache into lookup buffer. - In decode instance - - KV cache receiver sends the hash of input tokens to sender - - KV cache sender executes send request - - KV cache receiver receives the KV cache + - vLLM first runs `drop_select` to send input tokens and a mask on input tokens to sender + - The prefill instance send back the matching KV caches + - vLLM then store the KV cache into paged memory. """ from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING from collections import defaultdict, deque From 9355be358a8fcfe86d945c8e8893515dab84d1c0 Mon Sep 17 00:00:00 2001 From: YaoJiayi <120040070@link.cuhk.edu.cn> Date: Sat, 14 Sep 2024 10:09:17 -0500 Subject: [PATCH 183/278] update vllm_adapter --- vllm/distributed/kv_transfer/vllm_adapter.py | 35 ++++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 11bea4bcef12..2ed4bea47345 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -68,7 +68,7 @@ def __init__( torch_distributed_backend: Union[str, Backend], ): - + ''' # init pipe self.device_pipe = TorchDistributedPipe( group_ranks, @@ -80,30 +80,50 @@ def __init__( local_rank, "gloo" ) - - # init two pipes: one or send and one for recv + ''' + # init 4 pipes: 2 * (one for send and one for recv) if IS_KV_PREFILL_INSTANCE or IS_LMCACHE_INSTANCE: self.send_pipe = TorchDistributedPipe( group_ranks, local_rank, torch_distributed_backend, ) + self.send_signal_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + "gloo", + ) self.recv_pipe = TorchDistributedPipe( group_ranks, 
local_rank, torch_distributed_backend, ) + self.recv_signal_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + "gloo", + ) elif IS_KV_DECODE_INSTANCE: self.recv_pipe = TorchDistributedPipe( group_ranks, local_rank, torch_distributed_backend, ) + self.recv_signal_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + "gloo", + ) self.send_pipe = TorchDistributedPipe( group_ranks, local_rank, torch_distributed_backend, ) + self.send_signal_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + "gloo", + ) # FIXME(Jiayi): buffer initializtion should be adapted accordingly @@ -111,7 +131,10 @@ def __init__( # init lookup buffer # TODO: replace this 1e9 with a configurable parameter or a constant - self.buffer = SimpleKVLookupBuffer(self.cpu_pipe, self.device_pipe, 1e9 * 10) + #self.buffer = SimpleKVLookupBuffer(self.cpu_pipe, self.device_pipe, 1e9 * 10) + + self.send_buffer = SimpleKVLookupBuffer(self.send_pipe, self.send_signal_pipe, 1e9 * 10) + self.recv_buffer = SimpleKVLookupBuffer(self.recv_pipe, self.recv_signal_pipe, 1e9 * 10) def send_kv_caches_and_hidden_states( self, @@ -152,7 +175,7 @@ def send_kv_caches_and_hidden_states( keys = torch.cat(keys, dim=0) values = torch.cat(values, dim=0) - self.buffer.insert( + self.send_buffer.insert( current_tokens, torch.ones_like(current_tokens, dtype=bool), keys, @@ -197,7 +220,7 @@ def recv_kv_caches_and_hidden_states( input_tokens_list.append(current_tokens) start_pos_list.append(start_pos) - ret = self.buffer.drop_select( + ret = self.recv_buffer.drop_select( current_tokens, torch.ones_like(current_tokens, dtype=bool)) if ret[0] is None: From 54b68c9a170215196862054bece059bd47e96f01 Mon Sep 17 00:00:00 2001 From: YaoJiayi <120040070@link.cuhk.edu.cn> Date: Sun, 15 Sep 2024 06:45:09 -0500 Subject: [PATCH 184/278] minor fix --- vllm/distributed/kv_transfer/vllm_adapter.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py 
b/vllm/distributed/kv_transfer/vllm_adapter.py index 2ed4bea47345..db1e4a39dd0c 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -30,7 +30,6 @@ from vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe import TorchDistributedPipe from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer import SimpleKVLookupBuffer -from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata from copy import deepcopy assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode", "lmcache"], \ @@ -139,7 +138,7 @@ def __init__( def send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, - model_input: ModelInputForGPUWithSamplingMetadata, + model_input: "ModelInputForGPUWithSamplingMetadata", kv_caches: List[torch.Tensor], hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: @@ -190,9 +189,9 @@ def send_kv_caches_and_hidden_states( def recv_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, - model_input: ModelInputForGPUWithSamplingMetadata, + model_input: "ModelInputForGPUWithSamplingMetadata", kv_caches: List[torch.Tensor] - ) -> List[Union[torch.Tensor, IntermediateTensors], bool, ModelInputForGPUWithSamplingMetadata]: + ) -> List[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: bypass_model_exec = True @@ -291,13 +290,13 @@ def recv_kv_caches_and_hidden_states( def adpat_model_input( self, - model_input: ModelInputForGPUWithSamplingMetadata, + model_input: "ModelInputForGPUWithSamplingMetadata", input_tokens_list: List[torch.Tensor], num_computed_tokens_list: List[int], start_pos_list: List[int], slot_mapping_flat: torch.Tensor, device: torch.device, - ) -> ModelInputForGPUWithSamplingMetadata: + ) -> "ModelInputForGPUWithSamplingMetadata": rebuilt_input_tokens = [] rebuilt_input_positions= [] rebuilt_query_lens = [] @@ -373,6 +372,7 @@ def adpat_model_input( 
dtype=model_input.sampling_metadata.selected_token_indices.dtype, ).to(device) + from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata rebuilt_model_input = ModelInputForGPUWithSamplingMetadata( input_tokens = torch.cat(rebuilt_input_tokens).to(device), input_positions = torch.cat(rebuilt_input_positions).to(device), From 2dff6580684f8436e7d18878fed6681227a7142b Mon Sep 17 00:00:00 2001 From: YaoJiayi <120040070@link.cuhk.edu.cn> Date: Sun, 15 Sep 2024 07:02:00 -0500 Subject: [PATCH 185/278] fix type hint --- vllm/distributed/kv_transfer/vllm_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index db1e4a39dd0c..b60236700ab2 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -191,7 +191,7 @@ def recv_kv_caches_and_hidden_states( model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", kv_caches: List[torch.Tensor] - ) -> List[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: bypass_model_exec = True From c6a6714310b24d70188efd2dccc0ad24f699bc43 Mon Sep 17 00:00:00 2001 From: YaoJiayi <120040070@link.cuhk.edu.cn> Date: Sun, 15 Sep 2024 09:40:01 -0500 Subject: [PATCH 186/278] fix comm init --- vllm/distributed/parallel_state.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index c48b113de970..92f2fe03d2bc 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -191,10 +191,10 @@ def init_distributed_environment( # this backend is used for WORLD maybe_disagg_world_size = world_size maybe_disagg_rank = rank - if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: + if 
dist_kv.IS_DISTRIBUTED_KV_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE: maybe_disagg_world_size = world_size * 2 logger.debug("Disaggregated prefill enabled.") - if dist_kv.IS_KV_PREFILL_INSTANCE: + if dist_kv.IS_KV_PREFILL_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE: # for prefill, the ranks are [0, world_size) maybe_disagg_rank = rank else: @@ -227,7 +227,7 @@ def init_distributed_environment( if _WORLD is None: ranks = [[i for i in range(world_size)]] # offset the distributed group - if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE: ranks = include_decoding_groups_if_disagg_enabled( ranks, world_size) @@ -289,7 +289,7 @@ def initialize_model_parallel( world_size: int = torch.distributed.get_world_size() backend = backend or torch.distributed.get_backend( get_world_group().device_group) - if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE: # Disaggregated prefill enabled # The world_size for this vLLM instance is tp * pp, but torch.distributed contains 2 vLLM instances, its world size is 2 * tp * pp # Adjust the world_size to match. 
@@ -341,7 +341,8 @@ def initialize_model_parallel( use_custom_allreduce=False) logger.debug("_PP initialized for rank %d", torch.distributed.get_rank()) - if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: + # TODO(Jiayi): perhaps we need to separate lmcache and disagg + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE: global _DISAGG logger.debug("Disaggregated prefill enabled, create _DISAGG group") group_ranks = [] From fef35b24cfb2884f3e26b86c85ec9d76ae629039 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 17:42:34 +0000 Subject: [PATCH 187/278] bug fix: remove self from bypass_model_exec --- vllm/distributed/kv_transfer/vllm_adapter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index b60236700ab2..b74bf3c75826 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -224,7 +224,7 @@ def recv_kv_caches_and_hidden_states( torch.ones_like(current_tokens, dtype=bool)) if ret[0] is None: # didn't find any match. 
- self.bypass_model_exec = False + bypass_model_exec = False num_computed_tokens_list.append(0) continue From 4d0b5cdf9ccbd99d6641da0af17a42e2fe7fad52 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 18:59:38 +0000 Subject: [PATCH 188/278] bug fix: should init SimpleKVLookupBuffer with signal pipe first and then data pipe --- vllm/distributed/kv_transfer/vllm_adapter.py | 51 +++++++------------- 1 file changed, 18 insertions(+), 33 deletions(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index b74bf3c75826..93edeaa1d5d0 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -67,21 +67,9 @@ def __init__( torch_distributed_backend: Union[str, Backend], ): - ''' - # init pipe - self.device_pipe = TorchDistributedPipe( - group_ranks, - local_rank, - torch_distributed_backend, - ) - self.cpu_pipe = TorchDistributedPipe( - group_ranks, - local_rank, - "gloo" - ) - ''' - # init 4 pipes: 2 * (one for send and one for recv) - if IS_KV_PREFILL_INSTANCE or IS_LMCACHE_INSTANCE: + if IS_LMCACHE_INSTANCE: + # when vLLM is connected with LMCache + # it needs to both send and recv KV cache self.send_pipe = TorchDistributedPipe( group_ranks, local_rank, @@ -102,27 +90,26 @@ def __init__( local_rank, "gloo", ) - elif IS_KV_DECODE_INSTANCE: - self.recv_pipe = TorchDistributedPipe( - group_ranks, - local_rank, - torch_distributed_backend, - ) - self.recv_signal_pipe = TorchDistributedPipe( - group_ranks, - local_rank, - "gloo", - ) - self.send_pipe = TorchDistributedPipe( + self.send_buffer = SimpleKVLookupBuffer(self.send_signal_pipe, self.send_pipe, 1e9 * 10) + self.recv_buffer = SimpleKVLookupBuffer(self.recv_signal_pipe, self.recv_pipe, 1e9 * 10) + else: + # when performing disaggregated prefill, only 1 pipe is needed + # at prefill instance this pipe is used for send KV cache + # at decode instance this pipe is used for recv KV cache + self.pipe = 
TorchDistributedPipe( group_ranks, local_rank, torch_distributed_backend, ) - self.send_signal_pipe = TorchDistributedPipe( + self.signal_pipe = TorchDistributedPipe( group_ranks, local_rank, "gloo", ) + self.send_buffer = SimpleKVLookupBuffer(self.signal_pipe, self.pipe, 1e9 * 10) + self.recv_buffer = self.send_buffer + + # FIXME(Jiayi): buffer initializtion should be adapted accordingly @@ -132,9 +119,7 @@ def __init__( # TODO: replace this 1e9 with a configurable parameter or a constant #self.buffer = SimpleKVLookupBuffer(self.cpu_pipe, self.device_pipe, 1e9 * 10) - self.send_buffer = SimpleKVLookupBuffer(self.send_pipe, self.send_signal_pipe, 1e9 * 10) - self.recv_buffer = SimpleKVLookupBuffer(self.recv_pipe, self.recv_signal_pipe, 1e9 * 10) - + def send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, @@ -269,7 +254,7 @@ def recv_kv_caches_and_hidden_states( return None, bypass_model_exec, None if not is_complete: - rebuilt_model_input = self.adpat_model_input( + rebuilt_model_input = self.build_partial_prefill_input( model_input, input_tokens_list, num_computed_tokens_list, @@ -288,7 +273,7 @@ def recv_kv_caches_and_hidden_states( return hidden_or_intermediate_states, bypass_model_exec, model_input - def adpat_model_input( + def build_partial_prefill_input( self, model_input: "ModelInputForGPUWithSamplingMetadata", input_tokens_list: List[torch.Tensor], From 31b891db4170a00572c3c81a962555411f9bee3c Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 19:04:15 +0000 Subject: [PATCH 189/278] adjust torch distributed logging --- .../kv_transfer/kv_pipe/torch_distributed_pipe.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index c77da1e6b75b..3a6a94bb0e75 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ 
b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -226,8 +226,14 @@ def send_tensor_wrapper(self, tensor): with self.buffer_size_lock: self.buffer_size = self.buffer_size - tensor_size except Exception as e: - logger.error("Encountering exception in KV sending thread") - logger.error("%s", e) + logger.error("[rank%d]: Exception when trying to send %s, msg: %s", + torch.distributed.get_rank(), + str(tensor), + str(e)) + import traceback + traceback.print_exc() + + def block_if_full(self): """ @@ -279,10 +285,11 @@ def recv_tensor(self) -> Optional[torch.Tensor]: try: tensor = future.result() except Exception as e: + # the underlying pipe is likely broken logger.error("Encountering exception in KV receiving thread") logger.error("%s", e) - - #tensor = self._recv_impl() + # fault tolerance: if the pipe is broken, return None + return None if tensor.numel() == 1 and tensor.item() == NONE_INT: return None From 7e68d089032077815cb9164035325154036c7b9e Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 19:04:29 +0000 Subject: [PATCH 190/278] remove unnecessaqry comments --- vllm/distributed/kv_transfer/vllm_adapter.py | 30 ++++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 93edeaa1d5d0..15b512512055 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -65,6 +65,8 @@ def __init__( group_ranks: List[List[int]], local_rank: int, torch_distributed_backend: Union[str, Backend], + # FIXME(Kuntai): remove this hardcoding + lookup_buffer_size: int = 1e10 ): if IS_LMCACHE_INSTANCE: @@ -90,8 +92,14 @@ def __init__( local_rank, "gloo", ) - self.send_buffer = SimpleKVLookupBuffer(self.send_signal_pipe, self.send_pipe, 1e9 * 10) - self.recv_buffer = SimpleKVLookupBuffer(self.recv_signal_pipe, self.recv_pipe, 1e9 * 10) + self.send_buffer = SimpleKVLookupBuffer( + 
self.send_signal_pipe, + self.send_pipe, + self.lookup_buffer_size) + self.recv_buffer = SimpleKVLookupBuffer( + self.recv_signal_pipe, + self.recv_pipe, + self.lookup_buffer_size) else: # when performing disaggregated prefill, only 1 pipe is needed # at prefill instance this pipe is used for send KV cache @@ -106,19 +114,11 @@ def __init__( local_rank, "gloo", ) - self.send_buffer = SimpleKVLookupBuffer(self.signal_pipe, self.pipe, 1e9 * 10) - self.recv_buffer = self.send_buffer - - - - - # FIXME(Jiayi): buffer initializtion should be adapted accordingly - # Signal pipe needs to be initialized on both vllm and lmc side - - # init lookup buffer - # TODO: replace this 1e9 with a configurable parameter or a constant - #self.buffer = SimpleKVLookupBuffer(self.cpu_pipe, self.device_pipe, 1e9 * 10) - + self.send_buffer = SimpleKVLookupBuffer( + self.signal_pipe, + self.pipe, + self.lookup_buffer_size) + self.recv_buffer = self.send_buffer def send_kv_caches_and_hidden_states( self, From 85c7a644f87fa5477fb6ea5812b70882981fc7e6 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 19:18:00 +0000 Subject: [PATCH 191/278] remove unnecessary comments --- tests/kv_transfer/test_lookup_buffer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index 5ccbc5b0f586..ae19d068be9f 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -27,10 +27,7 @@ def test_run(my_rank, buffer, device): placeholder = torch.tensor([1]).to(device) buffer.insert(tokens, roi, key, value, placeholder) - - #for i in range(2000): - # print("Here:", i) - # time.sleep(0.01) + torch.distributed.barrier() # drop_select From 01fe335ce3fb3c7023fd527db8a000e3a96e8996 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 20:09:47 +0000 Subject: [PATCH 192/278] update documentation --- vllm/distributed/kv_transfer/vllm_adapter.py | 31 
+++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 15b512512055..e504d886466f 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -1,14 +1,19 @@ """vLLM distributed KV cache transfer API. -These APIs are used in `vllm/worker/model_runner.py`. +These APIs are used in `vllm/worker/worker_base.py`. -Currently supporting TP and PP, but TP and PP must be the same. +Currently supporting TP. The TP between prefill and decode instance needs to be the same. -Workflow: -- In prefill instance, vLLM `insert` that buffers the KV cache into lookup buffer. +Workflow (disaggregated prefill) +- In prefill instance + - After prefill, vLLM `insert` its KV caches into a lookup buffer. + - The prefill instance will also open up a thread that listens to `drop_select` request. - In decode instance - - vLLM first runs `drop_select` to send input tokens and a mask on input tokens to sender - - The prefill instance send back the matching KV caches - - vLLM then store the KV cache into paged memory. + - vLLM first runs `drop_select` to send input tokens and a mask on input tokens (we call it roi, region of interest) to prefill instance + - The prefill instance then respond to `drop_select` request by + - Finding a match in current lookup buffer. + - Clone and send the matched item out + - Delete the matched item in the lookup buffer to free up GPU memory. + - The decode vLLM then store the KV cache into paged memory. 
""" from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING from collections import defaultdict, deque @@ -69,6 +74,8 @@ def __init__( lookup_buffer_size: int = 1e10 ): + self.lookup_buffer_size = lookup_buffer_size + if IS_LMCACHE_INSTANCE: # when vLLM is connected with LMCache # it needs to both send and recv KV cache @@ -114,11 +121,12 @@ def __init__( local_rank, "gloo", ) - self.send_buffer = SimpleKVLookupBuffer( + buffer = SimpleKVLookupBuffer( self.signal_pipe, self.pipe, self.lookup_buffer_size) - self.recv_buffer = self.send_buffer + self.send_buffer = buffer + self.recv_buffer = buffer def send_kv_caches_and_hidden_states( self, @@ -178,6 +186,7 @@ def recv_kv_caches_and_hidden_states( kv_caches: List[torch.Tensor] ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: + # When this flag is set to False, it means that bypass_model_exec = True # This is disagg decode instance, during prefill state @@ -226,9 +235,7 @@ def recv_kv_caches_and_hidden_states( for i in range(model_executable.model.start_layer, model_executable.model.end_layer): - # get kv cache kv_cache = kv_caches[i - model_executable.model.start_layer] - # get corresponding layer layer = model_executable.model.layers[i] key_cache, value_cache = kv_cache[0], kv_cache[1] @@ -297,6 +304,7 @@ def build_partial_prefill_input( rebuilt_context_lens_tensor = [] rebuilt_selected_token_indices = [] + # recounting query and context lengths for idx in range(len(input_tokens_list)): token_tensor = input_tokens_list[idx] num_token = len(token_tensor) @@ -357,6 +365,7 @@ def build_partial_prefill_input( dtype=model_input.sampling_metadata.selected_token_indices.dtype, ).to(device) + # import here to avoid circular import. 
from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata rebuilt_model_input = ModelInputForGPUWithSamplingMetadata( input_tokens = torch.cat(rebuilt_input_tokens).to(device), From caaaeb8a1bf94a202be5bd6e6a3a2dacfefe8402 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 20:15:51 +0000 Subject: [PATCH 193/278] update overhead benchmark --- .../disagg_overhead_benchmark.sh | 2 +- .../simple_kv_lookup_buffer.py | 35 ++++++++++++++----- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index d264f1815643..f0ee54357af7 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -53,7 +53,7 @@ benchmark() { model="meta-llama/Meta-Llama-3.1-8B-Instruct" dataset_name="sonnet" dataset_path="../sonnet_4x.txt" - num_prompts=20 + num_prompts=10 qps=$1 prefix_len=50 input_len=2048 diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py index de2e7cf5d5d5..6172bf092fb0 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py @@ -2,7 +2,7 @@ from vllm.distributed.kv_transfer.kv_lookup_buffer.base import \ KVLookupBufferBase from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase -from typing import Dict, Tuple, List, Optional +from typing import Dict, Tuple, List, Optional, Union import threading import torch from collections import deque @@ -14,10 +14,19 @@ class SimpleKVLookupBuffer(KVLookupBufferBase): - def __init__(self, signal_pipe, data_pipe, buffer_size_thresh): + def __init__(self, + signal_pipe: KVPipeBase, + data_pipe: KVPipeBase, + buffer_size_thresh: int): """ - signal_pipe: on CPU -- avoid recv() stops the python 
intepreter - data_pipe: on GPU + signal_pipe: on CPU + + NOTE: on-device recv will block all threads in the process, making the + KV cache producer unable to listen to new request while transmitting + KV cache. Luckily CPU recv only blocks the current thread so we use + CPU recv to listen to new request. + + data_pipe: on device (e.g. GPU) """ self.buffer = deque() @@ -33,7 +42,9 @@ def __init__(self, signal_pipe, data_pipe, buffer_size_thresh): self.end_signal = None - def _matches(self, tokens_roi_sender, tokens_roi_recver): + def _matches(self, + tokens_roi_sender: List[torch.Tensor], + tokens_roi_recver: List[torch.Tensor]): # tokens_roi_sender: tokens and roi of the producer (in the buffer) # tokens_roi_recver: tokens and roi of the consumer (query) @@ -69,7 +80,7 @@ def _send_tensor_and_dec_size(self, tensor: Optional[torch.Tensor]) -> None: self.buffer_size -= tensor.element_size() * tensor.numel() self.data_pipe.send_tensor(tensor) - def _get_element_size(self, data): + def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]): if data == [] or data is None: return 0 @@ -78,7 +89,12 @@ def _get_element_size(self, data): assert False, "Unknown data type %s" % type(data) - def _add_to_buffer(self, input_tokens, roi, key, value, hidden): + def _add_to_buffer(self, + input_tokens: torch.Tensor, + roi: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + hidden: torch.Tensor): if isinstance(input_tokens, torch.Tensor): input_tokens = input_tokens.clone() @@ -150,7 +166,9 @@ def drop_select_handler(self): logger.debug("Closing drop_select_handler") - def drop_select(self, input_tokens, roi): + def drop_select(self, + input_tokens: torch.Tensor, + roi: torch.Tensor): assert self.request_handling_thread is None, \ "drop_select should be called by the receiver" @@ -183,6 +201,7 @@ def insert(self, input_tokens, roi, key, value, hidden) -> None: while self.buffer_size > self.buffer_size_threshold: # logger.debug("KV transfer buffer is full. 
Handling...") self.full_handler() + self._add_to_buffer(input_tokens, roi, key, value, hidden) From 515c47b4cea8f638e80fef32e06ad07765594016 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 20:46:39 +0000 Subject: [PATCH 194/278] remove group coordinator import --- vllm/distributed/kv_transfer/vllm_adapter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index e504d886466f..9a6b55cbbe66 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -27,7 +27,6 @@ from torch.distributed import Backend, ProcessGroup import vllm.envs as envs -from vllm.distributed.group_coordinator import GroupCoordinator from vllm.logger import init_logger import vllm.distributed.parallel_state as ps from vllm import _custom_ops as ops From f166cf8db07fa93bd626564e0547cace2615232b Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 20:48:19 +0000 Subject: [PATCH 195/278] remove syntax bug --- vllm/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index 48ca1a67123b..1adab6191726 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -529,7 +529,7 @@ def get_open_zmq_ipc_path() -> str: return f"ipc://{base_rpc_path}/{uuid4()}" -def get_open_port(, force: bool = False) -> int: +def get_open_port(force: bool = False) -> int: port = envs.VLLM_PORT if port is not None: if force and port is not None: From f320518485307b5eb7a4006253929b9bbc654bf7 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 21:09:35 +0000 Subject: [PATCH 196/278] update round robin proxy. 
Prior bash-based impl is buggy --- .../disagg_benchmarks/round_robin_proxy.py | 94 +++++++++++++++++++ .../disagg_benchmarks/round_robin_proxy.sh | 19 ---- 2 files changed, 94 insertions(+), 19 deletions(-) create mode 100644 benchmarks/disagg_benchmarks/round_robin_proxy.py delete mode 100644 benchmarks/disagg_benchmarks/round_robin_proxy.sh diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py new file mode 100644 index 000000000000..04a30f774670 --- /dev/null +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -0,0 +1,94 @@ +import asyncio +import aiohttp +from aiohttp import web +import itertools + +class AsyncRoundRobinProxy: + def __init__(self, backend_ports): + self.backend_ports = itertools.cycle(backend_ports) + self.session = None + + async def start(self): + self.session = aiohttp.ClientSession() + + async def stop(self): + if self.session: + await self.session.close() + + async def handle_request(self, request): + backend_port = next(self.backend_ports) + print("forwarding to port", backend_port) + backend_url = f"http://localhost:{backend_port}{request.path_qs}" + + try: + async with self.session.request( + method=request.method, + url=backend_url, + headers=request.headers, + data=await request.read() + ) as backend_response: + response = web.StreamResponse( + status=backend_response.status, + headers=backend_response.headers + ) + await response.prepare(request) + + async for chunk in backend_response.content.iter_any(): + await response.write(chunk) + + await response.write_eof() + return response + + except aiohttp.ClientError as e: + return web.Response(text=f"Backend error: {str(e)}", status=502) + +async def run_backend(port): + async def handle(request): + if request.path == '/stream': + response = web.StreamResponse( + status=200, + headers={'Content-Type': 'text/plain'} + ) + await response.prepare(request) + for i in range(10): + await response.write(f"Chunk 
{i}\n".encode()) + await asyncio.sleep(0.5) # Simulate delay between chunks + return response + else: + return web.Response(text=f"Response from backend on port {port}") + + app = web.Application() + app.router.add_route('*', '/{tail:.*}', handle) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', port) + await site.start() + print(f"Backend running on http://localhost:{port}") + +async def main(): + proxy = AsyncRoundRobinProxy([8100, 8200]) + await proxy.start() + + app = web.Application() + app.router.add_route('*', '/{tail:.*}', proxy.handle_request) + + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, 'localhost', 8000) + + await asyncio.gather( + site.start(), + run_backend(8100), + run_backend(8200) + ) + + print("Proxy running on http://localhost:8000") + + try: + await asyncio.Future() # Run forever + finally: + await proxy.stop() + await runner.cleanup() + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.sh b/benchmarks/disagg_benchmarks/round_robin_proxy.sh deleted file mode 100644 index 375bf9e42237..000000000000 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -# Define the ports to forward to -PORTS=(8100 8200) -NUM_PORTS=${#PORTS[@]} -CURRENT=0 - -# Function to handle the round-robin logic -get_next_port() { - NEXT_PORT=${PORTS[$CURRENT]} - CURRENT=$(( (CURRENT + 1) % NUM_PORTS )) - echo $NEXT_PORT -} - -# Start the proxy -while true; do - NEXT_PORT=$(get_next_port) - socat TCP4-LISTEN:8000,reuseaddr,fork TCP4:localhost:$NEXT_PORT 2>/dev/null -done \ No newline at end of file From 5b4a3e3140d99b812180a4eb9e4c0afef529726e Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 21:12:35 +0000 Subject: [PATCH 197/278] update docs for disagg overhead benchmark --- .../disagg_overhead_benchmark.sh | 17 +++++------------ 1 file changed, 
5 insertions(+), 12 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index f0ee54357af7..36116172ab7c 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -1,17 +1,10 @@ #!/bin/bash -# Requirement: 8x H100 GPUs. - - -# Model: neuralmagic/Meta-Llama-3-70B-Instruct-FP8-KV -# Query: 2048 input tokens, 11 output tokens, QPS 4, 500 requests -# Resource: 8x H100 -# Approaches: -# 1. Chunked prefill: 1 vllm instance with tp=8 -# 2. Chunked prefill: 2 vllm instance with tp=4, equivalent to 1 tp=4 instance with QPS 4 -# 3. Disaggregated prefill: 1 prefilling instance and 1 decoding instance -# Prefilling instance: max_output_token=1 -# Decoding instance: force the input tokens be the same across requests to bypass prefilling +# benchmark the overhead of disaggregated prefill. +# methodology: +# - send all request to prefill vLLM instance. It will buffer KV cache. +# - then send all request to decode instance. +# - The TTFT of decode instance is the overhead. set -ex From 01b2fd3068a709efd62b52e31a99b0cf9cbca030 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 21:15:00 +0000 Subject: [PATCH 198/278] use new round robin proxy in performance benchmark --- benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index dde9a80b59b3..715fe56d6c59 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -19,7 +19,6 @@ kill_gpu_processes() { # kill all processes on GPU. 
pkill -f pt_main_thread pkill -f python3 - pkill -f round_robin_proxy.sh ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done sleep 1 @@ -61,7 +60,7 @@ launch_chunked_prefill() { --gpu-memory-utilization 0.8 & wait_for_server 8100 wait_for_server 8200 - bash round_robin_proxy.sh & + python3 round_robin_proxy.py & sleep 1 } @@ -149,7 +148,7 @@ main() { mkdir results default_qps=10 - default_output_len=150 + default_output_len=10 export VLLM_LOGGING_LEVEL=DEBUG export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') From 54bd11f169daf6f9a0d639ce8695db71598778c8 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 21:19:42 +0000 Subject: [PATCH 199/278] update --- .../disagg_benchmarks/round_robin_proxy.py | 117 ++++++------------ 1 file changed, 40 insertions(+), 77 deletions(-) diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index 04a30f774670..8751e24a08d3 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -3,92 +3,55 @@ from aiohttp import web import itertools -class AsyncRoundRobinProxy: - def __init__(self, backend_ports): - self.backend_ports = itertools.cycle(backend_ports) - self.session = None - - async def start(self): - self.session = aiohttp.ClientSession() - - async def stop(self): - if self.session: - await self.session.close() +class RoundRobinProxy: + def __init__(self, target_ports): + self.target_ports = target_ports + self.port_cycle = itertools.cycle(self.target_ports) async def handle_request(self, request): - backend_port = next(self.backend_ports) - print("forwarding to port", backend_port) - backend_url = f"http://localhost:{backend_port}{request.path_qs}" - - try: - async with self.session.request( - method=request.method, - url=backend_url, - headers=request.headers, - data=await request.read() - ) as 
backend_response: - response = web.StreamResponse( - status=backend_response.status, - headers=backend_response.headers - ) - await response.prepare(request) - - async for chunk in backend_response.content.iter_any(): - await response.write(chunk) - - await response.write_eof() - return response - - except aiohttp.ClientError as e: - return web.Response(text=f"Backend error: {str(e)}", status=502) - -async def run_backend(port): - async def handle(request): - if request.path == '/stream': - response = web.StreamResponse( - status=200, - headers={'Content-Type': 'text/plain'} - ) - await response.prepare(request) - for i in range(10): - await response.write(f"Chunk {i}\n".encode()) - await asyncio.sleep(0.5) # Simulate delay between chunks - return response - else: - return web.Response(text=f"Response from backend on port {port}") - - app = web.Application() - app.router.add_route('*', '/{tail:.*}', handle) - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite(runner, 'localhost', port) - await site.start() - print(f"Backend running on http://localhost:{port}") + target_port = next(self.port_cycle) + target_url = f"http://localhost:{target_port}{request.path_qs}" + + async with aiohttp.ClientSession() as session: + try: + # Forward the request + async with session.request( + method=request.method, + url=target_url, + headers=request.headers, + data=request.content, + ) as response: + # Start sending the response + resp = web.StreamResponse( + status=response.status, + headers=response.headers + ) + await resp.prepare(request) + + # Stream the response content + async for chunk in response.content.iter_any(): + await resp.write(chunk) + + await resp.write_eof() + return resp + + except Exception as e: + return web.Response(text=f"Error: {str(e)}", status=500) async def main(): - proxy = AsyncRoundRobinProxy([8100, 8200]) - await proxy.start() - + proxy = RoundRobinProxy([8100, 8200]) app = web.Application() - app.router.add_route('*', 
'/{tail:.*}', proxy.handle_request) + app.router.add_route('*', '/{path:.*}', proxy.handle_request) runner = web.AppRunner(app) await runner.setup() site = web.TCPSite(runner, 'localhost', 8000) + await site.start() - await asyncio.gather( - site.start(), - run_backend(8100), - run_backend(8200) - ) - - print("Proxy running on http://localhost:8000") - - try: - await asyncio.Future() # Run forever - finally: - await proxy.stop() - await runner.cleanup() + print("Proxy server started on http://localhost:8000") + + # Keep the server running + await asyncio.Event().wait() -if __name__ == "__main__": +if __name__ == '__main__': asyncio.run(main()) \ No newline at end of file From b19f346a0c72799ccfaf9e9f89b5b6a8938e0330 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 22:13:26 +0000 Subject: [PATCH 200/278] update benchmarking script --- .../analyze_benchmark_results.py | 48 ------------------- .../disagg_performance_benchmark.sh | 17 +++---- 2 files changed, 9 insertions(+), 56 deletions(-) delete mode 100644 benchmarks/disagg_benchmarks/analyze_benchmark_results.py diff --git a/benchmarks/disagg_benchmarks/analyze_benchmark_results.py b/benchmarks/disagg_benchmarks/analyze_benchmark_results.py deleted file mode 100644 index 4b675c675d25..000000000000 --- a/benchmarks/disagg_benchmarks/analyze_benchmark_results.py +++ /dev/null @@ -1,48 +0,0 @@ - -import argparse -import json -import yaml -import os -from pathlib import Path - -def load(path): - - with open(str(path), 'r') as f: - return json.loads(f.read()) - -def main(args): - - results = Path(args.results_folder) - - chunk = load(results / "chunked_prefill_tp4.json") - prefill = load(results / "disagg_prefill_tp4.json") - decode = load(results / "disagg_decode_tp4.json") - - ttft_ratio = chunk["mean_ttft_ms"] / prefill["mean_ttft_ms"] - itl_ratio = chunk["mean_itl_ms"] / decode["mean_itl_ms"] - prefill_decode_ratio = prefill["mean_ttft_ms"] / (decode["mean_itl_ms"] * args.output_len) - - with 
open(results / args.output_file, 'a') as f: - f.write(yaml.dump([{ - 'qps': args.qps, - 'output_len': args.output_len, - 'prefill_decode_ratio': prefill_decode_ratio, - 'ttft_ratio': ttft_ratio, - 'itl_ratio': itl_ratio, - "chunk_ttft": chunk["mean_ttft_ms"], - "chunk_itl": chunk["mean_itl_ms"], - "disagg_ttft": prefill["mean_ttft_ms"], - "disagg_itl": decode["mean_itl_ms"] - }])) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Analyze benchmark results") - parser.add_argument("--results-folder", required=True, help="Path to the results folder") - parser.add_argument("--output-len", type=int, required=True, help="Target output length") - parser.add_argument("--qps", type=int, required=True, help="Target QPS") - parser.add_argument("--output-file", type=str, default="chunk_vs_disagg.yaml") - - args = parser.parse_args() - main(args) - \ No newline at end of file diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index 715fe56d6c59..734679660c23 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -43,7 +43,7 @@ launch_chunked_prefill() { --model $model \ --port 8100 \ -tp 4 \ - --max-model-len 30000 \ + --max-model-len 10000 \ --disable-log-stats \ --disable-log-requests \ --enable-chunked-prefill \ @@ -53,7 +53,7 @@ launch_chunked_prefill() { --model $model \ --port 8200 \ -tp 4 \ - --max-model-len 30000 \ + --max-model-len 10000 \ --disable-log-stats \ --disable-log-requests \ --enable-chunked-prefill \ @@ -73,7 +73,7 @@ launch_disagg_prefill() { --model $model \ --port 8100 \ -tp 4 \ - --max-model-len 30000 \ + --max-model-len 10000 \ --disable-log-stats \ --disable-log-requests \ --gpu-memory-utilization 0.8 & @@ -82,7 +82,7 @@ launch_disagg_prefill() { --model $model \ --port 8200 \ -tp 4 \ - --max-model-len 30000 \ + --max-model-len 10000 \ 
--disable-log-stats \ --disable-log-requests \ --gpu-memory-utilization 0.8 & @@ -98,10 +98,10 @@ benchmark() { model="meta-llama/Meta-Llama-3.1-70B-Instruct" dataset_name="sonnet" dataset_path="../sonnet_4x.txt" - num_prompts=400 + num_prompts=200 qps=$1 prefix_len=50 - input_len=2048 + input_len=1024 output_len=$2 tag=$3 @@ -131,7 +131,7 @@ main() { (which jq) || (apt-get -y install jq) (which socat) || (apt-get -y install socat) - pip install quart httpx + pip install quart httpx matplotlib aiohttp cd "$(dirname "$0")" @@ -147,7 +147,6 @@ main() { rm -rf results mkdir results - default_qps=10 default_output_len=10 export VLLM_LOGGING_LEVEL=DEBUG @@ -165,6 +164,8 @@ main() { done kill_gpu_processes + python3 visualize_benchmark_results.py + } From cb7ff06e1a9c230ee48479833df2da21ec96a7b9 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 22:18:40 +0000 Subject: [PATCH 201/278] revert changes in model_runner.py --- no change needed for disagg prefill --- vllm/worker/model_runner.py | 39 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 8c4899a8b7f5..447d303a57fd 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -14,7 +14,6 @@ import torch.distributed import torch.nn as nn - import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState @@ -1545,30 +1544,21 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) - + 
input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalInputs.as_kwargs(multi_modal_kwargs, + device=self.device), + **seqlen_agnostic_kwargs) + if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_end.record() - return hidden_or_intermediate_states - - @torch.inference_mode() - def postprocess_model( - self, - model_input, - hidden_or_intermediate_states, - - ): + # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: if (self.is_driver_worker and hidden_or_intermediate_states is not None @@ -1586,7 +1576,7 @@ def postprocess_model( hidden_or_intermediate_states.tensors["model_forward_time"] = ( torch.tensor(model_forward_time + orig_model_forward_time)) return hidden_or_intermediate_states - + logits = self.model.compute_logits(hidden_or_intermediate_states, model_input.sampling_metadata) @@ -1618,7 +1608,6 @@ def postprocess_model( output.model_forward_time = (orig_model_forward_time + model_forward_time) - decode_meta = model_input.attn_metadata.decode_metadata if self.return_hidden_states: # we only need to pass hidden states of most recent token assert model_input.sampling_metadata is not None @@ -1635,9 +1624,7 @@ def postprocess_model( output.hidden_states = hidden_states return [output] - - - + class CUDAGraphRunner: @@ -1808,4 +1795,4 @@ def _get_max_graph_batch_size(max_num_seqs: int) -> int: if padded_size in _BATCH_SIZES_TO_CAPTURE: return padded_size assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1] - return _BATCH_SIZES_TO_CAPTURE[-1] + return _BATCH_SIZES_TO_CAPTURE[-1] \ No newline at end of file From dd8c86d3e171ee045014ad65c6431b8493c25f86 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 22:21:02 +0000 Subject: [PATCH 202/278] no I was wrong --- vllm/worker/model_runner.py | 56 
++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 447d303a57fd..ab38302b3321 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -14,6 +14,7 @@ import torch.distributed import torch.nn as nn + import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState @@ -54,6 +55,7 @@ _add_sampling_metadata_broadcastable_dict, _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict, dump_input_when_exception) +from vllm import _custom_ops as ops if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionBackend @@ -1544,21 +1546,30 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) + hidden_or_intermediate_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalInputs.as_kwargs(multi_modal_kwargs, + device=self.device), + **seqlen_agnostic_kwargs) + if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_end.record() - # Compute the logits in the last pipeline stage. 
+ return hidden_or_intermediate_states + + @torch.inference_mode() + def postprocess_model( + self, + model_input, + hidden_or_intermediate_states, + + ): if not get_pp_group().is_last_rank: if (self.is_driver_worker and hidden_or_intermediate_states is not None @@ -1576,7 +1587,7 @@ def execute_model( hidden_or_intermediate_states.tensors["model_forward_time"] = ( torch.tensor(model_forward_time + orig_model_forward_time)) return hidden_or_intermediate_states - + logits = self.model.compute_logits(hidden_or_intermediate_states, model_input.sampling_metadata) @@ -1591,23 +1602,8 @@ def execute_model( logits=logits, sampling_metadata=model_input.sampling_metadata, ) - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time - and output is not None): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - # If there are multiple workers, we are still tracking the latency - # from the start time of the driver worker to the end time of the - # driver worker. The model forward time will then end up covering - # the communication time as well. 
- output.model_forward_time = (orig_model_forward_time + - model_forward_time) + decode_meta = model_input.attn_metadata.decode_metadata if self.return_hidden_states: # we only need to pass hidden states of most recent token assert model_input.sampling_metadata is not None @@ -1624,7 +1620,9 @@ def execute_model( output.hidden_states = hidden_states return [output] - + + + class CUDAGraphRunner: @@ -1795,4 +1793,4 @@ def _get_max_graph_batch_size(max_num_seqs: int) -> int: if padded_size in _BATCH_SIZES_TO_CAPTURE: return padded_size assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1] - return _BATCH_SIZES_TO_CAPTURE[-1] \ No newline at end of file + return _BATCH_SIZES_TO_CAPTURE[-1] From 4e8043c34c77e5155091c1cb95daefdd91238647 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 22:23:20 +0000 Subject: [PATCH 203/278] update benchmark --- benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index 734679660c23..1da5669dd1cd 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -147,7 +147,7 @@ main() { rm -rf results mkdir results - default_output_len=10 + default_output_len=6 export VLLM_LOGGING_LEVEL=DEBUG export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') From b51f8913b3e595a272590c22fb393f89f33e8d4a Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 22:23:48 +0000 Subject: [PATCH 204/278] remove sonnet 4x --- it can be automatically generated via benchmarking script --- benchmarks/sonnet_4x.txt | 2070 -------------------------------------- 1 file changed, 2070 deletions(-) delete mode 100644 benchmarks/sonnet_4x.txt diff --git a/benchmarks/sonnet_4x.txt b/benchmarks/sonnet_4x.txt deleted file mode 100644 index 02f39a9fb14f..000000000000 --- 
a/benchmarks/sonnet_4x.txt +++ /dev/null @@ -1,2070 +0,0 @@ - -FROM fairest creatures we desire increase, -That thereby beauty's rose might never die, -But as the riper should by time decease, -His tender heir might bear his memory: -But thou, contracted to thine own bright eyes, -Feed'st thy light'st flame with self-substantial fuel, -Making a famine where abundance lies, -Thyself thy foe, to thy sweet self too cruel. -Thou that art now the world's fresh ornament -And only herald to the gaudy spring, -Within thine own bud buriest thy content -And, tender churl, makest waste in niggarding. -Pity the world, or else this glutton be, -To eat the world's due, by the grave and thee. -When forty winters shall beseige thy brow, -And dig deep trenches in thy beauty's field, -Thy youth's proud livery, so gazed on now, -Will be a tatter'd weed, of small worth held: -Then being ask'd where all thy beauty lies, -Where all the treasure of thy lusty days, -To say, within thine own deep-sunken eyes, -Were an all-eating shame and thriftless praise. -How much more praise deserved thy beauty's use, -If thou couldst answer 'This fair child of mine -Shall sum my count and make my old excuse,' -Proving his beauty by succession thine! -This were to be new made when thou art old, -And see thy blood warm when thou feel'st it cold. -Look in thy glass, and tell the face thou viewest -Now is the time that face should form another; -Whose fresh repair if now thou not renewest, -Thou dost beguile the world, unbless some mother. -For where is she so fair whose unear'd womb -Disdains the tillage of thy husbandry? -Or who is he so fond will be the tomb -Of his self-love, to stop posterity? -Thou art thy mother's glass, and she in thee -Calls back the lovely April of her prime: -So thou through windows of thine age shall see -Despite of wrinkles this thy golden time. -But if thou live, remember'd not to be, -Die single, and thine image dies with thee. 
-Unthrifty loveliness, why dost thou spend -Upon thyself thy beauty's legacy? -Nature's bequest gives nothing but doth lend, -And being frank she lends to those are free. -Then, beauteous niggard, why dost thou abuse -The bounteous largess given thee to give? -Profitless usurer, why dost thou use -So great a sum of sums, yet canst not live? -For having traffic with thyself alone, -Thou of thyself thy sweet self dost deceive. -Then how, when nature calls thee to be gone, -What acceptable audit canst thou leave? -Thy unused beauty must be tomb'd with thee, -Which, used, lives th' executor to be. -Those hours, that with gentle work did frame -The lovely gaze where every eye doth dwell, -Will play the tyrants to the very same -And that unfair which fairly doth excel: -For never-resting time leads summer on -To hideous winter and confounds him there; -Sap cheque'd with frost and lusty leaves quite gone, -Beauty o'ersnow'd and bareness every where: -Then, were not summer's distillation left, -A liquid prisoner pent in walls of glass, -Beauty's effect with beauty were bereft, -Nor it nor no remembrance what it was: -But flowers distill'd though they with winter meet, -Leese but their show; their substance still lives sweet. -Then let not winter's ragged hand deface -In thee thy summer, ere thou be distill'd: -Make sweet some vial; treasure thou some place -With beauty's treasure, ere it be self-kill'd. -That use is not forbidden usury, -Which happies those that pay the willing loan; -That's for thyself to breed another thee, -Or ten times happier, be it ten for one; -Ten times thyself were happier than thou art, -If ten of thine ten times refigured thee: -Then what could death do, if thou shouldst depart, -Leaving thee living in posterity? -Be not self-will'd, for thou art much too fair -To be death's conquest and make worms thine heir. -Lo! 
in the orient when the gracious light -Lifts up his burning head, each under eye -Doth homage to his new-appearing sight, -Serving with looks his sacred majesty; -And having climb'd the steep-up heavenly hill, -Resembling strong youth in his middle age, -yet mortal looks adore his beauty still, -Attending on his golden pilgrimage; -But when from highmost pitch, with weary car, -Like feeble age, he reeleth from the day, -The eyes, 'fore duteous, now converted are -From his low tract and look another way: -So thou, thyself out-going in thy noon, -Unlook'd on diest, unless thou get a son. -Music to hear, why hear'st thou music sadly? -Sweets with sweets war not, joy delights in joy. -Why lovest thou that which thou receivest not gladly, -Or else receivest with pleasure thine annoy? -If the true concord of well-tuned sounds, -By unions married, do offend thine ear, -They do but sweetly chide thee, who confounds -In singleness the parts that thou shouldst bear. -Mark how one string, sweet husband to another, -Strikes each in each by mutual ordering, -Resembling sire and child and happy mother -Who all in one, one pleasing note do sing: -Whose speechless song, being many, seeming one, -Sings this to thee: 'thou single wilt prove none.' -Is it for fear to wet a widow's eye -That thou consumest thyself in single life? -Ah! if thou issueless shalt hap to die. -The world will wail thee, like a makeless wife; -The world will be thy widow and still weep -That thou no form of thee hast left behind, -When every private widow well may keep -By children's eyes her husband's shape in mind. -Look, what an unthrift in the world doth spend -Shifts but his place, for still the world enjoys it; -But beauty's waste hath in the world an end, -And kept unused, the user so destroys it. -No love toward others in that bosom sits -That on himself such murderous shame commits. -For shame! deny that thou bear'st love to any, -Who for thyself art so unprovident. 
-Grant, if thou wilt, thou art beloved of many, -But that thou none lovest is most evident; -For thou art so possess'd with murderous hate -That 'gainst thyself thou stick'st not to conspire. -Seeking that beauteous roof to ruinate -Which to repair should be thy chief desire. -O, change thy thought, that I may change my mind! -Shall hate be fairer lodged than gentle love? -Be, as thy presence is, gracious and kind, -Or to thyself at least kind-hearted prove: -Make thee another self, for love of me, -That beauty still may live in thine or thee. -As fast as thou shalt wane, so fast thou growest -In one of thine, from that which thou departest; -And that fresh blood which youngly thou bestowest -Thou mayst call thine when thou from youth convertest. -Herein lives wisdom, beauty and increase: -Without this, folly, age and cold decay: -If all were minded so, the times should cease -And threescore year would make the world away. -Let those whom Nature hath not made for store, -Harsh featureless and rude, barrenly perish: -Look, whom she best endow'd she gave the more; -Which bounteous gift thou shouldst in bounty cherish: -She carved thee for her seal, and meant thereby -Thou shouldst print more, not let that copy die. -When I do count the clock that tells the time, -And see the brave day sunk in hideous night; -When I behold the violet past prime, -And sable curls all silver'd o'er with white; -When lofty trees I see barren of leaves -Which erst from heat did canopy the herd, -And summer's green all girded up in sheaves -Borne on the bier with white and bristly beard, -Then of thy beauty do I question make, -That thou among the wastes of time must go, -Since sweets and beauties do themselves forsake -And die as fast as they see others grow; -And nothing 'gainst Time's scythe can make defence -Save breed, to brave him when he takes thee hence. -O, that you were yourself! 
but, love, you are -No longer yours than you yourself here live: -Against this coming end you should prepare, -And your sweet semblance to some other give. -So should that beauty which you hold in lease -Find no determination: then you were -Yourself again after yourself's decease, -When your sweet issue your sweet form should bear. -Who lets so fair a house fall to decay, -Which husbandry in honour might uphold -Against the stormy gusts of winter's day -And barren rage of death's eternal cold? -O, none but unthrifts! Dear my love, you know -You had a father: let your son say so. -Not from the stars do I my judgment pluck; -And yet methinks I have astronomy, -But not to tell of good or evil luck, -Of plagues, of dearths, or seasons' quality; -Nor can I fortune to brief minutes tell, -Pointing to each his thunder, rain and wind, -Or say with princes if it shall go well, -By oft predict that I in heaven find: -But from thine eyes my knowledge I derive, -And, constant stars, in them I read such art -As truth and beauty shall together thrive, -If from thyself to store thou wouldst convert; -Or else of thee this I prognosticate: -Thy end is truth's and beauty's doom and date. -When I consider every thing that grows -Holds in perfection but a little moment, -That this huge stage presenteth nought but shows -Whereon the stars in secret influence comment; -When I perceive that men as plants increase, -Cheered and cheque'd even by the self-same sky, -Vaunt in their youthful sap, at height decrease, -And wear their brave state out of memory; -Then the conceit of this inconstant stay -Sets you most rich in youth before my sight, -Where wasteful Time debateth with Decay, -To change your day of youth to sullied night; -And all in war with Time for love of you, -As he takes from you, I engraft you new. -But wherefore do not you a mightier way -Make war upon this bloody tyrant, Time? -And fortify yourself in your decay -With means more blessed than my barren rhyme? 
-Now stand you on the top of happy hours, -And many maiden gardens yet unset -With virtuous wish would bear your living flowers, -Much liker than your painted counterfeit: -So should the lines of life that life repair, -Which this, Time's pencil, or my pupil pen, -Neither in inward worth nor outward fair, -Can make you live yourself in eyes of men. -To give away yourself keeps yourself still, -And you must live, drawn by your own sweet skill. -Who will believe my verse in time to come, -If it were fill'd with your most high deserts? -Though yet, heaven knows, it is but as a tomb -Which hides your life and shows not half your parts. -If I could write the beauty of your eyes -And in fresh numbers number all your graces, -The age to come would say 'This poet lies: -Such heavenly touches ne'er touch'd earthly faces.' -So should my papers yellow'd with their age -Be scorn'd like old men of less truth than tongue, -And your true rights be term'd a poet's rage -And stretched metre of an antique song: -But were some child of yours alive that time, -You should live twice; in it and in my rhyme. -Shall I compare thee to a summer's day? -Thou art more lovely and more temperate: -Rough winds do shake the darling buds of May, -And summer's lease hath all too short a date: -Sometime too hot the eye of heaven shines, -And often is his gold complexion dimm'd; -And every fair from fair sometime declines, -By chance or nature's changing course untrimm'd; -But thy eternal summer shall not fade -Nor lose possession of that fair thou owest; -Nor shall Death brag thou wander'st in his shade, -When in eternal lines to time thou growest: -So long as men can breathe or eyes can see, -So long lives this and this gives life to thee. 
-Devouring Time, blunt thou the lion's paws, -And make the earth devour her own sweet brood; -Pluck the keen teeth from the fierce tiger's jaws, -And burn the long-lived phoenix in her blood; -Make glad and sorry seasons as thou fleets, -And do whate'er thou wilt, swift-footed Time, -To the wide world and all her fading sweets; -But I forbid thee one most heinous crime: -O, carve not with thy hours my love's fair brow, -Nor draw no lines there with thine antique pen; -Him in thy course untainted do allow -For beauty's pattern to succeeding men. -Yet, do thy worst, old Time: despite thy wrong, -My love shall in my verse ever live young. -A woman's face with Nature's own hand painted -Hast thou, the master-mistress of my passion; -A woman's gentle heart, but not acquainted -With shifting change, as is false women's fashion; -An eye more bright than theirs, less false in rolling, -Gilding the object whereupon it gazeth; -A man in hue, all 'hues' in his controlling, -Much steals men's eyes and women's souls amazeth. -And for a woman wert thou first created; -Till Nature, as she wrought thee, fell a-doting, -And by addition me of thee defeated, -By adding one thing to my purpose nothing. -But since she prick'd thee out for women's pleasure, -Mine be thy love and thy love's use their treasure. -So is it not with me as with that Muse -Stirr'd by a painted beauty to his verse, -Who heaven itself for ornament doth use -And every fair with his fair doth rehearse -Making a couplement of proud compare, -With sun and moon, with earth and sea's rich gems, -With April's first-born flowers, and all things rare -That heaven's air in this huge rondure hems. -O' let me, true in love, but truly write, -And then believe me, my love is as fair -As any mother's child, though not so bright -As those gold candles fix'd in heaven's air: -Let them say more than like of hearsay well; -I will not praise that purpose not to sell. 
-My glass shall not persuade me I am old, -So long as youth and thou are of one date; -But when in thee time's furrows I behold, -Then look I death my days should expiate. -For all that beauty that doth cover thee -Is but the seemly raiment of my heart, -Which in thy breast doth live, as thine in me: -How can I then be elder than thou art? -O, therefore, love, be of thyself so wary -As I, not for myself, but for thee will; -Bearing thy heart, which I will keep so chary -As tender nurse her babe from faring ill. -Presume not on thy heart when mine is slain; -Thou gavest me thine, not to give back again. -As an unperfect actor on the stage -Who with his fear is put besides his part, -Or some fierce thing replete with too much rage, -Whose strength's abundance weakens his own heart. -So I, for fear of trust, forget to say -The perfect ceremony of love's rite, -And in mine own love's strength seem to decay, -O'ercharged with burden of mine own love's might. -O, let my books be then the eloquence -And dumb presagers of my speaking breast, -Who plead for love and look for recompense -More than that tongue that more hath more express'd. -O, learn to read what silent love hath writ: -To hear with eyes belongs to love's fine wit. -Mine eye hath play'd the painter and hath stell'd -Thy beauty's form in table of my heart; -My body is the frame wherein 'tis held, -And perspective it is the painter's art. -For through the painter must you see his skill, -To find where your true image pictured lies; -Which in my bosom's shop is hanging still, -That hath his windows glazed with thine eyes. -Now see what good turns eyes for eyes have done: -Mine eyes have drawn thy shape, and thine for me -Are windows to my breast, where-through the sun -Delights to peep, to gaze therein on thee; -Yet eyes this cunning want to grace their art; -They draw but what they see, know not the heart. 
-Let those who are in favour with their stars -Of public honour and proud titles boast, -Whilst I, whom fortune of such triumph bars, -Unlook'd for joy in that I honour most. -Great princes' favourites their fair leaves spread -But as the marigold at the sun's eye, -And in themselves their pride lies buried, -For at a frown they in their glory die. -The painful warrior famoused for fight, -After a thousand victories once foil'd, -Is from the book of honour razed quite, -And all the rest forgot for which he toil'd: -Then happy I, that love and am beloved -Where I may not remove nor be removed. -Lord of my love, to whom in vassalage -Thy merit hath my duty strongly knit, -To thee I send this written embassage, -To witness duty, not to show my wit: -Duty so great, which wit so poor as mine -May make seem bare, in wanting words to show it, -But that I hope some good conceit of thine -In thy soul's thought, all naked, will bestow it; -Till whatsoever star that guides my moving -Points on me graciously with fair aspect -And puts apparel on my tatter'd loving, -To show me worthy of thy sweet respect: -Then may I dare to boast how I do love thee; -Till then not show my head where thou mayst prove me. -Weary with toil, I haste me to my bed, -The dear repose for limbs with travel tired; -But then begins a journey in my head, -To work my mind, when body's work's expired: -For then my thoughts, from far where I abide, -Intend a zealous pilgrimage to thee, -And keep my drooping eyelids open wide, -Looking on darkness which the blind do see -Save that my soul's imaginary sight -Presents thy shadow to my sightless view, -Which, like a jewel hung in ghastly night, -Makes black night beauteous and her old face new. -Lo! thus, by day my limbs, by night my mind, -For thee and for myself no quiet find. -How can I then return in happy plight, -That am debarr'd the benefit of rest? -When day's oppression is not eased by night, -But day by night, and night by day, oppress'd? 
-And each, though enemies to either's reign, -Do in consent shake hands to torture me; -The one by toil, the other to complain -How far I toil, still farther off from thee. -I tell the day, to please them thou art bright -And dost him grace when clouds do blot the heaven: -So flatter I the swart-complexion'd night, -When sparkling stars twire not thou gild'st the even. -But day doth daily draw my sorrows longer -And night doth nightly make grief's strength seem stronger. -When, in disgrace with fortune and men's eyes, -I all alone beweep my outcast state -And trouble deal heaven with my bootless cries -And look upon myself and curse my fate, -Wishing me like to one more rich in hope, -Featured like him, like him with friends possess'd, -Desiring this man's art and that man's scope, -With what I most enjoy contented least; -Yet in these thoughts myself almost despising, -Haply I think on thee, and then my state, -Like to the lark at break of day arising -From sullen earth, sings hymns at heaven's gate; -For thy sweet love remember'd such wealth brings -That then I scorn to change my state with kings. -When to the sessions of sweet silent thought -I summon up remembrance of things past, -I sigh the lack of many a thing I sought, -And with old woes new wail my dear time's waste: -Then can I drown an eye, unused to flow, -For precious friends hid in death's dateless night, -And weep afresh love's long since cancell'd woe, -And moan the expense of many a vanish'd sight: -Then can I grieve at grievances foregone, -And heavily from woe to woe tell o'er -The sad account of fore-bemoaned moan, -Which I new pay as if not paid before. -But if the while I think on thee, dear friend, -All losses are restored and sorrows end. -Thy bosom is endeared with all hearts, -Which I by lacking have supposed dead, -And there reigns love and all love's loving parts, -And all those friends which I thought buried. 
-How many a holy and obsequious tear -Hath dear religious love stol'n from mine eye -As interest of the dead, which now appear -But things removed that hidden in thee lie! -Thou art the grave where buried love doth live, -Hung with the trophies of my lovers gone, -Who all their parts of me to thee did give; -That due of many now is thine alone: -Their images I loved I view in thee, -And thou, all they, hast all the all of me. -If thou survive my well-contented day, -When that churl Death my bones with dust shall cover, -And shalt by fortune once more re-survey -These poor rude lines of thy deceased lover, -Compare them with the bettering of the time, -And though they be outstripp'd by every pen, -Reserve them for my love, not for their rhyme, -Exceeded by the height of happier men. -O, then vouchsafe me but this loving thought: -'Had my friend's Muse grown with this growing age, -A dearer birth than this his love had brought, -To march in ranks of better equipage: -But since he died and poets better prove, -Theirs for their style I'll read, his for his love.' -Full many a glorious morning have I seen -Flatter the mountain-tops with sovereign eye, -Kissing with golden face the meadows green, -Gilding pale streams with heavenly alchemy; -Anon permit the basest clouds to ride -With ugly rack on his celestial face, -And from the forlorn world his visage hide, -Stealing unseen to west with this disgrace: -Even so my sun one early morn did shine -With all triumphant splendor on my brow; -But out, alack! he was but one hour mine; -The region cloud hath mask'd him from me now. -Yet him for this my love no whit disdaineth; -Suns of the world may stain when heaven's sun staineth. -Why didst thou promise such a beauteous day, -And make me travel forth without my cloak, -To let base clouds o'ertake me in my way, -Hiding thy bravery in their rotten smoke? 
-'Tis not enough that through the cloud thou break, -To dry the rain on my storm-beaten face, -For no man well of such a salve can speak -That heals the wound and cures not the disgrace: -Nor can thy shame give physic to my grief; -Though thou repent, yet I have still the loss: -The offender's sorrow lends but weak relief -To him that bears the strong offence's cross. -Ah! but those tears are pearl which thy love sheds, -And they are rich and ransom all ill deeds. -No more be grieved at that which thou hast done: -Roses have thorns, and silver fountains mud; -Clouds and eclipses stain both moon and sun, -And loathsome canker lives in sweetest bud. -All men make faults, and even I in this, -Authorizing thy trespass with compare, -Myself corrupting, salving thy amiss, -Excusing thy sins more than thy sins are; -For to thy sensual fault I bring in sense-- -Thy adverse party is thy advocate-- -And 'gainst myself a lawful plea commence: -Such civil war is in my love and hate -That I an accessary needs must be -To that sweet thief which sourly robs from me. -Let me confess that we two must be twain, -Although our undivided loves are one: -So shall those blots that do with me remain -Without thy help by me be borne alone. -In our two loves there is but one respect, -Though in our lives a separable spite, -Which though it alter not love's sole effect, -Yet doth it steal sweet hours from love's delight. -I may not evermore acknowledge thee, -Lest my bewailed guilt should do thee shame, -Nor thou with public kindness honour me, -Unless thou take that honour from thy name: -But do not so; I love thee in such sort -As, thou being mine, mine is thy good report. -As a decrepit father takes delight -To see his active child do deeds of youth, -So I, made lame by fortune's dearest spite, -Take all my comfort of thy worth and truth. 
-For whether beauty, birth, or wealth, or wit, -Or any of these all, or all, or more, -Entitled in thy parts do crowned sit, -I make my love engrafted to this store: -So then I am not lame, poor, nor despised, -Whilst that this shadow doth such substance give -That I in thy abundance am sufficed -And by a part of all thy glory live. -Look, what is best, that best I wish in thee: -This wish I have; then ten times happy me!FROM fairest creatures we desire increase, -That thereby beauty's rose might never die, -But as the riper should by time decease, -His tender heir might bear his memory: -But thou, contracted to thine own bright eyes, -Feed'st thy light'st flame with self-substantial fuel, -Making a famine where abundance lies, -Thyself thy foe, to thy sweet self too cruel. -Thou that art now the world's fresh ornament -And only herald to the gaudy spring, -Within thine own bud buriest thy content -And, tender churl, makest waste in niggarding. -Pity the world, or else this glutton be, -To eat the world's due, by the grave and thee. -When forty winters shall beseige thy brow, -And dig deep trenches in thy beauty's field, -Thy youth's proud livery, so gazed on now, -Will be a tatter'd weed, of small worth held: -Then being ask'd where all thy beauty lies, -Where all the treasure of thy lusty days, -To say, within thine own deep-sunken eyes, -Were an all-eating shame and thriftless praise. -How much more praise deserved thy beauty's use, -If thou couldst answer 'This fair child of mine -Shall sum my count and make my old excuse,' -Proving his beauty by succession thine! -This were to be new made when thou art old, -And see thy blood warm when thou feel'st it cold. -Look in thy glass, and tell the face thou viewest -Now is the time that face should form another; -Whose fresh repair if now thou not renewest, -Thou dost beguile the world, unbless some mother. -For where is she so fair whose unear'd womb -Disdains the tillage of thy husbandry? 
-Or who is he so fond will be the tomb -Of his self-love, to stop posterity? -Thou art thy mother's glass, and she in thee -Calls back the lovely April of her prime: -So thou through windows of thine age shall see -Despite of wrinkles this thy golden time. -But if thou live, remember'd not to be, -Die single, and thine image dies with thee. -Unthrifty loveliness, why dost thou spend -Upon thyself thy beauty's legacy? -Nature's bequest gives nothing but doth lend, -And being frank she lends to those are free. -Then, beauteous niggard, why dost thou abuse -The bounteous largess given thee to give? -Profitless usurer, why dost thou use -So great a sum of sums, yet canst not live? -For having traffic with thyself alone, -Thou of thyself thy sweet self dost deceive. -Then how, when nature calls thee to be gone, -What acceptable audit canst thou leave? -Thy unused beauty must be tomb'd with thee, -Which, used, lives th' executor to be. -Those hours, that with gentle work did frame -The lovely gaze where every eye doth dwell, -Will play the tyrants to the very same -And that unfair which fairly doth excel: -For never-resting time leads summer on -To hideous winter and confounds him there; -Sap cheque'd with frost and lusty leaves quite gone, -Beauty o'ersnow'd and bareness every where: -Then, were not summer's distillation left, -A liquid prisoner pent in walls of glass, -Beauty's effect with beauty were bereft, -Nor it nor no remembrance what it was: -But flowers distill'd though they with winter meet, -Leese but their show; their substance still lives sweet. -Then let not winter's ragged hand deface -In thee thy summer, ere thou be distill'd: -Make sweet some vial; treasure thou some place -With beauty's treasure, ere it be self-kill'd. 
-That use is not forbidden usury, -Which happies those that pay the willing loan; -That's for thyself to breed another thee, -Or ten times happier, be it ten for one; -Ten times thyself were happier than thou art, -If ten of thine ten times refigured thee: -Then what could death do, if thou shouldst depart, -Leaving thee living in posterity? -Be not self-will'd, for thou art much too fair -To be death's conquest and make worms thine heir. -Lo! in the orient when the gracious light -Lifts up his burning head, each under eye -Doth homage to his new-appearing sight, -Serving with looks his sacred majesty; -And having climb'd the steep-up heavenly hill, -Resembling strong youth in his middle age, -yet mortal looks adore his beauty still, -Attending on his golden pilgrimage; -But when from highmost pitch, with weary car, -Like feeble age, he reeleth from the day, -The eyes, 'fore duteous, now converted are -From his low tract and look another way: -So thou, thyself out-going in thy noon, -Unlook'd on diest, unless thou get a son. -Music to hear, why hear'st thou music sadly? -Sweets with sweets war not, joy delights in joy. -Why lovest thou that which thou receivest not gladly, -Or else receivest with pleasure thine annoy? -If the true concord of well-tuned sounds, -By unions married, do offend thine ear, -They do but sweetly chide thee, who confounds -In singleness the parts that thou shouldst bear. -Mark how one string, sweet husband to another, -Strikes each in each by mutual ordering, -Resembling sire and child and happy mother -Who all in one, one pleasing note do sing: -Whose speechless song, being many, seeming one, -Sings this to thee: 'thou single wilt prove none.' -Is it for fear to wet a widow's eye -That thou consumest thyself in single life? -Ah! if thou issueless shalt hap to die. 
-The world will wail thee, like a makeless wife; -The world will be thy widow and still weep -That thou no form of thee hast left behind, -When every private widow well may keep -By children's eyes her husband's shape in mind. -Look, what an unthrift in the world doth spend -Shifts but his place, for still the world enjoys it; -But beauty's waste hath in the world an end, -And kept unused, the user so destroys it. -No love toward others in that bosom sits -That on himself such murderous shame commits. -For shame! deny that thou bear'st love to any, -Who for thyself art so unprovident. -Grant, if thou wilt, thou art beloved of many, -But that thou none lovest is most evident; -For thou art so possess'd with murderous hate -That 'gainst thyself thou stick'st not to conspire. -Seeking that beauteous roof to ruinate -Which to repair should be thy chief desire. -O, change thy thought, that I may change my mind! -Shall hate be fairer lodged than gentle love? -Be, as thy presence is, gracious and kind, -Or to thyself at least kind-hearted prove: -Make thee another self, for love of me, -That beauty still may live in thine or thee. -As fast as thou shalt wane, so fast thou growest -In one of thine, from that which thou departest; -And that fresh blood which youngly thou bestowest -Thou mayst call thine when thou from youth convertest. -Herein lives wisdom, beauty and increase: -Without this, folly, age and cold decay: -If all were minded so, the times should cease -And threescore year would make the world away. -Let those whom Nature hath not made for store, -Harsh featureless and rude, barrenly perish: -Look, whom she best endow'd she gave the more; -Which bounteous gift thou shouldst in bounty cherish: -She carved thee for her seal, and meant thereby -Thou shouldst print more, not let that copy die. 
-When I do count the clock that tells the time, -And see the brave day sunk in hideous night; -When I behold the violet past prime, -And sable curls all silver'd o'er with white; -When lofty trees I see barren of leaves -Which erst from heat did canopy the herd, -And summer's green all girded up in sheaves -Borne on the bier with white and bristly beard, -Then of thy beauty do I question make, -That thou among the wastes of time must go, -Since sweets and beauties do themselves forsake -And die as fast as they see others grow; -And nothing 'gainst Time's scythe can make defence -Save breed, to brave him when he takes thee hence. -O, that you were yourself! but, love, you are -No longer yours than you yourself here live: -Against this coming end you should prepare, -And your sweet semblance to some other give. -So should that beauty which you hold in lease -Find no determination: then you were -Yourself again after yourself's decease, -When your sweet issue your sweet form should bear. -Who lets so fair a house fall to decay, -Which husbandry in honour might uphold -Against the stormy gusts of winter's day -And barren rage of death's eternal cold? -O, none but unthrifts! Dear my love, you know -You had a father: let your son say so. -Not from the stars do I my judgment pluck; -And yet methinks I have astronomy, -But not to tell of good or evil luck, -Of plagues, of dearths, or seasons' quality; -Nor can I fortune to brief minutes tell, -Pointing to each his thunder, rain and wind, -Or say with princes if it shall go well, -By oft predict that I in heaven find: -But from thine eyes my knowledge I derive, -And, constant stars, in them I read such art -As truth and beauty shall together thrive, -If from thyself to store thou wouldst convert; -Or else of thee this I prognosticate: -Thy end is truth's and beauty's doom and date. 
-When I consider every thing that grows -Holds in perfection but a little moment, -That this huge stage presenteth nought but shows -Whereon the stars in secret influence comment; -When I perceive that men as plants increase, -Cheered and cheque'd even by the self-same sky, -Vaunt in their youthful sap, at height decrease, -And wear their brave state out of memory; -Then the conceit of this inconstant stay -Sets you most rich in youth before my sight, -Where wasteful Time debateth with Decay, -To change your day of youth to sullied night; -And all in war with Time for love of you, -As he takes from you, I engraft you new. -But wherefore do not you a mightier way -Make war upon this bloody tyrant, Time? -And fortify yourself in your decay -With means more blessed than my barren rhyme? -Now stand you on the top of happy hours, -And many maiden gardens yet unset -With virtuous wish would bear your living flowers, -Much liker than your painted counterfeit: -So should the lines of life that life repair, -Which this, Time's pencil, or my pupil pen, -Neither in inward worth nor outward fair, -Can make you live yourself in eyes of men. -To give away yourself keeps yourself still, -And you must live, drawn by your own sweet skill. -Who will believe my verse in time to come, -If it were fill'd with your most high deserts? -Though yet, heaven knows, it is but as a tomb -Which hides your life and shows not half your parts. -If I could write the beauty of your eyes -And in fresh numbers number all your graces, -The age to come would say 'This poet lies: -Such heavenly touches ne'er touch'd earthly faces.' -So should my papers yellow'd with their age -Be scorn'd like old men of less truth than tongue, -And your true rights be term'd a poet's rage -And stretched metre of an antique song: -But were some child of yours alive that time, -You should live twice; in it and in my rhyme. -Shall I compare thee to a summer's day? 
-Thou art more lovely and more temperate: -Rough winds do shake the darling buds of May, -And summer's lease hath all too short a date: -Sometime too hot the eye of heaven shines, -And often is his gold complexion dimm'd; -And every fair from fair sometime declines, -By chance or nature's changing course untrimm'd; -But thy eternal summer shall not fade -Nor lose possession of that fair thou owest; -Nor shall Death brag thou wander'st in his shade, -When in eternal lines to time thou growest: -So long as men can breathe or eyes can see, -So long lives this and this gives life to thee. -Devouring Time, blunt thou the lion's paws, -And make the earth devour her own sweet brood; -Pluck the keen teeth from the fierce tiger's jaws, -And burn the long-lived phoenix in her blood; -Make glad and sorry seasons as thou fleets, -And do whate'er thou wilt, swift-footed Time, -To the wide world and all her fading sweets; -But I forbid thee one most heinous crime: -O, carve not with thy hours my love's fair brow, -Nor draw no lines there with thine antique pen; -Him in thy course untainted do allow -For beauty's pattern to succeeding men. -Yet, do thy worst, old Time: despite thy wrong, -My love shall in my verse ever live young. -A woman's face with Nature's own hand painted -Hast thou, the master-mistress of my passion; -A woman's gentle heart, but not acquainted -With shifting change, as is false women's fashion; -An eye more bright than theirs, less false in rolling, -Gilding the object whereupon it gazeth; -A man in hue, all 'hues' in his controlling, -Much steals men's eyes and women's souls amazeth. -And for a woman wert thou first created; -Till Nature, as she wrought thee, fell a-doting, -And by addition me of thee defeated, -By adding one thing to my purpose nothing. -But since she prick'd thee out for women's pleasure, -Mine be thy love and thy love's use their treasure. 
-So is it not with me as with that Muse -Stirr'd by a painted beauty to his verse, -Who heaven itself for ornament doth use -And every fair with his fair doth rehearse -Making a couplement of proud compare, -With sun and moon, with earth and sea's rich gems, -With April's first-born flowers, and all things rare -That heaven's air in this huge rondure hems. -O' let me, true in love, but truly write, -And then believe me, my love is as fair -As any mother's child, though not so bright -As those gold candles fix'd in heaven's air: -Let them say more than like of hearsay well; -I will not praise that purpose not to sell. -My glass shall not persuade me I am old, -So long as youth and thou are of one date; -But when in thee time's furrows I behold, -Then look I death my days should expiate. -For all that beauty that doth cover thee -Is but the seemly raiment of my heart, -Which in thy breast doth live, as thine in me: -How can I then be elder than thou art? -O, therefore, love, be of thyself so wary -As I, not for myself, but for thee will; -Bearing thy heart, which I will keep so chary -As tender nurse her babe from faring ill. -Presume not on thy heart when mine is slain; -Thou gavest me thine, not to give back again. -As an unperfect actor on the stage -Who with his fear is put besides his part, -Or some fierce thing replete with too much rage, -Whose strength's abundance weakens his own heart. -So I, for fear of trust, forget to say -The perfect ceremony of love's rite, -And in mine own love's strength seem to decay, -O'ercharged with burden of mine own love's might. -O, let my books be then the eloquence -And dumb presagers of my speaking breast, -Who plead for love and look for recompense -More than that tongue that more hath more express'd. -O, learn to read what silent love hath writ: -To hear with eyes belongs to love's fine wit. 
-Mine eye hath play'd the painter and hath stell'd -Thy beauty's form in table of my heart; -My body is the frame wherein 'tis held, -And perspective it is the painter's art. -For through the painter must you see his skill, -To find where your true image pictured lies; -Which in my bosom's shop is hanging still, -That hath his windows glazed with thine eyes. -Now see what good turns eyes for eyes have done: -Mine eyes have drawn thy shape, and thine for me -Are windows to my breast, where-through the sun -Delights to peep, to gaze therein on thee; -Yet eyes this cunning want to grace their art; -They draw but what they see, know not the heart. -Let those who are in favour with their stars -Of public honour and proud titles boast, -Whilst I, whom fortune of such triumph bars, -Unlook'd for joy in that I honour most. -Great princes' favourites their fair leaves spread -But as the marigold at the sun's eye, -And in themselves their pride lies buried, -For at a frown they in their glory die. -The painful warrior famoused for fight, -After a thousand victories once foil'd, -Is from the book of honour razed quite, -And all the rest forgot for which he toil'd: -Then happy I, that love and am beloved -Where I may not remove nor be removed. -Lord of my love, to whom in vassalage -Thy merit hath my duty strongly knit, -To thee I send this written embassage, -To witness duty, not to show my wit: -Duty so great, which wit so poor as mine -May make seem bare, in wanting words to show it, -But that I hope some good conceit of thine -In thy soul's thought, all naked, will bestow it; -Till whatsoever star that guides my moving -Points on me graciously with fair aspect -And puts apparel on my tatter'd loving, -To show me worthy of thy sweet respect: -Then may I dare to boast how I do love thee; -Till then not show my head where thou mayst prove me. 
-Weary with toil, I haste me to my bed, -The dear repose for limbs with travel tired; -But then begins a journey in my head, -To work my mind, when body's work's expired: -For then my thoughts, from far where I abide, -Intend a zealous pilgrimage to thee, -And keep my drooping eyelids open wide, -Looking on darkness which the blind do see -Save that my soul's imaginary sight -Presents thy shadow to my sightless view, -Which, like a jewel hung in ghastly night, -Makes black night beauteous and her old face new. -Lo! thus, by day my limbs, by night my mind, -For thee and for myself no quiet find. -How can I then return in happy plight, -That am debarr'd the benefit of rest? -When day's oppression is not eased by night, -But day by night, and night by day, oppress'd? -And each, though enemies to either's reign, -Do in consent shake hands to torture me; -The one by toil, the other to complain -How far I toil, still farther off from thee. -I tell the day, to please them thou art bright -And dost him grace when clouds do blot the heaven: -So flatter I the swart-complexion'd night, -When sparkling stars twire not thou gild'st the even. -But day doth daily draw my sorrows longer -And night doth nightly make grief's strength seem stronger. -When, in disgrace with fortune and men's eyes, -I all alone beweep my outcast state -And trouble deal heaven with my bootless cries -And look upon myself and curse my fate, -Wishing me like to one more rich in hope, -Featured like him, like him with friends possess'd, -Desiring this man's art and that man's scope, -With what I most enjoy contented least; -Yet in these thoughts myself almost despising, -Haply I think on thee, and then my state, -Like to the lark at break of day arising -From sullen earth, sings hymns at heaven's gate; -For thy sweet love remember'd such wealth brings -That then I scorn to change my state with kings. 
-When to the sessions of sweet silent thought -I summon up remembrance of things past, -I sigh the lack of many a thing I sought, -And with old woes new wail my dear time's waste: -Then can I drown an eye, unused to flow, -For precious friends hid in death's dateless night, -And weep afresh love's long since cancell'd woe, -And moan the expense of many a vanish'd sight: -Then can I grieve at grievances foregone, -And heavily from woe to woe tell o'er -The sad account of fore-bemoaned moan, -Which I new pay as if not paid before. -But if the while I think on thee, dear friend, -All losses are restored and sorrows end. -Thy bosom is endeared with all hearts, -Which I by lacking have supposed dead, -And there reigns love and all love's loving parts, -And all those friends which I thought buried. -How many a holy and obsequious tear -Hath dear religious love stol'n from mine eye -As interest of the dead, which now appear -But things removed that hidden in thee lie! -Thou art the grave where buried love doth live, -Hung with the trophies of my lovers gone, -Who all their parts of me to thee did give; -That due of many now is thine alone: -Their images I loved I view in thee, -And thou, all they, hast all the all of me. -If thou survive my well-contented day, -When that churl Death my bones with dust shall cover, -And shalt by fortune once more re-survey -These poor rude lines of thy deceased lover, -Compare them with the bettering of the time, -And though they be outstripp'd by every pen, -Reserve them for my love, not for their rhyme, -Exceeded by the height of happier men. -O, then vouchsafe me but this loving thought: -'Had my friend's Muse grown with this growing age, -A dearer birth than this his love had brought, -To march in ranks of better equipage: -But since he died and poets better prove, -Theirs for their style I'll read, his for his love.' 
-Full many a glorious morning have I seen -Flatter the mountain-tops with sovereign eye, -Kissing with golden face the meadows green, -Gilding pale streams with heavenly alchemy; -Anon permit the basest clouds to ride -With ugly rack on his celestial face, -And from the forlorn world his visage hide, -Stealing unseen to west with this disgrace: -Even so my sun one early morn did shine -With all triumphant splendor on my brow; -But out, alack! he was but one hour mine; -The region cloud hath mask'd him from me now. -Yet him for this my love no whit disdaineth; -Suns of the world may stain when heaven's sun staineth. -Why didst thou promise such a beauteous day, -And make me travel forth without my cloak, -To let base clouds o'ertake me in my way, -Hiding thy bravery in their rotten smoke? -'Tis not enough that through the cloud thou break, -To dry the rain on my storm-beaten face, -For no man well of such a salve can speak -That heals the wound and cures not the disgrace: -Nor can thy shame give physic to my grief; -Though thou repent, yet I have still the loss: -The offender's sorrow lends but weak relief -To him that bears the strong offence's cross. -Ah! but those tears are pearl which thy love sheds, -And they are rich and ransom all ill deeds. -No more be grieved at that which thou hast done: -Roses have thorns, and silver fountains mud; -Clouds and eclipses stain both moon and sun, -And loathsome canker lives in sweetest bud. -All men make faults, and even I in this, -Authorizing thy trespass with compare, -Myself corrupting, salving thy amiss, -Excusing thy sins more than thy sins are; -For to thy sensual fault I bring in sense-- -Thy adverse party is thy advocate-- -And 'gainst myself a lawful plea commence: -Such civil war is in my love and hate -That I an accessary needs must be -To that sweet thief which sourly robs from me. 
-Let me confess that we two must be twain, -Although our undivided loves are one: -So shall those blots that do with me remain -Without thy help by me be borne alone. -In our two loves there is but one respect, -Though in our lives a separable spite, -Which though it alter not love's sole effect, -Yet doth it steal sweet hours from love's delight. -I may not evermore acknowledge thee, -Lest my bewailed guilt should do thee shame, -Nor thou with public kindness honour me, -Unless thou take that honour from thy name: -But do not so; I love thee in such sort -As, thou being mine, mine is thy good report. -As a decrepit father takes delight -To see his active child do deeds of youth, -So I, made lame by fortune's dearest spite, -Take all my comfort of thy worth and truth. -For whether beauty, birth, or wealth, or wit, -Or any of these all, or all, or more, -Entitled in thy parts do crowned sit, -I make my love engrafted to this store: -So then I am not lame, poor, nor despised, -Whilst that this shadow doth such substance give -That I in thy abundance am sufficed -And by a part of all thy glory live. -Look, what is best, that best I wish in thee: -This wish I have; then ten times happy me!FROM fairest creatures we desire increase, -That thereby beauty's rose might never die, -But as the riper should by time decease, -His tender heir might bear his memory: -But thou, contracted to thine own bright eyes, -Feed'st thy light'st flame with self-substantial fuel, -Making a famine where abundance lies, -Thyself thy foe, to thy sweet self too cruel. -Thou that art now the world's fresh ornament -And only herald to the gaudy spring, -Within thine own bud buriest thy content -And, tender churl, makest waste in niggarding. -Pity the world, or else this glutton be, -To eat the world's due, by the grave and thee. 
-When forty winters shall beseige thy brow, -And dig deep trenches in thy beauty's field, -Thy youth's proud livery, so gazed on now, -Will be a tatter'd weed, of small worth held: -Then being ask'd where all thy beauty lies, -Where all the treasure of thy lusty days, -To say, within thine own deep-sunken eyes, -Were an all-eating shame and thriftless praise. -How much more praise deserved thy beauty's use, -If thou couldst answer 'This fair child of mine -Shall sum my count and make my old excuse,' -Proving his beauty by succession thine! -This were to be new made when thou art old, -And see thy blood warm when thou feel'st it cold. -Look in thy glass, and tell the face thou viewest -Now is the time that face should form another; -Whose fresh repair if now thou not renewest, -Thou dost beguile the world, unbless some mother. -For where is she so fair whose unear'd womb -Disdains the tillage of thy husbandry? -Or who is he so fond will be the tomb -Of his self-love, to stop posterity? -Thou art thy mother's glass, and she in thee -Calls back the lovely April of her prime: -So thou through windows of thine age shall see -Despite of wrinkles this thy golden time. -But if thou live, remember'd not to be, -Die single, and thine image dies with thee. -Unthrifty loveliness, why dost thou spend -Upon thyself thy beauty's legacy? -Nature's bequest gives nothing but doth lend, -And being frank she lends to those are free. -Then, beauteous niggard, why dost thou abuse -The bounteous largess given thee to give? -Profitless usurer, why dost thou use -So great a sum of sums, yet canst not live? -For having traffic with thyself alone, -Thou of thyself thy sweet self dost deceive. -Then how, when nature calls thee to be gone, -What acceptable audit canst thou leave? -Thy unused beauty must be tomb'd with thee, -Which, used, lives th' executor to be. 
-Those hours, that with gentle work did frame -The lovely gaze where every eye doth dwell, -Will play the tyrants to the very same -And that unfair which fairly doth excel: -For never-resting time leads summer on -To hideous winter and confounds him there; -Sap cheque'd with frost and lusty leaves quite gone, -Beauty o'ersnow'd and bareness every where: -Then, were not summer's distillation left, -A liquid prisoner pent in walls of glass, -Beauty's effect with beauty were bereft, -Nor it nor no remembrance what it was: -But flowers distill'd though they with winter meet, -Leese but their show; their substance still lives sweet. -Then let not winter's ragged hand deface -In thee thy summer, ere thou be distill'd: -Make sweet some vial; treasure thou some place -With beauty's treasure, ere it be self-kill'd. -That use is not forbidden usury, -Which happies those that pay the willing loan; -That's for thyself to breed another thee, -Or ten times happier, be it ten for one; -Ten times thyself were happier than thou art, -If ten of thine ten times refigured thee: -Then what could death do, if thou shouldst depart, -Leaving thee living in posterity? -Be not self-will'd, for thou art much too fair -To be death's conquest and make worms thine heir. -Lo! in the orient when the gracious light -Lifts up his burning head, each under eye -Doth homage to his new-appearing sight, -Serving with looks his sacred majesty; -And having climb'd the steep-up heavenly hill, -Resembling strong youth in his middle age, -yet mortal looks adore his beauty still, -Attending on his golden pilgrimage; -But when from highmost pitch, with weary car, -Like feeble age, he reeleth from the day, -The eyes, 'fore duteous, now converted are -From his low tract and look another way: -So thou, thyself out-going in thy noon, -Unlook'd on diest, unless thou get a son. -Music to hear, why hear'st thou music sadly? -Sweets with sweets war not, joy delights in joy. 
-Why lovest thou that which thou receivest not gladly, -Or else receivest with pleasure thine annoy? -If the true concord of well-tuned sounds, -By unions married, do offend thine ear, -They do but sweetly chide thee, who confounds -In singleness the parts that thou shouldst bear. -Mark how one string, sweet husband to another, -Strikes each in each by mutual ordering, -Resembling sire and child and happy mother -Who all in one, one pleasing note do sing: -Whose speechless song, being many, seeming one, -Sings this to thee: 'thou single wilt prove none.' -Is it for fear to wet a widow's eye -That thou consumest thyself in single life? -Ah! if thou issueless shalt hap to die. -The world will wail thee, like a makeless wife; -The world will be thy widow and still weep -That thou no form of thee hast left behind, -When every private widow well may keep -By children's eyes her husband's shape in mind. -Look, what an unthrift in the world doth spend -Shifts but his place, for still the world enjoys it; -But beauty's waste hath in the world an end, -And kept unused, the user so destroys it. -No love toward others in that bosom sits -That on himself such murderous shame commits. -For shame! deny that thou bear'st love to any, -Who for thyself art so unprovident. -Grant, if thou wilt, thou art beloved of many, -But that thou none lovest is most evident; -For thou art so possess'd with murderous hate -That 'gainst thyself thou stick'st not to conspire. -Seeking that beauteous roof to ruinate -Which to repair should be thy chief desire. -O, change thy thought, that I may change my mind! -Shall hate be fairer lodged than gentle love? -Be, as thy presence is, gracious and kind, -Or to thyself at least kind-hearted prove: -Make thee another self, for love of me, -That beauty still may live in thine or thee. 
-As fast as thou shalt wane, so fast thou growest -In one of thine, from that which thou departest; -And that fresh blood which youngly thou bestowest -Thou mayst call thine when thou from youth convertest. -Herein lives wisdom, beauty and increase: -Without this, folly, age and cold decay: -If all were minded so, the times should cease -And threescore year would make the world away. -Let those whom Nature hath not made for store, -Harsh featureless and rude, barrenly perish: -Look, whom she best endow'd she gave the more; -Which bounteous gift thou shouldst in bounty cherish: -She carved thee for her seal, and meant thereby -Thou shouldst print more, not let that copy die. -When I do count the clock that tells the time, -And see the brave day sunk in hideous night; -When I behold the violet past prime, -And sable curls all silver'd o'er with white; -When lofty trees I see barren of leaves -Which erst from heat did canopy the herd, -And summer's green all girded up in sheaves -Borne on the bier with white and bristly beard, -Then of thy beauty do I question make, -That thou among the wastes of time must go, -Since sweets and beauties do themselves forsake -And die as fast as they see others grow; -And nothing 'gainst Time's scythe can make defence -Save breed, to brave him when he takes thee hence. -O, that you were yourself! but, love, you are -No longer yours than you yourself here live: -Against this coming end you should prepare, -And your sweet semblance to some other give. -So should that beauty which you hold in lease -Find no determination: then you were -Yourself again after yourself's decease, -When your sweet issue your sweet form should bear. -Who lets so fair a house fall to decay, -Which husbandry in honour might uphold -Against the stormy gusts of winter's day -And barren rage of death's eternal cold? -O, none but unthrifts! Dear my love, you know -You had a father: let your son say so. 
-Not from the stars do I my judgment pluck; -And yet methinks I have astronomy, -But not to tell of good or evil luck, -Of plagues, of dearths, or seasons' quality; -Nor can I fortune to brief minutes tell, -Pointing to each his thunder, rain and wind, -Or say with princes if it shall go well, -By oft predict that I in heaven find: -But from thine eyes my knowledge I derive, -And, constant stars, in them I read such art -As truth and beauty shall together thrive, -If from thyself to store thou wouldst convert; -Or else of thee this I prognosticate: -Thy end is truth's and beauty's doom and date. -When I consider every thing that grows -Holds in perfection but a little moment, -That this huge stage presenteth nought but shows -Whereon the stars in secret influence comment; -When I perceive that men as plants increase, -Cheered and cheque'd even by the self-same sky, -Vaunt in their youthful sap, at height decrease, -And wear their brave state out of memory; -Then the conceit of this inconstant stay -Sets you most rich in youth before my sight, -Where wasteful Time debateth with Decay, -To change your day of youth to sullied night; -And all in war with Time for love of you, -As he takes from you, I engraft you new. -But wherefore do not you a mightier way -Make war upon this bloody tyrant, Time? -And fortify yourself in your decay -With means more blessed than my barren rhyme? -Now stand you on the top of happy hours, -And many maiden gardens yet unset -With virtuous wish would bear your living flowers, -Much liker than your painted counterfeit: -So should the lines of life that life repair, -Which this, Time's pencil, or my pupil pen, -Neither in inward worth nor outward fair, -Can make you live yourself in eyes of men. -To give away yourself keeps yourself still, -And you must live, drawn by your own sweet skill. -Who will believe my verse in time to come, -If it were fill'd with your most high deserts? 
-Though yet, heaven knows, it is but as a tomb -Which hides your life and shows not half your parts. -If I could write the beauty of your eyes -And in fresh numbers number all your graces, -The age to come would say 'This poet lies: -Such heavenly touches ne'er touch'd earthly faces.' -So should my papers yellow'd with their age -Be scorn'd like old men of less truth than tongue, -And your true rights be term'd a poet's rage -And stretched metre of an antique song: -But were some child of yours alive that time, -You should live twice; in it and in my rhyme. -Shall I compare thee to a summer's day? -Thou art more lovely and more temperate: -Rough winds do shake the darling buds of May, -And summer's lease hath all too short a date: -Sometime too hot the eye of heaven shines, -And often is his gold complexion dimm'd; -And every fair from fair sometime declines, -By chance or nature's changing course untrimm'd; -But thy eternal summer shall not fade -Nor lose possession of that fair thou owest; -Nor shall Death brag thou wander'st in his shade, -When in eternal lines to time thou growest: -So long as men can breathe or eyes can see, -So long lives this and this gives life to thee. -Devouring Time, blunt thou the lion's paws, -And make the earth devour her own sweet brood; -Pluck the keen teeth from the fierce tiger's jaws, -And burn the long-lived phoenix in her blood; -Make glad and sorry seasons as thou fleets, -And do whate'er thou wilt, swift-footed Time, -To the wide world and all her fading sweets; -But I forbid thee one most heinous crime: -O, carve not with thy hours my love's fair brow, -Nor draw no lines there with thine antique pen; -Him in thy course untainted do allow -For beauty's pattern to succeeding men. -Yet, do thy worst, old Time: despite thy wrong, -My love shall in my verse ever live young. 
-A woman's face with Nature's own hand painted -Hast thou, the master-mistress of my passion; -A woman's gentle heart, but not acquainted -With shifting change, as is false women's fashion; -An eye more bright than theirs, less false in rolling, -Gilding the object whereupon it gazeth; -A man in hue, all 'hues' in his controlling, -Much steals men's eyes and women's souls amazeth. -And for a woman wert thou first created; -Till Nature, as she wrought thee, fell a-doting, -And by addition me of thee defeated, -By adding one thing to my purpose nothing. -But since she prick'd thee out for women's pleasure, -Mine be thy love and thy love's use their treasure. -So is it not with me as with that Muse -Stirr'd by a painted beauty to his verse, -Who heaven itself for ornament doth use -And every fair with his fair doth rehearse -Making a couplement of proud compare, -With sun and moon, with earth and sea's rich gems, -With April's first-born flowers, and all things rare -That heaven's air in this huge rondure hems. -O' let me, true in love, but truly write, -And then believe me, my love is as fair -As any mother's child, though not so bright -As those gold candles fix'd in heaven's air: -Let them say more than like of hearsay well; -I will not praise that purpose not to sell. -My glass shall not persuade me I am old, -So long as youth and thou are of one date; -But when in thee time's furrows I behold, -Then look I death my days should expiate. -For all that beauty that doth cover thee -Is but the seemly raiment of my heart, -Which in thy breast doth live, as thine in me: -How can I then be elder than thou art? -O, therefore, love, be of thyself so wary -As I, not for myself, but for thee will; -Bearing thy heart, which I will keep so chary -As tender nurse her babe from faring ill. -Presume not on thy heart when mine is slain; -Thou gavest me thine, not to give back again. 
-As an unperfect actor on the stage -Who with his fear is put besides his part, -Or some fierce thing replete with too much rage, -Whose strength's abundance weakens his own heart. -So I, for fear of trust, forget to say -The perfect ceremony of love's rite, -And in mine own love's strength seem to decay, -O'ercharged with burden of mine own love's might. -O, let my books be then the eloquence -And dumb presagers of my speaking breast, -Who plead for love and look for recompense -More than that tongue that more hath more express'd. -O, learn to read what silent love hath writ: -To hear with eyes belongs to love's fine wit. -Mine eye hath play'd the painter and hath stell'd -Thy beauty's form in table of my heart; -My body is the frame wherein 'tis held, -And perspective it is the painter's art. -For through the painter must you see his skill, -To find where your true image pictured lies; -Which in my bosom's shop is hanging still, -That hath his windows glazed with thine eyes. -Now see what good turns eyes for eyes have done: -Mine eyes have drawn thy shape, and thine for me -Are windows to my breast, where-through the sun -Delights to peep, to gaze therein on thee; -Yet eyes this cunning want to grace their art; -They draw but what they see, know not the heart. -Let those who are in favour with their stars -Of public honour and proud titles boast, -Whilst I, whom fortune of such triumph bars, -Unlook'd for joy in that I honour most. -Great princes' favourites their fair leaves spread -But as the marigold at the sun's eye, -And in themselves their pride lies buried, -For at a frown they in their glory die. -The painful warrior famoused for fight, -After a thousand victories once foil'd, -Is from the book of honour razed quite, -And all the rest forgot for which he toil'd: -Then happy I, that love and am beloved -Where I may not remove nor be removed. 
-Lord of my love, to whom in vassalage -Thy merit hath my duty strongly knit, -To thee I send this written embassage, -To witness duty, not to show my wit: -Duty so great, which wit so poor as mine -May make seem bare, in wanting words to show it, -But that I hope some good conceit of thine -In thy soul's thought, all naked, will bestow it; -Till whatsoever star that guides my moving -Points on me graciously with fair aspect -And puts apparel on my tatter'd loving, -To show me worthy of thy sweet respect: -Then may I dare to boast how I do love thee; -Till then not show my head where thou mayst prove me. -Weary with toil, I haste me to my bed, -The dear repose for limbs with travel tired; -But then begins a journey in my head, -To work my mind, when body's work's expired: -For then my thoughts, from far where I abide, -Intend a zealous pilgrimage to thee, -And keep my drooping eyelids open wide, -Looking on darkness which the blind do see -Save that my soul's imaginary sight -Presents thy shadow to my sightless view, -Which, like a jewel hung in ghastly night, -Makes black night beauteous and her old face new. -Lo! thus, by day my limbs, by night my mind, -For thee and for myself no quiet find. -How can I then return in happy plight, -That am debarr'd the benefit of rest? -When day's oppression is not eased by night, -But day by night, and night by day, oppress'd? -And each, though enemies to either's reign, -Do in consent shake hands to torture me; -The one by toil, the other to complain -How far I toil, still farther off from thee. -I tell the day, to please them thou art bright -And dost him grace when clouds do blot the heaven: -So flatter I the swart-complexion'd night, -When sparkling stars twire not thou gild'st the even. -But day doth daily draw my sorrows longer -And night doth nightly make grief's strength seem stronger. 
-When, in disgrace with fortune and men's eyes, -I all alone beweep my outcast state -And trouble deal heaven with my bootless cries -And look upon myself and curse my fate, -Wishing me like to one more rich in hope, -Featured like him, like him with friends possess'd, -Desiring this man's art and that man's scope, -With what I most enjoy contented least; -Yet in these thoughts myself almost despising, -Haply I think on thee, and then my state, -Like to the lark at break of day arising -From sullen earth, sings hymns at heaven's gate; -For thy sweet love remember'd such wealth brings -That then I scorn to change my state with kings. -When to the sessions of sweet silent thought -I summon up remembrance of things past, -I sigh the lack of many a thing I sought, -And with old woes new wail my dear time's waste: -Then can I drown an eye, unused to flow, -For precious friends hid in death's dateless night, -And weep afresh love's long since cancell'd woe, -And moan the expense of many a vanish'd sight: -Then can I grieve at grievances foregone, -And heavily from woe to woe tell o'er -The sad account of fore-bemoaned moan, -Which I new pay as if not paid before. -But if the while I think on thee, dear friend, -All losses are restored and sorrows end. -Thy bosom is endeared with all hearts, -Which I by lacking have supposed dead, -And there reigns love and all love's loving parts, -And all those friends which I thought buried. -How many a holy and obsequious tear -Hath dear religious love stol'n from mine eye -As interest of the dead, which now appear -But things removed that hidden in thee lie! -Thou art the grave where buried love doth live, -Hung with the trophies of my lovers gone, -Who all their parts of me to thee did give; -That due of many now is thine alone: -Their images I loved I view in thee, -And thou, all they, hast all the all of me. 
-If thou survive my well-contented day, -When that churl Death my bones with dust shall cover, -And shalt by fortune once more re-survey -These poor rude lines of thy deceased lover, -Compare them with the bettering of the time, -And though they be outstripp'd by every pen, -Reserve them for my love, not for their rhyme, -Exceeded by the height of happier men. -O, then vouchsafe me but this loving thought: -'Had my friend's Muse grown with this growing age, -A dearer birth than this his love had brought, -To march in ranks of better equipage: -But since he died and poets better prove, -Theirs for their style I'll read, his for his love.' -Full many a glorious morning have I seen -Flatter the mountain-tops with sovereign eye, -Kissing with golden face the meadows green, -Gilding pale streams with heavenly alchemy; -Anon permit the basest clouds to ride -With ugly rack on his celestial face, -And from the forlorn world his visage hide, -Stealing unseen to west with this disgrace: -Even so my sun one early morn did shine -With all triumphant splendor on my brow; -But out, alack! he was but one hour mine; -The region cloud hath mask'd him from me now. -Yet him for this my love no whit disdaineth; -Suns of the world may stain when heaven's sun staineth. -Why didst thou promise such a beauteous day, -And make me travel forth without my cloak, -To let base clouds o'ertake me in my way, -Hiding thy bravery in their rotten smoke? -'Tis not enough that through the cloud thou break, -To dry the rain on my storm-beaten face, -For no man well of such a salve can speak -That heals the wound and cures not the disgrace: -Nor can thy shame give physic to my grief; -Though thou repent, yet I have still the loss: -The offender's sorrow lends but weak relief -To him that bears the strong offence's cross. -Ah! but those tears are pearl which thy love sheds, -And they are rich and ransom all ill deeds. 
-No more be grieved at that which thou hast done: -Roses have thorns, and silver fountains mud; -Clouds and eclipses stain both moon and sun, -And loathsome canker lives in sweetest bud. -All men make faults, and even I in this, -Authorizing thy trespass with compare, -Myself corrupting, salving thy amiss, -Excusing thy sins more than thy sins are; -For to thy sensual fault I bring in sense-- -Thy adverse party is thy advocate-- -And 'gainst myself a lawful plea commence: -Such civil war is in my love and hate -That I an accessary needs must be -To that sweet thief which sourly robs from me. -Let me confess that we two must be twain, -Although our undivided loves are one: -So shall those blots that do with me remain -Without thy help by me be borne alone. -In our two loves there is but one respect, -Though in our lives a separable spite, -Which though it alter not love's sole effect, -Yet doth it steal sweet hours from love's delight. -I may not evermore acknowledge thee, -Lest my bewailed guilt should do thee shame, -Nor thou with public kindness honour me, -Unless thou take that honour from thy name: -But do not so; I love thee in such sort -As, thou being mine, mine is thy good report. -As a decrepit father takes delight -To see his active child do deeds of youth, -So I, made lame by fortune's dearest spite, -Take all my comfort of thy worth and truth. -For whether beauty, birth, or wealth, or wit, -Or any of these all, or all, or more, -Entitled in thy parts do crowned sit, -I make my love engrafted to this store: -So then I am not lame, poor, nor despised, -Whilst that this shadow doth such substance give -That I in thy abundance am sufficed -And by a part of all thy glory live. 
-Look, what is best, that best I wish in thee: -This wish I have; then ten times happy me!FROM fairest creatures we desire increase, -That thereby beauty's rose might never die, -But as the riper should by time decease, -His tender heir might bear his memory: -But thou, contracted to thine own bright eyes, -Feed'st thy light'st flame with self-substantial fuel, -Making a famine where abundance lies, -Thyself thy foe, to thy sweet self too cruel. -Thou that art now the world's fresh ornament -And only herald to the gaudy spring, -Within thine own bud buriest thy content -And, tender churl, makest waste in niggarding. -Pity the world, or else this glutton be, -To eat the world's due, by the grave and thee. -When forty winters shall beseige thy brow, -And dig deep trenches in thy beauty's field, -Thy youth's proud livery, so gazed on now, -Will be a tatter'd weed, of small worth held: -Then being ask'd where all thy beauty lies, -Where all the treasure of thy lusty days, -To say, within thine own deep-sunken eyes, -Were an all-eating shame and thriftless praise. -How much more praise deserved thy beauty's use, -If thou couldst answer 'This fair child of mine -Shall sum my count and make my old excuse,' -Proving his beauty by succession thine! -This were to be new made when thou art old, -And see thy blood warm when thou feel'st it cold. -Look in thy glass, and tell the face thou viewest -Now is the time that face should form another; -Whose fresh repair if now thou not renewest, -Thou dost beguile the world, unbless some mother. -For where is she so fair whose unear'd womb -Disdains the tillage of thy husbandry? -Or who is he so fond will be the tomb -Of his self-love, to stop posterity? -Thou art thy mother's glass, and she in thee -Calls back the lovely April of her prime: -So thou through windows of thine age shall see -Despite of wrinkles this thy golden time. -But if thou live, remember'd not to be, -Die single, and thine image dies with thee. 
-Unthrifty loveliness, why dost thou spend -Upon thyself thy beauty's legacy? -Nature's bequest gives nothing but doth lend, -And being frank she lends to those are free. -Then, beauteous niggard, why dost thou abuse -The bounteous largess given thee to give? -Profitless usurer, why dost thou use -So great a sum of sums, yet canst not live? -For having traffic with thyself alone, -Thou of thyself thy sweet self dost deceive. -Then how, when nature calls thee to be gone, -What acceptable audit canst thou leave? -Thy unused beauty must be tomb'd with thee, -Which, used, lives th' executor to be. -Those hours, that with gentle work did frame -The lovely gaze where every eye doth dwell, -Will play the tyrants to the very same -And that unfair which fairly doth excel: -For never-resting time leads summer on -To hideous winter and confounds him there; -Sap cheque'd with frost and lusty leaves quite gone, -Beauty o'ersnow'd and bareness every where: -Then, were not summer's distillation left, -A liquid prisoner pent in walls of glass, -Beauty's effect with beauty were bereft, -Nor it nor no remembrance what it was: -But flowers distill'd though they with winter meet, -Leese but their show; their substance still lives sweet. -Then let not winter's ragged hand deface -In thee thy summer, ere thou be distill'd: -Make sweet some vial; treasure thou some place -With beauty's treasure, ere it be self-kill'd. -That use is not forbidden usury, -Which happies those that pay the willing loan; -That's for thyself to breed another thee, -Or ten times happier, be it ten for one; -Ten times thyself were happier than thou art, -If ten of thine ten times refigured thee: -Then what could death do, if thou shouldst depart, -Leaving thee living in posterity? -Be not self-will'd, for thou art much too fair -To be death's conquest and make worms thine heir. -Lo! 
in the orient when the gracious light -Lifts up his burning head, each under eye -Doth homage to his new-appearing sight, -Serving with looks his sacred majesty; -And having climb'd the steep-up heavenly hill, -Resembling strong youth in his middle age, -yet mortal looks adore his beauty still, -Attending on his golden pilgrimage; -But when from highmost pitch, with weary car, -Like feeble age, he reeleth from the day, -The eyes, 'fore duteous, now converted are -From his low tract and look another way: -So thou, thyself out-going in thy noon, -Unlook'd on diest, unless thou get a son. -Music to hear, why hear'st thou music sadly? -Sweets with sweets war not, joy delights in joy. -Why lovest thou that which thou receivest not gladly, -Or else receivest with pleasure thine annoy? -If the true concord of well-tuned sounds, -By unions married, do offend thine ear, -They do but sweetly chide thee, who confounds -In singleness the parts that thou shouldst bear. -Mark how one string, sweet husband to another, -Strikes each in each by mutual ordering, -Resembling sire and child and happy mother -Who all in one, one pleasing note do sing: -Whose speechless song, being many, seeming one, -Sings this to thee: 'thou single wilt prove none.' -Is it for fear to wet a widow's eye -That thou consumest thyself in single life? -Ah! if thou issueless shalt hap to die. -The world will wail thee, like a makeless wife; -The world will be thy widow and still weep -That thou no form of thee hast left behind, -When every private widow well may keep -By children's eyes her husband's shape in mind. -Look, what an unthrift in the world doth spend -Shifts but his place, for still the world enjoys it; -But beauty's waste hath in the world an end, -And kept unused, the user so destroys it. -No love toward others in that bosom sits -That on himself such murderous shame commits. -For shame! deny that thou bear'st love to any, -Who for thyself art so unprovident. 
-Grant, if thou wilt, thou art beloved of many, -But that thou none lovest is most evident; -For thou art so possess'd with murderous hate -That 'gainst thyself thou stick'st not to conspire. -Seeking that beauteous roof to ruinate -Which to repair should be thy chief desire. -O, change thy thought, that I may change my mind! -Shall hate be fairer lodged than gentle love? -Be, as thy presence is, gracious and kind, -Or to thyself at least kind-hearted prove: -Make thee another self, for love of me, -That beauty still may live in thine or thee. -As fast as thou shalt wane, so fast thou growest -In one of thine, from that which thou departest; -And that fresh blood which youngly thou bestowest -Thou mayst call thine when thou from youth convertest. -Herein lives wisdom, beauty and increase: -Without this, folly, age and cold decay: -If all were minded so, the times should cease -And threescore year would make the world away. -Let those whom Nature hath not made for store, -Harsh featureless and rude, barrenly perish: -Look, whom she best endow'd she gave the more; -Which bounteous gift thou shouldst in bounty cherish: -She carved thee for her seal, and meant thereby -Thou shouldst print more, not let that copy die. -When I do count the clock that tells the time, -And see the brave day sunk in hideous night; -When I behold the violet past prime, -And sable curls all silver'd o'er with white; -When lofty trees I see barren of leaves -Which erst from heat did canopy the herd, -And summer's green all girded up in sheaves -Borne on the bier with white and bristly beard, -Then of thy beauty do I question make, -That thou among the wastes of time must go, -Since sweets and beauties do themselves forsake -And die as fast as they see others grow; -And nothing 'gainst Time's scythe can make defence -Save breed, to brave him when he takes thee hence. -O, that you were yourself! 
but, love, you are -No longer yours than you yourself here live: -Against this coming end you should prepare, -And your sweet semblance to some other give. -So should that beauty which you hold in lease -Find no determination: then you were -Yourself again after yourself's decease, -When your sweet issue your sweet form should bear. -Who lets so fair a house fall to decay, -Which husbandry in honour might uphold -Against the stormy gusts of winter's day -And barren rage of death's eternal cold? -O, none but unthrifts! Dear my love, you know -You had a father: let your son say so. -Not from the stars do I my judgment pluck; -And yet methinks I have astronomy, -But not to tell of good or evil luck, -Of plagues, of dearths, or seasons' quality; -Nor can I fortune to brief minutes tell, -Pointing to each his thunder, rain and wind, -Or say with princes if it shall go well, -By oft predict that I in heaven find: -But from thine eyes my knowledge I derive, -And, constant stars, in them I read such art -As truth and beauty shall together thrive, -If from thyself to store thou wouldst convert; -Or else of thee this I prognosticate: -Thy end is truth's and beauty's doom and date. -When I consider every thing that grows -Holds in perfection but a little moment, -That this huge stage presenteth nought but shows -Whereon the stars in secret influence comment; -When I perceive that men as plants increase, -Cheered and cheque'd even by the self-same sky, -Vaunt in their youthful sap, at height decrease, -And wear their brave state out of memory; -Then the conceit of this inconstant stay -Sets you most rich in youth before my sight, -Where wasteful Time debateth with Decay, -To change your day of youth to sullied night; -And all in war with Time for love of you, -As he takes from you, I engraft you new. -But wherefore do not you a mightier way -Make war upon this bloody tyrant, Time? -And fortify yourself in your decay -With means more blessed than my barren rhyme? 
-Now stand you on the top of happy hours, -And many maiden gardens yet unset -With virtuous wish would bear your living flowers, -Much liker than your painted counterfeit: -So should the lines of life that life repair, -Which this, Time's pencil, or my pupil pen, -Neither in inward worth nor outward fair, -Can make you live yourself in eyes of men. -To give away yourself keeps yourself still, -And you must live, drawn by your own sweet skill. -Who will believe my verse in time to come, -If it were fill'd with your most high deserts? -Though yet, heaven knows, it is but as a tomb -Which hides your life and shows not half your parts. -If I could write the beauty of your eyes -And in fresh numbers number all your graces, -The age to come would say 'This poet lies: -Such heavenly touches ne'er touch'd earthly faces.' -So should my papers yellow'd with their age -Be scorn'd like old men of less truth than tongue, -And your true rights be term'd a poet's rage -And stretched metre of an antique song: -But were some child of yours alive that time, -You should live twice; in it and in my rhyme. -Shall I compare thee to a summer's day? -Thou art more lovely and more temperate: -Rough winds do shake the darling buds of May, -And summer's lease hath all too short a date: -Sometime too hot the eye of heaven shines, -And often is his gold complexion dimm'd; -And every fair from fair sometime declines, -By chance or nature's changing course untrimm'd; -But thy eternal summer shall not fade -Nor lose possession of that fair thou owest; -Nor shall Death brag thou wander'st in his shade, -When in eternal lines to time thou growest: -So long as men can breathe or eyes can see, -So long lives this and this gives life to thee. 
-Devouring Time, blunt thou the lion's paws, -And make the earth devour her own sweet brood; -Pluck the keen teeth from the fierce tiger's jaws, -And burn the long-lived phoenix in her blood; -Make glad and sorry seasons as thou fleets, -And do whate'er thou wilt, swift-footed Time, -To the wide world and all her fading sweets; -But I forbid thee one most heinous crime: -O, carve not with thy hours my love's fair brow, -Nor draw no lines there with thine antique pen; -Him in thy course untainted do allow -For beauty's pattern to succeeding men. -Yet, do thy worst, old Time: despite thy wrong, -My love shall in my verse ever live young. -A woman's face with Nature's own hand painted -Hast thou, the master-mistress of my passion; -A woman's gentle heart, but not acquainted -With shifting change, as is false women's fashion; -An eye more bright than theirs, less false in rolling, -Gilding the object whereupon it gazeth; -A man in hue, all 'hues' in his controlling, -Much steals men's eyes and women's souls amazeth. -And for a woman wert thou first created; -Till Nature, as she wrought thee, fell a-doting, -And by addition me of thee defeated, -By adding one thing to my purpose nothing. -But since she prick'd thee out for women's pleasure, -Mine be thy love and thy love's use their treasure. -So is it not with me as with that Muse -Stirr'd by a painted beauty to his verse, -Who heaven itself for ornament doth use -And every fair with his fair doth rehearse -Making a couplement of proud compare, -With sun and moon, with earth and sea's rich gems, -With April's first-born flowers, and all things rare -That heaven's air in this huge rondure hems. -O' let me, true in love, but truly write, -And then believe me, my love is as fair -As any mother's child, though not so bright -As those gold candles fix'd in heaven's air: -Let them say more than like of hearsay well; -I will not praise that purpose not to sell. 
-My glass shall not persuade me I am old, -So long as youth and thou are of one date; -But when in thee time's furrows I behold, -Then look I death my days should expiate. -For all that beauty that doth cover thee -Is but the seemly raiment of my heart, -Which in thy breast doth live, as thine in me: -How can I then be elder than thou art? -O, therefore, love, be of thyself so wary -As I, not for myself, but for thee will; -Bearing thy heart, which I will keep so chary -As tender nurse her babe from faring ill. -Presume not on thy heart when mine is slain; -Thou gavest me thine, not to give back again. -As an unperfect actor on the stage -Who with his fear is put besides his part, -Or some fierce thing replete with too much rage, -Whose strength's abundance weakens his own heart. -So I, for fear of trust, forget to say -The perfect ceremony of love's rite, -And in mine own love's strength seem to decay, -O'ercharged with burden of mine own love's might. -O, let my books be then the eloquence -And dumb presagers of my speaking breast, -Who plead for love and look for recompense -More than that tongue that more hath more express'd. -O, learn to read what silent love hath writ: -To hear with eyes belongs to love's fine wit. -Mine eye hath play'd the painter and hath stell'd -Thy beauty's form in table of my heart; -My body is the frame wherein 'tis held, -And perspective it is the painter's art. -For through the painter must you see his skill, -To find where your true image pictured lies; -Which in my bosom's shop is hanging still, -That hath his windows glazed with thine eyes. -Now see what good turns eyes for eyes have done: -Mine eyes have drawn thy shape, and thine for me -Are windows to my breast, where-through the sun -Delights to peep, to gaze therein on thee; -Yet eyes this cunning want to grace their art; -They draw but what they see, know not the heart. 
-Let those who are in favour with their stars -Of public honour and proud titles boast, -Whilst I, whom fortune of such triumph bars, -Unlook'd for joy in that I honour most. -Great princes' favourites their fair leaves spread -But as the marigold at the sun's eye, -And in themselves their pride lies buried, -For at a frown they in their glory die. -The painful warrior famoused for fight, -After a thousand victories once foil'd, -Is from the book of honour razed quite, -And all the rest forgot for which he toil'd: -Then happy I, that love and am beloved -Where I may not remove nor be removed. -Lord of my love, to whom in vassalage -Thy merit hath my duty strongly knit, -To thee I send this written embassage, -To witness duty, not to show my wit: -Duty so great, which wit so poor as mine -May make seem bare, in wanting words to show it, -But that I hope some good conceit of thine -In thy soul's thought, all naked, will bestow it; -Till whatsoever star that guides my moving -Points on me graciously with fair aspect -And puts apparel on my tatter'd loving, -To show me worthy of thy sweet respect: -Then may I dare to boast how I do love thee; -Till then not show my head where thou mayst prove me. -Weary with toil, I haste me to my bed, -The dear repose for limbs with travel tired; -But then begins a journey in my head, -To work my mind, when body's work's expired: -For then my thoughts, from far where I abide, -Intend a zealous pilgrimage to thee, -And keep my drooping eyelids open wide, -Looking on darkness which the blind do see -Save that my soul's imaginary sight -Presents thy shadow to my sightless view, -Which, like a jewel hung in ghastly night, -Makes black night beauteous and her old face new. -Lo! thus, by day my limbs, by night my mind, -For thee and for myself no quiet find. -How can I then return in happy plight, -That am debarr'd the benefit of rest? -When day's oppression is not eased by night, -But day by night, and night by day, oppress'd? 
-And each, though enemies to either's reign, -Do in consent shake hands to torture me; -The one by toil, the other to complain -How far I toil, still farther off from thee. -I tell the day, to please them thou art bright -And dost him grace when clouds do blot the heaven: -So flatter I the swart-complexion'd night, -When sparkling stars twire not thou gild'st the even. -But day doth daily draw my sorrows longer -And night doth nightly make grief's strength seem stronger. -When, in disgrace with fortune and men's eyes, -I all alone beweep my outcast state -And trouble deal heaven with my bootless cries -And look upon myself and curse my fate, -Wishing me like to one more rich in hope, -Featured like him, like him with friends possess'd, -Desiring this man's art and that man's scope, -With what I most enjoy contented least; -Yet in these thoughts myself almost despising, -Haply I think on thee, and then my state, -Like to the lark at break of day arising -From sullen earth, sings hymns at heaven's gate; -For thy sweet love remember'd such wealth brings -That then I scorn to change my state with kings. -When to the sessions of sweet silent thought -I summon up remembrance of things past, -I sigh the lack of many a thing I sought, -And with old woes new wail my dear time's waste: -Then can I drown an eye, unused to flow, -For precious friends hid in death's dateless night, -And weep afresh love's long since cancell'd woe, -And moan the expense of many a vanish'd sight: -Then can I grieve at grievances foregone, -And heavily from woe to woe tell o'er -The sad account of fore-bemoaned moan, -Which I new pay as if not paid before. -But if the while I think on thee, dear friend, -All losses are restored and sorrows end. -Thy bosom is endeared with all hearts, -Which I by lacking have supposed dead, -And there reigns love and all love's loving parts, -And all those friends which I thought buried. 
-How many a holy and obsequious tear -Hath dear religious love stol'n from mine eye -As interest of the dead, which now appear -But things removed that hidden in thee lie! -Thou art the grave where buried love doth live, -Hung with the trophies of my lovers gone, -Who all their parts of me to thee did give; -That due of many now is thine alone: -Their images I loved I view in thee, -And thou, all they, hast all the all of me. -If thou survive my well-contented day, -When that churl Death my bones with dust shall cover, -And shalt by fortune once more re-survey -These poor rude lines of thy deceased lover, -Compare them with the bettering of the time, -And though they be outstripp'd by every pen, -Reserve them for my love, not for their rhyme, -Exceeded by the height of happier men. -O, then vouchsafe me but this loving thought: -'Had my friend's Muse grown with this growing age, -A dearer birth than this his love had brought, -To march in ranks of better equipage: -But since he died and poets better prove, -Theirs for their style I'll read, his for his love.' -Full many a glorious morning have I seen -Flatter the mountain-tops with sovereign eye, -Kissing with golden face the meadows green, -Gilding pale streams with heavenly alchemy; -Anon permit the basest clouds to ride -With ugly rack on his celestial face, -And from the forlorn world his visage hide, -Stealing unseen to west with this disgrace: -Even so my sun one early morn did shine -With all triumphant splendor on my brow; -But out, alack! he was but one hour mine; -The region cloud hath mask'd him from me now. -Yet him for this my love no whit disdaineth; -Suns of the world may stain when heaven's sun staineth. -Why didst thou promise such a beauteous day, -And make me travel forth without my cloak, -To let base clouds o'ertake me in my way, -Hiding thy bravery in their rotten smoke? 
-'Tis not enough that through the cloud thou break, -To dry the rain on my storm-beaten face, -For no man well of such a salve can speak -That heals the wound and cures not the disgrace: -Nor can thy shame give physic to my grief; -Though thou repent, yet I have still the loss: -The offender's sorrow lends but weak relief -To him that bears the strong offence's cross. -Ah! but those tears are pearl which thy love sheds, -And they are rich and ransom all ill deeds. -No more be grieved at that which thou hast done: -Roses have thorns, and silver fountains mud; -Clouds and eclipses stain both moon and sun, -And loathsome canker lives in sweetest bud. -All men make faults, and even I in this, -Authorizing thy trespass with compare, -Myself corrupting, salving thy amiss, -Excusing thy sins more than thy sins are; -For to thy sensual fault I bring in sense-- -Thy adverse party is thy advocate-- -And 'gainst myself a lawful plea commence: -Such civil war is in my love and hate -That I an accessary needs must be -To that sweet thief which sourly robs from me. -Let me confess that we two must be twain, -Although our undivided loves are one: -So shall those blots that do with me remain -Without thy help by me be borne alone. -In our two loves there is but one respect, -Though in our lives a separable spite, -Which though it alter not love's sole effect, -Yet doth it steal sweet hours from love's delight. -I may not evermore acknowledge thee, -Lest my bewailed guilt should do thee shame, -Nor thou with public kindness honour me, -Unless thou take that honour from thy name: -But do not so; I love thee in such sort -As, thou being mine, mine is thy good report. -As a decrepit father takes delight -To see his active child do deeds of youth, -So I, made lame by fortune's dearest spite, -Take all my comfort of thy worth and truth. 
-For whether beauty, birth, or wealth, or wit, -Or any of these all, or all, or more, -Entitled in thy parts do crowned sit, -I make my love engrafted to this store: -So then I am not lame, poor, nor despised, -Whilst that this shadow doth such substance give -That I in thy abundance am sufficed -And by a part of all thy glory live. -Look, what is best, that best I wish in thee: -This wish I have; then ten times happy me! \ No newline at end of file From 168452fb026bc3cb35e476d74fca9f46afcb4f4e Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 22:26:11 +0000 Subject: [PATCH 205/278] revert change in flash attn and flash infer to clean up the diff --- vllm/attention/backends/flash_attn.py | 3 --- vllm/attention/backends/flashinfer.py | 1 - 2 files changed, 4 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 09456ca8d7b6..bf883987bd80 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -15,9 +15,6 @@ is_block_tables_empty) from vllm.utils import async_tensor_h2d, make_tensor_with_pad -from vllm.distributed import get_disagg_group -import vllm.envs as envs - if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, ModelInputForGPUWithSamplingMetadata) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 852c5cd8dc18..4054d337316f 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -823,4 +823,3 @@ def forward( k_scale=k_scale, v_scale=v_scale) return output.view(num_tokens, hidden_size) - \ No newline at end of file From 784d9058d5a8f483024d23887942583bac2ff238 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 22:35:12 +0000 Subject: [PATCH 206/278] update the example --- .../disagg_prefill/disagg_prefill_example.sh | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git 
a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index f57f5fd86d89..56b6f44c7418 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -16,7 +16,7 @@ wait_for_server() { } # prefilling instance -VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 python3 \ +VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 python3 \ -m vllm.entrypoints.openai.api_server \ --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8100 \ @@ -24,7 +24,7 @@ VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 pytho --gpu-memory-utilization 0.8 & # decoding instance -VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ +VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8200 \ @@ -36,18 +36,39 @@ wait_for_server 8100 wait_for_server 8200 # launch a proxy server that opens the service at port 8000 +# the workflow of this proxy: +# - send the request to prefill vLLM instance (port 8100), change max_tokens to 1 +# - after the prefill vLLM finishes prefill, send the request to decode vLLM instance python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & sleep 1 -# serve an example request -curl http://localhost:8000/v1/completions \ +# serve two example requests +output1=$(curl -s http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "prompt": "San Francisco is a", "max_tokens": 10, "temperature": 0 -}' +}') -# clean up -ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 \ No newline at end of file +output2=$(curl -s http://localhost:8000/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", +"prompt": "Santa Clara is 
a", +"max_tokens": 10, +"temperature": 0 +}') + +# Print the outputs of the curl requests +echo "" +echo "Output of first request: $output1" +echo "Output of second request: $output2" + +echo "Successfully finished 2 test requests!" +echo "" + +# Cleanup commands, suppressing their output +ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 > /dev/null 2>&1 +pkill -f python3 > /dev/null 2>&1 From 17d2505f1b810c68ec8e8f9ff8cde425647013f9 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 23:42:30 +0000 Subject: [PATCH 207/278] make format checker happy --- .../disagg_prefill_proxy_server.py | 40 +-- .../disagg_benchmarks/round_robin_proxy.py | 25 +- .../visualize_benchmark_results.py | 39 ++- .../kv_transfer/kv_lookup_buffer/base.py | 19 +- .../simple_kv_lookup_buffer.py | 122 ++++----- vllm/distributed/kv_transfer/kv_pipe/base.py | 16 +- .../kv_pipe/torch_distributed_pipe.py | 76 +++--- vllm/distributed/kv_transfer/vllm_adapter.py | 252 ++++++++++-------- vllm/distributed/parallel_state.py | 33 ++- vllm/executor/gpu_executor.py | 3 +- vllm/executor/multiproc_gpu_executor.py | 3 +- vllm/executor/ray_gpu_executor.py | 2 +- vllm/worker/model_runner.py | 46 ++-- vllm/worker/worker_base.py | 87 +++--- 14 files changed, 386 insertions(+), 377 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 5750df7735ad..4058b1c0a3b7 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,28 +1,31 @@ -from quart import Quart, request, Response, jsonify, make_response -import aiohttp -import sys -import traceback import os +import aiohttp +from quart import Quart, make_response, request + AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) app = Quart(__name__) + async def forward_request(url, data): - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as 
session: + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" } - async with session.post(url=url, json=data, headers=headers) as response: + async with session.post(url=url, json=data, + headers=headers) as response: if response.status == 200: # if response.headers.get('Transfer-Encoding') == 'chunked': if True: - async for chunk_bytes in response.content.iter_chunked(1024): + async for chunk_bytes in response.content.iter_chunked( + 1024): yield chunk_bytes else: content = await response.read() yield content + @app.route('/v1/completions', methods=['POST']) async def handle_request(): try: @@ -31,25 +34,28 @@ async def handle_request(): prefill_request = original_request_data.copy() # change max_tokens = 1 to let it only do prefill prefill_request['max_tokens'] = 1 - + # finish prefill - async for _ in forward_request('http://localhost:8100/v1/completions', prefill_request): + async for _ in forward_request('http://localhost:8100/v1/completions', + prefill_request): continue - print(f"Prefill done. 
proceeding to decode.") - # return decode - generator = forward_request('http://localhost:8200/v1/completions', original_request_data) + generator = forward_request('http://localhost:8200/v1/completions', + original_request_data) response = await make_response(generator) response.timeout = None return response - + except Exception as e: - pass - # exc_info = sys.exc_info() - # print(e) - # print("".join(traceback.format_exception(*exc_info))) + import sys + import traceback + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server") + print(e) + print("".join(traceback.format_exception(*exc_info))) + if __name__ == '__main__': app.run(port=8000) diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index 8751e24a08d3..6eb5f6398007 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -1,9 +1,12 @@ import asyncio +import itertools + import aiohttp from aiohttp import web -import itertools + class RoundRobinProxy: + def __init__(self, target_ports): self.target_ports = target_ports self.port_cycle = itertools.cycle(self.target_ports) @@ -16,16 +19,14 @@ async def handle_request(self, request): try: # Forward the request async with session.request( - method=request.method, - url=target_url, - headers=request.headers, - data=request.content, + method=request.method, + url=target_url, + headers=request.headers, + data=request.content, ) as response: # Start sending the response - resp = web.StreamResponse( - status=response.status, - headers=response.headers - ) + resp = web.StreamResponse(status=response.status, + headers=response.headers) await resp.prepare(request) # Stream the response content @@ -38,6 +39,7 @@ async def handle_request(self, request): except Exception as e: return web.Response(text=f"Error: {str(e)}", status=500) + async def main(): proxy = RoundRobinProxy([8100, 8200]) app = web.Application() @@ 
-49,9 +51,10 @@ async def main(): await site.start() print("Proxy server started on http://localhost:8000") - + # Keep the server running await asyncio.Event().wait() + if __name__ == '__main__': - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index 192f26a1e3cd..6c5bf5c791dc 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -1,40 +1,42 @@ +import json import matplotlib.pyplot as plt -import yaml import pandas as pd -import json - - if __name__ == "__main__": data = [] for name in ['disagg_prefill', 'chunked_prefill']: - for qps in [2,4,6,8]: + for qps in [2, 4, 6, 8]: with open(f"results/{name}-qps-{qps}.json", "r") as f: x = json.load(f) x['name'] = name x['qps'] = qps data.append(x) - + df = pd.DataFrame.from_dict(data) dis_df = df[df['name'] == 'disagg_prefill'] chu_df = df[df['name'] == 'chunked_prefill'] - + plt.style.use('bmh') plt.rcParams['font.size'] = 20 - - for key in ['mean_ttft_ms', - 'median_ttft_ms', - 'p99_ttft_ms', - 'mean_itl_ms', - 'median_itl_ms', - 'p99_itl_ms']: - + for key in [ + 'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms', + 'median_itl_ms', 'p99_itl_ms' + ]: + fig, ax = plt.subplots(figsize=(11, 7)) - plt.plot(dis_df['qps'], dis_df[key], label='disagg_prefill', marker='o', linewidth=4) - plt.plot(chu_df['qps'], chu_df[key], label='chunked_prefill', marker='o', linewidth=4) + plt.plot(dis_df['qps'], + dis_df[key], + label='disagg_prefill', + marker='o', + linewidth=4) + plt.plot(chu_df['qps'], + chu_df[key], + label='chunked_prefill', + marker='o', + linewidth=4) ax.legend() ax.set_xlabel('QPS') @@ -42,6 +44,3 @@ ax.set_ylim(bottom=0) fig.savefig(f'results/{key}.png') plt.close(fig) - - - \ No newline at end of file diff --git 
a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index 733bc82bf53f..80802f87987a 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -1,21 +1,22 @@ - from abc import ABC, abstractmethod -from typing import Optional +from typing import List, Optional + import torch class KVLookupBufferBase(ABC): - + @abstractmethod - def insert(self, - input_tokens: torch.Tensor, - kv: torch.Tensor, roi) -> None: + def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, + hidden: torch.Tensor) -> None: raise NotImplementedError - + @abstractmethod - def drop_select(self, input_tokens, roi) -> Optional[torch.Tensor]: + def drop_select(self, input_tokens: torch.Tensor, + roi: torch.Tensor) -> List[Optional[torch.Tensor]]: raise NotImplementedError - + @abstractmethod def close(self): """ diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py index 6172bf092fb0..9696032002fd 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py @@ -1,22 +1,21 @@ - -from vllm.distributed.kv_transfer.kv_lookup_buffer.base import \ - KVLookupBufferBase -from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase -from typing import Dict, Tuple, List, Optional, Union import threading -import torch -from collections import deque import time +from collections import deque +from typing import Deque, List, Optional, Union + +import torch +from vllm.distributed.kv_transfer.kv_lookup_buffer.base import ( + KVLookupBufferBase) +from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase from vllm.logger import init_logger logger = init_logger(__name__) + class SimpleKVLookupBuffer(KVLookupBufferBase): 
- - def __init__(self, - signal_pipe: KVPipeBase, - data_pipe: KVPipeBase, + + def __init__(self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, buffer_size_thresh: int): """ signal_pipe: on CPU @@ -28,72 +27,66 @@ def __init__(self, data_pipe: on device (e.g. GPU) """ - - self.buffer = deque() - + + self.buffer: Deque[List[torch.Tensor]] = deque() + self.buffer_size = 0 self.buffer_size_threshold = buffer_size_thresh self.buffer_lock = threading.Lock() self.signal_pipe = signal_pipe self.data_pipe = data_pipe - self.request_handling_thread = None + self.request_handling_thread: Optional[threading.Thread] = None self.normal_signal = torch.tensor([0]) self.end_signal = None - - def _matches(self, - tokens_roi_sender: List[torch.Tensor], + def _matches(self, tokens_roi_sender: List[torch.Tensor], tokens_roi_recver: List[torch.Tensor]): # tokens_roi_sender: tokens and roi of the producer (in the buffer) # tokens_roi_recver: tokens and roi of the consumer (query) - + tokens_sender = tokens_roi_sender[0] tokens_recver = tokens_roi_recver[0] roi_sender = tokens_roi_sender[1] roi_recver = tokens_roi_recver[1] - + if tokens_recver is None: # consumer sends an empty request # semantics: DROP SELECT * LIMIT 1 # so any of the data in the buffer can be drop-selected return True - # Assuming that roi is a mask on tokens tokens_sender = tokens_sender[roi_sender] tokens_recver = tokens_recver[roi_recver] - - + # simple common prefix matching min_length = min(len(tokens_sender), len(tokens_recver)) - if torch.allclose(tokens_sender[:min_length], tokens_recver[:min_length]): + if torch.allclose(tokens_sender[:min_length], + tokens_recver[:min_length]): return min_length - + return 0 - - def _send_tensor_and_dec_size(self, tensor: Optional[torch.Tensor]) -> None: + def _send_tensor_and_dec_size(self, + tensor: Optional[torch.Tensor]) -> None: assert tensor is not None, "Use self.data_pipe.send(None) instead" self.buffer_size -= tensor.element_size() * tensor.numel() 
self.data_pipe.send_tensor(tensor) def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]): - + if data == [] or data is None: return 0 if isinstance(data, torch.Tensor): return data.element_size() * data.numel() + else: + raise AssertionError("Unknown data type %s" % type(data)) - assert False, "Unknown data type %s" % type(data) - - def _add_to_buffer(self, - input_tokens: torch.Tensor, - roi: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, + def _add_to_buffer(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, hidden: torch.Tensor): if isinstance(input_tokens, torch.Tensor): @@ -107,21 +100,20 @@ def _add_to_buffer(self, if isinstance(hidden, torch.Tensor): hidden = hidden.clone() - buffer_item = [input_tokens, roi, key, value, hidden] - + with self.buffer_lock: for data in buffer_item: self.buffer_size += self._get_element_size(data) self.buffer.append(buffer_item) - + def _is_end_signal(self, signal): return signal is None - + def drop_select_handler(self): try: - + while True: signal = self.signal_pipe.recv_tensor() if self._is_end_signal(signal): @@ -132,28 +124,29 @@ def drop_select_handler(self): roi = self.data_pipe.recv_tensor() tokens_roi_recver = [input_tokens, roi] - + matched_length = 0 - + # perform input tokens and roi matching with self.buffer_lock: for _ in range(len(self.buffer)): - - temp_length = self._matches(self.buffer[0], tokens_roi_recver) + + temp_length = self._matches(self.buffer[0], + tokens_roi_recver) if temp_length > 0: matched_length = temp_length break # rotate the element we just accessed to the end self.buffer.rotate(-1) - + if matched_length > 0: # need to clone the tensor # in case the tensor is freed before sending finishes matched_item = self.buffer.popleft() for tensor in matched_item: self._send_tensor_and_dec_size(tensor) - + else: # no match, just send None for _ in range(5): @@ -164,60 +157,57 @@ def drop_select_handler(self): raise e 
logger.debug("Closing drop_select_handler") - - - def drop_select(self, - input_tokens: torch.Tensor, - roi: torch.Tensor): - + + def drop_select(self, input_tokens: torch.Tensor, + roi: torch.Tensor) -> List[Optional[torch.Tensor]]: + assert self.request_handling_thread is None, \ "drop_select should be called by the receiver" - if isinstance(input_tokens, torch.Tensor): input_tokens = input_tokens.clone() if isinstance(roi, torch.Tensor): roi = roi.clone() - + self.signal_pipe.send_tensor(self.normal_signal) self.data_pipe.send_tensor(input_tokens) self.data_pipe.send_tensor(roi) - + input_tokens = self.data_pipe.recv_tensor() roi = self.data_pipe.recv_tensor() key = self.data_pipe.recv_tensor() value = self.data_pipe.recv_tensor() hidden = self.data_pipe.recv_tensor() - + return [input_tokens, roi, key, value, hidden] - def full_handler(self): time.sleep(0.001) - - - def insert(self, input_tokens, roi, key, value, hidden) -> None: + + def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, + hidden: torch.Tensor) -> None: while self.buffer_size > self.buffer_size_threshold: # logger.debug("KV transfer buffer is full. Handling...") self.full_handler() - self._add_to_buffer(input_tokens, roi, key, value, hidden) - + # when calling the insert, the current process is a sender # need to launch the request handler and start listening to request. 
if self.request_handling_thread is None: self.request_handling_thread = threading.Thread( target=self.drop_select_handler) self.request_handling_thread.start() - - + def close(self): - if hasattr(self, "request_handling_thread") and self.request_handling_thread is not None: + if hasattr(self, "request_handling_thread" + ) and self.request_handling_thread is not None: self.request_handling_thread.join() else: - # TODO: have a explicit close signal and have a explicit way to check if it's requester + # TODO: have a explicit close signal and have a explicit way to + # check if it's requester self.signal_pipe.send_tensor(self.end_signal) diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py index 7662a5893ceb..0955b4e83889 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/base.py +++ b/vllm/distributed/kv_transfer/kv_pipe/base.py @@ -1,15 +1,17 @@ - from abc import ABC, abstractmethod +from typing import Optional + +import torch class KVPipeBase(ABC): - - @abstractmethod - def send_tensor(self, tensor): + + @abstractmethod + def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: raise NotImplementedError - - @abstractmethod - def recv_tensor(self): + + @abstractmethod + def recv_tensor(self) -> Optional[torch.Tensor]: raise NotImplementedError @abstractmethod diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index 3a6a94bb0e75..911bce88a38f 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -1,15 +1,16 @@ -from torch.distributed import Backend -import torch -from typing import List, Optional, Union import threading -from concurrent.futures import ThreadPoolExecutor import time +from concurrent.futures import ThreadPoolExecutor +from typing import List, Optional, Union +import torch +from torch.distributed import Backend + 
+from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase from vllm.logger import init_logger logger = init_logger(__name__) - # if the tensor is only one-element and only contains NONE_INT # this means that the sended object is None. NONE_INT = -150886311 @@ -42,17 +43,17 @@ class BrokenPipeException(Exception): + def __init__(self, message): self.message = message super().__init__(self.message) -class TorchDistributedPipe: +class TorchDistributedPipe(KVPipeBase): METADATA_LENGTH = 16 MAX_TENSOR_DIMENSIONS = 14 METADATA_DTYPE = torch.int64 - def __init__( self, group_ranks: List[List[int]], @@ -65,8 +66,7 @@ def __init__( for ranks in group_ranks: device_group = torch.distributed.new_group( - ranks, backend=torch_distributed_backend - ) + ranks, backend=torch_distributed_backend) if self.rank in ranks: self.ranks = ranks self.world_size = len(ranks) @@ -78,26 +78,24 @@ def __init__( self.device = self._select_device(torch_distributed_backend) - self.target_rank_for_send = self.ranks[ - (self.rank_in_group + 1) % self.world_size - ] - self.target_rank_for_recv = self.ranks[ - (self.rank_in_group - 1) % self.world_size - ] + self.target_rank_for_send = self.ranks[(self.rank_in_group + 1) % + self.world_size] + self.target_rank_for_recv = self.ranks[(self.rank_in_group - 1) % + self.world_size] # FIXME: why we need this? 
torch.set_default_device(self.device) - self.transport_thread = None + self.transport_thread: Optional[ThreadPoolExecutor] = None self.buffer_size = 0 self.buffer_size_lock = threading.Lock() self.none_tensor = torch.tensor([NONE_INT], device=self.device) # On-device tensors to be reused for recv - self.rcv_metadata_buffer = torch.zeros( - self.METADATA_LENGTH, dtype=self.METADATA_DTYPE, device=self.device - ) + self.rcv_metadata_buffer = torch.zeros(self.METADATA_LENGTH, + dtype=self.METADATA_DTYPE, + device=self.device) def _select_device(self, backend: Union[str, Backend]): if torch.cuda.is_available() and backend == Backend.NCCL: @@ -129,14 +127,12 @@ def _make_metadata(self, tensor: torch.Tensor) -> torch.Tensor: buffer[0] = DTYPE2INT[tensor.dtype] ndims = len(tensor.shape) buffer[1] = len(tensor.shape) - buffer[2 : 2 + ndims] = torch.tensor( - tensor.shape, dtype=self.METADATA_DTYPE - ) + buffer[2:2 + ndims] = torch.tensor(tensor.shape, + dtype=self.METADATA_DTYPE) return buffer.to(self.device) - def _prepare_recv_buffer( - self, d_metadata_buffer: torch.Tensor - ) -> torch.Tensor: + def _prepare_recv_buffer(self, + d_metadata_buffer: torch.Tensor) -> torch.Tensor: """ Create a buffer to receive the tensor based on the metadata. @@ -149,7 +145,7 @@ def _prepare_recv_buffer( h_buffer = d_metadata_buffer.cpu().numpy() dtype = INT2DTYPE[h_buffer[0]] ndims = h_buffer[1] - shape = tuple(h_buffer[2 : 2 + ndims]) + shape = tuple(h_buffer[2:2 + ndims]) return torch.empty(shape, dtype=dtype, device=self.device) def _send_metadata(self, d_metadata_buffer: torch.Tensor): @@ -174,7 +170,7 @@ def _recv_metadata(self) -> torch.Tensor: race conditions during sending/receiving. 
Therefore, the metadata buffer can be reused """ - task = torch.distributed.recv( + torch.distributed.recv( self.rcv_metadata_buffer, src=self.target_rank_for_recv, group=self.device_group, @@ -194,9 +190,9 @@ def _send_impl(self, tensor): metadata = self._make_metadata(tensor) self._send_metadata(metadata) - torch.distributed.send( - tensor, dst=self.target_rank_for_send, group=self.device_group - ) + torch.distributed.send(tensor, + dst=self.target_rank_for_send, + group=self.device_group) def _recv_impl(self) -> torch.Tensor: """ @@ -211,9 +207,9 @@ def _recv_impl(self) -> torch.Tensor: d_metadata = self._recv_metadata() buffer = self._prepare_recv_buffer(d_metadata) - torch.distributed.recv( - buffer, src=self.target_rank_for_recv, group=self.device_group - ) + torch.distributed.recv(buffer, + src=self.target_rank_for_recv, + group=self.device_group) return buffer @@ -227,13 +223,9 @@ def send_tensor_wrapper(self, tensor): self.buffer_size = self.buffer_size - tensor_size except Exception as e: logger.error("[rank%d]: Exception when trying to send %s, msg: %s", - torch.distributed.get_rank(), - str(tensor), - str(e)) + torch.distributed.get_rank(), str(tensor), str(e)) import traceback traceback.print_exc() - - def block_if_full(self): """ @@ -268,13 +260,11 @@ def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: with self.buffer_size_lock: self.buffer_size = self.buffer_size + tensor_size - self.transport_thread.submit( self.send_tensor_wrapper, tensor, ) - def recv_tensor(self) -> Optional[torch.Tensor]: """Receives a tensor from the src rank. Blocking.""" if self.transport_thread is None: @@ -300,8 +290,6 @@ def close(self): """ Close the pipe and release the resources. 
""" - if ( - hasattr(self, "transport_thread") - and self.transport_thread is not None - ): + if (hasattr(self, "transport_thread") + and self.transport_thread is not None): self.transport_thread.shutdown() diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 9a6b55cbbe66..03392ec13f10 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -1,59 +1,59 @@ """vLLM distributed KV cache transfer API. These APIs are used in `vllm/worker/worker_base.py`. -Currently supporting TP. The TP between prefill and decode instance needs to be the same. +Currently supporting TP. The TP between prefill and decode instance needs to be +the same. Workflow (disaggregated prefill) - In prefill instance - After prefill, vLLM `insert` its KV caches into a lookup buffer. - - The prefill instance will also open up a thread that listens to `drop_select` request. + - The prefill instance will also open up a thread that listens to + `drop_select` request. - In decode instance - - vLLM first runs `drop_select` to send input tokens and a mask on input tokens (we call it roi, region of interest) to prefill instance + - vLLM first runs `drop_select` to send input tokens and a mask on input + tokens (we call it roi, region of interest) to prefill instance - The prefill instance then respond to `drop_select` request by - Finding a match in current lookup buffer. - Clone and send the matched item out - Delete the matched item in the lookup buffer to free up GPU memory. - The decode vLLM then store the KV cache into paged memory. 
""" -from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING -from collections import defaultdict, deque -from concurrent.futures import ThreadPoolExecutor -from threading import Lock +from typing import TYPE_CHECKING, List, Optional, Tuple, Union + +if TYPE_CHECKING: + from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + from copy import deepcopy -import time -import threading import torch -from torch.distributed import Backend, ProcessGroup +from torch.distributed import Backend import vllm.envs as envs -from vllm.logger import init_logger -import vllm.distributed.parallel_state as ps from vllm import _custom_ops as ops +from vllm.distributed.kv_transfer.kv_lookup_buffer.base import ( + KVLookupBufferBase) +from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer import ( + SimpleKVLookupBuffer) +from vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe import ( + TorchDistributedPipe) +from vllm.logger import init_logger from vllm.sequence import IntermediateTensors -from vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe import TorchDistributedPipe -from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer import SimpleKVLookupBuffer - -from copy import deepcopy -assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode", "lmcache"], \ +assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode", "lmcache"],\ "VLLM_DISAGG_PREFILL_ROLE can only be prefill, decode or lmcache." - # currently the connections are hard-coded. 
# we only handle 2 cases: # - prefill vLLM --> decode vLLM # - vLLM --> LMCache -IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"]) +IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE + in ["prefill", "decode"]) IS_KV_PREFILL_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "prefill") IS_KV_DECODE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "decode") IS_LMCACHE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "lmcache") - logger = init_logger(__name__) -import logging - class KV_transfer_agent: """ @@ -70,11 +70,13 @@ def __init__( local_rank: int, torch_distributed_backend: Union[str, Backend], # FIXME(Kuntai): remove this hardcoding - lookup_buffer_size: int = 1e10 - ): - + lookup_buffer_size: int = int(1e10)): + self.lookup_buffer_size = lookup_buffer_size - + + self.send_buffer: Optional[KVLookupBufferBase] = None + self.recv_buffer: Optional[KVLookupBufferBase] = None + if IS_LMCACHE_INSTANCE: # when vLLM is connected with LMCache # it needs to both send and recv KV cache @@ -98,14 +100,12 @@ def __init__( local_rank, "gloo", ) - self.send_buffer = SimpleKVLookupBuffer( - self.send_signal_pipe, - self.send_pipe, - self.lookup_buffer_size) - self.recv_buffer = SimpleKVLookupBuffer( - self.recv_signal_pipe, - self.recv_pipe, - self.lookup_buffer_size) + self.send_buffer = SimpleKVLookupBuffer(self.send_signal_pipe, + self.send_pipe, + self.lookup_buffer_size) + self.recv_buffer = SimpleKVLookupBuffer(self.recv_signal_pipe, + self.recv_pipe, + self.lookup_buffer_size) else: # when performing disaggregated prefill, only 1 pipe is needed # at prefill instance this pipe is used for send KV cache @@ -120,24 +120,25 @@ def __init__( local_rank, "gloo", ) - buffer = SimpleKVLookupBuffer( - self.signal_pipe, - self.pipe, - self.lookup_buffer_size) + buffer = SimpleKVLookupBuffer(self.signal_pipe, self.pipe, + self.lookup_buffer_size) self.send_buffer = buffer self.recv_buffer = buffer - + def 
send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", kv_caches: List[torch.Tensor], - hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], + hidden_or_intermediate_states: Union[torch.Tensor, + IntermediateTensors], ) -> None: input_tokens_tensor = model_input.input_tokens seq_lens = model_input.attn_metadata.seq_lens slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten() + start_layer = model_executable.model.start_layer + end_layer = model_executable.model.end_layer # query_lens contains new KV caches that are added to vLLM. # so we will send them to decode instance @@ -146,13 +147,11 @@ def send_kv_caches_and_hidden_states( start_pos = sum(seq_lens[:idx]) end_pos = start_pos + slen current_tokens = input_tokens_tensor[start_pos:end_pos] - + keys, values = [], [] - - - for l in range(model_executable.model.start_layer, - model_executable.model.end_layer): - kv_cache = kv_caches[l - model_executable.model.start_layer] + + for layer_id in range(start_layer, end_layer): + kv_cache = kv_caches[layer_id - start_layer] _, _, num_heads, head_size = kv_cache[0].shape @@ -163,29 +162,31 @@ def send_kv_caches_and_hidden_states( keys.append(key_cache[current_slot_mapping].unsqueeze(0)) values.append(value_cache[current_slot_mapping].unsqueeze(0)) - + keys = torch.cat(keys, dim=0) values = torch.cat(values, dim=0) - self.send_buffer.insert( - current_tokens, - torch.ones_like(current_tokens, dtype=bool), - keys, - values, - hidden_or_intermediate_states[start_pos:end_pos] - ) - + if self.send_buffer is not None: + self.send_buffer.insert( + current_tokens, torch.ones_like(current_tokens, + dtype=bool), keys, values, + hidden_or_intermediate_states[start_pos:end_pos]) logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) + def destroy(self) -> None: + if self.send_buffer is not None: + self.send_buffer.close() + if self.recv_buffer is not None: + 
self.recv_buffer.close() def recv_kv_caches_and_hidden_states( - self, - model_executable: torch.nn.Module, + self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", kv_caches: List[torch.Tensor] - ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + "ModelInputForGPUWithSamplingMetadata"]: - # When this flag is set to False, it means that + # When this flag is set to False, it means that bypass_model_exec = True # This is disagg decode instance, during prefill state @@ -199,7 +200,7 @@ def recv_kv_caches_and_hidden_states( input_tokens_list = [] num_computed_tokens_list = [] start_pos_list = [] - + # enumerate different requests # FIXME(Kuntai): This impl assumes that all requests are prefill. for idx, slen in enumerate(seq_lens): @@ -211,28 +212,34 @@ def recv_kv_caches_and_hidden_states( input_tokens_list.append(current_tokens) start_pos_list.append(start_pos) - + + if self.recv_buffer is None: + bypass_model_exec = False + break + ret = self.recv_buffer.drop_select( - current_tokens, - torch.ones_like(current_tokens, dtype=bool)) + current_tokens, torch.ones_like(current_tokens, dtype=bool)) if ret[0] is None: # didn't find any match. 
bypass_model_exec = False num_computed_tokens_list.append(0) continue - + # TODO(Jiayi): change the logic here (need roi) - _, roi, keys, values, hidden = ret - + roi: torch.Tensor = ret[1] + keys: torch.Tensor = ret[2] + values: torch.Tensor = ret[3] + hidden: torch.Tensor = ret[4] + # Jiayi: currently assume roi is a prefix - num_computed_tokens = len(roi) + num_computed_tokens = roi.shape[0] num_computed_tokens_list.append(num_computed_tokens) is_complete = (num_computed_tokens == num_tokens) end_pos = start_pos + num_computed_tokens - + # receive KV cache from disaggregated prefill instance for i in range(model_executable.model.start_layer, - model_executable.model.end_layer): + model_executable.model.end_layer): kv_cache = kv_caches[i - model_executable.model.start_layer] layer = model_executable.model.layers[i] @@ -251,12 +258,13 @@ def recv_kv_caches_and_hidden_states( hidden_or_intermediate_states_for_one_req.append(hidden) - # FIXME(Jiayi): we need to support only skip m out of n reqs in a batch + # FIXME(Jiayi): we need to support only skip m out of n reqs in a batch # same for prefix caching if not bypass_model_exec: # Some of the KV cache is not retrieved # so we need to recompute the hidden state - logger.debug("[rank%d]: KV EMPTY recv DONE.", torch.distributed.get_rank()) + logger.debug("[rank%d]: KV EMPTY recv DONE.", + torch.distributed.get_rank()) return None, bypass_model_exec, None if not is_complete: @@ -268,17 +276,17 @@ def recv_kv_caches_and_hidden_states( slot_mapping, device=kv_cache[0].device, ) - logger.debug("[rank%d]: KV PARTIAL recv DONE.", torch.distributed.get_rank()) + logger.debug("[rank%d]: KV PARTIAL recv DONE.", + torch.distributed.get_rank()) return None, bypass_model_exec, rebuilt_model_input - + # concatenate hidden states from different requests hidden_or_intermediate_states = torch.cat( hidden_or_intermediate_states_for_one_req, dim=0) logger.debug("[rank%d]: KV recv DONE.", torch.distributed.get_rank()) return 
hidden_or_intermediate_states, bypass_model_exec, model_input - - + def build_partial_prefill_input( self, model_input: "ModelInputForGPUWithSamplingMetadata", @@ -289,70 +297,77 @@ def build_partial_prefill_input( device: torch.device, ) -> "ModelInputForGPUWithSamplingMetadata": rebuilt_input_tokens = [] - rebuilt_input_positions= [] + rebuilt_input_positions = [] rebuilt_query_lens = [] - + rebuilt_num_prefills = 0 rebuilt_num_prefill_tokens = 0 rebuilt_slot_mapping = [] rebuilt_max_query_len = 0 - + rebuilt_block_tables = [] - + rebuilt_query_start_loc = [0] rebuilt_context_lens_tensor = [] rebuilt_selected_token_indices = [] - + # recounting query and context lengths for idx in range(len(input_tokens_list)): token_tensor = input_tokens_list[idx] num_token = len(token_tensor) num_computed_token = num_computed_tokens_list[idx] start_pos = start_pos_list[idx] - + rebuilt_input_tokens.append(token_tensor[num_computed_token:]) # TODO(Jiayi): please check the correctness of next line - rebuilt_input_positions.append(model_input.input_positions[start_pos+num_computed_token:start_pos+num_token]) + rebuilt_input_positions.append( + model_input.input_positions[start_pos + + num_computed_token:start_pos + + num_token]) q_len = num_token - num_computed_token rebuilt_query_lens.append(q_len) - + # Attn metadata-related rebuilt_num_prefills += 1 rebuilt_num_prefill_tokens += q_len - rebuilt_slot_mapping.append(slot_mapping_flat[start_pos+num_computed_token:start_pos+num_token]) + rebuilt_slot_mapping.append( + slot_mapping_flat[start_pos + num_computed_token:start_pos + + num_token]) rebuilt_max_query_len = max(q_len, rebuilt_max_query_len) # TODO(Jiayi): remove hard-code (block_size=16) blk_size = 16 - temp_block_table = [i//blk_size for i in range(start_pos, start_pos+num_token, blk_size)] + temp_block_table = [ + i // blk_size + for i in range(start_pos, start_pos + num_token, blk_size) + ] rebuilt_block_tables.append(temp_block_table) - 
rebuilt_query_start_loc.append(q_len) #start with 0 + rebuilt_query_start_loc.append(q_len) #start with 0 rebuilt_context_lens_tensor.append(num_computed_token) - + # Sampling metadata related #seq_groups (use rebuilt query lens) - rebuilt_selected_token_indices.append(start_pos+q_len-1) - - + rebuilt_selected_token_indices.append(start_pos + q_len - 1) + # rebuilt attn_metadata rebuilt_attn_metadata = deepcopy(model_input.attn_metadata) rebuilt_attn_metadata.num_prefills = rebuilt_num_prefills rebuilt_attn_metadata.num_prefill_tokens = rebuilt_num_prefill_tokens - rebuilt_attn_metadata.slot_mapping = torch.cat(rebuilt_slot_mapping).to(device) + rebuilt_attn_metadata.slot_mapping = torch.cat( + rebuilt_slot_mapping).to(device) rebuilt_attn_metadata.max_query_len = rebuilt_max_query_len - + rebuilt_attn_metadata.block_tables = torch.tensor( rebuilt_block_tables, - dtype=model_input.attn_metadata.block_tables.dtype - ).to(device) - + dtype=model_input.attn_metadata.block_tables.dtype).to(device) + rebuilt_attn_metadata.query_start_loc = torch.tensor( rebuilt_query_start_loc, dtype=model_input.attn_metadata.query_start_loc.dtype).to(device) rebuilt_attn_metadata.context_lens_tensor = torch.tensor( - rebuilt_context_lens_tensor, + rebuilt_context_lens_tensor, dtype=model_input.attn_metadata.context_lens_tensor.dtype, - ).to(device) - + ).to(device) + rebuilt_attn_metadata._cached_prefill_metadata = None # rebuilt sampling_metadata @@ -362,26 +377,27 @@ def build_partial_prefill_input( rebuilt_sampling_metadata.selected_token_indices = torch.tensor( rebuilt_selected_token_indices, dtype=model_input.sampling_metadata.selected_token_indices.dtype, - ).to(device) - + ).to(device) + # import here to avoid circular import. 
- from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + from vllm.worker.model_runner import ( + ModelInputForGPUWithSamplingMetadata) rebuilt_model_input = ModelInputForGPUWithSamplingMetadata( - input_tokens = torch.cat(rebuilt_input_tokens).to(device), - input_positions = torch.cat(rebuilt_input_positions).to(device), - seq_lens = model_input.seq_lens, - query_lens = rebuilt_query_lens, - lora_mapping = model_input.lora_mapping, - lora_requests = model_input.lora_requests, - attn_metadata = rebuilt_attn_metadata, - prompt_adapter_mapping = model_input.prompt_adapter_mapping, - prompt_adapter_requests = model_input.prompt_adapter_requests, - multi_modal_kwargs = model_input.multi_modal_kwargs, - request_ids_to_seq_ids = model_input.request_ids_to_seq_ids, - finished_requests_ids = model_input.finished_requests_ids, - virtual_engine = model_input.virtual_engine, - sampling_metadata = rebuilt_sampling_metadata, - is_prompt = model_input.is_prompt, + input_tokens=torch.cat(rebuilt_input_tokens).to(device), + input_positions=torch.cat(rebuilt_input_positions).to(device), + seq_lens=model_input.seq_lens, + query_lens=rebuilt_query_lens, + lora_mapping=model_input.lora_mapping, + lora_requests=model_input.lora_requests, + attn_metadata=rebuilt_attn_metadata, + prompt_adapter_mapping=model_input.prompt_adapter_mapping, + prompt_adapter_requests=model_input.prompt_adapter_requests, + multi_modal_kwargs=model_input.multi_modal_kwargs, + request_ids_to_seq_ids=model_input.request_ids_to_seq_ids, + finished_requests_ids=model_input.finished_requests_ids, + virtual_engine=model_input.virtual_engine, + sampling_metadata=rebuilt_sampling_metadata, + is_prompt=model_input.is_prompt, ) - - return rebuilt_model_input \ No newline at end of file + + return rebuilt_model_input diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 13527110a223..3615fa6af399 100644 --- a/vllm/distributed/parallel_state.py +++ 
b/vllm/distributed/parallel_state.py @@ -20,29 +20,25 @@ parallelism, you can skip the model parallel initialization and destruction steps. """ -import time import contextlib import pickle -import logging +import time from collections import namedtuple from contextlib import contextmanager, nullcontext from dataclasses import dataclass from multiprocessing import shared_memory from typing import Any, Dict, List, Optional, Tuple, Union from unittest.mock import patch -import queue import torch import torch.distributed from torch.distributed import Backend, ProcessGroup -import vllm.envs as envs -from vllm.logger import init_logger - - # Use this import to check if disagg prefill is enabled. # if enabled, need to adjust distributed group correspondingly. import vllm.distributed.kv_transfer.vllm_adapter as dist_kv +import vllm.envs as envs +from vllm.logger import init_logger @dataclass @@ -865,7 +861,8 @@ def include_decoding_groups_if_disagg_enabled( Extended: [ [0,1], [2,3], [4,5], [6,7] ] Arguments: groups: original distributed group - world_size: the vLLM world size, which is half of torch.distributed.get_world_size() + world_size: the vLLM world size, which is half of + torch.distributed.get_world_size() """ if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: @@ -908,9 +905,8 @@ def init_distributed_environment( # offset global rank by tp * pp (which is world_size) maybe_disagg_rank = rank + world_size - logger.debug( - f"Before: world size {maybe_disagg_world_size}, rank {maybe_disagg_rank}" - ) + logger.debug("Before: world size %d, rank %d", maybe_disagg_world_size, + maybe_disagg_rank) torch.distributed.init_process_group( backend=backend, @@ -974,17 +970,18 @@ def initialize_model_parallel( ranks 8 to 15 belong to the second box. - Disaggregated prefill will also initialize its process group using this function. + Disaggregated prefill will also init its process group using this function. 
Changes: - vLLM world size: unchanged (tp * pp) - torch.distributed.get_world_size(): - 2 * tp * pp - - Why: torch.distributed package sees 2 vLLM instances (prefill and decode) + - Why: both prefill vLLM and decode vLLM is in the world - Global rank: - [0, tp * pp) for prefill - [tp * pp, 2 * tp * pp) for decode - Parallel groups - - Extend _WORLD, _TP and _PP using `include_decoding_groups_if_disagg_enabled` + - Extend _WORLD, _TP and _PP using + `include_decoding_groups_if_disagg_enabled` - Add a new parallel group `_DISAGG` for disaggregated prefill - [ [0, tp * pp], [1, tp * pp + 1], .. ] - Local rank: unchanged @@ -997,12 +994,14 @@ def initialize_model_parallel( get_world_group().device_group) if dist_kv.IS_DISTRIBUTED_KV_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE: # Disaggregated prefill enabled - # The world_size for this vLLM instance is tp * pp, but torch.distributed contains 2 vLLM instances, its world size is 2 * tp * pp + # The world_size for this vLLM instance is tp * pp, but + # torch.distributed contains 2 vLLM instances, + # its world size is 2 * tp * pp # Adjust the world_size to match. 
world_size = world_size // 2 - if (world_size - != tensor_model_parallel_size * pipeline_model_parallel_size): + if (world_size != + tensor_model_parallel_size * pipeline_model_parallel_size): raise RuntimeError( f"world_size ({world_size}) is not equal to " f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 679f8394688e..b774a649d39f 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -48,7 +48,8 @@ def _get_worker_kwargs( """Return worker init args for a given rank.""" if distributed_init_method is None: distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port(force=dist_kv.IS_DISTRIBUTED_KV_INSTANCE)) + get_ip(), + get_open_port(force=dist_kv.IS_DISTRIBUTED_KV_INSTANCE)) return dict( model_config=self.model_config, parallel_config=self.parallel_config, diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 944822887945..499e891d98fc 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -70,7 +70,8 @@ def _init_executor(self) -> None: # Since it only works for single node, we can use the loopback address # 127.0.0.1 for communication. 
distributed_init_method = get_distributed_init_method( - "127.0.0.1", get_open_port(force=dist_kv.IS_DISTRIBUTED_KV_INSTANCE)) + "127.0.0.1", + get_open_port(force=dist_kv.IS_DISTRIBUTED_KV_INSTANCE)) self.workers: List[ProcessWorkerWrapper] = [] # This is the list of workers that are rank 0 of each TP group EXCEPT diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index c646e8536ba1..0cca5db1677e 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -6,8 +6,8 @@ import msgspec -import vllm.envs as envs import vllm.distributed.kv_transfer.vllm_adapter as dist_kv +import vllm.envs as envs from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.msgspec_utils import encode_hook diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ab38302b3321..b846d1d707db 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -14,7 +14,6 @@ import torch.distributed import torch.nn as nn - import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState @@ -55,7 +54,6 @@ _add_sampling_metadata_broadcastable_dict, _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict, dump_input_when_exception) -from vllm import _custom_ops as ops if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionBackend @@ -1546,30 +1544,20 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) - + 
input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalInputs.as_kwargs(multi_modal_kwargs, + device=self.device), + **seqlen_agnostic_kwargs) + if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_end.record() - return hidden_or_intermediate_states - - @torch.inference_mode() - def postprocess_model( - self, - model_input, - hidden_or_intermediate_states, - - ): if not get_pp_group().is_last_rank: if (self.is_driver_worker and hidden_or_intermediate_states is not None @@ -1587,7 +1575,16 @@ def postprocess_model( hidden_or_intermediate_states.tensors["model_forward_time"] = ( torch.tensor(model_forward_time + orig_model_forward_time)) return hidden_or_intermediate_states - + + return hidden_or_intermediate_states + + @torch.inference_mode() + def postprocess_model( + self, + model_input: ModelInputForGPUWithSamplingMetadata, + hidden_or_intermediate_states, + ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: + logits = self.model.compute_logits(hidden_or_intermediate_states, model_input.sampling_metadata) @@ -1603,6 +1600,7 @@ def postprocess_model( sampling_metadata=model_input.sampling_metadata, ) + assert model_input.attn_metadata is not None decode_meta = model_input.attn_metadata.decode_metadata if self.return_hidden_states: # we only need to pass hidden states of most recent token @@ -1620,9 +1618,7 @@ def postprocess_model( output.hidden_states = hidden_states return [output] - - - + class CUDAGraphRunner: diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 22577ebf6949..7908fc466eb3 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -7,6 +7,8 @@ import torch +import vllm.distributed.kv_transfer.vllm_adapter as dist_kv +import vllm.distributed.parallel_state as ps from vllm.config import 
ObservabilityConfig from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger @@ -16,13 +18,11 @@ from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import (enable_trace_function_call_for_thread, update_environment_variables) +from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner from vllm.worker.model_runner_base import (BroadcastableModelInput, ModelRunnerBase, ModelRunnerInputBase) -import vllm.distributed.kv_transfer.vllm_adapter as dist_kv -import vllm.distributed.parallel_state as ps - logger = init_logger(__name__) @@ -223,7 +223,6 @@ def execute_worker(self, worker_input: WorkerInput) -> None: Process an execution request. """ raise NotImplementedError - def _get_worker_input_from_broadcast( self @@ -327,19 +326,14 @@ def execute_model( and self.observability_config.collect_model_execute_time): orig_model_execute_time = intermediate_tensors.tensors.get( "model_execute_time", torch.tensor(0)).item() - - + # for disaggregated prefilling: allow bypassing model execution bypass_model_exec = False - - - # receive KV cache. - # NOTE(kuntai): - # If only a part of KV cache is received, we will adjust model_input - # to avoid prefill on the part of KV caches that are already received. - # This will not happen for disaggregated prefill, but will happen - # when connecting to a KV cache database (like LMCache). + + # receive KV cache from prefill instance, or from LMCache if self.need_recv_kv(model_input, worker_input): + assert isinstance(self.model_runner, GPUModelRunnerBase), \ + "Distributed KV transfer only support GPU modelrunner" hidden_or_intermediate_states, bypass_model_exec, model_input = \ ps.get_disagg_group().recv_kv_caches_and_hidden_states( # model is used to know which layer the current worker @@ -347,11 +341,12 @@ def execute_model( # layers. 
self.model_runner.model, model_input, - self.kv_cache[worker_input.virtual_engine], + kv_caches=self.kv_cache[worker_input.virtual_engine] + if self.kv_cache is not None else None, ) #assert bypass_model_exec - - if not bypass_model_exec: + + if not bypass_model_exec: hidden_or_intermediate_states = self.model_runner.execute_model( model_input=model_input, kv_caches=self.kv_cache[worker_input.virtual_engine] @@ -360,24 +355,31 @@ def execute_model( num_steps=num_steps, **kwargs, ) - + # sending out KV cache if self.need_send_kv(model_input, worker_input): + assert isinstance(self.model_runner, GPUModelRunnerBase), \ + "Distributed KV transfer only support GPU modelrunner" ps.get_disagg_group().send_kv_caches_and_hidden_states( # model is used to know which layer the current worker # is working on, so that we can send KV for only those # layers. self.model_runner.model, model_input, - self.kv_cache[worker_input.virtual_engine], + self.kv_cache[worker_input.virtual_engine] + if self.kv_cache is not None else None, hidden_or_intermediate_states, ) - - # Get model output based on hidden state. 
- output = self.model_runner.postprocess_model( - model_input, - hidden_or_intermediate_states, - ) + + # separating postprocessing steps out from execute_model + # so that disaggregated prefill can completely bypass model forwarding + if isinstance(self.model_runner, ModelRunner): + output = self.model_runner.postprocess_model( + model_input, + hidden_or_intermediate_states, + ) + else: + output = hidden_or_intermediate_states model_execute_time = time.perf_counter() - start_time if not get_pp_group().is_last_rank: @@ -400,38 +402,43 @@ def execute_model( return output def need_recv_kv(self, model_input, worker_input) -> bool: - + + if self.kv_cache is None: + return False + kv_caches = self.kv_cache[worker_input.virtual_engine] prefill_meta = model_input.attn_metadata.prefill_metadata - + # check if the current run is profiling is_profile_run = (kv_caches is None) or (kv_caches[0] is None) # check if the current run is prefill is_prefill_run = prefill_meta is not None # for disaggregated prefilling: allow bypassing model execution - + return all([ - is_prefill_run, - dist_kv.IS_KV_DECODE_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE, - not is_profile_run]) + is_prefill_run, dist_kv.IS_KV_DECODE_INSTANCE + or dist_kv.IS_LMCACHE_INSTANCE, not is_profile_run + ]) - def need_send_kv(self, model_input, worker_input) -> bool: - + + if self.kv_cache is None: + return False + kv_caches = self.kv_cache[worker_input.virtual_engine] prefill_meta = model_input.attn_metadata.prefill_metadata - model_executable = self.model_runner.model - + if not isinstance(self.model_runner, GPUModelRunnerBase): + return False + # check if the current run is profiling is_profile_run = (kv_caches is None) or (kv_caches[0] is None) # check if the current run is prefill is_prefill_run = prefill_meta is not None - + return all([ - is_prefill_run, - dist_kv.IS_KV_PREFILL_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE, - not is_profile_run]) - + is_prefill_run, dist_kv.IS_KV_PREFILL_INSTANCE + or 
dist_kv.IS_LMCACHE_INSTANCE, not is_profile_run + ]) def _execute_model_spmd( self, From 36a382c961c75aa00733a5d04022a2ad1a17b229 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 23:48:40 +0000 Subject: [PATCH 208/278] resolve circular import --- vllm/utils.py | 2 +- vllm/worker/worker_base.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 1adab6191726..8e27e1f73f4b 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -535,7 +535,7 @@ def get_open_port(force: bool = False) -> int: if force and port is not None: # force vLLM to use envs.VLLM_PORT for torch.distributed init # This is because this port will binded by prefill instance - # But both prefill and decode instance need to use this port to + # But both prefill and decode instance need to use this port to # initialize torch.distributed return port while True: diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 7908fc466eb3..d55400a40240 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -18,7 +18,6 @@ from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import (enable_trace_function_call_for_thread, update_environment_variables) -from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner from vllm.worker.model_runner_base import (BroadcastableModelInput, ModelRunnerBase, ModelRunnerInputBase) @@ -332,6 +331,7 @@ def execute_model( # receive KV cache from prefill instance, or from LMCache if self.need_recv_kv(model_input, worker_input): + from vllm.worker.model_runner import GPUModelRunnerBase assert isinstance(self.model_runner, GPUModelRunnerBase), \ "Distributed KV transfer only support GPU modelrunner" hidden_or_intermediate_states, bypass_model_exec, model_input = \ @@ -358,6 +358,7 @@ def execute_model( # sending out KV cache if self.need_send_kv(model_input, worker_input): + from vllm.worker.model_runner import GPUModelRunnerBase assert 
isinstance(self.model_runner, GPUModelRunnerBase), \ "Distributed KV transfer only support GPU modelrunner" ps.get_disagg_group().send_kv_caches_and_hidden_states( @@ -373,6 +374,7 @@ def execute_model( # separating postprocessing steps out from execute_model # so that disaggregated prefill can completely bypass model forwarding + from vllm.worker.model_runner import ModelRunner if isinstance(self.model_runner, ModelRunner): output = self.model_runner.postprocess_model( model_input, @@ -427,6 +429,7 @@ def need_send_kv(self, model_input, worker_input) -> bool: kv_caches = self.kv_cache[worker_input.virtual_engine] prefill_meta = model_input.attn_metadata.prefill_metadata + from vllm.worker.model_runner import GPUModelRunnerBase if not isinstance(self.model_runner, GPUModelRunnerBase): return False From a0867dd1cf73ae998f6051875a7949026f98cf26 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 23:49:01 +0000 Subject: [PATCH 209/278] fix redundant import --- tests/kv_transfer/test_send_recv.py | 81 ++++++++++++++--------------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 4bf757d7c849..994b907e0c89 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -1,10 +1,11 @@ +import os +import time +from typing import List -import vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe as tdp import torch -import os -import random from tqdm import tqdm -import time + +import vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe as tdp def test_run(my_rank, pipe): @@ -35,20 +36,19 @@ def test_run(my_rank, pipe): assert torch.allclose(y, y2) - def stress_test(my_rank, pipe): - + torch.distributed.barrier() - - tensors = [] - - + + tensors: List[torch.Tensor] = [] + for i in tqdm(range(2000)): mean = torch.rand(1).item() std = torch.rand(1).item() - size = torch.randint(900, 1000, (2,)) - x = torch.normal(mean * 1.0, std * 
1.0, size=size.tolist()).to(pipe.device) - + size = torch.randint(900, 1000, (2, )) + x = torch.normal(mean * 1.0, std * 1.0, + size=size.tolist()).to(pipe.device) + # 5% probability of sending a None if torch.rand(1).item() < 0.05: tensors.append(None) @@ -59,15 +59,13 @@ def stress_test(my_rank, pipe): tensors.append(x.mean().unsqueeze(0)) tensors.append(x.std().unsqueeze(0)) - - torch.distributed.barrier() - + for i in tqdm(range(2000)): if my_rank == int((i % 10) > 3): - pipe.send_tensor(tensors[3*i]) - pipe.send_tensor(tensors[3*i+1]) - pipe.send_tensor(tensors[3*i+2]) + pipe.send_tensor(tensors[3 * i]) + pipe.send_tensor(tensors[3 * i + 1]) + pipe.send_tensor(tensors[3 * i + 2]) else: x = pipe.recv_tensor() mean = pipe.recv_tensor() @@ -76,34 +74,36 @@ def stress_test(my_rank, pipe): assert mean is None assert std is None else: - assert torch.allclose(x, tensors[3*i]) + assert torch.allclose(x, tensors[3 * i]) assert x.mean() == mean[0] assert x.std() == std[0] torch.distributed.barrier() print("Stress test passed.") - - - + + def latency_test(my_rank, pipe, nelement, ntensor): - + latencies = [] - + torch.distributed.barrier() - + for i in tqdm(range(1000)): - + tensors = [] - + if my_rank == 0: # create tensor - tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)] - + tensors = [ + torch.rand(nelement).to(pipe.device) for _ in range(ntensor) + ] + torch.distributed.barrier() - + if my_rank == 0: - t = torch.tensor([time.time()], dtype=torch.float64).to(pipe.device) + t = torch.tensor([time.time()], + dtype=torch.float64).to(pipe.device) for tensor in tensors: pipe.send_tensor(tensor) pipe.send_tensor(t) @@ -114,7 +114,7 @@ def latency_test(my_rank, pipe, nelement, ntensor): latencies.append(time.time() - t.item()) torch.distributed.barrier() - + print('Latency test passed.') print('Latency:', torch.tensor(latencies).mean().item() * 1000, 'ms') @@ -123,18 +123,15 @@ def latency_test(my_rank, pipe, nelement, ntensor): my_rank = 
int(os.environ['RANK']) - - torch.distributed.init_process_group( - init_method="tcp://127.0.0.1:23456", - world_size=2, - rank=my_rank) + torch.distributed.init_process_group(init_method="tcp://127.0.0.1:23456", + world_size=2, + rank=my_rank) print("initialized! My rank is %d" % my_rank) + pipe = tdp.TorchDistributedPipe([[0, 1]], my_rank, "nccl") - pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "nccl") - - torch.manual_seed(0) + torch.manual_seed(0) test_run(my_rank, pipe) stress_test(my_rank, pipe) - latency_test(my_rank, pipe, 1024*8*128, 80) + latency_test(my_rank, pipe, 1024 * 8 * 128, 80) From 7f90903a448755fe9c00b252ec9d17a4c6566f61 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Sun, 15 Sep 2024 23:55:50 +0000 Subject: [PATCH 210/278] rename to a shorter name --- tests/kv_transfer/test_lookup_buffer.py | 80 +++++++++---------- ...e_kv_lookup_buffer.py => simple_buffer.py} | 0 vllm/distributed/kv_transfer/vllm_adapter.py | 5 +- 3 files changed, 42 insertions(+), 43 deletions(-) rename vllm/distributed/kv_transfer/kv_lookup_buffer/{simple_kv_lookup_buffer.py => simple_buffer.py} (100%) diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index ae19d068be9f..0730f091a34b 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -1,24 +1,25 @@ - -import vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe as tdp -import vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer as sklb -import torch import os import random + +import torch from tqdm import tqdm -import time -# TODO: the test depends on a lot of fields in the current implementation. We should have standard interface instead direct field access +import vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer as sklb +import vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe as tdp + +# TODO: the test depends on a lot of fields in the current implementation. 
+# We should have standard interface instead direct field access + def test_run(my_rank, buffer, device): - - # buffer should be empty in the beginning + + # buffer should be empty in the beginning if my_rank == 0: assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - # insert - tokens = torch.tensor([1,2,3]).to(device) + tokens = torch.tensor([1, 2, 3]).to(device) roi = (tokens > 0) if my_rank == 0: key = 2.0 * torch.ones([5, 6]).to(device) @@ -27,45 +28,47 @@ def test_run(my_rank, buffer, device): placeholder = torch.tensor([1]).to(device) buffer.insert(tokens, roi, key, value, placeholder) - + torch.distributed.barrier() - + # drop_select if my_rank == 1: tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi) assert torch.allclose(tokens, tok) assert torch.allclose(roi, roi_) - assert torch.allclose(key, 2.0 * torch.ones([5, 6], device = device)) - assert torch.allclose(value, 3.0 * torch.ones([5, 6], device = device)) + assert torch.allclose(key, 2.0 * torch.ones([5, 6], device=device)) + assert torch.allclose(value, 3.0 * torch.ones([5, 6], device=device)) torch.distributed.barrier() - + if my_rank == 0: assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - + print("Test run passed!") + def stress_test(my_rank, buf, device): - + torch.distributed.barrier() torch.manual_seed(100) reqs = [ ( - torch.rand(100).to(device), # tokens - torch.ones(100).bool().to(device), # roi - torch.rand(100).to(device), # key - torch.rand(100).to(device), # value - torch.rand(100).to(device), # hidden - ) for i in tqdm(range(200))] + torch.rand(100).to(device), # tokens + torch.ones(100).bool().to(device), # roi + torch.rand(100).to(device), # key + torch.rand(100).to(device), # value + torch.rand(100).to(device), # hidden + ) for i in tqdm(range(200)) + ] random.seed(my_rank) random.shuffle(reqs) - + torch.distributed.barrier() - + n = 0 - + # the buffer size can only store 100 reqs # so the sender will occasionally block to wait for the receiver. 
for req in tqdm(reqs): @@ -74,7 +77,7 @@ def stress_test(my_rank, buf, device): else: tok, roi, k, v, h = req tok_, roi_, k_, v_, h_ = buf.drop_select(tok, roi) - + if tok_ is None: assert roi_ is None assert k_ is None @@ -89,8 +92,7 @@ def stress_test(my_rank, buf, device): assert torch.allclose(h, h_) print('Rank %d done' % my_rank) torch.distributed.barrier() - - + if my_rank == 0: x = torch.tensor([0]) torch.distributed.recv(x, 1) @@ -103,30 +105,26 @@ def stress_test(my_rank, buf, device): torch.distributed.send(torch.tensor([n]), 0) print("Passed stress test!") - - + if __name__ == "__main__": my_rank = int(os.environ['RANK']) - - torch.distributed.init_process_group( - init_method="tcp://127.0.0.1:23456", - world_size=2, - rank=my_rank) + torch.distributed.init_process_group(init_method="tcp://127.0.0.1:23456", + world_size=2, + rank=my_rank) print("initialized! My rank is %d" % my_rank) - - pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "nccl") - cpu_pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "gloo") + pipe = tdp.TorchDistributedPipe([[0, 1]], my_rank, "nccl") + cpu_pipe = tdp.TorchDistributedPipe([[0, 1]], my_rank, "gloo") buffer = sklb.SimpleKVLookupBuffer(cpu_pipe, pipe, 170000) test_run(my_rank, buffer, pipe.device) - + stress_test(my_rank, buffer, pipe.device) - + buffer.close() pipe.close() cpu_pipe.close() diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py similarity index 100% rename from vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py rename to vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 03392ec13f10..2edb426c5c8d 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -28,12 +28,11 @@ import torch from torch.distributed import 
Backend +import vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer as sklb import vllm.envs as envs from vllm import _custom_ops as ops from vllm.distributed.kv_transfer.kv_lookup_buffer.base import ( KVLookupBufferBase) -from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer import ( - SimpleKVLookupBuffer) from vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe import ( TorchDistributedPipe) from vllm.logger import init_logger @@ -77,6 +76,8 @@ def __init__( self.send_buffer: Optional[KVLookupBufferBase] = None self.recv_buffer: Optional[KVLookupBufferBase] = None + SimpleKVLookupBuffer = sklb.SimpleKVLookupBuffer + if IS_LMCACHE_INSTANCE: # when vLLM is connected with LMCache # it needs to both send and recv KV cache From 5ca22fb44866e9b5154fa09844d71a9e34c18729 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Mon, 16 Sep 2024 00:06:43 +0000 Subject: [PATCH 211/278] remove unnecessary file --- tests/test_send_recv.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/test_send_recv.sh diff --git a/tests/test_send_recv.sh b/tests/test_send_recv.sh deleted file mode 100644 index e69de29bb2d1..000000000000 From 073642bde894301ced000178eaec7f76562e78be Mon Sep 17 00:00:00 2001 From: ApostaC Date: Mon, 16 Sep 2024 00:40:26 +0000 Subject: [PATCH 212/278] update kv transfer test --- tests/kv_transfer/test_launcher.py | 52 +++++++++++++++++++++++++ tests/kv_transfer/test_lookup_buffer.sh | 3 -- tests/kv_transfer/test_send_recv.py | 10 +++-- tests/kv_transfer/test_send_recv.sh | 3 -- 4 files changed, 58 insertions(+), 10 deletions(-) create mode 100644 tests/kv_transfer/test_launcher.py delete mode 100644 tests/kv_transfer/test_lookup_buffer.sh delete mode 100644 tests/kv_transfer/test_send_recv.sh diff --git a/tests/kv_transfer/test_launcher.py b/tests/kv_transfer/test_launcher.py new file mode 100644 index 000000000000..5c0aeb04b43f --- /dev/null +++ b/tests/kv_transfer/test_launcher.py @@ -0,0 +1,52 @@ 
+import subprocess +import pytest +import sys +import torch + +def run_python_script(script_name, timeout): + try: + # Start both processes asynchronously using Popen + process0 = subprocess.Popen( + [sys.executable, script_name], + env={"RANK": "0"}, # Set the RANK environment variable for process 0 + stdout=sys.stdout, # Pipe stdout to current stdout + stderr=sys.stderr, # Pipe stderr to current stderr + ) + + process1 = subprocess.Popen( + [sys.executable, script_name], + env={"RANK": "1"}, # Set the RANK environment variable for process 1 + stdout=sys.stdout, # Pipe stdout to current stdout + stderr=sys.stderr, # Pipe stderr to current stderr + ) + + # Wait for both processes to complete, with a timeout + process0.wait(timeout=timeout) + process1.wait(timeout=timeout) + + # Check the return status of both processes + if process0.returncode != 0: + pytest.fail(f"Test {script_name} failed for RANK=0 with return code {process0.returncode}") + if process1.returncode != 0: + pytest.fail(f"Test {script_name} failed for RANK=1 with return code {process1.returncode}") + + except subprocess.TimeoutExpired: + # If either process times out, terminate both and fail the test + process0.terminate() + process1.terminate() + pytest.fail(f"Test {script_name} timed out") + except Exception as e: + pytest.fail(f"Test {script_name} failed with error: {str(e)}") + +# Define the test cases using pytest's parametrize +@pytest.mark.parametrize("script_name,timeout", [ + ("test_lookup_buffer.py", 60), # Second test case with a 60-second timeout + ("test_send_recv.py", 120) # First test case with a 120-second timeout +]) +def test_run_python_script(script_name, timeout): + # Check the number of GPUs + if torch.cuda.device_count() < 2: + pytest.skip(f"Skipping test {script_name} because fewer than 2 GPUs are available") + + # Run the test if there are at least 2 GPUs + run_python_script(script_name, timeout) diff --git a/tests/kv_transfer/test_lookup_buffer.sh 
b/tests/kv_transfer/test_lookup_buffer.sh deleted file mode 100644 index 336b540e7054..000000000000 --- a/tests/kv_transfer/test_lookup_buffer.sh +++ /dev/null @@ -1,3 +0,0 @@ - -RANK=0 python3 test_lookup_buffer.py & -RANK=1 python3 test_lookup_buffer.py & diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 994b907e0c89..f6da7f88d5f5 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -42,7 +42,7 @@ def stress_test(my_rank, pipe): tensors: List[torch.Tensor] = [] - for i in tqdm(range(2000)): + for i in tqdm(range(500)): mean = torch.rand(1).item() std = torch.rand(1).item() size = torch.randint(900, 1000, (2, )) @@ -61,7 +61,7 @@ def stress_test(my_rank, pipe): torch.distributed.barrier() - for i in tqdm(range(2000)): + for i in tqdm(range(500)): if my_rank == int((i % 10) > 3): pipe.send_tensor(tensors[3 * i]) pipe.send_tensor(tensors[3 * i + 1]) @@ -89,7 +89,7 @@ def latency_test(my_rank, pipe, nelement, ntensor): torch.distributed.barrier() - for i in tqdm(range(1000)): + for i in tqdm(range(500)): tensors = [] @@ -134,4 +134,6 @@ def latency_test(my_rank, pipe, nelement, ntensor): torch.manual_seed(0) test_run(my_rank, pipe) stress_test(my_rank, pipe) - latency_test(my_rank, pipe, 1024 * 8 * 128, 80) + + # Use this function if you want to test the latency of pipe impl. 
+ # latency_test(my_rank, pipe, 1024 * 8 * 128, 80) diff --git a/tests/kv_transfer/test_send_recv.sh b/tests/kv_transfer/test_send_recv.sh deleted file mode 100644 index 2a478871bd0e..000000000000 --- a/tests/kv_transfer/test_send_recv.sh +++ /dev/null @@ -1,3 +0,0 @@ - -RANK=0 python3 test_send_recv.py & -RANK=1 python3 test_send_recv.py & From 70d6571c936d4b8baf351a7161512ca67cb1e079 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Mon, 16 Sep 2024 01:12:57 +0000 Subject: [PATCH 213/278] update tests --- tests/kv_transfer/disagg_test.py | 107 ++++++++++++++++++ .../{test_launcher.py => module_test.py} | 0 2 files changed, 107 insertions(+) create mode 100644 tests/kv_transfer/disagg_test.py rename tests/kv_transfer/{test_launcher.py => module_test.py} (100%) diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py new file mode 100644 index 000000000000..2e8414a9f464 --- /dev/null +++ b/tests/kv_transfer/disagg_test.py @@ -0,0 +1,107 @@ +import os +import sys +import subprocess +import time +import pytest +import requests +import signal +from subprocess import Popen +import torch + + +# Fixture to set up environment variables and teardown servers after tests +@pytest.fixture(scope="module", autouse=True) +def setup_servers(): + if torch.cuda.device_count() < 4: + pytest.skip("Skipping test: fewer than 4 GPUs available") + + # Set up environment variables + VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", shell=True).decode().strip() + os.environ["VLLM_HOST_IP"] = VLLM_HOST_IP + os.environ["VLLM_PORT"] = "12345" + + # Start prefill instance + prefill_cmd = [ + sys.executable, "-m", "vllm.entrypoints.openai.api_server", + "-tp", "2", + "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", + "--port", "8100", + "--gpu-memory-utilization", "0.8" + ] + prefill_env = os.environ.copy() + prefill_env["VLLM_DISAGG_PREFILL_ROLE"] = "prefill" + prefill_env["CUDA_VISIBLE_DEVICES"] = "0,1" + prefill_proc = Popen(prefill_cmd, 
env=prefill_env) + + # Start decode instance + decode_cmd = [ + sys.executable, "-m", "vllm.entrypoints.openai.api_server", + "-tp", "2", + "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", + "--port", "8200", + "--gpu-memory-utilization", "0.8" + ] + decode_env = os.environ.copy() + decode_env["VLLM_DISAGG_PREFILL_ROLE"] = "decode" + decode_env["CUDA_VISIBLE_DEVICES"] = "2,3" + decode_proc = Popen(decode_cmd, env=decode_env) + + # Wait for servers to be ready + assert wait_for_server(8100), "Prefill server did not start in time" + assert wait_for_server(8200), "Decode server did not start in time" + + # Yield to the test function and handle teardown after tests + yield + + # Cleanup: kill the processes + prefill_proc.terminate() + decode_proc.terminate() + + # Additional cleanup if needed + prefill_proc.wait() + decode_proc.wait() + +# Helper function to wait for server +def wait_for_server(port, timeout=120): + start_time = time.time() + while time.time() - start_time < timeout: + try: + response = requests.get(f"http://localhost:{port}/v1/completions") + if response.status_code in [200, 405]: + return True + except requests.ConnectionError: + time.sleep(1) + return False + +# Test function to send curl requests and validate responses +@pytest.mark.parametrize("prompt", [ + "San Francisco is a", + "Santa Clara is a" +]) +def test_disaggregated_prefilling(prompt): + # Send to prefill + response = requests.post( + "http://localhost:8100/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 1, + "temperature": 0 + } + ) + assert response.status_code == 200 + + # Send to decode + response = requests.post( + "http://localhost:8200/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 10, + "temperature": 0 + } + ) + assert response.status_code == 200 + \ 
No newline at end of file diff --git a/tests/kv_transfer/test_launcher.py b/tests/kv_transfer/module_test.py similarity index 100% rename from tests/kv_transfer/test_launcher.py rename to tests/kv_transfer/module_test.py From 4d6b00a6c370e5005a8e92745de603938a3b341a Mon Sep 17 00:00:00 2001 From: ApostaC Date: Mon, 16 Sep 2024 01:14:18 +0000 Subject: [PATCH 214/278] make fmt checker happy --- tests/kv_transfer/disagg_test.py | 67 +++++++++++++---------------- tests/kv_transfer/module_test.py | 35 +++++++++------ tests/kv_transfer/test_send_recv.py | 2 +- 3 files changed, 54 insertions(+), 50 deletions(-) diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py index 2e8414a9f464..fa6a527574cf 100644 --- a/tests/kv_transfer/disagg_test.py +++ b/tests/kv_transfer/disagg_test.py @@ -1,11 +1,11 @@ import os -import sys import subprocess +import sys import time +from subprocess import Popen + import pytest import requests -import signal -from subprocess import Popen import torch @@ -16,16 +16,15 @@ def setup_servers(): pytest.skip("Skipping test: fewer than 4 GPUs available") # Set up environment variables - VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", shell=True).decode().strip() + VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", + shell=True).decode().strip() os.environ["VLLM_HOST_IP"] = VLLM_HOST_IP os.environ["VLLM_PORT"] = "12345" # Start prefill instance prefill_cmd = [ - sys.executable, "-m", "vllm.entrypoints.openai.api_server", - "-tp", "2", - "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", - "--port", "8100", + sys.executable, "-m", "vllm.entrypoints.openai.api_server", "-tp", "2", + "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--port", "8100", "--gpu-memory-utilization", "0.8" ] prefill_env = os.environ.copy() @@ -35,10 +34,8 @@ def setup_servers(): # Start decode instance decode_cmd = [ - sys.executable, "-m", "vllm.entrypoints.openai.api_server", - "-tp", "2", - 
"--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", - "--port", "8200", + sys.executable, "-m", "vllm.entrypoints.openai.api_server", "-tp", "2", + "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--port", "8200", "--gpu-memory-utilization", "0.8" ] decode_env = os.environ.copy() @@ -61,6 +58,7 @@ def setup_servers(): prefill_proc.wait() decode_proc.wait() + # Helper function to wait for server def wait_for_server(port, timeout=120): start_time = time.time() @@ -73,35 +71,30 @@ def wait_for_server(port, timeout=120): time.sleep(1) return False + # Test function to send curl requests and validate responses -@pytest.mark.parametrize("prompt", [ - "San Francisco is a", - "Santa Clara is a" -]) +@pytest.mark.parametrize("prompt", ["San Francisco is a", "Santa Clara is a"]) def test_disaggregated_prefilling(prompt): # Send to prefill - response = requests.post( - "http://localhost:8100/v1/completions", - headers={"Content-Type": "application/json"}, - json={ - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "prompt": prompt, - "max_tokens": 1, - "temperature": 0 - } - ) + response = requests.post("http://localhost:8100/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 1, + "temperature": 0 + }) assert response.status_code == 200 # Send to decode - response = requests.post( - "http://localhost:8200/v1/completions", - headers={"Content-Type": "application/json"}, - json={ - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "prompt": prompt, - "max_tokens": 10, - "temperature": 0 - } - ) + response = requests.post("http://localhost:8200/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 10, + "temperature": 0 + }) assert response.status_code == 200 - \ No newline at end of file diff --git a/tests/kv_transfer/module_test.py 
b/tests/kv_transfer/module_test.py index 5c0aeb04b43f..10fb19a3128e 100644 --- a/tests/kv_transfer/module_test.py +++ b/tests/kv_transfer/module_test.py @@ -1,21 +1,25 @@ import subprocess -import pytest import sys + +import pytest import torch + def run_python_script(script_name, timeout): try: # Start both processes asynchronously using Popen process0 = subprocess.Popen( [sys.executable, script_name], - env={"RANK": "0"}, # Set the RANK environment variable for process 0 + env={"RANK": + "0"}, # Set the RANK environment variable for process 0 stdout=sys.stdout, # Pipe stdout to current stdout stderr=sys.stderr, # Pipe stderr to current stderr ) - + process1 = subprocess.Popen( [sys.executable, script_name], - env={"RANK": "1"}, # Set the RANK environment variable for process 1 + env={"RANK": + "1"}, # Set the RANK environment variable for process 1 stdout=sys.stdout, # Pipe stdout to current stdout stderr=sys.stderr, # Pipe stderr to current stderr ) @@ -26,9 +30,11 @@ def run_python_script(script_name, timeout): # Check the return status of both processes if process0.returncode != 0: - pytest.fail(f"Test {script_name} failed for RANK=0 with return code {process0.returncode}") + pytest.fail( + f"Test {script_name} failed for RANK=0, {process0.returncode}") if process1.returncode != 0: - pytest.fail(f"Test {script_name} failed for RANK=1 with return code {process1.returncode}") + pytest.fail( + f"Test {script_name} failed for RANK=1, {process1.returncode}") except subprocess.TimeoutExpired: # If either process times out, terminate both and fail the test @@ -38,15 +44,20 @@ def run_python_script(script_name, timeout): except Exception as e: pytest.fail(f"Test {script_name} failed with error: {str(e)}") + # Define the test cases using pytest's parametrize -@pytest.mark.parametrize("script_name,timeout", [ - ("test_lookup_buffer.py", 60), # Second test case with a 60-second timeout - ("test_send_recv.py", 120) # First test case with a 120-second timeout -]) 
+@pytest.mark.parametrize( + "script_name,timeout", + [ + ("test_lookup_buffer.py", + 60), # Second test case with a 60-second timeout + ("test_send_recv.py", 120) # First test case with a 120-second timeout + ]) def test_run_python_script(script_name, timeout): # Check the number of GPUs if torch.cuda.device_count() < 2: - pytest.skip(f"Skipping test {script_name} because fewer than 2 GPUs are available") - + pytest.skip( + f"Skipping test {script_name} because <2 GPUs are available") + # Run the test if there are at least 2 GPUs run_python_script(script_name, timeout) diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index f6da7f88d5f5..ff771f34c032 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -134,6 +134,6 @@ def latency_test(my_rank, pipe, nelement, ntensor): torch.manual_seed(0) test_run(my_rank, pipe) stress_test(my_rank, pipe) - + # Use this function if you want to test the latency of pipe impl. 
# latency_test(my_rank, pipe, 1024 * 8 * 128, 80) From 7c13e03847a37417277483def8827cc2190749f8 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Mon, 16 Sep 2024 01:18:19 +0000 Subject: [PATCH 215/278] constraint the model length --- tests/kv_transfer/disagg_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py index fa6a527574cf..fffd9ab6f42a 100644 --- a/tests/kv_transfer/disagg_test.py +++ b/tests/kv_transfer/disagg_test.py @@ -25,7 +25,7 @@ def setup_servers(): prefill_cmd = [ sys.executable, "-m", "vllm.entrypoints.openai.api_server", "-tp", "2", "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--port", "8100", - "--gpu-memory-utilization", "0.8" + "--gpu-memory-utilization", "0.8", "--max-model-len", "1000", ] prefill_env = os.environ.copy() prefill_env["VLLM_DISAGG_PREFILL_ROLE"] = "prefill" @@ -36,7 +36,7 @@ def setup_servers(): decode_cmd = [ sys.executable, "-m", "vllm.entrypoints.openai.api_server", "-tp", "2", "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--port", "8200", - "--gpu-memory-utilization", "0.8" + "--gpu-memory-utilization", "0.8", "--max-model-len", "1000", ] decode_env = os.environ.copy() decode_env["VLLM_DISAGG_PREFILL_ROLE"] = "decode" From cf5b84c2a9916618c3b5bb1f304dc0f2ff27a471 Mon Sep 17 00:00:00 2001 From: ApostaC Date: Mon, 16 Sep 2024 01:25:17 +0000 Subject: [PATCH 216/278] adjust path --- tests/kv_transfer/module_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/kv_transfer/module_test.py b/tests/kv_transfer/module_test.py index 10fb19a3128e..355461919cd7 100644 --- a/tests/kv_transfer/module_test.py +++ b/tests/kv_transfer/module_test.py @@ -6,6 +6,7 @@ def run_python_script(script_name, timeout): + script_name = f'kv_transfer/{script_name}' try: # Start both processes asynchronously using Popen process0 = subprocess.Popen( From eb751d642b3bbb1a7f215754611f45bba298a54f Mon Sep 17 00:00:00 2001 From: ApostaC Date: Mon, 
16 Sep 2024 01:25:32 +0000 Subject: [PATCH 217/278] add disagg prefill test to test pipeline --- .buildkite/test-pipeline.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9b0cb6663a55..da79fd86b767 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -390,6 +390,18 @@ steps: - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py +- label: Disaggregated Prefill Test # 4min + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/parallel_state.py + - vllm/distributed/kv_transfer + - vllm/worker/worker_base.py + - vllm/worker/model_runner.py + commands: + - pytest -v -s kv_transfer/module_test.py + - pytest -v -s kv_transfer/disagg_test.py + - label: LoRA Long Context (Distributed) # 11min # This test runs llama 13B, so it is required to run on 4 GPUs. num_gpus: 4 From 1e23e99ecf68ce7ef97cdbf30340a0de8ad0bc9b Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 21:15:00 +0000 Subject: [PATCH 218/278] use new round robin proxy in performance benchmark --- benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index dde9a80b59b3..715fe56d6c59 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -19,7 +19,6 @@ kill_gpu_processes() { # kill all processes on GPU. 
pkill -f pt_main_thread pkill -f python3 - pkill -f round_robin_proxy.sh ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 for port in 8000 8100 8200; do lsof -t -i:$port | xargs -r kill -9; done sleep 1 @@ -61,7 +60,7 @@ launch_chunked_prefill() { --gpu-memory-utilization 0.8 & wait_for_server 8100 wait_for_server 8200 - bash round_robin_proxy.sh & + python3 round_robin_proxy.py & sleep 1 } @@ -149,7 +148,7 @@ main() { mkdir results default_qps=10 - default_output_len=150 + default_output_len=10 export VLLM_LOGGING_LEVEL=DEBUG export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') From b4225f80eb1fa8934ab68fbacabfc01a8e081f25 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 21:19:42 +0000 Subject: [PATCH 219/278] update --- .../disagg_benchmarks/round_robin_proxy.py | 117 ++++++------------ 1 file changed, 40 insertions(+), 77 deletions(-) diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index 04a30f774670..8751e24a08d3 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -3,92 +3,55 @@ from aiohttp import web import itertools -class AsyncRoundRobinProxy: - def __init__(self, backend_ports): - self.backend_ports = itertools.cycle(backend_ports) - self.session = None - - async def start(self): - self.session = aiohttp.ClientSession() - - async def stop(self): - if self.session: - await self.session.close() +class RoundRobinProxy: + def __init__(self, target_ports): + self.target_ports = target_ports + self.port_cycle = itertools.cycle(self.target_ports) async def handle_request(self, request): - backend_port = next(self.backend_ports) - print("forwarding to port", backend_port) - backend_url = f"http://localhost:{backend_port}{request.path_qs}" - - try: - async with self.session.request( - method=request.method, - url=backend_url, - headers=request.headers, - data=await request.read() - ) as 
backend_response: - response = web.StreamResponse( - status=backend_response.status, - headers=backend_response.headers - ) - await response.prepare(request) - - async for chunk in backend_response.content.iter_any(): - await response.write(chunk) - - await response.write_eof() - return response - - except aiohttp.ClientError as e: - return web.Response(text=f"Backend error: {str(e)}", status=502) - -async def run_backend(port): - async def handle(request): - if request.path == '/stream': - response = web.StreamResponse( - status=200, - headers={'Content-Type': 'text/plain'} - ) - await response.prepare(request) - for i in range(10): - await response.write(f"Chunk {i}\n".encode()) - await asyncio.sleep(0.5) # Simulate delay between chunks - return response - else: - return web.Response(text=f"Response from backend on port {port}") - - app = web.Application() - app.router.add_route('*', '/{tail:.*}', handle) - runner = web.AppRunner(app) - await runner.setup() - site = web.TCPSite(runner, 'localhost', port) - await site.start() - print(f"Backend running on http://localhost:{port}") + target_port = next(self.port_cycle) + target_url = f"http://localhost:{target_port}{request.path_qs}" + + async with aiohttp.ClientSession() as session: + try: + # Forward the request + async with session.request( + method=request.method, + url=target_url, + headers=request.headers, + data=request.content, + ) as response: + # Start sending the response + resp = web.StreamResponse( + status=response.status, + headers=response.headers + ) + await resp.prepare(request) + + # Stream the response content + async for chunk in response.content.iter_any(): + await resp.write(chunk) + + await resp.write_eof() + return resp + + except Exception as e: + return web.Response(text=f"Error: {str(e)}", status=500) async def main(): - proxy = AsyncRoundRobinProxy([8100, 8200]) - await proxy.start() - + proxy = RoundRobinProxy([8100, 8200]) app = web.Application() - app.router.add_route('*', 
'/{tail:.*}', proxy.handle_request) + app.router.add_route('*', '/{path:.*}', proxy.handle_request) runner = web.AppRunner(app) await runner.setup() site = web.TCPSite(runner, 'localhost', 8000) + await site.start() - await asyncio.gather( - site.start(), - run_backend(8100), - run_backend(8200) - ) - - print("Proxy running on http://localhost:8000") - - try: - await asyncio.Future() # Run forever - finally: - await proxy.stop() - await runner.cleanup() + print("Proxy server started on http://localhost:8000") + + # Keep the server running + await asyncio.Event().wait() -if __name__ == "__main__": +if __name__ == '__main__': asyncio.run(main()) \ No newline at end of file From fa4785788f4dfcf8e72f0c31923ba28f1f53c132 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 22:13:26 +0000 Subject: [PATCH 220/278] update benchmarking script --- .../analyze_benchmark_results.py | 48 ------------------- .../disagg_performance_benchmark.sh | 17 +++---- 2 files changed, 9 insertions(+), 56 deletions(-) delete mode 100644 benchmarks/disagg_benchmarks/analyze_benchmark_results.py diff --git a/benchmarks/disagg_benchmarks/analyze_benchmark_results.py b/benchmarks/disagg_benchmarks/analyze_benchmark_results.py deleted file mode 100644 index 4b675c675d25..000000000000 --- a/benchmarks/disagg_benchmarks/analyze_benchmark_results.py +++ /dev/null @@ -1,48 +0,0 @@ - -import argparse -import json -import yaml -import os -from pathlib import Path - -def load(path): - - with open(str(path), 'r') as f: - return json.loads(f.read()) - -def main(args): - - results = Path(args.results_folder) - - chunk = load(results / "chunked_prefill_tp4.json") - prefill = load(results / "disagg_prefill_tp4.json") - decode = load(results / "disagg_decode_tp4.json") - - ttft_ratio = chunk["mean_ttft_ms"] / prefill["mean_ttft_ms"] - itl_ratio = chunk["mean_itl_ms"] / decode["mean_itl_ms"] - prefill_decode_ratio = prefill["mean_ttft_ms"] / (decode["mean_itl_ms"] * args.output_len) - - with 
open(results / args.output_file, 'a') as f: - f.write(yaml.dump([{ - 'qps': args.qps, - 'output_len': args.output_len, - 'prefill_decode_ratio': prefill_decode_ratio, - 'ttft_ratio': ttft_ratio, - 'itl_ratio': itl_ratio, - "chunk_ttft": chunk["mean_ttft_ms"], - "chunk_itl": chunk["mean_itl_ms"], - "disagg_ttft": prefill["mean_ttft_ms"], - "disagg_itl": decode["mean_itl_ms"] - }])) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Analyze benchmark results") - parser.add_argument("--results-folder", required=True, help="Path to the results folder") - parser.add_argument("--output-len", type=int, required=True, help="Target output length") - parser.add_argument("--qps", type=int, required=True, help="Target QPS") - parser.add_argument("--output-file", type=str, default="chunk_vs_disagg.yaml") - - args = parser.parse_args() - main(args) - \ No newline at end of file diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index 715fe56d6c59..734679660c23 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -43,7 +43,7 @@ launch_chunked_prefill() { --model $model \ --port 8100 \ -tp 4 \ - --max-model-len 30000 \ + --max-model-len 10000 \ --disable-log-stats \ --disable-log-requests \ --enable-chunked-prefill \ @@ -53,7 +53,7 @@ launch_chunked_prefill() { --model $model \ --port 8200 \ -tp 4 \ - --max-model-len 30000 \ + --max-model-len 10000 \ --disable-log-stats \ --disable-log-requests \ --enable-chunked-prefill \ @@ -73,7 +73,7 @@ launch_disagg_prefill() { --model $model \ --port 8100 \ -tp 4 \ - --max-model-len 30000 \ + --max-model-len 10000 \ --disable-log-stats \ --disable-log-requests \ --gpu-memory-utilization 0.8 & @@ -82,7 +82,7 @@ launch_disagg_prefill() { --model $model \ --port 8200 \ -tp 4 \ - --max-model-len 30000 \ + --max-model-len 10000 \ 
--disable-log-stats \ --disable-log-requests \ --gpu-memory-utilization 0.8 & @@ -98,10 +98,10 @@ benchmark() { model="meta-llama/Meta-Llama-3.1-70B-Instruct" dataset_name="sonnet" dataset_path="../sonnet_4x.txt" - num_prompts=400 + num_prompts=200 qps=$1 prefix_len=50 - input_len=2048 + input_len=1024 output_len=$2 tag=$3 @@ -131,7 +131,7 @@ main() { (which jq) || (apt-get -y install jq) (which socat) || (apt-get -y install socat) - pip install quart httpx + pip install quart httpx matplotlib aiohttp cd "$(dirname "$0")" @@ -147,7 +147,6 @@ main() { rm -rf results mkdir results - default_qps=10 default_output_len=10 export VLLM_LOGGING_LEVEL=DEBUG @@ -165,6 +164,8 @@ main() { done kill_gpu_processes + python3 visualize_benchmark_results.py + } From 46f82a4cfa03b86453d465ffc883a26c66e519d3 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 22:18:40 +0000 Subject: [PATCH 221/278] revert changes in model_runner.py --- no change needed for disagg prefill --- vllm/worker/model_runner.py | 39 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 8c4899a8b7f5..447d303a57fd 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -14,7 +14,6 @@ import torch.distributed import torch.nn as nn - import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState @@ -1545,30 +1544,21 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) - + 
input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalInputs.as_kwargs(multi_modal_kwargs, + device=self.device), + **seqlen_agnostic_kwargs) + if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_end.record() - return hidden_or_intermediate_states - - @torch.inference_mode() - def postprocess_model( - self, - model_input, - hidden_or_intermediate_states, - - ): + # Compute the logits in the last pipeline stage. if not get_pp_group().is_last_rank: if (self.is_driver_worker and hidden_or_intermediate_states is not None @@ -1586,7 +1576,7 @@ def postprocess_model( hidden_or_intermediate_states.tensors["model_forward_time"] = ( torch.tensor(model_forward_time + orig_model_forward_time)) return hidden_or_intermediate_states - + logits = self.model.compute_logits(hidden_or_intermediate_states, model_input.sampling_metadata) @@ -1618,7 +1608,6 @@ def postprocess_model( output.model_forward_time = (orig_model_forward_time + model_forward_time) - decode_meta = model_input.attn_metadata.decode_metadata if self.return_hidden_states: # we only need to pass hidden states of most recent token assert model_input.sampling_metadata is not None @@ -1635,9 +1624,7 @@ def postprocess_model( output.hidden_states = hidden_states return [output] - - - + class CUDAGraphRunner: @@ -1808,4 +1795,4 @@ def _get_max_graph_batch_size(max_num_seqs: int) -> int: if padded_size in _BATCH_SIZES_TO_CAPTURE: return padded_size assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1] - return _BATCH_SIZES_TO_CAPTURE[-1] + return _BATCH_SIZES_TO_CAPTURE[-1] \ No newline at end of file From 8d7bb78895c7c42648dbabeff6dfd85318aa3924 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 22:21:02 +0000 Subject: [PATCH 222/278] no I was wrong --- vllm/worker/model_runner.py | 56 
++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 447d303a57fd..ab38302b3321 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -14,6 +14,7 @@ import torch.distributed import torch.nn as nn + import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState @@ -54,6 +55,7 @@ _add_sampling_metadata_broadcastable_dict, _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict, dump_input_when_exception) +from vllm import _custom_ops as ops if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionBackend @@ -1544,21 +1546,30 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) + hidden_or_intermediate_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalInputs.as_kwargs(multi_modal_kwargs, + device=self.device), + **seqlen_agnostic_kwargs) + if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_end.record() - # Compute the logits in the last pipeline stage. 
+ return hidden_or_intermediate_states + + @torch.inference_mode() + def postprocess_model( + self, + model_input, + hidden_or_intermediate_states, + + ): if not get_pp_group().is_last_rank: if (self.is_driver_worker and hidden_or_intermediate_states is not None @@ -1576,7 +1587,7 @@ def execute_model( hidden_or_intermediate_states.tensors["model_forward_time"] = ( torch.tensor(model_forward_time + orig_model_forward_time)) return hidden_or_intermediate_states - + logits = self.model.compute_logits(hidden_or_intermediate_states, model_input.sampling_metadata) @@ -1591,23 +1602,8 @@ def execute_model( logits=logits, sampling_metadata=model_input.sampling_metadata, ) - if (self.observability_config is not None - and self.observability_config.collect_model_forward_time - and output is not None): - model_forward_end.synchronize() - model_forward_time = model_forward_start.elapsed_time( - model_forward_end) - orig_model_forward_time = 0.0 - if intermediate_tensors is not None: - orig_model_forward_time = intermediate_tensors.tensors.get( - "model_forward_time", torch.tensor(0.0)).item() - # If there are multiple workers, we are still tracking the latency - # from the start time of the driver worker to the end time of the - # driver worker. The model forward time will then end up covering - # the communication time as well. 
- output.model_forward_time = (orig_model_forward_time + - model_forward_time) + decode_meta = model_input.attn_metadata.decode_metadata if self.return_hidden_states: # we only need to pass hidden states of most recent token assert model_input.sampling_metadata is not None @@ -1624,7 +1620,9 @@ def execute_model( output.hidden_states = hidden_states return [output] - + + + class CUDAGraphRunner: @@ -1795,4 +1793,4 @@ def _get_max_graph_batch_size(max_num_seqs: int) -> int: if padded_size in _BATCH_SIZES_TO_CAPTURE: return padded_size assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1] - return _BATCH_SIZES_TO_CAPTURE[-1] \ No newline at end of file + return _BATCH_SIZES_TO_CAPTURE[-1] From b5f9db5a45cab2a21a9a53b75318fbbc85a28e10 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 22:23:20 +0000 Subject: [PATCH 223/278] update benchmark --- benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index 734679660c23..1da5669dd1cd 100644 --- a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -147,7 +147,7 @@ main() { rm -rf results mkdir results - default_output_len=10 + default_output_len=6 export VLLM_LOGGING_LEVEL=DEBUG export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') From 0fc00918aa47dcd7d00f55940f72ffefcf666c15 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 22:23:48 +0000 Subject: [PATCH 224/278] remove sonnet 4x --- it can be automatically generated via benchmarking script --- benchmarks/sonnet_4x.txt | 2070 -------------------------------------- 1 file changed, 2070 deletions(-) delete mode 100644 benchmarks/sonnet_4x.txt diff --git a/benchmarks/sonnet_4x.txt b/benchmarks/sonnet_4x.txt deleted file mode 100644 index 02f39a9fb14f..000000000000 --- 
a/benchmarks/sonnet_4x.txt +++ /dev/null @@ -1,2070 +0,0 @@ - -FROM fairest creatures we desire increase, -That thereby beauty's rose might never die, -But as the riper should by time decease, -His tender heir might bear his memory: -But thou, contracted to thine own bright eyes, -Feed'st thy light'st flame with self-substantial fuel, -Making a famine where abundance lies, -Thyself thy foe, to thy sweet self too cruel. -Thou that art now the world's fresh ornament -And only herald to the gaudy spring, -Within thine own bud buriest thy content -And, tender churl, makest waste in niggarding. -Pity the world, or else this glutton be, -To eat the world's due, by the grave and thee. -When forty winters shall beseige thy brow, -And dig deep trenches in thy beauty's field, -Thy youth's proud livery, so gazed on now, -Will be a tatter'd weed, of small worth held: -Then being ask'd where all thy beauty lies, -Where all the treasure of thy lusty days, -To say, within thine own deep-sunken eyes, -Were an all-eating shame and thriftless praise. -How much more praise deserved thy beauty's use, -If thou couldst answer 'This fair child of mine -Shall sum my count and make my old excuse,' -Proving his beauty by succession thine! -This were to be new made when thou art old, -And see thy blood warm when thou feel'st it cold. -Look in thy glass, and tell the face thou viewest -Now is the time that face should form another; -Whose fresh repair if now thou not renewest, -Thou dost beguile the world, unbless some mother. -For where is she so fair whose unear'd womb -Disdains the tillage of thy husbandry? -Or who is he so fond will be the tomb -Of his self-love, to stop posterity? -Thou art thy mother's glass, and she in thee -Calls back the lovely April of her prime: -So thou through windows of thine age shall see -Despite of wrinkles this thy golden time. -But if thou live, remember'd not to be, -Die single, and thine image dies with thee. 
-Unthrifty loveliness, why dost thou spend -Upon thyself thy beauty's legacy? -Nature's bequest gives nothing but doth lend, -And being frank she lends to those are free. -Then, beauteous niggard, why dost thou abuse -The bounteous largess given thee to give? -Profitless usurer, why dost thou use -So great a sum of sums, yet canst not live? -For having traffic with thyself alone, -Thou of thyself thy sweet self dost deceive. -Then how, when nature calls thee to be gone, -What acceptable audit canst thou leave? -Thy unused beauty must be tomb'd with thee, -Which, used, lives th' executor to be. -Those hours, that with gentle work did frame -The lovely gaze where every eye doth dwell, -Will play the tyrants to the very same -And that unfair which fairly doth excel: -For never-resting time leads summer on -To hideous winter and confounds him there; -Sap cheque'd with frost and lusty leaves quite gone, -Beauty o'ersnow'd and bareness every where: -Then, were not summer's distillation left, -A liquid prisoner pent in walls of glass, -Beauty's effect with beauty were bereft, -Nor it nor no remembrance what it was: -But flowers distill'd though they with winter meet, -Leese but their show; their substance still lives sweet. -Then let not winter's ragged hand deface -In thee thy summer, ere thou be distill'd: -Make sweet some vial; treasure thou some place -With beauty's treasure, ere it be self-kill'd. -That use is not forbidden usury, -Which happies those that pay the willing loan; -That's for thyself to breed another thee, -Or ten times happier, be it ten for one; -Ten times thyself were happier than thou art, -If ten of thine ten times refigured thee: -Then what could death do, if thou shouldst depart, -Leaving thee living in posterity? -Be not self-will'd, for thou art much too fair -To be death's conquest and make worms thine heir. -Lo! 
in the orient when the gracious light -Lifts up his burning head, each under eye -Doth homage to his new-appearing sight, -Serving with looks his sacred majesty; -And having climb'd the steep-up heavenly hill, -Resembling strong youth in his middle age, -yet mortal looks adore his beauty still, -Attending on his golden pilgrimage; -But when from highmost pitch, with weary car, -Like feeble age, he reeleth from the day, -The eyes, 'fore duteous, now converted are -From his low tract and look another way: -So thou, thyself out-going in thy noon, -Unlook'd on diest, unless thou get a son. -Music to hear, why hear'st thou music sadly? -Sweets with sweets war not, joy delights in joy. -Why lovest thou that which thou receivest not gladly, -Or else receivest with pleasure thine annoy? -If the true concord of well-tuned sounds, -By unions married, do offend thine ear, -They do but sweetly chide thee, who confounds -In singleness the parts that thou shouldst bear. -Mark how one string, sweet husband to another, -Strikes each in each by mutual ordering, -Resembling sire and child and happy mother -Who all in one, one pleasing note do sing: -Whose speechless song, being many, seeming one, -Sings this to thee: 'thou single wilt prove none.' -Is it for fear to wet a widow's eye -That thou consumest thyself in single life? -Ah! if thou issueless shalt hap to die. -The world will wail thee, like a makeless wife; -The world will be thy widow and still weep -That thou no form of thee hast left behind, -When every private widow well may keep -By children's eyes her husband's shape in mind. -Look, what an unthrift in the world doth spend -Shifts but his place, for still the world enjoys it; -But beauty's waste hath in the world an end, -And kept unused, the user so destroys it. -No love toward others in that bosom sits -That on himself such murderous shame commits. -For shame! deny that thou bear'st love to any, -Who for thyself art so unprovident. 
-Grant, if thou wilt, thou art beloved of many, -But that thou none lovest is most evident; -For thou art so possess'd with murderous hate -That 'gainst thyself thou stick'st not to conspire. -Seeking that beauteous roof to ruinate -Which to repair should be thy chief desire. -O, change thy thought, that I may change my mind! -Shall hate be fairer lodged than gentle love? -Be, as thy presence is, gracious and kind, -Or to thyself at least kind-hearted prove: -Make thee another self, for love of me, -That beauty still may live in thine or thee. -As fast as thou shalt wane, so fast thou growest -In one of thine, from that which thou departest; -And that fresh blood which youngly thou bestowest -Thou mayst call thine when thou from youth convertest. -Herein lives wisdom, beauty and increase: -Without this, folly, age and cold decay: -If all were minded so, the times should cease -And threescore year would make the world away. -Let those whom Nature hath not made for store, -Harsh featureless and rude, barrenly perish: -Look, whom she best endow'd she gave the more; -Which bounteous gift thou shouldst in bounty cherish: -She carved thee for her seal, and meant thereby -Thou shouldst print more, not let that copy die. -When I do count the clock that tells the time, -And see the brave day sunk in hideous night; -When I behold the violet past prime, -And sable curls all silver'd o'er with white; -When lofty trees I see barren of leaves -Which erst from heat did canopy the herd, -And summer's green all girded up in sheaves -Borne on the bier with white and bristly beard, -Then of thy beauty do I question make, -That thou among the wastes of time must go, -Since sweets and beauties do themselves forsake -And die as fast as they see others grow; -And nothing 'gainst Time's scythe can make defence -Save breed, to brave him when he takes thee hence. -O, that you were yourself! 
but, love, you are -No longer yours than you yourself here live: -Against this coming end you should prepare, -And your sweet semblance to some other give. -So should that beauty which you hold in lease -Find no determination: then you were -Yourself again after yourself's decease, -When your sweet issue your sweet form should bear. -Who lets so fair a house fall to decay, -Which husbandry in honour might uphold -Against the stormy gusts of winter's day -And barren rage of death's eternal cold? -O, none but unthrifts! Dear my love, you know -You had a father: let your son say so. -Not from the stars do I my judgment pluck; -And yet methinks I have astronomy, -But not to tell of good or evil luck, -Of plagues, of dearths, or seasons' quality; -Nor can I fortune to brief minutes tell, -Pointing to each his thunder, rain and wind, -Or say with princes if it shall go well, -By oft predict that I in heaven find: -But from thine eyes my knowledge I derive, -And, constant stars, in them I read such art -As truth and beauty shall together thrive, -If from thyself to store thou wouldst convert; -Or else of thee this I prognosticate: -Thy end is truth's and beauty's doom and date. -When I consider every thing that grows -Holds in perfection but a little moment, -That this huge stage presenteth nought but shows -Whereon the stars in secret influence comment; -When I perceive that men as plants increase, -Cheered and cheque'd even by the self-same sky, -Vaunt in their youthful sap, at height decrease, -And wear their brave state out of memory; -Then the conceit of this inconstant stay -Sets you most rich in youth before my sight, -Where wasteful Time debateth with Decay, -To change your day of youth to sullied night; -And all in war with Time for love of you, -As he takes from you, I engraft you new. -But wherefore do not you a mightier way -Make war upon this bloody tyrant, Time? -And fortify yourself in your decay -With means more blessed than my barren rhyme? 
-Now stand you on the top of happy hours, -And many maiden gardens yet unset -With virtuous wish would bear your living flowers, -Much liker than your painted counterfeit: -So should the lines of life that life repair, -Which this, Time's pencil, or my pupil pen, -Neither in inward worth nor outward fair, -Can make you live yourself in eyes of men. -To give away yourself keeps yourself still, -And you must live, drawn by your own sweet skill. -Who will believe my verse in time to come, -If it were fill'd with your most high deserts? -Though yet, heaven knows, it is but as a tomb -Which hides your life and shows not half your parts. -If I could write the beauty of your eyes -And in fresh numbers number all your graces, -The age to come would say 'This poet lies: -Such heavenly touches ne'er touch'd earthly faces.' -So should my papers yellow'd with their age -Be scorn'd like old men of less truth than tongue, -And your true rights be term'd a poet's rage -And stretched metre of an antique song: -But were some child of yours alive that time, -You should live twice; in it and in my rhyme. -Shall I compare thee to a summer's day? -Thou art more lovely and more temperate: -Rough winds do shake the darling buds of May, -And summer's lease hath all too short a date: -Sometime too hot the eye of heaven shines, -And often is his gold complexion dimm'd; -And every fair from fair sometime declines, -By chance or nature's changing course untrimm'd; -But thy eternal summer shall not fade -Nor lose possession of that fair thou owest; -Nor shall Death brag thou wander'st in his shade, -When in eternal lines to time thou growest: -So long as men can breathe or eyes can see, -So long lives this and this gives life to thee. 
-Devouring Time, blunt thou the lion's paws, -And make the earth devour her own sweet brood; -Pluck the keen teeth from the fierce tiger's jaws, -And burn the long-lived phoenix in her blood; -Make glad and sorry seasons as thou fleets, -And do whate'er thou wilt, swift-footed Time, -To the wide world and all her fading sweets; -But I forbid thee one most heinous crime: -O, carve not with thy hours my love's fair brow, -Nor draw no lines there with thine antique pen; -Him in thy course untainted do allow -For beauty's pattern to succeeding men. -Yet, do thy worst, old Time: despite thy wrong, -My love shall in my verse ever live young. -A woman's face with Nature's own hand painted -Hast thou, the master-mistress of my passion; -A woman's gentle heart, but not acquainted -With shifting change, as is false women's fashion; -An eye more bright than theirs, less false in rolling, -Gilding the object whereupon it gazeth; -A man in hue, all 'hues' in his controlling, -Much steals men's eyes and women's souls amazeth. -And for a woman wert thou first created; -Till Nature, as she wrought thee, fell a-doting, -And by addition me of thee defeated, -By adding one thing to my purpose nothing. -But since she prick'd thee out for women's pleasure, -Mine be thy love and thy love's use their treasure. -So is it not with me as with that Muse -Stirr'd by a painted beauty to his verse, -Who heaven itself for ornament doth use -And every fair with his fair doth rehearse -Making a couplement of proud compare, -With sun and moon, with earth and sea's rich gems, -With April's first-born flowers, and all things rare -That heaven's air in this huge rondure hems. -O' let me, true in love, but truly write, -And then believe me, my love is as fair -As any mother's child, though not so bright -As those gold candles fix'd in heaven's air: -Let them say more than like of hearsay well; -I will not praise that purpose not to sell. 
-My glass shall not persuade me I am old, -So long as youth and thou are of one date; -But when in thee time's furrows I behold, -Then look I death my days should expiate. -For all that beauty that doth cover thee -Is but the seemly raiment of my heart, -Which in thy breast doth live, as thine in me: -How can I then be elder than thou art? -O, therefore, love, be of thyself so wary -As I, not for myself, but for thee will; -Bearing thy heart, which I will keep so chary -As tender nurse her babe from faring ill. -Presume not on thy heart when mine is slain; -Thou gavest me thine, not to give back again. -As an unperfect actor on the stage -Who with his fear is put besides his part, -Or some fierce thing replete with too much rage, -Whose strength's abundance weakens his own heart. -So I, for fear of trust, forget to say -The perfect ceremony of love's rite, -And in mine own love's strength seem to decay, -O'ercharged with burden of mine own love's might. -O, let my books be then the eloquence -And dumb presagers of my speaking breast, -Who plead for love and look for recompense -More than that tongue that more hath more express'd. -O, learn to read what silent love hath writ: -To hear with eyes belongs to love's fine wit. -Mine eye hath play'd the painter and hath stell'd -Thy beauty's form in table of my heart; -My body is the frame wherein 'tis held, -And perspective it is the painter's art. -For through the painter must you see his skill, -To find where your true image pictured lies; -Which in my bosom's shop is hanging still, -That hath his windows glazed with thine eyes. -Now see what good turns eyes for eyes have done: -Mine eyes have drawn thy shape, and thine for me -Are windows to my breast, where-through the sun -Delights to peep, to gaze therein on thee; -Yet eyes this cunning want to grace their art; -They draw but what they see, know not the heart. 
-Let those who are in favour with their stars -Of public honour and proud titles boast, -Whilst I, whom fortune of such triumph bars, -Unlook'd for joy in that I honour most. -Great princes' favourites their fair leaves spread -But as the marigold at the sun's eye, -And in themselves their pride lies buried, -For at a frown they in their glory die. -The painful warrior famoused for fight, -After a thousand victories once foil'd, -Is from the book of honour razed quite, -And all the rest forgot for which he toil'd: -Then happy I, that love and am beloved -Where I may not remove nor be removed. -Lord of my love, to whom in vassalage -Thy merit hath my duty strongly knit, -To thee I send this written embassage, -To witness duty, not to show my wit: -Duty so great, which wit so poor as mine -May make seem bare, in wanting words to show it, -But that I hope some good conceit of thine -In thy soul's thought, all naked, will bestow it; -Till whatsoever star that guides my moving -Points on me graciously with fair aspect -And puts apparel on my tatter'd loving, -To show me worthy of thy sweet respect: -Then may I dare to boast how I do love thee; -Till then not show my head where thou mayst prove me. -Weary with toil, I haste me to my bed, -The dear repose for limbs with travel tired; -But then begins a journey in my head, -To work my mind, when body's work's expired: -For then my thoughts, from far where I abide, -Intend a zealous pilgrimage to thee, -And keep my drooping eyelids open wide, -Looking on darkness which the blind do see -Save that my soul's imaginary sight -Presents thy shadow to my sightless view, -Which, like a jewel hung in ghastly night, -Makes black night beauteous and her old face new. -Lo! thus, by day my limbs, by night my mind, -For thee and for myself no quiet find. -How can I then return in happy plight, -That am debarr'd the benefit of rest? -When day's oppression is not eased by night, -But day by night, and night by day, oppress'd? 
-And each, though enemies to either's reign, -Do in consent shake hands to torture me; -The one by toil, the other to complain -How far I toil, still farther off from thee. -I tell the day, to please them thou art bright -And dost him grace when clouds do blot the heaven: -So flatter I the swart-complexion'd night, -When sparkling stars twire not thou gild'st the even. -But day doth daily draw my sorrows longer -And night doth nightly make grief's strength seem stronger. -When, in disgrace with fortune and men's eyes, -I all alone beweep my outcast state -And trouble deal heaven with my bootless cries -And look upon myself and curse my fate, -Wishing me like to one more rich in hope, -Featured like him, like him with friends possess'd, -Desiring this man's art and that man's scope, -With what I most enjoy contented least; -Yet in these thoughts myself almost despising, -Haply I think on thee, and then my state, -Like to the lark at break of day arising -From sullen earth, sings hymns at heaven's gate; -For thy sweet love remember'd such wealth brings -That then I scorn to change my state with kings. -When to the sessions of sweet silent thought -I summon up remembrance of things past, -I sigh the lack of many a thing I sought, -And with old woes new wail my dear time's waste: -Then can I drown an eye, unused to flow, -For precious friends hid in death's dateless night, -And weep afresh love's long since cancell'd woe, -And moan the expense of many a vanish'd sight: -Then can I grieve at grievances foregone, -And heavily from woe to woe tell o'er -The sad account of fore-bemoaned moan, -Which I new pay as if not paid before. -But if the while I think on thee, dear friend, -All losses are restored and sorrows end. -Thy bosom is endeared with all hearts, -Which I by lacking have supposed dead, -And there reigns love and all love's loving parts, -And all those friends which I thought buried. 
-How many a holy and obsequious tear -Hath dear religious love stol'n from mine eye -As interest of the dead, which now appear -But things removed that hidden in thee lie! -Thou art the grave where buried love doth live, -Hung with the trophies of my lovers gone, -Who all their parts of me to thee did give; -That due of many now is thine alone: -Their images I loved I view in thee, -And thou, all they, hast all the all of me. -If thou survive my well-contented day, -When that churl Death my bones with dust shall cover, -And shalt by fortune once more re-survey -These poor rude lines of thy deceased lover, -Compare them with the bettering of the time, -And though they be outstripp'd by every pen, -Reserve them for my love, not for their rhyme, -Exceeded by the height of happier men. -O, then vouchsafe me but this loving thought: -'Had my friend's Muse grown with this growing age, -A dearer birth than this his love had brought, -To march in ranks of better equipage: -But since he died and poets better prove, -Theirs for their style I'll read, his for his love.' -Full many a glorious morning have I seen -Flatter the mountain-tops with sovereign eye, -Kissing with golden face the meadows green, -Gilding pale streams with heavenly alchemy; -Anon permit the basest clouds to ride -With ugly rack on his celestial face, -And from the forlorn world his visage hide, -Stealing unseen to west with this disgrace: -Even so my sun one early morn did shine -With all triumphant splendor on my brow; -But out, alack! he was but one hour mine; -The region cloud hath mask'd him from me now. -Yet him for this my love no whit disdaineth; -Suns of the world may stain when heaven's sun staineth. -Why didst thou promise such a beauteous day, -And make me travel forth without my cloak, -To let base clouds o'ertake me in my way, -Hiding thy bravery in their rotten smoke? 
-'Tis not enough that through the cloud thou break, -To dry the rain on my storm-beaten face, -For no man well of such a salve can speak -That heals the wound and cures not the disgrace: -Nor can thy shame give physic to my grief; -Though thou repent, yet I have still the loss: -The offender's sorrow lends but weak relief -To him that bears the strong offence's cross. -Ah! but those tears are pearl which thy love sheds, -And they are rich and ransom all ill deeds. -No more be grieved at that which thou hast done: -Roses have thorns, and silver fountains mud; -Clouds and eclipses stain both moon and sun, -And loathsome canker lives in sweetest bud. -All men make faults, and even I in this, -Authorizing thy trespass with compare, -Myself corrupting, salving thy amiss, -Excusing thy sins more than thy sins are; -For to thy sensual fault I bring in sense-- -Thy adverse party is thy advocate-- -And 'gainst myself a lawful plea commence: -Such civil war is in my love and hate -That I an accessary needs must be -To that sweet thief which sourly robs from me. -Let me confess that we two must be twain, -Although our undivided loves are one: -So shall those blots that do with me remain -Without thy help by me be borne alone. -In our two loves there is but one respect, -Though in our lives a separable spite, -Which though it alter not love's sole effect, -Yet doth it steal sweet hours from love's delight. -I may not evermore acknowledge thee, -Lest my bewailed guilt should do thee shame, -Nor thou with public kindness honour me, -Unless thou take that honour from thy name: -But do not so; I love thee in such sort -As, thou being mine, mine is thy good report. -As a decrepit father takes delight -To see his active child do deeds of youth, -So I, made lame by fortune's dearest spite, -Take all my comfort of thy worth and truth. 
-For whether beauty, birth, or wealth, or wit, -Or any of these all, or all, or more, -Entitled in thy parts do crowned sit, -I make my love engrafted to this store: -So then I am not lame, poor, nor despised, -Whilst that this shadow doth such substance give -That I in thy abundance am sufficed -And by a part of all thy glory live. -Look, what is best, that best I wish in thee: -This wish I have; then ten times happy me!FROM fairest creatures we desire increase, -That thereby beauty's rose might never die, -But as the riper should by time decease, -His tender heir might bear his memory: -But thou, contracted to thine own bright eyes, -Feed'st thy light'st flame with self-substantial fuel, -Making a famine where abundance lies, -Thyself thy foe, to thy sweet self too cruel. -Thou that art now the world's fresh ornament -And only herald to the gaudy spring, -Within thine own bud buriest thy content -And, tender churl, makest waste in niggarding. -Pity the world, or else this glutton be, -To eat the world's due, by the grave and thee. -When forty winters shall beseige thy brow, -And dig deep trenches in thy beauty's field, -Thy youth's proud livery, so gazed on now, -Will be a tatter'd weed, of small worth held: -Then being ask'd where all thy beauty lies, -Where all the treasure of thy lusty days, -To say, within thine own deep-sunken eyes, -Were an all-eating shame and thriftless praise. -How much more praise deserved thy beauty's use, -If thou couldst answer 'This fair child of mine -Shall sum my count and make my old excuse,' -Proving his beauty by succession thine! -This were to be new made when thou art old, -And see thy blood warm when thou feel'st it cold. -Look in thy glass, and tell the face thou viewest -Now is the time that face should form another; -Whose fresh repair if now thou not renewest, -Thou dost beguile the world, unbless some mother. -For where is she so fair whose unear'd womb -Disdains the tillage of thy husbandry? 
-Or who is he so fond will be the tomb -Of his self-love, to stop posterity? -Thou art thy mother's glass, and she in thee -Calls back the lovely April of her prime: -So thou through windows of thine age shall see -Despite of wrinkles this thy golden time. -But if thou live, remember'd not to be, -Die single, and thine image dies with thee. -Unthrifty loveliness, why dost thou spend -Upon thyself thy beauty's legacy? -Nature's bequest gives nothing but doth lend, -And being frank she lends to those are free. -Then, beauteous niggard, why dost thou abuse -The bounteous largess given thee to give? -Profitless usurer, why dost thou use -So great a sum of sums, yet canst not live? -For having traffic with thyself alone, -Thou of thyself thy sweet self dost deceive. -Then how, when nature calls thee to be gone, -What acceptable audit canst thou leave? -Thy unused beauty must be tomb'd with thee, -Which, used, lives th' executor to be. -Those hours, that with gentle work did frame -The lovely gaze where every eye doth dwell, -Will play the tyrants to the very same -And that unfair which fairly doth excel: -For never-resting time leads summer on -To hideous winter and confounds him there; -Sap cheque'd with frost and lusty leaves quite gone, -Beauty o'ersnow'd and bareness every where: -Then, were not summer's distillation left, -A liquid prisoner pent in walls of glass, -Beauty's effect with beauty were bereft, -Nor it nor no remembrance what it was: -But flowers distill'd though they with winter meet, -Leese but their show; their substance still lives sweet. -Then let not winter's ragged hand deface -In thee thy summer, ere thou be distill'd: -Make sweet some vial; treasure thou some place -With beauty's treasure, ere it be self-kill'd. 
-That use is not forbidden usury, -Which happies those that pay the willing loan; -That's for thyself to breed another thee, -Or ten times happier, be it ten for one; -Ten times thyself were happier than thou art, -If ten of thine ten times refigured thee: -Then what could death do, if thou shouldst depart, -Leaving thee living in posterity? -Be not self-will'd, for thou art much too fair -To be death's conquest and make worms thine heir. -Lo! in the orient when the gracious light -Lifts up his burning head, each under eye -Doth homage to his new-appearing sight, -Serving with looks his sacred majesty; -And having climb'd the steep-up heavenly hill, -Resembling strong youth in his middle age, -yet mortal looks adore his beauty still, -Attending on his golden pilgrimage; -But when from highmost pitch, with weary car, -Like feeble age, he reeleth from the day, -The eyes, 'fore duteous, now converted are -From his low tract and look another way: -So thou, thyself out-going in thy noon, -Unlook'd on diest, unless thou get a son. -Music to hear, why hear'st thou music sadly? -Sweets with sweets war not, joy delights in joy. -Why lovest thou that which thou receivest not gladly, -Or else receivest with pleasure thine annoy? -If the true concord of well-tuned sounds, -By unions married, do offend thine ear, -They do but sweetly chide thee, who confounds -In singleness the parts that thou shouldst bear. -Mark how one string, sweet husband to another, -Strikes each in each by mutual ordering, -Resembling sire and child and happy mother -Who all in one, one pleasing note do sing: -Whose speechless song, being many, seeming one, -Sings this to thee: 'thou single wilt prove none.' -Is it for fear to wet a widow's eye -That thou consumest thyself in single life? -Ah! if thou issueless shalt hap to die. 
-The world will wail thee, like a makeless wife; -The world will be thy widow and still weep -That thou no form of thee hast left behind, -When every private widow well may keep -By children's eyes her husband's shape in mind. -Look, what an unthrift in the world doth spend -Shifts but his place, for still the world enjoys it; -But beauty's waste hath in the world an end, -And kept unused, the user so destroys it. -No love toward others in that bosom sits -That on himself such murderous shame commits. -For shame! deny that thou bear'st love to any, -Who for thyself art so unprovident. -Grant, if thou wilt, thou art beloved of many, -But that thou none lovest is most evident; -For thou art so possess'd with murderous hate -That 'gainst thyself thou stick'st not to conspire. -Seeking that beauteous roof to ruinate -Which to repair should be thy chief desire. -O, change thy thought, that I may change my mind! -Shall hate be fairer lodged than gentle love? -Be, as thy presence is, gracious and kind, -Or to thyself at least kind-hearted prove: -Make thee another self, for love of me, -That beauty still may live in thine or thee. -As fast as thou shalt wane, so fast thou growest -In one of thine, from that which thou departest; -And that fresh blood which youngly thou bestowest -Thou mayst call thine when thou from youth convertest. -Herein lives wisdom, beauty and increase: -Without this, folly, age and cold decay: -If all were minded so, the times should cease -And threescore year would make the world away. -Let those whom Nature hath not made for store, -Harsh featureless and rude, barrenly perish: -Look, whom she best endow'd she gave the more; -Which bounteous gift thou shouldst in bounty cherish: -She carved thee for her seal, and meant thereby -Thou shouldst print more, not let that copy die. 
-When I do count the clock that tells the time, -And see the brave day sunk in hideous night; -When I behold the violet past prime, -And sable curls all silver'd o'er with white; -When lofty trees I see barren of leaves -Which erst from heat did canopy the herd, -And summer's green all girded up in sheaves -Borne on the bier with white and bristly beard, -Then of thy beauty do I question make, -That thou among the wastes of time must go, -Since sweets and beauties do themselves forsake -And die as fast as they see others grow; -And nothing 'gainst Time's scythe can make defence -Save breed, to brave him when he takes thee hence. -O, that you were yourself! but, love, you are -No longer yours than you yourself here live: -Against this coming end you should prepare, -And your sweet semblance to some other give. -So should that beauty which you hold in lease -Find no determination: then you were -Yourself again after yourself's decease, -When your sweet issue your sweet form should bear. -Who lets so fair a house fall to decay, -Which husbandry in honour might uphold -Against the stormy gusts of winter's day -And barren rage of death's eternal cold? -O, none but unthrifts! Dear my love, you know -You had a father: let your son say so. -Not from the stars do I my judgment pluck; -And yet methinks I have astronomy, -But not to tell of good or evil luck, -Of plagues, of dearths, or seasons' quality; -Nor can I fortune to brief minutes tell, -Pointing to each his thunder, rain and wind, -Or say with princes if it shall go well, -By oft predict that I in heaven find: -But from thine eyes my knowledge I derive, -And, constant stars, in them I read such art -As truth and beauty shall together thrive, -If from thyself to store thou wouldst convert; -Or else of thee this I prognosticate: -Thy end is truth's and beauty's doom and date. 
-When I consider every thing that grows -Holds in perfection but a little moment, -That this huge stage presenteth nought but shows -Whereon the stars in secret influence comment; -When I perceive that men as plants increase, -Cheered and cheque'd even by the self-same sky, -Vaunt in their youthful sap, at height decrease, -And wear their brave state out of memory; -Then the conceit of this inconstant stay -Sets you most rich in youth before my sight, -Where wasteful Time debateth with Decay, -To change your day of youth to sullied night; -And all in war with Time for love of you, -As he takes from you, I engraft you new. -But wherefore do not you a mightier way -Make war upon this bloody tyrant, Time? -And fortify yourself in your decay -With means more blessed than my barren rhyme? -Now stand you on the top of happy hours, -And many maiden gardens yet unset -With virtuous wish would bear your living flowers, -Much liker than your painted counterfeit: -So should the lines of life that life repair, -Which this, Time's pencil, or my pupil pen, -Neither in inward worth nor outward fair, -Can make you live yourself in eyes of men. -To give away yourself keeps yourself still, -And you must live, drawn by your own sweet skill. -Who will believe my verse in time to come, -If it were fill'd with your most high deserts? -Though yet, heaven knows, it is but as a tomb -Which hides your life and shows not half your parts. -If I could write the beauty of your eyes -And in fresh numbers number all your graces, -The age to come would say 'This poet lies: -Such heavenly touches ne'er touch'd earthly faces.' -So should my papers yellow'd with their age -Be scorn'd like old men of less truth than tongue, -And your true rights be term'd a poet's rage -And stretched metre of an antique song: -But were some child of yours alive that time, -You should live twice; in it and in my rhyme. -Shall I compare thee to a summer's day? 
-Thou art more lovely and more temperate: -Rough winds do shake the darling buds of May, -And summer's lease hath all too short a date: -Sometime too hot the eye of heaven shines, -And often is his gold complexion dimm'd; -And every fair from fair sometime declines, -By chance or nature's changing course untrimm'd; -But thy eternal summer shall not fade -Nor lose possession of that fair thou owest; -Nor shall Death brag thou wander'st in his shade, -When in eternal lines to time thou growest: -So long as men can breathe or eyes can see, -So long lives this and this gives life to thee. -Devouring Time, blunt thou the lion's paws, -And make the earth devour her own sweet brood; -Pluck the keen teeth from the fierce tiger's jaws, -And burn the long-lived phoenix in her blood; -Make glad and sorry seasons as thou fleets, -And do whate'er thou wilt, swift-footed Time, -To the wide world and all her fading sweets; -But I forbid thee one most heinous crime: -O, carve not with thy hours my love's fair brow, -Nor draw no lines there with thine antique pen; -Him in thy course untainted do allow -For beauty's pattern to succeeding men. -Yet, do thy worst, old Time: despite thy wrong, -My love shall in my verse ever live young. -A woman's face with Nature's own hand painted -Hast thou, the master-mistress of my passion; -A woman's gentle heart, but not acquainted -With shifting change, as is false women's fashion; -An eye more bright than theirs, less false in rolling, -Gilding the object whereupon it gazeth; -A man in hue, all 'hues' in his controlling, -Much steals men's eyes and women's souls amazeth. -And for a woman wert thou first created; -Till Nature, as she wrought thee, fell a-doting, -And by addition me of thee defeated, -By adding one thing to my purpose nothing. -But since she prick'd thee out for women's pleasure, -Mine be thy love and thy love's use their treasure. 
-So is it not with me as with that Muse -Stirr'd by a painted beauty to his verse, -Who heaven itself for ornament doth use -And every fair with his fair doth rehearse -Making a couplement of proud compare, -With sun and moon, with earth and sea's rich gems, -With April's first-born flowers, and all things rare -That heaven's air in this huge rondure hems. -O' let me, true in love, but truly write, -And then believe me, my love is as fair -As any mother's child, though not so bright -As those gold candles fix'd in heaven's air: -Let them say more than like of hearsay well; -I will not praise that purpose not to sell. -My glass shall not persuade me I am old, -So long as youth and thou are of one date; -But when in thee time's furrows I behold, -Then look I death my days should expiate. -For all that beauty that doth cover thee -Is but the seemly raiment of my heart, -Which in thy breast doth live, as thine in me: -How can I then be elder than thou art? -O, therefore, love, be of thyself so wary -As I, not for myself, but for thee will; -Bearing thy heart, which I will keep so chary -As tender nurse her babe from faring ill. -Presume not on thy heart when mine is slain; -Thou gavest me thine, not to give back again. -As an unperfect actor on the stage -Who with his fear is put besides his part, -Or some fierce thing replete with too much rage, -Whose strength's abundance weakens his own heart. -So I, for fear of trust, forget to say -The perfect ceremony of love's rite, -And in mine own love's strength seem to decay, -O'ercharged with burden of mine own love's might. -O, let my books be then the eloquence -And dumb presagers of my speaking breast, -Who plead for love and look for recompense -More than that tongue that more hath more express'd. -O, learn to read what silent love hath writ: -To hear with eyes belongs to love's fine wit. 
-Mine eye hath play'd the painter and hath stell'd -Thy beauty's form in table of my heart; -My body is the frame wherein 'tis held, -And perspective it is the painter's art. -For through the painter must you see his skill, -To find where your true image pictured lies; -Which in my bosom's shop is hanging still, -That hath his windows glazed with thine eyes. -Now see what good turns eyes for eyes have done: -Mine eyes have drawn thy shape, and thine for me -Are windows to my breast, where-through the sun -Delights to peep, to gaze therein on thee; -Yet eyes this cunning want to grace their art; -They draw but what they see, know not the heart. -Let those who are in favour with their stars -Of public honour and proud titles boast, -Whilst I, whom fortune of such triumph bars, -Unlook'd for joy in that I honour most. -Great princes' favourites their fair leaves spread -But as the marigold at the sun's eye, -And in themselves their pride lies buried, -For at a frown they in their glory die. -The painful warrior famoused for fight, -After a thousand victories once foil'd, -Is from the book of honour razed quite, -And all the rest forgot for which he toil'd: -Then happy I, that love and am beloved -Where I may not remove nor be removed. -Lord of my love, to whom in vassalage -Thy merit hath my duty strongly knit, -To thee I send this written embassage, -To witness duty, not to show my wit: -Duty so great, which wit so poor as mine -May make seem bare, in wanting words to show it, -But that I hope some good conceit of thine -In thy soul's thought, all naked, will bestow it; -Till whatsoever star that guides my moving -Points on me graciously with fair aspect -And puts apparel on my tatter'd loving, -To show me worthy of thy sweet respect: -Then may I dare to boast how I do love thee; -Till then not show my head where thou mayst prove me. 
-Weary with toil, I haste me to my bed, -The dear repose for limbs with travel tired; -But then begins a journey in my head, -To work my mind, when body's work's expired: -For then my thoughts, from far where I abide, -Intend a zealous pilgrimage to thee, -And keep my drooping eyelids open wide, -Looking on darkness which the blind do see -Save that my soul's imaginary sight -Presents thy shadow to my sightless view, -Which, like a jewel hung in ghastly night, -Makes black night beauteous and her old face new. -Lo! thus, by day my limbs, by night my mind, -For thee and for myself no quiet find. -How can I then return in happy plight, -That am debarr'd the benefit of rest? -When day's oppression is not eased by night, -But day by night, and night by day, oppress'd? -And each, though enemies to either's reign, -Do in consent shake hands to torture me; -The one by toil, the other to complain -How far I toil, still farther off from thee. -I tell the day, to please them thou art bright -And dost him grace when clouds do blot the heaven: -So flatter I the swart-complexion'd night, -When sparkling stars twire not thou gild'st the even. -But day doth daily draw my sorrows longer -And night doth nightly make grief's strength seem stronger. -When, in disgrace with fortune and men's eyes, -I all alone beweep my outcast state -And trouble deal heaven with my bootless cries -And look upon myself and curse my fate, -Wishing me like to one more rich in hope, -Featured like him, like him with friends possess'd, -Desiring this man's art and that man's scope, -With what I most enjoy contented least; -Yet in these thoughts myself almost despising, -Haply I think on thee, and then my state, -Like to the lark at break of day arising -From sullen earth, sings hymns at heaven's gate; -For thy sweet love remember'd such wealth brings -That then I scorn to change my state with kings. 
-When to the sessions of sweet silent thought -I summon up remembrance of things past, -I sigh the lack of many a thing I sought, -And with old woes new wail my dear time's waste: -Then can I drown an eye, unused to flow, -For precious friends hid in death's dateless night, -And weep afresh love's long since cancell'd woe, -And moan the expense of many a vanish'd sight: -Then can I grieve at grievances foregone, -And heavily from woe to woe tell o'er -The sad account of fore-bemoaned moan, -Which I new pay as if not paid before. -But if the while I think on thee, dear friend, -All losses are restored and sorrows end. -Thy bosom is endeared with all hearts, -Which I by lacking have supposed dead, -And there reigns love and all love's loving parts, -And all those friends which I thought buried. -How many a holy and obsequious tear -Hath dear religious love stol'n from mine eye -As interest of the dead, which now appear -But things removed that hidden in thee lie! -Thou art the grave where buried love doth live, -Hung with the trophies of my lovers gone, -Who all their parts of me to thee did give; -That due of many now is thine alone: -Their images I loved I view in thee, -And thou, all they, hast all the all of me. -If thou survive my well-contented day, -When that churl Death my bones with dust shall cover, -And shalt by fortune once more re-survey -These poor rude lines of thy deceased lover, -Compare them with the bettering of the time, -And though they be outstripp'd by every pen, -Reserve them for my love, not for their rhyme, -Exceeded by the height of happier men. -O, then vouchsafe me but this loving thought: -'Had my friend's Muse grown with this growing age, -A dearer birth than this his love had brought, -To march in ranks of better equipage: -But since he died and poets better prove, -Theirs for their style I'll read, his for his love.' 
-Full many a glorious morning have I seen -Flatter the mountain-tops with sovereign eye, -Kissing with golden face the meadows green, -Gilding pale streams with heavenly alchemy; -Anon permit the basest clouds to ride -With ugly rack on his celestial face, -And from the forlorn world his visage hide, -Stealing unseen to west with this disgrace: -Even so my sun one early morn did shine -With all triumphant splendor on my brow; -But out, alack! he was but one hour mine; -The region cloud hath mask'd him from me now. -Yet him for this my love no whit disdaineth; -Suns of the world may stain when heaven's sun staineth. -Why didst thou promise such a beauteous day, -And make me travel forth without my cloak, -To let base clouds o'ertake me in my way, -Hiding thy bravery in their rotten smoke? -'Tis not enough that through the cloud thou break, -To dry the rain on my storm-beaten face, -For no man well of such a salve can speak -That heals the wound and cures not the disgrace: -Nor can thy shame give physic to my grief; -Though thou repent, yet I have still the loss: -The offender's sorrow lends but weak relief -To him that bears the strong offence's cross. -Ah! but those tears are pearl which thy love sheds, -And they are rich and ransom all ill deeds. -No more be grieved at that which thou hast done: -Roses have thorns, and silver fountains mud; -Clouds and eclipses stain both moon and sun, -And loathsome canker lives in sweetest bud. -All men make faults, and even I in this, -Authorizing thy trespass with compare, -Myself corrupting, salving thy amiss, -Excusing thy sins more than thy sins are; -For to thy sensual fault I bring in sense-- -Thy adverse party is thy advocate-- -And 'gainst myself a lawful plea commence: -Such civil war is in my love and hate -That I an accessary needs must be -To that sweet thief which sourly robs from me. 
-Let me confess that we two must be twain, -Although our undivided loves are one: -So shall those blots that do with me remain -Without thy help by me be borne alone. -In our two loves there is but one respect, -Though in our lives a separable spite, -Which though it alter not love's sole effect, -Yet doth it steal sweet hours from love's delight. -I may not evermore acknowledge thee, -Lest my bewailed guilt should do thee shame, -Nor thou with public kindness honour me, -Unless thou take that honour from thy name: -But do not so; I love thee in such sort -As, thou being mine, mine is thy good report. -As a decrepit father takes delight -To see his active child do deeds of youth, -So I, made lame by fortune's dearest spite, -Take all my comfort of thy worth and truth. -For whether beauty, birth, or wealth, or wit, -Or any of these all, or all, or more, -Entitled in thy parts do crowned sit, -I make my love engrafted to this store: -So then I am not lame, poor, nor despised, -Whilst that this shadow doth such substance give -That I in thy abundance am sufficed -And by a part of all thy glory live. -Look, what is best, that best I wish in thee: -This wish I have; then ten times happy me!FROM fairest creatures we desire increase, -That thereby beauty's rose might never die, -But as the riper should by time decease, -His tender heir might bear his memory: -But thou, contracted to thine own bright eyes, -Feed'st thy light'st flame with self-substantial fuel, -Making a famine where abundance lies, -Thyself thy foe, to thy sweet self too cruel. -Thou that art now the world's fresh ornament -And only herald to the gaudy spring, -Within thine own bud buriest thy content -And, tender churl, makest waste in niggarding. -Pity the world, or else this glutton be, -To eat the world's due, by the grave and thee. 
-When forty winters shall beseige thy brow, -And dig deep trenches in thy beauty's field, -Thy youth's proud livery, so gazed on now, -Will be a tatter'd weed, of small worth held: -Then being ask'd where all thy beauty lies, -Where all the treasure of thy lusty days, -To say, within thine own deep-sunken eyes, -Were an all-eating shame and thriftless praise. -How much more praise deserved thy beauty's use, -If thou couldst answer 'This fair child of mine -Shall sum my count and make my old excuse,' -Proving his beauty by succession thine! -This were to be new made when thou art old, -And see thy blood warm when thou feel'st it cold. -Look in thy glass, and tell the face thou viewest -Now is the time that face should form another; -Whose fresh repair if now thou not renewest, -Thou dost beguile the world, unbless some mother. -For where is she so fair whose unear'd womb -Disdains the tillage of thy husbandry? -Or who is he so fond will be the tomb -Of his self-love, to stop posterity? -Thou art thy mother's glass, and she in thee -Calls back the lovely April of her prime: -So thou through windows of thine age shall see -Despite of wrinkles this thy golden time. -But if thou live, remember'd not to be, -Die single, and thine image dies with thee. -Unthrifty loveliness, why dost thou spend -Upon thyself thy beauty's legacy? -Nature's bequest gives nothing but doth lend, -And being frank she lends to those are free. -Then, beauteous niggard, why dost thou abuse -The bounteous largess given thee to give? -Profitless usurer, why dost thou use -So great a sum of sums, yet canst not live? -For having traffic with thyself alone, -Thou of thyself thy sweet self dost deceive. -Then how, when nature calls thee to be gone, -What acceptable audit canst thou leave? -Thy unused beauty must be tomb'd with thee, -Which, used, lives th' executor to be. 
-Those hours, that with gentle work did frame -The lovely gaze where every eye doth dwell, -Will play the tyrants to the very same -And that unfair which fairly doth excel: -For never-resting time leads summer on -To hideous winter and confounds him there; -Sap cheque'd with frost and lusty leaves quite gone, -Beauty o'ersnow'd and bareness every where: -Then, were not summer's distillation left, -A liquid prisoner pent in walls of glass, -Beauty's effect with beauty were bereft, -Nor it nor no remembrance what it was: -But flowers distill'd though they with winter meet, -Leese but their show; their substance still lives sweet. -Then let not winter's ragged hand deface -In thee thy summer, ere thou be distill'd: -Make sweet some vial; treasure thou some place -With beauty's treasure, ere it be self-kill'd. -That use is not forbidden usury, -Which happies those that pay the willing loan; -That's for thyself to breed another thee, -Or ten times happier, be it ten for one; -Ten times thyself were happier than thou art, -If ten of thine ten times refigured thee: -Then what could death do, if thou shouldst depart, -Leaving thee living in posterity? -Be not self-will'd, for thou art much too fair -To be death's conquest and make worms thine heir. -Lo! in the orient when the gracious light -Lifts up his burning head, each under eye -Doth homage to his new-appearing sight, -Serving with looks his sacred majesty; -And having climb'd the steep-up heavenly hill, -Resembling strong youth in his middle age, -yet mortal looks adore his beauty still, -Attending on his golden pilgrimage; -But when from highmost pitch, with weary car, -Like feeble age, he reeleth from the day, -The eyes, 'fore duteous, now converted are -From his low tract and look another way: -So thou, thyself out-going in thy noon, -Unlook'd on diest, unless thou get a son. -Music to hear, why hear'st thou music sadly? -Sweets with sweets war not, joy delights in joy. 
-Why lovest thou that which thou receivest not gladly, -Or else receivest with pleasure thine annoy? -If the true concord of well-tuned sounds, -By unions married, do offend thine ear, -They do but sweetly chide thee, who confounds -In singleness the parts that thou shouldst bear. -Mark how one string, sweet husband to another, -Strikes each in each by mutual ordering, -Resembling sire and child and happy mother -Who all in one, one pleasing note do sing: -Whose speechless song, being many, seeming one, -Sings this to thee: 'thou single wilt prove none.' -Is it for fear to wet a widow's eye -That thou consumest thyself in single life? -Ah! if thou issueless shalt hap to die. -The world will wail thee, like a makeless wife; -The world will be thy widow and still weep -That thou no form of thee hast left behind, -When every private widow well may keep -By children's eyes her husband's shape in mind. -Look, what an unthrift in the world doth spend -Shifts but his place, for still the world enjoys it; -But beauty's waste hath in the world an end, -And kept unused, the user so destroys it. -No love toward others in that bosom sits -That on himself such murderous shame commits. -For shame! deny that thou bear'st love to any, -Who for thyself art so unprovident. -Grant, if thou wilt, thou art beloved of many, -But that thou none lovest is most evident; -For thou art so possess'd with murderous hate -That 'gainst thyself thou stick'st not to conspire. -Seeking that beauteous roof to ruinate -Which to repair should be thy chief desire. -O, change thy thought, that I may change my mind! -Shall hate be fairer lodged than gentle love? -Be, as thy presence is, gracious and kind, -Or to thyself at least kind-hearted prove: -Make thee another self, for love of me, -That beauty still may live in thine or thee. 
-As fast as thou shalt wane, so fast thou growest -In one of thine, from that which thou departest; -And that fresh blood which youngly thou bestowest -Thou mayst call thine when thou from youth convertest. -Herein lives wisdom, beauty and increase: -Without this, folly, age and cold decay: -If all were minded so, the times should cease -And threescore year would make the world away. -Let those whom Nature hath not made for store, -Harsh featureless and rude, barrenly perish: -Look, whom she best endow'd she gave the more; -Which bounteous gift thou shouldst in bounty cherish: -She carved thee for her seal, and meant thereby -Thou shouldst print more, not let that copy die. -When I do count the clock that tells the time, -And see the brave day sunk in hideous night; -When I behold the violet past prime, -And sable curls all silver'd o'er with white; -When lofty trees I see barren of leaves -Which erst from heat did canopy the herd, -And summer's green all girded up in sheaves -Borne on the bier with white and bristly beard, -Then of thy beauty do I question make, -That thou among the wastes of time must go, -Since sweets and beauties do themselves forsake -And die as fast as they see others grow; -And nothing 'gainst Time's scythe can make defence -Save breed, to brave him when he takes thee hence. -O, that you were yourself! but, love, you are -No longer yours than you yourself here live: -Against this coming end you should prepare, -And your sweet semblance to some other give. -So should that beauty which you hold in lease -Find no determination: then you were -Yourself again after yourself's decease, -When your sweet issue your sweet form should bear. -Who lets so fair a house fall to decay, -Which husbandry in honour might uphold -Against the stormy gusts of winter's day -And barren rage of death's eternal cold? -O, none but unthrifts! Dear my love, you know -You had a father: let your son say so. 
-Not from the stars do I my judgment pluck; -And yet methinks I have astronomy, -But not to tell of good or evil luck, -Of plagues, of dearths, or seasons' quality; -Nor can I fortune to brief minutes tell, -Pointing to each his thunder, rain and wind, -Or say with princes if it shall go well, -By oft predict that I in heaven find: -But from thine eyes my knowledge I derive, -And, constant stars, in them I read such art -As truth and beauty shall together thrive, -If from thyself to store thou wouldst convert; -Or else of thee this I prognosticate: -Thy end is truth's and beauty's doom and date. -When I consider every thing that grows -Holds in perfection but a little moment, -That this huge stage presenteth nought but shows -Whereon the stars in secret influence comment; -When I perceive that men as plants increase, -Cheered and cheque'd even by the self-same sky, -Vaunt in their youthful sap, at height decrease, -And wear their brave state out of memory; -Then the conceit of this inconstant stay -Sets you most rich in youth before my sight, -Where wasteful Time debateth with Decay, -To change your day of youth to sullied night; -And all in war with Time for love of you, -As he takes from you, I engraft you new. -But wherefore do not you a mightier way -Make war upon this bloody tyrant, Time? -And fortify yourself in your decay -With means more blessed than my barren rhyme? -Now stand you on the top of happy hours, -And many maiden gardens yet unset -With virtuous wish would bear your living flowers, -Much liker than your painted counterfeit: -So should the lines of life that life repair, -Which this, Time's pencil, or my pupil pen, -Neither in inward worth nor outward fair, -Can make you live yourself in eyes of men. -To give away yourself keeps yourself still, -And you must live, drawn by your own sweet skill. -Who will believe my verse in time to come, -If it were fill'd with your most high deserts? 
-Though yet, heaven knows, it is but as a tomb -Which hides your life and shows not half your parts. -If I could write the beauty of your eyes -And in fresh numbers number all your graces, -The age to come would say 'This poet lies: -Such heavenly touches ne'er touch'd earthly faces.' -So should my papers yellow'd with their age -Be scorn'd like old men of less truth than tongue, -And your true rights be term'd a poet's rage -And stretched metre of an antique song: -But were some child of yours alive that time, -You should live twice; in it and in my rhyme. -Shall I compare thee to a summer's day? -Thou art more lovely and more temperate: -Rough winds do shake the darling buds of May, -And summer's lease hath all too short a date: -Sometime too hot the eye of heaven shines, -And often is his gold complexion dimm'd; -And every fair from fair sometime declines, -By chance or nature's changing course untrimm'd; -But thy eternal summer shall not fade -Nor lose possession of that fair thou owest; -Nor shall Death brag thou wander'st in his shade, -When in eternal lines to time thou growest: -So long as men can breathe or eyes can see, -So long lives this and this gives life to thee. -Devouring Time, blunt thou the lion's paws, -And make the earth devour her own sweet brood; -Pluck the keen teeth from the fierce tiger's jaws, -And burn the long-lived phoenix in her blood; -Make glad and sorry seasons as thou fleets, -And do whate'er thou wilt, swift-footed Time, -To the wide world and all her fading sweets; -But I forbid thee one most heinous crime: -O, carve not with thy hours my love's fair brow, -Nor draw no lines there with thine antique pen; -Him in thy course untainted do allow -For beauty's pattern to succeeding men. -Yet, do thy worst, old Time: despite thy wrong, -My love shall in my verse ever live young. 
-A woman's face with Nature's own hand painted -Hast thou, the master-mistress of my passion; -A woman's gentle heart, but not acquainted -With shifting change, as is false women's fashion; -An eye more bright than theirs, less false in rolling, -Gilding the object whereupon it gazeth; -A man in hue, all 'hues' in his controlling, -Much steals men's eyes and women's souls amazeth. -And for a woman wert thou first created; -Till Nature, as she wrought thee, fell a-doting, -And by addition me of thee defeated, -By adding one thing to my purpose nothing. -But since she prick'd thee out for women's pleasure, -Mine be thy love and thy love's use their treasure. -So is it not with me as with that Muse -Stirr'd by a painted beauty to his verse, -Who heaven itself for ornament doth use -And every fair with his fair doth rehearse -Making a couplement of proud compare, -With sun and moon, with earth and sea's rich gems, -With April's first-born flowers, and all things rare -That heaven's air in this huge rondure hems. -O' let me, true in love, but truly write, -And then believe me, my love is as fair -As any mother's child, though not so bright -As those gold candles fix'd in heaven's air: -Let them say more than like of hearsay well; -I will not praise that purpose not to sell. -My glass shall not persuade me I am old, -So long as youth and thou are of one date; -But when in thee time's furrows I behold, -Then look I death my days should expiate. -For all that beauty that doth cover thee -Is but the seemly raiment of my heart, -Which in thy breast doth live, as thine in me: -How can I then be elder than thou art? -O, therefore, love, be of thyself so wary -As I, not for myself, but for thee will; -Bearing thy heart, which I will keep so chary -As tender nurse her babe from faring ill. -Presume not on thy heart when mine is slain; -Thou gavest me thine, not to give back again. 
-As an unperfect actor on the stage -Who with his fear is put besides his part, -Or some fierce thing replete with too much rage, -Whose strength's abundance weakens his own heart. -So I, for fear of trust, forget to say -The perfect ceremony of love's rite, -And in mine own love's strength seem to decay, -O'ercharged with burden of mine own love's might. -O, let my books be then the eloquence -And dumb presagers of my speaking breast, -Who plead for love and look for recompense -More than that tongue that more hath more express'd. -O, learn to read what silent love hath writ: -To hear with eyes belongs to love's fine wit. -Mine eye hath play'd the painter and hath stell'd -Thy beauty's form in table of my heart; -My body is the frame wherein 'tis held, -And perspective it is the painter's art. -For through the painter must you see his skill, -To find where your true image pictured lies; -Which in my bosom's shop is hanging still, -That hath his windows glazed with thine eyes. -Now see what good turns eyes for eyes have done: -Mine eyes have drawn thy shape, and thine for me -Are windows to my breast, where-through the sun -Delights to peep, to gaze therein on thee; -Yet eyes this cunning want to grace their art; -They draw but what they see, know not the heart. -Let those who are in favour with their stars -Of public honour and proud titles boast, -Whilst I, whom fortune of such triumph bars, -Unlook'd for joy in that I honour most. -Great princes' favourites their fair leaves spread -But as the marigold at the sun's eye, -And in themselves their pride lies buried, -For at a frown they in their glory die. -The painful warrior famoused for fight, -After a thousand victories once foil'd, -Is from the book of honour razed quite, -And all the rest forgot for which he toil'd: -Then happy I, that love and am beloved -Where I may not remove nor be removed. 
-Lord of my love, to whom in vassalage -Thy merit hath my duty strongly knit, -To thee I send this written embassage, -To witness duty, not to show my wit: -Duty so great, which wit so poor as mine -May make seem bare, in wanting words to show it, -But that I hope some good conceit of thine -In thy soul's thought, all naked, will bestow it; -Till whatsoever star that guides my moving -Points on me graciously with fair aspect -And puts apparel on my tatter'd loving, -To show me worthy of thy sweet respect: -Then may I dare to boast how I do love thee; -Till then not show my head where thou mayst prove me. -Weary with toil, I haste me to my bed, -The dear repose for limbs with travel tired; -But then begins a journey in my head, -To work my mind, when body's work's expired: -For then my thoughts, from far where I abide, -Intend a zealous pilgrimage to thee, -And keep my drooping eyelids open wide, -Looking on darkness which the blind do see -Save that my soul's imaginary sight -Presents thy shadow to my sightless view, -Which, like a jewel hung in ghastly night, -Makes black night beauteous and her old face new. -Lo! thus, by day my limbs, by night my mind, -For thee and for myself no quiet find. -How can I then return in happy plight, -That am debarr'd the benefit of rest? -When day's oppression is not eased by night, -But day by night, and night by day, oppress'd? -And each, though enemies to either's reign, -Do in consent shake hands to torture me; -The one by toil, the other to complain -How far I toil, still farther off from thee. -I tell the day, to please them thou art bright -And dost him grace when clouds do blot the heaven: -So flatter I the swart-complexion'd night, -When sparkling stars twire not thou gild'st the even. -But day doth daily draw my sorrows longer -And night doth nightly make grief's strength seem stronger. 
-When, in disgrace with fortune and men's eyes, -I all alone beweep my outcast state -And trouble deal heaven with my bootless cries -And look upon myself and curse my fate, -Wishing me like to one more rich in hope, -Featured like him, like him with friends possess'd, -Desiring this man's art and that man's scope, -With what I most enjoy contented least; -Yet in these thoughts myself almost despising, -Haply I think on thee, and then my state, -Like to the lark at break of day arising -From sullen earth, sings hymns at heaven's gate; -For thy sweet love remember'd such wealth brings -That then I scorn to change my state with kings. -When to the sessions of sweet silent thought -I summon up remembrance of things past, -I sigh the lack of many a thing I sought, -And with old woes new wail my dear time's waste: -Then can I drown an eye, unused to flow, -For precious friends hid in death's dateless night, -And weep afresh love's long since cancell'd woe, -And moan the expense of many a vanish'd sight: -Then can I grieve at grievances foregone, -And heavily from woe to woe tell o'er -The sad account of fore-bemoaned moan, -Which I new pay as if not paid before. -But if the while I think on thee, dear friend, -All losses are restored and sorrows end. -Thy bosom is endeared with all hearts, -Which I by lacking have supposed dead, -And there reigns love and all love's loving parts, -And all those friends which I thought buried. -How many a holy and obsequious tear -Hath dear religious love stol'n from mine eye -As interest of the dead, which now appear -But things removed that hidden in thee lie! -Thou art the grave where buried love doth live, -Hung with the trophies of my lovers gone, -Who all their parts of me to thee did give; -That due of many now is thine alone: -Their images I loved I view in thee, -And thou, all they, hast all the all of me. 
-If thou survive my well-contented day, -When that churl Death my bones with dust shall cover, -And shalt by fortune once more re-survey -These poor rude lines of thy deceased lover, -Compare them with the bettering of the time, -And though they be outstripp'd by every pen, -Reserve them for my love, not for their rhyme, -Exceeded by the height of happier men. -O, then vouchsafe me but this loving thought: -'Had my friend's Muse grown with this growing age, -A dearer birth than this his love had brought, -To march in ranks of better equipage: -But since he died and poets better prove, -Theirs for their style I'll read, his for his love.' -Full many a glorious morning have I seen -Flatter the mountain-tops with sovereign eye, -Kissing with golden face the meadows green, -Gilding pale streams with heavenly alchemy; -Anon permit the basest clouds to ride -With ugly rack on his celestial face, -And from the forlorn world his visage hide, -Stealing unseen to west with this disgrace: -Even so my sun one early morn did shine -With all triumphant splendor on my brow; -But out, alack! he was but one hour mine; -The region cloud hath mask'd him from me now. -Yet him for this my love no whit disdaineth; -Suns of the world may stain when heaven's sun staineth. -Why didst thou promise such a beauteous day, -And make me travel forth without my cloak, -To let base clouds o'ertake me in my way, -Hiding thy bravery in their rotten smoke? -'Tis not enough that through the cloud thou break, -To dry the rain on my storm-beaten face, -For no man well of such a salve can speak -That heals the wound and cures not the disgrace: -Nor can thy shame give physic to my grief; -Though thou repent, yet I have still the loss: -The offender's sorrow lends but weak relief -To him that bears the strong offence's cross. -Ah! but those tears are pearl which thy love sheds, -And they are rich and ransom all ill deeds. 
-No more be grieved at that which thou hast done: -Roses have thorns, and silver fountains mud; -Clouds and eclipses stain both moon and sun, -And loathsome canker lives in sweetest bud. -All men make faults, and even I in this, -Authorizing thy trespass with compare, -Myself corrupting, salving thy amiss, -Excusing thy sins more than thy sins are; -For to thy sensual fault I bring in sense-- -Thy adverse party is thy advocate-- -And 'gainst myself a lawful plea commence: -Such civil war is in my love and hate -That I an accessary needs must be -To that sweet thief which sourly robs from me. -Let me confess that we two must be twain, -Although our undivided loves are one: -So shall those blots that do with me remain -Without thy help by me be borne alone. -In our two loves there is but one respect, -Though in our lives a separable spite, -Which though it alter not love's sole effect, -Yet doth it steal sweet hours from love's delight. -I may not evermore acknowledge thee, -Lest my bewailed guilt should do thee shame, -Nor thou with public kindness honour me, -Unless thou take that honour from thy name: -But do not so; I love thee in such sort -As, thou being mine, mine is thy good report. -As a decrepit father takes delight -To see his active child do deeds of youth, -So I, made lame by fortune's dearest spite, -Take all my comfort of thy worth and truth. -For whether beauty, birth, or wealth, or wit, -Or any of these all, or all, or more, -Entitled in thy parts do crowned sit, -I make my love engrafted to this store: -So then I am not lame, poor, nor despised, -Whilst that this shadow doth such substance give -That I in thy abundance am sufficed -And by a part of all thy glory live. 
-Look, what is best, that best I wish in thee: -This wish I have; then ten times happy me!FROM fairest creatures we desire increase, -That thereby beauty's rose might never die, -But as the riper should by time decease, -His tender heir might bear his memory: -But thou, contracted to thine own bright eyes, -Feed'st thy light'st flame with self-substantial fuel, -Making a famine where abundance lies, -Thyself thy foe, to thy sweet self too cruel. -Thou that art now the world's fresh ornament -And only herald to the gaudy spring, -Within thine own bud buriest thy content -And, tender churl, makest waste in niggarding. -Pity the world, or else this glutton be, -To eat the world's due, by the grave and thee. -When forty winters shall beseige thy brow, -And dig deep trenches in thy beauty's field, -Thy youth's proud livery, so gazed on now, -Will be a tatter'd weed, of small worth held: -Then being ask'd where all thy beauty lies, -Where all the treasure of thy lusty days, -To say, within thine own deep-sunken eyes, -Were an all-eating shame and thriftless praise. -How much more praise deserved thy beauty's use, -If thou couldst answer 'This fair child of mine -Shall sum my count and make my old excuse,' -Proving his beauty by succession thine! -This were to be new made when thou art old, -And see thy blood warm when thou feel'st it cold. -Look in thy glass, and tell the face thou viewest -Now is the time that face should form another; -Whose fresh repair if now thou not renewest, -Thou dost beguile the world, unbless some mother. -For where is she so fair whose unear'd womb -Disdains the tillage of thy husbandry? -Or who is he so fond will be the tomb -Of his self-love, to stop posterity? -Thou art thy mother's glass, and she in thee -Calls back the lovely April of her prime: -So thou through windows of thine age shall see -Despite of wrinkles this thy golden time. -But if thou live, remember'd not to be, -Die single, and thine image dies with thee. 
-Unthrifty loveliness, why dost thou spend -Upon thyself thy beauty's legacy? -Nature's bequest gives nothing but doth lend, -And being frank she lends to those are free. -Then, beauteous niggard, why dost thou abuse -The bounteous largess given thee to give? -Profitless usurer, why dost thou use -So great a sum of sums, yet canst not live? -For having traffic with thyself alone, -Thou of thyself thy sweet self dost deceive. -Then how, when nature calls thee to be gone, -What acceptable audit canst thou leave? -Thy unused beauty must be tomb'd with thee, -Which, used, lives th' executor to be. -Those hours, that with gentle work did frame -The lovely gaze where every eye doth dwell, -Will play the tyrants to the very same -And that unfair which fairly doth excel: -For never-resting time leads summer on -To hideous winter and confounds him there; -Sap cheque'd with frost and lusty leaves quite gone, -Beauty o'ersnow'd and bareness every where: -Then, were not summer's distillation left, -A liquid prisoner pent in walls of glass, -Beauty's effect with beauty were bereft, -Nor it nor no remembrance what it was: -But flowers distill'd though they with winter meet, -Leese but their show; their substance still lives sweet. -Then let not winter's ragged hand deface -In thee thy summer, ere thou be distill'd: -Make sweet some vial; treasure thou some place -With beauty's treasure, ere it be self-kill'd. -That use is not forbidden usury, -Which happies those that pay the willing loan; -That's for thyself to breed another thee, -Or ten times happier, be it ten for one; -Ten times thyself were happier than thou art, -If ten of thine ten times refigured thee: -Then what could death do, if thou shouldst depart, -Leaving thee living in posterity? -Be not self-will'd, for thou art much too fair -To be death's conquest and make worms thine heir. -Lo! 
in the orient when the gracious light -Lifts up his burning head, each under eye -Doth homage to his new-appearing sight, -Serving with looks his sacred majesty; -And having climb'd the steep-up heavenly hill, -Resembling strong youth in his middle age, -yet mortal looks adore his beauty still, -Attending on his golden pilgrimage; -But when from highmost pitch, with weary car, -Like feeble age, he reeleth from the day, -The eyes, 'fore duteous, now converted are -From his low tract and look another way: -So thou, thyself out-going in thy noon, -Unlook'd on diest, unless thou get a son. -Music to hear, why hear'st thou music sadly? -Sweets with sweets war not, joy delights in joy. -Why lovest thou that which thou receivest not gladly, -Or else receivest with pleasure thine annoy? -If the true concord of well-tuned sounds, -By unions married, do offend thine ear, -They do but sweetly chide thee, who confounds -In singleness the parts that thou shouldst bear. -Mark how one string, sweet husband to another, -Strikes each in each by mutual ordering, -Resembling sire and child and happy mother -Who all in one, one pleasing note do sing: -Whose speechless song, being many, seeming one, -Sings this to thee: 'thou single wilt prove none.' -Is it for fear to wet a widow's eye -That thou consumest thyself in single life? -Ah! if thou issueless shalt hap to die. -The world will wail thee, like a makeless wife; -The world will be thy widow and still weep -That thou no form of thee hast left behind, -When every private widow well may keep -By children's eyes her husband's shape in mind. -Look, what an unthrift in the world doth spend -Shifts but his place, for still the world enjoys it; -But beauty's waste hath in the world an end, -And kept unused, the user so destroys it. -No love toward others in that bosom sits -That on himself such murderous shame commits. -For shame! deny that thou bear'st love to any, -Who for thyself art so unprovident. 
-Grant, if thou wilt, thou art beloved of many, -But that thou none lovest is most evident; -For thou art so possess'd with murderous hate -That 'gainst thyself thou stick'st not to conspire. -Seeking that beauteous roof to ruinate -Which to repair should be thy chief desire. -O, change thy thought, that I may change my mind! -Shall hate be fairer lodged than gentle love? -Be, as thy presence is, gracious and kind, -Or to thyself at least kind-hearted prove: -Make thee another self, for love of me, -That beauty still may live in thine or thee. -As fast as thou shalt wane, so fast thou growest -In one of thine, from that which thou departest; -And that fresh blood which youngly thou bestowest -Thou mayst call thine when thou from youth convertest. -Herein lives wisdom, beauty and increase: -Without this, folly, age and cold decay: -If all were minded so, the times should cease -And threescore year would make the world away. -Let those whom Nature hath not made for store, -Harsh featureless and rude, barrenly perish: -Look, whom she best endow'd she gave the more; -Which bounteous gift thou shouldst in bounty cherish: -She carved thee for her seal, and meant thereby -Thou shouldst print more, not let that copy die. -When I do count the clock that tells the time, -And see the brave day sunk in hideous night; -When I behold the violet past prime, -And sable curls all silver'd o'er with white; -When lofty trees I see barren of leaves -Which erst from heat did canopy the herd, -And summer's green all girded up in sheaves -Borne on the bier with white and bristly beard, -Then of thy beauty do I question make, -That thou among the wastes of time must go, -Since sweets and beauties do themselves forsake -And die as fast as they see others grow; -And nothing 'gainst Time's scythe can make defence -Save breed, to brave him when he takes thee hence. -O, that you were yourself! 
but, love, you are -No longer yours than you yourself here live: -Against this coming end you should prepare, -And your sweet semblance to some other give. -So should that beauty which you hold in lease -Find no determination: then you were -Yourself again after yourself's decease, -When your sweet issue your sweet form should bear. -Who lets so fair a house fall to decay, -Which husbandry in honour might uphold -Against the stormy gusts of winter's day -And barren rage of death's eternal cold? -O, none but unthrifts! Dear my love, you know -You had a father: let your son say so. -Not from the stars do I my judgment pluck; -And yet methinks I have astronomy, -But not to tell of good or evil luck, -Of plagues, of dearths, or seasons' quality; -Nor can I fortune to brief minutes tell, -Pointing to each his thunder, rain and wind, -Or say with princes if it shall go well, -By oft predict that I in heaven find: -But from thine eyes my knowledge I derive, -And, constant stars, in them I read such art -As truth and beauty shall together thrive, -If from thyself to store thou wouldst convert; -Or else of thee this I prognosticate: -Thy end is truth's and beauty's doom and date. -When I consider every thing that grows -Holds in perfection but a little moment, -That this huge stage presenteth nought but shows -Whereon the stars in secret influence comment; -When I perceive that men as plants increase, -Cheered and cheque'd even by the self-same sky, -Vaunt in their youthful sap, at height decrease, -And wear their brave state out of memory; -Then the conceit of this inconstant stay -Sets you most rich in youth before my sight, -Where wasteful Time debateth with Decay, -To change your day of youth to sullied night; -And all in war with Time for love of you, -As he takes from you, I engraft you new. -But wherefore do not you a mightier way -Make war upon this bloody tyrant, Time? -And fortify yourself in your decay -With means more blessed than my barren rhyme? 
-Now stand you on the top of happy hours, -And many maiden gardens yet unset -With virtuous wish would bear your living flowers, -Much liker than your painted counterfeit: -So should the lines of life that life repair, -Which this, Time's pencil, or my pupil pen, -Neither in inward worth nor outward fair, -Can make you live yourself in eyes of men. -To give away yourself keeps yourself still, -And you must live, drawn by your own sweet skill. -Who will believe my verse in time to come, -If it were fill'd with your most high deserts? -Though yet, heaven knows, it is but as a tomb -Which hides your life and shows not half your parts. -If I could write the beauty of your eyes -And in fresh numbers number all your graces, -The age to come would say 'This poet lies: -Such heavenly touches ne'er touch'd earthly faces.' -So should my papers yellow'd with their age -Be scorn'd like old men of less truth than tongue, -And your true rights be term'd a poet's rage -And stretched metre of an antique song: -But were some child of yours alive that time, -You should live twice; in it and in my rhyme. -Shall I compare thee to a summer's day? -Thou art more lovely and more temperate: -Rough winds do shake the darling buds of May, -And summer's lease hath all too short a date: -Sometime too hot the eye of heaven shines, -And often is his gold complexion dimm'd; -And every fair from fair sometime declines, -By chance or nature's changing course untrimm'd; -But thy eternal summer shall not fade -Nor lose possession of that fair thou owest; -Nor shall Death brag thou wander'st in his shade, -When in eternal lines to time thou growest: -So long as men can breathe or eyes can see, -So long lives this and this gives life to thee. 
-Devouring Time, blunt thou the lion's paws, -And make the earth devour her own sweet brood; -Pluck the keen teeth from the fierce tiger's jaws, -And burn the long-lived phoenix in her blood; -Make glad and sorry seasons as thou fleets, -And do whate'er thou wilt, swift-footed Time, -To the wide world and all her fading sweets; -But I forbid thee one most heinous crime: -O, carve not with thy hours my love's fair brow, -Nor draw no lines there with thine antique pen; -Him in thy course untainted do allow -For beauty's pattern to succeeding men. -Yet, do thy worst, old Time: despite thy wrong, -My love shall in my verse ever live young. -A woman's face with Nature's own hand painted -Hast thou, the master-mistress of my passion; -A woman's gentle heart, but not acquainted -With shifting change, as is false women's fashion; -An eye more bright than theirs, less false in rolling, -Gilding the object whereupon it gazeth; -A man in hue, all 'hues' in his controlling, -Much steals men's eyes and women's souls amazeth. -And for a woman wert thou first created; -Till Nature, as she wrought thee, fell a-doting, -And by addition me of thee defeated, -By adding one thing to my purpose nothing. -But since she prick'd thee out for women's pleasure, -Mine be thy love and thy love's use their treasure. -So is it not with me as with that Muse -Stirr'd by a painted beauty to his verse, -Who heaven itself for ornament doth use -And every fair with his fair doth rehearse -Making a couplement of proud compare, -With sun and moon, with earth and sea's rich gems, -With April's first-born flowers, and all things rare -That heaven's air in this huge rondure hems. -O' let me, true in love, but truly write, -And then believe me, my love is as fair -As any mother's child, though not so bright -As those gold candles fix'd in heaven's air: -Let them say more than like of hearsay well; -I will not praise that purpose not to sell. 
-My glass shall not persuade me I am old, -So long as youth and thou are of one date; -But when in thee time's furrows I behold, -Then look I death my days should expiate. -For all that beauty that doth cover thee -Is but the seemly raiment of my heart, -Which in thy breast doth live, as thine in me: -How can I then be elder than thou art? -O, therefore, love, be of thyself so wary -As I, not for myself, but for thee will; -Bearing thy heart, which I will keep so chary -As tender nurse her babe from faring ill. -Presume not on thy heart when mine is slain; -Thou gavest me thine, not to give back again. -As an unperfect actor on the stage -Who with his fear is put besides his part, -Or some fierce thing replete with too much rage, -Whose strength's abundance weakens his own heart. -So I, for fear of trust, forget to say -The perfect ceremony of love's rite, -And in mine own love's strength seem to decay, -O'ercharged with burden of mine own love's might. -O, let my books be then the eloquence -And dumb presagers of my speaking breast, -Who plead for love and look for recompense -More than that tongue that more hath more express'd. -O, learn to read what silent love hath writ: -To hear with eyes belongs to love's fine wit. -Mine eye hath play'd the painter and hath stell'd -Thy beauty's form in table of my heart; -My body is the frame wherein 'tis held, -And perspective it is the painter's art. -For through the painter must you see his skill, -To find where your true image pictured lies; -Which in my bosom's shop is hanging still, -That hath his windows glazed with thine eyes. -Now see what good turns eyes for eyes have done: -Mine eyes have drawn thy shape, and thine for me -Are windows to my breast, where-through the sun -Delights to peep, to gaze therein on thee; -Yet eyes this cunning want to grace their art; -They draw but what they see, know not the heart. 
-Let those who are in favour with their stars -Of public honour and proud titles boast, -Whilst I, whom fortune of such triumph bars, -Unlook'd for joy in that I honour most. -Great princes' favourites their fair leaves spread -But as the marigold at the sun's eye, -And in themselves their pride lies buried, -For at a frown they in their glory die. -The painful warrior famoused for fight, -After a thousand victories once foil'd, -Is from the book of honour razed quite, -And all the rest forgot for which he toil'd: -Then happy I, that love and am beloved -Where I may not remove nor be removed. -Lord of my love, to whom in vassalage -Thy merit hath my duty strongly knit, -To thee I send this written embassage, -To witness duty, not to show my wit: -Duty so great, which wit so poor as mine -May make seem bare, in wanting words to show it, -But that I hope some good conceit of thine -In thy soul's thought, all naked, will bestow it; -Till whatsoever star that guides my moving -Points on me graciously with fair aspect -And puts apparel on my tatter'd loving, -To show me worthy of thy sweet respect: -Then may I dare to boast how I do love thee; -Till then not show my head where thou mayst prove me. -Weary with toil, I haste me to my bed, -The dear repose for limbs with travel tired; -But then begins a journey in my head, -To work my mind, when body's work's expired: -For then my thoughts, from far where I abide, -Intend a zealous pilgrimage to thee, -And keep my drooping eyelids open wide, -Looking on darkness which the blind do see -Save that my soul's imaginary sight -Presents thy shadow to my sightless view, -Which, like a jewel hung in ghastly night, -Makes black night beauteous and her old face new. -Lo! thus, by day my limbs, by night my mind, -For thee and for myself no quiet find. -How can I then return in happy plight, -That am debarr'd the benefit of rest? -When day's oppression is not eased by night, -But day by night, and night by day, oppress'd? 
-And each, though enemies to either's reign, -Do in consent shake hands to torture me; -The one by toil, the other to complain -How far I toil, still farther off from thee. -I tell the day, to please them thou art bright -And dost him grace when clouds do blot the heaven: -So flatter I the swart-complexion'd night, -When sparkling stars twire not thou gild'st the even. -But day doth daily draw my sorrows longer -And night doth nightly make grief's strength seem stronger. -When, in disgrace with fortune and men's eyes, -I all alone beweep my outcast state -And trouble deal heaven with my bootless cries -And look upon myself and curse my fate, -Wishing me like to one more rich in hope, -Featured like him, like him with friends possess'd, -Desiring this man's art and that man's scope, -With what I most enjoy contented least; -Yet in these thoughts myself almost despising, -Haply I think on thee, and then my state, -Like to the lark at break of day arising -From sullen earth, sings hymns at heaven's gate; -For thy sweet love remember'd such wealth brings -That then I scorn to change my state with kings. -When to the sessions of sweet silent thought -I summon up remembrance of things past, -I sigh the lack of many a thing I sought, -And with old woes new wail my dear time's waste: -Then can I drown an eye, unused to flow, -For precious friends hid in death's dateless night, -And weep afresh love's long since cancell'd woe, -And moan the expense of many a vanish'd sight: -Then can I grieve at grievances foregone, -And heavily from woe to woe tell o'er -The sad account of fore-bemoaned moan, -Which I new pay as if not paid before. -But if the while I think on thee, dear friend, -All losses are restored and sorrows end. -Thy bosom is endeared with all hearts, -Which I by lacking have supposed dead, -And there reigns love and all love's loving parts, -And all those friends which I thought buried. 
-How many a holy and obsequious tear -Hath dear religious love stol'n from mine eye -As interest of the dead, which now appear -But things removed that hidden in thee lie! -Thou art the grave where buried love doth live, -Hung with the trophies of my lovers gone, -Who all their parts of me to thee did give; -That due of many now is thine alone: -Their images I loved I view in thee, -And thou, all they, hast all the all of me. -If thou survive my well-contented day, -When that churl Death my bones with dust shall cover, -And shalt by fortune once more re-survey -These poor rude lines of thy deceased lover, -Compare them with the bettering of the time, -And though they be outstripp'd by every pen, -Reserve them for my love, not for their rhyme, -Exceeded by the height of happier men. -O, then vouchsafe me but this loving thought: -'Had my friend's Muse grown with this growing age, -A dearer birth than this his love had brought, -To march in ranks of better equipage: -But since he died and poets better prove, -Theirs for their style I'll read, his for his love.' -Full many a glorious morning have I seen -Flatter the mountain-tops with sovereign eye, -Kissing with golden face the meadows green, -Gilding pale streams with heavenly alchemy; -Anon permit the basest clouds to ride -With ugly rack on his celestial face, -And from the forlorn world his visage hide, -Stealing unseen to west with this disgrace: -Even so my sun one early morn did shine -With all triumphant splendor on my brow; -But out, alack! he was but one hour mine; -The region cloud hath mask'd him from me now. -Yet him for this my love no whit disdaineth; -Suns of the world may stain when heaven's sun staineth. -Why didst thou promise such a beauteous day, -And make me travel forth without my cloak, -To let base clouds o'ertake me in my way, -Hiding thy bravery in their rotten smoke? 
-'Tis not enough that through the cloud thou break, -To dry the rain on my storm-beaten face, -For no man well of such a salve can speak -That heals the wound and cures not the disgrace: -Nor can thy shame give physic to my grief; -Though thou repent, yet I have still the loss: -The offender's sorrow lends but weak relief -To him that bears the strong offence's cross. -Ah! but those tears are pearl which thy love sheds, -And they are rich and ransom all ill deeds. -No more be grieved at that which thou hast done: -Roses have thorns, and silver fountains mud; -Clouds and eclipses stain both moon and sun, -And loathsome canker lives in sweetest bud. -All men make faults, and even I in this, -Authorizing thy trespass with compare, -Myself corrupting, salving thy amiss, -Excusing thy sins more than thy sins are; -For to thy sensual fault I bring in sense-- -Thy adverse party is thy advocate-- -And 'gainst myself a lawful plea commence: -Such civil war is in my love and hate -That I an accessary needs must be -To that sweet thief which sourly robs from me. -Let me confess that we two must be twain, -Although our undivided loves are one: -So shall those blots that do with me remain -Without thy help by me be borne alone. -In our two loves there is but one respect, -Though in our lives a separable spite, -Which though it alter not love's sole effect, -Yet doth it steal sweet hours from love's delight. -I may not evermore acknowledge thee, -Lest my bewailed guilt should do thee shame, -Nor thou with public kindness honour me, -Unless thou take that honour from thy name: -But do not so; I love thee in such sort -As, thou being mine, mine is thy good report. -As a decrepit father takes delight -To see his active child do deeds of youth, -So I, made lame by fortune's dearest spite, -Take all my comfort of thy worth and truth. 
-For whether beauty, birth, or wealth, or wit, -Or any of these all, or all, or more, -Entitled in thy parts do crowned sit, -I make my love engrafted to this store: -So then I am not lame, poor, nor despised, -Whilst that this shadow doth such substance give -That I in thy abundance am sufficed -And by a part of all thy glory live. -Look, what is best, that best I wish in thee: -This wish I have; then ten times happy me! \ No newline at end of file From afd7a29ba1fd7fabb33eae9cb8b5e1961d629cf6 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 22:26:11 +0000 Subject: [PATCH 225/278] revert change in flash attn and flash infer to clean up the diff --- vllm/attention/backends/flash_attn.py | 3 --- vllm/attention/backends/flashinfer.py | 1 - 2 files changed, 4 deletions(-) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 09456ca8d7b6..bf883987bd80 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -15,9 +15,6 @@ is_block_tables_empty) from vllm.utils import async_tensor_h2d, make_tensor_with_pad -from vllm.distributed import get_disagg_group -import vllm.envs as envs - if TYPE_CHECKING: from vllm.worker.model_runner import (ModelInputForGPUBuilder, ModelInputForGPUWithSamplingMetadata) diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 852c5cd8dc18..4054d337316f 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -823,4 +823,3 @@ def forward( k_scale=k_scale, v_scale=v_scale) return output.view(num_tokens, hidden_size) - \ No newline at end of file From cbf24b34bbcf74b268beeda26550cfe5ea8639e1 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 22:35:12 +0000 Subject: [PATCH 226/278] update the example --- .../disagg_prefill/disagg_prefill_example.sh | 35 +++++++++++++++---- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git 
a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index f57f5fd86d89..56b6f44c7418 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -16,7 +16,7 @@ wait_for_server() { } # prefilling instance -VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 python3 \ +VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 python3 \ -m vllm.entrypoints.openai.api_server \ --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8100 \ @@ -24,7 +24,7 @@ VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 pytho --gpu-memory-utilization 0.8 & # decoding instance -VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ +VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8200 \ @@ -36,18 +36,39 @@ wait_for_server 8100 wait_for_server 8200 # launch a proxy server that opens the service at port 8000 +# the workflow of this proxy: +# - send the request to prefill vLLM instance (port 8100), change max_tokens to 1 +# - after the prefill vLLM finishes prefill, send the request to decode vLLM instance python3 ../../benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py & sleep 1 -# serve an example request -curl http://localhost:8000/v1/completions \ +# serve two example requests +output1=$(curl -s http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "prompt": "San Francisco is a", "max_tokens": 10, "temperature": 0 -}' +}') -# clean up -ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 \ No newline at end of file +output2=$(curl -s http://localhost:8000/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ +"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", +"prompt": "Santa Clara is 
a", +"max_tokens": 10, +"temperature": 0 +}') + +# Print the outputs of the curl requests +echo "" +echo "Output of first request: $output1" +echo "Output of second request: $output2" + +echo "Successfully finished 2 test requests!" +echo "" + +# Cleanup commands, suppressing their output +ps -e | grep pt_main_thread | awk '{print $1}' | xargs kill -9 > /dev/null 2>&1 +pkill -f python3 > /dev/null 2>&1 From 4f4ea5053abf40d304cb4281a46e27365da6ec75 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 23:42:30 +0000 Subject: [PATCH 227/278] make format checker happy --- .../disagg_prefill_proxy_server.py | 40 +-- .../disagg_benchmarks/round_robin_proxy.py | 25 +- .../visualize_benchmark_results.py | 39 ++- .../kv_transfer/kv_lookup_buffer/base.py | 19 +- .../simple_kv_lookup_buffer.py | 122 ++++----- vllm/distributed/kv_transfer/kv_pipe/base.py | 16 +- .../kv_pipe/torch_distributed_pipe.py | 76 +++--- vllm/distributed/kv_transfer/vllm_adapter.py | 252 ++++++++++-------- vllm/distributed/parallel_state.py | 33 ++- vllm/executor/gpu_executor.py | 3 +- vllm/executor/multiproc_gpu_executor.py | 3 +- vllm/executor/ray_gpu_executor.py | 2 +- vllm/worker/model_runner.py | 46 ++-- vllm/worker/worker_base.py | 87 +++--- 14 files changed, 386 insertions(+), 377 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py index 5750df7735ad..4058b1c0a3b7 100644 --- a/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py +++ b/benchmarks/disagg_benchmarks/disagg_prefill_proxy_server.py @@ -1,28 +1,31 @@ -from quart import Quart, request, Response, jsonify, make_response -import aiohttp -import sys -import traceback import os +import aiohttp +from quart import Quart, make_response, request + AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) app = Quart(__name__) + async def forward_request(url, data): - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as 
session: + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" } - async with session.post(url=url, json=data, headers=headers) as response: + async with session.post(url=url, json=data, + headers=headers) as response: if response.status == 200: # if response.headers.get('Transfer-Encoding') == 'chunked': if True: - async for chunk_bytes in response.content.iter_chunked(1024): + async for chunk_bytes in response.content.iter_chunked( + 1024): yield chunk_bytes else: content = await response.read() yield content + @app.route('/v1/completions', methods=['POST']) async def handle_request(): try: @@ -31,25 +34,28 @@ async def handle_request(): prefill_request = original_request_data.copy() # change max_tokens = 1 to let it only do prefill prefill_request['max_tokens'] = 1 - + # finish prefill - async for _ in forward_request('http://localhost:8100/v1/completions', prefill_request): + async for _ in forward_request('http://localhost:8100/v1/completions', + prefill_request): continue - print(f"Prefill done. 
proceeding to decode.") - # return decode - generator = forward_request('http://localhost:8200/v1/completions', original_request_data) + generator = forward_request('http://localhost:8200/v1/completions', + original_request_data) response = await make_response(generator) response.timeout = None return response - + except Exception as e: - pass - # exc_info = sys.exc_info() - # print(e) - # print("".join(traceback.format_exception(*exc_info))) + import sys + import traceback + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server") + print(e) + print("".join(traceback.format_exception(*exc_info))) + if __name__ == '__main__': app.run(port=8000) diff --git a/benchmarks/disagg_benchmarks/round_robin_proxy.py b/benchmarks/disagg_benchmarks/round_robin_proxy.py index 8751e24a08d3..6eb5f6398007 100644 --- a/benchmarks/disagg_benchmarks/round_robin_proxy.py +++ b/benchmarks/disagg_benchmarks/round_robin_proxy.py @@ -1,9 +1,12 @@ import asyncio +import itertools + import aiohttp from aiohttp import web -import itertools + class RoundRobinProxy: + def __init__(self, target_ports): self.target_ports = target_ports self.port_cycle = itertools.cycle(self.target_ports) @@ -16,16 +19,14 @@ async def handle_request(self, request): try: # Forward the request async with session.request( - method=request.method, - url=target_url, - headers=request.headers, - data=request.content, + method=request.method, + url=target_url, + headers=request.headers, + data=request.content, ) as response: # Start sending the response - resp = web.StreamResponse( - status=response.status, - headers=response.headers - ) + resp = web.StreamResponse(status=response.status, + headers=response.headers) await resp.prepare(request) # Stream the response content @@ -38,6 +39,7 @@ async def handle_request(self, request): except Exception as e: return web.Response(text=f"Error: {str(e)}", status=500) + async def main(): proxy = RoundRobinProxy([8100, 8200]) app = web.Application() @@ 
-49,9 +51,10 @@ async def main(): await site.start() print("Proxy server started on http://localhost:8000") - + # Keep the server running await asyncio.Event().wait() + if __name__ == '__main__': - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) diff --git a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py index 192f26a1e3cd..6c5bf5c791dc 100644 --- a/benchmarks/disagg_benchmarks/visualize_benchmark_results.py +++ b/benchmarks/disagg_benchmarks/visualize_benchmark_results.py @@ -1,40 +1,42 @@ +import json import matplotlib.pyplot as plt -import yaml import pandas as pd -import json - - if __name__ == "__main__": data = [] for name in ['disagg_prefill', 'chunked_prefill']: - for qps in [2,4,6,8]: + for qps in [2, 4, 6, 8]: with open(f"results/{name}-qps-{qps}.json", "r") as f: x = json.load(f) x['name'] = name x['qps'] = qps data.append(x) - + df = pd.DataFrame.from_dict(data) dis_df = df[df['name'] == 'disagg_prefill'] chu_df = df[df['name'] == 'chunked_prefill'] - + plt.style.use('bmh') plt.rcParams['font.size'] = 20 - - for key in ['mean_ttft_ms', - 'median_ttft_ms', - 'p99_ttft_ms', - 'mean_itl_ms', - 'median_itl_ms', - 'p99_itl_ms']: - + for key in [ + 'mean_ttft_ms', 'median_ttft_ms', 'p99_ttft_ms', 'mean_itl_ms', + 'median_itl_ms', 'p99_itl_ms' + ]: + fig, ax = plt.subplots(figsize=(11, 7)) - plt.plot(dis_df['qps'], dis_df[key], label='disagg_prefill', marker='o', linewidth=4) - plt.plot(chu_df['qps'], chu_df[key], label='chunked_prefill', marker='o', linewidth=4) + plt.plot(dis_df['qps'], + dis_df[key], + label='disagg_prefill', + marker='o', + linewidth=4) + plt.plot(chu_df['qps'], + chu_df[key], + label='chunked_prefill', + marker='o', + linewidth=4) ax.legend() ax.set_xlabel('QPS') @@ -42,6 +44,3 @@ ax.set_ylim(bottom=0) fig.savefig(f'results/{key}.png') plt.close(fig) - - - \ No newline at end of file diff --git 
a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index 733bc82bf53f..80802f87987a 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -1,21 +1,22 @@ - from abc import ABC, abstractmethod -from typing import Optional +from typing import List, Optional + import torch class KVLookupBufferBase(ABC): - + @abstractmethod - def insert(self, - input_tokens: torch.Tensor, - kv: torch.Tensor, roi) -> None: + def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, + hidden: torch.Tensor) -> None: raise NotImplementedError - + @abstractmethod - def drop_select(self, input_tokens, roi) -> Optional[torch.Tensor]: + def drop_select(self, input_tokens: torch.Tensor, + roi: torch.Tensor) -> List[Optional[torch.Tensor]]: raise NotImplementedError - + @abstractmethod def close(self): """ diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py index 6172bf092fb0..9696032002fd 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py @@ -1,22 +1,21 @@ - -from vllm.distributed.kv_transfer.kv_lookup_buffer.base import \ - KVLookupBufferBase -from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase -from typing import Dict, Tuple, List, Optional, Union import threading -import torch -from collections import deque import time +from collections import deque +from typing import Deque, List, Optional, Union + +import torch +from vllm.distributed.kv_transfer.kv_lookup_buffer.base import ( + KVLookupBufferBase) +from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase from vllm.logger import init_logger logger = init_logger(__name__) + class SimpleKVLookupBuffer(KVLookupBufferBase): 
- - def __init__(self, - signal_pipe: KVPipeBase, - data_pipe: KVPipeBase, + + def __init__(self, signal_pipe: KVPipeBase, data_pipe: KVPipeBase, buffer_size_thresh: int): """ signal_pipe: on CPU @@ -28,72 +27,66 @@ def __init__(self, data_pipe: on device (e.g. GPU) """ - - self.buffer = deque() - + + self.buffer: Deque[List[torch.Tensor]] = deque() + self.buffer_size = 0 self.buffer_size_threshold = buffer_size_thresh self.buffer_lock = threading.Lock() self.signal_pipe = signal_pipe self.data_pipe = data_pipe - self.request_handling_thread = None + self.request_handling_thread: Optional[threading.Thread] = None self.normal_signal = torch.tensor([0]) self.end_signal = None - - def _matches(self, - tokens_roi_sender: List[torch.Tensor], + def _matches(self, tokens_roi_sender: List[torch.Tensor], tokens_roi_recver: List[torch.Tensor]): # tokens_roi_sender: tokens and roi of the producer (in the buffer) # tokens_roi_recver: tokens and roi of the consumer (query) - + tokens_sender = tokens_roi_sender[0] tokens_recver = tokens_roi_recver[0] roi_sender = tokens_roi_sender[1] roi_recver = tokens_roi_recver[1] - + if tokens_recver is None: # consumer sends an empty request # semantics: DROP SELECT * LIMIT 1 # so any of the data in the buffer can be drop-selected return True - # Assuming that roi is a mask on tokens tokens_sender = tokens_sender[roi_sender] tokens_recver = tokens_recver[roi_recver] - - + # simple common prefix matching min_length = min(len(tokens_sender), len(tokens_recver)) - if torch.allclose(tokens_sender[:min_length], tokens_recver[:min_length]): + if torch.allclose(tokens_sender[:min_length], + tokens_recver[:min_length]): return min_length - + return 0 - - def _send_tensor_and_dec_size(self, tensor: Optional[torch.Tensor]) -> None: + def _send_tensor_and_dec_size(self, + tensor: Optional[torch.Tensor]) -> None: assert tensor is not None, "Use self.data_pipe.send(None) instead" self.buffer_size -= tensor.element_size() * tensor.numel() 
self.data_pipe.send_tensor(tensor) def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]): - + if data == [] or data is None: return 0 if isinstance(data, torch.Tensor): return data.element_size() * data.numel() + else: + raise AssertionError("Unknown data type %s" % type(data)) - assert False, "Unknown data type %s" % type(data) - - def _add_to_buffer(self, - input_tokens: torch.Tensor, - roi: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, + def _add_to_buffer(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, hidden: torch.Tensor): if isinstance(input_tokens, torch.Tensor): @@ -107,21 +100,20 @@ def _add_to_buffer(self, if isinstance(hidden, torch.Tensor): hidden = hidden.clone() - buffer_item = [input_tokens, roi, key, value, hidden] - + with self.buffer_lock: for data in buffer_item: self.buffer_size += self._get_element_size(data) self.buffer.append(buffer_item) - + def _is_end_signal(self, signal): return signal is None - + def drop_select_handler(self): try: - + while True: signal = self.signal_pipe.recv_tensor() if self._is_end_signal(signal): @@ -132,28 +124,29 @@ def drop_select_handler(self): roi = self.data_pipe.recv_tensor() tokens_roi_recver = [input_tokens, roi] - + matched_length = 0 - + # perform input tokens and roi matching with self.buffer_lock: for _ in range(len(self.buffer)): - - temp_length = self._matches(self.buffer[0], tokens_roi_recver) + + temp_length = self._matches(self.buffer[0], + tokens_roi_recver) if temp_length > 0: matched_length = temp_length break # rotate the element we just accessed to the end self.buffer.rotate(-1) - + if matched_length > 0: # need to clone the tensor # in case the tensor is freed before sending finishes matched_item = self.buffer.popleft() for tensor in matched_item: self._send_tensor_and_dec_size(tensor) - + else: # no match, just send None for _ in range(5): @@ -164,60 +157,57 @@ def drop_select_handler(self): raise e 
logger.debug("Closing drop_select_handler") - - - def drop_select(self, - input_tokens: torch.Tensor, - roi: torch.Tensor): - + + def drop_select(self, input_tokens: torch.Tensor, + roi: torch.Tensor) -> List[Optional[torch.Tensor]]: + assert self.request_handling_thread is None, \ "drop_select should be called by the receiver" - if isinstance(input_tokens, torch.Tensor): input_tokens = input_tokens.clone() if isinstance(roi, torch.Tensor): roi = roi.clone() - + self.signal_pipe.send_tensor(self.normal_signal) self.data_pipe.send_tensor(input_tokens) self.data_pipe.send_tensor(roi) - + input_tokens = self.data_pipe.recv_tensor() roi = self.data_pipe.recv_tensor() key = self.data_pipe.recv_tensor() value = self.data_pipe.recv_tensor() hidden = self.data_pipe.recv_tensor() - + return [input_tokens, roi, key, value, hidden] - def full_handler(self): time.sleep(0.001) - - - def insert(self, input_tokens, roi, key, value, hidden) -> None: + + def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, + key: torch.Tensor, value: torch.Tensor, + hidden: torch.Tensor) -> None: while self.buffer_size > self.buffer_size_threshold: # logger.debug("KV transfer buffer is full. Handling...") self.full_handler() - self._add_to_buffer(input_tokens, roi, key, value, hidden) - + # when calling the insert, the current process is a sender # need to launch the request handler and start listening to request. 
if self.request_handling_thread is None: self.request_handling_thread = threading.Thread( target=self.drop_select_handler) self.request_handling_thread.start() - - + def close(self): - if hasattr(self, "request_handling_thread") and self.request_handling_thread is not None: + if hasattr(self, "request_handling_thread" + ) and self.request_handling_thread is not None: self.request_handling_thread.join() else: - # TODO: have a explicit close signal and have a explicit way to check if it's requester + # TODO: have a explicit close signal and have a explicit way to + # check if it's requester self.signal_pipe.send_tensor(self.end_signal) diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py index 7662a5893ceb..0955b4e83889 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/base.py +++ b/vllm/distributed/kv_transfer/kv_pipe/base.py @@ -1,15 +1,17 @@ - from abc import ABC, abstractmethod +from typing import Optional + +import torch class KVPipeBase(ABC): - - @abstractmethod - def send_tensor(self, tensor): + + @abstractmethod + def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: raise NotImplementedError - - @abstractmethod - def recv_tensor(self): + + @abstractmethod + def recv_tensor(self) -> Optional[torch.Tensor]: raise NotImplementedError @abstractmethod diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index 3a6a94bb0e75..911bce88a38f 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -1,15 +1,16 @@ -from torch.distributed import Backend -import torch -from typing import List, Optional, Union import threading -from concurrent.futures import ThreadPoolExecutor import time +from concurrent.futures import ThreadPoolExecutor +from typing import List, Optional, Union +import torch +from torch.distributed import Backend + 
+from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase from vllm.logger import init_logger logger = init_logger(__name__) - # if the tensor is only one-element and only contains NONE_INT # this means that the sended object is None. NONE_INT = -150886311 @@ -42,17 +43,17 @@ class BrokenPipeException(Exception): + def __init__(self, message): self.message = message super().__init__(self.message) -class TorchDistributedPipe: +class TorchDistributedPipe(KVPipeBase): METADATA_LENGTH = 16 MAX_TENSOR_DIMENSIONS = 14 METADATA_DTYPE = torch.int64 - def __init__( self, group_ranks: List[List[int]], @@ -65,8 +66,7 @@ def __init__( for ranks in group_ranks: device_group = torch.distributed.new_group( - ranks, backend=torch_distributed_backend - ) + ranks, backend=torch_distributed_backend) if self.rank in ranks: self.ranks = ranks self.world_size = len(ranks) @@ -78,26 +78,24 @@ def __init__( self.device = self._select_device(torch_distributed_backend) - self.target_rank_for_send = self.ranks[ - (self.rank_in_group + 1) % self.world_size - ] - self.target_rank_for_recv = self.ranks[ - (self.rank_in_group - 1) % self.world_size - ] + self.target_rank_for_send = self.ranks[(self.rank_in_group + 1) % + self.world_size] + self.target_rank_for_recv = self.ranks[(self.rank_in_group - 1) % + self.world_size] # FIXME: why we need this? 
torch.set_default_device(self.device) - self.transport_thread = None + self.transport_thread: Optional[ThreadPoolExecutor] = None self.buffer_size = 0 self.buffer_size_lock = threading.Lock() self.none_tensor = torch.tensor([NONE_INT], device=self.device) # On-device tensors to be reused for recv - self.rcv_metadata_buffer = torch.zeros( - self.METADATA_LENGTH, dtype=self.METADATA_DTYPE, device=self.device - ) + self.rcv_metadata_buffer = torch.zeros(self.METADATA_LENGTH, + dtype=self.METADATA_DTYPE, + device=self.device) def _select_device(self, backend: Union[str, Backend]): if torch.cuda.is_available() and backend == Backend.NCCL: @@ -129,14 +127,12 @@ def _make_metadata(self, tensor: torch.Tensor) -> torch.Tensor: buffer[0] = DTYPE2INT[tensor.dtype] ndims = len(tensor.shape) buffer[1] = len(tensor.shape) - buffer[2 : 2 + ndims] = torch.tensor( - tensor.shape, dtype=self.METADATA_DTYPE - ) + buffer[2:2 + ndims] = torch.tensor(tensor.shape, + dtype=self.METADATA_DTYPE) return buffer.to(self.device) - def _prepare_recv_buffer( - self, d_metadata_buffer: torch.Tensor - ) -> torch.Tensor: + def _prepare_recv_buffer(self, + d_metadata_buffer: torch.Tensor) -> torch.Tensor: """ Create a buffer to receive the tensor based on the metadata. @@ -149,7 +145,7 @@ def _prepare_recv_buffer( h_buffer = d_metadata_buffer.cpu().numpy() dtype = INT2DTYPE[h_buffer[0]] ndims = h_buffer[1] - shape = tuple(h_buffer[2 : 2 + ndims]) + shape = tuple(h_buffer[2:2 + ndims]) return torch.empty(shape, dtype=dtype, device=self.device) def _send_metadata(self, d_metadata_buffer: torch.Tensor): @@ -174,7 +170,7 @@ def _recv_metadata(self) -> torch.Tensor: race conditions during sending/receiving. 
Therefore, the metadata buffer can be reused """ - task = torch.distributed.recv( + torch.distributed.recv( self.rcv_metadata_buffer, src=self.target_rank_for_recv, group=self.device_group, @@ -194,9 +190,9 @@ def _send_impl(self, tensor): metadata = self._make_metadata(tensor) self._send_metadata(metadata) - torch.distributed.send( - tensor, dst=self.target_rank_for_send, group=self.device_group - ) + torch.distributed.send(tensor, + dst=self.target_rank_for_send, + group=self.device_group) def _recv_impl(self) -> torch.Tensor: """ @@ -211,9 +207,9 @@ def _recv_impl(self) -> torch.Tensor: d_metadata = self._recv_metadata() buffer = self._prepare_recv_buffer(d_metadata) - torch.distributed.recv( - buffer, src=self.target_rank_for_recv, group=self.device_group - ) + torch.distributed.recv(buffer, + src=self.target_rank_for_recv, + group=self.device_group) return buffer @@ -227,13 +223,9 @@ def send_tensor_wrapper(self, tensor): self.buffer_size = self.buffer_size - tensor_size except Exception as e: logger.error("[rank%d]: Exception when trying to send %s, msg: %s", - torch.distributed.get_rank(), - str(tensor), - str(e)) + torch.distributed.get_rank(), str(tensor), str(e)) import traceback traceback.print_exc() - - def block_if_full(self): """ @@ -268,13 +260,11 @@ def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: with self.buffer_size_lock: self.buffer_size = self.buffer_size + tensor_size - self.transport_thread.submit( self.send_tensor_wrapper, tensor, ) - def recv_tensor(self) -> Optional[torch.Tensor]: """Receives a tensor from the src rank. Blocking.""" if self.transport_thread is None: @@ -300,8 +290,6 @@ def close(self): """ Close the pipe and release the resources. 
""" - if ( - hasattr(self, "transport_thread") - and self.transport_thread is not None - ): + if (hasattr(self, "transport_thread") + and self.transport_thread is not None): self.transport_thread.shutdown() diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 9a6b55cbbe66..03392ec13f10 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -1,59 +1,59 @@ """vLLM distributed KV cache transfer API. These APIs are used in `vllm/worker/worker_base.py`. -Currently supporting TP. The TP between prefill and decode instance needs to be the same. +Currently supporting TP. The TP between prefill and decode instance needs to be +the same. Workflow (disaggregated prefill) - In prefill instance - After prefill, vLLM `insert` its KV caches into a lookup buffer. - - The prefill instance will also open up a thread that listens to `drop_select` request. + - The prefill instance will also open up a thread that listens to + `drop_select` request. - In decode instance - - vLLM first runs `drop_select` to send input tokens and a mask on input tokens (we call it roi, region of interest) to prefill instance + - vLLM first runs `drop_select` to send input tokens and a mask on input + tokens (we call it roi, region of interest) to prefill instance - The prefill instance then respond to `drop_select` request by - Finding a match in current lookup buffer. - Clone and send the matched item out - Delete the matched item in the lookup buffer to free up GPU memory. - The decode vLLM then store the KV cache into paged memory. 
""" -from typing import Any, Dict, List, Optional, Tuple, Union, TYPE_CHECKING -from collections import defaultdict, deque -from concurrent.futures import ThreadPoolExecutor -from threading import Lock +from typing import TYPE_CHECKING, List, Optional, Tuple, Union + +if TYPE_CHECKING: + from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + from copy import deepcopy -import time -import threading import torch -from torch.distributed import Backend, ProcessGroup +from torch.distributed import Backend import vllm.envs as envs -from vllm.logger import init_logger -import vllm.distributed.parallel_state as ps from vllm import _custom_ops as ops +from vllm.distributed.kv_transfer.kv_lookup_buffer.base import ( + KVLookupBufferBase) +from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer import ( + SimpleKVLookupBuffer) +from vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe import ( + TorchDistributedPipe) +from vllm.logger import init_logger from vllm.sequence import IntermediateTensors -from vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe import TorchDistributedPipe -from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer import SimpleKVLookupBuffer - -from copy import deepcopy -assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode", "lmcache"], \ +assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode", "lmcache"],\ "VLLM_DISAGG_PREFILL_ROLE can only be prefill, decode or lmcache." - # currently the connections are hard-coded. 
# we only handle 2 cases: # - prefill vLLM --> decode vLLM # - vLLM --> LMCache -IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE in ["prefill", "decode"]) +IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE + in ["prefill", "decode"]) IS_KV_PREFILL_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "prefill") IS_KV_DECODE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "decode") IS_LMCACHE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "lmcache") - logger = init_logger(__name__) -import logging - class KV_transfer_agent: """ @@ -70,11 +70,13 @@ def __init__( local_rank: int, torch_distributed_backend: Union[str, Backend], # FIXME(Kuntai): remove this hardcoding - lookup_buffer_size: int = 1e10 - ): - + lookup_buffer_size: int = int(1e10)): + self.lookup_buffer_size = lookup_buffer_size - + + self.send_buffer: Optional[KVLookupBufferBase] = None + self.recv_buffer: Optional[KVLookupBufferBase] = None + if IS_LMCACHE_INSTANCE: # when vLLM is connected with LMCache # it needs to both send and recv KV cache @@ -98,14 +100,12 @@ def __init__( local_rank, "gloo", ) - self.send_buffer = SimpleKVLookupBuffer( - self.send_signal_pipe, - self.send_pipe, - self.lookup_buffer_size) - self.recv_buffer = SimpleKVLookupBuffer( - self.recv_signal_pipe, - self.recv_pipe, - self.lookup_buffer_size) + self.send_buffer = SimpleKVLookupBuffer(self.send_signal_pipe, + self.send_pipe, + self.lookup_buffer_size) + self.recv_buffer = SimpleKVLookupBuffer(self.recv_signal_pipe, + self.recv_pipe, + self.lookup_buffer_size) else: # when performing disaggregated prefill, only 1 pipe is needed # at prefill instance this pipe is used for send KV cache @@ -120,24 +120,25 @@ def __init__( local_rank, "gloo", ) - buffer = SimpleKVLookupBuffer( - self.signal_pipe, - self.pipe, - self.lookup_buffer_size) + buffer = SimpleKVLookupBuffer(self.signal_pipe, self.pipe, + self.lookup_buffer_size) self.send_buffer = buffer self.recv_buffer = buffer - + def 
send_kv_caches_and_hidden_states( self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", kv_caches: List[torch.Tensor], - hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], + hidden_or_intermediate_states: Union[torch.Tensor, + IntermediateTensors], ) -> None: input_tokens_tensor = model_input.input_tokens seq_lens = model_input.attn_metadata.seq_lens slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten() + start_layer = model_executable.model.start_layer + end_layer = model_executable.model.end_layer # query_lens contains new KV caches that are added to vLLM. # so we will send them to decode instance @@ -146,13 +147,11 @@ def send_kv_caches_and_hidden_states( start_pos = sum(seq_lens[:idx]) end_pos = start_pos + slen current_tokens = input_tokens_tensor[start_pos:end_pos] - + keys, values = [], [] - - - for l in range(model_executable.model.start_layer, - model_executable.model.end_layer): - kv_cache = kv_caches[l - model_executable.model.start_layer] + + for layer_id in range(start_layer, end_layer): + kv_cache = kv_caches[layer_id - start_layer] _, _, num_heads, head_size = kv_cache[0].shape @@ -163,29 +162,31 @@ def send_kv_caches_and_hidden_states( keys.append(key_cache[current_slot_mapping].unsqueeze(0)) values.append(value_cache[current_slot_mapping].unsqueeze(0)) - + keys = torch.cat(keys, dim=0) values = torch.cat(values, dim=0) - self.send_buffer.insert( - current_tokens, - torch.ones_like(current_tokens, dtype=bool), - keys, - values, - hidden_or_intermediate_states[start_pos:end_pos] - ) - + if self.send_buffer is not None: + self.send_buffer.insert( + current_tokens, torch.ones_like(current_tokens, + dtype=bool), keys, values, + hidden_or_intermediate_states[start_pos:end_pos]) logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) + def destroy(self) -> None: + if self.send_buffer is not None: + self.send_buffer.close() + if self.recv_buffer is not None: + 
self.recv_buffer.close() def recv_kv_caches_and_hidden_states( - self, - model_executable: torch.nn.Module, + self, model_executable: torch.nn.Module, model_input: "ModelInputForGPUWithSamplingMetadata", kv_caches: List[torch.Tensor] - ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + "ModelInputForGPUWithSamplingMetadata"]: - # When this flag is set to False, it means that + # When this flag is set to False, it means that bypass_model_exec = True # This is disagg decode instance, during prefill state @@ -199,7 +200,7 @@ def recv_kv_caches_and_hidden_states( input_tokens_list = [] num_computed_tokens_list = [] start_pos_list = [] - + # enumerate different requests # FIXME(Kuntai): This impl assumes that all requests are prefill. for idx, slen in enumerate(seq_lens): @@ -211,28 +212,34 @@ def recv_kv_caches_and_hidden_states( input_tokens_list.append(current_tokens) start_pos_list.append(start_pos) - + + if self.recv_buffer is None: + bypass_model_exec = False + break + ret = self.recv_buffer.drop_select( - current_tokens, - torch.ones_like(current_tokens, dtype=bool)) + current_tokens, torch.ones_like(current_tokens, dtype=bool)) if ret[0] is None: # didn't find any match. 
bypass_model_exec = False num_computed_tokens_list.append(0) continue - + # TODO(Jiayi): change the logic here (need roi) - _, roi, keys, values, hidden = ret - + roi: torch.Tensor = ret[1] + keys: torch.Tensor = ret[2] + values: torch.Tensor = ret[3] + hidden: torch.Tensor = ret[4] + # Jiayi: currently assume roi is a prefix - num_computed_tokens = len(roi) + num_computed_tokens = roi.shape[0] num_computed_tokens_list.append(num_computed_tokens) is_complete = (num_computed_tokens == num_tokens) end_pos = start_pos + num_computed_tokens - + # receive KV cache from disaggregated prefill instance for i in range(model_executable.model.start_layer, - model_executable.model.end_layer): + model_executable.model.end_layer): kv_cache = kv_caches[i - model_executable.model.start_layer] layer = model_executable.model.layers[i] @@ -251,12 +258,13 @@ def recv_kv_caches_and_hidden_states( hidden_or_intermediate_states_for_one_req.append(hidden) - # FIXME(Jiayi): we need to support only skip m out of n reqs in a batch + # FIXME(Jiayi): we need to support only skip m out of n reqs in a batch # same for prefix caching if not bypass_model_exec: # Some of the KV cache is not retrieved # so we need to recompute the hidden state - logger.debug("[rank%d]: KV EMPTY recv DONE.", torch.distributed.get_rank()) + logger.debug("[rank%d]: KV EMPTY recv DONE.", + torch.distributed.get_rank()) return None, bypass_model_exec, None if not is_complete: @@ -268,17 +276,17 @@ def recv_kv_caches_and_hidden_states( slot_mapping, device=kv_cache[0].device, ) - logger.debug("[rank%d]: KV PARTIAL recv DONE.", torch.distributed.get_rank()) + logger.debug("[rank%d]: KV PARTIAL recv DONE.", + torch.distributed.get_rank()) return None, bypass_model_exec, rebuilt_model_input - + # concatenate hidden states from different requests hidden_or_intermediate_states = torch.cat( hidden_or_intermediate_states_for_one_req, dim=0) logger.debug("[rank%d]: KV recv DONE.", torch.distributed.get_rank()) return 
hidden_or_intermediate_states, bypass_model_exec, model_input - - + def build_partial_prefill_input( self, model_input: "ModelInputForGPUWithSamplingMetadata", @@ -289,70 +297,77 @@ def build_partial_prefill_input( device: torch.device, ) -> "ModelInputForGPUWithSamplingMetadata": rebuilt_input_tokens = [] - rebuilt_input_positions= [] + rebuilt_input_positions = [] rebuilt_query_lens = [] - + rebuilt_num_prefills = 0 rebuilt_num_prefill_tokens = 0 rebuilt_slot_mapping = [] rebuilt_max_query_len = 0 - + rebuilt_block_tables = [] - + rebuilt_query_start_loc = [0] rebuilt_context_lens_tensor = [] rebuilt_selected_token_indices = [] - + # recounting query and context lengths for idx in range(len(input_tokens_list)): token_tensor = input_tokens_list[idx] num_token = len(token_tensor) num_computed_token = num_computed_tokens_list[idx] start_pos = start_pos_list[idx] - + rebuilt_input_tokens.append(token_tensor[num_computed_token:]) # TODO(Jiayi): please check the correctness of next line - rebuilt_input_positions.append(model_input.input_positions[start_pos+num_computed_token:start_pos+num_token]) + rebuilt_input_positions.append( + model_input.input_positions[start_pos + + num_computed_token:start_pos + + num_token]) q_len = num_token - num_computed_token rebuilt_query_lens.append(q_len) - + # Attn metadata-related rebuilt_num_prefills += 1 rebuilt_num_prefill_tokens += q_len - rebuilt_slot_mapping.append(slot_mapping_flat[start_pos+num_computed_token:start_pos+num_token]) + rebuilt_slot_mapping.append( + slot_mapping_flat[start_pos + num_computed_token:start_pos + + num_token]) rebuilt_max_query_len = max(q_len, rebuilt_max_query_len) # TODO(Jiayi): remove hard-code (block_size=16) blk_size = 16 - temp_block_table = [i//blk_size for i in range(start_pos, start_pos+num_token, blk_size)] + temp_block_table = [ + i // blk_size + for i in range(start_pos, start_pos + num_token, blk_size) + ] rebuilt_block_tables.append(temp_block_table) - 
rebuilt_query_start_loc.append(q_len) #start with 0 + rebuilt_query_start_loc.append(q_len) #start with 0 rebuilt_context_lens_tensor.append(num_computed_token) - + # Sampling metadata related #seq_groups (use rebuilt query lens) - rebuilt_selected_token_indices.append(start_pos+q_len-1) - - + rebuilt_selected_token_indices.append(start_pos + q_len - 1) + # rebuilt attn_metadata rebuilt_attn_metadata = deepcopy(model_input.attn_metadata) rebuilt_attn_metadata.num_prefills = rebuilt_num_prefills rebuilt_attn_metadata.num_prefill_tokens = rebuilt_num_prefill_tokens - rebuilt_attn_metadata.slot_mapping = torch.cat(rebuilt_slot_mapping).to(device) + rebuilt_attn_metadata.slot_mapping = torch.cat( + rebuilt_slot_mapping).to(device) rebuilt_attn_metadata.max_query_len = rebuilt_max_query_len - + rebuilt_attn_metadata.block_tables = torch.tensor( rebuilt_block_tables, - dtype=model_input.attn_metadata.block_tables.dtype - ).to(device) - + dtype=model_input.attn_metadata.block_tables.dtype).to(device) + rebuilt_attn_metadata.query_start_loc = torch.tensor( rebuilt_query_start_loc, dtype=model_input.attn_metadata.query_start_loc.dtype).to(device) rebuilt_attn_metadata.context_lens_tensor = torch.tensor( - rebuilt_context_lens_tensor, + rebuilt_context_lens_tensor, dtype=model_input.attn_metadata.context_lens_tensor.dtype, - ).to(device) - + ).to(device) + rebuilt_attn_metadata._cached_prefill_metadata = None # rebuilt sampling_metadata @@ -362,26 +377,27 @@ def build_partial_prefill_input( rebuilt_sampling_metadata.selected_token_indices = torch.tensor( rebuilt_selected_token_indices, dtype=model_input.sampling_metadata.selected_token_indices.dtype, - ).to(device) - + ).to(device) + # import here to avoid circular import. 
- from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + from vllm.worker.model_runner import ( + ModelInputForGPUWithSamplingMetadata) rebuilt_model_input = ModelInputForGPUWithSamplingMetadata( - input_tokens = torch.cat(rebuilt_input_tokens).to(device), - input_positions = torch.cat(rebuilt_input_positions).to(device), - seq_lens = model_input.seq_lens, - query_lens = rebuilt_query_lens, - lora_mapping = model_input.lora_mapping, - lora_requests = model_input.lora_requests, - attn_metadata = rebuilt_attn_metadata, - prompt_adapter_mapping = model_input.prompt_adapter_mapping, - prompt_adapter_requests = model_input.prompt_adapter_requests, - multi_modal_kwargs = model_input.multi_modal_kwargs, - request_ids_to_seq_ids = model_input.request_ids_to_seq_ids, - finished_requests_ids = model_input.finished_requests_ids, - virtual_engine = model_input.virtual_engine, - sampling_metadata = rebuilt_sampling_metadata, - is_prompt = model_input.is_prompt, + input_tokens=torch.cat(rebuilt_input_tokens).to(device), + input_positions=torch.cat(rebuilt_input_positions).to(device), + seq_lens=model_input.seq_lens, + query_lens=rebuilt_query_lens, + lora_mapping=model_input.lora_mapping, + lora_requests=model_input.lora_requests, + attn_metadata=rebuilt_attn_metadata, + prompt_adapter_mapping=model_input.prompt_adapter_mapping, + prompt_adapter_requests=model_input.prompt_adapter_requests, + multi_modal_kwargs=model_input.multi_modal_kwargs, + request_ids_to_seq_ids=model_input.request_ids_to_seq_ids, + finished_requests_ids=model_input.finished_requests_ids, + virtual_engine=model_input.virtual_engine, + sampling_metadata=rebuilt_sampling_metadata, + is_prompt=model_input.is_prompt, ) - - return rebuilt_model_input \ No newline at end of file + + return rebuilt_model_input diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 13527110a223..3615fa6af399 100644 --- a/vllm/distributed/parallel_state.py +++ 
b/vllm/distributed/parallel_state.py @@ -20,29 +20,25 @@ parallelism, you can skip the model parallel initialization and destruction steps. """ -import time import contextlib import pickle -import logging +import time from collections import namedtuple from contextlib import contextmanager, nullcontext from dataclasses import dataclass from multiprocessing import shared_memory from typing import Any, Dict, List, Optional, Tuple, Union from unittest.mock import patch -import queue import torch import torch.distributed from torch.distributed import Backend, ProcessGroup -import vllm.envs as envs -from vllm.logger import init_logger - - # Use this import to check if disagg prefill is enabled. # if enabled, need to adjust distributed group correspondingly. import vllm.distributed.kv_transfer.vllm_adapter as dist_kv +import vllm.envs as envs +from vllm.logger import init_logger @dataclass @@ -865,7 +861,8 @@ def include_decoding_groups_if_disagg_enabled( Extended: [ [0,1], [2,3], [4,5], [6,7] ] Arguments: groups: original distributed group - world_size: the vLLM world size, which is half of torch.distributed.get_world_size() + world_size: the vLLM world size, which is half of + torch.distributed.get_world_size() """ if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: @@ -908,9 +905,8 @@ def init_distributed_environment( # offset global rank by tp * pp (which is world_size) maybe_disagg_rank = rank + world_size - logger.debug( - f"Before: world size {maybe_disagg_world_size}, rank {maybe_disagg_rank}" - ) + logger.debug("Before: world size %d, rank %d", maybe_disagg_world_size, + maybe_disagg_rank) torch.distributed.init_process_group( backend=backend, @@ -974,17 +970,18 @@ def initialize_model_parallel( ranks 8 to 15 belong to the second box. - Disaggregated prefill will also initialize its process group using this function. + Disaggregated prefill will also init its process group using this function. 
Changes: - vLLM world size: unchanged (tp * pp) - torch.distributed.get_world_size(): - 2 * tp * pp - - Why: torch.distributed package sees 2 vLLM instances (prefill and decode) + - Why: both prefill vLLM and decode vLLM is in the world - Global rank: - [0, tp * pp) for prefill - [tp * pp, 2 * tp * pp) for decode - Parallel groups - - Extend _WORLD, _TP and _PP using `include_decoding_groups_if_disagg_enabled` + - Extend _WORLD, _TP and _PP using + `include_decoding_groups_if_disagg_enabled` - Add a new parallel group `_DISAGG` for disaggregated prefill - [ [0, tp * pp], [1, tp * pp + 1], .. ] - Local rank: unchanged @@ -997,12 +994,14 @@ def initialize_model_parallel( get_world_group().device_group) if dist_kv.IS_DISTRIBUTED_KV_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE: # Disaggregated prefill enabled - # The world_size for this vLLM instance is tp * pp, but torch.distributed contains 2 vLLM instances, its world size is 2 * tp * pp + # The world_size for this vLLM instance is tp * pp, but + # torch.distributed contains 2 vLLM instances, + # its world size is 2 * tp * pp # Adjust the world_size to match. 
world_size = world_size // 2 - if (world_size - != tensor_model_parallel_size * pipeline_model_parallel_size): + if (world_size != + tensor_model_parallel_size * pipeline_model_parallel_size): raise RuntimeError( f"world_size ({world_size}) is not equal to " f"tensor_model_parallel_size ({tensor_model_parallel_size}) x " diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 679f8394688e..b774a649d39f 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -48,7 +48,8 @@ def _get_worker_kwargs( """Return worker init args for a given rank.""" if distributed_init_method is None: distributed_init_method = get_distributed_init_method( - get_ip(), get_open_port(force=dist_kv.IS_DISTRIBUTED_KV_INSTANCE)) + get_ip(), + get_open_port(force=dist_kv.IS_DISTRIBUTED_KV_INSTANCE)) return dict( model_config=self.model_config, parallel_config=self.parallel_config, diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 944822887945..499e891d98fc 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -70,7 +70,8 @@ def _init_executor(self) -> None: # Since it only works for single node, we can use the loopback address # 127.0.0.1 for communication. 
distributed_init_method = get_distributed_init_method( - "127.0.0.1", get_open_port(force=dist_kv.IS_DISTRIBUTED_KV_INSTANCE)) + "127.0.0.1", + get_open_port(force=dist_kv.IS_DISTRIBUTED_KV_INSTANCE)) self.workers: List[ProcessWorkerWrapper] = [] # This is the list of workers that are rank 0 of each TP group EXCEPT diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py index c646e8536ba1..0cca5db1677e 100644 --- a/vllm/executor/ray_gpu_executor.py +++ b/vllm/executor/ray_gpu_executor.py @@ -6,8 +6,8 @@ import msgspec -import vllm.envs as envs import vllm.distributed.kv_transfer.vllm_adapter as dist_kv +import vllm.envs as envs from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.msgspec_utils import encode_hook diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ab38302b3321..b846d1d707db 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -14,7 +14,6 @@ import torch.distributed import torch.nn as nn - import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState @@ -55,7 +54,6 @@ _add_sampling_metadata_broadcastable_dict, _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict, dump_input_when_exception) -from vllm import _custom_ops as ops if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionBackend @@ -1546,30 +1544,20 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) - + 
input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalInputs.as_kwargs(multi_modal_kwargs, + device=self.device), + **seqlen_agnostic_kwargs) + if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_end.record() - return hidden_or_intermediate_states - - @torch.inference_mode() - def postprocess_model( - self, - model_input, - hidden_or_intermediate_states, - - ): if not get_pp_group().is_last_rank: if (self.is_driver_worker and hidden_or_intermediate_states is not None @@ -1587,7 +1575,16 @@ def postprocess_model( hidden_or_intermediate_states.tensors["model_forward_time"] = ( torch.tensor(model_forward_time + orig_model_forward_time)) return hidden_or_intermediate_states - + + return hidden_or_intermediate_states + + @torch.inference_mode() + def postprocess_model( + self, + model_input: ModelInputForGPUWithSamplingMetadata, + hidden_or_intermediate_states, + ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: + logits = self.model.compute_logits(hidden_or_intermediate_states, model_input.sampling_metadata) @@ -1603,6 +1600,7 @@ def postprocess_model( sampling_metadata=model_input.sampling_metadata, ) + assert model_input.attn_metadata is not None decode_meta = model_input.attn_metadata.decode_metadata if self.return_hidden_states: # we only need to pass hidden states of most recent token @@ -1620,9 +1618,7 @@ def postprocess_model( output.hidden_states = hidden_states return [output] - - - + class CUDAGraphRunner: diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 22577ebf6949..7908fc466eb3 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -7,6 +7,8 @@ import torch +import vllm.distributed.kv_transfer.vllm_adapter as dist_kv +import vllm.distributed.parallel_state as ps from vllm.config import 
ObservabilityConfig from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger @@ -16,13 +18,11 @@ from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import (enable_trace_function_call_for_thread, update_environment_variables) +from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner from vllm.worker.model_runner_base import (BroadcastableModelInput, ModelRunnerBase, ModelRunnerInputBase) -import vllm.distributed.kv_transfer.vllm_adapter as dist_kv -import vllm.distributed.parallel_state as ps - logger = init_logger(__name__) @@ -223,7 +223,6 @@ def execute_worker(self, worker_input: WorkerInput) -> None: Process an execution request. """ raise NotImplementedError - def _get_worker_input_from_broadcast( self @@ -327,19 +326,14 @@ def execute_model( and self.observability_config.collect_model_execute_time): orig_model_execute_time = intermediate_tensors.tensors.get( "model_execute_time", torch.tensor(0)).item() - - + # for disaggregated prefilling: allow bypassing model execution bypass_model_exec = False - - - # receive KV cache. - # NOTE(kuntai): - # If only a part of KV cache is received, we will adjust model_input - # to avoid prefill on the part of KV caches that are already received. - # This will not happen for disaggregated prefill, but will happen - # when connecting to a KV cache database (like LMCache). + + # receive KV cache from prefill instance, or from LMCache if self.need_recv_kv(model_input, worker_input): + assert isinstance(self.model_runner, GPUModelRunnerBase), \ + "Distributed KV transfer only support GPU modelrunner" hidden_or_intermediate_states, bypass_model_exec, model_input = \ ps.get_disagg_group().recv_kv_caches_and_hidden_states( # model is used to know which layer the current worker @@ -347,11 +341,12 @@ def execute_model( # layers. 
self.model_runner.model, model_input, - self.kv_cache[worker_input.virtual_engine], + kv_caches=self.kv_cache[worker_input.virtual_engine] + if self.kv_cache is not None else None, ) #assert bypass_model_exec - - if not bypass_model_exec: + + if not bypass_model_exec: hidden_or_intermediate_states = self.model_runner.execute_model( model_input=model_input, kv_caches=self.kv_cache[worker_input.virtual_engine] @@ -360,24 +355,31 @@ def execute_model( num_steps=num_steps, **kwargs, ) - + # sending out KV cache if self.need_send_kv(model_input, worker_input): + assert isinstance(self.model_runner, GPUModelRunnerBase), \ + "Distributed KV transfer only support GPU modelrunner" ps.get_disagg_group().send_kv_caches_and_hidden_states( # model is used to know which layer the current worker # is working on, so that we can send KV for only those # layers. self.model_runner.model, model_input, - self.kv_cache[worker_input.virtual_engine], + self.kv_cache[worker_input.virtual_engine] + if self.kv_cache is not None else None, hidden_or_intermediate_states, ) - - # Get model output based on hidden state. 
- output = self.model_runner.postprocess_model( - model_input, - hidden_or_intermediate_states, - ) + + # separating postprocessing steps out from execute_model + # so that disaggregated prefill can completely bypass model forwarding + if isinstance(self.model_runner, ModelRunner): + output = self.model_runner.postprocess_model( + model_input, + hidden_or_intermediate_states, + ) + else: + output = hidden_or_intermediate_states model_execute_time = time.perf_counter() - start_time if not get_pp_group().is_last_rank: @@ -400,38 +402,43 @@ def execute_model( return output def need_recv_kv(self, model_input, worker_input) -> bool: - + + if self.kv_cache is None: + return False + kv_caches = self.kv_cache[worker_input.virtual_engine] prefill_meta = model_input.attn_metadata.prefill_metadata - + # check if the current run is profiling is_profile_run = (kv_caches is None) or (kv_caches[0] is None) # check if the current run is prefill is_prefill_run = prefill_meta is not None # for disaggregated prefilling: allow bypassing model execution - + return all([ - is_prefill_run, - dist_kv.IS_KV_DECODE_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE, - not is_profile_run]) + is_prefill_run, dist_kv.IS_KV_DECODE_INSTANCE + or dist_kv.IS_LMCACHE_INSTANCE, not is_profile_run + ]) - def need_send_kv(self, model_input, worker_input) -> bool: - + + if self.kv_cache is None: + return False + kv_caches = self.kv_cache[worker_input.virtual_engine] prefill_meta = model_input.attn_metadata.prefill_metadata - model_executable = self.model_runner.model - + if not isinstance(self.model_runner, GPUModelRunnerBase): + return False + # check if the current run is profiling is_profile_run = (kv_caches is None) or (kv_caches[0] is None) # check if the current run is prefill is_prefill_run = prefill_meta is not None - + return all([ - is_prefill_run, - dist_kv.IS_KV_PREFILL_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE, - not is_profile_run]) - + is_prefill_run, dist_kv.IS_KV_PREFILL_INSTANCE + or 
dist_kv.IS_LMCACHE_INSTANCE, not is_profile_run + ]) def _execute_model_spmd( self, From f78a2eb59f30c84060c34f9b0d623e285e45aeee Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 23:48:40 +0000 Subject: [PATCH 228/278] resolve circular import --- vllm/utils.py | 2 +- vllm/worker/worker_base.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 1adab6191726..8e27e1f73f4b 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -535,7 +535,7 @@ def get_open_port(force: bool = False) -> int: if force and port is not None: # force vLLM to use envs.VLLM_PORT for torch.distributed init # This is because this port will binded by prefill instance - # But both prefill and decode instance need to use this port to + # But both prefill and decode instance need to use this port to # initialize torch.distributed return port while True: diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 7908fc466eb3..d55400a40240 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -18,7 +18,6 @@ from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.utils import (enable_trace_function_call_for_thread, update_environment_variables) -from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner from vllm.worker.model_runner_base import (BroadcastableModelInput, ModelRunnerBase, ModelRunnerInputBase) @@ -332,6 +331,7 @@ def execute_model( # receive KV cache from prefill instance, or from LMCache if self.need_recv_kv(model_input, worker_input): + from vllm.worker.model_runner import GPUModelRunnerBase assert isinstance(self.model_runner, GPUModelRunnerBase), \ "Distributed KV transfer only support GPU modelrunner" hidden_or_intermediate_states, bypass_model_exec, model_input = \ @@ -358,6 +358,7 @@ def execute_model( # sending out KV cache if self.need_send_kv(model_input, worker_input): + from vllm.worker.model_runner import GPUModelRunnerBase assert 
isinstance(self.model_runner, GPUModelRunnerBase), \ "Distributed KV transfer only support GPU modelrunner" ps.get_disagg_group().send_kv_caches_and_hidden_states( @@ -373,6 +374,7 @@ def execute_model( # separating postprocessing steps out from execute_model # so that disaggregated prefill can completely bypass model forwarding + from vllm.worker.model_runner import ModelRunner if isinstance(self.model_runner, ModelRunner): output = self.model_runner.postprocess_model( model_input, @@ -427,6 +429,7 @@ def need_send_kv(self, model_input, worker_input) -> bool: kv_caches = self.kv_cache[worker_input.virtual_engine] prefill_meta = model_input.attn_metadata.prefill_metadata + from vllm.worker.model_runner import GPUModelRunnerBase if not isinstance(self.model_runner, GPUModelRunnerBase): return False From 44dfa3f7142a455b2f6cfb36bf4c57465106723e Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 23:49:01 +0000 Subject: [PATCH 229/278] fix redundant import --- tests/kv_transfer/test_send_recv.py | 81 ++++++++++++++--------------- 1 file changed, 39 insertions(+), 42 deletions(-) diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 4bf757d7c849..994b907e0c89 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -1,10 +1,11 @@ +import os +import time +from typing import List -import vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe as tdp import torch -import os -import random from tqdm import tqdm -import time + +import vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe as tdp def test_run(my_rank, pipe): @@ -35,20 +36,19 @@ def test_run(my_rank, pipe): assert torch.allclose(y, y2) - def stress_test(my_rank, pipe): - + torch.distributed.barrier() - - tensors = [] - - + + tensors: List[torch.Tensor] = [] + for i in tqdm(range(2000)): mean = torch.rand(1).item() std = torch.rand(1).item() - size = torch.randint(900, 1000, (2,)) - x = torch.normal(mean * 1.0, std 
* 1.0, size=size.tolist()).to(pipe.device) - + size = torch.randint(900, 1000, (2, )) + x = torch.normal(mean * 1.0, std * 1.0, + size=size.tolist()).to(pipe.device) + # 5% probability of sending a None if torch.rand(1).item() < 0.05: tensors.append(None) @@ -59,15 +59,13 @@ def stress_test(my_rank, pipe): tensors.append(x.mean().unsqueeze(0)) tensors.append(x.std().unsqueeze(0)) - - torch.distributed.barrier() - + for i in tqdm(range(2000)): if my_rank == int((i % 10) > 3): - pipe.send_tensor(tensors[3*i]) - pipe.send_tensor(tensors[3*i+1]) - pipe.send_tensor(tensors[3*i+2]) + pipe.send_tensor(tensors[3 * i]) + pipe.send_tensor(tensors[3 * i + 1]) + pipe.send_tensor(tensors[3 * i + 2]) else: x = pipe.recv_tensor() mean = pipe.recv_tensor() @@ -76,34 +74,36 @@ def stress_test(my_rank, pipe): assert mean is None assert std is None else: - assert torch.allclose(x, tensors[3*i]) + assert torch.allclose(x, tensors[3 * i]) assert x.mean() == mean[0] assert x.std() == std[0] torch.distributed.barrier() print("Stress test passed.") - - - + + def latency_test(my_rank, pipe, nelement, ntensor): - + latencies = [] - + torch.distributed.barrier() - + for i in tqdm(range(1000)): - + tensors = [] - + if my_rank == 0: # create tensor - tensors = [torch.rand(nelement).to(pipe.device) for _ in range(ntensor)] - + tensors = [ + torch.rand(nelement).to(pipe.device) for _ in range(ntensor) + ] + torch.distributed.barrier() - + if my_rank == 0: - t = torch.tensor([time.time()], dtype=torch.float64).to(pipe.device) + t = torch.tensor([time.time()], + dtype=torch.float64).to(pipe.device) for tensor in tensors: pipe.send_tensor(tensor) pipe.send_tensor(t) @@ -114,7 +114,7 @@ def latency_test(my_rank, pipe, nelement, ntensor): latencies.append(time.time() - t.item()) torch.distributed.barrier() - + print('Latency test passed.') print('Latency:', torch.tensor(latencies).mean().item() * 1000, 'ms') @@ -123,18 +123,15 @@ def latency_test(my_rank, pipe, nelement, ntensor): my_rank = 
int(os.environ['RANK']) - - torch.distributed.init_process_group( - init_method="tcp://127.0.0.1:23456", - world_size=2, - rank=my_rank) + torch.distributed.init_process_group(init_method="tcp://127.0.0.1:23456", + world_size=2, + rank=my_rank) print("initialized! My rank is %d" % my_rank) + pipe = tdp.TorchDistributedPipe([[0, 1]], my_rank, "nccl") - pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "nccl") - - torch.manual_seed(0) + torch.manual_seed(0) test_run(my_rank, pipe) stress_test(my_rank, pipe) - latency_test(my_rank, pipe, 1024*8*128, 80) + latency_test(my_rank, pipe, 1024 * 8 * 128, 80) From 822f3dc82a7950109933a4e31ee7e6d4709129e1 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Sun, 15 Sep 2024 23:55:50 +0000 Subject: [PATCH 230/278] rename to a shorter name --- tests/kv_transfer/test_lookup_buffer.py | 80 +++++++++---------- ...e_kv_lookup_buffer.py => simple_buffer.py} | 0 vllm/distributed/kv_transfer/vllm_adapter.py | 5 +- 3 files changed, 42 insertions(+), 43 deletions(-) rename vllm/distributed/kv_transfer/kv_lookup_buffer/{simple_kv_lookup_buffer.py => simple_buffer.py} (100%) diff --git a/tests/kv_transfer/test_lookup_buffer.py b/tests/kv_transfer/test_lookup_buffer.py index ae19d068be9f..0730f091a34b 100644 --- a/tests/kv_transfer/test_lookup_buffer.py +++ b/tests/kv_transfer/test_lookup_buffer.py @@ -1,24 +1,25 @@ - -import vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe as tdp -import vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer as sklb -import torch import os import random + +import torch from tqdm import tqdm -import time -# TODO: the test depends on a lot of fields in the current implementation. We should have standard interface instead direct field access +import vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer as sklb +import vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe as tdp + +# TODO: the test depends on a lot of fields in the current implementation. 
+# We should have standard interface instead direct field access + def test_run(my_rank, buffer, device): - - # buffer should be empty in the beginning + + # buffer should be empty in the beginning if my_rank == 0: assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - # insert - tokens = torch.tensor([1,2,3]).to(device) + tokens = torch.tensor([1, 2, 3]).to(device) roi = (tokens > 0) if my_rank == 0: key = 2.0 * torch.ones([5, 6]).to(device) @@ -27,45 +28,47 @@ def test_run(my_rank, buffer, device): placeholder = torch.tensor([1]).to(device) buffer.insert(tokens, roi, key, value, placeholder) - + torch.distributed.barrier() - + # drop_select if my_rank == 1: tok, roi_, key, value, hidden = buffer.drop_select(tokens, roi) assert torch.allclose(tokens, tok) assert torch.allclose(roi, roi_) - assert torch.allclose(key, 2.0 * torch.ones([5, 6], device = device)) - assert torch.allclose(value, 3.0 * torch.ones([5, 6], device = device)) + assert torch.allclose(key, 2.0 * torch.ones([5, 6], device=device)) + assert torch.allclose(value, 3.0 * torch.ones([5, 6], device=device)) torch.distributed.barrier() - + if my_rank == 0: assert buffer.buffer_size == 0 assert len(buffer.buffer) == 0 - + print("Test run passed!") + def stress_test(my_rank, buf, device): - + torch.distributed.barrier() torch.manual_seed(100) reqs = [ ( - torch.rand(100).to(device), # tokens - torch.ones(100).bool().to(device), # roi - torch.rand(100).to(device), # key - torch.rand(100).to(device), # value - torch.rand(100).to(device), # hidden - ) for i in tqdm(range(200))] + torch.rand(100).to(device), # tokens + torch.ones(100).bool().to(device), # roi + torch.rand(100).to(device), # key + torch.rand(100).to(device), # value + torch.rand(100).to(device), # hidden + ) for i in tqdm(range(200)) + ] random.seed(my_rank) random.shuffle(reqs) - + torch.distributed.barrier() - + n = 0 - + # the buffer size can only store 100 reqs # so the sender will occasionally block to wait for the receiver. 
for req in tqdm(reqs): @@ -74,7 +77,7 @@ def stress_test(my_rank, buf, device): else: tok, roi, k, v, h = req tok_, roi_, k_, v_, h_ = buf.drop_select(tok, roi) - + if tok_ is None: assert roi_ is None assert k_ is None @@ -89,8 +92,7 @@ def stress_test(my_rank, buf, device): assert torch.allclose(h, h_) print('Rank %d done' % my_rank) torch.distributed.barrier() - - + if my_rank == 0: x = torch.tensor([0]) torch.distributed.recv(x, 1) @@ -103,30 +105,26 @@ def stress_test(my_rank, buf, device): torch.distributed.send(torch.tensor([n]), 0) print("Passed stress test!") - - + if __name__ == "__main__": my_rank = int(os.environ['RANK']) - - torch.distributed.init_process_group( - init_method="tcp://127.0.0.1:23456", - world_size=2, - rank=my_rank) + torch.distributed.init_process_group(init_method="tcp://127.0.0.1:23456", + world_size=2, + rank=my_rank) print("initialized! My rank is %d" % my_rank) - - pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "nccl") - cpu_pipe = tdp.TorchDistributedPipe([[0,1]], my_rank, "gloo") + pipe = tdp.TorchDistributedPipe([[0, 1]], my_rank, "nccl") + cpu_pipe = tdp.TorchDistributedPipe([[0, 1]], my_rank, "gloo") buffer = sklb.SimpleKVLookupBuffer(cpu_pipe, pipe, 170000) test_run(my_rank, buffer, pipe.device) - + stress_test(my_rank, buffer, pipe.device) - + buffer.close() pipe.close() cpu_pipe.close() diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py similarity index 100% rename from vllm/distributed/kv_transfer/kv_lookup_buffer/simple_kv_lookup_buffer.py rename to vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 03392ec13f10..2edb426c5c8d 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -28,12 +28,11 @@ import torch from torch.distributed import 
Backend +import vllm.distributed.kv_transfer.kv_lookup_buffer.simple_buffer as sklb import vllm.envs as envs from vllm import _custom_ops as ops from vllm.distributed.kv_transfer.kv_lookup_buffer.base import ( KVLookupBufferBase) -from vllm.distributed.kv_transfer.kv_lookup_buffer.simple_kv_lookup_buffer import ( - SimpleKVLookupBuffer) from vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe import ( TorchDistributedPipe) from vllm.logger import init_logger @@ -77,6 +76,8 @@ def __init__( self.send_buffer: Optional[KVLookupBufferBase] = None self.recv_buffer: Optional[KVLookupBufferBase] = None + SimpleKVLookupBuffer = sklb.SimpleKVLookupBuffer + if IS_LMCACHE_INSTANCE: # when vLLM is connected with LMCache # it needs to both send and recv KV cache From 7682269f27d2c23384e6b1ceed6415c7edfe184d Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Mon, 16 Sep 2024 00:06:43 +0000 Subject: [PATCH 231/278] remove unnecessary file --- tests/test_send_recv.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/test_send_recv.sh diff --git a/tests/test_send_recv.sh b/tests/test_send_recv.sh deleted file mode 100644 index e69de29bb2d1..000000000000 From b6e5eb35a21964942aba677e5eb6835b35fa09c3 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Mon, 16 Sep 2024 00:40:26 +0000 Subject: [PATCH 232/278] update kv transfer test --- tests/kv_transfer/test_launcher.py | 52 +++++++++++++++++++++++++ tests/kv_transfer/test_lookup_buffer.sh | 3 -- tests/kv_transfer/test_send_recv.py | 10 +++-- tests/kv_transfer/test_send_recv.sh | 3 -- 4 files changed, 58 insertions(+), 10 deletions(-) create mode 100644 tests/kv_transfer/test_launcher.py delete mode 100644 tests/kv_transfer/test_lookup_buffer.sh delete mode 100644 tests/kv_transfer/test_send_recv.sh diff --git a/tests/kv_transfer/test_launcher.py b/tests/kv_transfer/test_launcher.py new file mode 100644 index 000000000000..5c0aeb04b43f --- /dev/null +++ b/tests/kv_transfer/test_launcher.py @@ -0,0 +1,52 @@ 
+import subprocess +import pytest +import sys +import torch + +def run_python_script(script_name, timeout): + try: + # Start both processes asynchronously using Popen + process0 = subprocess.Popen( + [sys.executable, script_name], + env={"RANK": "0"}, # Set the RANK environment variable for process 0 + stdout=sys.stdout, # Pipe stdout to current stdout + stderr=sys.stderr, # Pipe stderr to current stderr + ) + + process1 = subprocess.Popen( + [sys.executable, script_name], + env={"RANK": "1"}, # Set the RANK environment variable for process 1 + stdout=sys.stdout, # Pipe stdout to current stdout + stderr=sys.stderr, # Pipe stderr to current stderr + ) + + # Wait for both processes to complete, with a timeout + process0.wait(timeout=timeout) + process1.wait(timeout=timeout) + + # Check the return status of both processes + if process0.returncode != 0: + pytest.fail(f"Test {script_name} failed for RANK=0 with return code {process0.returncode}") + if process1.returncode != 0: + pytest.fail(f"Test {script_name} failed for RANK=1 with return code {process1.returncode}") + + except subprocess.TimeoutExpired: + # If either process times out, terminate both and fail the test + process0.terminate() + process1.terminate() + pytest.fail(f"Test {script_name} timed out") + except Exception as e: + pytest.fail(f"Test {script_name} failed with error: {str(e)}") + +# Define the test cases using pytest's parametrize +@pytest.mark.parametrize("script_name,timeout", [ + ("test_lookup_buffer.py", 60), # Second test case with a 60-second timeout + ("test_send_recv.py", 120) # First test case with a 120-second timeout +]) +def test_run_python_script(script_name, timeout): + # Check the number of GPUs + if torch.cuda.device_count() < 2: + pytest.skip(f"Skipping test {script_name} because fewer than 2 GPUs are available") + + # Run the test if there are at least 2 GPUs + run_python_script(script_name, timeout) diff --git a/tests/kv_transfer/test_lookup_buffer.sh 
b/tests/kv_transfer/test_lookup_buffer.sh deleted file mode 100644 index 336b540e7054..000000000000 --- a/tests/kv_transfer/test_lookup_buffer.sh +++ /dev/null @@ -1,3 +0,0 @@ - -RANK=0 python3 test_lookup_buffer.py & -RANK=1 python3 test_lookup_buffer.py & diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index 994b907e0c89..f6da7f88d5f5 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -42,7 +42,7 @@ def stress_test(my_rank, pipe): tensors: List[torch.Tensor] = [] - for i in tqdm(range(2000)): + for i in tqdm(range(500)): mean = torch.rand(1).item() std = torch.rand(1).item() size = torch.randint(900, 1000, (2, )) @@ -61,7 +61,7 @@ def stress_test(my_rank, pipe): torch.distributed.barrier() - for i in tqdm(range(2000)): + for i in tqdm(range(500)): if my_rank == int((i % 10) > 3): pipe.send_tensor(tensors[3 * i]) pipe.send_tensor(tensors[3 * i + 1]) @@ -89,7 +89,7 @@ def latency_test(my_rank, pipe, nelement, ntensor): torch.distributed.barrier() - for i in tqdm(range(1000)): + for i in tqdm(range(500)): tensors = [] @@ -134,4 +134,6 @@ def latency_test(my_rank, pipe, nelement, ntensor): torch.manual_seed(0) test_run(my_rank, pipe) stress_test(my_rank, pipe) - latency_test(my_rank, pipe, 1024 * 8 * 128, 80) + + # Use this function if you want to test the latency of pipe impl. 
+ # latency_test(my_rank, pipe, 1024 * 8 * 128, 80) diff --git a/tests/kv_transfer/test_send_recv.sh b/tests/kv_transfer/test_send_recv.sh deleted file mode 100644 index 2a478871bd0e..000000000000 --- a/tests/kv_transfer/test_send_recv.sh +++ /dev/null @@ -1,3 +0,0 @@ - -RANK=0 python3 test_send_recv.py & -RANK=1 python3 test_send_recv.py & From 58f5080cb53a2491af0645ef4e6cbcbac7c47588 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Mon, 16 Sep 2024 01:12:57 +0000 Subject: [PATCH 233/278] update tests --- tests/kv_transfer/disagg_test.py | 107 ++++++++++++++++++ .../{test_launcher.py => module_test.py} | 0 2 files changed, 107 insertions(+) create mode 100644 tests/kv_transfer/disagg_test.py rename tests/kv_transfer/{test_launcher.py => module_test.py} (100%) diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py new file mode 100644 index 000000000000..2e8414a9f464 --- /dev/null +++ b/tests/kv_transfer/disagg_test.py @@ -0,0 +1,107 @@ +import os +import sys +import subprocess +import time +import pytest +import requests +import signal +from subprocess import Popen +import torch + + +# Fixture to set up environment variables and teardown servers after tests +@pytest.fixture(scope="module", autouse=True) +def setup_servers(): + if torch.cuda.device_count() < 4: + pytest.skip("Skipping test: fewer than 4 GPUs available") + + # Set up environment variables + VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", shell=True).decode().strip() + os.environ["VLLM_HOST_IP"] = VLLM_HOST_IP + os.environ["VLLM_PORT"] = "12345" + + # Start prefill instance + prefill_cmd = [ + sys.executable, "-m", "vllm.entrypoints.openai.api_server", + "-tp", "2", + "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", + "--port", "8100", + "--gpu-memory-utilization", "0.8" + ] + prefill_env = os.environ.copy() + prefill_env["VLLM_DISAGG_PREFILL_ROLE"] = "prefill" + prefill_env["CUDA_VISIBLE_DEVICES"] = "0,1" + prefill_proc = Popen(prefill_cmd, 
env=prefill_env) + + # Start decode instance + decode_cmd = [ + sys.executable, "-m", "vllm.entrypoints.openai.api_server", + "-tp", "2", + "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", + "--port", "8200", + "--gpu-memory-utilization", "0.8" + ] + decode_env = os.environ.copy() + decode_env["VLLM_DISAGG_PREFILL_ROLE"] = "decode" + decode_env["CUDA_VISIBLE_DEVICES"] = "2,3" + decode_proc = Popen(decode_cmd, env=decode_env) + + # Wait for servers to be ready + assert wait_for_server(8100), "Prefill server did not start in time" + assert wait_for_server(8200), "Decode server did not start in time" + + # Yield to the test function and handle teardown after tests + yield + + # Cleanup: kill the processes + prefill_proc.terminate() + decode_proc.terminate() + + # Additional cleanup if needed + prefill_proc.wait() + decode_proc.wait() + +# Helper function to wait for server +def wait_for_server(port, timeout=120): + start_time = time.time() + while time.time() - start_time < timeout: + try: + response = requests.get(f"http://localhost:{port}/v1/completions") + if response.status_code in [200, 405]: + return True + except requests.ConnectionError: + time.sleep(1) + return False + +# Test function to send curl requests and validate responses +@pytest.mark.parametrize("prompt", [ + "San Francisco is a", + "Santa Clara is a" +]) +def test_disaggregated_prefilling(prompt): + # Send to prefill + response = requests.post( + "http://localhost:8100/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 1, + "temperature": 0 + } + ) + assert response.status_code == 200 + + # Send to decode + response = requests.post( + "http://localhost:8200/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 10, + "temperature": 0 + } + ) + assert response.status_code == 200 + \ 
No newline at end of file diff --git a/tests/kv_transfer/test_launcher.py b/tests/kv_transfer/module_test.py similarity index 100% rename from tests/kv_transfer/test_launcher.py rename to tests/kv_transfer/module_test.py From 8f0538c5053a7c59c8c22da646d394fd7b3bf01e Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Mon, 16 Sep 2024 01:14:18 +0000 Subject: [PATCH 234/278] make fmt checker happy --- tests/kv_transfer/disagg_test.py | 67 +++++++++++++---------------- tests/kv_transfer/module_test.py | 35 +++++++++------ tests/kv_transfer/test_send_recv.py | 2 +- 3 files changed, 54 insertions(+), 50 deletions(-) diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py index 2e8414a9f464..fa6a527574cf 100644 --- a/tests/kv_transfer/disagg_test.py +++ b/tests/kv_transfer/disagg_test.py @@ -1,11 +1,11 @@ import os -import sys import subprocess +import sys import time +from subprocess import Popen + import pytest import requests -import signal -from subprocess import Popen import torch @@ -16,16 +16,15 @@ def setup_servers(): pytest.skip("Skipping test: fewer than 4 GPUs available") # Set up environment variables - VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", shell=True).decode().strip() + VLLM_HOST_IP = subprocess.check_output("hostname -I | awk '{print $1}'", + shell=True).decode().strip() os.environ["VLLM_HOST_IP"] = VLLM_HOST_IP os.environ["VLLM_PORT"] = "12345" # Start prefill instance prefill_cmd = [ - sys.executable, "-m", "vllm.entrypoints.openai.api_server", - "-tp", "2", - "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", - "--port", "8100", + sys.executable, "-m", "vllm.entrypoints.openai.api_server", "-tp", "2", + "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--port", "8100", "--gpu-memory-utilization", "0.8" ] prefill_env = os.environ.copy() @@ -35,10 +34,8 @@ def setup_servers(): # Start decode instance decode_cmd = [ - sys.executable, "-m", "vllm.entrypoints.openai.api_server", - "-tp", "2", - 
"--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", - "--port", "8200", + sys.executable, "-m", "vllm.entrypoints.openai.api_server", "-tp", "2", + "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--port", "8200", "--gpu-memory-utilization", "0.8" ] decode_env = os.environ.copy() @@ -61,6 +58,7 @@ def setup_servers(): prefill_proc.wait() decode_proc.wait() + # Helper function to wait for server def wait_for_server(port, timeout=120): start_time = time.time() @@ -73,35 +71,30 @@ def wait_for_server(port, timeout=120): time.sleep(1) return False + # Test function to send curl requests and validate responses -@pytest.mark.parametrize("prompt", [ - "San Francisco is a", - "Santa Clara is a" -]) +@pytest.mark.parametrize("prompt", ["San Francisco is a", "Santa Clara is a"]) def test_disaggregated_prefilling(prompt): # Send to prefill - response = requests.post( - "http://localhost:8100/v1/completions", - headers={"Content-Type": "application/json"}, - json={ - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "prompt": prompt, - "max_tokens": 1, - "temperature": 0 - } - ) + response = requests.post("http://localhost:8100/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 1, + "temperature": 0 + }) assert response.status_code == 200 # Send to decode - response = requests.post( - "http://localhost:8200/v1/completions", - headers={"Content-Type": "application/json"}, - json={ - "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", - "prompt": prompt, - "max_tokens": 10, - "temperature": 0 - } - ) + response = requests.post("http://localhost:8200/v1/completions", + headers={"Content-Type": "application/json"}, + json={ + "model": + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "prompt": prompt, + "max_tokens": 10, + "temperature": 0 + }) assert response.status_code == 200 - \ No newline at end of file diff --git a/tests/kv_transfer/module_test.py 
b/tests/kv_transfer/module_test.py index 5c0aeb04b43f..10fb19a3128e 100644 --- a/tests/kv_transfer/module_test.py +++ b/tests/kv_transfer/module_test.py @@ -1,21 +1,25 @@ import subprocess -import pytest import sys + +import pytest import torch + def run_python_script(script_name, timeout): try: # Start both processes asynchronously using Popen process0 = subprocess.Popen( [sys.executable, script_name], - env={"RANK": "0"}, # Set the RANK environment variable for process 0 + env={"RANK": + "0"}, # Set the RANK environment variable for process 0 stdout=sys.stdout, # Pipe stdout to current stdout stderr=sys.stderr, # Pipe stderr to current stderr ) - + process1 = subprocess.Popen( [sys.executable, script_name], - env={"RANK": "1"}, # Set the RANK environment variable for process 1 + env={"RANK": + "1"}, # Set the RANK environment variable for process 1 stdout=sys.stdout, # Pipe stdout to current stdout stderr=sys.stderr, # Pipe stderr to current stderr ) @@ -26,9 +30,11 @@ def run_python_script(script_name, timeout): # Check the return status of both processes if process0.returncode != 0: - pytest.fail(f"Test {script_name} failed for RANK=0 with return code {process0.returncode}") + pytest.fail( + f"Test {script_name} failed for RANK=0, {process0.returncode}") if process1.returncode != 0: - pytest.fail(f"Test {script_name} failed for RANK=1 with return code {process1.returncode}") + pytest.fail( + f"Test {script_name} failed for RANK=1, {process1.returncode}") except subprocess.TimeoutExpired: # If either process times out, terminate both and fail the test @@ -38,15 +44,20 @@ def run_python_script(script_name, timeout): except Exception as e: pytest.fail(f"Test {script_name} failed with error: {str(e)}") + # Define the test cases using pytest's parametrize -@pytest.mark.parametrize("script_name,timeout", [ - ("test_lookup_buffer.py", 60), # Second test case with a 60-second timeout - ("test_send_recv.py", 120) # First test case with a 120-second timeout -]) 
+@pytest.mark.parametrize( + "script_name,timeout", + [ + ("test_lookup_buffer.py", + 60), # Second test case with a 60-second timeout + ("test_send_recv.py", 120) # First test case with a 120-second timeout + ]) def test_run_python_script(script_name, timeout): # Check the number of GPUs if torch.cuda.device_count() < 2: - pytest.skip(f"Skipping test {script_name} because fewer than 2 GPUs are available") - + pytest.skip( + f"Skipping test {script_name} because <2 GPUs are available") + # Run the test if there are at least 2 GPUs run_python_script(script_name, timeout) diff --git a/tests/kv_transfer/test_send_recv.py b/tests/kv_transfer/test_send_recv.py index f6da7f88d5f5..ff771f34c032 100644 --- a/tests/kv_transfer/test_send_recv.py +++ b/tests/kv_transfer/test_send_recv.py @@ -134,6 +134,6 @@ def latency_test(my_rank, pipe, nelement, ntensor): torch.manual_seed(0) test_run(my_rank, pipe) stress_test(my_rank, pipe) - + # Use this function if you want to test the latency of pipe impl. 
# latency_test(my_rank, pipe, 1024 * 8 * 128, 80) From dda1f312ee49ff32d843a141818b82d9f295cbea Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Mon, 16 Sep 2024 01:18:19 +0000 Subject: [PATCH 235/278] constraint the model length --- tests/kv_transfer/disagg_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py index fa6a527574cf..fffd9ab6f42a 100644 --- a/tests/kv_transfer/disagg_test.py +++ b/tests/kv_transfer/disagg_test.py @@ -25,7 +25,7 @@ def setup_servers(): prefill_cmd = [ sys.executable, "-m", "vllm.entrypoints.openai.api_server", "-tp", "2", "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--port", "8100", - "--gpu-memory-utilization", "0.8" + "--gpu-memory-utilization", "0.8", "--max-model-len", "1000", ] prefill_env = os.environ.copy() prefill_env["VLLM_DISAGG_PREFILL_ROLE"] = "prefill" @@ -36,7 +36,7 @@ def setup_servers(): decode_cmd = [ sys.executable, "-m", "vllm.entrypoints.openai.api_server", "-tp", "2", "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--port", "8200", - "--gpu-memory-utilization", "0.8" + "--gpu-memory-utilization", "0.8", "--max-model-len", "1000", ] decode_env = os.environ.copy() decode_env["VLLM_DISAGG_PREFILL_ROLE"] = "decode" From 85d72fa6d4db8987dbb6c1eceda48eecaf6bb90b Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Mon, 16 Sep 2024 01:25:17 +0000 Subject: [PATCH 236/278] adjust path --- tests/kv_transfer/module_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/kv_transfer/module_test.py b/tests/kv_transfer/module_test.py index 10fb19a3128e..355461919cd7 100644 --- a/tests/kv_transfer/module_test.py +++ b/tests/kv_transfer/module_test.py @@ -6,6 +6,7 @@ def run_python_script(script_name, timeout): + script_name = f'kv_transfer/{script_name}' try: # Start both processes asynchronously using Popen process0 = subprocess.Popen( From 60ede08c67dc2bbfa42bfbcd2496a3bd47b10f75 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: 
Mon, 16 Sep 2024 01:25:32 +0000 Subject: [PATCH 237/278] add disagg prefill test to test pipeline --- .buildkite/test-pipeline.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 9b0cb6663a55..da79fd86b767 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -390,6 +390,18 @@ steps: - pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pipeline_parallel.py +- label: Disaggregated Prefill Test # 4min + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/parallel_state.py + - vllm/distributed/kv_transfer + - vllm/worker/worker_base.py + - vllm/worker/model_runner.py + commands: + - pytest -v -s kv_transfer/module_test.py + - pytest -v -s kv_transfer/disagg_test.py + - label: LoRA Long Context (Distributed) # 11min # This test runs llama 13B, so it is required to run on 4 GPUs. num_gpus: 4 From 0df75665d81b7c842d36c58b6f42272cbe05d4b3 Mon Sep 17 00:00:00 2001 From: YaoJiayi <120040070@link.cuhk.edu.cn> Date: Mon, 16 Sep 2024 07:45:25 -0500 Subject: [PATCH 238/278] bugfix --- .../kv_lookup_buffer/simple_buffer.py | 3 +++ .../kv_pipe/torch_distributed_pipe.py | 5 ++-- vllm/distributed/kv_transfer/vllm_adapter.py | 3 +++ vllm/distributed/parallel_state.py | 24 +++++++++++++------ 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index 9696032002fd..bd9a122bdf40 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -169,8 +169,11 @@ def drop_select(self, input_tokens: torch.Tensor, if isinstance(roi, torch.Tensor): roi = roi.clone() + logger.debug(f"Sending signal {self.normal_signal}") self.signal_pipe.send_tensor(self.normal_signal) + 
logger.debug(f"Sending input tokens") self.data_pipe.send_tensor(input_tokens) + logger.debug(f"Sending roi") self.data_pipe.send_tensor(roi) input_tokens = self.data_pipe.recv_tensor() diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index 911bce88a38f..d4080f4739cf 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -189,10 +189,11 @@ def _send_impl(self, tensor): metadata = self._make_metadata(tensor) self._send_metadata(metadata) - - torch.distributed.send(tensor, + logger.debug(f"Sent meta {metadata}") + torch.distributed.send(tensor.to(self.device), dst=self.target_rank_for_send, group=self.device_group) + logger.debug(f"Sent tensor {tensor}") def _recv_impl(self) -> torch.Tensor: """ diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 2edb426c5c8d..4dc6e163abb7 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -107,6 +107,7 @@ def __init__( self.recv_buffer = SimpleKVLookupBuffer(self.recv_signal_pipe, self.recv_pipe, self.lookup_buffer_size) + self.tensor_device = 'cpu' else: # when performing disaggregated prefill, only 1 pipe is needed # at prefill instance this pipe is used for send KV cache @@ -125,6 +126,8 @@ def __init__( self.lookup_buffer_size) self.send_buffer = buffer self.recv_buffer = buffer + + self.tensor_device = 'cuda' def send_kv_caches_and_hidden_states( self, diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 3615fa6af399..81322793f4f1 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1056,13 +1056,23 @@ def initialize_model_parallel( # decode global rank: i + world_size group_ranks.append([i, i + world_size]) logger.debug("Distributed group is %s", 
str(group_ranks)) - _DISAGG = dist_kv.KV_transfer_agent( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - torch_distributed_backend=backend, - ) - logger.debug("_DISAGG initialized for rank %d", - torch.distributed.get_rank()) + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: + _DISAGG = dist_kv.KV_transfer_agent( + group_ranks=group_ranks, + local_rank=get_world_group().local_rank, + torch_distributed_backend=backend, + ) + logger.debug("_DISAGG initialized for rank %d", + torch.distributed.get_rank()) + elif dist_kv.IS_LMCACHE_INSTANCE: + _DISAGG = dist_kv.KV_transfer_agent( + group_ranks=group_ranks, + local_rank=get_world_group().local_rank, + torch_distributed_backend="gloo", + ) + logger.debug("_DISAGG (LMC) initialized for rank %d", + torch.distributed.get_rank()) + def ensure_model_parallel_initialized( From 73c1683f4534009df47c11acca84d91ab0ccb743 Mon Sep 17 00:00:00 2001 From: YaoJiayi <120040070@link.cuhk.edu.cn> Date: Mon, 16 Sep 2024 10:19:25 -0500 Subject: [PATCH 239/278] bugfix --- .../kv_transfer/kv_lookup_buffer/simple_buffer.py | 3 --- .../kv_transfer/kv_pipe/torch_distributed_pipe.py | 4 ++-- vllm/distributed/kv_transfer/vllm_adapter.py | 8 ++++---- vllm/worker/worker_base.py | 2 ++ 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index bd9a122bdf40..9696032002fd 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -169,11 +169,8 @@ def drop_select(self, input_tokens: torch.Tensor, if isinstance(roi, torch.Tensor): roi = roi.clone() - logger.debug(f"Sending signal {self.normal_signal}") self.signal_pipe.send_tensor(self.normal_signal) - logger.debug(f"Sending input tokens") self.data_pipe.send_tensor(input_tokens) - logger.debug(f"Sending roi") self.data_pipe.send_tensor(roi) input_tokens = 
self.data_pipe.recv_tensor() diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index d4080f4739cf..c2c5cbbe95b0 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -189,11 +189,11 @@ def _send_impl(self, tensor): metadata = self._make_metadata(tensor) self._send_metadata(metadata) - logger.debug(f"Sent meta {metadata}") + #logger.debug(f"Sent meta {metadata}") torch.distributed.send(tensor.to(self.device), dst=self.target_rank_for_send, group=self.device_group) - logger.debug(f"Sent tensor {tensor}") + #logger.debug(f"Sent tensor {tensor}") def _recv_impl(self) -> torch.Tensor: """ diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 4dc6e163abb7..caebc15f09b2 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -250,8 +250,8 @@ def recv_kv_caches_and_hidden_states( key_cache, value_cache = kv_cache[0], kv_cache[1] ops.reshape_and_cache_flash( - keys[i], - values[i], + keys[i - model_executable.model.start_layer].to(key_cache.device), + values[i - model_executable.model.start_layer].to(value_cache.device), key_cache, value_cache, slot_mapping[start_pos:end_pos], @@ -269,7 +269,7 @@ def recv_kv_caches_and_hidden_states( # so we need to recompute the hidden state logger.debug("[rank%d]: KV EMPTY recv DONE.", torch.distributed.get_rank()) - return None, bypass_model_exec, None + return None, bypass_model_exec, model_input if not is_complete: rebuilt_model_input = self.build_partial_prefill_input( @@ -282,7 +282,7 @@ def recv_kv_caches_and_hidden_states( ) logger.debug("[rank%d]: KV PARTIAL recv DONE.", torch.distributed.get_rank()) - return None, bypass_model_exec, rebuilt_model_input + return None, False, rebuilt_model_input # concatenate hidden states from different 
requests hidden_or_intermediate_states = torch.cat( diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index d55400a40240..ae2fb65cc455 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -334,6 +334,7 @@ def execute_model( from vllm.worker.model_runner import GPUModelRunnerBase assert isinstance(self.model_runner, GPUModelRunnerBase), \ "Distributed KV transfer only support GPU modelrunner" + logger.debug("Receiving KV caches") hidden_or_intermediate_states, bypass_model_exec, model_input = \ ps.get_disagg_group().recv_kv_caches_and_hidden_states( # model is used to know which layer the current worker @@ -361,6 +362,7 @@ def execute_model( from vllm.worker.model_runner import GPUModelRunnerBase assert isinstance(self.model_runner, GPUModelRunnerBase), \ "Distributed KV transfer only support GPU modelrunner" + logger.debug("Sending KV caches") ps.get_disagg_group().send_kv_caches_and_hidden_states( # model is used to know which layer the current worker # is working on, so that we can send KV for only those From 70bec94bfed89a8b8d9eae207f6e06f6ea2c6447 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 19 Sep 2024 00:51:29 +0000 Subject: [PATCH 240/278] rename the environment variable to KV producer and KV consumer, for more clarity --- vllm/distributed/kv_transfer/vllm_adapter.py | 80 +- vllm/distributed/parallel_state.py | 36 +- vllm/envs.py | 6 +- vllm/worker/model_runner.py | 128 ++- vllm/worker/worker.py | 869 +++++++++---------- 5 files changed, 603 insertions(+), 516 deletions(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index caebc15f09b2..17c2e52b1174 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -1,5 +1,5 @@ """vLLM distributed KV cache transfer API. -These APIs are used in `vllm/worker/worker_base.py`. +These APIs are used in `vllm/worker/model_runner.py`. Currently supporting TP. 
The TP between prefill and decode instance needs to be the same. @@ -38,20 +38,22 @@ from vllm.logger import init_logger from vllm.sequence import IntermediateTensors -assert envs.VLLM_DISAGG_PREFILL_ROLE in [None, "prefill", "decode", "lmcache"],\ - "VLLM_DISAGG_PREFILL_ROLE can only be prefill, decode or lmcache." +logger = init_logger(__name__) -# currently the connections are hard-coded. -# we only handle 2 cases: -# - prefill vLLM --> decode vLLM -# - vLLM --> LMCache -IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE - in ["prefill", "decode"]) -IS_KV_PREFILL_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "prefill") -IS_KV_DECODE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "decode") -IS_LMCACHE_INSTANCE: bool = (envs.VLLM_DISAGG_PREFILL_ROLE == "lmcache") +# check VLLM_DISTRIBUTERD_KV_ROLE and set corresponding flags +assert envs.VLLM_DISTRIBUTERD_KV_ROLE in [None, "producer", "consumer", "both"],\ + "VLLM_DISTRIBUTERD_KV_ROLE can only be producer, consumer or both." +IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISTRIBUTERD_KV_ROLE + in ["producer", "consumer", "both"]) +IS_KV_PRODUCER: bool = (envs.VLLM_DISTRIBUTERD_KV_ROLE in ["producer", "both"]) +IS_KV_CONSUMER: bool = (envs.VLLM_DISTRIBUTERD_KV_ROLE == ["consumer", "both"]) + +# When the current instance is both KV producer and KV consumer, +# it is likely connected to a KV storage service on CPU/disk +# so the communication backend needs to be "gloo" for that case. 
+DISTRIBUTED_BACKEND: str = "gloo" if (IS_KV_PRODUCER and IS_KV_CONSUMER) else "nccl" +DISTRIBUTED_DEVICE: str = "cpu" if (IS_KV_PRODUCER and IS_KV_CONSUMER) else "cuda" -logger = init_logger(__name__) class KV_transfer_agent: @@ -67,7 +69,7 @@ def __init__( self, group_ranks: List[List[int]], local_rank: int, - torch_distributed_backend: Union[str, Backend], + torch_distributed_backend: Union[str, Backend] = DISTRIBUTED_BACKEND, # FIXME(Kuntai): remove this hardcoding lookup_buffer_size: int = int(1e10)): @@ -78,13 +80,15 @@ def __init__( SimpleKVLookupBuffer = sklb.SimpleKVLookupBuffer - if IS_LMCACHE_INSTANCE: - # when vLLM is connected with LMCache - # it needs to both send and recv KV cache + # In disaggregated prefill, the prefill vLLM only uses send pipe + # and the decode vLLM only uses recv pipe + # In remote KV cache store, vLLM will use both send pipe and recv pipe + # So we build both send pipe and recv pipe for simplicity. + if IS_KV_PRODUCER: self.send_pipe = TorchDistributedPipe( group_ranks, local_rank, - torch_distributed_backend, + DISTRIBUTED_BACKEND, ) self.send_signal_pipe = TorchDistributedPipe( group_ranks, @@ -94,7 +98,7 @@ def __init__( self.recv_pipe = TorchDistributedPipe( group_ranks, local_rank, - torch_distributed_backend, + DISTRIBUTED_BACKEND, ) self.recv_signal_pipe = TorchDistributedPipe( group_ranks, @@ -107,27 +111,39 @@ def __init__( self.recv_buffer = SimpleKVLookupBuffer(self.recv_signal_pipe, self.recv_pipe, self.lookup_buffer_size) - self.tensor_device = 'cpu' + self.tensor_device = DISTRIBUTED_DEVICE else: - # when performing disaggregated prefill, only 1 pipe is needed - # at prefill instance this pipe is used for send KV cache - # at decode instance this pipe is used for recv KV cache - self.pipe = TorchDistributedPipe( + + # the current vLLM instance is KV consumer, so it needs to connect + # its recv pipe to the send pipe of KV producder + + self.recv_pipe = TorchDistributedPipe( group_ranks, local_rank, - 
torch_distributed_backend, + DISTRIBUTED_BACKEND, ) - self.signal_pipe = TorchDistributedPipe( + self.recv_signal_pipe = TorchDistributedPipe( group_ranks, local_rank, "gloo", ) - buffer = SimpleKVLookupBuffer(self.signal_pipe, self.pipe, - self.lookup_buffer_size) - self.send_buffer = buffer - self.recv_buffer = buffer - - self.tensor_device = 'cuda' + self.send_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + DISTRIBUTED_BACKEND, + ) + self.send_signal_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + "gloo", + ) + self.send_buffer = SimpleKVLookupBuffer(self.send_signal_pipe, + self.send_pipe, + self.lookup_buffer_size) + self.recv_buffer = SimpleKVLookupBuffer(self.recv_signal_pipe, + self.recv_pipe, + self.lookup_buffer_size) + self.tensor_device = DISTRIBUTED_DEVICE def send_kv_caches_and_hidden_states( self, diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index b619fadb37c6..54f7968908de 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -981,10 +981,10 @@ def init_distributed_environment( # this backend is used for WORLD maybe_disagg_world_size = world_size maybe_disagg_rank = rank - if dist_kv.IS_DISTRIBUTED_KV_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE: + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: maybe_disagg_world_size = world_size * 2 logger.debug("Disaggregated prefill enabled.") - if dist_kv.IS_KV_PREFILL_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE: + if dist_kv.IS_KV_PRODUCER: # for prefill, the ranks are [0, world_size) maybe_disagg_rank = rank else: @@ -1016,7 +1016,7 @@ def init_distributed_environment( if _WORLD is None: ranks = [[i for i in range(world_size)]] # offset the distributed group - if dist_kv.IS_DISTRIBUTED_KV_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE: + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: ranks = include_decoding_groups_if_disagg_enabled( ranks, world_size) @@ -1079,9 +1079,9 @@ def initialize_model_parallel( world_size: int = 
torch.distributed.get_world_size() backend = backend or torch.distributed.get_backend( get_world_group().device_group) - if dist_kv.IS_DISTRIBUTED_KV_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE: + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: # Disaggregated prefill enabled - # The world_size for this vLLM instance is tp * pp, but + # This vLLM instance thinks its word size is tp * pp, but # torch.distributed contains 2 vLLM instances, # its world size is 2 * tp * pp # Adjust the world_size to match. @@ -1135,8 +1135,8 @@ def initialize_model_parallel( group_name="pp") logger.debug("_PP initialized for rank %d", torch.distributed.get_rank()) - # TODO(Jiayi): perhaps we need to separate lmcache and disagg - if dist_kv.IS_DISTRIBUTED_KV_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE: + + if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: global _DISAGG logger.debug("Disaggregated prefill enabled, create _DISAGG group") group_ranks = [] @@ -1145,22 +1145,12 @@ def initialize_model_parallel( # decode global rank: i + world_size group_ranks.append([i, i + world_size]) logger.debug("Distributed group is %s", str(group_ranks)) - if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: - _DISAGG = dist_kv.KV_transfer_agent( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - torch_distributed_backend=backend, - ) - logger.debug("_DISAGG initialized for rank %d", - torch.distributed.get_rank()) - elif dist_kv.IS_LMCACHE_INSTANCE: - _DISAGG = dist_kv.KV_transfer_agent( - group_ranks=group_ranks, - local_rank=get_world_group().local_rank, - torch_distributed_backend="gloo", - ) - logger.debug("_DISAGG (LMC) initialized for rank %d", - torch.distributed.get_rank()) + _DISAGG = dist_kv.KV_transfer_agent( + group_ranks=group_ranks, + local_rank=get_world_group().local_rank, + ) + logger.debug("_DISAGG initialized for rank %d", + torch.distributed.get_rank()) def ensure_model_parallel_initialized( diff --git a/vllm/envs.py b/vllm/envs.py index 65c67104349e..407dd942cc9f 100644 --- a/vllm/envs.py +++ 
b/vllm/envs.py @@ -367,9 +367,9 @@ def get_default_config_root(): lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")), # Specify the role of current vllm instance - # Value can be "prefill", "decode". - "VLLM_DISAGG_PREFILL_ROLE": - lambda: os.getenv("VLLM_DISAGG_PREFILL_ROLE", None), + # Value can be "producer", "consumer" or "both". + "VLLM_DISTRIBUTERD_KV_ROLE": + lambda: os.getenv("VLLM_DISTRIBUTERD_KV_ROLE", None), # If set, vllm will skip the deprecation warnings. "VLLM_NO_DEPRECATION_WARNING": diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 7293ffe86c0f..78ffc40082fc 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -22,7 +22,7 @@ ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig) from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import get_pp_group +from vllm.distributed import get_pp_group, get_disagg_group from vllm.distributed.parallel_state import graph_capture from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger @@ -55,6 +55,8 @@ _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict, dump_input_when_exception) +import vllm.distributed.kv_transfer.vllm_adapter as dist_kv + if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionBackend @@ -1576,6 +1578,24 @@ def execute_model( else: model_executable = self.model + # Receive KV cache in distributed KV cache transfer setting + # In disagg prefill setting, it will also recv hidden states and bypass + # model forwarding + # In KV cache database setting, it will change the model input so that + # we can skip prefilling on tokens that successfully received KV caches + # NOTE: The receive operation is blocking + bypass_model_exec = False + if self.recv_kv_needed(model_input, kv_caches): + hidden_or_intermediate_states, bypass_model_exec, model_input = \ + get_disagg_group().recv_kv_caches_and_hidden_states( + # 
model is used to know which layer the current worker + # is working on, so that we can receive KV for only those + # layers. + model_executable, + model_input, + kv_caches=kv_caches + ) + multi_modal_kwargs = model_input.multi_modal_kwargs or {} seqlen_agnostic_kwargs = { "finished_requests_ids": model_input.finished_requests_ids, @@ -1587,20 +1607,36 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - hidden_or_intermediate_states = model_executable( - input_ids=model_input.input_tokens, - positions=model_input.input_positions, - kv_caches=kv_caches, - attn_metadata=model_input.attn_metadata, - intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, - device=self.device), - **seqlen_agnostic_kwargs) + if not bypass_model_exec: + hidden_or_intermediate_states = model_executable( + input_ids=model_input.input_tokens, + positions=model_input.input_positions, + kv_caches=kv_caches, + attn_metadata=model_input.attn_metadata, + intermediate_tensors=intermediate_tensors, + **MultiModalInputs.as_kwargs(multi_modal_kwargs, + device=self.device), + **seqlen_agnostic_kwargs) if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_end.record() + + # Sending KV cache in distributed KV cache transfer setting + # NOTE: the send operation is non-blocking + if self.send_kv_needed(model_input, kv_caches): + logger.debug("Sending KV caches") + get_disagg_group().send_kv_caches_and_hidden_states( + # model is used to know which layer the current worker + # is working on, so that we can send KV for only those + # layers. + model_executable, + model_input, + kv_caches, + hidden_or_intermediate_states, + ) + # Compute the logits in the last pipeline stage. 
if not get_pp_group().is_last_rank: if (self.is_driver_worker and hidden_or_intermediate_states is not None @@ -1619,15 +1655,6 @@ def execute_model( torch.tensor(model_forward_time + orig_model_forward_time)) return hidden_or_intermediate_states - return hidden_or_intermediate_states - - @torch.inference_mode() - def postprocess_model( - self, - model_input: ModelInputForGPUWithSamplingMetadata, - hidden_or_intermediate_states, - ) -> Optional[Union[List[SamplerOutput], IntermediateTensors]]: - logits = self.model.compute_logits(hidden_or_intermediate_states, model_input.sampling_metadata) @@ -1642,9 +1669,23 @@ def postprocess_model( logits=logits, sampling_metadata=model_input.sampling_metadata, ) + if (self.observability_config is not None + and self.observability_config.collect_model_forward_time + and output is not None): + model_forward_end.synchronize() + model_forward_time = model_forward_start.elapsed_time( + model_forward_end) + orig_model_forward_time = 0.0 + if intermediate_tensors is not None: + orig_model_forward_time = intermediate_tensors.tensors.get( + "model_forward_time", torch.tensor(0.0)).item() + # If there are multiple workers, we are still tracking the latency + # from the start time of the driver worker to the end time of the + # driver worker. The model forward time will then end up covering + # the communication time as well. + output.model_forward_time = (orig_model_forward_time + + model_forward_time) - assert model_input.attn_metadata is not None - decode_meta = model_input.attn_metadata.decode_metadata if self.return_hidden_states: # we only need to pass hidden states of most recent token assert model_input.sampling_metadata is not None @@ -1661,6 +1702,49 @@ def postprocess_model( output.hidden_states = hidden_states return [output] + + + def recv_kv_needed(self, model_input, kv_caches) -> bool: + """ + Need to receive KV when + 1. current vLLM instance is KV cache *consumer* + 2. this batch is not a profiling run + 3. 
this batch is a prefill run + """ + + prefill_meta = model_input.attn_metadata.prefill_metadata + + # check if the current run is profiling + is_profile_run = (kv_caches is None) or (kv_caches[0] is None) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + + return all([ + dist_kv.IS_KV_DECODE_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE, + not is_profile_run, + is_prefill_run, + ]) + + def send_kv_needed(self, model_input, kv_caches) -> bool: + """ + Need to send KV when + 1. current vLLM instance is KV cache *producer* + 2. this batch is not a profiling run + 3. this batch is a prefill run + """ + + prefill_meta = model_input.attn_metadata.prefill_metadata + + # check if the current run is profiling + is_profile_run = (kv_caches is None) or (kv_caches[0] is None) + # check if the current run is prefill + is_prefill_run = prefill_meta is not None + + return all([ + dist_kv.IS_KV_PREFILL_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE, + not is_profile_run, + is_prefill_run + ]) class CUDAGraphRunner: @@ -1844,4 +1928,4 @@ def _get_max_graph_batch_size(max_num_seqs: int) -> int: if padded_size in _BATCH_SIZES_TO_CAPTURE: return padded_size assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1] - return _BATCH_SIZES_TO_CAPTURE[-1] + return _BATCH_SIZES_TO_CAPTURE[-1] \ No newline at end of file diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 3851843afc96..abc6f98b5f30 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,488 +1,485 @@ -"""A GPU worker class.""" -import gc +import dataclasses +import importlib import os -from typing import Dict, List, Optional, Set, Tuple, Type, Union +import time +from abc import ABC, abstractmethod +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union import torch -import torch.distributed - -import vllm.envs as envs -from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - 
PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) -from vllm.distributed import (ensure_model_parallel_initialized, - init_distributed_environment, - set_custom_all_reduce) + +from vllm.config import ObservabilityConfig +from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.platforms import current_platform -from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, - SequenceGroupMetadata, SequenceGroupMetadataDelta) -from vllm.worker.cache_engine import CacheEngine -from vllm.worker.embedding_model_runner import EmbeddingModelRunner -from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner -from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner -from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput +from vllm.sequence import ExecuteModelRequest, IntermediateTensors +from vllm.utils import (enable_trace_function_call_for_thread, + update_environment_variables) +from vllm.worker.model_runner_base import (BroadcastableModelInput, + ModelRunnerBase, + ModelRunnerInputBase) logger = init_logger(__name__) -class Worker(LocalOrDistributedWorkerBase): - """A worker class that executes (a partition of) the model on a GPU. - - Each worker is associated with a single GPU. The worker is responsible for - maintaining the KV cache and executing the model on the GPU. In case of - distributed inference, each worker is assigned a partition of the model. +class WorkerBase(ABC): + """Worker interface that allows vLLM to cleanly separate implementations for + different hardware. 
Also abstracts control plane communication, e.g., to + communicate request metadata to other workers. """ - def __init__( - self, - model_config: ModelConfig, - parallel_config: ParallelConfig, - scheduler_config: SchedulerConfig, - device_config: DeviceConfig, - cache_config: CacheConfig, - load_config: LoadConfig, - local_rank: int, - rank: int, - distributed_init_method: str, - lora_config: Optional[LoRAConfig] = None, - speculative_config: Optional[SpeculativeConfig] = None, - prompt_adapter_config: Optional[PromptAdapterConfig] = None, - is_driver_worker: bool = False, - model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None, - observability_config: Optional[ObservabilityConfig] = None, - ) -> None: - self.model_config = model_config - self.parallel_config = parallel_config - self.parallel_config.rank = rank - self.scheduler_config = scheduler_config - self.device_config = device_config - self.cache_config = cache_config - self.local_rank = local_rank - self.rank = rank - self.distributed_init_method = distributed_init_method - self.lora_config = lora_config - self.load_config = load_config - self.prompt_adapter_config = prompt_adapter_config - self.is_driver_worker = is_driver_worker - if parallel_config and is_driver_worker: - assert rank % parallel_config.tensor_parallel_size == 0, \ - "Driver worker should be rank 0 of tensor parallel group." 
- if self.model_config.trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() - self.observability_config = observability_config - - # Return hidden states from target model if the draft model is an - # mlp_speculator - speculative_args = {} if speculative_config is None \ - or (speculative_config.draft_model_config.model == - model_config.model) \ - or (speculative_config.draft_model_config.hf_config.model_type - not in ["medusa", "mlp_speculator", "eagle"]) \ - else {"return_hidden_states": True} - - ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner - if model_runner_cls is not None: - ModelRunnerClass = model_runner_cls - elif self._is_embedding_model(): - ModelRunnerClass = EmbeddingModelRunner - elif self._is_encoder_decoder_model(): - ModelRunnerClass = EncoderDecoderModelRunner - self.model_runner: GPUModelRunnerBase = ModelRunnerClass( - model_config, - parallel_config, - scheduler_config, - device_config, - cache_config, - load_config=load_config, - lora_config=self.lora_config, - kv_cache_dtype=self.cache_config.cache_dtype, - is_driver_worker=is_driver_worker, - prompt_adapter_config=prompt_adapter_config, - observability_config=observability_config, - **speculative_args, - ) - # Uninitialized cache engine. Will be initialized by - # initialize_cache. - self.cache_engine: List[CacheEngine] - # Initialize gpu_cache as embedding models don't initialize kv_caches - self.gpu_cache: Optional[List[List[torch.Tensor]]] = None - self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} - - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR - logger.info("Profiling enabled. 
Traces will be saved to: %s", - torch_profiler_trace_dir) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - with_stack=True, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, use_gzip=True)) - else: - self.profiler = None + @abstractmethod + def init_device(self) -> None: + """Initialize device state, such as loading the model or other on-device + memory allocations. + """ + raise NotImplementedError - def start_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.start() + @abstractmethod + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Determine the number of available blocks for the GPU KV cache and + swappable CPU KV cache. + + The implementation may run profiling or other heuristics to determine + the size of caches. - def stop_profile(self): - if self.profiler is None: - raise RuntimeError("Profiler is not enabled.") - self.profiler.stop() + Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks + are blocks that are "active" on the device and can be appended to. + num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be + appended to. + """ + raise NotImplementedError - def _is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder_model + @abstractmethod + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + """Initialize the KV cache with the given size in blocks. + """ + raise NotImplementedError - def _is_embedding_model(self): - return self.model_config.is_embedding_model + @current_platform.inference_mode() + def start_worker_execution_loop(self) -> None: + """Execute model loop in parallel worker. - def init_device(self) -> None: - if self.device_config.device.type == "cuda": - # torch.distributed.all_reduce does not free the input tensor until - # the synchronization point. 
This causes the memory usage to grow - # as the number of all_reduce calls increases. This env var disables - # this behavior. - # Related issue: - # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573 - os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" - - # This env var set by Ray causes exceptions with graph building. - os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) - self.device = torch.device(f"cuda:{self.local_rank}") - torch.cuda.set_device(self.device) - - _check_if_gpu_supports_dtype(self.model_config.dtype) - gc.collect() - torch.cuda.empty_cache() - self.init_gpu_memory = torch.cuda.mem_get_info()[0] - else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") - # Initialize the distributed environment. - init_worker_distributed_environment(self.parallel_config, self.rank, - self.distributed_init_method, - self.local_rank) - # Set random seed. - set_random_seed(self.model_config.seed) - - def load_model(self): - self.model_runner.load_model() - - def save_sharded_state( - self, - path: str, - pattern: Optional[str] = None, - max_size: Optional[int] = None, - ) -> None: - self.model_runner.save_sharded_state( - path, - pattern=pattern, - max_size=max_size, - ) + You can stop the loop by executing a driver worker with an empty output. + See `stop_remote_worker_execution_loop` for more details. 
+ """ + while True: + output = self.execute_model(execute_model_req=None) + if output is None: + return None - def save_tensorized_model( + @abstractmethod + def execute_model( self, - tensorizer_config: TensorizerConfig, - ) -> None: - self.model_runner.save_tensorized_model( - tensorizer_config=tensorizer_config, ) + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> Optional[List[SamplerOutput]]: + raise NotImplementedError - @torch.inference_mode() - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Profiles the peak memory usage of the model to determine how many - KV blocks may be allocated without OOMs. + @abstractmethod + def get_cache_block_size_bytes(self) -> int: + """Return the size of a single cache block, in bytes. Used in + speculative decoding. + """ + raise NotImplementedError - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. + @abstractmethod + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. - torch.cuda.empty_cache() - - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. - self.model_runner.profile_run() - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. - torch.cuda.synchronize() - free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() - # NOTE(woosuk): Here we assume that the other processes using the same - # GPU did not change their memory usage during the profiling. - peak_memory = self.init_gpu_memory - free_gpu_memory - assert peak_memory > 0, ( - "Error in memory profiling. 
" - f"Initial free memory {self.init_gpu_memory}, current free memory" - f" {free_gpu_memory}. This happens when the GPU memory was " - "not properly cleaned up before initializing the vLLM instance.") - - cache_block_size = self.get_cache_block_size_bytes() - num_gpu_blocks = int( - (total_gpu_memory * self.cache_config.gpu_memory_utilization - - peak_memory) // cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // - cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - num_cpu_blocks = max(num_cpu_blocks, 0) - if self.model_runner.lora_manager: - self.model_runner.remove_all_loras() - gc.collect() - torch.cuda.empty_cache() - return num_gpu_blocks, num_cpu_blocks + @abstractmethod + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError + + @abstractmethod + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError + + @abstractmethod + def list_loras(self) -> Set[int]: + raise NotImplementedError - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Allocate GPU and CPU KV cache with the specified number of blocks. - This also warms up the model, which may record CUDA graphs. +class LoraNotSupportedWorkerBase(WorkerBase): + """Partial implementation of WorkerBase that raises exceptions when LoRA + methods are invoked. + """ + + def add_lora(self, lora_request: LoRARequest) -> bool: + raise ValueError(f"{type(self)} does not support LoRA") + + def remove_lora(self, lora_id: int) -> bool: + raise ValueError(f"{type(self)} does not support LoRA") + + def pin_lora(self, lora_id: int) -> bool: + return ValueError( + f"{type(self)} does not support LoRA") # type: ignore + + def list_loras(self) -> Set[int]: + raise ValueError(f"{type(self)} does not support LoRA") + + +@dataclasses.dataclass(frozen=True) +class WorkerInput: + """Local inputs to each worker. May contain device-specific data. These + fields should be broadcastable to other workers. 
+ """ + + num_seq_groups: Optional[int] = None + blocks_to_swap_in: Optional[torch.Tensor] = None + blocks_to_swap_out: Optional[torch.Tensor] = None + blocks_to_copy: Optional[torch.Tensor] = None + virtual_engine: int = 0 + num_steps: int = 1 + + @classmethod + def from_broadcasted_tensor_dict( + cls: Type["WorkerInput"], + tensor_dict: Dict[str, Any], + ) -> "WorkerInput": + """ + Pop fields from the given tensor_dict and populate a new instance of + WorkerInput. """ - raise_if_cache_size_invalid(num_gpu_blocks, - self.cache_config.block_size, - self.model_config.max_model_len) - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = num_cpu_blocks - - self._init_cache_engine() - self._warm_up_model() - - def _init_cache_engine(self): - assert self.cache_config.num_gpu_blocks is not None - self.cache_engine = [ - CacheEngine(self.cache_config, self.model_config, - self.parallel_config, self.device_config) - for _ in range(self.parallel_config.pipeline_parallel_size) - ] - self.gpu_cache = [ - self.cache_engine[ve].gpu_cache - for ve in range(self.parallel_config.pipeline_parallel_size) - ] - - def _warm_up_model(self) -> None: - if not self.model_config.enforce_eager: - self.model_runner.capture_model(self.gpu_cache) - # Reset the seed to ensure that the random state is not affected by - # the model initialization and profiling. - set_random_seed(self.model_config.seed) + return cls( + num_seq_groups=tensor_dict.pop("num_seq_groups"), + blocks_to_swap_in=tensor_dict.pop("blocks_to_swap_in"), + blocks_to_swap_out=tensor_dict.pop("blocks_to_swap_out"), + blocks_to_copy=tensor_dict.pop("blocks_to_copy"), + virtual_engine=tensor_dict["virtual_engine"], + num_steps=tensor_dict.pop("num_steps"), + ) + + def as_broadcastable_tensor_dict( + self) -> Dict[str, Union[int, torch.Tensor]]: + """ + Extract broadcastable fields. 
+ """ + tensor_dict = { + "num_seq_groups": self.num_seq_groups, + "blocks_to_swap_in": self.blocks_to_swap_in, + "blocks_to_swap_out": self.blocks_to_swap_out, + "blocks_to_copy": self.blocks_to_copy, + "virtual_engine": self.virtual_engine, + "num_steps": self.num_steps, + } + + return tensor_dict + + +class LocalOrDistributedWorkerBase(WorkerBase): + """ + Partial implementation of WorkerBase that has a default `execute_model` + definition to perform metadata transfer between workers when in distributed + mode. Subclasses of this interface should use model runners that inherit + from ModelRunnerBase, and should only need to implement worker-local logic. + If custom control plane logic is needed to transfer metadata, or if the + model runner cannot inherit from ModelRunnerBase, use WorkerBase instead. + """ + is_driver_worker: bool + model_runner: ModelRunnerBase + observability_config: Optional[ObservabilityConfig] = None @property + @abstractmethod def do_metadata_broadcast(self) -> bool: - return self.parallel_config.tensor_parallel_size > 1 + """ + Used by the default `execute_model` to check whether broadcast is + needed to transfer request inputs from the driver worker to other + workers in the TP group. If WorkerBase subclass only supports + single-worker execution, then this method should return False. + """ + raise NotImplementedError @property + @abstractmethod def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - return self.gpu_cache + """ + Gets the list of kv caches to pass to the worker's model runner. Each + element in the list is a kv cache corresponding to a particular virtual + engine (PP stream). Used by the default `execute_model`. If the worker's + model runner does not follow the ModelRunnerBase interface, then inherit + from WorkerBase instead. 
+ """ + raise NotImplementedError - @torch.inference_mode() + @abstractmethod def prepare_worker_input( self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - virtual_engine = execute_model_req.virtual_engine - num_steps = execute_model_req.num_steps - num_seq_groups = len(execute_model_req.seq_group_metadata_list) - # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. - # they contain parameters to launch cudamemcpyasync. - blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, - device="cpu", - dtype=torch.int64).view(-1, 2) - blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, - device="cpu", - dtype=torch.int64).view(-1, 2) - # `blocks_to_copy` is a gpu tensor. The src and tgt of - # blocks to copy are in the same device, and `blocks_to_copy` - # can be used directly within cuda kernels. - blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, - device=self.device, - dtype=torch.int64).view(-1, 2) - - return WorkerInput( - num_seq_groups=num_seq_groups, - blocks_to_swap_in=blocks_to_swap_in, - blocks_to_swap_out=blocks_to_swap_out, - blocks_to_copy=blocks_to_copy, - virtual_engine=virtual_engine, - num_steps=num_steps, - ) + """ + Prepare the inputs to WorkerBase.execute_worker from an execution + request. This method may move data to the worker's local device. It is + not allowed to communicate with other workers or devices. + """ + raise NotImplementedError - @torch.inference_mode() + @abstractmethod def execute_worker(self, worker_input: WorkerInput) -> None: - virtual_engine = worker_input.virtual_engine - # Issue cache operations. 
- if (worker_input.blocks_to_swap_in is not None - and worker_input.blocks_to_swap_in.numel() > 0): - self.cache_engine[virtual_engine].swap_in( - worker_input.blocks_to_swap_in) - if (worker_input.blocks_to_swap_out is not None - and worker_input.blocks_to_swap_out.numel() > 0): - self.cache_engine[virtual_engine].swap_out( - worker_input.blocks_to_swap_out) - if (worker_input.blocks_to_copy is not None - and worker_input.blocks_to_copy.numel() > 0): - self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) - - def _get_cached_seq_group_metadata( - self, - seq_group_metadata_list: List[Union[SequenceGroupMetadata, - SequenceGroupMetadataDelta]], - finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: - """Return a list of cached Sequence Group Metadata after updating its - state. - - It is used because scheduler only sends delta to workers to reduce - the data payload size. The function also cleans up cache based on - a given `finished_request_ids`. """ - new_seq_group_metadata_list = [] - for metadata_or_delta in seq_group_metadata_list: - request_id = metadata_or_delta.request_id - if request_id not in self._seq_group_metadata_cache: - # The first prefill. - assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[request_id] = metadata_or_delta - else: - # The first prefill is already cached. - if isinstance(metadata_or_delta, SequenceGroupMetadataDelta): - self._seq_group_metadata_cache[request_id].apply_delta( - metadata_or_delta) - else: - # If metadata snapshot is sent again, it is - # preempted. Reset the cache because we need to start - # from scratch. 
- assert isinstance(metadata_or_delta, SequenceGroupMetadata) - self._seq_group_metadata_cache[ - request_id] = metadata_or_delta - - new_seq_group_metadata_list.append( - self._seq_group_metadata_cache[request_id]) - - # Clean up finished ids - for finished_id in finished_request_ids: - del self._seq_group_metadata_cache[finished_id] - - return new_seq_group_metadata_list - - def _execute_model_spmd( - self, - execute_model_req: ExecuteModelRequest, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> Optional[List[SamplerOutput]]: - if execute_model_req is not None: - new_seq_group_metadata_list = self._get_cached_seq_group_metadata( + Process an execution request. + """ + raise NotImplementedError + + def _get_worker_input_from_broadcast( + self + ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ + str, torch.Tensor]]]: + """ Get the worker input from the broadcasted tensor dict. """ + assert self.do_metadata_broadcast + assert not self.is_driver_worker + broadcast_data = broadcast_tensor_dict(src=0) + if not broadcast_data: + return None + + worker_input = WorkerInput.from_broadcasted_tensor_dict(broadcast_data) + model_input = ( + self.model_runner.make_model_input_from_broadcasted_tensor_dict( + broadcast_data)) + + kwargs = extract_previous_hidden_states(broadcast_data) + + return model_input, worker_input, kwargs + + def _get_driver_input_and_broadcast( + self, execute_model_req: ExecuteModelRequest + ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: + """ Get the driver input and broadcast it to other workers. 
""" + assert self.is_driver_worker + + worker_input: WorkerInput = self.prepare_worker_input( + execute_model_req=execute_model_req) + model_input: ModelRunnerInputBase = ( + self.model_runner.prepare_model_input( execute_model_req.seq_group_metadata_list, - execute_model_req.finished_requests_ids) + execute_model_req.virtual_engine, + execute_model_req.finished_requests_ids)) - execute_model_req.seq_group_metadata_list = ( - new_seq_group_metadata_list) - output = super()._execute_model_spmd(execute_model_req, - intermediate_tensors) - return output + kwargs = extract_previous_hidden_states(execute_model_req) - def add_lora(self, lora_request: LoRARequest) -> bool: - return self.model_runner.add_lora(lora_request) + if self.do_metadata_broadcast: + broadcast_data = worker_input.as_broadcastable_tensor_dict() + broadcast_data.update(model_input.as_broadcastable_tensor_dict()) + broadcast_data.update(kwargs) + broadcast_tensor_dict(broadcast_data, src=0) - def remove_lora(self, lora_id: int) -> bool: - return self.model_runner.remove_lora(lora_id) + if execute_model_req.async_callback: + model_input = dataclasses.replace( # type: ignore + model_input, + async_callback=execute_model_req.async_callback) - def pin_lora(self, lora_id: int) -> bool: - return self.model_runner.pin_lora(lora_id) + return model_input, worker_input, kwargs - def list_loras(self) -> Set[int]: - return self.model_runner.list_loras() + def prepare_input( + self, + execute_model_req: Optional[ExecuteModelRequest] = None + ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ + str, torch.Tensor]]]: + """ + Prepare the inputs to ModelRunner and workers. + """ + if self.is_driver_worker: + if execute_model_req is None: + if self.do_metadata_broadcast: + # This signals that there's no more requests to process for + # now. All workers are running infinite loop with + # broadcast_tensor_dict, and it stops the loop when the + # driver broadcasts an empty input. 
Send an empty input to + # notify all other workers to stop their execution loop. + broadcast_tensor_dict({}, src=0) + return None + return self._get_driver_input_and_broadcast(execute_model_req) + else: + return self._get_worker_input_from_broadcast() + + def execute_model( + self, + execute_model_req: Optional[ExecuteModelRequest] = None, + ) -> Optional[List[SamplerOutput]]: + """Executes at least one model step on the given sequences, unless no + sequences are provided.""" + start_time = time.perf_counter() + + inputs = self.prepare_input(execute_model_req) + if inputs is None: + return None + + model_input, worker_input, kwargs = inputs + num_steps = worker_input.num_steps + + self.execute_worker(worker_input) + + # If there is no input, we don't need to execute the model. + if worker_input.num_seq_groups == 0: + return [] + + intermediate_tensors = None + orig_model_execute_time = 0.0 + if not get_pp_group().is_first_rank: + intermediate_tensors = IntermediateTensors( + get_pp_group().recv_tensor_dict( + all_gather_group=get_tp_group())) + if (self.observability_config is not None + and self.observability_config.collect_model_execute_time): + orig_model_execute_time = intermediate_tensors.tensors.get( + "model_execute_time", torch.tensor(0)).item() + + output = self.model_runner.execute_model( + model_input=model_input, + kv_caches=self.kv_cache[worker_input.virtual_engine] + if self.kv_cache is not None else None, + intermediate_tensors=intermediate_tensors, + num_steps=num_steps, + **kwargs, + ) - def add_prompt_adapter( - self, prompt_adapter_request: PromptAdapterRequest) -> bool: - return self.model_runner.add_prompt_adapter(prompt_adapter_request) + model_execute_time = time.perf_counter() - start_time + if not get_pp_group().is_last_rank: + # output is IntermediateTensors + if (self.observability_config is not None + and self.observability_config.collect_model_execute_time): + output.tensors["model_execute_time"] = torch.tensor( + model_execute_time + 
orig_model_execute_time) + get_pp_group().send_tensor_dict(output.tensors, + all_gather_group=get_tp_group()) + return [None] + if (self.observability_config is not None + and self.observability_config.collect_model_execute_time + and output is not None): + for o in output: + o.model_execute_time = (orig_model_execute_time + + model_execute_time) + + # output is List[SamplerOutput] + return output - def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.model_runner.remove_lora(prompt_adapter_id) + def _execute_model_spmd( + self, + execute_model_req: ExecuteModelRequest, + intermediate_tensors: Optional[IntermediateTensors] = None + ) -> Optional[List[SamplerOutput]]: + """ + Execute model in Single Program Multiple Data (SPMD) fashion. + All workers take the same request, prepare the input and + execute the model. + """ + assert execute_model_req is not None, ( + "_execute_model_spmd() requires each worker to take in an " + "ExecuteModelRequest") + worker_input: WorkerInput = self.prepare_worker_input( + execute_model_req=execute_model_req) + model_input: ModelRunnerInputBase = ( + self.model_runner.prepare_model_input( + execute_model_req.seq_group_metadata_list)) + + self.execute_worker(worker_input) + + # If there is no input, we don't need to execute the model. + if worker_input.num_seq_groups == 0: + return [] + + kwargs = extract_previous_hidden_states(execute_model_req) + + return self.model_runner.execute_model( + model_input=model_input, + kv_caches=self.kv_cache[worker_input.virtual_engine] + if self.kv_cache is not None else None, + intermediate_tensors=intermediate_tensors, + **kwargs, + ) - def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: - return self.model_runner.pin_prompt_adapter(prompt_adapter_id) - def list_prompt_adapters(self) -> Set[int]: - return self.model_runner.list_prompt_adapters() +class WorkerWrapperBase: + """ + The whole point of this class is to lazily initialize the worker. 
+ We first instantiate the WorkerWrapper, which remembers the worker module + and class name. Then, when we call `update_environment_variables`, and the + real initialization happens in `init_worker`. + + If worker_class_fn is specified, it will be executed to get the worker + class. + Otherwise, the worker class will be obtained by dynamically importing it + using worker_module_name and worker_class_name. + """ - @property - def max_model_len(self) -> int: - return self.model_config.max_model_len + def __init__( + self, + worker_module_name: str, + worker_class_name: str, + trust_remote_code: bool = False, + worker_class_fn: Optional[Callable[[], + Type[WorkerBase]]] = None) -> None: + self.worker_module_name = worker_module_name + self.worker_class_name = worker_class_name + self.worker_class_fn = worker_class_fn + self.worker: Optional[WorkerBase] = None + if trust_remote_code: + # note: lazy import to avoid importing torch before initializing + from vllm.utils import init_cached_hf_modules + init_cached_hf_modules() - @property - def vocab_size(self) -> int: - return self.model_runner.vocab_size + @staticmethod + def update_environment_variables(envs: Dict[str, str]) -> None: + key = 'CUDA_VISIBLE_DEVICES' + if key in envs and key in os.environ: + # overwriting CUDA_VISIBLE_DEVICES is desired behavior + # suppress the warning in `update_environment_variables` + del os.environ[key] + update_environment_variables(envs) - def get_cache_block_size_bytes(self) -> int: - """Get the size of the KV cache block size in bytes. + def init_worker(self, *args, **kwargs): + """ + Here we inject some common logic before initializing the worker. + Arguments are passed to the worker class constructor. 
""" - return CacheEngine.get_cache_block_size(self.cache_config, - self.model_config, - self.parallel_config) - - -def init_worker_distributed_environment( - parallel_config: ParallelConfig, - rank: int, - distributed_init_method: Optional[str] = None, - local_rank: int = -1, -) -> None: - """Initialize the distributed environment.""" - set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) - - init_distributed_environment(parallel_config.world_size, rank, - distributed_init_method, local_rank) - - ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) - - -def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): - # Check if the GPU supports the dtype. - if torch_dtype == torch.bfloat16: # noqa: SIM102 - if not current_platform.has_device_capability(80): - capability = current_platform.get_device_capability() - gpu_name = current_platform.get_device_name() - - if capability is None: - compute_str = "does not have a compute capability" - else: - version_str = capability.as_version_str() - compute_str = f"has compute capability {version_str}" - - raise ValueError( - "Bfloat16 is only supported on GPUs with compute capability " - f"of at least 8.0. Your {gpu_name} GPU {compute_str}. " - "You can use float16 instead by explicitly setting the" - "`dtype` flag in CLI, for example: --dtype=half.") - - -def raise_if_cache_size_invalid(num_gpu_blocks, block_size, - max_model_len) -> None: - if num_gpu_blocks <= 0: - raise ValueError("No available memory for the cache blocks. " - "Try increasing `gpu_memory_utilization` when " - "initializing the engine.") - max_seq_len = block_size * num_gpu_blocks - if max_model_len > max_seq_len: - raise ValueError( - f"The model's max seq len ({max_model_len}) " - "is larger than the maximum number of tokens that can be " - f"stored in KV cache ({max_seq_len}). 
Try increasing " - "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") + enable_trace_function_call_for_thread() + + # see https://github.com/NVIDIA/nccl/issues/1234 + os.environ['NCCL_CUMEM_ENABLE'] = '0' + + from vllm.plugins import load_general_plugins + load_general_plugins() + + if self.worker_class_fn: + worker_class = self.worker_class_fn() + else: + mod = importlib.import_module(self.worker_module_name) + worker_class = getattr(mod, self.worker_class_name) + + self.worker = worker_class(*args, **kwargs) + assert self.worker is not None + + def execute_method(self, method, *args, **kwargs): + try: + target = self if self.worker is None else self.worker + executor = getattr(target, method) + return executor(*args, **kwargs) + except Exception as e: + # if the driver worker also execute methods, + # exceptions in the rest worker may cause deadlock in rpc like ray + # see https://github.com/vllm-project/vllm/issues/3455 + # print the error and inform the user to solve the error + msg = (f"Error executing method {method}. " + "This might cause deadlock in distributed execution.") + logger.exception(msg) + raise e + + +def extract_previous_hidden_states( + data: Union[ExecuteModelRequest, Dict[str, torch.Tensor]]) -> \ + Dict[str, torch.Tensor]: + """If data contains previous_hidden_states, extract it. This returns a dict + which can be used directly as additional kwargs in any following + execute_model calls. This is used in draft models like EAGLE.""" + output = {} + + # When called from non-driver worker, data is dict but when called from + # driver worker, data is ExecuteModelRequest. 
+ if isinstance(data, dict): + if "previous_hidden_states" in data: + output["previous_hidden_states"] = data["previous_hidden_states"] + elif data.previous_hidden_states is not None: + output["previous_hidden_states"] = data.previous_hidden_states\ + .hidden_states + + return output \ No newline at end of file From e787e42debbef0947011ba4b61620b5ad29e1735 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 19 Sep 2024 00:54:37 +0000 Subject: [PATCH 241/278] revert worker to vllm main --- vllm/worker/worker.py | 869 +++++++++++++++++++------------------ vllm/worker/worker_base.py | 110 +---- 2 files changed, 445 insertions(+), 534 deletions(-) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index abc6f98b5f30..a12e026b2bdf 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -1,485 +1,488 @@ -import dataclasses -import importlib +"""A GPU worker class.""" +import gc import os -import time -from abc import ABC, abstractmethod -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union +from typing import Dict, List, Optional, Set, Tuple, Type, Union import torch - -from vllm.config import ObservabilityConfig -from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group +import torch.distributed + +import vllm.envs as envs +from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig, + SpeculativeConfig) +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment, + set_custom_all_reduce) from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.model_executor import set_random_seed from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.platforms import current_platform -from vllm.sequence import ExecuteModelRequest, IntermediateTensors -from 
vllm.utils import (enable_trace_function_call_for_thread, - update_environment_variables) -from vllm.worker.model_runner_base import (BroadcastableModelInput, - ModelRunnerBase, - ModelRunnerInputBase) +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, + SequenceGroupMetadata, SequenceGroupMetadataDelta) +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.embedding_model_runner import EmbeddingModelRunner +from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner +from vllm.worker.model_runner import GPUModelRunnerBase, ModelRunner +from vllm.worker.worker_base import LocalOrDistributedWorkerBase, WorkerInput logger = init_logger(__name__) -class WorkerBase(ABC): - """Worker interface that allows vLLM to cleanly separate implementations for - different hardware. Also abstracts control plane communication, e.g., to - communicate request metadata to other workers. - """ - - @abstractmethod - def init_device(self) -> None: - """Initialize device state, such as loading the model or other on-device - memory allocations. - """ - raise NotImplementedError - - @abstractmethod - def determine_num_available_blocks(self) -> Tuple[int, int]: - """Determine the number of available blocks for the GPU KV cache and - swappable CPU KV cache. - - The implementation may run profiling or other heuristics to determine - the size of caches. - - Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks - are blocks that are "active" on the device and can be appended to. - num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be - appended to. - """ - raise NotImplementedError - - @abstractmethod - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """Initialize the KV cache with the given size in blocks. 
- """ - raise NotImplementedError - - @current_platform.inference_mode() - def start_worker_execution_loop(self) -> None: - """Execute model loop in parallel worker. +class Worker(LocalOrDistributedWorkerBase): + """A worker class that executes (a partition of) the model on a GPU. - You can stop the loop by executing a driver worker with an empty output. - See `stop_remote_worker_execution_loop` for more details. - """ - while True: - output = self.execute_model(execute_model_req=None) - if output is None: - return None + Each worker is associated with a single GPU. The worker is responsible for + maintaining the KV cache and executing the model on the GPU. In case of + distributed inference, each worker is assigned a partition of the model. + """ - @abstractmethod - def execute_model( + def __init__( self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[List[SamplerOutput]]: - raise NotImplementedError - - @abstractmethod - def get_cache_block_size_bytes(self) -> int: - """Return the size of a single cache block, in bytes. Used in - speculative decoding. 
- """ - raise NotImplementedError - - @abstractmethod - def add_lora(self, lora_request: LoRARequest) -> bool: - raise NotImplementedError - - @abstractmethod - def remove_lora(self, lora_id: int) -> bool: - raise NotImplementedError - - @abstractmethod - def pin_lora(self, lora_id: int) -> bool: - raise NotImplementedError - - @abstractmethod - def list_loras(self) -> Set[int]: - raise NotImplementedError - + model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig, + device_config: DeviceConfig, + cache_config: CacheConfig, + load_config: LoadConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + lora_config: Optional[LoRAConfig] = None, + speculative_config: Optional[SpeculativeConfig] = None, + prompt_adapter_config: Optional[PromptAdapterConfig] = None, + is_driver_worker: bool = False, + model_runner_cls: Optional[Type[GPUModelRunnerBase]] = None, + observability_config: Optional[ObservabilityConfig] = None, + ) -> None: + self.model_config = model_config + self.parallel_config = parallel_config + self.parallel_config.rank = rank + self.scheduler_config = scheduler_config + self.device_config = device_config + self.cache_config = cache_config + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.lora_config = lora_config + self.load_config = load_config + self.prompt_adapter_config = prompt_adapter_config + self.is_driver_worker = is_driver_worker + if parallel_config and is_driver_worker: + assert rank % parallel_config.tensor_parallel_size == 0, \ + "Driver worker should be rank 0 of tensor parallel group." 
+ if self.model_config.trust_remote_code: + # note: lazy import to avoid importing torch before initializing + from vllm.utils import init_cached_hf_modules + init_cached_hf_modules() + self.observability_config = observability_config + + # Return hidden states from target model if the draft model is an + # mlp_speculator + speculative_args = {} if speculative_config is None \ + or (speculative_config.draft_model_config.model == + model_config.model) \ + or (speculative_config.draft_model_config.hf_config.model_type + not in ["medusa", "mlp_speculator", "eagle"]) \ + else {"return_hidden_states": True} + + ModelRunnerClass: Type[GPUModelRunnerBase] = ModelRunner + if model_runner_cls is not None: + ModelRunnerClass = model_runner_cls + elif self._is_embedding_model(): + ModelRunnerClass = EmbeddingModelRunner + elif self._is_encoder_decoder_model(): + ModelRunnerClass = EncoderDecoderModelRunner + self.model_runner: GPUModelRunnerBase = ModelRunnerClass( + model_config, + parallel_config, + scheduler_config, + device_config, + cache_config, + load_config=load_config, + lora_config=self.lora_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker, + prompt_adapter_config=prompt_adapter_config, + observability_config=observability_config, + **speculative_args, + ) + # Uninitialized cache engine. Will be initialized by + # initialize_cache. + self.cache_engine: List[CacheEngine] + # Initialize gpu_cache as embedding models don't initialize kv_caches + self.gpu_cache: Optional[List[List[torch.Tensor]]] = None + self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} + + # Torch profiler. Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + if envs.VLLM_TORCH_PROFILER_DIR: + torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + logger.info("Profiling enabled. 
Traces will be saved to: %s", + torch_profiler_trace_dir) + self.profiler = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + with_stack=True, + on_trace_ready=torch.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir, use_gzip=True)) + else: + self.profiler = None -class LoraNotSupportedWorkerBase(WorkerBase): - """Partial implementation of WorkerBase that raises exceptions when LoRA - methods are invoked. - """ + def start_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.start() - def add_lora(self, lora_request: LoRARequest) -> bool: - raise ValueError(f"{type(self)} does not support LoRA") + def stop_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.stop() - def remove_lora(self, lora_id: int) -> bool: - raise ValueError(f"{type(self)} does not support LoRA") + def _is_encoder_decoder_model(self): + return self.model_config.is_encoder_decoder_model - def pin_lora(self, lora_id: int) -> bool: - return ValueError( - f"{type(self)} does not support LoRA") # type: ignore + def _is_embedding_model(self): + return self.model_config.is_embedding_model - def list_loras(self) -> Set[int]: - raise ValueError(f"{type(self)} does not support LoRA") + def init_device(self) -> None: + if self.device_config.device.type == "cuda": + # torch.distributed.all_reduce does not free the input tensor until + # the synchronization point. This causes the memory usage to grow + # as the number of all_reduce calls increases. This env var disables + # this behavior. + # Related issue: + # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573 + os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" + + # This env var set by Ray causes exceptions with graph building. 
+ os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) + self.device = torch.device(f"cuda:{self.local_rank}") + torch.cuda.set_device(self.device) + + _check_if_gpu_supports_dtype(self.model_config.dtype) + gc.collect() + torch.cuda.empty_cache() + self.init_gpu_memory = torch.cuda.mem_get_info()[0] + else: + raise RuntimeError( + f"Not support device type: {self.device_config.device}") + # Initialize the distributed environment. + init_worker_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method, + self.local_rank) + # Set random seed. + set_random_seed(self.model_config.seed) + + def load_model(self): + self.model_runner.load_model() + + def save_sharded_state( + self, + path: str, + pattern: Optional[str] = None, + max_size: Optional[int] = None, + ) -> None: + self.model_runner.save_sharded_state( + path, + pattern=pattern, + max_size=max_size, + ) + def save_tensorized_model( + self, + tensorizer_config: TensorizerConfig, + ) -> None: + self.model_runner.save_tensorized_model( + tensorizer_config=tensorizer_config, ) -@dataclasses.dataclass(frozen=True) -class WorkerInput: - """Local inputs to each worker. May contain device-specific data. These - fields should be broadcastable to other workers. - """ + @torch.inference_mode() + def determine_num_available_blocks(self) -> Tuple[int, int]: + """Profiles the peak memory usage of the model to determine how many + KV blocks may be allocated without OOMs. - num_seq_groups: Optional[int] = None - blocks_to_swap_in: Optional[torch.Tensor] = None - blocks_to_swap_out: Optional[torch.Tensor] = None - blocks_to_copy: Optional[torch.Tensor] = None - virtual_engine: int = 0 - num_steps: int = 1 - - @classmethod - def from_broadcasted_tensor_dict( - cls: Type["WorkerInput"], - tensor_dict: Dict[str, Any], - ) -> "WorkerInput": - """ - Pop fields from the given tensor_dict and populate a new instance of - WorkerInput. 
- """ - return cls( - num_seq_groups=tensor_dict.pop("num_seq_groups"), - blocks_to_swap_in=tensor_dict.pop("blocks_to_swap_in"), - blocks_to_swap_out=tensor_dict.pop("blocks_to_swap_out"), - blocks_to_copy=tensor_dict.pop("blocks_to_copy"), - virtual_engine=tensor_dict["virtual_engine"], - num_steps=tensor_dict.pop("num_steps"), - ) + The engine will first conduct a profiling of the existing memory usage. + Then, it calculate the maximum possible number of GPU and CPU blocks + that can be allocated with the remaining free memory. - def as_broadcastable_tensor_dict( - self) -> Dict[str, Union[int, torch.Tensor]]: - """ - Extract broadcastable fields. + .. tip:: + You may limit the usage of GPU memory + by adjusting the `gpu_memory_utilization` parameter. """ - tensor_dict = { - "num_seq_groups": self.num_seq_groups, - "blocks_to_swap_in": self.blocks_to_swap_in, - "blocks_to_swap_out": self.blocks_to_swap_out, - "blocks_to_copy": self.blocks_to_copy, - "virtual_engine": self.virtual_engine, - "num_steps": self.num_steps, - } - - return tensor_dict + # Profile the memory usage of the model and get the maximum number of + # cache blocks that can be allocated with the remaining free memory. + torch.cuda.empty_cache() + + # Execute a forward pass with dummy inputs to profile the memory usage + # of the model. + self.model_runner.profile_run() + + # Calculate the number of blocks that can be allocated with the + # profiled peak memory. + torch.cuda.synchronize() + free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info() + # NOTE(woosuk): Here we assume that the other processes using the same + # GPU did not change their memory usage during the profiling. + peak_memory = self.init_gpu_memory - free_gpu_memory + assert peak_memory > 0, ( + "Error in memory profiling. " + f"Initial free memory {self.init_gpu_memory}, current free memory" + f" {free_gpu_memory}. 
This happens when the GPU memory was " + "not properly cleaned up before initializing the vLLM instance.") + + cache_block_size = self.get_cache_block_size_bytes() + num_gpu_blocks = int( + (total_gpu_memory * self.cache_config.gpu_memory_utilization - + peak_memory) // cache_block_size) + num_cpu_blocks = int(self.cache_config.swap_space_bytes // + cache_block_size) + num_gpu_blocks = max(num_gpu_blocks, 0) + num_cpu_blocks = max(num_cpu_blocks, 0) + if self.model_runner.lora_manager: + self.model_runner.remove_all_loras() + gc.collect() + torch.cuda.empty_cache() + return num_gpu_blocks, num_cpu_blocks + def initialize_cache(self, num_gpu_blocks: int, + num_cpu_blocks: int) -> None: + """Allocate GPU and CPU KV cache with the specified number of blocks. -class LocalOrDistributedWorkerBase(WorkerBase): - """ - Partial implementation of WorkerBase that has a default `execute_model` - definition to perform metadata transfer between workers when in distributed - mode. Subclasses of this interface should use model runners that inherit - from ModelRunnerBase, and should only need to implement worker-local logic. - If custom control plane logic is needed to transfer metadata, or if the - model runner cannot inherit from ModelRunnerBase, use WorkerBase instead. - """ - is_driver_worker: bool - model_runner: ModelRunnerBase - observability_config: Optional[ObservabilityConfig] = None + This also warms up the model, which may record CUDA graphs. 
+ """ + raise_if_cache_size_invalid(num_gpu_blocks, + self.cache_config.block_size, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + self._init_cache_engine() + self._warm_up_model() + + def _init_cache_engine(self): + assert self.cache_config.num_gpu_blocks is not None + self.cache_engine = [ + CacheEngine(self.cache_config, self.model_config, + self.parallel_config, self.device_config) + for _ in range(self.parallel_config.pipeline_parallel_size) + ] + self.gpu_cache = [ + self.cache_engine[ve].gpu_cache + for ve in range(self.parallel_config.pipeline_parallel_size) + ] + + def _warm_up_model(self) -> None: + if not self.model_config.enforce_eager: + self.model_runner.capture_model(self.gpu_cache) + # Reset the seed to ensure that the random state is not affected by + # the model initialization and profiling. + set_random_seed(self.model_config.seed) @property - @abstractmethod def do_metadata_broadcast(self) -> bool: - """ - Used by the default `execute_model` to check whether broadcast is - needed to transfer request inputs from the driver worker to other - workers in the TP group. If WorkerBase subclass only supports - single-worker execution, then this method should return False. - """ - raise NotImplementedError + return self.parallel_config.tensor_parallel_size > 1 @property - @abstractmethod def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: - """ - Gets the list of kv caches to pass to the worker's model runner. Each - element in the list is a kv cache corresponding to a particular virtual - engine (PP stream). Used by the default `execute_model`. If the worker's - model runner does not follow the ModelRunnerBase interface, then inherit - from WorkerBase instead. 
- """ - raise NotImplementedError + return self.gpu_cache - @abstractmethod + @torch.inference_mode() def prepare_worker_input( self, execute_model_req: ExecuteModelRequest) -> WorkerInput: - """ - Prepare the inputs to WorkerBase.execute_worker from an execution - request. This method may move data to the worker's local device. It is - not allowed to communicate with other workers or devices. - """ - raise NotImplementedError + virtual_engine = execute_model_req.virtual_engine + num_steps = execute_model_req.num_steps + num_seq_groups = len(execute_model_req.seq_group_metadata_list) + # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. + # they contain parameters to launch cudamemcpyasync. + blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, + device="cpu", + dtype=torch.int64).view(-1, 2) + blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, + device="cpu", + dtype=torch.int64).view(-1, 2) + # `blocks_to_copy` is a gpu tensor. The src and tgt of + # blocks to copy are in the same device, and `blocks_to_copy` + # can be used directly within cuda kernels. + blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, + device=self.device, + dtype=torch.int64).view(-1, 2) + + return WorkerInput( + num_seq_groups=num_seq_groups, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + virtual_engine=virtual_engine, + num_steps=num_steps, + ) - @abstractmethod + @torch.inference_mode() def execute_worker(self, worker_input: WorkerInput) -> None: + virtual_engine = worker_input.virtual_engine + # Issue cache operations. 
+ if (worker_input.blocks_to_swap_in is not None + and worker_input.blocks_to_swap_in.numel() > 0): + self.cache_engine[virtual_engine].swap_in( + worker_input.blocks_to_swap_in) + if (worker_input.blocks_to_swap_out is not None + and worker_input.blocks_to_swap_out.numel() > 0): + self.cache_engine[virtual_engine].swap_out( + worker_input.blocks_to_swap_out) + if (worker_input.blocks_to_copy is not None + and worker_input.blocks_to_copy.numel() > 0): + self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) + + def _get_cached_seq_group_metadata( + self, + seq_group_metadata_list: List[Union[SequenceGroupMetadata, + SequenceGroupMetadataDelta]], + finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: + """Return a list of cached Sequence Group Metadata after updating its + state. + + It is used because scheduler only sends delta to workers to reduce + the data payload size. The function also cleans up cache based on + a given `finished_request_ids`. """ - Process an execution request. - """ - raise NotImplementedError - - def _get_worker_input_from_broadcast( - self - ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ - str, torch.Tensor]]]: - """ Get the worker input from the broadcasted tensor dict. """ - assert self.do_metadata_broadcast - assert not self.is_driver_worker - broadcast_data = broadcast_tensor_dict(src=0) - if not broadcast_data: - return None - - worker_input = WorkerInput.from_broadcasted_tensor_dict(broadcast_data) - model_input = ( - self.model_runner.make_model_input_from_broadcasted_tensor_dict( - broadcast_data)) - - kwargs = extract_previous_hidden_states(broadcast_data) - - return model_input, worker_input, kwargs - - def _get_driver_input_and_broadcast( - self, execute_model_req: ExecuteModelRequest - ) -> Tuple[BroadcastableModelInput, WorkerInput, Dict[str, torch.Tensor]]: - """ Get the driver input and broadcast it to other workers. 
""" - assert self.is_driver_worker - - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: ModelRunnerInputBase = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list, - execute_model_req.virtual_engine, - execute_model_req.finished_requests_ids)) - - kwargs = extract_previous_hidden_states(execute_model_req) + new_seq_group_metadata_list = [] + for metadata_or_delta in seq_group_metadata_list: + request_id = metadata_or_delta.request_id + if request_id not in self._seq_group_metadata_cache: + # The first prefill. + assert isinstance(metadata_or_delta, SequenceGroupMetadata) + self._seq_group_metadata_cache[request_id] = metadata_or_delta + else: + # The first prefill is already cached. + if isinstance(metadata_or_delta, SequenceGroupMetadataDelta): + self._seq_group_metadata_cache[request_id].apply_delta( + metadata_or_delta) + else: + # If metadata snapshot is sent again, it is + # preempted. Reset the cache because we need to start + # from scratch. 
+ assert isinstance(metadata_or_delta, SequenceGroupMetadata) + self._seq_group_metadata_cache[ + request_id] = metadata_or_delta + + new_seq_group_metadata_list.append( + self._seq_group_metadata_cache[request_id]) + + # Clean up finished ids + for finished_id in finished_request_ids: + del self._seq_group_metadata_cache[finished_id] + + return new_seq_group_metadata_list - if self.do_metadata_broadcast: - broadcast_data = worker_input.as_broadcastable_tensor_dict() - broadcast_data.update(model_input.as_broadcastable_tensor_dict()) - broadcast_data.update(kwargs) - broadcast_tensor_dict(broadcast_data, src=0) - - if execute_model_req.async_callback: - model_input = dataclasses.replace( # type: ignore - model_input, - async_callback=execute_model_req.async_callback) - - return model_input, worker_input, kwargs - - def prepare_input( - self, - execute_model_req: Optional[ExecuteModelRequest] = None - ) -> Optional[Tuple[BroadcastableModelInput, WorkerInput, Dict[ - str, torch.Tensor]]]: - """ - Prepare the inputs to ModelRunner and workers. - """ - if self.is_driver_worker: - if execute_model_req is None: - if self.do_metadata_broadcast: - # This signals that there's no more requests to process for - # now. All workers are running infinite loop with - # broadcast_tensor_dict, and it stops the loop when the - # driver broadcasts an empty input. Send an empty input to - # notify all other workers to stop their execution loop. 
- broadcast_tensor_dict({}, src=0) - return None - return self._get_driver_input_and_broadcast(execute_model_req) - else: - return self._get_worker_input_from_broadcast() - - def execute_model( + def _execute_model_spmd( self, - execute_model_req: Optional[ExecuteModelRequest] = None, + execute_model_req: ExecuteModelRequest, + intermediate_tensors: Optional[IntermediateTensors] = None, ) -> Optional[List[SamplerOutput]]: - """Executes at least one model step on the given sequences, unless no - sequences are provided.""" - start_time = time.perf_counter() - - inputs = self.prepare_input(execute_model_req) - if inputs is None: - return None - - model_input, worker_input, kwargs = inputs - num_steps = worker_input.num_steps - - self.execute_worker(worker_input) - - # If there is no input, we don't need to execute the model. - if worker_input.num_seq_groups == 0: - return [] - - intermediate_tensors = None - orig_model_execute_time = 0.0 - if not get_pp_group().is_first_rank: - intermediate_tensors = IntermediateTensors( - get_pp_group().recv_tensor_dict( - all_gather_group=get_tp_group())) - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time): - orig_model_execute_time = intermediate_tensors.tensors.get( - "model_execute_time", torch.tensor(0)).item() - - output = self.model_runner.execute_model( - model_input=model_input, - kv_caches=self.kv_cache[worker_input.virtual_engine] - if self.kv_cache is not None else None, - intermediate_tensors=intermediate_tensors, - num_steps=num_steps, - **kwargs, - ) + if execute_model_req is not None: + new_seq_group_metadata_list = self._get_cached_seq_group_metadata( + execute_model_req.seq_group_metadata_list, + execute_model_req.finished_requests_ids) - model_execute_time = time.perf_counter() - start_time - if not get_pp_group().is_last_rank: - # output is IntermediateTensors - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time): 
- output.tensors["model_execute_time"] = torch.tensor( - model_execute_time + orig_model_execute_time) - get_pp_group().send_tensor_dict(output.tensors, - all_gather_group=get_tp_group()) - return [None] - if (self.observability_config is not None - and self.observability_config.collect_model_execute_time - and output is not None): - for o in output: - o.model_execute_time = (orig_model_execute_time + - model_execute_time) - - # output is List[SamplerOutput] + execute_model_req.seq_group_metadata_list = ( + new_seq_group_metadata_list) + output = super()._execute_model_spmd(execute_model_req, + intermediate_tensors) return output - def _execute_model_spmd( - self, - execute_model_req: ExecuteModelRequest, - intermediate_tensors: Optional[IntermediateTensors] = None - ) -> Optional[List[SamplerOutput]]: - """ - Execute model in Single Program Multiple Data (SPMD) fashion. - All workers take the same request, prepare the input and - execute the model. - """ - assert execute_model_req is not None, ( - "_execute_model_spmd() requires each worker to take in an " - "ExecuteModelRequest") - worker_input: WorkerInput = self.prepare_worker_input( - execute_model_req=execute_model_req) - model_input: ModelRunnerInputBase = ( - self.model_runner.prepare_model_input( - execute_model_req.seq_group_metadata_list)) - - self.execute_worker(worker_input) - - # If there is no input, we don't need to execute the model. 
- if worker_input.num_seq_groups == 0: - return [] - - kwargs = extract_previous_hidden_states(execute_model_req) - - return self.model_runner.execute_model( - model_input=model_input, - kv_caches=self.kv_cache[worker_input.virtual_engine] - if self.kv_cache is not None else None, - intermediate_tensors=intermediate_tensors, - **kwargs, - ) + def add_lora(self, lora_request: LoRARequest) -> bool: + return self.model_runner.add_lora(lora_request) + def remove_lora(self, lora_id: int) -> bool: + return self.model_runner.remove_lora(lora_id) -class WorkerWrapperBase: - """ - The whole point of this class is to lazily initialize the worker. - We first instantiate the WorkerWrapper, which remembers the worker module - and class name. Then, when we call `update_environment_variables`, and the - real initialization happens in `init_worker`. - - If worker_class_fn is specified, it will be executed to get the worker - class. - Otherwise, the worker class will be obtained by dynamically importing it - using worker_module_name and worker_class_name. 
- """ + def pin_lora(self, lora_id: int) -> bool: + return self.model_runner.pin_lora(lora_id) - def __init__( - self, - worker_module_name: str, - worker_class_name: str, - trust_remote_code: bool = False, - worker_class_fn: Optional[Callable[[], - Type[WorkerBase]]] = None) -> None: - self.worker_module_name = worker_module_name - self.worker_class_name = worker_class_name - self.worker_class_fn = worker_class_fn - self.worker: Optional[WorkerBase] = None - if trust_remote_code: - # note: lazy import to avoid importing torch before initializing - from vllm.utils import init_cached_hf_modules - init_cached_hf_modules() + def list_loras(self) -> Set[int]: + return self.model_runner.list_loras() - @staticmethod - def update_environment_variables(envs: Dict[str, str]) -> None: - key = 'CUDA_VISIBLE_DEVICES' - if key in envs and key in os.environ: - # overwriting CUDA_VISIBLE_DEVICES is desired behavior - # suppress the warning in `update_environment_variables` - del os.environ[key] - update_environment_variables(envs) + def add_prompt_adapter( + self, prompt_adapter_request: PromptAdapterRequest) -> bool: + return self.model_runner.add_prompt_adapter(prompt_adapter_request) - def init_worker(self, *args, **kwargs): - """ - Here we inject some common logic before initializing the worker. - Arguments are passed to the worker class constructor. 
- """ - enable_trace_function_call_for_thread() + def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: + return self.model_runner.remove_lora(prompt_adapter_id) - # see https://github.com/NVIDIA/nccl/issues/1234 - os.environ['NCCL_CUMEM_ENABLE'] = '0' + def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: + return self.model_runner.pin_prompt_adapter(prompt_adapter_id) - from vllm.plugins import load_general_plugins - load_general_plugins() + def list_prompt_adapters(self) -> Set[int]: + return self.model_runner.list_prompt_adapters() - if self.worker_class_fn: - worker_class = self.worker_class_fn() - else: - mod = importlib.import_module(self.worker_module_name) - worker_class = getattr(mod, self.worker_class_name) - - self.worker = worker_class(*args, **kwargs) - assert self.worker is not None - - def execute_method(self, method, *args, **kwargs): - try: - target = self if self.worker is None else self.worker - executor = getattr(target, method) - return executor(*args, **kwargs) - except Exception as e: - # if the driver worker also execute methods, - # exceptions in the rest worker may cause deadlock in rpc like ray - # see https://github.com/vllm-project/vllm/issues/3455 - # print the error and inform the user to solve the error - msg = (f"Error executing method {method}. " - "This might cause deadlock in distributed execution.") - logger.exception(msg) - raise e - - -def extract_previous_hidden_states( - data: Union[ExecuteModelRequest, Dict[str, torch.Tensor]]) -> \ - Dict[str, torch.Tensor]: - """If data contains previous_hidden_states, extract it. This returns a dict - which can be used directly as additional kwargs in any following - execute_model calls. This is used in draft models like EAGLE.""" - output = {} - - # When called from non-driver worker, data is dict but when called from - # driver worker, data is ExecuteModelRequest. 
- if isinstance(data, dict): - if "previous_hidden_states" in data: - output["previous_hidden_states"] = data["previous_hidden_states"] - elif data.previous_hidden_states is not None: - output["previous_hidden_states"] = data.previous_hidden_states\ - .hidden_states - - return output \ No newline at end of file + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + def get_cache_block_size_bytes(self) -> int: + """Get the size of the KV cache block size in bytes. + """ + return CacheEngine.get_cache_block_size(self.cache_config, + self.model_config, + self.parallel_config) + + +def init_worker_distributed_environment( + parallel_config: ParallelConfig, + rank: int, + distributed_init_method: Optional[str] = None, + local_rank: int = -1, +) -> None: + """Initialize the distributed environment.""" + set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) + + init_distributed_environment(parallel_config.world_size, rank, + distributed_init_method, local_rank) + + ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, + parallel_config.pipeline_parallel_size) + + +def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype): + # Check if the GPU supports the dtype. + if torch_dtype == torch.bfloat16: # noqa: SIM102 + if not current_platform.has_device_capability(80): + capability = current_platform.get_device_capability() + gpu_name = current_platform.get_device_name() + + if capability is None: + compute_str = "does not have a compute capability" + else: + version_str = capability.as_version_str() + compute_str = f"has compute capability {version_str}" + + raise ValueError( + "Bfloat16 is only supported on GPUs with compute capability " + f"of at least 8.0. Your {gpu_name} GPU {compute_str}. 
" + "You can use float16 instead by explicitly setting the" + "`dtype` flag in CLI, for example: --dtype=half.") + + +def raise_if_cache_size_invalid(num_gpu_blocks, block_size, + max_model_len) -> None: + if num_gpu_blocks <= 0: + raise ValueError("No available memory for the cache blocks. " + "Try increasing `gpu_memory_utilization` when " + "initializing the engine.") + max_seq_len = block_size * num_gpu_blocks + if max_model_len > max_seq_len: + raise ValueError( + f"The model's max seq len ({max_model_len}) " + "is larger than the maximum number of tokens that can be " + f"stored in KV cache ({max_seq_len}). Try increasing " + "`gpu_memory_utilization` or decreasing `max_model_len` when " + "initializing the engine.") \ No newline at end of file diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index ae2fb65cc455..abc6f98b5f30 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -7,8 +7,6 @@ import torch -import vllm.distributed.kv_transfer.vllm_adapter as dist_kv -import vllm.distributed.parallel_state as ps from vllm.config import ObservabilityConfig from vllm.distributed import broadcast_tensor_dict, get_pp_group, get_tp_group from vllm.logger import init_logger @@ -326,64 +324,14 @@ def execute_model( orig_model_execute_time = intermediate_tensors.tensors.get( "model_execute_time", torch.tensor(0)).item() - # for disaggregated prefilling: allow bypassing model execution - bypass_model_exec = False - - # receive KV cache from prefill instance, or from LMCache - if self.need_recv_kv(model_input, worker_input): - from vllm.worker.model_runner import GPUModelRunnerBase - assert isinstance(self.model_runner, GPUModelRunnerBase), \ - "Distributed KV transfer only support GPU modelrunner" - logger.debug("Receiving KV caches") - hidden_or_intermediate_states, bypass_model_exec, model_input = \ - ps.get_disagg_group().recv_kv_caches_and_hidden_states( - # model is used to know which layer the current worker - # is working 
on, so that we can receive KV for only those - # layers. - self.model_runner.model, - model_input, - kv_caches=self.kv_cache[worker_input.virtual_engine] - if self.kv_cache is not None else None, - ) - #assert bypass_model_exec - - if not bypass_model_exec: - hidden_or_intermediate_states = self.model_runner.execute_model( - model_input=model_input, - kv_caches=self.kv_cache[worker_input.virtual_engine] - if self.kv_cache is not None else None, - intermediate_tensors=intermediate_tensors, - num_steps=num_steps, - **kwargs, - ) - - # sending out KV cache - if self.need_send_kv(model_input, worker_input): - from vllm.worker.model_runner import GPUModelRunnerBase - assert isinstance(self.model_runner, GPUModelRunnerBase), \ - "Distributed KV transfer only support GPU modelrunner" - logger.debug("Sending KV caches") - ps.get_disagg_group().send_kv_caches_and_hidden_states( - # model is used to know which layer the current worker - # is working on, so that we can send KV for only those - # layers. 
- self.model_runner.model, - model_input, - self.kv_cache[worker_input.virtual_engine] - if self.kv_cache is not None else None, - hidden_or_intermediate_states, - ) - - # separating postprocessing steps out from execute_model - # so that disaggregated prefill can completely bypass model forwarding - from vllm.worker.model_runner import ModelRunner - if isinstance(self.model_runner, ModelRunner): - output = self.model_runner.postprocess_model( - model_input, - hidden_or_intermediate_states, - ) - else: - output = hidden_or_intermediate_states + output = self.model_runner.execute_model( + model_input=model_input, + kv_caches=self.kv_cache[worker_input.virtual_engine] + if self.kv_cache is not None else None, + intermediate_tensors=intermediate_tensors, + num_steps=num_steps, + **kwargs, + ) model_execute_time = time.perf_counter() - start_time if not get_pp_group().is_last_rank: @@ -405,46 +353,6 @@ def execute_model( # output is List[SamplerOutput] return output - def need_recv_kv(self, model_input, worker_input) -> bool: - - if self.kv_cache is None: - return False - - kv_caches = self.kv_cache[worker_input.virtual_engine] - prefill_meta = model_input.attn_metadata.prefill_metadata - - # check if the current run is profiling - is_profile_run = (kv_caches is None) or (kv_caches[0] is None) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None - # for disaggregated prefilling: allow bypassing model execution - - return all([ - is_prefill_run, dist_kv.IS_KV_DECODE_INSTANCE - or dist_kv.IS_LMCACHE_INSTANCE, not is_profile_run - ]) - - def need_send_kv(self, model_input, worker_input) -> bool: - - if self.kv_cache is None: - return False - - kv_caches = self.kv_cache[worker_input.virtual_engine] - prefill_meta = model_input.attn_metadata.prefill_metadata - from vllm.worker.model_runner import GPUModelRunnerBase - if not isinstance(self.model_runner, GPUModelRunnerBase): - return False - - # check if the current run is profiling - 
is_profile_run = (kv_caches is None) or (kv_caches[0] is None) - # check if the current run is prefill - is_prefill_run = prefill_meta is not None - - return all([ - is_prefill_run, dist_kv.IS_KV_PREFILL_INSTANCE - or dist_kv.IS_LMCACHE_INSTANCE, not is_profile_run - ]) - def _execute_model_spmd( self, execute_model_req: ExecuteModelRequest, @@ -574,4 +482,4 @@ def extract_previous_hidden_states( output["previous_hidden_states"] = data.previous_hidden_states\ .hidden_states - return output + return output \ No newline at end of file From 9874b42f5373188f6efeb498a088c50fdee22232 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 19 Sep 2024 00:57:20 +0000 Subject: [PATCH 242/278] bug fix --- vllm/worker/model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 78ffc40082fc..a6f91667fedf 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1720,7 +1720,7 @@ def recv_kv_needed(self, model_input, kv_caches) -> bool: is_prefill_run = prefill_meta is not None return all([ - dist_kv.IS_KV_DECODE_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE, + dist_kv.IS_KV_CONSUMER, not is_profile_run, is_prefill_run, ]) @@ -1741,7 +1741,7 @@ def send_kv_needed(self, model_input, kv_caches) -> bool: is_prefill_run = prefill_meta is not None return all([ - dist_kv.IS_KV_PREFILL_INSTANCE or dist_kv.IS_LMCACHE_INSTANCE, + dist_kv.IS_KV_PRODUCER, not is_profile_run, is_prefill_run ]) From 5950ad530ee03c1a8a00a71cad2970979c8d66d9 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 19 Sep 2024 03:17:49 +0000 Subject: [PATCH 243/278] fix typo: Distributerd -> Distributed --- examples/disagg_prefill/disagg_prefill_example.sh | 9 +++++---- vllm/distributed/kv_transfer/vllm_adapter.py | 8 ++++---- vllm/envs.py | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index 
56b6f44c7418..57cebb3775a0 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -5,6 +5,7 @@ export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') export VLLM_PORT=12345 +export VLLM_LOGGING_LEVEL=DEBUG # a function that waits vLLM server to start wait_for_server() { @@ -15,16 +16,16 @@ wait_for_server() { done" && return 0 || return 1 } -# prefilling instance -VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 python3 \ +# prefilling instance, which is the KV producer +VLLM_DISTRIBUTED_KV_ROLE=producer CUDA_VISIBLE_DEVICES=0 python3 \ -m vllm.entrypoints.openai.api_server \ --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8100 \ --max-model-len 10000 \ --gpu-memory-utilization 0.8 & -# decoding instance -VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ +# decoding instance, which is the KV consumer +VLLM_DISTRIBUTED_KV_ROLE=consumer CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8200 \ diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 17c2e52b1174..1a918ff0ae32 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -41,12 +41,12 @@ logger = init_logger(__name__) # check VLLM_DISTRIBUTERD_KV_ROLE and set corresponding flags -assert envs.VLLM_DISTRIBUTERD_KV_ROLE in [None, "producer", "consumer", "both"],\ +assert envs.VLLM_DISTRIBUTED_KV_ROLE in [None, "producer", "consumer", "both"],\ "VLLM_DISTRIBUTERD_KV_ROLE can only be producer, consumer or both." 
-IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISTRIBUTERD_KV_ROLE +IS_DISTRIBUTED_KV_INSTANCE: bool = (envs.VLLM_DISTRIBUTED_KV_ROLE in ["producer", "consumer", "both"]) -IS_KV_PRODUCER: bool = (envs.VLLM_DISTRIBUTERD_KV_ROLE in ["producer", "both"]) -IS_KV_CONSUMER: bool = (envs.VLLM_DISTRIBUTERD_KV_ROLE == ["consumer", "both"]) +IS_KV_PRODUCER: bool = (envs.VLLM_DISTRIBUTED_KV_ROLE in ["producer", "both"]) +IS_KV_CONSUMER: bool = (envs.VLLM_DISTRIBUTED_KV_ROLE in ["consumer", "both"]) # When the current instance is both KV producer and KV consumer, # it is likely connected to a KV storage service on CPU/disk diff --git a/vllm/envs.py b/vllm/envs.py index 407dd942cc9f..cc2c8a11e5af 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -368,8 +368,8 @@ def get_default_config_root(): # Specify the role of current vllm instance # Value can be "producer", "consumer" or "both". - "VLLM_DISTRIBUTERD_KV_ROLE": - lambda: os.getenv("VLLM_DISTRIBUTERD_KV_ROLE", None), + "VLLM_DISTRIBUTED_KV_ROLE": + lambda: os.getenv("VLLM_DISTRIBUTED_KV_ROLE", None), # If set, vllm will skip the deprecation warnings. 
"VLLM_NO_DEPRECATION_WARNING": From c116684042e205b789c6e259ef3dd3a6301806b0 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 19 Sep 2024 03:26:39 +0000 Subject: [PATCH 244/278] remove the debug flag in example -- user don't need it --- examples/disagg_prefill/disagg_prefill_example.sh | 9 ++++++++- vllm/distributed/parallel_state.py | 7 ++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/disagg_prefill/disagg_prefill_example.sh index 57cebb3775a0..efec87855dbe 100644 --- a/examples/disagg_prefill/disagg_prefill_example.sh +++ b/examples/disagg_prefill/disagg_prefill_example.sh @@ -5,7 +5,14 @@ export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') export VLLM_PORT=12345 -export VLLM_LOGGING_LEVEL=DEBUG + +# install quart first -- required for disagg prefill proxy serve +if python3 -c "import quart" &> /dev/null; then + echo "Quart is already installed." +else + echo "Quart is not installed. Installing..." + python3 -m pip install quart +fi # a function that waits vLLM server to start wait_for_server() { diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 54f7968908de..2c6f61fca8f4 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -986,14 +986,15 @@ def init_distributed_environment( logger.debug("Disaggregated prefill enabled.") if dist_kv.IS_KV_PRODUCER: # for prefill, the ranks are [0, world_size) + logger.debug("rank %d is KV producer.", rank) maybe_disagg_rank = rank else: # this is decode instance. 
# offset global rank by tp * pp (which is world_size) maybe_disagg_rank = rank + world_size - - logger.debug("Before: world size %d, rank %d", maybe_disagg_world_size, - maybe_disagg_rank) + logger.debug("rank %d is KV producer, adjust it to %d", + rank, + maybe_disagg_rank) torch.distributed.init_process_group( backend=backend, From 44e8875b83a016c77ae2f3643a5270be2875d1c4 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 19 Sep 2024 03:33:11 +0000 Subject: [PATCH 245/278] fix typo --- benchmarks/benchmark_serving.py | 8 +++---- .../disagg_overhead_benchmark.sh | 22 +++---------------- vllm/distributed/parallel_state.py | 2 +- 3 files changed, 8 insertions(+), 24 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index a2a049ed5671..f9d719a16008 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -136,9 +136,9 @@ def sample_sonnet_requests( prefix_len: int, tokenizer: PreTrainedTokenizerBase, ) -> List[Tuple[str, str, int, int, None]]: - assert input_len >= prefix_len, ( - "'args.sonnet-input-len' must be greater than or equal to " - "'args.prefix-input-len'.") + assert ( + input_len > prefix_len + ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'." # Load the dataset. 
with open(dataset_path) as f: @@ -963,4 +963,4 @@ def main(args: argparse.Namespace): ) args = parser.parse_args() - main(args) + main(args) \ No newline at end of file diff --git a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh index 36116172ab7c..dec00c2c9fe0 100644 --- a/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_overhead_benchmark.sh @@ -52,31 +52,15 @@ benchmark() { input_len=2048 output_len=$2 - # large model - # VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ - # -m vllm.entrypoints.openai.api_server \ - # --model $model \ - # --port 8100 \ - # -tp 4 \ - # --max-model-len 30000 \ - # --gpu-memory-utilization 0.8 & - # VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ - # -m vllm.entrypoints.openai.api_server \ - # --model $model \ - # --port 8200 \ - # -tp 4 \ - # --max-model-len 30000 \ - # --gpu-memory-utilization 0.8 & - - VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0 python3 \ + + VLLM_DISTRIBUTED_KV_ROLE=producer CUDA_VISIBLE_DEVICES=0 python3 \ -m vllm.entrypoints.openai.api_server \ --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8100 \ --max-model-len 10000 \ --gpu-memory-utilization 0.8 & -# decoding instance -VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=1 python3 \ + VLLM_DISTRIBUTED_KV_ROLE=consumer CUDA_VISIBLE_DEVICES=1 python3 \ -m vllm.entrypoints.openai.api_server \ --model meta-llama/Meta-Llama-3.1-8B-Instruct \ --port 8200 \ diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 2c6f61fca8f4..64faa5b81263 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -992,7 +992,7 @@ def init_distributed_environment( # this is decode instance. 
# offset global rank by tp * pp (which is world_size) maybe_disagg_rank = rank + world_size - logger.debug("rank %d is KV producer, adjust it to %d", + logger.debug("rank %d is KV consumer, adjust it to %d", rank, maybe_disagg_rank) From 181928fd72b6b297995913e3f7c67ddb79fab7fb Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 19 Sep 2024 03:41:41 +0000 Subject: [PATCH 246/278] fixing benchmark_serving.py --- benchmarks/benchmark_serving.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index f9d719a16008..a407a263120b 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -626,9 +626,9 @@ def main(args: argparse.Namespace): prefix_len=args.sonnet_prefix_len, tokenizer=tokenizer, ) - input_requests = [(prompt, prompt_len, output_len) + input_requests = [(prompt, prompt_len, output_len, None) for prompt, prompt_formatted, prompt_len, - output_len in input_requests] + output_len, _ in input_requests] else: assert ( tokenizer.chat_template or tokenizer.default_chat_template @@ -641,9 +641,9 @@ def main(args: argparse.Namespace): prefix_len=args.sonnet_prefix_len, tokenizer=tokenizer, ) - input_requests = [(prompt_formatted, prompt_len, output_len) + input_requests = [(prompt_formatted, prompt_len, output_len, None) for prompt, prompt_formatted, prompt_len, - output_len in input_requests] + output_len, _ in input_requests] elif args.dataset_name == "hf": input_requests = sample_hf_requests( From c17d18daeafb91726e73cb8be19e34c00a247fce Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 19 Sep 2024 04:03:19 +0000 Subject: [PATCH 247/278] fix the example --- .../disagg_benchmarks/disagg_performance_benchmark.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh index 1da5669dd1cd..0e6875363f4d 100644 --- 
a/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh +++ b/benchmarks/disagg_benchmarks/disagg_performance_benchmark.sh @@ -38,7 +38,7 @@ wait_for_server() { launch_chunked_prefill() { model="meta-llama/Meta-Llama-3.1-70B-Instruct" # disagg prefill - VLLM_RPC_PORT=5570 CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ -m vllm.entrypoints.openai.api_server \ --model $model \ --port 8100 \ @@ -48,7 +48,7 @@ launch_chunked_prefill() { --disable-log-requests \ --enable-chunked-prefill \ --gpu-memory-utilization 0.8 & - VLLM_RPC_PORT=5580 CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -m vllm.entrypoints.openai.api_server \ --model $model \ --port 8200 \ @@ -68,7 +68,7 @@ launch_chunked_prefill() { launch_disagg_prefill() { model="meta-llama/Meta-Llama-3.1-70B-Instruct" # disagg prefill - VLLM_PORT=12345 VLLM_RPC_PORT=5570 VLLM_DISAGG_PREFILL_ROLE=prefill CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ + VLLM_PORT=12345 VLLM_DISTRIBUTED_KV_ROLE=producer CUDA_VISIBLE_DEVICES=0,1,2,3 python3 \ -m vllm.entrypoints.openai.api_server \ --model $model \ --port 8100 \ @@ -77,7 +77,7 @@ launch_disagg_prefill() { --disable-log-stats \ --disable-log-requests \ --gpu-memory-utilization 0.8 & - VLLM_PORT=12345 VLLM_RPC_PORT=5580 VLLM_DISAGG_PREFILL_ROLE=decode CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ + VLLM_PORT=12345 VLLM_DISTRIBUTED_KV_ROLE=consumer CUDA_VISIBLE_DEVICES=4,5,6,7 python3 \ -m vllm.entrypoints.openai.api_server \ --model $model \ --port 8200 \ From 0b00876f064a148609150e79d75f1cfaed6c5ab6 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Thu, 19 Sep 2024 05:23:47 +0000 Subject: [PATCH 248/278] update build partial prefill input --- .../disagg_prefill_example.sh | 0 vllm/distributed/kv_transfer/vllm_adapter.py | 237 +++++++++--------- 2 files changed, 123 insertions(+), 114 deletions(-) rename examples/{disagg_prefill => distributed_kv}/disagg_prefill_example.sh (100%) diff --git 
a/examples/disagg_prefill/disagg_prefill_example.sh b/examples/distributed_kv/disagg_prefill_example.sh similarity index 100% rename from examples/disagg_prefill/disagg_prefill_example.sh rename to examples/distributed_kv/disagg_prefill_example.sh diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 1a918ff0ae32..9b9a035185ad 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -52,6 +52,7 @@ # it is likely connected to a KV storage service on CPU/disk # so the communication backend needs to be "gloo" for that case. DISTRIBUTED_BACKEND: str = "gloo" if (IS_KV_PRODUCER and IS_KV_CONSUMER) else "nccl" +# corresponding device DISTRIBUTED_DEVICE: str = "cpu" if (IS_KV_PRODUCER and IS_KV_CONSUMER) else "cuda" @@ -288,7 +289,7 @@ def recv_kv_caches_and_hidden_states( return None, bypass_model_exec, model_input if not is_complete: - rebuilt_model_input = self.build_partial_prefill_input( + rebuilt_model_input = build_partial_prefill_input( model_input, input_tokens_list, num_computed_tokens_list, @@ -307,117 +308,125 @@ def recv_kv_caches_and_hidden_states( logger.debug("[rank%d]: KV recv DONE.", torch.distributed.get_rank()) return hidden_or_intermediate_states, bypass_model_exec, model_input - def build_partial_prefill_input( - self, - model_input: "ModelInputForGPUWithSamplingMetadata", - input_tokens_list: List[torch.Tensor], - num_computed_tokens_list: List[int], - start_pos_list: List[int], - slot_mapping_flat: torch.Tensor, - device: torch.device, - ) -> "ModelInputForGPUWithSamplingMetadata": - rebuilt_input_tokens = [] - rebuilt_input_positions = [] - rebuilt_query_lens = [] - - rebuilt_num_prefills = 0 - rebuilt_num_prefill_tokens = 0 - rebuilt_slot_mapping = [] - rebuilt_max_query_len = 0 - - rebuilt_block_tables = [] - - rebuilt_query_start_loc = [0] - rebuilt_context_lens_tensor = [] - rebuilt_selected_token_indices = [] - - # recounting query 
and context lengths - for idx in range(len(input_tokens_list)): - token_tensor = input_tokens_list[idx] - num_token = len(token_tensor) - num_computed_token = num_computed_tokens_list[idx] - start_pos = start_pos_list[idx] - - rebuilt_input_tokens.append(token_tensor[num_computed_token:]) - # TODO(Jiayi): please check the correctness of next line - rebuilt_input_positions.append( - model_input.input_positions[start_pos + - num_computed_token:start_pos + - num_token]) - q_len = num_token - num_computed_token - rebuilt_query_lens.append(q_len) - - # Attn metadata-related - rebuilt_num_prefills += 1 - rebuilt_num_prefill_tokens += q_len - rebuilt_slot_mapping.append( - slot_mapping_flat[start_pos + num_computed_token:start_pos + - num_token]) - rebuilt_max_query_len = max(q_len, rebuilt_max_query_len) - # TODO(Jiayi): remove hard-code (block_size=16) - blk_size = 16 - temp_block_table = [ - i // blk_size - for i in range(start_pos, start_pos + num_token, blk_size) - ] - rebuilt_block_tables.append(temp_block_table) - rebuilt_query_start_loc.append(q_len) #start with 0 - rebuilt_context_lens_tensor.append(num_computed_token) - - # Sampling metadata related - #seq_groups (use rebuilt query lens) - rebuilt_selected_token_indices.append(start_pos + q_len - 1) - - # rebuilt attn_metadata - rebuilt_attn_metadata = deepcopy(model_input.attn_metadata) - rebuilt_attn_metadata.num_prefills = rebuilt_num_prefills - rebuilt_attn_metadata.num_prefill_tokens = rebuilt_num_prefill_tokens - rebuilt_attn_metadata.slot_mapping = torch.cat( - rebuilt_slot_mapping).to(device) - rebuilt_attn_metadata.max_query_len = rebuilt_max_query_len - - rebuilt_attn_metadata.block_tables = torch.tensor( - rebuilt_block_tables, - dtype=model_input.attn_metadata.block_tables.dtype).to(device) - - rebuilt_attn_metadata.query_start_loc = torch.tensor( - rebuilt_query_start_loc, - dtype=model_input.attn_metadata.query_start_loc.dtype).to(device) - rebuilt_attn_metadata.context_lens_tensor = torch.tensor( 
- rebuilt_context_lens_tensor, - dtype=model_input.attn_metadata.context_lens_tensor.dtype, - ).to(device) - - rebuilt_attn_metadata._cached_prefill_metadata = None - - # rebuilt sampling_metadata - rebuilt_sampling_metadata = deepcopy(model_input.sampling_metadata) - for idx, q_len in enumerate(rebuilt_query_lens): + + + +def build_partial_prefill_input( + model_input: "ModelInputForGPUWithSamplingMetadata", + input_tokens_list: List[torch.Tensor], + num_computed_tokens_list: List[int], + start_pos_list: List[int], + slot_mapping_flat: torch.Tensor, + device: torch.device, +) -> "ModelInputForGPUWithSamplingMetadata": + """ + Helper function to rebuild the model input for the current request. + Goal: avoid running redundant prefill on those tokens that already has KV + caches received. + """ + rebuilt_input_tokens = [] + rebuilt_input_positions = [] + rebuilt_query_lens = [] + + rebuilt_num_prefills = 0 + rebuilt_num_prefill_tokens = 0 + rebuilt_slot_mapping = [] + rebuilt_max_query_len = 0 + + rebuilt_block_tables = [] + + rebuilt_query_start_loc = [0] + rebuilt_context_lens_tensor = [] + rebuilt_selected_token_indices = [] + + # recounting query and context lengths + for idx in range(len(input_tokens_list)): + token_tensor = input_tokens_list[idx] + num_token = len(token_tensor) + num_computed_token = num_computed_tokens_list[idx] + start_pos = start_pos_list[idx] + + rebuilt_input_tokens.append(token_tensor[num_computed_token:]) + # TODO(Jiayi): please check the correctness of next line + rebuilt_input_positions.append( + model_input.input_positions[start_pos + + num_computed_token : start_pos + + num_token]) + q_len = num_token - num_computed_token + rebuilt_query_lens.append(q_len) + + # Attn metadata-related + rebuilt_num_prefills += 1 + rebuilt_num_prefill_tokens += q_len + new_slot_mapping = slot_mapping_flat[start_pos + num_computed_token : start_pos + num_token] + rebuilt_slot_mapping.append(new_slot_mapping) + rebuilt_max_query_len = max(q_len, 
rebuilt_max_query_len) + # TODO(Jiayi): remove hard-code (block_size=16) + blk_size = 16 + temp_block_table = [ + slot_mapping_flat[i] // blk_size + for i in range(start_pos, start_pos + num_token, blk_size) + ] + rebuilt_block_tables.append(temp_block_table) + rebuilt_query_start_loc.append(rebuilt_num_prefill_tokens) #start with 0 + rebuilt_context_lens_tensor.append(num_computed_token) + + # Sampling metadata related + #seq_groups (use rebuilt query lens) + rebuilt_selected_token_indices.append(rebuilt_num_prefill_tokens - 1) + + # rebuilt attn_metadata + rebuilt_attn_metadata = deepcopy(model_input.attn_metadata) + rebuilt_attn_metadata.num_prefills = rebuilt_num_prefills + rebuilt_attn_metadata.num_prefill_tokens = rebuilt_num_prefill_tokens + rebuilt_attn_metadata.slot_mapping = torch.cat( + rebuilt_slot_mapping).to(device) + rebuilt_attn_metadata.max_query_len = rebuilt_max_query_len + + rebuilt_attn_metadata.block_tables = torch.tensor( + rebuilt_block_tables, + dtype=model_input.attn_metadata.block_tables.dtype).to(device) + + rebuilt_attn_metadata.query_start_loc = torch.tensor( + rebuilt_query_start_loc, + dtype=model_input.attn_metadata.query_start_loc.dtype).to(device) + rebuilt_attn_metadata.context_lens_tensor = torch.tensor( + rebuilt_context_lens_tensor, + dtype=model_input.attn_metadata.context_lens_tensor.dtype, + ).to(device) + + rebuilt_attn_metadata._cached_prefill_metadata = None + + # rebuilt sampling_metadata + rebuilt_sampling_metadata = deepcopy(model_input.sampling_metadata) + for idx, q_len in enumerate(rebuilt_query_lens): + if rebuilt_sampling_metadata.seq_groups is not None: rebuilt_sampling_metadata.seq_groups[idx].query_len = q_len - rebuilt_sampling_metadata.selected_token_indices = torch.tensor( - rebuilt_selected_token_indices, - dtype=model_input.sampling_metadata.selected_token_indices.dtype, - ).to(device) - - # import here to avoid circular import. 
- from vllm.worker.model_runner import ( - ModelInputForGPUWithSamplingMetadata) - rebuilt_model_input = ModelInputForGPUWithSamplingMetadata( - input_tokens=torch.cat(rebuilt_input_tokens).to(device), - input_positions=torch.cat(rebuilt_input_positions).to(device), - seq_lens=model_input.seq_lens, - query_lens=rebuilt_query_lens, - lora_mapping=model_input.lora_mapping, - lora_requests=model_input.lora_requests, - attn_metadata=rebuilt_attn_metadata, - prompt_adapter_mapping=model_input.prompt_adapter_mapping, - prompt_adapter_requests=model_input.prompt_adapter_requests, - multi_modal_kwargs=model_input.multi_modal_kwargs, - request_ids_to_seq_ids=model_input.request_ids_to_seq_ids, - finished_requests_ids=model_input.finished_requests_ids, - virtual_engine=model_input.virtual_engine, - sampling_metadata=rebuilt_sampling_metadata, - is_prompt=model_input.is_prompt, - ) - - return rebuilt_model_input + + rebuilt_sampling_metadata.selected_token_indices = torch.tensor( + rebuilt_selected_token_indices, + dtype=model_input.sampling_metadata.selected_token_indices.dtype, + ).to(device) + + # import here to avoid circular import. 
+ from vllm.worker.model_runner import ( + ModelInputForGPUWithSamplingMetadata) + rebuilt_model_input = ModelInputForGPUWithSamplingMetadata( + input_tokens=torch.cat(rebuilt_input_tokens).to(device), + input_positions=torch.cat(rebuilt_input_positions).to(device), + seq_lens=model_input.seq_lens, + query_lens=rebuilt_query_lens, + lora_mapping=model_input.lora_mapping, + lora_requests=model_input.lora_requests, + attn_metadata=rebuilt_attn_metadata, + prompt_adapter_mapping=model_input.prompt_adapter_mapping, + prompt_adapter_requests=model_input.prompt_adapter_requests, + multi_modal_kwargs=model_input.multi_modal_kwargs, + request_ids_to_seq_ids=model_input.request_ids_to_seq_ids, + finished_requests_ids=model_input.finished_requests_ids, + virtual_engine=model_input.virtual_engine, + sampling_metadata=rebuilt_sampling_metadata, + is_prompt=model_input.is_prompt, + ) + + return rebuilt_model_input From 94a5086ae10d52d3d2d0d8704854aa07d6a789a5 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 20 Sep 2024 01:06:11 +0000 Subject: [PATCH 249/278] bug fix for LMCache -- adjust vLLM's rebuild input, and merge the logic to reduce random exit branch --- vllm/distributed/kv_transfer/vllm_adapter.py | 55 ++++++++++++-------- vllm/distributed/parallel_state.py | 3 +- 2 files changed, 35 insertions(+), 23 deletions(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 9b9a035185ad..f901f9e22abc 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -86,6 +86,7 @@ def __init__( # In remote KV cache store, vLLM will use both send pipe and recv pipe # So we build both send pipe and recv pipe for simplicity. 
if IS_KV_PRODUCER: + self.send_pipe = TorchDistributedPipe( group_ranks, local_rank, @@ -207,11 +208,12 @@ def recv_kv_caches_and_hidden_states( ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: - # When this flag is set to False, it means that + # When this flag is set to False, it means that at least for one + # request its corresponding KV cache or hidden state is missing. + # In this case we need to do prefilling to recompute missing KV cache + # and hidden states. bypass_model_exec = True - # This is disagg decode instance, during prefill state - # Need to receive KV from the prefill instance input_tokens_tensor = model_input.input_tokens seq_lens = model_input.attn_metadata.seq_lens slot_mapping = model_input.attn_metadata.slot_mapping.flatten() @@ -231,6 +233,7 @@ def recv_kv_caches_and_hidden_states( current_tokens = input_tokens_tensor[start_pos:end_pos] num_tokens = slen + # collecting data for rebuilding the input input_tokens_list.append(current_tokens) start_pos_list.append(start_pos) @@ -246,19 +249,26 @@ def recv_kv_caches_and_hidden_states( num_computed_tokens_list.append(0) continue - # TODO(Jiayi): change the logic here (need roi) roi: torch.Tensor = ret[1] keys: torch.Tensor = ret[2] values: torch.Tensor = ret[3] hidden: torch.Tensor = ret[4] - # Jiayi: currently assume roi is a prefix num_computed_tokens = roi.shape[0] num_computed_tokens_list.append(num_computed_tokens) - is_complete = (num_computed_tokens == num_tokens) + + # check if both KV cache and the hidden states are received + # If not, need to redo the forwarding to compute missing states + if not all([ + (num_computed_tokens == num_tokens), + hidden is not None + ]): + bypass_model_exec = False + + # update the end position based on how many tokens are cached. 
end_pos = start_pos + num_computed_tokens - # receive KV cache from disaggregated prefill instance + # put received KV caches into paged memory for i in range(model_executable.model.start_layer, model_executable.model.end_layer): @@ -279,33 +289,30 @@ def recv_kv_caches_and_hidden_states( hidden_or_intermediate_states_for_one_req.append(hidden) - # FIXME(Jiayi): we need to support only skip m out of n reqs in a batch - # same for prefix caching - if not bypass_model_exec: + if bypass_model_exec == False: # Some of the KV cache is not retrieved - # so we need to recompute the hidden state - logger.debug("[rank%d]: KV EMPTY recv DONE.", + # so we need to adjust model_input and redo the forwarding. + logger.debug("[rank%d]: Failed to receive all KVs and hidden " + "states, redo model forwarding.", torch.distributed.get_rank()) - return None, bypass_model_exec, model_input - - if not is_complete: rebuilt_model_input = build_partial_prefill_input( model_input, input_tokens_list, num_computed_tokens_list, start_pos_list, slot_mapping, - device=kv_cache[0].device, + device=input_tokens_tensor.device, ) - logger.debug("[rank%d]: KV PARTIAL recv DONE.", - torch.distributed.get_rank()) - return None, False, rebuilt_model_input + model_input = rebuilt_model_input + hidden_or_intermediate_states = None - # concatenate hidden states from different requests - hidden_or_intermediate_states = torch.cat( + else: + logger.debug("[rank%d]: Successfully received all KVs and hidden " + "states, skip model forwarding.", + torch.distributed.get_rank()) + hidden_or_intermediate_states = torch.cat( hidden_or_intermediate_states_for_one_req, dim=0) - logger.debug("[rank%d]: KV recv DONE.", torch.distributed.get_rank()) return hidden_or_intermediate_states, bypass_model_exec, model_input @@ -344,6 +351,10 @@ def build_partial_prefill_input( token_tensor = input_tokens_list[idx] num_token = len(token_tensor) num_computed_token = num_computed_tokens_list[idx] + # currently attention kernel 
cannot handle the case where there is 0 + # query token. + if num_computed_token == num_token: + num_computed_token -= 1 start_pos = start_pos_list[idx] rebuilt_input_tokens.append(token_tensor[num_computed_token:]) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 64faa5b81263..ab24ef09090c 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -983,7 +983,7 @@ def init_distributed_environment( maybe_disagg_rank = rank if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: maybe_disagg_world_size = world_size * 2 - logger.debug("Disaggregated prefill enabled.") + logger.debug("Distributed KV transfer enabled.") if dist_kv.IS_KV_PRODUCER: # for prefill, the ranks are [0, world_size) logger.debug("rank %d is KV producer.", rank) @@ -996,6 +996,7 @@ def init_distributed_environment( rank, maybe_disagg_rank) + torch.distributed.init_process_group( backend=backend, init_method=distributed_init_method, From 8099fb3a132d1b0fc1beac411189ed019721e692 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 20 Sep 2024 01:07:33 +0000 Subject: [PATCH 250/278] make format checker happy --- vllm/distributed/kv_transfer/vllm_adapter.py | 71 ++++++++++---------- vllm/distributed/parallel_state.py | 9 +-- 2 files changed, 39 insertions(+), 41 deletions(-) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index f901f9e22abc..8e57b5171e04 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -51,10 +51,11 @@ # When the current instance is both KV producer and KV consumer, # it is likely connected to a KV storage service on CPU/disk # so the communication backend needs to be "gloo" for that case. 
-DISTRIBUTED_BACKEND: str = "gloo" if (IS_KV_PRODUCER and IS_KV_CONSUMER) else "nccl" +DISTRIBUTED_BACKEND: str = "gloo" if (IS_KV_PRODUCER + and IS_KV_CONSUMER) else "nccl" # corresponding device -DISTRIBUTED_DEVICE: str = "cpu" if (IS_KV_PRODUCER and IS_KV_CONSUMER) else "cuda" - +DISTRIBUTED_DEVICE: str = "cpu" if (IS_KV_PRODUCER + and IS_KV_CONSUMER) else "cuda" class KV_transfer_agent: @@ -86,7 +87,7 @@ def __init__( # In remote KV cache store, vLLM will use both send pipe and recv pipe # So we build both send pipe and recv pipe for simplicity. if IS_KV_PRODUCER: - + self.send_pipe = TorchDistributedPipe( group_ranks, local_rank, @@ -115,10 +116,10 @@ def __init__( self.lookup_buffer_size) self.tensor_device = DISTRIBUTED_DEVICE else: - - # the current vLLM instance is KV consumer, so it needs to connect + + # the current vLLM instance is KV consumer, so it needs to connect # its recv pipe to the send pipe of KV producder - + self.recv_pipe = TorchDistributedPipe( group_ranks, local_rank, @@ -208,10 +209,10 @@ def recv_kv_caches_and_hidden_states( ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: - # When this flag is set to False, it means that at least for one + # When this flag is set to False, it means that at least for one # request its corresponding KV cache or hidden state is missing. - # In this case we need to do prefilling to recompute missing KV cache - # and hidden states. + # In this case we need to do prefilling to recompute missing KV cache + # and hidden states. 
bypass_model_exec = True input_tokens_tensor = model_input.input_tokens @@ -259,12 +260,10 @@ def recv_kv_caches_and_hidden_states( # check if both KV cache and the hidden states are received # If not, need to redo the forwarding to compute missing states - if not all([ - (num_computed_tokens == num_tokens), - hidden is not None - ]): + if not all([(num_computed_tokens == num_tokens), hidden is not None + ]): bypass_model_exec = False - + # update the end position based on how many tokens are cached. end_pos = start_pos + num_computed_tokens @@ -277,8 +276,10 @@ def recv_kv_caches_and_hidden_states( key_cache, value_cache = kv_cache[0], kv_cache[1] ops.reshape_and_cache_flash( - keys[i - model_executable.model.start_layer].to(key_cache.device), - values[i - model_executable.model.start_layer].to(value_cache.device), + keys[i - model_executable.model.start_layer].to( + key_cache.device), + values[i - model_executable.model.start_layer].to( + value_cache.device), key_cache, value_cache, slot_mapping[start_pos:end_pos], @@ -289,12 +290,12 @@ def recv_kv_caches_and_hidden_states( hidden_or_intermediate_states_for_one_req.append(hidden) - if bypass_model_exec == False: + if not bypass_model_exec: # Some of the KV cache is not retrieved # so we need to adjust model_input and redo the forwarding. 
- logger.debug("[rank%d]: Failed to receive all KVs and hidden " - "states, redo model forwarding.", - torch.distributed.get_rank()) + logger.debug( + "[rank%d]: Failed to receive all KVs and hidden " + "states, redo model forwarding.", torch.distributed.get_rank()) rebuilt_model_input = build_partial_prefill_input( model_input, input_tokens_list, @@ -307,17 +308,15 @@ def recv_kv_caches_and_hidden_states( hidden_or_intermediate_states = None else: - logger.debug("[rank%d]: Successfully received all KVs and hidden " - "states, skip model forwarding.", - torch.distributed.get_rank()) + logger.debug( + "[rank%d]: Successfully received all KVs and hidden " + "states, skip model forwarding.", torch.distributed.get_rank()) hidden_or_intermediate_states = torch.cat( - hidden_or_intermediate_states_for_one_req, dim=0) + hidden_or_intermediate_states_for_one_req, dim=0) return hidden_or_intermediate_states, bypass_model_exec, model_input - - def build_partial_prefill_input( model_input: "ModelInputForGPUWithSamplingMetadata", input_tokens_list: List[torch.Tensor], @@ -351,7 +350,7 @@ def build_partial_prefill_input( token_tensor = input_tokens_list[idx] num_token = len(token_tensor) num_computed_token = num_computed_tokens_list[idx] - # currently attention kernel cannot handle the case where there is 0 + # currently attention kernel cannot handle the case where there is 0 # query token. 
if num_computed_token == num_token: num_computed_token -= 1 @@ -361,7 +360,7 @@ def build_partial_prefill_input( # TODO(Jiayi): please check the correctness of next line rebuilt_input_positions.append( model_input.input_positions[start_pos + - num_computed_token : start_pos + + num_computed_token:start_pos + num_token]) q_len = num_token - num_computed_token rebuilt_query_lens.append(q_len) @@ -369,7 +368,9 @@ def build_partial_prefill_input( # Attn metadata-related rebuilt_num_prefills += 1 rebuilt_num_prefill_tokens += q_len - new_slot_mapping = slot_mapping_flat[start_pos + num_computed_token : start_pos + num_token] + new_slot_mapping = slot_mapping_flat[start_pos + + num_computed_token:start_pos + + num_token] rebuilt_slot_mapping.append(new_slot_mapping) rebuilt_max_query_len = max(q_len, rebuilt_max_query_len) # TODO(Jiayi): remove hard-code (block_size=16) @@ -379,7 +380,8 @@ def build_partial_prefill_input( for i in range(start_pos, start_pos + num_token, blk_size) ] rebuilt_block_tables.append(temp_block_table) - rebuilt_query_start_loc.append(rebuilt_num_prefill_tokens) #start with 0 + rebuilt_query_start_loc.append( + rebuilt_num_prefill_tokens) #start with 0 rebuilt_context_lens_tensor.append(num_computed_token) # Sampling metadata related @@ -390,8 +392,8 @@ def build_partial_prefill_input( rebuilt_attn_metadata = deepcopy(model_input.attn_metadata) rebuilt_attn_metadata.num_prefills = rebuilt_num_prefills rebuilt_attn_metadata.num_prefill_tokens = rebuilt_num_prefill_tokens - rebuilt_attn_metadata.slot_mapping = torch.cat( - rebuilt_slot_mapping).to(device) + rebuilt_attn_metadata.slot_mapping = torch.cat(rebuilt_slot_mapping).to( + device) rebuilt_attn_metadata.max_query_len = rebuilt_max_query_len rebuilt_attn_metadata.block_tables = torch.tensor( @@ -420,8 +422,7 @@ def build_partial_prefill_input( ).to(device) # import here to avoid circular import. 
- from vllm.worker.model_runner import ( - ModelInputForGPUWithSamplingMetadata) + from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata) rebuilt_model_input = ModelInputForGPUWithSamplingMetadata( input_tokens=torch.cat(rebuilt_input_tokens).to(device), input_positions=torch.cat(rebuilt_input_positions).to(device), diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index ab24ef09090c..c4f2b8529dd2 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -992,11 +992,9 @@ def init_distributed_environment( # this is decode instance. # offset global rank by tp * pp (which is world_size) maybe_disagg_rank = rank + world_size - logger.debug("rank %d is KV consumer, adjust it to %d", - rank, + logger.debug("rank %d is KV consumer, adjust it to %d", rank, maybe_disagg_rank) - torch.distributed.init_process_group( backend=backend, init_method=distributed_init_method, @@ -1137,7 +1135,6 @@ def initialize_model_parallel( group_name="pp") logger.debug("_PP initialized for rank %d", torch.distributed.get_rank()) - if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: global _DISAGG logger.debug("Disaggregated prefill enabled, create _DISAGG group") @@ -1152,8 +1149,8 @@ def initialize_model_parallel( local_rank=get_world_group().local_rank, ) logger.debug("_DISAGG initialized for rank %d", - torch.distributed.get_rank()) - + torch.distributed.get_rank()) + def ensure_model_parallel_initialized( tensor_model_parallel_size: int, From 603864e302650e01ae46634da51bcc7252e08cb5 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 20 Sep 2024 01:12:10 +0000 Subject: [PATCH 251/278] make ruff and yapf happy, also fix test bug --- tests/kv_transfer/disagg_test.py | 36 +++++++++++++++----- vllm/distributed/kv_transfer/vllm_adapter.py | 2 +- vllm/worker/model_runner.py | 23 +++++-------- 3 files changed, 38 insertions(+), 23 deletions(-) diff --git a/tests/kv_transfer/disagg_test.py 
b/tests/kv_transfer/disagg_test.py index fffd9ab6f42a..96203b2dc65a 100644 --- a/tests/kv_transfer/disagg_test.py +++ b/tests/kv_transfer/disagg_test.py @@ -23,23 +23,43 @@ def setup_servers(): # Start prefill instance prefill_cmd = [ - sys.executable, "-m", "vllm.entrypoints.openai.api_server", "-tp", "2", - "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--port", "8100", - "--gpu-memory-utilization", "0.8", "--max-model-len", "1000", + sys.executable, + "-m", + "vllm.entrypoints.openai.api_server", + "-tp", + "2", + "--model", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "--port", + "8100", + "--gpu-memory-utilization", + "0.8", + "--max-model-len", + "1000", ] prefill_env = os.environ.copy() - prefill_env["VLLM_DISAGG_PREFILL_ROLE"] = "prefill" + prefill_env["VLLM_DISTRIBUTED_KV_ROLE"] = "producer" prefill_env["CUDA_VISIBLE_DEVICES"] = "0,1" prefill_proc = Popen(prefill_cmd, env=prefill_env) # Start decode instance decode_cmd = [ - sys.executable, "-m", "vllm.entrypoints.openai.api_server", "-tp", "2", - "--model", "meta-llama/Meta-Llama-3.1-8B-Instruct", "--port", "8200", - "--gpu-memory-utilization", "0.8", "--max-model-len", "1000", + sys.executable, + "-m", + "vllm.entrypoints.openai.api_server", + "-tp", + "2", + "--model", + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "--port", + "8200", + "--gpu-memory-utilization", + "0.8", + "--max-model-len", + "1000", ] decode_env = os.environ.copy() - decode_env["VLLM_DISAGG_PREFILL_ROLE"] = "decode" + decode_env["VLLM_DISTRIBUTED_KV_ROLE"] = "consumer" decode_env["CUDA_VISIBLE_DEVICES"] = "2,3" decode_proc = Popen(decode_cmd, env=decode_env) diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 8e57b5171e04..7516e7c5ff30 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -422,7 +422,7 @@ def build_partial_prefill_input( ).to(device) # import here to avoid circular import. 
- from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata) + from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata rebuilt_model_input = ModelInputForGPUWithSamplingMetadata( input_tokens=torch.cat(rebuilt_input_tokens).to(device), input_positions=torch.cat(rebuilt_input_positions).to(device), diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a6f91667fedf..f1dbc7c1803e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -14,6 +14,7 @@ import torch.distributed import torch.nn as nn +import vllm.distributed.kv_transfer.vllm_adapter as dist_kv import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.attention.backends.abstract import AttentionState @@ -22,7 +23,7 @@ ModelConfig, ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig) from vllm.core.scheduler import SchedulerOutputs -from vllm.distributed import get_pp_group, get_disagg_group +from vllm.distributed import get_disagg_group, get_pp_group from vllm.distributed.parallel_state import graph_capture from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger @@ -55,8 +56,6 @@ _init_attn_metadata_from_tensor_dict, _init_sampling_metadata_from_tensor_dict, dump_input_when_exception) -import vllm.distributed.kv_transfer.vllm_adapter as dist_kv - if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionBackend @@ -1615,13 +1614,13 @@ def execute_model( attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, **MultiModalInputs.as_kwargs(multi_modal_kwargs, - device=self.device), + device=self.device), **seqlen_agnostic_kwargs) if (self.observability_config is not None and self.observability_config.collect_model_forward_time): model_forward_end.record() - + # Sending KV cache in distributed KV cache transfer setting # NOTE: the send operation is non-blocking if self.send_kv_needed(model_input, 
kv_caches): @@ -1702,8 +1701,7 @@ def execute_model( output.hidden_states = hidden_states return [output] - - + def recv_kv_needed(self, model_input, kv_caches) -> bool: """ Need to receive KV when @@ -1722,7 +1720,7 @@ def recv_kv_needed(self, model_input, kv_caches) -> bool: return all([ dist_kv.IS_KV_CONSUMER, not is_profile_run, - is_prefill_run, + is_prefill_run, ]) def send_kv_needed(self, model_input, kv_caches) -> bool: @@ -1740,11 +1738,8 @@ def send_kv_needed(self, model_input, kv_caches) -> bool: # check if the current run is prefill is_prefill_run = prefill_meta is not None - return all([ - dist_kv.IS_KV_PRODUCER, - not is_profile_run, - is_prefill_run - ]) + return all( + [dist_kv.IS_KV_PRODUCER, not is_profile_run, is_prefill_run]) class CUDAGraphRunner: @@ -1928,4 +1923,4 @@ def _get_max_graph_batch_size(max_num_seqs: int) -> int: if padded_size in _BATCH_SIZES_TO_CAPTURE: return padded_size assert padded_size > _BATCH_SIZES_TO_CAPTURE[-1] - return _BATCH_SIZES_TO_CAPTURE[-1] \ No newline at end of file + return _BATCH_SIZES_TO_CAPTURE[-1] From 1d7a1c99ff45246024013049cae54dfd4c624502 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 20 Sep 2024 01:13:25 +0000 Subject: [PATCH 252/278] remove empty file --- tests/random_send_recv.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/random_send_recv.py diff --git a/tests/random_send_recv.py b/tests/random_send_recv.py deleted file mode 100644 index e69de29bb2d1..000000000000 From 10ad09c0213fba08896f841c2ac109c83fa88ff4 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 20 Sep 2024 04:33:24 +0000 Subject: [PATCH 253/278] fix bug when world_size == -1 --- vllm/distributed/parallel_state.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index c4f2b8529dd2..efeb6253e56b 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -979,6 
+979,8 @@ def init_distributed_environment( "distributed_init_method must be provided when initializing " "distributed environment") # this backend is used for WORLD + + # offset world size and rank in disaggregated prefill scenario maybe_disagg_world_size = world_size maybe_disagg_rank = rank if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: @@ -994,6 +996,7 @@ def init_distributed_environment( maybe_disagg_rank = rank + world_size logger.debug("rank %d is KV consumer, adjust it to %d", rank, maybe_disagg_rank) + torch.distributed.init_process_group( backend=backend, @@ -1014,11 +1017,16 @@ def init_distributed_environment( global _WORLD if _WORLD is None: - ranks = [[i for i in range(world_size)]] - # offset the distributed group + # in single node the world size can be -1 + # need to infer the world size from torch.distributed.get_world_size() + torch_dist_world_size = torch.distributed.get_world_size() if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: - ranks = include_decoding_groups_if_disagg_enabled( - ranks, world_size) + # two vLLM instances in the world + # so this vLLM instance's world size is half of the world size + torch_dist_world_size = torch_dist_world_size // 2 + ranks = [[i for i in range(torch_dist_world_size)]] + ranks = include_decoding_groups_if_disagg_enabled( + ranks, world_size) _WORLD = init_world_group(ranks, local_rank, backend) logger.debug("_WORLD initialized for rank %d", From 38e3a5759cffff52110d42401e1b728b65aad184 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 20 Sep 2024 04:34:43 +0000 Subject: [PATCH 254/278] adjust comments --- vllm/distributed/parallel_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index efeb6253e56b..a44487561141 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1017,7 +1017,7 @@ def init_distributed_environment( global _WORLD if _WORLD is None: - # in single node the world size can be 
-1 + # in single node single process the world size can be -1 # need to infer the world size from torch.distributed.get_world_size() torch_dist_world_size = torch.distributed.get_world_size() if dist_kv.IS_DISTRIBUTED_KV_INSTANCE: From e2bd481b7795b4b398231d516c947abab9f4e5bd Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 20 Sep 2024 04:37:51 +0000 Subject: [PATCH 255/278] make yapf and ruff happy --- vllm/distributed/parallel_state.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index a44487561141..fcef0f53b0e2 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -996,7 +996,6 @@ def init_distributed_environment( maybe_disagg_rank = rank + world_size logger.debug("rank %d is KV consumer, adjust it to %d", rank, maybe_disagg_rank) - torch.distributed.init_process_group( backend=backend, @@ -1025,8 +1024,7 @@ def init_distributed_environment( # so this vLLM instance's world size is half of the world size torch_dist_world_size = torch_dist_world_size // 2 ranks = [[i for i in range(torch_dist_world_size)]] - ranks = include_decoding_groups_if_disagg_enabled( - ranks, world_size) + ranks = include_decoding_groups_if_disagg_enabled(ranks, world_size) _WORLD = init_world_group(ranks, local_rank, backend) logger.debug("_WORLD initialized for rank %d", From 49793376dc4242c5e2974d45e4e8fa787a2a2048 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Fri, 20 Sep 2024 17:10:55 +0000 Subject: [PATCH 256/278] relaunch CI --- vllm/distributed/parallel_state.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index fcef0f53b0e2..4630046d1073 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1021,7 +1021,7 @@ def init_distributed_environment( torch_dist_world_size = torch.distributed.get_world_size() if 
dist_kv.IS_DISTRIBUTED_KV_INSTANCE: # two vLLM instances in the world - # so this vLLM instance's world size is half of the world size + # so this vLLM instance's world size is half of torch's world size torch_dist_world_size = torch_dist_world_size // 2 ranks = [[i for i in range(torch_dist_world_size)]] ranks = include_decoding_groups_if_disagg_enabled(ranks, world_size) From a2007dc17c2c7ffc022913e3fc7c1dee4a8b57b7 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 06:49:58 +0000 Subject: [PATCH 257/278] change get_open_port so that it is easier to understand --- vllm/utils.py | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 3efd7e51b290..17a0a47ef8d9 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -34,6 +34,7 @@ import vllm.envs as envs from vllm.logger import enable_trace_function_call, init_logger from vllm.platforms import current_platform +import vllm.distributed.kv_transfer.vllm_adapter as dist_kv logger = init_logger(__name__) @@ -545,16 +546,34 @@ def get_open_zmq_ipc_path() -> str: def get_open_port(force: bool = False) -> int: port = envs.VLLM_PORT - if port is not None: - if force and port is not None: - # force vLLM to use envs.VLLM_PORT for torch.distributed init - # This is because this port will binded by prefill instance - # But both prefill and decode instance need to use this port to - # initialize torch.distributed + + if force: + # This flag will only be True in disaggregated prefill scenario + # and it has to be set so that vLLM can connect prefill vLLM instance + # and decode vLLM instance. + assert port is not None, "Please set VLLM_PORT in order to use " + "disaggregated prefill and distributed KV cache transfer." + + # For prefill vLLM instance (KV producer) this port must be empty. + # For decode vLLM instance this port can be non-empty. 
+ if dist_kv.IS_KV_PRODUCER: + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", port)) + return port + except OSError as e: + logger.error("Port %d must be empty so that prefill vLLM " + "instance can use this port to initialize " + "distributed KV communication with decode " + "vLLM instance.", port) + raise e + else: return port + + if port is not None: while True: try: - logger.error('Trying port %d', port) + logger.debug('Trying port %d', port) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", port)) return port From ce434f5ed0d3af2d7de1280ef47dee0c08bd1f41 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 06:52:22 +0000 Subject: [PATCH 258/278] adjust comment --- vllm/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 17a0a47ef8d9..0c87bfc65f10 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -551,8 +551,8 @@ def get_open_port(force: bool = False) -> int: # This flag will only be True in disaggregated prefill scenario # and it has to be set so that vLLM can connect prefill vLLM instance # and decode vLLM instance. - assert port is not None, "Please set VLLM_PORT in order to use " - "disaggregated prefill and distributed KV cache transfer." + assert port is not None, "Please set environment variable VLLM_PORT in" + " order to use disaggregated prefill and distributed KV cache transfer" # For prefill vLLM instance (KV producer) this port must be empty. # For decode vLLM instance this port can be non-empty. 
@@ -573,7 +573,7 @@ def get_open_port(force: bool = False) -> int: if port is not None: while True: try: - logger.debug('Trying port %d', port) + logger.info('Trying port %d', port) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", port)) return port From f224c71f0b2de554a199d334ab0186895b566c2b Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 06:54:25 +0000 Subject: [PATCH 259/278] make format checker happy --- vllm/utils.py | 17 +++++++++-------- vllm/worker/worker.py | 2 +- vllm/worker/worker_base.py | 2 +- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index 0c87bfc65f10..d2ba2272575e 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -31,10 +31,10 @@ from packaging.version import Version from typing_extensions import ParamSpec, TypeIs, assert_never +import vllm.distributed.kv_transfer.vllm_adapter as dist_kv import vllm.envs as envs from vllm.logger import enable_trace_function_call, init_logger from vllm.platforms import current_platform -import vllm.distributed.kv_transfer.vllm_adapter as dist_kv logger = init_logger(__name__) @@ -546,14 +546,14 @@ def get_open_zmq_ipc_path() -> str: def get_open_port(force: bool = False) -> int: port = envs.VLLM_PORT - + if force: # This flag will only be True in disaggregated prefill scenario # and it has to be set so that vLLM can connect prefill vLLM instance # and decode vLLM instance. assert port is not None, "Please set environment variable VLLM_PORT in" " order to use disaggregated prefill and distributed KV cache transfer" - + # For prefill vLLM instance (KV producer) this port must be empty. # For decode vLLM instance this port can be non-empty. 
if dist_kv.IS_KV_PRODUCER: @@ -562,14 +562,15 @@ def get_open_port(force: bool = False) -> int: s.bind(("", port)) return port except OSError as e: - logger.error("Port %d must be empty so that prefill vLLM " - "instance can use this port to initialize " - "distributed KV communication with decode " - "vLLM instance.", port) + logger.error( + "Port %d must be empty so that prefill vLLM " + "instance can use this port to initialize " + "distributed KV communication with decode " + "vLLM instance.", port) raise e else: return port - + if port is not None: while True: try: diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index a12e026b2bdf..3851843afc96 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -485,4 +485,4 @@ def raise_if_cache_size_invalid(num_gpu_blocks, block_size, "is larger than the maximum number of tokens that can be " f"stored in KV cache ({max_seq_len}). Try increasing " "`gpu_memory_utilization` or decreasing `max_model_len` when " - "initializing the engine.") \ No newline at end of file + "initializing the engine.") diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index abc6f98b5f30..6ba4f272315c 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -482,4 +482,4 @@ def extract_previous_hidden_states( output["previous_hidden_states"] = data.previous_hidden_states\ .hidden_states - return output \ No newline at end of file + return output From 5d9b007b3b26eb445cfe1689dd1fb179aa5fb737 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 07:01:42 +0000 Subject: [PATCH 260/278] adjust model runner docstring --- vllm/worker/model_runner.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index f1dbc7c1803e..da33cda71dff 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1584,7 +1584,7 @@ def execute_model( # we can skip prefilling on 
tokens that successfully received KV caches # NOTE: The receive operation is blocking bypass_model_exec = False - if self.recv_kv_needed(model_input, kv_caches): + if self.need_recv_kv(model_input, kv_caches): hidden_or_intermediate_states, bypass_model_exec, model_input = \ get_disagg_group().recv_kv_caches_and_hidden_states( # model is used to know which layer the current worker @@ -1623,11 +1623,10 @@ def execute_model( # Sending KV cache in distributed KV cache transfer setting # NOTE: the send operation is non-blocking - if self.send_kv_needed(model_input, kv_caches): - logger.debug("Sending KV caches") + if self.need_send_kv(model_input, kv_caches): get_disagg_group().send_kv_caches_and_hidden_states( - # model is used to know which layer the current worker - # is working on, so that we can send KV for only those + # model_executable is used to know which layer the current + # worker is working on, so that we can send KV for only those # layers. model_executable, model_input, @@ -1702,12 +1701,16 @@ def execute_model( return [output] - def recv_kv_needed(self, model_input, kv_caches) -> bool: - """ - Need to receive KV when - 1. current vLLM instance is KV cache *consumer* + def need_recv_kv(self, model_input, kv_caches) -> bool: + """Check if we need to receive kv-cache from the other worker. + We need to receive KV when + 1. current vLLM instance is KV cache consumer/decode vLLM instance 2. this batch is not a profiling run 3. this batch is a prefill run + + Args: + model_input: input to the model executable + kv_caches: vLLM's paged memory """ prefill_meta = model_input.attn_metadata.prefill_metadata @@ -1723,12 +1726,16 @@ def recv_kv_needed(self, model_input, kv_caches) -> bool: is_prefill_run, ]) - def send_kv_needed(self, model_input, kv_caches) -> bool: - """ - Need to receive KV when - 1. current vLLM instance is KV cache *producer* + def need_send_kv(self, model_input, kv_caches) -> bool: + """Check if we need to send kv-cache from the other worker. 
+ We need to send KV when + 1. current vLLM instance is KV cache producer/prefill vLLM instance 2. this batch is not a profiling run 3. this batch is a prefill run + + Args: + model_input: input to the model executable + kv_caches: vLLM's paged memory """ prefill_meta = model_input.attn_metadata.prefill_metadata From 6255dca7d649953b51f3f68e4157045c21997604 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 07:04:11 +0000 Subject: [PATCH 261/278] make format checker happy --- vllm/worker/model_runner.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index da33cda71dff..eba5ddd14090 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1625,7 +1625,7 @@ def execute_model( # NOTE: the send operation is non-blocking if self.need_send_kv(model_input, kv_caches): get_disagg_group().send_kv_caches_and_hidden_states( - # model_executable is used to know which layer the current + # model_executable is used to know which layer the current # worker is working on, so that we can send KV for only those # layers. model_executable, @@ -1720,14 +1720,11 @@ def need_recv_kv(self, model_input, kv_caches) -> bool: # check if the current run is prefill is_prefill_run = prefill_meta is not None - return all([ - dist_kv.IS_KV_CONSUMER, - not is_profile_run, - is_prefill_run, - ]) + return dist_kv.IS_KV_CONSUMER and ( + not is_profile_run) and is_prefill_run def need_send_kv(self, model_input, kv_caches) -> bool: - """Check if we need to send kv-cache from the other worker. + """Check if we need to send kv-cache to the other worker. We need to send KV when 1. current vLLM instance is KV cache producer/prefill vLLM instance 2. 
this batch is not a profiling run @@ -1745,8 +1742,8 @@ def need_send_kv(self, model_input, kv_caches) -> bool: # check if the current run is prefill is_prefill_run = prefill_meta is not None - return all( - [dist_kv.IS_KV_PRODUCER, not is_profile_run, is_prefill_run]) + return dist_kv.IS_KV_PRODUCER and ( + not is_profile_run) and is_prefill_run class CUDAGraphRunner: From 71ae27592b35ecf4d48d44ab12eb8295ed2a6f1e Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 18:38:29 +0000 Subject: [PATCH 262/278] change data == [] to not data (thanks Cody) --- vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index 9696032002fd..96f8f14561e7 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -78,7 +78,7 @@ def _send_tensor_and_dec_size(self, def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]): - if data == [] or data is None: + if not data: return 0 if isinstance(data, torch.Tensor): return data.element_size() * data.numel() From 80164ea3f33b17a5daaa9aa8202f561f1d829867 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 18:47:18 +0000 Subject: [PATCH 263/278] fix misleading to available --- vllm/utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/utils.py b/vllm/utils.py index d2ba2272575e..a423eb355370 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -549,14 +549,15 @@ def get_open_port(force: bool = False) -> int: if force: # This flag will only be True in disaggregated prefill scenario - # and it has to be set so that vLLM can connect prefill vLLM instance - # and decode vLLM instance. + # and VLLM_PORT must be set so that vLLM can connect prefill vLLM + # instance and decode vLLM instance. 
assert port is not None, "Please set environment variable VLLM_PORT in" " order to use disaggregated prefill and distributed KV cache transfer" - # For prefill vLLM instance (KV producer) this port must be empty. - # For decode vLLM instance this port can be non-empty. + # For prefill vLLM instance (KV producer), `port` must be available. + # For decode vLLM instance `port` can be not available. if dist_kv.IS_KV_PRODUCER: + # `port` must be available. try: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", port)) @@ -569,6 +570,7 @@ def get_open_port(force: bool = False) -> int: "vLLM instance.", port) raise e else: + # `port` can be not available return port if port is not None: From 52c2d1084c7607f637d1c3e5cac6b8f02bd46490 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 18:49:50 +0000 Subject: [PATCH 264/278] add new line and run format checker --- benchmarks/benchmark_serving.py | 2 +- vllm/utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index a407a263120b..d0e1cb41a68b 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -963,4 +963,4 @@ def main(args: argparse.Namespace): ) args = parser.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/vllm/utils.py b/vllm/utils.py index a423eb355370..55de17e242b2 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -549,7 +549,7 @@ def get_open_port(force: bool = False) -> int: if force: # This flag will only be True in disaggregated prefill scenario - # and VLLM_PORT must be set so that vLLM can connect prefill vLLM + # and VLLM_PORT must be set so that vLLM can connect prefill vLLM # instance and decode vLLM instance. 
assert port is not None, "Please set environment variable VLLM_PORT in" " order to use disaggregated prefill and distributed KV cache transfer" From 09478ef416d99a75c96ec8b2547518aef7d05771 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 19:46:34 +0000 Subject: [PATCH 265/278] add docstring for lookup buffer --- .../kv_transfer/kv_lookup_buffer/base.py | 93 ++++++++++++++++++- .../kv_lookup_buffer/simple_buffer.py | 18 +++- 2 files changed, 102 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index 80802f87987a..4bacde288434 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -1,3 +1,12 @@ +""" +This file contains a new class `KVLookupBufferBase` that allows developers to +think of KV cache operations as inserting new KV cache entries (`insert`) +into the lookup buffer and querying existing KV caches (`drop_select`) +from the lookup buffer. + +All distributed communications are abstracted behind this class. +""" + from abc import ABC, abstractmethod from typing import List, Optional @@ -5,21 +14,97 @@ class KVLookupBufferBase(ABC): + """ + Abstract base class for a lookup buffer. + + This class provides an abstraction for a key-value (KV) cache lookup buffer. + + The key of the lookup buffer: + - input_tokens: token IDs of the request + - roi: a binary mask on top of input_tokens. + - Purpose of roi: Since KV cache may only be available for a subset of + tokens in the input (for example, when vLLM is connected to an external + KV cache service), roi specifies the subset of tokens that the KV cache + is associated with. + - NOTE: roi can be further extended to describe which part of KV the + current process is holding (each process may only hold a part of KV + due to TP and PP). This is not implemented for now. 
+ + The value of the lookup buffer: + - key: the key tensor in the KV cache + - value: the value tensor in the KV cache + - hidden: the final hidden state generated by model forwarding. This allows + vLLM to bypass further model forwarding by transmitting the hidden state. + """ @abstractmethod def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, key: torch.Tensor, value: torch.Tensor, hidden: torch.Tensor) -> None: + """Insert into the lookup buffer. + + The functionality is similar to the following python statement + ``` + buffer[input_tokens, roi] = [key, value, hidden] + ``` + + FIXME: in the future, we should only have two arguments, key and value, + where key is a tensor dict and value is a tensor dict. + + FIXME: we should transmit both sampler outputs and the hidden states. + + Args: + input_tokens (torch.Tensor): token IDs. + roi (torch.Tensor): A binary mask on top of the input tokens + key (torch.Tensor): The key tensor in the KV cache. + value (torch.Tensor): The value tensor in the KV cache. + hidden (torch.Tensor): The final hidden state tensor generated + during model forwarding to bypass model + forwarding. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ raise NotImplementedError @abstractmethod - def drop_select(self, input_tokens: torch.Tensor, - roi: torch.Tensor) -> List[Optional[torch.Tensor]]: + def drop_select( + self, input_tokens: Optional[torch.Tensor], + roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: + """ + Select and *drop* KV cache entries from the lookup buffer. + + The functionality is similar to the following python statements + ``` + ret = buffer.pop(input_tokens, roi) + return ret + ``` + + If `input_tokens` and `roi` is `None`, it means selecting any of the + KV caches in the buffer, return, and remove it from the buffer, useful + when offloading KV cache to KV cache storage service. + + Args: + input_tokens (torch.Tensor): token IDs. 
+ roi (torch.Tensor): A binary mask on top of the input tokens + + Returns: + List[Optional[torch.Tensor]]: A list of tensors. Can be None. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ raise NotImplementedError @abstractmethod - def close(self): + def close(self) -> None: """ - Close the buffer, release resources. + Close the buffer and release resources. + + This method is responsible for cleaning up resources related to the + lookup buffer when it is no longer needed. + + Raises: + NotImplementedError: This method must be implemented in subclasses. """ raise NotImplementedError diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index 96f8f14561e7..33f9b7c44024 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -57,7 +57,7 @@ def _matches(self, tokens_roi_sender: List[torch.Tensor], # so any of the data in the buffer can be drop-selected return True - # Assuming that roi is a mask on tokens + # Assuming that roi is a binary mask on tokens tokens_sender = tokens_sender[roi_sender] tokens_recver = tokens_recver[roi_recver] @@ -128,6 +128,9 @@ def drop_select_handler(self): matched_length = 0 # perform input tokens and roi matching + # FIXME: this matching is O(n), ideally it should be O(1) + # but this buffer size won't (and shouldn't) be too large so + # the fix is not urgent. 
with self.buffer_lock: for _ in range(len(self.buffer)): @@ -158,11 +161,13 @@ def drop_select_handler(self): logger.debug("Closing drop_select_handler") - def drop_select(self, input_tokens: torch.Tensor, - roi: torch.Tensor) -> List[Optional[torch.Tensor]]: + def drop_select( + self, input_tokens: Optional[torch.Tensor], + roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: assert self.request_handling_thread is None, \ - "drop_select should be called by the receiver" + "drop_select should be called by the KV cache consumer "\ + "(e.g. the decode vLLM instance)" if isinstance(input_tokens, torch.Tensor): input_tokens = input_tokens.clone() @@ -188,8 +193,11 @@ def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, key: torch.Tensor, value: torch.Tensor, hidden: torch.Tensor) -> None: + if self.buffer_size > self.buffer_size_threshold: + # log outside the while loop to avoid this message being logged + # repeatedly. + logger.debug("KV transfer buffer is full. Handling...") while self.buffer_size > self.buffer_size_threshold: - # logger.debug("KV transfer buffer is full. 
Handling...") self.full_handler() self._add_to_buffer(input_tokens, roi, key, value, hidden) From 06cb15c7b13c84fd201f99dafb866e8669c1494a Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 19:47:09 +0000 Subject: [PATCH 266/278] align docstring syntax --- vllm/distributed/kv_transfer/kv_lookup_buffer/base.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py index 4bacde288434..bad119a1aa92 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/base.py @@ -71,8 +71,7 @@ def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, def drop_select( self, input_tokens: Optional[torch.Tensor], roi: Optional[torch.Tensor]) -> List[Optional[torch.Tensor]]: - """ - Select and *drop* KV cache entries from the lookup buffer. + """Select and *drop* KV cache entries from the lookup buffer. The functionality is similar to the following python statements ``` @@ -98,8 +97,7 @@ def drop_select( @abstractmethod def close(self) -> None: - """ - Close the buffer and release resources. + """Close the buffer and release resources. This method is responsible for cleaning up resources related to the lookup buffer when it is no longer needed. 
From 7c11a392c012bc795a444ff547df103cb45cbbe6 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 19:58:56 +0000 Subject: [PATCH 267/278] add docstring for abstract classes --- .../kv_lookup_buffer/simple_buffer.py | 2 +- vllm/distributed/kv_transfer/kv_pipe/base.py | 47 ++++++++++++++++++- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index 33f9b7c44024..a3a9e8a2846b 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -194,7 +194,7 @@ def insert(self, input_tokens: torch.Tensor, roi: torch.Tensor, hidden: torch.Tensor) -> None: if self.buffer_size > self.buffer_size_threshold: - # log outside the while loop to avoid this message being logged + # log outside the while loop to avoid this message being logged # repeatedly. logger.debug("KV transfer buffer is full. Handling...") while self.buffer_size > self.buffer_size_threshold: diff --git a/vllm/distributed/kv_transfer/kv_pipe/base.py b/vllm/distributed/kv_transfer/kv_pipe/base.py index 0955b4e83889..79e235b48fd7 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/base.py +++ b/vllm/distributed/kv_transfer/kv_pipe/base.py @@ -1,3 +1,13 @@ +""" +This file defines +`KVPipeBase` +that provides an abstraction for sending and receiving tensors, or None, via +distributed communications. + +All distributed communications for disagg prefill & KV cache storage should be +handled by `KVPipeBase`. +""" + from abc import ABC, abstractmethod from typing import Optional @@ -5,15 +15,50 @@ class KVPipeBase(ABC): + """ + This class provides an interface for sending and receiving tensors, or + None, by distributed communications. + """ @abstractmethod def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: + """Send a tensor, or None, via the pipe. 
+ + Need to support sending None -- important for error handling. + + TODO: add a `key` argument so that we can use traditional + key-value database as the distributed communication mechanism behind + the pipe. + + Args: + tensor (Optional[torch.Tensor]): The tensor to be sent. Can be None. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ raise NotImplementedError @abstractmethod def recv_tensor(self) -> Optional[torch.Tensor]: + """Receive a tensor (can be None) from the pipeline. + + Returns: + Optional[torch.Tensor]: The tensor received from the pipeline. Can + be None. + + Raises: + NotImplementedError: This method must be implemented in subclasses. + """ raise NotImplementedError @abstractmethod - def close(self): + def close(self) -> None: + """Close the pipeline and release resources. + + This method is responsible for closing the communication pipeline + and releasing any resources associated with it. + + Raises: + NotImplementedError: This method must be implemented in subclasses. 
+ """ raise NotImplementedError From 37bac34d2e59c2a374a2d557ba38b026c68d5b72 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 20:03:23 +0000 Subject: [PATCH 268/278] put assertion at the end of the function --- .../distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index a3a9e8a2846b..8dfa61780dda 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -82,8 +82,8 @@ def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]): return 0 if isinstance(data, torch.Tensor): return data.element_size() * data.numel() - else: - raise AssertionError("Unknown data type %s" % type(data)) + + raise AssertionError("Unknown data type %s" % type(data)) def _add_to_buffer(self, input_tokens: torch.Tensor, roi: torch.Tensor, key: torch.Tensor, value: torch.Tensor, From 111abb463df1825aeab0c8632429d297f1deeb08 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 20:17:22 +0000 Subject: [PATCH 269/278] add fp8 support to pipe --- .../kv_transfer/kv_pipe/torch_distributed_pipe.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index c2c5cbbe95b0..f58643d316a0 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -15,13 +15,15 @@ # this means that the sended object is None. 
NONE_INT = -150886311 -# Mapping tensor dtype to a int, used for tensor metadata transmission +# Mapping tensor dtype to INT64, used for tensor metadata transmission FLOAT16_INT = -543205003776624 INT64_INT = -375623078607432 BOOL_INT = -28035262008646 BFLOAT16_INT = -452084912267662 FLOAT32_INT = -1049557997456592 FLOAT64_INT = -452201007054137 +FLOAT8_E4M3FN_INT = -1066697177659525 +FLOAT8_E5M2_INT = -618182574682355 DTYPE2INT = { torch.float16: FLOAT16_INT, @@ -30,6 +32,8 @@ torch.bfloat16: BFLOAT16_INT, torch.float32: FLOAT32_INT, torch.float64: FLOAT64_INT, + torch.float8_e4m3fn: FLOAT8_E4M3FN_INT, + torch.float8_e5m2: FLOAT8_E5M2_INT, } INT2DTYPE = { @@ -39,6 +43,8 @@ BFLOAT16_INT: torch.bfloat16, FLOAT32_INT: torch.float32, FLOAT64_INT: torch.float64, + FLOAT8_E4M3FN_INT: torch.float8_e4m3fn, + FLOAT8_E5M2_INT: torch.float8_e5m2, } From 394afaa21f0636f84d5b39239f726a2edf0c4816 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 20:24:34 +0000 Subject: [PATCH 270/278] adjust docstrings --- .../kv_pipe/torch_distributed_pipe.py | 37 ++++++------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py index f58643d316a0..3fe3fa289c66 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/torch_distributed_pipe.py @@ -89,9 +89,6 @@ def __init__( self.target_rank_for_recv = self.ranks[(self.rank_in_group - 1) % self.world_size] - # FIXME: why we need this? - torch.set_default_device(self.device) - self.transport_thread: Optional[ThreadPoolExecutor] = None self.buffer_size = 0 self.buffer_size_lock = threading.Lock() @@ -110,8 +107,7 @@ def _select_device(self, backend: Union[str, Backend]): return "cpu" def _make_metadata(self, tensor: torch.Tensor) -> torch.Tensor: - """ - Create the metadata on based on the input tensor, and move it to GPU. 
+ """Create the metadata on based on the input tensor, and move it to GPU. The metadata's length is `TorchDistributedPipe.METADATA_LENGTH`. Currently, the metadata is a int64 tensor and it includes dtype, number @@ -129,7 +125,9 @@ def _make_metadata(self, tensor: torch.Tensor) -> torch.Tensor: Returns: - metadata: the metadata tensor, on self.device """ - buffer = torch.empty(self.METADATA_LENGTH, dtype=self.METADATA_DTYPE) + buffer = torch.empty(self.METADATA_LENGTH, + dtype=self.METADATA_DTYPE, + device="cpu") buffer[0] = DTYPE2INT[tensor.dtype] ndims = len(tensor.shape) buffer[1] = len(tensor.shape) @@ -139,8 +137,7 @@ def _make_metadata(self, tensor: torch.Tensor) -> torch.Tensor: def _prepare_recv_buffer(self, d_metadata_buffer: torch.Tensor) -> torch.Tensor: - """ - Create a buffer to receive the tensor based on the metadata. + """Create a buffer to receive the tensor based on the metadata. Parameters: - d_metadata_buffer: the metadata tensor on self.device @@ -155,8 +152,7 @@ def _prepare_recv_buffer(self, return torch.empty(shape, dtype=dtype, device=self.device) def _send_metadata(self, d_metadata_buffer: torch.Tensor): - """ - Send the metadata buffer to the target rank. + """Send the metadata buffer to the target rank. """ torch.distributed.send( d_metadata_buffer, @@ -165,8 +161,7 @@ def _send_metadata(self, d_metadata_buffer: torch.Tensor): ) def _recv_metadata(self) -> torch.Tensor: - """ - Receive the metadata buffer from the target rank. + """Receive the metadata buffer from the target rank. 
Returns: - metadata_buffer: the metadata buffer tensor, on self.device @@ -195,11 +190,9 @@ def _send_impl(self, tensor): metadata = self._make_metadata(tensor) self._send_metadata(metadata) - #logger.debug(f"Sent meta {metadata}") torch.distributed.send(tensor.to(self.device), dst=self.target_rank_for_send, group=self.device_group) - #logger.debug(f"Sent tensor {tensor}") def _recv_impl(self) -> torch.Tensor: """ @@ -235,17 +228,14 @@ def send_tensor_wrapper(self, tensor): traceback.print_exc() def block_if_full(self): - """ - Block the current thread if the buffer size is larger than 1e9. - """ + """Block the current thread if the buffer size is larger than 1e9.""" # TODO: replace this 1e9 with a configurable parameter or a constant while self.buffer_size > 1e9: logger.debug("KV cache transfer pipe is full. Waiting...") time.sleep(0.05) def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: - """ - Sends a tensor to the destination rank in a non-blocking way. + """Sends a tensor to the destination rank in a non-blocking way. Flow: send tensor dim -- send tensor shape -- send tensor data """ @@ -254,9 +244,8 @@ def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: if tensor is None: tensor = self.none_tensor - tensor_size = 0 - else: - tensor_size = tensor.element_size() * tensor.numel() + + tensor_size = tensor.element_size() * tensor.numel() assert ( 0 < len(tensor.shape) < self.MAX_TENSOR_DIMENSIONS @@ -294,9 +283,7 @@ def recv_tensor(self) -> Optional[torch.Tensor]: return tensor def close(self): - """ - Close the pipe and release the resources. 
- """ + """Close the pipe and release the resources.""" if (hasattr(self, "transport_thread") and self.transport_thread is not None): self.transport_thread.shutdown() From 76019f1f42c8142b682f6c277f2282decc4c965c Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 21:49:05 +0000 Subject: [PATCH 271/278] bug fix: check isinstance(torch.Tensor) before checking NOne --- .../kv_transfer/kv_lookup_buffer/simple_buffer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index 8dfa61780dda..41c2fba31fbe 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -77,11 +77,13 @@ def _send_tensor_and_dec_size(self, self.data_pipe.send_tensor(tensor) def _get_element_size(self, data: Optional[Union[List, torch.Tensor]]): - - if not data: - return 0 + if isinstance(data, torch.Tensor): return data.element_size() * data.numel() + if not data: + # cannot perform `not data` on a tensor + # so this check needs to go after the check above + return 0 raise AssertionError("Unknown data type %s" % type(data)) From 93ec62b8556e279d2c050bdc1c3247831bd39466 Mon Sep 17 00:00:00 2001 From: KuntaiDu Date: Tue, 24 Sep 2024 22:16:03 +0000 Subject: [PATCH 272/278] make format check happy --- vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py index 41c2fba31fbe..eb052e2e41e1 100644 --- a/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py +++ b/vllm/distributed/kv_transfer/kv_lookup_buffer/simple_buffer.py @@ -77,7 +77,7 @@ def _send_tensor_and_dec_size(self, self.data_pipe.send_tensor(tensor) def _get_element_size(self, data: 
Optional[Union[List, torch.Tensor]]): - + if isinstance(data, torch.Tensor): return data.element_size() * data.numel() if not data: From c5bdf64887725a5719bfb304419318fdc3f49ef0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 10 Oct 2024 09:35:48 -0700 Subject: [PATCH 273/278] Adjust to latest changes of `kv_caches`: it is now always a tensor. --- vllm/worker/model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 91c352d9babe..b7459dc838a5 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1774,7 +1774,7 @@ def need_recv_kv(self, model_input, kv_caches) -> bool: prefill_meta = model_input.attn_metadata.prefill_metadata # check if the current run is profiling - is_profile_run = (kv_caches is None) or (kv_caches[0] is None) + is_profile_run = (kv_caches.numel() == 0) # check if the current run is prefill is_prefill_run = prefill_meta is not None @@ -1796,7 +1796,7 @@ def need_send_kv(self, model_input, kv_caches) -> bool: prefill_meta = model_input.attn_metadata.prefill_metadata # check if the current run is profiling - is_profile_run = (kv_caches is None) or (kv_caches[0] is None) + is_profile_run = (kv_caches.numel() == 0) # check if the current run is prefill is_prefill_run = prefill_meta is not None From 596eb642e3d80823c0c3c7efe2fb7c65ae48e391 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 10 Oct 2024 09:42:29 -0700 Subject: [PATCH 274/278] debug --- vllm/worker/model_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index b7459dc838a5..1949f3b58178 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1774,6 +1774,7 @@ def need_recv_kv(self, model_input, kv_caches) -> bool: prefill_meta = model_input.attn_metadata.prefill_metadata # check if the current run is profiling + print(kv_caches) is_profile_run = (kv_caches.numel() == 0) # check 
if the current run is prefill is_prefill_run = prefill_meta is not None @@ -1796,6 +1797,7 @@ def need_send_kv(self, model_input, kv_caches) -> bool: prefill_meta = model_input.attn_metadata.prefill_metadata # check if the current run is profiling + print(kv_caches) is_profile_run = (kv_caches.numel() == 0) # check if the current run is prefill is_prefill_run = prefill_meta is not None From 683bd9cff790adcbeb3fe8ccbc1b07a33810e518 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 10 Oct 2024 09:44:33 -0700 Subject: [PATCH 275/278] bug fix: kv_caches will be list of torch.tensor([]) in profile run. --- vllm/worker/model_runner.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 1949f3b58178..852c04041349 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1774,8 +1774,7 @@ def need_recv_kv(self, model_input, kv_caches) -> bool: prefill_meta = model_input.attn_metadata.prefill_metadata # check if the current run is profiling - print(kv_caches) - is_profile_run = (kv_caches.numel() == 0) + is_profile_run = (kv_caches[0].numel() == 0) # check if the current run is prefill is_prefill_run = prefill_meta is not None @@ -1797,8 +1796,7 @@ def need_send_kv(self, model_input, kv_caches) -> bool: prefill_meta = model_input.attn_metadata.prefill_metadata # check if the current run is profiling - print(kv_caches) - is_profile_run = (kv_caches.numel() == 0) + is_profile_run = (kv_caches[0].numel() == 0) # check if the current run is prefill is_prefill_run = prefill_meta is not None From 521daba7eead630d1b0c6b5bf4f8ba83580fb22a Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 10 Oct 2024 12:59:40 -0700 Subject: [PATCH 276/278] Relax server start timeout limit --- tests/kv_transfer/disagg_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kv_transfer/disagg_test.py b/tests/kv_transfer/disagg_test.py index 96203b2dc65a..3dfacbdc5fe8 
100644 --- a/tests/kv_transfer/disagg_test.py +++ b/tests/kv_transfer/disagg_test.py @@ -80,7 +80,7 @@ def setup_servers(): # Helper function to wait for server -def wait_for_server(port, timeout=120): +def wait_for_server(port, timeout=240): start_time = time.time() while time.time() - start_time < timeout: try: From 9a190b1097a3b878c5e1477bf7d7552cad0e2e3d Mon Sep 17 00:00:00 2001 From: Feng Ren Date: Wed, 27 Nov 2024 21:20:23 +0800 Subject: [PATCH 277/278] Integrate with Mooncake Transfer Engine Signed-off-by: Feng Ren Co-authored-by: Shangming Cai --- .../kv_pipe/mooncake_distributed_pipe.py | 236 ++++++++++++++++++ vllm/distributed/kv_transfer/vllm_adapter.py | 128 ++++++---- 2 files changed, 322 insertions(+), 42 deletions(-) create mode 100644 vllm/distributed/kv_transfer/kv_pipe/mooncake_distributed_pipe.py diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_distributed_pipe.py new file mode 100644 index 000000000000..660c5ad76393 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_distributed_pipe.py @@ -0,0 +1,236 @@ +import io +import json +import os +import pickle +import time +from concurrent.futures import ThreadPoolExecutor +from dataclasses import dataclass +from typing import List, Optional + +import mooncake_vllm_adaptor as mva +import torch +import zmq + +from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase +from vllm.logger import init_logger + +logger = init_logger(__name__) +NONE_INT = -150886311 + + +@dataclass +class MooncakeTransferEngineConfig: + prefill_url: str + decode_url: str + metadata_server: str + protocol: str + device_name: str + + @staticmethod + def from_file(file_path: str) -> 'MooncakeTransferEngineConfig': + """Load the config from a JSON file.""" + with open(file_path, 'r') as fin: + config = json.load(fin) + return MooncakeTransferEngineConfig( + prefill_url=config.get("prefill_url"), + decode_url=config.get("decode_url"), 
+ metadata_server=config.get("metadata_server"), + protocol=config.get("protocol", "tcp"), + device_name=config.get("device_name", ""), + ) + + @staticmethod + def load_from_env() -> 'MooncakeTransferEngineConfig': + """Load config from a file specified in the environment variable.""" + config_file_path = os.getenv('MOONCAKE_CONFIG_PATH') + if config_file_path is None: + raise ValueError( + "The environment variable 'MOONCAKE_CONFIG_PATH' is not set.") + return MooncakeTransferEngineConfig.from_file(config_file_path) + + +class MooncakeTransferEngine: + """Handles the transfer of data using mooncake_vllm_adaptor and ZeroMQ.""" + + def __init__(self, rank_in_group: int): + self.engine = mva.mooncake_vllm_adaptor() + + try: + self.config = MooncakeTransferEngineConfig.load_from_env() + logger.info("Configuration loaded successfully.") + except ValueError as e: + logger.error(e) + raise + except Exception as exc: + logger.error( + f"An error occurred while loading the configuration: {exc}") + raise + + self.initialize( + self.config.prefill_url if rank_in_group == 0 else + self.config.decode_url, self.config.metadata_server, + self.config.protocol, self.config.device_name) + + self.remote_url = (self.config.decode_url + if rank_in_group == 0 else self.config.prefill_url) + + # Initialize ZeroMQ context and sockets + self.context = zmq.Context() # type: ignore[attr-defined] + self.sender_socket = self.context.socket(zmq.constants.PUSH) + self.receiver_socket = self.context.socket(zmq.constants.PULL) + self.sender_ack = self.context.socket(zmq.constants.PULL) + self.receiver_ack = self.context.socket(zmq.constants.PUSH) + + host, port = self.remote_url.split(':') + self.buffer_cleaner = ThreadPoolExecutor(max_workers=1) + self._setup_sockets(rank_in_group, host, port) + + def _setup_sockets(self, rank_in_group: int, host: str, port: str) -> None: + """Set up ZeroMQ sockets for sending and receiving data.""" + if rank_in_group == 0: + 
self.sender_socket.bind(f"tcp://*:{int(port) + 1}") + self.receiver_socket.connect(f"tcp://{host}:{int(port) + 2}") + self.sender_ack.connect(f"tcp://{host}:{int(port) + 3}") + self.receiver_ack.bind(f"tcp://*:{int(port) + 4}") + else: + self.receiver_socket.connect(f"tcp://{host}:{int(port) + 1}") + self.sender_socket.bind(f"tcp://*:{int(port) + 2}") + self.receiver_ack.bind(f"tcp://*:{int(port) + 3}") + self.sender_ack.connect(f"tcp://{host}:{int(port) + 4}") + + def initialize(self, local_hostname: str, metadata_server: str, + protocol: str, device_name: str) -> None: + """Initialize the mooncake instance.""" + self.engine.initialize(local_hostname, metadata_server, protocol, + device_name) + + def allocate_managed_buffer(self, length: int) -> int: + """Allocate a managed buffer of the specified length.""" + ret = self.engine.allocateManagedBuffer(length) + if ret <= 0: + logger.error("Allocation Return Error") + raise Exception("Allocation Return Error") + return ret + + def free_managed_buffer(self, buffer: int, length: int) -> int: + """Free a previously allocated managed buffer.""" + return self.engine.freeManagedBuffer(buffer, length) + + def transfer_sync(self, buffer: int, peer_buffer_address: int, + length: int) -> int: + """Synchronously transfer data to the specified address.""" + ret = self.engine.transferSync(self.remote_url, buffer, + peer_buffer_address, length) + if ret < 0: + logger.error("Transfer Return Error") + raise Exception("Transfer Return Error") + return ret + + def write_bytes_to_buffer(self, buffer: int, user_data: bytes, + length: int) -> int: + """Write bytes to the allocated buffer.""" + return self.engine.writeBytesToBuffer(buffer, user_data, length) + + def read_bytes_from_buffer(self, buffer: int, length: int) -> bytes: + """Read bytes from the allocated buffer.""" + return self.engine.readBytesFromBuffer(buffer, length) + + def wait_for_ack(self, src_ptr: int, length: int) -> None: + """Asynchronously wait for ACK from the 
receiver.""" + ack = self.sender_ack.recv_pyobj() + if ack != b'ACK': + logger.error("Failed to receive ACK from the receiver") + + self.free_managed_buffer(src_ptr, length) + + def send_bytes(self, user_data: bytes) -> None: + """Send bytes to the remote process.""" + length = len(user_data) + src_ptr = self.allocate_managed_buffer(length) + self.write_bytes_to_buffer(src_ptr, user_data, length) + self.sender_socket.send_pyobj((src_ptr, length)) + self.buffer_cleaner.submit(self.wait_for_ack, src_ptr, length) + + def recv_bytes(self) -> bytes: + """Receive bytes from the remote process.""" + src_ptr, length = self.receiver_socket.recv_pyobj() + dst_ptr = self.allocate_managed_buffer(length) + self.transfer_sync(dst_ptr, src_ptr, length) + ret = self.read_bytes_from_buffer(dst_ptr, length) + + # Buffer cleanup + self.receiver_ack.send_pyobj((b'ACK')) + self.free_managed_buffer(dst_ptr, length) + + return ret + + +class MooncakeDistributedPipe(KVPipeBase): + """MooncakeTransferEngine based Pipe implementation.""" + + def __init__(self, group_ranks: List[List[int]], local_rank: int): + """Initialize the mooncake pipe and set related parameters.""" + self.rank = torch.distributed.get_rank() + self.local_rank = local_rank + + self.ranks = self.get_ranks(group_ranks) + self.world_size = len(self.ranks) + self.rank_in_group = self.ranks.index(self.rank) + + assert self.rank_in_group <= 1 + self.device = self._select_device() + + self.transfer_engine = MooncakeTransferEngine(self.rank_in_group) + self.transport_thread: Optional[ThreadPoolExecutor] = None + self.none_tensor = torch.tensor([NONE_INT], device=self.device) + + def get_ranks(self, group_ranks: List[List[int]]) -> List[int]: + """Get the ranks for the current process.""" + for ranks in group_ranks: + if self.rank in ranks: + return ranks + raise ValueError("Rank not found in group") + + def _select_device(self) -> torch.device: + """Select available device (CUDA or CPU).""" + return torch.device( + 
f"cuda:{self.local_rank}") if torch.cuda.is_available() else "cpu" + + def tensor_hash(self, tensor: torch.Tensor) -> int: + """Calculate the hash value of the tensor.""" + return hash(tensor.data_ptr()) + + def _send_impl(self, tensor: torch.Tensor) -> None: + """Implement the tensor sending logic.""" + value_bytes = pickle.dumps(tensor) + self.transfer_engine.send_bytes(value_bytes) + + def _recv_impl(self) -> torch.Tensor: + """Implement the tensor receiving logic.""" + data = self.transfer_engine.recv_bytes() + return pickle.loads(data) + + def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: + """Send tensor to the target process.""" + if self.transport_thread is None: + self.transport_thread = ThreadPoolExecutor(max_workers=1) + tensor = tensor if tensor is not None else self.none_tensor + assert (0 < len(tensor.shape)) + self.transport_thread.submit(self._send_impl, tensor) + + def recv_tensor(self) -> Optional[torch.Tensor]: + """Receive tensor from other processes.""" + if self.transport_thread is None: + self.transport_thread = ThreadPoolExecutor(max_workers=1) + tensor = self.transport_thread.submit(self._recv_impl).result() + if tensor.numel() == 1 and tensor.item() == NONE_INT: + return None + else: + return tensor + + def close(self) -> None: + """Cleanup logic when closing the pipe.""" + self.transfer_engine.sender_socket.close() + self.transfer_engine.receiver_socket.close() + self.transfer_engine.context.term() # Terminate the ZMQ context + logger.info("Closed the transfer engine and cleaned up resources.") diff --git a/vllm/distributed/kv_transfer/vllm_adapter.py b/vllm/distributed/kv_transfer/vllm_adapter.py index 7516e7c5ff30..ee022da405d4 100644 --- a/vllm/distributed/kv_transfer/vllm_adapter.py +++ b/vllm/distributed/kv_transfer/vllm_adapter.py @@ -23,6 +23,7 @@ if TYPE_CHECKING: from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata +import os from copy import deepcopy import torch @@ -33,6 +34,9 @@ from vllm 
import _custom_ops as ops from vllm.distributed.kv_transfer.kv_lookup_buffer.base import ( KVLookupBufferBase) +from vllm.distributed.kv_transfer.kv_pipe.base import KVPipeBase +from vllm.distributed.kv_transfer.kv_pipe.mooncake_distributed_pipe import ( + MooncakeDistributedPipe) from vllm.distributed.kv_transfer.kv_pipe.torch_distributed_pipe import ( TorchDistributedPipe) from vllm.logger import init_logger @@ -80,34 +84,55 @@ def __init__( self.send_buffer: Optional[KVLookupBufferBase] = None self.recv_buffer: Optional[KVLookupBufferBase] = None + self.send_pipe: Optional[KVPipeBase] = None + self.recv_pipe: Optional[KVPipeBase] = None + self.send_signal_pipe: Optional[KVPipeBase] = None + self.recv_signal_pipe: Optional[KVPipeBase] = None + SimpleKVLookupBuffer = sklb.SimpleKVLookupBuffer + # Check if MOONCAKE_CONFIG_PATH is set + use_mooncake_distributed_pipe = os.getenv( + 'MOONCAKE_CONFIG_PATH') is not None + # In disaggregated prefill, the prefill vLLM only uses send pipe # and the decode vLLM only uses recv pipe # In remote KV cache store, vLLM will use both send pipe and recv pipe # So we build both send pipe and recv pipe for simplicity. 
if IS_KV_PRODUCER: - self.send_pipe = TorchDistributedPipe( - group_ranks, - local_rank, - DISTRIBUTED_BACKEND, - ) - self.send_signal_pipe = TorchDistributedPipe( - group_ranks, - local_rank, - "gloo", - ) - self.recv_pipe = TorchDistributedPipe( - group_ranks, - local_rank, - DISTRIBUTED_BACKEND, - ) - self.recv_signal_pipe = TorchDistributedPipe( - group_ranks, - local_rank, - "gloo", - ) + if use_mooncake_distributed_pipe: + # Use MooncakeDistributedPipe if environment variable is set + self.send_pipe = MooncakeDistributedPipe( + group_ranks, + local_rank, + ) + self.recv_pipe = self.send_pipe + self.send_signal_pipe = self.send_pipe + self.recv_signal_pipe = self.send_pipe + else: + # Use TorchDistributedPipe as default + self.send_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + DISTRIBUTED_BACKEND, + ) + self.recv_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + DISTRIBUTED_BACKEND, + ) + + self.send_signal_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + "gloo", + ) + self.recv_signal_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + "gloo", + ) self.send_buffer = SimpleKVLookupBuffer(self.send_signal_pipe, self.send_pipe, self.lookup_buffer_size) @@ -120,26 +145,40 @@ def __init__( # the current vLLM instance is KV consumer, so it needs to connect # its recv pipe to the send pipe of KV producder - self.recv_pipe = TorchDistributedPipe( - group_ranks, - local_rank, - DISTRIBUTED_BACKEND, - ) - self.recv_signal_pipe = TorchDistributedPipe( - group_ranks, - local_rank, - "gloo", - ) - self.send_pipe = TorchDistributedPipe( - group_ranks, - local_rank, - DISTRIBUTED_BACKEND, - ) - self.send_signal_pipe = TorchDistributedPipe( - group_ranks, - local_rank, - "gloo", - ) + if use_mooncake_distributed_pipe: + # Use MooncakeDistributedPipe if environment variable is set + self.recv_pipe = MooncakeDistributedPipe( + group_ranks, + local_rank, + ) + # We only need to initialize MooncakeDistributedPipe once, it + # supports 
bidirectional transmission + self.send_pipe = self.recv_pipe + self.recv_signal_pipe = self.recv_pipe + self.send_signal_pipe = self.recv_pipe + else: + # Use TorchDistributedPipe as default + self.recv_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + DISTRIBUTED_BACKEND, + ) + self.send_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + DISTRIBUTED_BACKEND, + ) + + self.recv_signal_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + "gloo", + ) + self.send_signal_pipe = TorchDistributedPipe( + group_ranks, + local_rank, + "gloo", + ) self.send_buffer = SimpleKVLookupBuffer(self.send_signal_pipe, self.send_pipe, self.lookup_buffer_size) @@ -163,6 +202,13 @@ def send_kv_caches_and_hidden_states( start_layer = model_executable.model.start_layer end_layer = model_executable.model.end_layer + # fix potential bugs on Volta and Turing GPUs + model_config = model_executable.model.config + hidden_size = model_config.hidden_size + num_heads = model_config.num_key_value_heads + num_hidden_layers = model_config.num_attention_heads + head_size = int(hidden_size/num_hidden_layers) + # query_lens contains new KV caches that are added to vLLM. # so we will send them to decode instance # FIXME(Kuntai): This assume that all requests are prefill. @@ -176,8 +222,6 @@ def send_kv_caches_and_hidden_states( for layer_id in range(start_layer, end_layer): kv_cache = kv_caches[layer_id - start_layer] - _, _, num_heads, head_size = kv_cache[0].shape - key_cache = kv_cache[0].reshape(-1, num_heads, head_size) value_cache = kv_cache[1].reshape(-1, num_heads, head_size) From c1477fbac5345ff9fa04a317d739ade9d9eafb6b Mon Sep 17 00:00:00 2001 From: Shangming Cai Date: Fri, 29 Nov 2024 11:21:05 +0800 Subject: [PATCH 278/278] Fix format to make ruff happy. 
--- .../kv_transfer/kv_pipe/mooncake_distributed_pipe.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/mooncake_distributed_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/mooncake_distributed_pipe.py index 660c5ad76393..f8a52fdc929f 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/mooncake_distributed_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/mooncake_distributed_pipe.py @@ -1,8 +1,6 @@ -import io import json import os import pickle -import time from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass from typing import List, Optional @@ -29,7 +27,7 @@ class MooncakeTransferEngineConfig: @staticmethod def from_file(file_path: str) -> 'MooncakeTransferEngineConfig': """Load the config from a JSON file.""" - with open(file_path, 'r') as fin: + with open(file_path) as fin: config = json.load(fin) return MooncakeTransferEngineConfig( prefill_url=config.get("prefill_url"), @@ -63,7 +61,7 @@ def __init__(self, rank_in_group: int): raise except Exception as exc: logger.error( - f"An error occurred while loading the configuration: {exc}") + "An error occurred while loading the configuration: %s", exc) raise self.initialize( @@ -159,7 +157,7 @@ def recv_bytes(self) -> bytes: ret = self.read_bytes_from_buffer(dst_ptr, length) # Buffer cleanup - self.receiver_ack.send_pyobj((b'ACK')) + self.receiver_ack.send_pyobj(b'ACK') self.free_managed_buffer(dst_ptr, length) return ret @@ -215,7 +213,7 @@ def send_tensor(self, tensor: Optional[torch.Tensor]) -> None: if self.transport_thread is None: self.transport_thread = ThreadPoolExecutor(max_workers=1) tensor = tensor if tensor is not None else self.none_tensor - assert (0 < len(tensor.shape)) + assert (len(tensor.shape) > 0) self.transport_thread.submit(self._send_impl, tensor) def recv_tensor(self) -> Optional[torch.Tensor]: