Skip to content

Commit c9e6658

Browse files
[NIXL] heterogeneous block_size support (#26759)
Signed-off-by: Chendi Xue <[email protected]>
Signed-off-by: Chendi.Xue <[email protected]>
Co-authored-by: Nicolò Lucchesi <[email protected]>
1 parent 363aaee commit c9e6658

File tree

3 files changed

+257
-59
lines changed

3 files changed

+257
-59
lines changed

tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1
4949
PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
5050
DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
5151
GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2}
52+
PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-16}
53+
DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-16}
5254

5355
# Find the git repository root directory
5456
GIT_ROOT=$(git rev-parse --show-toplevel)
@@ -136,6 +138,7 @@ run_tests_for_model() {
136138
vllm serve $model_name \
137139
--port $PORT \
138140
--enforce-eager \
141+
--block-size ${PREFILL_BLOCK_SIZE} \
139142
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
140143
--tensor-parallel-size $PREFILLER_TP_SIZE \
141144
--kv-transfer-config '$KV_CONFIG'"
@@ -177,6 +180,7 @@ run_tests_for_model() {
177180
vllm serve $model_name \
178181
--port $PORT \
179182
--enforce-eager \
183+
--block-size ${DECODE_BLOCK_SIZE} \
180184
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
181185
--kv-transfer-config '$KV_CONFIG'"
182186

tests/v1/kv_connector/unit/test_nixl_connector.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,7 @@ def _nixl_handshake(
407407
# `self.kv_cache_layout` is only forced to HND when vllm engine
408408
# is started. We mock HND here.
409409
kv_cache_layout="HND",
410+
block_size=self.block_size,
410411
),
411412
remote_tp_size=remote_tp_size,
412413
)
@@ -652,6 +653,7 @@ def test_handshake_fails_on_kv_cache_layout_mismatch(self, dist_init):
652653
block_lens=worker.block_len_per_layer,
653654
attn_backend_name=worker.backend_name,
654655
kv_cache_layout=mismatched_layout,
656+
block_size=worker.block_size,
655657
)
656658

657659
with pytest.raises(RuntimeError):
@@ -706,6 +708,7 @@ def test_handshake_succeed_on_kv_cache_layout_mismatch_with_experimental(
706708
block_lens=[i * 2 for i in worker.block_len_per_layer],
707709
attn_backend_name=worker.backend_name,
708710
kv_cache_layout="HND",
711+
block_size=worker.block_size,
709712
)
710713

711714
# We don't check layout for homogeneous TP and MLA for now, as the

0 commit comments

Comments (0)