Skip to content

Commit c9e6658

Browse files
[NIXL] heterogeneous block_size support (#26759)
Signed-off-by: Chendi Xue <[email protected]>
Signed-off-by: Chendi.Xue <[email protected]>
Co-authored-by: Nicolò Lucchesi <[email protected]>
1 parent 363aaee commit c9e6658

File tree

3 files changed

+257
-59
lines changed

3 files changed

+257
-59
lines changed

tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1
4949
PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
5050
DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
5151
GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.2}
52+
PREFILL_BLOCK_SIZE=${PREFILL_BLOCK_SIZE:-16}
53+
DECODE_BLOCK_SIZE=${DECODE_BLOCK_SIZE:-16}
5254

5355
# Find the git repository root directory
5456
GIT_ROOT=$(git rev-parse --show-toplevel)
@@ -136,6 +138,7 @@ run_tests_for_model() {
136138
vllm serve $model_name \
137139
--port $PORT \
138140
--enforce-eager \
141+
--block-size ${PREFILL_BLOCK_SIZE} \
139142
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
140143
--tensor-parallel-size $PREFILLER_TP_SIZE \
141144
--kv-transfer-config '$KV_CONFIG'"
@@ -177,6 +180,7 @@ run_tests_for_model() {
177180
vllm serve $model_name \
178181
--port $PORT \
179182
--enforce-eager \
183+
--block-size ${DECODE_BLOCK_SIZE} \
180184
--gpu-memory-utilization $GPU_MEMORY_UTILIZATION \
181185
--kv-transfer-config '$KV_CONFIG'"
182186

tests/v1/kv_connector/unit/test_nixl_connector.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,7 @@ def _nixl_handshake(
407407
# `self.kv_cache_layout` is only forced to HND when vllm engine
408408
# is started. We mock HND here.
409409
kv_cache_layout="HND",
410+
block_size=self.block_size,
410411
),
411412
remote_tp_size=remote_tp_size,
412413
)
@@ -652,6 +653,7 @@ def test_handshake_fails_on_kv_cache_layout_mismatch(self, dist_init):
652653
block_lens=worker.block_len_per_layer,
653654
attn_backend_name=worker.backend_name,
654655
kv_cache_layout=mismatched_layout,
656+
block_size=worker.block_size,
655657
)
656658

657659
with pytest.raises(RuntimeError):
@@ -706,6 +708,7 @@ def test_handshake_succeed_on_kv_cache_layout_mismatch_with_experimental(
706708
block_lens=[i * 2 for i in worker.block_len_per_layer],
707709
attn_backend_name=worker.backend_name,
708710
kv_cache_layout="HND",
711+
block_size=worker.block_size,
709712
)
710713

711714
# We don't check layout for homogeneous TP and MLA for now, as the

0 commit comments

Comments (0)