Skip to content

Commit 3716b20

Browse files
author
Qirui Yang
committed
Add token sharding functions and tests for context parallelism
1 parent 7a51dd0 commit 3716b20

File tree

3 files changed

+3
-1
lines changed

3 files changed

+3
-1
lines changed

vllm/v1/attention/backends/cp_utils.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@ def _cp_shard_positions_for_prefill(
4848
# Compute the token index ranges for the two shards handled by this rank
4949
chunk0_start = cp_rank * cp_shard_size
5050
chunk1_start = (2 * cp_size - cp_rank - 1) * cp_shard_size
51-
5251
chunk0_arange = arange_np[chunk0_start:chunk0_start + cp_shard_size]
5352
chunk1_arange = arange_np[chunk1_start:chunk1_start + cp_shard_size]
5453

vllm/v1/worker/block_table.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from vllm.logger import init_logger
1111
from vllm.utils import cdiv
1212
from vllm.v1.utils import CpuGpuBuffer
13+
from vllm.distributed.parallel_state import get_context_parallel_world_size
1314

1415
logger = init_logger(__name__)
1516

vllm/v1/worker/gpu_model_runner.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@
7272
AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata,
7373
create_fast_prefill_custom_backend,
7474
reorder_batch_to_split_decodes_and_prefills, split_attn_metadata)
75+
from vllm.v1.attention.backends.cp_utils import (
76+
cp_shard_positions_for_prefill, cp_get_computed_positions)
7577
from vllm.v1.cudagraph_dispatcher import CudagraphDispatcher
7678
# yapf conflicts with isort for this block
7779
# yapf: disable

0 commit comments

Comments (0)