
Commit 58cbd8f

pisceskkk, FENP, LookAround0301, Jingchun Gao, and zhenwenqi2024 committed
[PCP] common supports for PCP
Co-authored-by: QiuChunshuo <[email protected]>
Co-authored-by: FENP <[email protected]>
Co-authored-by: LookAround <[email protected]>
Co-authored-by: Jingchun Gao <[email protected]>
Co-authored-by: zhenwenqi2024 <[email protected]>
Signed-off-by: QiuChunshuo <[email protected]>
Signed-off-by: FENP <[email protected]>
Signed-off-by: LookAround <[email protected]>
Signed-off-by: Jingchun Gao <[email protected]>
Signed-off-by: zhenwenqi2024 <[email protected]>
1 parent 30873d6 commit 58cbd8f

File tree: 7 files changed (+455, -354 lines)


tests/distributed/test_context_parallel.py

Lines changed: 1 addition & 1 deletion
@@ -196,7 +196,7 @@ def _compare_cp_with_tp(
         str(pp_size),
         "--decode-context-parallel-size",
         str(dcp_size),
-        "--dcp-kv-cache-interleave-size",
+        "--cp-kv-cache-interleave-size",
         str(cp_kv_cache_interleave_size),
         "--distributed-executor-backend",
         distributed_backend,

vllm/config/parallel.py

Lines changed: 0 additions & 5 deletions
@@ -325,11 +325,6 @@ def _validate_parallel_config(self) -> Self:
                 "num_redundant_experts."
             )

-        if self.prefill_context_parallel_size > 1:
-            raise ValueError(
-                "Prefill context parallelism is not fully supported. "
-                "Please set prefill_context_parallel_size to 1."
-            )
         return self

     @property
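
With this guard gone, setting prefill_context_parallel_size above 1 is no longer rejected at validation time, which is what the rest of the commit builds on. A minimal sketch of the behavior change (assuming ParallelConfig can be constructed with defaults, as vLLM config dataclasses typically can; the field name is taken from the diff):

    from vllm.config.parallel import ParallelConfig

    # Before this commit, validation raised "Prefill context parallelism
    # is not fully supported. ..." for any value > 1; now it is accepted.
    cfg = ParallelConfig(prefill_context_parallel_size=2)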

vllm/v1/attention/backends/flash_attn.py

Lines changed: 2 additions & 2 deletions
@@ -45,7 +45,7 @@
     AttentionCGSupport,
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
-    get_dcp_local_seq_lens,
+    get_cp_local_seq_lens,
     get_kv_cache_layout,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
@@ -384,7 +384,7 @@ def schedule(
             )
             dcp_context_kv_lens_cpu = seq_lens_cpu - query_kv_lens_cpu

-            dcp_context_kv_lens_cpu = get_dcp_local_seq_lens(
+            dcp_context_kv_lens_cpu = get_cp_local_seq_lens(
                 dcp_context_kv_lens_cpu,
                 self.dcp_world_size,
                 self.dcp_rank,
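
For context, this hunk first computes how much KV is already cached per request (full sequence length minus the tokens scheduled this step) and then narrows that to the local rank's share via the renamed helper. A toy illustration of the first step with made-up lengths (the per-rank split itself is the get_cp_local_seq_lens helper shown under vllm/v1/attention/backends/utils.py below):

    import torch

    seq_lens_cpu = torch.tensor([100, 37])     # total tokens per request
    query_kv_lens_cpu = torch.tensor([4, 37])  # tokens being computed now
    # KV already sitting in the cache, spread across the dcp ranks:
    dcp_context_kv_lens_cpu = seq_lens_cpu - query_kv_lens_cpu
    assert dcp_context_kv_lens_cpu.tolist() == [96, 0]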

vllm/v1/attention/backends/mla/common.py

Lines changed: 2 additions & 2 deletions
@@ -225,7 +225,7 @@
 from vllm.v1.attention.backends.utils import (
     AttentionMetadataBuilder,
     CommonAttentionMetadata,
-    get_dcp_local_seq_lens,
+    get_cp_local_seq_lens,
     get_per_layer_parameters,
     infer_global_hyperparameters,
     split_decodes_and_prefills,
@@ -831,7 +831,7 @@ def build(
         )

         if self.dcp_world_size > 1:
-            local_context_lens_allranks = get_dcp_local_seq_lens(
+            local_context_lens_allranks = get_cp_local_seq_lens(
                 context_lens_cpu,
                 self.dcp_world_size,
                 None,

vllm/v1/attention/backends/utils.py

Lines changed: 22 additions & 8 deletions
@@ -48,6 +48,19 @@
 def is_valid_kv_cache_layout(value: str) -> bool:
     return value in get_args(KVCacheLayoutType)

+@dataclass
+class PrefillContextParallelMetadata:
+    """
+    Attention metadata for prefill context parallel
+    """
+    q_head_indices: torch.Tensor
+    q_tail_indices: torch.Tensor
+    q_head_start_loc: torch.Tensor
+    kv_for_head_indices: torch.Tensor
+    kv_for_tail_indices: torch.Tensor
+    kv_for_head_indptr: torch.Tensor
+    kv_for_tail_indptr: torch.Tensor
+    q_full_indices: torch.Tensor

 @dataclass
 class CommonAttentionMetadata:
@@ -94,6 +107,7 @@ class CommonAttentionMetadata:
     dcp_local_seq_lens: torch.Tensor | None = None
     """Sequence lengths of the local rank in decode context parallelism world"""

+    pcp_metadata: PrefillContextParallelMetadata | None = None

 def slice_query_start_locs(
     query_start_loc: torch.Tensor,
@@ -1077,35 +1091,35 @@ def compute_causal_conv1d_metadata(query_start_loc_p: torch.Tensor):
     return nums_dict, batch_ptr, token_chunk_offset_ptr


-def get_dcp_local_seq_lens(
+def get_cp_local_seq_lens(
     seq_lens: torch.Tensor,
-    dcp_size: int = 1,
-    dcp_rank: int | None = None,
+    cp_size: int = 1,
+    cp_rank: int | None = None,
     cp_kv_cache_interleave_size: int = 1,
 ) -> torch.Tensor:
     """While using dcp, kv_cache size stored on each rank may be different,
     use this function to calculate split decode seq_lens of each dcp rank.
     Only consider dcp now, we can extend the case of cp based on this.
     """
     num_requests = seq_lens.size(0)
-    if dcp_rank is None:
+    if cp_rank is None:
         rank_offsets = (
-            torch.arange(dcp_size, dtype=torch.int32)
+            torch.arange(cp_size, dtype=torch.int32)
             .unsqueeze(0)
             .repeat(num_requests, 1)
         )
     else:
-        rank_offsets = torch.Tensor([[dcp_rank]]).to(dtype=torch.int32)
+        rank_offsets = torch.Tensor([[cp_rank]]).to(dtype=torch.int32)
     seq_lens_tiled = (
         seq_lens.to(torch.int32).unsqueeze(-1).repeat(1, rank_offsets.shape[1])
     )
     base = (
         seq_lens_tiled
         // cp_kv_cache_interleave_size
-        // dcp_size
+        // cp_size
         * cp_kv_cache_interleave_size
     )
-    remainder = seq_lens_tiled - base * dcp_size
+    remainder = seq_lens_tiled - base * cp_size
     remainder = torch.clip(
         remainder - rank_offsets * cp_kv_cache_interleave_size,
         0,
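
The renamed helper deals each sequence's tokens to the cp ranks round-robin in blocks of cp_kv_cache_interleave_size, so per-rank KV lengths can differ by at most one block. The diff is cut off just before the clip's upper bound and the return, so the following is a standalone sketch of the same layout for a single request, assuming the remainder is clipped to one interleave block per rank and added onto the even base (which matches the visible arithmetic):

    def split_seq_len_across_cp_ranks(
        seq_len: int, cp_size: int, interleave: int = 1
    ) -> list[int]:
        # `base` is each rank's share from the full rounds of cp_size
        # blocks; the leftover blocks go to the lowest ranks first.
        base = seq_len // interleave // cp_size * interleave
        leftover = seq_len - base * cp_size
        return [
            base + min(max(leftover - r * interleave, 0), interleave)
            for r in range(cp_size)
        ]

    # 10 tokens over 4 ranks with interleave 1: two full rounds give
    # base=2, and the 2 leftover tokens land on ranks 0 and 1.
    assert split_seq_len_across_cp_ranks(10, 4) == [3, 3, 2, 2]
    # With interleave 4 over 2 ranks, blocks [0:4] and [8:10] sit on
    # rank 0 and block [4:8] on rank 1.
    assert split_seq_len_across_cp_ranks(10, 2, interleave=4) == [6, 4]

Passing cp_rank=None to the real function vectorizes this over all requests and ranks at once (a [num_requests, cp_size] tensor), which is how mla/common.py calls it above; passing a concrete rank returns just that rank's column, as flash_attn.py does.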

vllm/v1/core/kv_cache_utils.py

Lines changed: 1 addition & 1 deletion
@@ -1222,7 +1222,7 @@ def _report_kv_cache_config(
     dcp_size = vllm_config.parallel_config.decode_context_parallel_size
     pcp_size = vllm_config.parallel_config.prefill_context_parallel_size
     if pcp_size * dcp_size > 1:
-        num_tokens *= (pcp_size * dcp_size)
+        num_tokens *= pcp_size * dcp_size
         logger.info(
             "Multiplying the GPU KV cache size by the cp_world_size %d "
             "(pcp_world_size %d * dcp_world_size %d).",
