@@ -72,6 +72,8 @@ class ParallelConfig:
7272 """Number of pipeline parallel groups."""
7373 tensor_parallel_size : int = 1
7474 """Number of tensor parallel groups."""
75+ prefill_context_parallel_size : int = 1
76+ """Number of prefill context parallel groups."""
7577 data_parallel_size : int = 1
7678 """Number of data parallel groups. MoE layers will be sharded according to
7779 the product of the tensor parallel size and data parallel size."""
@@ -227,15 +229,19 @@ class is dynamically inherited by the worker class. This is used to inject
227229 not change by dcp, it simply reuses the GPUs of TP group, and tp_size
228230 needs to be divisible by dcp_size."""
229231
230- dcp_kv_cache_interleave_size : int = 1
231- """Interleave size of kv_cache storage while using dcp or cp > 1,
232- store interleave_size tokens on (d)cp i,
233- then store next interleave_size tokens on (d)cp i+1.
234- Interleave_size=1: token-level align, token i is stored on rank i % (d)cp_size.
235- Interleave_size=block_size: block-level align, first fill the block on first rank,
236- token is stored on rank i+1 block j after rank i block j is full.
237- Block_size should be greater than or equal to dcp_kv_cache_interleave_size.
238- Block_size should be divisible by dcp_kv_cache_interleave_size.
232+ cp_kv_cache_interleave_size : int = 1
233+ """Interleave size of kv_cache storage while using dcp or pcp.
234+ For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
235+ and `total_cp_world_size = pcp_world_size * dcp_world_size`,
236+ store interleave_size tokens on total_cp_rank i,
237+ then store next interleave_size tokens on total_cp_rank i+1.
238+ Interleave_size=1: token-level alignment, where token `i` is stored on
239+ total_cp_rank `i % total_cp_world_size`.
240+ Interleave_size=block_size: block-level alignment, where tokens are
241+ first populated to the preceding ranks. Tokens are then stored
242+ in (rank i+1, block j) only after (rank i, block j) is fully occupied.
243+ Block_size should be greater than or equal to cp_kv_cache_interleave_size.
244+ Block_size should be divisible by cp_kv_cache_interleave_size.
239245 """
240246
241247 _api_process_count : int = Field (default = 1 , gt = 0 )
@@ -300,6 +306,11 @@ def _validate_parallel_config(self) -> Self:
300306 "num_redundant_experts."
301307 )
302308
309+ if self .prefill_context_parallel_size > 1 :
310+ raise ValueError (
311+ "Prefill context parallelism is not fully supported. "
312+ "Please set prefill_context_parallel_size to 1."
313+ )
303314 return self
304315
305316 @property
@@ -479,7 +490,11 @@ def __post_init__(self) -> None:
479490 )
480491
481492 # Continue with the rest of the initialization
482- self .world_size = self .pipeline_parallel_size * self .tensor_parallel_size
493+ self .world_size = (
494+ self .pipeline_parallel_size
495+ * self .tensor_parallel_size
496+ * self .prefill_context_parallel_size
497+ )
483498
484499 if self .distributed_executor_backend == "external_launcher" :
485500 logger .info ("Using external launcher for distributed inference." )
0 commit comments