@@ -72,6 +72,8 @@ class ParallelConfig:
7272 """Number of pipeline parallel groups."""
7373 tensor_parallel_size : int = 1
7474 """Number of tensor parallel groups."""
75+ prefill_context_parallel_size : int = 1
76+ """Number of prefill context parallel groups."""
7577 data_parallel_size : int = 1
7678 """Number of data parallel groups. MoE layers will be sharded according to
7779 the product of the tensor parallel size and data parallel size."""
@@ -240,14 +242,25 @@ class is dynamically inherited by the worker class. This is used to inject
240242 needs to be divisible by dcp_size."""
241243
242244 dcp_kv_cache_interleave_size : int = 1
243- """Interleave size of kv_cache storage while using dcp or cp > 1,
244- store interleave_size tokens on (d)cp i,
245- then store next interleave_size tokens on (d)cp i+1.
246- Interleave_size=1: token-level align, token i is stored on rank i % (d)cp_size.
247- Interleave_size=block_size: block-level align, first fill the block on first rank,
248- token is stored on rank i+1 block j after rank i block j is full.
249- Block_size should be greater than or equal to dcp_kv_cache_interleave_size.
250- Block_size should be divisible by dcp_kv_cache_interleave_size.
245+ """
246+ Interleave size of kv_cache storage while using DCP.
247+ Deprecated: superseded by cp_kv_cache_interleave_size; this field
248+ will be removed once PCP is fully supported.
249+
250+ """
251+ cp_kv_cache_interleave_size : int = 1
252+ """Interleave size of kv_cache storage while using DCP or PCP.
253+ For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
254+ and `total_cp_world_size = pcp_world_size * dcp_world_size`,
255+ store interleave_size tokens on total_cp_rank i,
256+ then store the next interleave_size tokens on total_cp_rank i+1.
257+ Interleave_size=1: token-level alignment, where token `i` is stored on
258+ total_cp_rank `i % total_cp_world_size`.
259+ Interleave_size=block_size: block-level alignment, where tokens
260+ fill the blocks of earlier ranks first. Tokens are stored
261+ in (rank i+1, block j) only after (rank i, block j) is fully occupied.
262+ Block_size should be greater than or equal to cp_kv_cache_interleave_size.
263+ Block_size should be divisible by cp_kv_cache_interleave_size.
251264 """
252265
253266 _api_process_count : int = Field (default = 1 , gt = 0 )
@@ -312,6 +325,11 @@ def _validate_parallel_config(self) -> Self:
312325 "num_redundant_experts."
313326 )
314327
328+ if self .prefill_context_parallel_size > 1 :
329+ raise ValueError (
330+ "Prefill context parallelism is not fully supported. "
331+ "Please set prefill_context_parallel_size to 1."
332+ )
315333 return self
316334
317335 @property
@@ -508,7 +526,11 @@ def __post_init__(self) -> None:
508526 )
509527
510528 # Continue with the rest of the initialization
511- self .world_size = self .pipeline_parallel_size * self .tensor_parallel_size
529+ self .world_size = (
530+ self .pipeline_parallel_size
531+ * self .tensor_parallel_size
532+ * self .prefill_context_parallel_size
533+ )
512534
513535 if self .distributed_executor_backend == "external_launcher" :
514536 logger .info ("Using external launcher for distributed inference." )
0 commit comments