
Commit f7f0c2a

zyongye, heheda12345, youkaichao, LucasWilkinson, and robertgshaw2-redhat authored and committed
[New Model] DeepSeek-V3.2 (Rebased to Main) (vllm-project#25896)
Signed-off-by: Chen Zhang <[email protected]>
Signed-off-by: youkaichao <[email protected]>
Signed-off-by: Lucas Wilkinson <[email protected]>
Signed-off-by: mgoin <[email protected]>
Signed-off-by: NickLucche <[email protected]>
Signed-off-by: Yongye Zhu <[email protected]>
Signed-off-by: Barry Kang <[email protected]>
Signed-off-by: Lucia Fang <[email protected]>
Co-authored-by: Chen Zhang <[email protected]>
Co-authored-by: youkaichao <[email protected]>
Co-authored-by: Lucas Wilkinson <[email protected]>
Co-authored-by: Robert Shaw <[email protected]>
Co-authored-by: Lucas Wilkinson <[email protected]>
Co-authored-by: yewentao256 <[email protected]>
Co-authored-by: Wentao Ye <[email protected]>
Co-authored-by: mgoin <[email protected]>
Co-authored-by: Lucia Fang <[email protected]>
Co-authored-by: Lucia Fang <[email protected]>
Co-authored-by: NickLucche <[email protected]>
Co-authored-by: Siyuan Fu <[email protected]>
Co-authored-by: Matthew Bonanni <[email protected]>
Co-authored-by: Xiaozhu Meng <[email protected]>
Co-authored-by: Barry Kang <[email protected]>
1 parent c35ae07 commit f7f0c2a

File tree

4 files changed: +95 -147 lines changed

vllm/attention/ops/flashmla.py
vllm/model_executor/models/config.py
vllm/model_executor/models/deepseek_v2.py
vllm/v1/attention/backends/mla/indexer.py


vllm/attention/ops/flashmla.py

Lines changed: 1 addition & 1 deletion

@@ -136,7 +136,7 @@ def flash_mla_with_kvcache(
         descale_k is None
     ), "descale_q and descale_k should be both None or both not None"

-    if indices is None and q.element_size() == 1:
+    if (descale_q is not None) and (descale_k is not None):
         out, softmax_lse = torch.ops._flashmla_extension_C.fwd_kvcache_mla_fp8(
             q, k_cache, head_dim_v, cache_seqlens, block_table, softmax_scale,
             causal, tile_scheduler_metadata, num_splits, descale_q, descale_k)
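
The one-line change above switches the fp8 dispatch from sniffing the query dtype via q.element_size() == 1 to checking the descale tensors directly, which lines up with the assertion that descale_q and descale_k must be provided together. A minimal standalone sketch of that guard, assuming a hypothetical select_mla_path helper (the non-fp8 return value is a placeholder; only fwd_kvcache_mla_fp8 appears in the diff):

from typing import Optional

import torch


def select_mla_path(descale_q: Optional[torch.Tensor],
                    descale_k: Optional[torch.Tensor]) -> str:
    # Invariant asserted in flash_mla_with_kvcache: both descales or neither.
    assert (descale_q is None) == (descale_k is None), \
        "descale_q and descale_k should be both None or both not None"
    # New condition in this commit: take the fp8 kv-cache kernel only when the
    # descale factors are supplied, instead of checking the query element size.
    if (descale_q is not None) and (descale_k is not None):
        return "fwd_kvcache_mla_fp8"
    return "default_mla_path"  # placeholder for the non-fp8 kernel


scale = torch.ones(1, dtype=torch.float32)
print(select_mla_path(None, None))    # default_mla_path
print(select_mla_path(scale, scale))  # fwd_kvcache_mla_fp8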

vllm/model_executor/models/config.py

Lines changed: 14 additions & 14 deletions

@@ -400,7 +400,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                 "exactly equal.", mamba_padding_pct)


-class DeepseekV32ForCausalLM(VerifyAndUpdateConfig):
+class DeepseekV3ForCausalLM(VerifyAndUpdateConfig):

     @classmethod
     def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
@@ -409,20 +409,20 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
         """
         hf_config = vllm_config.model_config.hf_config

-        # Mirror the check in vllm/model_executor/models/deepseek_v2.py
         is_v32 = hasattr(hf_config, "index_topk")
-        assert is_v32

-        # For DeepSeekV3.2, we use a custom fp8 format as default (i.e.
-        # "auto")
-        cache_config = vllm_config.cache_config
-        if cache_config.cache_dtype == "auto" or \
-            cache_config.cache_dtype.startswith("fp8"):
-            cache_config.cache_dtype = "fp8_ds_mla"
-            logger.info("Using custom fp8 kv-cache format for DeepSeekV3.2")
-        if cache_config.cache_dtype == "bfloat16":
-            cache_config.cache_dtype = "auto"
-            logger.info("Using bfloat16 kv-cache for DeepSeekV3.2")
+        if is_v32:
+            # For DeepSeekV3.2, we use a custom fp8 format as default (i.e.
+            # "auto")
+            cache_config = vllm_config.cache_config
+            if cache_config.cache_dtype == "auto" or \
+                cache_config.cache_dtype.startswith("fp8"):
+                cache_config.cache_dtype = "fp8_ds_mla"
+                logger.info(
+                    "Using custom fp8 kv-cache format for DeepSeekV3.2")
+            if cache_config.cache_dtype == "bfloat16":
+                cache_config.cache_dtype = "auto"
+                logger.info("Using bfloat16 kv-cache for DeepSeekV3.2")


 MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
@@ -441,5 +441,5 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
     "MambaForCausalLM": MambaModelConfig,
     "Mamba2ForCausalLM": MambaModelConfig,
     "FalconMambaForCausalLM": MambaModelConfig,
-    "DeepseekV32ForCausalLM": DeepseekV32ForCausalLM,
+    "DeepseekV3ForCausalLM": DeepseekV3ForCausalLM,
 }
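
For context, the kv-cache dtype handling above reduces to a small mapping: when the checkpoint is DeepSeek-V3.2 (detected via the index_topk attribute), both "auto" and any fp8 variant resolve to the custom "fp8_ds_mla" layout, while "bfloat16" is normalized back to "auto"; other models are untouched. A minimal sketch of that mapping, assuming a hypothetical resolve_kv_cache_dtype helper (the plain-string interface is illustrative, not vLLM's API):

def resolve_kv_cache_dtype(cache_dtype: str, is_v32: bool) -> str:
    # Illustrative restatement of the rules in DeepseekV3ForCausalLM above.
    if not is_v32:
        return cache_dtype  # non-V3.2 checkpoints keep whatever was requested
    if cache_dtype == "auto" or cache_dtype.startswith("fp8"):
        return "fp8_ds_mla"  # custom fp8 MLA layout is the V3.2 default
    if cache_dtype == "bfloat16":
        return "auto"  # bfloat16 kv-cache falls back to the generic path
    return cache_dtype


assert resolve_kv_cache_dtype("auto", is_v32=True) == "fp8_ds_mla"
assert resolve_kv_cache_dtype("fp8_e4m3", is_v32=True) == "fp8_ds_mla"
assert resolve_kv_cache_dtype("bfloat16", is_v32=True) == "auto"
assert resolve_kv_cache_dtype("auto", is_v32=False) == "auto"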

vllm/model_executor/models/deepseek_v2.py

Lines changed: 38 additions & 41 deletions

@@ -64,17 +64,13 @@
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.models.utils import sequence_parallel_chunk
-<<<<<<< HEAD
 from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors
 from vllm.utils import cdiv, direct_register_custom_op
 from vllm.utils.deep_gemm import fp8_mqa_logits, fp8_paged_mqa_logits
 from vllm.v1.attention.backends.mla.indexer import (DeepseekV32IndexerBackend,
                                                     DeepseekV32IndexerMetadata)
 from vllm.v1.kv_cache_interface import KVCacheSpec, MLAAttentionSpec
-=======
-from vllm.sequence import IntermediateTensors
->>>>>>> a5354b3ed ([Bugfix][WideEP] Apply TP Attn + EP MoE fix to other models (#24982))

 from .interfaces import MixtureOfExperts, SupportsLoRA, SupportsPP
 from .utils import (PPMissingLayer, is_pp_missing_parameter,
@@ -587,43 +583,44 @@ def sparse_attn_indexer(
     topk_indices_buffer[:hidden_states.shape[0]] = -1
     if has_prefill:
         prefill_metadata = attn_metadata.prefill
-        for chunk in prefill_metadata.chunks:
-            k_fp8 = torch.empty([chunk.total_seq_lens, head_dim],
-                                device=k.device,
-                                dtype=torch.float8_e4m3fn)
-            k_scale = torch.empty([chunk.total_seq_lens, 1],
-                                  device=k.device,
-                                  dtype=torch.float32)
-            cp_gather_indexer_k_quant_cache(
-                kv_cache,
-                k_fp8,
-                k_scale,
-                chunk.block_table,
-                chunk.cu_seq_lens,
-                chunk.num_reqs,
-            )
-            logits = fp8_mqa_logits(
-                q_fp8[chunk.token_start:chunk.token_end],
-                (k_fp8, k_scale),
-                weights[chunk.token_start:chunk.token_end],
-                chunk.cu_seqlen_ks,
-                chunk.cu_seqlen_ke,
-            )
-            topk_indices = logits.topk(min(topk_tokens, logits.shape[-1]),
-                                       dim=-1)[1]
-            topk_indices -= chunk.cu_seqlen_ks[:, None]
-            mask_lo = topk_indices >= 0
-            mask_hi = topk_indices - (chunk.cu_seqlen_ke -
-                                      chunk.cu_seqlen_ks)[:, None] < 0
-            mask = torch.full_like(topk_indices,
-                                   False,
-                                   dtype=torch.bool,
-                                   device=topk_indices.device)
-            mask = mask_lo & mask_hi
-            topk_indices = topk_indices.masked_fill(~mask, -1)
-            topk_indices_buffer[
-                chunk.token_start:chunk.token_end, :topk_indices.
-                shape[-1]] = topk_indices.to(dtype=torch.int32)
+        num_prefills = attn_metadata.num_prefills
+        k_fp8 = torch.empty([prefill_metadata.total_seq_lens, head_dim],
+                            device=k.device,
+                            dtype=torch.float8_e4m3fn)
+        k_scale = torch.empty([prefill_metadata.total_seq_lens, 1],
+                              device=k.device,
+                              dtype=torch.float32)
+        cp_gather_indexer_k_quant_cache(
+            kv_cache,
+            k_fp8,
+            k_scale,
+            prefill_metadata.block_table,
+            prefill_metadata.cu_seq_lens,
+            num_prefills,
+        )
+        cu_seqlen_ks = prefill_metadata.cu_seqlen_ks
+        cu_seqlen_ke = prefill_metadata.cu_seqlen_ke
+        num_tokens = attn_metadata.num_actual_tokens
+        logits = fp8_mqa_logits(
+            q_fp8[num_decode_tokens:num_tokens],
+            (k_fp8, k_scale),
+            weights[num_decode_tokens:num_tokens],
+            cu_seqlen_ks,
+            cu_seqlen_ke,
+        )
+        topk_indices = logits.topk(min(topk_tokens, logits.shape[-1]),
+                                   dim=-1)[1]
+        topk_indices -= cu_seqlen_ks[:, None]
+        mask_lo = topk_indices >= 0
+        mask_hi = topk_indices - (cu_seqlen_ke - cu_seqlen_ks)[:, None] < 0
+        mask = torch.full_like(topk_indices,
+                               False,
+                               dtype=torch.bool,
+                               device=topk_indices.device)
+        mask = mask_lo & mask_hi
+        topk_indices = topk_indices.masked_fill(~mask, -1)
+        topk_indices_buffer[num_decode_tokens:num_tokens, :topk_indices.
+                            shape[-1]] = topk_indices.to(dtype=torch.int32)

     if has_decode:
         decode_metadata = attn_metadata.decode
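
The rewritten prefill path runs the indexer once over all prefill tokens instead of looping over chunks, so the top-k indices come back in the coordinates of the flattened KV buffer and must be shifted into per-token local coordinates, with out-of-span entries masked to -1. A standalone sketch of that masking step with illustrative values (the tensor contents are made up; the operations mirror the diff):

import torch

# Two prefill query tokens, global top-k of 4. cu_seqlen_ks/ke hold each
# token's [start, end) span inside the flattened KV buffer.
topk_indices = torch.tensor([[0, 2, 5, 6],
                             [4, 5, 9, 11]])  # global KV positions
cu_seqlen_ks = torch.tensor([0, 4])           # span start per token
cu_seqlen_ke = torch.tensor([3, 10])          # span end per token (exclusive)

# Shift into local coordinates, then drop anything outside [0, ke - ks).
topk_indices = topk_indices - cu_seqlen_ks[:, None]
mask_lo = topk_indices >= 0
mask_hi = topk_indices - (cu_seqlen_ke - cu_seqlen_ks)[:, None] < 0
mask = mask_lo & mask_hi
topk_indices = topk_indices.masked_fill(~mask, -1)

print(topk_indices)
# tensor([[ 0,  2, -1, -1],
#         [ 0,  1,  5, -1]])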

vllm/v1/attention/backends/mla/indexer.py

Lines changed: 42 additions & 91 deletions

@@ -49,20 +49,14 @@ def get_kv_cache_stride_order() -> tuple[int, ...]:


 @dataclass
-class DeepseekV32IndexerPrefillChunkMetadata:
+class DeepseekV32IndexerPrefillMetadata:
     block_table: torch.Tensor
+    query_start_loc: torch.Tensor
+    max_query_len: int
     cu_seqlen_ks: torch.Tensor
     cu_seqlen_ke: torch.Tensor
     cu_seq_lens: torch.Tensor
     total_seq_lens: int
-    token_start: int
-    token_end: int
-    num_reqs: int
-
-
-@dataclass
-class DeepseekV32IndexerPrefillMetadata:
-    chunks: list[DeepseekV32IndexerPrefillChunkMetadata]


 @dataclass
@@ -104,8 +98,8 @@ class DeepseekV32IndexerMetadata:

 # TODO (zyongye) optimize this, this is now vibe coded
 def kv_spans_from_batches(
-        start_seq_loc: torch.Tensor, seq_len_per_batch: torch.Tensor,
-        device: torch.device) -> tuple[torch.Tensor, torch.Tensor]:
+        start_seq_loc: torch.Tensor,
+        seq_len_per_batch: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Args:
         start_seq_loc: 1D long tensor [B+1], cumulative counts of
@@ -128,14 +122,15 @@ def kv_spans_from_batches(
       are the **last** `counts[i]` positions of that sequence.
     """
     q = start_seq_loc.to(dtype=torch.long)
-    L = seq_len_per_batch.to(dtype=torch.long)
+    L = seq_len_per_batch.to(dtype=torch.long, device=q.device)
     assert q.dim() == 1 and L.dim() == 1
     assert q.numel() == L.numel() + 1, "start_seq_loc must have length B+1"

     # Selected tokens per batch and totals
     counts = q[1:] - q[:-1]  # [B]
     N = int(q[-1].item())  # total selected tokens
     B = L.numel()
+    device = L.device

     if N == 0:
         return (torch.empty(0, dtype=torch.long, device=device),
@@ -145,7 +140,8 @@ def kv_spans_from_batches(
     kv_starts_per_batch = torch.cumsum(L, dim=0) - L  # [B]

     # For each selected token, which batch does it belong to?
-    batch_id = torch.repeat_interleave(torch.arange(B), counts)  # [N]
+    batch_id = torch.repeat_interleave(torch.arange(B, device=device),
+                                       counts)  # [N]

     # Map batch KV start to each token
     start_tensor = kv_starts_per_batch[batch_id]  # [N]
@@ -155,56 +151,27 @@ def kv_spans_from_batches(
     L_expand = torch.repeat_interleave(L, counts)  # [N]
     m_expand = torch.repeat_interleave(counts, counts)  # [N]
     # position within the selected block: 1..counts[b]
-    pos_within = (torch.arange(N, dtype=torch.long) -
+    pos_within = (torch.arange(N, device=device, dtype=torch.long) -
                   torch.repeat_interleave(q[:-1], counts) + 1)

     local_pos = L_expand - m_expand + pos_within  # [N], 1-based
     end_location = start_tensor + local_pos  # exclusive end

-    return start_tensor.int().to(device), end_location.int().to(device)
+    return start_tensor.int(), end_location.int()


 def get_max_prefill_buffer_size(vllm_config: VllmConfig):
     max_model_len = vllm_config.model_config.max_model_len
-    # NOTE(Chen): 2 is a magic number for controlling the prefill buffer size.
-    # May be tuned later.
-    return max_model_len * 2
-
-
-def split_prefill_chunks(seq_lens_cpu: torch.Tensor,
-                         max_prefill_buffer_size: int,
-                         reqs_start: int) -> list[tuple[int, int]]:
-    """
-    Split the prefill chunks into a list of tuples of (reqs_start, reqs_end)
-    such that the total sequence length of each chunk is less than the
-    maximum prefill buffer size.
-
-    Args:
-        seq_lens_cpu: The sequence lengths of the prefill requests.
-        max_prefill_buffer_size: The maximum prefill buffer size.
-        reqs_start: The start index of the prefill requests.
-
-    Returns:
-        A list of tuples of (reqs_start, reqs_end).
-    """
-    chunk_seq_ids = []
-    total_seq_lens = 0
-    for i in range(reqs_start, len(seq_lens_cpu)):
-        cur_seq_len = seq_lens_cpu[i].item()
-        assert cur_seq_len <= max_prefill_buffer_size
-        total_seq_lens += cur_seq_len
-        if total_seq_lens > max_prefill_buffer_size:
-            chunk_seq_ids.append((reqs_start, i))
-            reqs_start = i
-            total_seq_lens = cur_seq_len
-    if total_seq_lens > 0:
-        chunk_seq_ids.append((reqs_start, len(seq_lens_cpu)))
-    return chunk_seq_ids
+    # max_num_batched_tokens = \
+    #     vllm_config.scheduler_config.max_num_batched_tokens
+    max_num_seq = vllm_config.scheduler_config.max_num_seqs
+    # NOTE(Chen): an estimated max size of flattened_kv. Need to double check.
+    return max_model_len * max_num_seq


 class DeepseekV32IndexerMetadataBuilder(AttentionMetadataBuilder):
     cudagraph_support: ClassVar[AttentionCGSupport] = \
-        AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
+        AttentionCGSupport.UNIFORM_BATCH

     reorder_batch_threshold: int = 1

@@ -234,33 +201,6 @@ def __init__(self, *args, **kwargs):
                                          dtype=torch.int32,
                                          device=self.device)

-    def build_one_prefill_chunk(self, reqs_start, reqs_end,
-                                query_start_loc_cpu, seq_lens_cpu,
-                                block_table):
-        prefill_query_start_loc = query_start_loc_cpu[
-            reqs_start:reqs_end + 1] - query_start_loc_cpu[reqs_start]
-        cu_seqlen_ks, cu_seqlen_ke = kv_spans_from_batches(
-            prefill_query_start_loc, seq_lens_cpu[reqs_start:reqs_end],
-            self.device)
-        token_start = query_start_loc_cpu[reqs_start].item()
-        token_end = query_start_loc_cpu[reqs_end].item()
-        total_seq_lens = seq_lens_cpu[reqs_start:reqs_end].sum()
-        assert total_seq_lens <= self.max_prefill_buffer_size
-        cu_seq_lens = torch.cat([
-            torch.zeros(1, dtype=torch.int32),
-            seq_lens_cpu[reqs_start:reqs_end].cumsum(dim=0)
-        ]).to(torch.int32).to(self.device)
-        return DeepseekV32IndexerPrefillChunkMetadata(
-            cu_seqlen_ks=cu_seqlen_ks,
-            cu_seqlen_ke=cu_seqlen_ke,
-            cu_seq_lens=cu_seq_lens,
-            total_seq_lens=total_seq_lens,
-            block_table=block_table[reqs_start:reqs_end],
-            token_start=token_start,
-            token_end=token_end,
-            num_reqs=reqs_end - reqs_start,
-        )
-
     def build(self,
               common_prefix_len: int,
               common_attn_metadata: CommonAttentionMetadata,
@@ -269,7 +209,11 @@ def build(self,
         num_reqs = common_attn_metadata.num_reqs
         num_tokens = common_attn_metadata.num_actual_tokens

-        query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu
+        device = self.device
+        block_table_tensor = common_attn_metadata.block_table_tensor
+
+        query_start_loc = common_attn_metadata.query_start_loc
+
         num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = \
             split_decodes_and_prefills(
                 common_attn_metadata,
@@ -280,20 +224,27 @@ def build(self,

         prefill_metadata = None
         if num_prefills > 0:
-            chunk_seq_ids = split_prefill_chunks(
-                common_attn_metadata.seq_lens_cpu,
-                self.max_prefill_buffer_size,
-                num_decodes,
-            )
-            chunks = [
-                self.build_one_prefill_chunk(
-                    reqs_start, reqs_end, query_start_loc_cpu,
-                    common_attn_metadata.seq_lens_cpu,
-                    common_attn_metadata.block_table_tensor)
-                for reqs_start, reqs_end in chunk_seq_ids
-            ]
+            reqs_start = num_decodes
+            prefill_query_start_loc = query_start_loc[
+                reqs_start:] - query_start_loc[reqs_start]
+            cu_seqlen_ks, cu_seqlen_ke = kv_spans_from_batches(
+                prefill_query_start_loc,
+                common_attn_metadata.seq_lens[reqs_start:])
+            total_seq_lens = common_attn_metadata.seq_lens[reqs_start:].sum()
+            assert total_seq_lens < self.max_prefill_buffer_size
+            cu_seq_lens = torch.cat([
+                torch.zeros(1, dtype=torch.int32, device=device),
+                common_attn_metadata.seq_lens[reqs_start:].cumsum(dim=0)
+            ]).to(torch.int32).cuda()
             prefill_metadata = DeepseekV32IndexerPrefillMetadata(
-                chunks=chunks, )
+                block_table=block_table_tensor[reqs_start:, ...],
+                query_start_loc=prefill_query_start_loc,
+                max_query_len=common_attn_metadata.max_query_len,
+                cu_seqlen_ks=cu_seqlen_ks,
+                cu_seqlen_ke=cu_seqlen_ke,
+                cu_seq_lens=cu_seq_lens,
+                total_seq_lens=total_seq_lens,
+            )

         decode_metadata = None
         if num_decodes > 0:
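
With chunking gone, kv_spans_from_batches is called once over the whole prefill slice; it maps every selected query token to a [start, end) span inside the flattened KV buffer. The following is a standalone re-derivation of that span arithmetic with a small worked example (the helper name kv_spans and the input values are illustrative; the math follows the function body shown above):

import torch


def kv_spans(start_seq_loc: torch.Tensor,
             seq_len_per_batch: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    q = start_seq_loc.to(torch.long)
    L = seq_len_per_batch.to(torch.long)
    counts = q[1:] - q[:-1]                 # selected tokens per batch [B]
    N = int(q[-1])                          # total selected tokens
    B = L.numel()
    kv_starts = torch.cumsum(L, dim=0) - L  # KV start offset of each batch
    batch_id = torch.repeat_interleave(torch.arange(B), counts)  # [N]
    start = kv_starts[batch_id]
    # The i-th selected token of a batch is the i-th of its *last* counts[b]
    # positions, so its exclusive end is start + (L - counts) + i (1-based i).
    pos_within = torch.arange(N) - torch.repeat_interleave(q[:-1], counts) + 1
    local_pos = torch.repeat_interleave(L - counts, counts) + pos_within
    return start.int(), (start + local_pos).int()


# Two prefill requests with 2 and 3 query tokens and KV lengths 4 and 7.
ks, ke = kv_spans(torch.tensor([0, 2, 5]), torch.tensor([4, 7]))
print(ks.tolist())  # [0, 0, 4, 4, 4]
print(ke.tolist())  # [3, 4, 9, 10, 11]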

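One more detail from the build() rewrite: decode requests are reordered to the front of the batch, so the prefill-only query_start_loc is obtained by slicing from num_decodes and re-basing to zero. A tiny sketch with illustrative numbers:

import torch

# Layout after reordering: 2 decode requests (1 token each) followed by
# 2 prefill requests with 3 and 5 query tokens.
query_start_loc = torch.tensor([0, 1, 2, 5, 10])
num_decodes = 2

reqs_start = num_decodes
prefill_query_start_loc = (query_start_loc[reqs_start:] -
                           query_start_loc[reqs_start])
print(prefill_query_start_loc.tolist())  # [0, 3, 8]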