vllm-project · simon-mo · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025 · Oct 30, 2025
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
@@ -406,7 +406,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             # easily by changing the way we layout chunks in the
             # mamba2 kernels.
 
-            base_chunk_size = model_config.get_mamba_chunk_size()
+            base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size()
             attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token)
             chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)
             attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size)

@@ -93,7 +93,11 @@
             )
 
             connector_vllm_config = copy.copy(self.vllm_config)
-            connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config)
+
+            # Dynamically attach a kv_cache_config attribute to the copied
+            # connector_vllm_config. It is distinct from the cache_config
+            # attribute that the config already has.
+            connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config)  # type: ignore[attr-defined]
             self.connector = KVConnectorFactory.create_connector(
                 config=connector_vllm_config, role=KVConnectorRole.SCHEDULER
             )
@@ -1335,7 +1339,7 @@
            assert len(self.kv_cache_config.kv_cache_groups) == 1
            return self.connector.request_finished(request, block_ids[0])
        else:
            return self.connector.request_finished(request, block_ids)
-
+            return self.connector.request_finished(request, block_ids)  # type: ignore[attr-defined]
-
+            return self.connector.request_finished(request, block_ids)  # type: ignore[attr-defined]

    def _update_waiting_for_remote_kv(self, request: Request) -> bool:
        """