diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index b0a48a9f1d45..7150977e9266 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -406,7 +406,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None: # easily by changing the way we layout chunks in the # mamba2 kernels. - base_chunk_size = model_config.get_mamba_chunk_size() + base_chunk_size = mamba_block_size or model_config.get_mamba_chunk_size() attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token) chunk_size = lcm(base_chunk_size, kernel_block_alignment_size) attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index ad6fbee2ec08..98c8f08b0aae 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -13,7 +13,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1 import ( KVConnectorBase_V1, KVConnectorRole, - supports_hma, + SupportsHMA, ) from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats from vllm.logger import init_logger @@ -93,7 +93,11 @@ def __init__( ) connector_vllm_config = copy.copy(self.vllm_config) - connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config) + + # We're dynamically inserting a kv_cache_config variable into the + # connector_vllm_config. This is distinct from the cache_config + # that is already in there. + connector_vllm_config.kv_cache_config = copy.copy(kv_cache_config) # type: ignore[attr-defined] self.connector = KVConnectorFactory.create_connector( config=connector_vllm_config, role=KVConnectorRole.SCHEDULER ) @@ -1327,15 +1331,15 @@ def _connector_finished( block_ids = self.kv_cache_manager.get_block_ids(request.request_id) - if not supports_hma(self.connector): + if not isinstance(self.connector, SupportsHMA): # NOTE(Kuntai): We should deprecate this code path after we enforce # all connectors to support HMA. # Hybrid memory allocator should be already turned off for this # code path, but let's double-check here. assert len(self.kv_cache_config.kv_cache_groups) == 1 return self.connector.request_finished(request, block_ids[0]) - else: - return self.connector.request_finished(request, block_ids) + + return self.connector.request_finished_all_groups(request, block_ids) def _update_waiting_for_remote_kv(self, request: Request) -> bool: """