From 0a60364f3f3b07467bb4341fdc78bde840cd3f63 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 31 Mar 2025 17:35:58 +0800 Subject: [PATCH 001/155] runnable Signed-off-by: Abatom --- .../disagg_xpyd/disagg_prefill_proxy_xpyd.py | 149 +++++++++ .../disagg_xpyd/disagg_prefill_xpyd.sh | 108 ++++++ vllm/config.py | 9 + .../kv_transfer/kv_connector/factory.py | 5 + .../kv_transfer/kv_connector/p2p_connector.py | 309 ++++++++++++++++++ .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 286 ++++++++++++++++ 6 files changed, 866 insertions(+) create mode 100644 examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py create mode 100644 examples/online_serving/disagg_xpyd/disagg_prefill_xpyd.sh create mode 100644 vllm/distributed/kv_transfer/kv_connector/p2p_connector.py create mode 100644 vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py new file mode 100644 index 000000000000..7a1322f5afb2 --- /dev/null +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +import pickle +import random +import socket +import threading +import uuid + +import aiohttp +import zmq +from quart import Quart, make_response, request + +from vllm.logger import init_logger + +prefill_instances: dict[str, str] = {} # http_address: zmq_address +decode_instances: dict[str, str] = {} # http_address: zmq_address + +prefill_cv = threading.Condition() +decode_cv = threading.Condition() + + +def _listen_for_register(poller, router_socket): + while True: + socks = dict(poller.poll()) + if router_socket in socks: + remote_address, message = router_socket.recv_multipart() + # data: {"type": "P", "http_address": "ip:port", + # "zmq_address": "ip:port"} + data = pickle.loads(message) + # print("Received message from %s, data: %s", + # remote_address.decode(), data) + if data["type"] == "P": + global prefill_instances + global prefill_cv + with prefill_cv: + prefill_instances[ + data["http_address"]] = data["zmq_address"] + elif data["type"] == "D": + global decode_instances + global decode_cv + with decode_cv: + decode_instances[ + data["http_address"]] = data["zmq_address"] + else: + print("Unexpected, Received message from %s, data: %s", + remote_address, data) + + +def start_service_discovery(hostname, port): + if not hostname: + hostname = socket.gethostname() + if port == 0: + raise ValueError("Port cannot be 0") + + context = zmq.Context() + router_socket = context.socket(zmq.ROUTER) + router_socket.bind(f"tcp://{hostname}:{port}") + + poller = zmq.Poller() + poller.register(router_socket, zmq.POLLIN) + + _listener_thread = threading.Thread(target=_listen_for_register, + args=[poller, router_socket], + daemon=True) + _listener_thread.start() + return _listener_thread + + +AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) + +app = Quart(__name__) + + +def random_uuid() -> str: + return str(uuid.uuid4().hex) + + +async def forward_request(url, data, request_id): + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + headers = { + "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", + "X-Request-Id": request_id + } + async with session.post(url=url, json=data, + headers=headers) as response: + if response.status == 200: + # if response.headers.get('Transfer-Encoding') == 'chunked': + if True: + async for chunk_bytes in response.content.iter_chunked( + 1024): + 
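+                    # stream each chunk straight back to the caller instead of
+                    # buffering the whole completion in memory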
yield chunk_bytes + else: + content = await response.read() + yield content + + +@app.route('/v1/completions', methods=['POST']) +async def handle_request(): + try: + original_request_data = await request.get_json() + + prefill_request = original_request_data.copy() + # change max_tokens = 1 to let it only do prefill + prefill_request['max_tokens'] = 1 + + global prefill_instances + global prefill_cv + with prefill_cv: + prefill_address, prefill_zmq_address = random.choice(list(prefill_instances.items())) + print("handle_request, prefill_address: %s, zmq_address: %s", prefill_address, + prefill_zmq_address) + + global decode_instances + global decode_cv + with decode_cv: + decode_address, decode_zmq_address = random.choice(list(decode_instances.items())) + print("handle_request, decode_address: %s, zmq_address: %s", decode_address, + decode_zmq_address) + + request_id = f"___prefill_address_{prefill_zmq_address}___decode_address_{decode_zmq_address}_{random_uuid()}" + + # finish prefill + async for _ in forward_request( + f'http://{prefill_address}/v1/completions', prefill_request, + request_id): + continue + + # return decode + generator = forward_request(f'http://{decode_address}/v1/completions', + original_request_data, request_id) + response = await make_response(generator) + response.timeout = None + + return response + + except Exception as e: + import sys + import traceback + exc_info = sys.exc_info() + print("Error occurred in disagg prefill proxy server") + print(e) + print("".join(traceback.format_exception(*exc_info))) + + +if __name__ == '__main__': + t = start_service_discovery("0.0.0.0", 30001) + app.run(host='0.0.0.0', port=10001) + t.join() diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_xpyd.sh b/examples/online_serving/disagg_xpyd/disagg_prefill_xpyd.sh new file mode 100644 index 000000000000..87087565380c --- /dev/null +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_xpyd.sh @@ -0,0 +1,108 @@ +#!/bin/bash +set -xe + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'cleanup' INT + +# Cleanup function +cleanup() { + echo "Caught Ctrl+C, cleaning up..." + # Cleanup commands + pgrep python | xargs kill -9 + pkill -f python + echo "Cleanup complete. Exiting." 
+ exit 0 +} + +export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + +python3 disagg_prefill_proxy_xpyd.py & + +MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct} + +## 2P2D, TP=1 +## prefilling instance, which is the KV producer +#CUDA_VISIBLE_DEVICES=4 vllm serve $MODEL_NAME \ +# --host 0.0.0.0 \ +# --port 20001 \ +# --served-model-name base_model \ +# --max-model-len 8192 \ +# --gpu-memory-utilization 0.8 \ +# --kv-transfer-config \ +# '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_producer","http_port":"20001","kv_port":"21001"}' & +# +## prefilling instance, which is the KV producer +#CUDA_VISIBLE_DEVICES=5 vllm serve $MODEL_NAME \ +# --host 0.0.0.0 \ +# --port 20002 \ +# --served-model-name base_model \ +# --max-model-len 8192 \ +# --gpu-memory-utilization 0.8 \ +# --kv-transfer-config \ +# '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_producer","http_port":"20002","kv_port":"22001"}' & +# +## decoding instance, which is the KV consumer +#CUDA_VISIBLE_DEVICES=6 vllm serve $MODEL_NAME \ +# --host 0.0.0.0 \ +# --port 20003 \ +# --served-model-name base_model \ +# --max-model-len 8192 \ +# --gpu-memory-utilization 0.8 \ +# --kv-transfer-config \ +# '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_consumer","http_port":"20003","kv_port":"23001"}' & +# +## decoding instance, which is the KV consumer +#CUDA_VISIBLE_DEVICES=7 vllm serve $MODEL_NAME \ +# --host 0.0.0.0 \ +# --port 20004 \ +# --served-model-name base_model \ +# --max-model-len 8192 \ +# --gpu-memory-utilization 0.8 \ +# --kv-transfer-config \ +# '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_consumer","http_port":"20004","kv_port":"24001"}' & + + +# 2P2D, TP=2 +# prefilling instance, which is the KV producer +CUDA_VISIBLE_DEVICES=0,1 vllm serve $MODEL_NAME \ + --host 0.0.0.0 \ + --port 20001 \ + --tensor-parallel-size 2 \ + --served-model-name base_model \ + --max-model-len 8192 \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config \ + '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_producer","http_port":"20001","kv_port":"21001"}' & + +# prefilling instance, which is the KV producer +CUDA_VISIBLE_DEVICES=2,3 vllm serve $MODEL_NAME \ + --host 0.0.0.0 \ + --port 20002 \ + --tensor-parallel-size 2 \ + --served-model-name base_model \ + --max-model-len 8192 \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config \ + '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_producer","http_port":"20002","kv_port":"22001"}' & + +# decoding instance, which is the KV consumer +CUDA_VISIBLE_DEVICES=4,5 vllm serve $MODEL_NAME \ + --host 0.0.0.0 \ + --port 20003 \ + --tensor-parallel-size 2 \ + --served-model-name base_model \ + --max-model-len 8192 \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config \ + '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_consumer","http_port":"20003","kv_port":"23001"}' & + +# decoding instance, which is the KV consumer +CUDA_VISIBLE_DEVICES=6,7 vllm serve $MODEL_NAME \ + --host 0.0.0.0 \ + --port 20004 \ + --tensor-parallel-size 2 \ + --served-model-name base_model \ + --max-model-len 8192 \ + --gpu-memory-utilization 0.8 \ + --kv-transfer-config \ + '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_consumer","http_port":"20004","kv_port":"24001"}' & \ No newline at end 
of file diff --git a/vllm/config.py b/vllm/config.py index 6a15109c6744..aea37c97c3be 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2939,6 +2939,15 @@ class KVTransferConfig(BaseModel): # The KV connector port, used to build distributed connection kv_port: int = 14579 + # The HTTP port used by OpenAI is the same as the `--port` parameter. + http_port: int = 20001 + + # The IP of the proxy. + proxy_ip: str = "127.0.0.1" + + # The Port of the proxy. + proxy_port: int = 30001 + # any extra config that the connector may need kv_connector_extra_config: dict[str, Any] = {} diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index e37ce6dc75b0..c4e1a0891462 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -39,6 +39,11 @@ def create_connector(cls, rank: int, local_rank: int, # Register various connectors here. # The registration should not be done in each individual file, as we want to # only load the files corresponding to the current connector. +KVConnectorFactory.register_connector( + "P2pConnector", + "vllm.distributed.kv_transfer.kv_connector.p2p_connector", + "P2pConnector") + KVConnectorFactory.register_connector( "PyNcclConnector", "vllm.distributed.kv_transfer.kv_connector.simple_connector", diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py new file mode 100644 index 000000000000..01e29ed4404e --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py @@ -0,0 +1,309 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +Simple KV Cache Connector for Distributed Machine Learning Inference + +The SimpleConnector transfers KV caches between prefill vLLM worker (KV cache +producer) and decode vLLM worker (KV cache consumer) using PyNcclPipe or +MooncakePipe. + +But the logic can be extended to support other pipe and lookup buffer. 
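+
+In this P2P variant the proxy embeds both peers' ZMQ addresses in the request
+id (roughly ``___prefill_address_<ip:port>___decode_address_<ip:port>_<uuid>``),
+so the prefill worker can parse the decode worker's address and push the KV
+caches and hidden states for that request over a per-peer NCCL channel.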
+""" +import re +from typing import TYPE_CHECKING, List, Tuple, Union + +import torch + +import vllm.envs as envs +from vllm import _custom_ops as ops +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase +from vllm.distributed.kv_transfer.kv_pipe.p2p_nccl_pipe import P2pNcclPipe +from vllm.logger import init_logger +from vllm.sequence import IntermediateTensors + +if TYPE_CHECKING: + from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata + +logger = init_logger(__name__) + + +class P2pConnector(KVConnectorBase): + + def __init__( + self, + rank: int, + local_rank: int, + config: VllmConfig, + ): + self.rank = rank + self.config = config.kv_transfer_config + self.tp_size = config.parallel_config.tensor_parallel_size + self.is_deepseek_mla = config.model_config.is_deepseek_mla + self.use_mla_opt = not envs.VLLM_MLA_DISABLE + + assert self.config.kv_connector == "P2pConnector" + + self.lookup_buffer_size = self.config.kv_buffer_size + + self.p2p_nccl_pipe = P2pNcclPipe( + local_rank=local_rank, + config=self.config, + hostname="", + port_offset=rank, + ) + + def send_kv_caches_and_hidden_states( + self, + model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor], + hidden_or_intermediate_states: Union[torch.Tensor, + IntermediateTensors], + ) -> None: + + # input_tokens_tensor = model_input.input_tokens + seq_lens = model_input.attn_metadata.seq_lens + slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten() + num_prefill_tokens = model_input.attn_metadata.num_prefill_tokens + request_ids = list(model_input.request_ids_to_seq_ids.keys()) + start_layer = model_executable.model.start_layer + end_layer = model_executable.model.end_layer + + model_config = model_executable.model.config + num_heads = int(model_config.num_key_value_heads / self.tp_size) + hidden_size = model_config.hidden_size + num_attention_heads = model_config.num_attention_heads + + # Deepseek's MLA (Multi-head Latent Attention) uses two different + # kv_cache shapes based on whether VLLM_MLA_DISABLE is set to 0. + # When VLLM_MLA_DISABLE=0 (default), forward absorb is applied, + # resulting in a kv_cache shape of [num_blks, blk_size, 1, + # kv_lora_rank + qk_rope_head_dim]. + # When VLLM_MLA_DISABLE=1, standard FA is used instead, leading + # to a kv_cache shape of [2, num_blks, blk_size, + # num_key_value_heads / tp, qk_nope_head_dim + qk_rope_head_dim]. + # For more details, see vllm/attention/backends/mla/common.py. + if self.is_deepseek_mla and self.use_mla_opt: + head_size = model_config.kv_lora_rank + \ + model_config.qk_rope_head_dim + num_heads = 1 + elif self.is_deepseek_mla and not self.use_mla_opt: + head_size = model_config.qk_nope_head_dim + \ + model_config.qk_rope_head_dim + else: + head_size = getattr(model_config, "head_dim", + int(hidden_size // num_attention_heads)) + + # query_lens contains new KV caches that are added to vLLM. + # so we will send them to decode instance + # FIXME(Kuntai): This assume that all requests are prefill. + for idx, slen in enumerate(seq_lens): + start_pos = sum(seq_lens[:idx]) + end_pos = start_pos + slen + + if start_pos >= num_prefill_tokens: + # vllm/worker/model_runner.py::_prepare_model_input_tensors: + # - input_tokens[:num_prefill_tokens] contains prefill tokens. + # - input_tokens[num_prefill_tokens:] contains decode tokens. + logger.warning("You have some decode requests while using " + "SimpleConnector. 
Their KVCache won't be sent.") + break + + # current_tokens = input_tokens_tensor[start_pos:end_pos] + + keys, values = [], [] + + for layer_id in range(start_layer, end_layer): + kv_cache = kv_caches[layer_id - start_layer] + + if self.is_deepseek_mla and self.use_mla_opt: + key_cache = kv_cache.reshape(-1, num_heads, head_size) + value_cache = kv_cache.reshape(-1, num_heads, head_size) + else: + key_cache = kv_cache[0].reshape(-1, num_heads, head_size) + value_cache = kv_cache[1].reshape(-1, num_heads, head_size) + + current_slot_mapping = slot_mapping_flat[start_pos:end_pos] + + keys.append(key_cache[current_slot_mapping].unsqueeze(0)) + values.append(value_cache[current_slot_mapping].unsqueeze(0)) + + keys = torch.cat(keys, dim=0) + values = torch.cat(values, dim=0) + + request_id = request_ids[idx] + ip, port = self.parse_request_id(request_id, True) + remote_address = ip + ":" + str(port + self.rank) + + self.p2p_nccl_pipe.send_tensor(request_id + "keys", keys, + remote_address) + self.p2p_nccl_pipe.send_tensor(request_id + "values", values, + remote_address) + self.p2p_nccl_pipe.send_tensor( + request_id + "hidden", + hidden_or_intermediate_states[start_pos:end_pos], + remote_address) + + logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) + + def recv_kv_caches_and_hidden_states( + self, model_executable: torch.nn.Module, + model_input: "ModelInputForGPUWithSamplingMetadata", + kv_caches: List[torch.Tensor] + ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, + "ModelInputForGPUWithSamplingMetadata"]: + + # When bypass_model_exec is set to False, it means that at least for one + # request its corresponding KV cache or hidden state is missing. + # In this case we need to do prefilling to recompute missing KV cache + # and hidden states. + bypass_model_exec = True + + model_config = model_executable.model.config + + input_tokens_tensor = model_input.input_tokens + seq_lens = model_input.attn_metadata.seq_lens + num_prefill_tokens = model_input.attn_metadata.num_prefill_tokens + slot_mapping = model_input.attn_metadata.slot_mapping.flatten() + request_ids = list(model_input.request_ids_to_seq_ids.keys()) + + hidden_or_intermediate_states_for_one_req = [] + + input_tokens_list = [] + num_computed_tokens_list = [] + start_pos_list = [] + + # enumerate different requests + # FIXME(Kuntai): This impl assumes that all requests are prefill. + for idx, slen in enumerate(seq_lens): + start_pos = sum(seq_lens[:idx]) + end_pos = start_pos + slen + + if start_pos >= num_prefill_tokens: + # This can happen during inflight batching. See: + # vllm/worker/model_runner.py::_prepare_model_input_tensors: + # - input_tokens[:num_prefill_tokens] contains prefill tokens. + # - input_tokens[num_prefill_tokens:] contains decode tokens. 
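+                # This connector assumes every scheduled request is a full
+                # prefill; decode tokens in the same batch (e.g. with chunked
+                # prefill enabled) cannot be matched to transferred KV caches.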
+ logger.warning("You should set --enable_chunked_prefill=False " + "and --max_num_batched_tokens " + "should be equal to --max_seq_len_to_capture") + bypass_model_exec = False + assert start_pos == num_prefill_tokens + break + + current_tokens = input_tokens_tensor[start_pos:end_pos] + num_tokens = slen + + # collecting data for rebuilding the input + input_tokens_list.append(current_tokens) + start_pos_list.append(start_pos) + + request_id = request_ids[idx] + keys = self.p2p_nccl_pipe.recv_tensor(request_id + "keys") + values = self.p2p_nccl_pipe.recv_tensor(request_id + "values") + hidden = self.p2p_nccl_pipe.recv_tensor(request_id + "hidden") + + # ip, port = self.parse_request_id(request_id, False) + # remote_address = ip + ":" + str(port + self.rank) + # keys = self.p2p_nccl_pipe.recv_tensor(request_id + "keys", + # remote_address) + # values = self.p2p_nccl_pipe.recv_tensor(request_id + "values", + # remote_address) + # hidden = self.p2p_nccl_pipe.recv_tensor(request_id + "hidden", + # remote_address) + + num_computed_tokens = current_tokens.shape[0] + num_computed_tokens_list.append(num_computed_tokens) + + # check if both KV cache and the hidden states are received + # If not, need to redo the forwarding to compute missing states + if not all([(num_computed_tokens == num_tokens), hidden is not None + ]): + bypass_model_exec = False + + # update the end position based on how many tokens are cached. + end_pos = start_pos + num_computed_tokens + + # put received KV caches into paged memory + for i in range(model_executable.model.start_layer, + model_executable.model.end_layer): + + kv_cache = kv_caches[i - model_executable.model.start_layer] + layer = model_executable.model.layers[i] + + if self.is_deepseek_mla and self.use_mla_opt: + layer.self_attn.attn = layer.self_attn.mla_attn + k_c_normed_k_pe = keys[ + i - model_executable.model.start_layer].to( + kv_cache.device).squeeze(1) + k_c_normed = k_c_normed_k_pe[:, :model_config.kv_lora_rank] + k_pe = k_c_normed_k_pe[:, model_config.kv_lora_rank:] + ops.concat_and_cache_mla( + k_c_normed, + k_pe, + kv_cache, + slot_mapping[start_pos:end_pos], + layer.self_attn.attn.kv_cache_dtype, + layer.self_attn.attn._k_scale, + ) + else: + key_cache, value_cache = kv_cache[0], kv_cache[1] + ops.reshape_and_cache_flash( + keys[i - model_executable.model.start_layer].to( + key_cache.device), + values[i - model_executable.model.start_layer].to( + value_cache.device), + key_cache, + value_cache, + slot_mapping[start_pos:end_pos], + layer.self_attn.attn.kv_cache_dtype, + layer.self_attn.attn._k_scale, + layer.self_attn.attn._v_scale, + ) + + hidden_or_intermediate_states_for_one_req.append(hidden) + + if not bypass_model_exec: + # Some of the KV cache is not retrieved + # Here we will fall back to normal model forwarding + # But optionally you can adjust model_input so that you only do + # prefilling on those tokens that are missing KV caches. 
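+            # Returning None here (together with bypass_model_exec=False)
+            # makes the caller recompute the prefill instead of relying on
+            # partially received KV caches.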
+ logger.warning( + "[rank%d]: Failed to receive all KVs and hidden " + "states, redo model forwarding.", torch.distributed.get_rank()) + hidden_or_intermediate_states = None + + else: + logger.debug( + "[rank%d]: Successfully received all KVs and hidden " + "states, skip model forwarding.", torch.distributed.get_rank()) + hidden_or_intermediate_states = torch.cat( + hidden_or_intermediate_states_for_one_req, dim=0) + + return hidden_or_intermediate_states, bypass_model_exec, model_input + + @staticmethod + def parse_request_id(request_id: str, is_prefill = True) -> Tuple[str, int]: + logger.info("parse_request_id, request_id: %s, is_prefill: %s", request_id, is_prefill) + # Regular expression to match the string hostname and integer port + if is_prefill: + pattern = r"___decode_address_(.*):(\d+)" + else: + pattern = r"___prefill_address_(.*):(\d+)___" + + # Use re.search to find the pattern in the request_id + match = re.search(pattern, request_id) + if match: + # Extract the ranks + ip = match.group(1) + port = int(match.group(2)) + + logger.info( + f"parse_request_id, {request_id=}, {ip=}, {port=}" + ) + return ip, port + raise ValueError( + f"Request id {request_id} does not contain hostname and port") + + def close(self): + self.p2p_nccl_pipe.close() diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py new file mode 100644 index 000000000000..bfe11adcbb4d --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -0,0 +1,286 @@ +# SPDX-License-Identifier: Apache-2.0 +import logging +import pickle +import socket +import threading +import time +import typing +from collections import deque +from typing import Any, Deque, Dict, List, Optional + +import torch +import zmq + +from vllm.config import KVTransferConfig +from vllm.distributed.device_communicators.pynccl_wrapper import ( + NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum) +from vllm.utils import current_stream, get_ip + +logger = logging.getLogger(__name__) + + +class P2pNcclPipe: + + def __init__(self, + local_rank: int, + config: KVTransferConfig, + hostname: str = "", + port_offset: int = 0, + library_path: Optional[str] = None) -> None: + self.config = config + self.local_rank = local_rank + self.device = torch.device(f"cuda:{self.local_rank}") + self.nccl = NCCLLibrary(library_path) + + if not hostname: + hostname = get_ip() + port = self.config.kv_port + port_offset + if port == 0: + raise ValueError("Port cannot be 0") + self._hostname = hostname + self._port = port + self.zmq_address = f"{self._hostname}:{self._port}" + self.http_address = f"{self._hostname}:{self.config.http_port}" + self.proxy_address = f"{self.config.proxy_ip}:{self.config.proxy_port}" + + self.context = zmq.Context() + self.router_socket = self.context.socket(zmq.ROUTER) + self.router_socket.bind(f"tcp://{self.zmq_address}") + + self.poller = zmq.Poller() + self.poller.register(self.router_socket, zmq.POLLIN) + + self.send_store: Deque[List[Any]] = deque() # tensor_id: torch.Tensor + self.recv_store: Dict[str, + torch.Tensor] = {} # tensor_id: torch.Tensor + self.socks: Dict[str, Any] = {} # remote_address: client socket + self.comms: Dict[str, Any] = {} # remote_address: (ncclComm_t, rank) + + # self.buffer_size = 0 + # self.buffer_size_threshold = self.config.kv_buffer_size + + self.send_store_cv = threading.Condition() + self.recv_store_cv = threading.Condition() + self.comm_cv = threading.Condition() + + logger.info(f"P2pNcclPipe, 
{self._hostname=}, {self._port=}, {port_offset=}, {self.zmq_address=}, {self.http_address=}, {self.proxy_address=}") + + self._listener_thread = threading.Thread( + target=self._listen_for_requests, daemon=True) + self._listener_thread.start() + + self._send_thread = threading.Thread(target=self._send_sync, + daemon=True) + self._send_thread.start() + + if port_offset == 0: + self._ping_thread = threading.Thread(target=self._ping, + daemon=True) + self._ping_thread.start() + + def _create_connect(self, remote_address: typing.Optional[str] = None): + assert remote_address is not None + if remote_address not in self.socks: + sock = self.context.socket(zmq.DEALER) + sock.setsockopt_string(zmq.IDENTITY, self.zmq_address) + sock.connect(f"tcp://{remote_address}") + self.socks[remote_address] = sock + if remote_address in self.comms: + logger.info("comm exists, remote_address: %s, comms: %s", remote_address, self.comms) + return sock, self.comms[remote_address] + + unique_id = self.nccl.ncclGetUniqueId() + unique_id_obj = pickle.dumps(unique_id) + data = {"cmd": "NEW", "unique_id": unique_id_obj} + sock.send(pickle.dumps(data)) + + with torch.cuda.device(self.device): + rank = 0 + comm: ncclComm_t = self.nccl.ncclCommInitRank( + 2, unique_id, rank) + self.comms[remote_address] = (comm, rank) + logger.info("ncclCommInitRank Success, %s ๐Ÿ‘‰ %s, MyRank: %s", + self.zmq_address, remote_address, rank) + + return self.socks[remote_address], self.comms[remote_address] + + def send_tensor( + self, + tensor_id: str, + tensor: torch.Tensor, + remote_address: typing.Optional[str] = None, + ): + tensor = tensor.clone() + if remote_address is None: + with self.recv_store_cv: + self.recv_store[tensor_id] = tensor + self.recv_store_cv.notify() + else: + with self.send_store_cv: + self.send_store.append([tensor_id, remote_address, tensor]) + self.send_store_cv.notify() + + def recv_tensor( + self, + tensor_id: str, + remote_address: typing.Optional[str] = None, + ) -> torch.Tensor: + logger.info("Recv From %s, tensor_id: %s", remote_address, tensor_id) + + if remote_address is None: + with self.recv_store_cv: + while tensor_id not in self.recv_store: + self.recv_store_cv.wait() + return self.recv_store.pop(tensor_id) + + if remote_address not in self.socks: + self._create_connect(remote_address) + + sock = self.socks[remote_address] + comm, rank = self.comms[remote_address] + + data = {"cmd": "GET", "tensor_id": tensor_id} + sock.send(pickle.dumps(data)) + + message = sock.recv() + data = pickle.loads(message) + if data["ret"] == 0: + tensor = torch.empty(data["shape"], + dtype=data["dtype"], + device=self.device) + self._recv(comm, tensor, rank ^ 1) + return tensor + + return None + + def _listen_for_requests(self): + while True: + socks = dict(self.poller.poll()) + if self.router_socket in socks: + remote_address, message = self.router_socket.recv_multipart() + data = pickle.loads(message) + logger.debug("Received message from %s, data: %s", + remote_address.decode(), data) + if data["cmd"] == "NEW": + unique_id = pickle.loads(data["unique_id"]) + with torch.cuda.device(self.device): + rank = 1 + comm: ncclComm_t = self.nccl.ncclCommInitRank( + 2, unique_id, rank) + self.comms[remote_address.decode()] = (comm, rank) + logger.info( + "ncclCommInitRank Success, %s ๐Ÿ‘ˆ %s, MyRank: %s", + self.zmq_address, remote_address.decode(), rank) + elif data["cmd"] == "PUT": + tensor_id = data["tensor_id"] + self.router_socket.send_multipart([remote_address, b"0"]) + tensor = torch.empty(data["shape"], + 
dtype=data["dtype"], + device=self.device) + comm, rank = self.comms[remote_address.decode()] + self._recv(comm, tensor, rank ^ 1) + with self.recv_store_cv: + self.recv_store[tensor_id] = tensor + self.recv_store_cv.notify() + logger.info( + "Recv Tensor, %s ๐Ÿ‘ˆ %s, rank: %s, data: %s, tensor: %s", + self.zmq_address, remote_address.decode(), rank, data, + tensor.shape) + elif data["cmd"] == "GET": + tensor_id = data["tensor_id"] + with self.send_store_cv: + for item in self.send_store: + _tensor_id, _remote_address, tensor = item + if tensor_id == _tensor_id: + data = { + "ret": 0, + "shape": tensor.shape, + "dtype": tensor.dtype + } + else: + data = {"ret": 1} + self.router_socket.send_multipart( + [remote_address, + pickle.dumps(data)]) + if data["ret"] == 0: + self._send(comm, tensor.to(self.device), + rank ^ 1) + break + else: + logger.info( + "Unexpected, Received message from %s, data: %s", + remote_address, data) + + def _send_sync(self): + while True: + with self.send_store_cv: + while not self.send_store: + self.send_store_cv.wait() + + tensor_id, remote_address, tensor = self.send_store.popleft() + + if remote_address not in self.socks: + self._create_connect(remote_address) + + sock = self.socks[remote_address] + comm, rank = self.comms[remote_address] + data = { + "cmd": "PUT", + "tensor_id": tensor_id, + "shape": tensor.shape, + "dtype": tensor.dtype + } + sock.send(pickle.dumps(data)) + + response = sock.recv() + if response != b"0" or tensor is None: + return + + self._send(comm, tensor.to(self.device), rank ^ 1) + logger.info( + "Send Tensor, %s ๐Ÿ‘‰ %s, MyRank: %s, data: %s, tensor: %s", + self.zmq_address, remote_address, rank, data, tensor.shape) + + def _ping(self): + sock = self.context.socket(zmq.DEALER) + sock.setsockopt_string(zmq.IDENTITY, self.zmq_address) + logger.info("ping start, zmq_address: %s", self.zmq_address) + sock.connect(f"tcp://{self.proxy_address}") + data = { + "type": "P" if self.config.is_kv_producer else "D", + "http_address": self.http_address, + "zmq_address": self.zmq_address + } + while True: + sock.send(pickle.dumps(data)) + # logger.info("ping, zmq_address: %s", self.zmq_address) + time.sleep(1) + + def _send(self, comm, tensor: torch.Tensor, dst: int, stream=None): + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}") + if stream is None: + stream = current_stream() + + with self.comm_cv: + self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), dst, + comm, cudaStream_t(stream.cuda_stream)) + + def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): + assert tensor.device == self.device, ( + f"this nccl communicator is created to work on {self.device}, " + f"but the input tensor is on {tensor.device}") + if stream is None: + stream = current_stream() + + with self.comm_cv: + self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), + ncclDataTypeEnum.from_torch(tensor.dtype), src, + comm, cudaStream_t(stream.cuda_stream)) + + def close(self) -> None: + self._listener_thread.join() + self._ping_thread.join() + pass From 016d00421bb2dea8d06964f8431ebc0b72ab992a Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 31 Mar 2025 21:33:44 +0800 Subject: [PATCH 002/155] format Signed-off-by: Abatom --- .../disagg_xpyd/disagg_prefill_proxy_xpyd.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git 
a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index 7a1322f5afb2..c01484ab14a5 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -11,8 +11,6 @@ import zmq from quart import Quart, make_response, request -from vllm.logger import init_logger - prefill_instances: dict[str, str] = {} # http_address: zmq_address decode_instances: dict[str, str] = {} # http_address: zmq_address @@ -44,7 +42,7 @@ def _listen_for_register(poller, router_socket): data["http_address"]] = data["zmq_address"] else: print("Unexpected, Received message from %s, data: %s", - remote_address, data) + remote_address, data) def start_service_discovery(hostname, port): @@ -107,18 +105,22 @@ async def handle_request(): global prefill_instances global prefill_cv with prefill_cv: - prefill_address, prefill_zmq_address = random.choice(list(prefill_instances.items())) - print("handle_request, prefill_address: %s, zmq_address: %s", prefill_address, - prefill_zmq_address) + prefill_address, prefill_zmq_address = random.choice( + list(prefill_instances.items())) + print("handle_request, prefill_address: %s, zmq_address: %s", + prefill_address, prefill_zmq_address) global decode_instances global decode_cv with decode_cv: - decode_address, decode_zmq_address = random.choice(list(decode_instances.items())) - print("handle_request, decode_address: %s, zmq_address: %s", decode_address, - decode_zmq_address) - - request_id = f"___prefill_address_{prefill_zmq_address}___decode_address_{decode_zmq_address}_{random_uuid()}" + decode_address, decode_zmq_address = random.choice( + list(decode_instances.items())) + print("handle_request, decode_address: %s, zmq_address: %s", + decode_address, decode_zmq_address) + + request_id = ( + f"___prefill_address_{prefill_zmq_address}___decode_address_{decode_zmq_address}_{random_uuid()}" + ) # finish prefill async for _ in forward_request( From 448bed9b223f260bbc073e569cb9fc5234597a14 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 31 Mar 2025 21:39:49 +0800 Subject: [PATCH 003/155] format Signed-off-by: Abatom --- .../disagg_xpyd/disagg_prefill_proxy_xpyd.py | 16 ++++++++-------- .../kv_transfer/kv_connector/p2p_connector.py | 14 +++++++------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index c01484ab14a5..d0c5a61c864a 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -105,21 +105,21 @@ async def handle_request(): global prefill_instances global prefill_cv with prefill_cv: - prefill_address, prefill_zmq_address = random.choice( + prefill_address, prefill_zmq_addr = random.choice( list(prefill_instances.items())) - print("handle_request, prefill_address: %s, zmq_address: %s", - prefill_address, prefill_zmq_address) + print("handle_request, prefill_addr: %s, zmq_addr: %s", + prefill_address, prefill_zmq_addr) global decode_instances global decode_cv with decode_cv: - decode_address, decode_zmq_address = random.choice( + decode_addr, decode_zmq_addr = random.choice( list(decode_instances.items())) - print("handle_request, decode_address: %s, zmq_address: %s", - decode_address, decode_zmq_address) + print("handle_request, decode_addr: %s, zmq_addr: %s", decode_addr, + 
decode_zmq_addr) request_id = ( - f"___prefill_address_{prefill_zmq_address}___decode_address_{decode_zmq_address}_{random_uuid()}" + f"___prefill_addr_{prefill_zmq_addr}___decode_addr_{decode_zmq_addr}_{random_uuid()}" ) # finish prefill @@ -129,7 +129,7 @@ async def handle_request(): continue # return decode - generator = forward_request(f'http://{decode_address}/v1/completions', + generator = forward_request(f'http://{decode_addr}/v1/completions', original_request_data, request_id) response = await make_response(generator) response.timeout = None diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py index 01e29ed4404e..07c3e169f8c4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py @@ -283,13 +283,14 @@ def recv_kv_caches_and_hidden_states( return hidden_or_intermediate_states, bypass_model_exec, model_input @staticmethod - def parse_request_id(request_id: str, is_prefill = True) -> Tuple[str, int]: - logger.info("parse_request_id, request_id: %s, is_prefill: %s", request_id, is_prefill) + def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: + logger.info("parse_request_id, request_id: %s, is_prefill: %s", + request_id, is_prefill) # Regular expression to match the string hostname and integer port if is_prefill: - pattern = r"___decode_address_(.*):(\d+)" + pattern = r"___decode_addr_(.*):(\d+)" else: - pattern = r"___prefill_address_(.*):(\d+)___" + pattern = r"___prefill_addr_(.*):(\d+)___" # Use re.search to find the pattern in the request_id match = re.search(pattern, request_id) @@ -298,9 +299,8 @@ def parse_request_id(request_id: str, is_prefill = True) -> Tuple[str, int]: ip = match.group(1) port = int(match.group(2)) - logger.info( - f"parse_request_id, {request_id=}, {ip=}, {port=}" - ) + logger.info("parse_request_id, request_id: %s, ip: %s, port: %s", + request_id, ip, str(port)) return ip, port raise ValueError( f"Request id {request_id} does not contain hostname and port") From 825fe065962e58232330c2c25ac61d5863106aa8 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 31 Mar 2025 21:41:35 +0800 Subject: [PATCH 004/155] format Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index bfe11adcbb4d..a85807f39c0a 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import logging import pickle -import socket import threading import time import typing @@ -63,8 +62,6 @@ def __init__(self, self.recv_store_cv = threading.Condition() self.comm_cv = threading.Condition() - logger.info(f"P2pNcclPipe, {self._hostname=}, {self._port=}, {port_offset=}, {self.zmq_address=}, {self.http_address=}, {self.proxy_address=}") - self._listener_thread = threading.Thread( target=self._listen_for_requests, daemon=True) self._listener_thread.start() @@ -86,7 +83,8 @@ def _create_connect(self, remote_address: typing.Optional[str] = None): sock.connect(f"tcp://{remote_address}") self.socks[remote_address] = sock if remote_address in self.comms: - logger.info("comm exists, remote_address: %s, comms: %s", remote_address, self.comms) + logger.info("comm exists, remote_address: %s, comms: %s", 
+ remote_address, self.comms) return sock, self.comms[remote_address] unique_id = self.nccl.ncclGetUniqueId() From 477fe2b8fd972e40b0ae425540de3b3b6816c3c7 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 31 Mar 2025 21:47:48 +0800 Subject: [PATCH 005/155] format Signed-off-by: Abatom --- .../disagg_xpyd/disagg_prefill_proxy_xpyd.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index d0c5a61c864a..ba8dc8b45c59 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -105,10 +105,10 @@ async def handle_request(): global prefill_instances global prefill_cv with prefill_cv: - prefill_address, prefill_zmq_addr = random.choice( + prefill_addr, prefill_zmq_addr = random.choice( list(prefill_instances.items())) print("handle_request, prefill_addr: %s, zmq_addr: %s", - prefill_address, prefill_zmq_addr) + prefill_addr, prefill_zmq_addr) global decode_instances global decode_cv @@ -123,9 +123,8 @@ async def handle_request(): ) # finish prefill - async for _ in forward_request( - f'http://{prefill_address}/v1/completions', prefill_request, - request_id): + async for _ in forward_request(f'http://{prefill_addr}/v1/completions', + prefill_request, request_id): continue # return decode From dd6dcf9a1d2166278895588eebe4598212c9ab71 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 31 Mar 2025 21:51:30 +0800 Subject: [PATCH 006/155] pass Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index a85807f39c0a..1968de5eeb38 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -281,4 +281,3 @@ def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): def close(self) -> None: self._listener_thread.join() self._ping_thread.join() - pass From 7eb1575fba5dec663fcc9d56ed419b035f968c14 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 31 Mar 2025 22:12:22 +0800 Subject: [PATCH 007/155] format Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_connector/factory.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index c4e1a0891462..59adeab2e01e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -40,8 +40,7 @@ def create_connector(cls, rank: int, local_rank: int, # The registration should not be done in each individual file, as we want to # only load the files corresponding to the current connector. 
KVConnectorFactory.register_connector( - "P2pConnector", - "vllm.distributed.kv_transfer.kv_connector.p2p_connector", + "P2pConnector", "vllm.distributed.kv_transfer.kv_connector.p2p_connector", "P2pConnector") KVConnectorFactory.register_connector( @@ -62,4 +61,4 @@ def create_connector(cls, rank: int, local_rank: int, KVConnectorFactory.register_connector( "MooncakeStoreConnector", "vllm.distributed.kv_transfer.kv_connector.mooncake_store_connector", - "MooncakeStoreConnector") \ No newline at end of file + "MooncakeStoreConnector") From a0d37bb64d5ba14e4939d85968db09d356685121 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 31 Mar 2025 22:17:56 +0800 Subject: [PATCH 008/155] format Signed-off-by: Abatom --- .../distributed/kv_transfer/kv_connector/p2p_connector.py | 8 -------- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 1 + 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py index 07c3e169f8c4..8388a6cb819b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py @@ -1,13 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -""" -Simple KV Cache Connector for Distributed Machine Learning Inference -The SimpleConnector transfers KV caches between prefill vLLM worker (KV cache -producer) and decode vLLM worker (KV cache consumer) using PyNcclPipe or -MooncakePipe. - -But the logic can be extended to support other pipe and lookup buffer. -""" import re from typing import TYPE_CHECKING, List, Tuple, Union diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 1968de5eeb38..febea74cb1fb 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 + import logging import pickle import threading From da335ea21b0ffb5651afa0faf5ea8dda651dc112 Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 1 Apr 2025 18:18:50 +0800 Subject: [PATCH 009/155] move some args to kv_connector_extra_config Signed-off-by: Abatom --- .../disagg_xpyd/disagg_prefill_xpyd.sh | 16 ++++++++-------- vllm/config.py | 9 --------- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 11 ++++++++--- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_xpyd.sh b/examples/online_serving/disagg_xpyd/disagg_prefill_xpyd.sh index 87087565380c..6918d9f3ac9b 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_xpyd.sh +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_xpyd.sh @@ -29,7 +29,7 @@ MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct} # --max-model-len 8192 \ # --gpu-memory-utilization 0.8 \ # --kv-transfer-config \ -# '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_producer","http_port":"20001","kv_port":"21001"}' & +# '{"kv_connector":"P2pConnector","kv_role":"kv_producer","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20001"}}' & # ## prefilling instance, which is the KV producer #CUDA_VISIBLE_DEVICES=5 vllm serve $MODEL_NAME \ @@ -39,7 +39,7 @@ MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct} # --max-model-len 8192 \ # --gpu-memory-utilization 0.8 \ # --kv-transfer-config \ -# 
'{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_producer","http_port":"20002","kv_port":"22001"}' & +# '{"kv_connector":"P2pConnector","kv_role":"kv_producer","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20002"}}' & # ## decoding instance, which is the KV consumer #CUDA_VISIBLE_DEVICES=6 vllm serve $MODEL_NAME \ @@ -49,7 +49,7 @@ MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct} # --max-model-len 8192 \ # --gpu-memory-utilization 0.8 \ # --kv-transfer-config \ -# '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_consumer","http_port":"20003","kv_port":"23001"}' & +# '{"kv_connector":"P2pConnector","kv_role":"kv_consumer","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20003"}}' & # ## decoding instance, which is the KV consumer #CUDA_VISIBLE_DEVICES=7 vllm serve $MODEL_NAME \ @@ -59,7 +59,7 @@ MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct} # --max-model-len 8192 \ # --gpu-memory-utilization 0.8 \ # --kv-transfer-config \ -# '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_consumer","http_port":"20004","kv_port":"24001"}' & +# '{"kv_connector":"P2pConnector","kv_role":"kv_consumer","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20004"}}' & # 2P2D, TP=2 @@ -72,7 +72,7 @@ CUDA_VISIBLE_DEVICES=0,1 vllm serve $MODEL_NAME \ --max-model-len 8192 \ --gpu-memory-utilization 0.8 \ --kv-transfer-config \ - '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_producer","http_port":"20001","kv_port":"21001"}' & + '{"kv_connector":"P2pConnector","kv_role":"kv_producer","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20001"}}' & # prefilling instance, which is the KV producer CUDA_VISIBLE_DEVICES=2,3 vllm serve $MODEL_NAME \ @@ -83,7 +83,7 @@ CUDA_VISIBLE_DEVICES=2,3 vllm serve $MODEL_NAME \ --max-model-len 8192 \ --gpu-memory-utilization 0.8 \ --kv-transfer-config \ - '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_producer","http_port":"20002","kv_port":"22001"}' & + '{"kv_connector":"P2pConnector","kv_role":"kv_producer","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20002"}}' & # decoding instance, which is the KV consumer CUDA_VISIBLE_DEVICES=4,5 vllm serve $MODEL_NAME \ @@ -94,7 +94,7 @@ CUDA_VISIBLE_DEVICES=4,5 vllm serve $MODEL_NAME \ --max-model-len 8192 \ --gpu-memory-utilization 0.8 \ --kv-transfer-config \ - '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_consumer","http_port":"20003","kv_port":"23001"}' & + '{"kv_connector":"P2pConnector","kv_role":"kv_consumer","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20003"}}' & # decoding instance, which is the KV consumer CUDA_VISIBLE_DEVICES=6,7 vllm serve $MODEL_NAME \ @@ -105,4 +105,4 @@ CUDA_VISIBLE_DEVICES=6,7 vllm serve $MODEL_NAME \ --max-model-len 8192 \ --gpu-memory-utilization 0.8 \ --kv-transfer-config \ - '{"proxy_ip":"0.0.0.0","proxy_port":"30001","kv_connector":"P2pConnector","kv_role":"kv_consumer","http_port":"20004","kv_port":"24001"}' & \ No newline at end of file + 
'{"kv_connector":"P2pConnector","kv_role":"kv_consumer","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20004"}}' & \ No newline at end of file diff --git a/vllm/config.py b/vllm/config.py index aea37c97c3be..6a15109c6744 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2939,15 +2939,6 @@ class KVTransferConfig(BaseModel): # The KV connector port, used to build distributed connection kv_port: int = 14579 - # The HTTP port used by OpenAI is the same as the `--port` parameter. - http_port: int = 20001 - - # The IP of the proxy. - proxy_ip: str = "127.0.0.1" - - # The Port of the proxy. - proxy_port: int = 30001 - # any extra config that the connector may need kv_connector_extra_config: dict[str, Any] = {} diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index febea74cb1fb..a853a793e9bd 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -40,9 +40,14 @@ def __init__(self, self._hostname = hostname self._port = port self.zmq_address = f"{self._hostname}:{self._port}" - self.http_address = f"{self._hostname}:{self.config.http_port}" - self.proxy_address = f"{self.config.proxy_ip}:{self.config.proxy_port}" - + self.http_address = ( + f"{self._hostname}:" + f"{self.config.kv_connector_extra_config["http_port"]}" + ) + self.proxy_address = ( + f"{self.config.kv_connector_extra_config["proxy_ip"]}:" + f"{self.config.kv_connector_extra_config["proxy_port"]}" + ) self.context = zmq.Context() self.router_socket = self.context.socket(zmq.ROUTER) self.router_socket.bind(f"tcp://{self.zmq_address}") From 178ca2f6e2c82f8484a2f104f8c2646ef4b1e614 Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 1 Apr 2025 18:35:04 +0800 Subject: [PATCH 010/155] format Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index a853a793e9bd..7f3bb0395a5e 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -42,12 +42,10 @@ def __init__(self, self.zmq_address = f"{self._hostname}:{self._port}" self.http_address = ( f"{self._hostname}:" - f"{self.config.kv_connector_extra_config["http_port"]}" - ) + f"{self.config.kv_connector_extra_config['http_port']}") self.proxy_address = ( - f"{self.config.kv_connector_extra_config["proxy_ip"]}:" - f"{self.config.kv_connector_extra_config["proxy_port"]}" - ) + f"{self.config.kv_connector_extra_config['proxy_ip']}:" + f"{self.config.kv_connector_extra_config['proxy_port']}") self.context = zmq.Context() self.router_socket = self.context.socket(zmq.ROUTER) self.router_socket.bind(f"tcp://{self.zmq_address}") From f03ac47f64eed695f85cc63880f5ed9bb925dc0e Mon Sep 17 00:00:00 2001 From: Abatom Date: Sun, 6 Apr 2025 13:44:17 +0800 Subject: [PATCH 011/155] remove some code comments Signed-off-by: Abatom --- .../kv_transfer/kv_connector/p2p_connector.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py index 8388a6cb819b..0363c3ea23ce 100644 --- a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py @@ -195,15 
+195,6 @@ def recv_kv_caches_and_hidden_states( values = self.p2p_nccl_pipe.recv_tensor(request_id + "values") hidden = self.p2p_nccl_pipe.recv_tensor(request_id + "hidden") - # ip, port = self.parse_request_id(request_id, False) - # remote_address = ip + ":" + str(port + self.rank) - # keys = self.p2p_nccl_pipe.recv_tensor(request_id + "keys", - # remote_address) - # values = self.p2p_nccl_pipe.recv_tensor(request_id + "values", - # remote_address) - # hidden = self.p2p_nccl_pipe.recv_tensor(request_id + "hidden", - # remote_address) - num_computed_tokens = current_tokens.shape[0] num_computed_tokens_list.append(num_computed_tokens) From 603a3550fa5f1202810581215ce9b435a2b4b60b Mon Sep 17 00:00:00 2001 From: Abatom Date: Sun, 6 Apr 2025 15:12:26 +0800 Subject: [PATCH 012/155] Replace pickle with msgpack Signed-off-by: Abatom --- .../disagg_xpyd/disagg_prefill_proxy_xpyd.py | 4 ++-- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 20 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index ba8dc8b45c59..47d03d7e9734 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 import os -import pickle import random import socket import threading import uuid import aiohttp +import msgpack import zmq from quart import Quart, make_response, request @@ -25,7 +25,7 @@ def _listen_for_register(poller, router_socket): remote_address, message = router_socket.recv_multipart() # data: {"type": "P", "http_address": "ip:port", # "zmq_address": "ip:port"} - data = pickle.loads(message) + data = msgpack.loads(message) # print("Received message from %s, data: %s", # remote_address.decode(), data) if data["type"] == "P": diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 7f3bb0395a5e..84172ad395f3 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -1,13 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 import logging -import pickle import threading import time import typing from collections import deque from typing import Any, Deque, Dict, List, Optional +import msgpack import torch import zmq @@ -92,9 +92,9 @@ def _create_connect(self, remote_address: typing.Optional[str] = None): return sock, self.comms[remote_address] unique_id = self.nccl.ncclGetUniqueId() - unique_id_obj = pickle.dumps(unique_id) + unique_id_obj = msgpack.dumps(unique_id) data = {"cmd": "NEW", "unique_id": unique_id_obj} - sock.send(pickle.dumps(data)) + sock.send(msgpack.dumps(data)) with torch.cuda.device(self.device): rank = 0 @@ -142,10 +142,10 @@ def recv_tensor( comm, rank = self.comms[remote_address] data = {"cmd": "GET", "tensor_id": tensor_id} - sock.send(pickle.dumps(data)) + sock.send(msgpack.dumps(data)) message = sock.recv() - data = pickle.loads(message) + data = msgpack.loads(message) if data["ret"] == 0: tensor = torch.empty(data["shape"], dtype=data["dtype"], @@ -160,11 +160,11 @@ def _listen_for_requests(self): socks = dict(self.poller.poll()) if self.router_socket in socks: remote_address, message = self.router_socket.recv_multipart() - data = pickle.loads(message) + data = msgpack.loads(message) logger.debug("Received message from %s, data: %s", 
remote_address.decode(), data) if data["cmd"] == "NEW": - unique_id = pickle.loads(data["unique_id"]) + unique_id = msgpack.loads(data["unique_id"]) with torch.cuda.device(self.device): rank = 1 comm: ncclComm_t = self.nccl.ncclCommInitRank( @@ -203,7 +203,7 @@ def _listen_for_requests(self): data = {"ret": 1} self.router_socket.send_multipart( [remote_address, - pickle.dumps(data)]) + msgpack.dumps(data)]) if data["ret"] == 0: self._send(comm, tensor.to(self.device), rank ^ 1) @@ -232,7 +232,7 @@ def _send_sync(self): "shape": tensor.shape, "dtype": tensor.dtype } - sock.send(pickle.dumps(data)) + sock.send(msgpack.dumps(data)) response = sock.recv() if response != b"0" or tensor is None: @@ -254,7 +254,7 @@ def _ping(self): "zmq_address": self.zmq_address } while True: - sock.send(pickle.dumps(data)) + sock.send(msgpack.dumps(data)) # logger.info("ping, zmq_address: %s", self.zmq_address) time.sleep(1) From 2acb321c43e9cf549b9399dabe12d2cca65428e0 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 7 Apr 2025 10:47:31 +0800 Subject: [PATCH 013/155] fix bug Signed-off-by: Abatom --- .../device_communicators/pynccl_wrapper.py | 20 +++++++++++++++++++ .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 13 ++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 4f04899e92e6..f259c060e1e9 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -271,6 +271,26 @@ def ncclGetUniqueId(self) -> ncclUniqueId: ctypes.byref(unique_id))) return unique_id + def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId: + """ + Reconstructs an `ncclUniqueId` object from bytes data. + + Args: + data: Must be a 128-byte data block (matching NCCL's unique_id format). + + Returns: + ncclUniqueId: The reconstructed NCCL Unique ID object. + + Raises: + ValueError: If the input data length is not 128 bytes. 
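+
+        The bytes are typically obtained on the peer side via
+        ``bytes(unique_id.internal)`` after calling ``ncclGetUniqueId()``.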
+ """ + if len(data) != 128: + raise ValueError(f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes") + + unique_id = ncclUniqueId() + ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128) + return unique_id + def ncclCommInitRank(self, world_size: int, unique_id: ncclUniqueId, rank: int) -> ncclComm_t: comm = ncclComm_t() diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 84172ad395f3..b2c197aac42b 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -92,8 +92,7 @@ def _create_connect(self, remote_address: typing.Optional[str] = None): return sock, self.comms[remote_address] unique_id = self.nccl.ncclGetUniqueId() - unique_id_obj = msgpack.dumps(unique_id) - data = {"cmd": "NEW", "unique_id": unique_id_obj} + data = {"cmd": "NEW", "unique_id": bytes(unique_id.internal)} sock.send(msgpack.dumps(data)) with torch.cuda.device(self.device): @@ -148,7 +147,7 @@ def recv_tensor( data = msgpack.loads(message) if data["ret"] == 0: tensor = torch.empty(data["shape"], - dtype=data["dtype"], + dtype=getattr(torch, data["dtype"]), device=self.device) self._recv(comm, tensor, rank ^ 1) return tensor @@ -164,7 +163,7 @@ def _listen_for_requests(self): logger.debug("Received message from %s, data: %s", remote_address.decode(), data) if data["cmd"] == "NEW": - unique_id = msgpack.loads(data["unique_id"]) + unique_id = self.nccl.unique_id_from_bytes(bytes(data["unique_id"])) with torch.cuda.device(self.device): rank = 1 comm: ncclComm_t = self.nccl.ncclCommInitRank( @@ -177,7 +176,7 @@ def _listen_for_requests(self): tensor_id = data["tensor_id"] self.router_socket.send_multipart([remote_address, b"0"]) tensor = torch.empty(data["shape"], - dtype=data["dtype"], + dtype=getattr(torch, data["dtype"]), device=self.device) comm, rank = self.comms[remote_address.decode()] self._recv(comm, tensor, rank ^ 1) @@ -197,7 +196,7 @@ def _listen_for_requests(self): data = { "ret": 0, "shape": tensor.shape, - "dtype": tensor.dtype + "dtype": str(tensor.dtype).replace("torch.", "") } else: data = {"ret": 1} @@ -230,7 +229,7 @@ def _send_sync(self): "cmd": "PUT", "tensor_id": tensor_id, "shape": tensor.shape, - "dtype": tensor.dtype + "dtype": str(tensor.dtype).replace("torch.", "") } sock.send(msgpack.dumps(data)) From b957dd7307143cb50b34cf7057599ac1c156f64c Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 7 Apr 2025 10:57:39 +0800 Subject: [PATCH 014/155] Out Of Memory Signed-off-by: Abatom --- .../device_communicators/pynccl_wrapper.py | 5 +- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 64 +++++++++++++------ 2 files changed, 47 insertions(+), 22 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index f259c060e1e9..83619c27f22f 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -276,7 +276,7 @@ def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId: Reconstructs an `ncclUniqueId` object from bytes data. Args: - data: Must be a 128-byte data block (matching NCCL's unique_id format). + data: Must be a 128-byte data block (matching NCCL's unique_id). Returns: ncclUniqueId: The reconstructed NCCL Unique ID object. @@ -285,7 +285,8 @@ def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId: ValueError: If the input data length is not 128 bytes. 
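# A standalone sketch (illustration only, not part of the diff) of the
# serialization scheme used by unique_id_from_bytes and the dtype handling in
# p2p_nccl_pipe.py: msgpack cannot encode torch.dtype objects or ctypes
# structures, so the dtype travels as a plain string and the NCCL unique id as
# its raw 128 bytes. FakeUniqueId is a stand-in for ncclUniqueId with the same
# 128-byte `internal` field.
import ctypes

import msgpack
import torch


class FakeUniqueId(ctypes.Structure):
    _fields_ = [("internal", ctypes.c_byte * 128)]


tensor = torch.zeros(2, 3, dtype=torch.float16)
unique_id = FakeUniqueId()

# Sender side: only msgpack-friendly types go on the wire.
payload = msgpack.dumps({
    "dtype": str(tensor.dtype).replace("torch.", ""),  # "float16"
    "shape": list(tensor.shape),
    "unique_id": bytes(unique_id.internal),  # raw 128 bytes
})

# Receiver side: rebuild the dtype and the unique id from the wire format.
data = msgpack.loads(payload)
restored_dtype = getattr(torch, data["dtype"])  # torch.float16
restored_id = FakeUniqueId()
ctypes.memmove(ctypes.addressof(restored_id.internal), data["unique_id"], 128)

assert restored_dtype == torch.float16
assert bytes(restored_id.internal) == bytes(unique_id.internal)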
""" if len(data) != 128: - raise ValueError(f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes") + raise ValueError( + f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes") unique_id = ncclUniqueId() ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index b2c197aac42b..7af6da380d02 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -163,7 +163,8 @@ def _listen_for_requests(self): logger.debug("Received message from %s, data: %s", remote_address.decode(), data) if data["cmd"] == "NEW": - unique_id = self.nccl.unique_id_from_bytes(bytes(data["unique_id"])) + unique_id = self.nccl.unique_id_from_bytes( + bytes(data["unique_id"])) with torch.cuda.device(self.device): rank = 1 comm: ncclComm_t = self.nccl.ncclCommInitRank( @@ -173,20 +174,33 @@ def _listen_for_requests(self): "ncclCommInitRank Success, %s ๐Ÿ‘ˆ %s, MyRank: %s", self.zmq_address, remote_address.decode(), rank) elif data["cmd"] == "PUT": - tensor_id = data["tensor_id"] - self.router_socket.send_multipart([remote_address, b"0"]) - tensor = torch.empty(data["shape"], - dtype=getattr(torch, data["dtype"]), - device=self.device) - comm, rank = self.comms[remote_address.decode()] - self._recv(comm, tensor, rank ^ 1) - with self.recv_store_cv: - self.recv_store[tensor_id] = tensor - self.recv_store_cv.notify() - logger.info( - "Recv Tensor, %s ๐Ÿ‘ˆ %s, rank: %s, data: %s, tensor: %s", - self.zmq_address, remote_address.decode(), rank, data, - tensor.shape) + try: + tensor = torch.empty(data["shape"], + dtype=getattr( + torch, data["dtype"]), + device=self.device) + + self.router_socket.send_multipart( + [remote_address, b"0"]) + comm, rank = self.comms[remote_address.decode()] + self._recv(comm, tensor, rank ^ 1) + + tensor_id = data["tensor_id"] + with self.recv_store_cv: + self.recv_store[tensor_id] = tensor + self.recv_store_cv.notify() + logger.info( + "Recv Tensor, %s ๐Ÿ‘ˆ %s, rank: %s, data: %s, " + "tensor: %s", self.zmq_address, + remote_address.decode(), rank, data, tensor.shape) + + except torch.cuda.OutOfMemoryError: + self.router_socket.send_multipart( + [remote_address, b"1"]) + logger.warning( + "Recv Tensor, Out Of Memory, %s ๐Ÿ‘ˆ %s, data: %s", + self.zmq_address, remote_address.decode(), data) + elif data["cmd"] == "GET": tensor_id = data["tensor_id"] with self.send_store_cv: @@ -194,9 +208,12 @@ def _listen_for_requests(self): _tensor_id, _remote_address, tensor = item if tensor_id == _tensor_id: data = { - "ret": 0, - "shape": tensor.shape, - "dtype": str(tensor.dtype).replace("torch.", "") + "ret": + 0, + "shape": + tensor.shape, + "dtype": + str(tensor.dtype).replace("torch.", "") } else: data = {"ret": 1} @@ -234,8 +251,15 @@ def _send_sync(self): sock.send(msgpack.dumps(data)) response = sock.recv() - if response != b"0" or tensor is None: - return + if response != b"0": + self.send_store.append([tensor_id, remote_address, tensor]) + logger.warning( + "Send Tensor, Peer Out Of Memory, %s ๐Ÿ‘‰ %s, " + "MyRank: %s, data: %s, tensor: %s, size: %fGB", + self.zmq_address, remote_address, rank, data, + tensor.shape, + tensor.element_size() * tensor.numel() / 1024**3) + continue self._send(comm, tensor.to(self.device), rank ^ 1) logger.info( From f6407b3775d9fbfba8690b9a3c5f1ec6651cae1f Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 7 Apr 2025 22:05:12 +0800 Subject: [PATCH 015/155] 
_send_sync Signed-off-by: Abatom --- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 56 +++++++++---------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 7af6da380d02..863ca7dd0208 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -234,37 +234,35 @@ def _send_sync(self): with self.send_store_cv: while not self.send_store: self.send_store_cv.wait() - tensor_id, remote_address, tensor = self.send_store.popleft() - if remote_address not in self.socks: - self._create_connect(remote_address) - - sock = self.socks[remote_address] - comm, rank = self.comms[remote_address] - data = { - "cmd": "PUT", - "tensor_id": tensor_id, - "shape": tensor.shape, - "dtype": str(tensor.dtype).replace("torch.", "") - } - sock.send(msgpack.dumps(data)) - - response = sock.recv() - if response != b"0": - self.send_store.append([tensor_id, remote_address, tensor]) - logger.warning( - "Send Tensor, Peer Out Of Memory, %s ๐Ÿ‘‰ %s, " - "MyRank: %s, data: %s, tensor: %s, size: %fGB", - self.zmq_address, remote_address, rank, data, - tensor.shape, - tensor.element_size() * tensor.numel() / 1024**3) - continue - - self._send(comm, tensor.to(self.device), rank ^ 1) - logger.info( - "Send Tensor, %s ๐Ÿ‘‰ %s, MyRank: %s, data: %s, tensor: %s", - self.zmq_address, remote_address, rank, data, tensor.shape) + if remote_address not in self.socks: + self._create_connect(remote_address) + + sock = self.socks[remote_address] + comm, rank = self.comms[remote_address] + data = { + "cmd": "PUT", + "tensor_id": tensor_id, + "shape": tensor.shape, + "dtype": str(tensor.dtype).replace("torch.", "") + } + sock.send(msgpack.dumps(data)) + + response = sock.recv() + if response != b"0": + self.send_store.append([tensor_id, remote_address, tensor]) + logger.warning( + "Send Tensor, Peer Out Of Memory, %s ๐Ÿ‘‰ %s, " + "MyRank: %s, data: %s, tensor: %s, size: %fGB", + self.zmq_address, remote_address, rank, data, tensor.shape, + tensor.element_size() * tensor.numel() / 1024**3) + continue + + self._send(comm, tensor.to(self.device), rank ^ 1) + logger.info( + "Send Tensor, %s ๐Ÿ‘‰ %s, MyRank: %s, data: %s, tensor: %s", + self.zmq_address, remote_address, rank, data, tensor.shape) def _ping(self): sock = self.context.socket(zmq.DEALER) From 5c165d9a507cf350cc1d71a486039cf7f8eb7288 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 9 Apr 2025 09:58:49 +0800 Subject: [PATCH 016/155] add p2p_nccl_connector.py based on V1 Signed-off-by: Abatom --- .../kv_connector/p2p_nccl_connector.py | 303 ++++++++++++++++++ .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 6 +- 2 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 vllm/distributed/kv_transfer/kv_connector/p2p_nccl_connector.py diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_nccl_connector.py new file mode 100644 index 000000000000..a66580a86080 --- /dev/null +++ b/vllm/distributed/kv_transfer/kv_connector/p2p_nccl_connector.py @@ -0,0 +1,303 @@ +# SPDX-License-Identifier: Apache-2.0 + +import re +from dataclasses import dataclass +from typing import TYPE_CHECKING, Optional, Tuple + +import torch + +from vllm.config import VllmConfig +from vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from 
vllm.distributed.kv_transfer.kv_pipe.p2p_nccl_pipe import P2pNcclPipe +from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheManager + from vllm.v1.core.kv_cache_utils import KVCacheBlock + from vllm.v1.request import Request + +logger = init_logger(__name__) + + +@dataclass +class ReqMeta: + # Request Id + request_id: str + # Request tokens + token_ids: torch.Tensor + # Slot mappings, should have the same length as token_ids + slot_mapping: torch.Tensor + # Is store or load + is_store: bool + + ## Blocks allocated by the scheduler (no-longer needed) + # block_ids: torch.Tensor + + @staticmethod + def from_request(request: "Request", block_size: int, + is_store: bool) -> "ReqMeta": + valid_num_tokens = align_to_block_size(len(request.prompt_token_ids), + block_size) + token_ids = torch.tensor(request.prompt_token_ids)[:valid_num_tokens] + block_ids = torch.tensor(request.block_ids) + num_blocks = block_ids.shape[0] + block_offsets = torch.arange(0, block_size) + slot_mapping = block_offsets.reshape((1, block_size)) + \ + block_ids.reshape((num_blocks, 1)) * block_size + slot_mapping = slot_mapping.flatten()[:valid_num_tokens] + return ReqMeta( + request_id=request.request_id, + token_ids=token_ids, + slot_mapping=slot_mapping, + is_store=is_store, + ) + + +@dataclass +class P2pNcclConnectorMetadata(KVConnectorMetadata): + requests: list[ReqMeta] + + def __init__(self): + self.requests = [] + + def add_request( + self, + request: "Request", + block_size: int, + is_store: bool, + ) -> None: + self.requests.append( + ReqMeta.from_request(request, block_size, is_store)) + + +class P2pNcclConnector(KVConnectorBase_V1): + + def __init__(self, rank: Optional[int], local_rank: Optional[int], + config: "VllmConfig", role: KVConnectorRole): + super().__init__( + rank=rank, + local_rank=local_rank, + config=config, + role=role, + ) + self._block_size = config.cache_config.block_size + self._requests_need_load: list[str] = [] + logger.info(config.kv_transfer_config) + + self.p2p_nccl_pipe = P2pNcclPipe( + local_rank=local_rank, # type: ignore + config=self.config, + hostname="", + port_offset=rank, # type: ignore + ) + + def start_load_kv(self, forward_context: "ForwardContext", + **kwargs) -> None: + """Start loading the KV cache from the connector buffer to vLLM's + paged KV buffer. + + Args: + forward_context (ForwardContext): the forward context. + **kwargs: additional arguments for the load operation + + Note: + The number of elements in kv_caches and layer_names should be + the same. + """ + + def inject_kv_into_layer( + dst_kv_cache_layer: torch.Tensor, + src_kv_cache: torch.Tensor, + slot_mapping: torch.Tensor, + ) -> None: + """Inject the KV cache into the layer. + + Args: + dst_kv_cache_layer (torch.Tensor): the destination KV cache + layer. In shape [2, num_pages, page_size, xxx]. + src_kv_cache (torch.Tensor): the source KV cache. In shape + [2, num_tokens, xxx]. + slot_mapping (torch.Tensor): the slot mapping. In shape + [num_tokens]. + """ + dst_kv_cache_layer_shape = dst_kv_cache_layer.shape + num_pages = dst_kv_cache_layer_shape[1] + page_size = dst_kv_cache_layer_shape[2] + dst_kv_cache_layer = dst_kv_cache_layer.reshape( + 2, num_pages * page_size, -1) + dst_kv_cache_layer[:, slot_mapping, ...] 
= src_kv_cache + dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) + + # Get the metadata + metadata: KVConnectorMetadata = \ + self._get_connector_metadata() + assert isinstance(metadata, P2pNcclConnectorMetadata) + + if metadata is None: + logger.warning( + "In connector.start_load_kv, but the connector metadata is None" + ) + return + + attn_metadata = forward_context.attn_metadata + if attn_metadata is None: + logger.warning( + "In connector.start_load_kv, but the attn_metadata is None") + return + + # Load the KV for each request each layer + for request in metadata.requests: + if request.is_store: + continue + logger.info("Inject KV cache of %d tokens to the paged memory", + len(request.slot_mapping)) + for layer_name in forward_context.no_compile_layers: + attn_layer = forward_context.no_compile_layers[layer_name] + kv_cache_layer = attn_layer.kv_cache[ \ + forward_context.virtual_engine] + + kv_cache = self.p2p_nccl_pipe.recv_tensor(request.request_id + + "-" + layer_name) + + inject_kv_into_layer(kv_cache_layer, kv_cache, + request.slot_mapping) + + def wait_for_layer_load(self, layer_name: str) -> None: + """Blocking until the KV for a specific layer is loaded into vLLM's + paged buffer. + + This interface will be useful for layer-by-layer pipelining. + + Args: + layer_name: the name of that layer + """ + return + + def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", **kwargs) -> None: + """Start saving the a layer of KV cache from vLLM's paged buffer + to the connector. + + Args: + layer_name (str): the name of the layer. + kv_layer (torch.Tensor): the paged KV buffer of the current + layer in vLLM. + attn_metadata (AttentionMetadata): the attention metadata. + **kwargs: additional arguments for the save operation. + """ + + def extract_kv_from_layer( + layer: torch.Tensor, + slot_mapping: torch.Tensor, + ) -> torch.Tensor: + """Extract the KV cache from the layer. + + Assume the shape of the layer is (2, num_pages, page_size, xxx). + """ + num_pages, page_size = layer.shape[1], layer.shape[2] + return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, + ...] + + connector_metadata = self._get_connector_metadata() + assert isinstance(connector_metadata, P2pNcclConnectorMetadata) + for request in connector_metadata.requests: + if request.is_store: + request_id = request.request_id + ip, port = self.parse_request_id(request_id, True) + remote_address = ip + ":" + str(port + self.rank) + kv_cache = extract_kv_from_layer(kv_layer, + request.slot_mapping) + self.p2p_nccl_pipe.send_tensor(request_id + "-" + layer_name, + kv_cache, remote_address) + + def wait_for_save(self): + return + + def get_external_prefix_cache_blocks( + self, + request: "Request", + computed_blocks: list["KVCacheBlock"], + num_computed_tokens: int, + kv_cache_manager: "KVCacheManager", + ) -> list["KVCacheBlock"]: + """ + Get the external prefix cache blocks from the connector. + + This function may change the state of the connector, which will + be used by `attach_connector_meta` later. + + This function will also allocate/free the blocks dynamically when + there is remote cache hit. + + Args: + request (Request): the request object. + computed_blocks (list[KVCacheBlock]): the 'local' computed blocks. + num_computed_tokens (int): the number of 'local' computed tokens. + kv_cache_manager (KVCacheManager): the KV cache manager to + allocate/free the blocks if needed. 
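# A small worked example (illustration only, not part of the diff) of the slot
# mapping built in ReqMeta.from_request above and consumed by
# inject_kv_into_layer / extract_kv_from_layer: token i of the request lives at
# slot block_ids[i // block_size] * block_size + i % block_size of the
# flattened paged layer. The shapes and block ids below are made up.
import torch

block_size = 4
block_ids = torch.tensor([3, 7])  # pages assigned to the request
block_offsets = torch.arange(0, block_size)
slot_mapping = (block_offsets.reshape((1, block_size)) +
                block_ids.reshape((-1, 1)) * block_size).flatten()
# slot_mapping -> tensor([12, 13, 14, 15, 28, 29, 30, 31])

# A paged KV layer of shape [2, num_pages, page_size, head_dim].
layer = torch.randn(2, 10, block_size, 8)
flat = layer.reshape(2, 10 * block_size, -1)

kv = flat[:, slot_mapping, ...]    # what extract_kv_from_layer returns
flat[:, slot_mapping, ...] = kv    # what inject_kv_into_layer writes back
assert kv.shape == (2, 8, 8)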
+ + Returns: + The updated list of the computed blocks (appended with the remote + cached blocks) + """ + return computed_blocks + + def attach_connector_meta( + self, scheduler_output: SchedulerOutput) -> SchedulerOutput: + """Attach the connector metadata to the request object. + + This function should NOT modify other fields in the scheduler_output + except the `kv_connector_metadata` field. + Also, calling this function will reset the state of the connector. + + Args: + scheduler_output (SchedulerOutput): the scheduler output object. + """ + meta = P2pNcclConnectorMetadata() + for request in scheduler_output.scheduled_new_reqs: + # T^T, why there is both req_id and request_id???? + if request.req_id in self._requests_need_load: + meta.add_request(request, self._block_size, is_store=False) + else: + # NOTE: here, we set the store and load being exclusive, + # but in LMCache use case, a single request can have both + # store and load status + if not self.found_match_for_request(request): + meta.add_request(request, self._block_size, is_store=True) + scheduler_output.kv_connector_metadata = meta + + self._requests_need_load.clear() + return scheduler_output + + @staticmethod + def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: + logger.info("parse_request_id, request_id: %s, is_prefill: %s", + request_id, is_prefill) + # Regular expression to match the string hostname and integer port + if is_prefill: + pattern = r"___decode_addr_(.*):(\d+)" + else: + pattern = r"___prefill_addr_(.*):(\d+)___" + + # Use re.search to find the pattern in the request_id + match = re.search(pattern, request_id) + if match: + # Extract the ranks + ip = match.group(1) + port = int(match.group(2)) + + logger.info("parse_request_id, request_id: %s, ip: %s, port: %s", + request_id, ip, str(port)) + return ip, port + raise ValueError( + f"Request id {request_id} does not contain hostname and port") + + +def align_to_block_size(num_tokens: int, block_size) -> int: + """Align the number of tokens to the block size. 
+ """ + return (num_tokens - 1) // block_size * block_size diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 863ca7dd0208..3445c70bef08 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -130,8 +130,12 @@ def recv_tensor( if remote_address is None: with self.recv_store_cv: + start_time = time.time() while tensor_id not in self.recv_store: self.recv_store_cv.wait() + duration = time.time() - start_time + logger.info("Recv From %s, tensor_id: %s, duration: %f", + remote_address, tensor_id, duration) return self.recv_store.pop(tensor_id) if remote_address not in self.socks: @@ -277,7 +281,7 @@ def _ping(self): while True: sock.send(msgpack.dumps(data)) # logger.info("ping, zmq_address: %s", self.zmq_address) - time.sleep(1) + time.sleep(3) def _send(self, comm, tensor: torch.Tensor, dst: int, stream=None): assert tensor.device == self.device, ( From 1e9fab6d1b8bca68c09adb2829964f73ce23e421 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 9 Apr 2025 18:02:17 +0800 Subject: [PATCH 017/155] add shape and size log for recv_tensor Signed-off-by: Abatom --- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 3445c70bef08..0efc7c413025 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -129,14 +129,18 @@ def recv_tensor( logger.info("Recv From %s, tensor_id: %s", remote_address, tensor_id) if remote_address is None: + start_time = time.time() with self.recv_store_cv: - start_time = time.time() while tensor_id not in self.recv_store: self.recv_store_cv.wait() - duration = time.time() - start_time - logger.info("Recv From %s, tensor_id: %s, duration: %f", - remote_address, tensor_id, duration) - return self.recv_store.pop(tensor_id) + tensor = self.recv_store.pop(tensor_id) + duration = time.time() - start_time + logger.info( + "๐Ÿšง๐Ÿšง๐Ÿšง Recv From %s, tensor_id: %s, shape: %s, " + "duration: %.3fms, size: %.3fGB", remote_address, tensor_id, + tensor.shape, duration * 1000, + tensor.element_size() * tensor.numel() / 1024**3) + return tensor if remote_address not in self.socks: self._create_connect(remote_address) From 33601e2e9b39c445102a8ecb64d58ef0a91db6df Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 11 Apr 2025 17:23:42 +0800 Subject: [PATCH 018/155] fix hang & oom Signed-off-by: Abatom --- .../kv_transfer/kv_connector/p2p_connector.py | 5 +- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 111 +++++++++++------- 2 files changed, 73 insertions(+), 43 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py index 0363c3ea23ce..02f365bf28fa 100644 --- a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py @@ -200,9 +200,10 @@ def recv_kv_caches_and_hidden_states( # check if both KV cache and the hidden states are received # If not, need to redo the forwarding to compute missing states - if not all([(num_computed_tokens == num_tokens), hidden is not None - ]): + if not all([(num_computed_tokens == num_tokens), keys is not None, + values is not None, hidden is not None]): bypass_model_exec = False + break # update the end 
position based on how many tokens are cached. end_pos = start_pos + num_computed_tokens diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 0efc7c413025..70bc2c3d4ad6 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -59,8 +59,8 @@ def __init__(self, self.socks: Dict[str, Any] = {} # remote_address: client socket self.comms: Dict[str, Any] = {} # remote_address: (ncclComm_t, rank) - # self.buffer_size = 0 - # self.buffer_size_threshold = self.config.kv_buffer_size + self.buffer_size = 0 + self.buffer_size_threshold = self.config.kv_buffer_size self.send_store_cv = threading.Condition() self.recv_store_cv = threading.Condition() @@ -70,7 +70,7 @@ def __init__(self, target=self._listen_for_requests, daemon=True) self._listener_thread.start() - self._send_thread = threading.Thread(target=self._send_sync, + self._send_thread = threading.Thread(target=self._send_async, daemon=True) self._send_thread.start() @@ -111,15 +111,16 @@ def send_tensor( tensor: torch.Tensor, remote_address: typing.Optional[str] = None, ): - tensor = tensor.clone() + # tensor = tensor.clone() if remote_address is None: with self.recv_store_cv: self.recv_store[tensor_id] = tensor self.recv_store_cv.notify() else: - with self.send_store_cv: - self.send_store.append([tensor_id, remote_address, tensor]) - self.send_store_cv.notify() + self._send_sync(tensor_id, tensor, remote_address) + # with self.send_store_cv: + # self.send_store.append([tensor_id, remote_address, tensor]) + # self.send_store_cv.notify() def recv_tensor( self, @@ -131,15 +132,22 @@ def recv_tensor( if remote_address is None: start_time = time.time() with self.recv_store_cv: - while tensor_id not in self.recv_store: - self.recv_store_cv.wait() - tensor = self.recv_store.pop(tensor_id) + if tensor_id not in self.recv_store: + self.recv_store_cv.wait(timeout=0.001) + tensor = self.recv_store.pop(tensor_id, None) duration = time.time() - start_time - logger.info( - "๐Ÿšง๐Ÿšง๐Ÿšง Recv From %s, tensor_id: %s, shape: %s, " - "duration: %.3fms, size: %.3fGB", remote_address, tensor_id, - tensor.shape, duration * 1000, - tensor.element_size() * tensor.numel() / 1024**3) + if tensor is not None: + self.buffer_size -= (tensor.element_size() * tensor.numel()) + logger.info( + "๐Ÿšง๐Ÿšง๐Ÿšง Recv From %s, tensor_id: %s, shape: %s, " + "duration: %.3fms, size: %.3fGB", remote_address, + tensor_id, tensor.shape, duration * 1000, + tensor.element_size() * tensor.numel() / 1024**3) + else: + logger.warning( + "๐Ÿšง๐Ÿšง๐Ÿšง Recv From %s, tensor_id: %s, " + "duration: %.3fms,", remote_address, tensor_id, + duration * 1000) return tensor if remote_address not in self.socks: @@ -188,6 +196,18 @@ def _listen_for_requests(self): torch, data["dtype"]), device=self.device) + tensor_size = tensor.element_size() * tensor.numel() + if (self.buffer_size + tensor_size + > self.buffer_size_threshold): + self.router_socket.send_multipart( + [remote_address, b"2"]) + logger.warning( + "Recv Tensor, Out Of Threshold, " + "%s ๐Ÿ‘ˆ %s, data: %s", self.zmq_address, + remote_address.decode(), data) + continue + + self.buffer_size += tensor_size self.router_socket.send_multipart( [remote_address, b"0"]) comm, rank = self.comms[remote_address.decode()] @@ -237,40 +257,49 @@ def _listen_for_requests(self): "Unexpected, Received message from %s, data: %s", remote_address, data) - def _send_sync(self): + def _send_async(self): while True: with 
self.send_store_cv: while not self.send_store: self.send_store_cv.wait() tensor_id, remote_address, tensor = self.send_store.popleft() + self._send_sync(tensor_id, tensor, remote_address) - if remote_address not in self.socks: - self._create_connect(remote_address) - - sock = self.socks[remote_address] - comm, rank = self.comms[remote_address] - data = { - "cmd": "PUT", - "tensor_id": tensor_id, - "shape": tensor.shape, - "dtype": str(tensor.dtype).replace("torch.", "") - } - sock.send(msgpack.dumps(data)) + def _send_sync( + self, + tensor_id: str, + tensor: torch.Tensor, + remote_address: typing.Optional[str] = None, + ): + if remote_address not in self.socks: + self._create_connect(remote_address) - response = sock.recv() - if response != b"0": - self.send_store.append([tensor_id, remote_address, tensor]) - logger.warning( - "Send Tensor, Peer Out Of Memory, %s ๐Ÿ‘‰ %s, " - "MyRank: %s, data: %s, tensor: %s, size: %fGB", - self.zmq_address, remote_address, rank, data, tensor.shape, - tensor.element_size() * tensor.numel() / 1024**3) - continue + sock = self.socks[remote_address] + comm, rank = self.comms[remote_address] + data = { + "cmd": "PUT", + "tensor_id": tensor_id, + "shape": tensor.shape, + "dtype": str(tensor.dtype).replace("torch.", "") + } + sock.send(msgpack.dumps(data)) - self._send(comm, tensor.to(self.device), rank ^ 1) - logger.info( - "Send Tensor, %s ๐Ÿ‘‰ %s, MyRank: %s, data: %s, tensor: %s", - self.zmq_address, remote_address, rank, data, tensor.shape) + response = sock.recv() + if response != b"0": + # with self.send_store_cv: + # self.send_store.append([tensor_id, remote_address, tensor]) + # self.send_store_cv.notify() + logger.warning( + "Send Tensor, Peer Out Of Memory/Threshold, %s ๐Ÿ‘‰ %s, " + "MyRank: %s, data: %s, tensor: %s, size: %fGB, response: %s", + self.zmq_address, remote_address, rank, data, tensor.shape, + tensor.element_size() * tensor.numel() / 1024**3, + response.decode()) + return + + self._send(comm, tensor.to(self.device), rank ^ 1) + logger.info("Send Tensor, %s ๐Ÿ‘‰ %s, MyRank: %s, data: %s, tensor: %s", + self.zmq_address, remote_address, rank, data, tensor.shape) def _ping(self): sock = self.context.socket(zmq.DEALER) From 5a888fc7625304a877f5ece9fc11082a77652ac4 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 11 Apr 2025 21:14:26 +0800 Subject: [PATCH 019/155] format Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 70bc2c3d4ad6..b8a376d1df8c 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -271,6 +271,8 @@ def _send_sync( tensor: torch.Tensor, remote_address: typing.Optional[str] = None, ): + if remote_address is None: + return if remote_address not in self.socks: self._create_connect(remote_address) From bddc4e1cd3f781ca47ca6da4af18675cff89b3c6 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 12 Apr 2025 22:45:35 +0800 Subject: [PATCH 020/155] ping thread Signed-off-by: Abatom --- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index b8a376d1df8c..b69ddb5d238b 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ 
b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -39,13 +39,19 @@ def __init__(self, raise ValueError("Port cannot be 0") self._hostname = hostname self._port = port + self.zmq_address = f"{self._hostname}:{self._port}" self.http_address = ( f"{self._hostname}:" f"{self.config.kv_connector_extra_config['http_port']}") - self.proxy_address = ( - f"{self.config.kv_connector_extra_config['proxy_ip']}:" - f"{self.config.kv_connector_extra_config['proxy_port']}") + + proxy_ip = self.config.get_from_extra_config("proxy_ip", "") + proxy_port = self.config.get_from_extra_config("proxy_port", "") + if proxy_ip == "" or proxy_port == "": + self.proxy_address = "" + else: + self.proxy_address = proxy_ip + ":" + proxy_port + self.context = zmq.Context() self.router_socket = self.context.socket(zmq.ROUTER) self.router_socket.bind(f"tcp://{self.zmq_address}") @@ -74,7 +80,8 @@ def __init__(self, daemon=True) self._send_thread.start() - if port_offset == 0: + self._ping_thread = None + if port_offset == 0 and self.proxy_address != "": self._ping_thread = threading.Thread(target=self._ping, daemon=True) self._ping_thread.start() @@ -344,4 +351,5 @@ def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): def close(self) -> None: self._listener_thread.join() - self._ping_thread.join() + if self._ping_thread is not None: + self._ping_thread.join() From e516a70fb3c33ec07368f956171c57dda807a830 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 12 Apr 2025 23:04:02 +0800 Subject: [PATCH 021/155] add code comments. Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index b69ddb5d238b..a74972f08ad5 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -40,11 +40,16 @@ def __init__(self, self._hostname = hostname self._port = port + # Each card corresponds to a ZMQ address. self.zmq_address = f"{self._hostname}:{self._port}" + + # The `http_port` must be consistent with the port of OpenAI. self.http_address = ( f"{self._hostname}:" f"{self.config.kv_connector_extra_config['http_port']}") + # If `proxy_ip` or `proxy_port` is `""`, + # then the ping thread will not be enabled. proxy_ip = self.config.get_from_extra_config("proxy_ip", "") proxy_port = self.config.get_from_extra_config("proxy_port", "") if proxy_ip == "" or proxy_port == "": @@ -264,6 +269,8 @@ def _listen_for_requests(self): "Unexpected, Received message from %s, data: %s", remote_address, data) + # Asynchronous sending may cause conflicts between P2P NCCL and + # NCCL used in TP/PP, which can lead to deadlock issues. 
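# A sketch (illustration only; the addresses are made up) of the registration
# heartbeat that _ping sends to the proxy when proxy_ip/proxy_port are
# configured and port_offset == 0. The field names follow the data dict built
# in _ping above.
import msgpack

heartbeat = {
    "type": "P",                       # "P" for a kv producer, "D" for a kv consumer
    "http_address": "10.0.0.2:20001",  # must match the OpenAI server port
    "zmq_address": "10.0.0.2:21001",   # this card's ZMQ endpoint
}
payload = msgpack.dumps(heartbeat)
assert msgpack.loads(payload)["type"] == "P"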
def _send_async(self): while True: with self.send_store_cv: From 8dadfb407fd15a60d3a09abbe3f016a98217bd69 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 14 Apr 2025 20:49:56 +0800 Subject: [PATCH 022/155] GET Signed-off-by: Abatom --- .../kv_transfer/kv_connector/p2p_connector.py | 25 +-- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 161 +++++++++++------- 2 files changed, 115 insertions(+), 71 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py index 02f365bf28fa..09f136ca6bf7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py @@ -127,14 +127,15 @@ def send_kv_caches_and_hidden_states( ip, port = self.parse_request_id(request_id, True) remote_address = ip + ":" + str(port + self.rank) - self.p2p_nccl_pipe.send_tensor(request_id + "keys", keys, - remote_address) - self.p2p_nccl_pipe.send_tensor(request_id + "values", values, - remote_address) - self.p2p_nccl_pipe.send_tensor( - request_id + "hidden", - hidden_or_intermediate_states[start_pos:end_pos], - remote_address) + if self.p2p_nccl_pipe.send_tensor(request_id + "keys", keys, + remote_address): + if self.p2p_nccl_pipe.send_tensor(request_id + "values", + values, + remote_address): + self.p2p_nccl_pipe.send_tensor( + request_id + "hidden", + hidden_or_intermediate_states[start_pos:end_pos], + remote_address) logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) @@ -191,9 +192,11 @@ def recv_kv_caches_and_hidden_states( start_pos_list.append(start_pos) request_id = request_ids[idx] - keys = self.p2p_nccl_pipe.recv_tensor(request_id + "keys") - values = self.p2p_nccl_pipe.recv_tensor(request_id + "values") - hidden = self.p2p_nccl_pipe.recv_tensor(request_id + "hidden") + ip, port = self.parse_request_id(request_id, False) + remote_address = ip + ":" + str(port + self.rank) + keys = self.p2p_nccl_pipe.recv_tensor(request_id + "keys", remote_address) + values = self.p2p_nccl_pipe.recv_tensor(request_id + "values", remote_address) + hidden = self.p2p_nccl_pipe.recv_tensor(request_id + "hidden", remote_address) num_computed_tokens = current_tokens.shape[0] num_computed_tokens_list.append(num_computed_tokens) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index a74972f08ad5..bc1c699af16a 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -64,7 +64,18 @@ def __init__(self, self.poller = zmq.Poller() self.poller.register(self.router_socket, zmq.POLLIN) - self.send_store: Deque[List[Any]] = deque() # tensor_id: torch.Tensor + # The sending type includes tree mutually exclusive options: PUT, GET, PUT_ASYNC. 
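# A sketch of the extra-config fields this pipe reads (the key names follow the
# get_from_extra_config / kv_connector_extra_config accesses in this file; the
# values are illustrative only, not part of the diff). "send_type" selects one
# of the three mutually exclusive modes, "http_port" must match the
# OpenAI-compatible server port, and empty "proxy_ip"/"proxy_port" disable the
# ping thread.
kv_connector_extra_config = {
    "send_type": "PUT_ASYNC",  # or "PUT" / "GET"
    "http_port": "20001",
    "proxy_ip": "10.0.0.1",
    "proxy_port": "30001",
}


def get_from_extra_config(key: str, default):
    # mirrors the fallback-to-default behaviour used above
    return kv_connector_extra_config.get(key, default)


assert get_from_extra_config("send_type", "PUT") == "PUT_ASYNC"
assert get_from_extra_config("proxy_ip", "") != ""  # ping thread enabled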
+ self.send_type = self.config.get_from_extra_config("send_type", "PUT") + if self.send_type == "GET": + self.send_store: Dict[str, torch.Tensor] = {} # tensor_id: torch.Tensor + else: + # PUT or PUT_ASYNC + self.send_store: Deque[List[Any]] = deque() # tensor_id: torch.Tensor + if self.send_type == "PUT_ASYNC": + self._send_thread = threading.Thread(target=self._send_async, + daemon=True) + self._send_thread.start() + self.recv_store: Dict[str, torch.Tensor] = {} # tensor_id: torch.Tensor self.socks: Dict[str, Any] = {} # remote_address: client socket @@ -81,10 +92,6 @@ def __init__(self, target=self._listen_for_requests, daemon=True) self._listener_thread.start() - self._send_thread = threading.Thread(target=self._send_async, - daemon=True) - self._send_thread.start() - self._ping_thread = None if port_offset == 0 and self.proxy_address != "": self._ping_thread = threading.Thread(target=self._ping, @@ -99,7 +106,7 @@ def _create_connect(self, remote_address: typing.Optional[str] = None): sock.connect(f"tcp://{remote_address}") self.socks[remote_address] = sock if remote_address in self.comms: - logger.info("comm exists, remote_address: %s, comms: %s", + logger.info("๐Ÿ‘‹comm exists, remote_address: %s, comms: %s", remote_address, self.comms) return sock, self.comms[remote_address] @@ -112,7 +119,7 @@ def _create_connect(self, remote_address: typing.Optional[str] = None): comm: ncclComm_t = self.nccl.ncclCommInitRank( 2, unique_id, rank) self.comms[remote_address] = (comm, rank) - logger.info("ncclCommInitRank Success, %s ๐Ÿ‘‰ %s, MyRank: %s", + logger.info("๐ŸคncclCommInitRank Success, %s ๐Ÿ‘‰ %s, MyRank: %s", self.zmq_address, remote_address, rank) return self.socks[remote_address], self.comms[remote_address] @@ -122,26 +129,44 @@ def send_tensor( tensor_id: str, tensor: torch.Tensor, remote_address: typing.Optional[str] = None, - ): - # tensor = tensor.clone() + ) -> bool: if remote_address is None: with self.recv_store_cv: self.recv_store[tensor_id] = tensor self.recv_store_cv.notify() + return True else: - self._send_sync(tensor_id, tensor, remote_address) - # with self.send_store_cv: - # self.send_store.append([tensor_id, remote_address, tensor]) - # self.send_store_cv.notify() + if self.send_type == "PUT": + return self._send_sync(tensor_id, tensor, remote_address) + elif self.send_type == "PUT_ASYNC": + with self.send_store_cv: + self.send_store.append([tensor_id, remote_address, tensor]) + self.send_store_cv.notify() + else: # GET + with self.send_store_cv: + tensor_size = tensor.element_size() * tensor.numel() + while self.buffer_size + tensor_size > self.buffer_size_threshold: + oldest_tenser_id = next(iter(self.send_store)) + oldest_tenser = self.send_store.pop(oldest_tenser_id) + oldest_tenser_size = oldest_tenser.element_size() * oldest_tenser.numel() + self.buffer_size -= oldest_tenser_size + logger.info("โ›”[GET]Send to %s, tensor_id: %s, tensor_size: %d, buffer_size: %d, oldest_tenser_size: %d", + remote_address, tensor_id, tensor_size, self.buffer_size, oldest_tenser_size) + + self.send_store[tensor_id] = tensor + self.buffer_size += tensor_size + logger.info("๐Ÿ”ต[GET]Send to %s, tensor_id: %s, tensor_size: %d, buffer_size: %d(%.2f%%)", + remote_address, tensor_id, tensor_size, + self.buffer_size, self.buffer_size/self.buffer_size_threshold*100) + + return True def recv_tensor( self, tensor_id: str, remote_address: typing.Optional[str] = None, ) -> torch.Tensor: - logger.info("Recv From %s, tensor_id: %s", remote_address, tensor_id) - - if remote_address is None: + if 
self.send_type == "PUT" or self.send_type == "PUT_ASYNC": start_time = time.time() with self.recv_store_cv: if tensor_id not in self.recv_store: @@ -151,17 +176,20 @@ def recv_tensor( if tensor is not None: self.buffer_size -= (tensor.element_size() * tensor.numel()) logger.info( - "๐Ÿšง๐Ÿšง๐Ÿšง Recv From %s, tensor_id: %s, shape: %s, " + "๐Ÿ”ต[PUT]Recv From %s, tensor_id: %s, shape: %s, " "duration: %.3fms, size: %.3fGB", remote_address, tensor_id, tensor.shape, duration * 1000, tensor.element_size() * tensor.numel() / 1024**3) else: logger.warning( - "๐Ÿšง๐Ÿšง๐Ÿšง Recv From %s, tensor_id: %s, " - "duration: %.3fms,", remote_address, tensor_id, - duration * 1000) + "๐Ÿšง[PUT]Recv From %s, tensor_id: %s, duration: %.3fms", + remote_address, tensor_id, duration * 1000) return tensor + # GET + if remote_address is None: + return None + if remote_address not in self.socks: self._create_connect(remote_address) @@ -173,14 +201,19 @@ def recv_tensor( message = sock.recv() data = msgpack.loads(message) - if data["ret"] == 0: - tensor = torch.empty(data["shape"], - dtype=getattr(torch, data["dtype"]), - device=self.device) - self._recv(comm, tensor, rank ^ 1) - return tensor + if data["ret"] != 0: + logger.info("๐Ÿšง[GET]Recv From %s, tensor_id: %s, ret: %d", + remote_address, tensor_id, data["ret"]) + return None + + tensor = torch.empty(data["shape"], + dtype=getattr(torch, data["dtype"]), + device=self.device) + self._recv(comm, tensor, rank ^ 1) + + logger.info("๐Ÿ”ต[GET]Recv From %s, tensor_id: %s", remote_address, tensor_id) - return None + return tensor def _listen_for_requests(self): while True: @@ -199,7 +232,7 @@ def _listen_for_requests(self): 2, unique_id, rank) self.comms[remote_address.decode()] = (comm, rank) logger.info( - "ncclCommInitRank Success, %s ๐Ÿ‘ˆ %s, MyRank: %s", + "๐ŸคncclCommInitRank Success, %s ๐Ÿ‘ˆ %s, MyRank: %s", self.zmq_address, remote_address.decode(), rank) elif data["cmd"] == "PUT": try: @@ -214,7 +247,7 @@ def _listen_for_requests(self): self.router_socket.send_multipart( [remote_address, b"2"]) logger.warning( - "Recv Tensor, Out Of Threshold, " + "๐Ÿšง[PUT]Recv Tensor, Out Of Threshold, " "%s ๐Ÿ‘ˆ %s, data: %s", self.zmq_address, remote_address.decode(), data) continue @@ -230,7 +263,7 @@ def _listen_for_requests(self): self.recv_store[tensor_id] = tensor self.recv_store_cv.notify() logger.info( - "Recv Tensor, %s ๐Ÿ‘ˆ %s, rank: %s, data: %s, " + "๐Ÿ”ต[PUT]Recv Tensor, %s ๐Ÿ‘ˆ %s, rank: %s, data: %s, " "tensor: %s", self.zmq_address, remote_address.decode(), rank, data, tensor.shape) @@ -238,35 +271,41 @@ def _listen_for_requests(self): self.router_socket.send_multipart( [remote_address, b"1"]) logger.warning( - "Recv Tensor, Out Of Memory, %s ๐Ÿ‘ˆ %s, data: %s", + "๐Ÿšง[PUT]Recv Tensor, Out Of Memory, %s ๐Ÿ‘ˆ %s, data: %s", self.zmq_address, remote_address.decode(), data) elif data["cmd"] == "GET": tensor_id = data["tensor_id"] with self.send_store_cv: - for item in self.send_store: - _tensor_id, _remote_address, tensor = item - if tensor_id == _tensor_id: - data = { - "ret": - 0, - "shape": - tensor.shape, - "dtype": - str(tensor.dtype).replace("torch.", "") - } - else: - data = {"ret": 1} - self.router_socket.send_multipart( - [remote_address, - msgpack.dumps(data)]) - if data["ret"] == 0: - self._send(comm, tensor.to(self.device), - rank ^ 1) - break - else: + tensor = self.send_store.pop(tensor_id, None) + if tensor is not None: + data = { + "ret": + 0, + "shape": + tensor.shape, + "dtype": + str(tensor.dtype).replace("torch.", "") + } + # LRU + 
self.send_store[tensor_id] = tensor + else: + data = {"ret": 1} + + self.router_socket.send_multipart( + [remote_address, + msgpack.dumps(data)]) + + if data["ret"] == 0: + self._send(comm, tensor.to(self.device), + rank ^ 1) + logger.info( - "Unexpected, Received message from %s, data: %s", + "๐Ÿ”ต[GET]Send Tensor, %s ๐Ÿ‘‰ %s, rank: %s, data: %s", + self.zmq_address, remote_address.decode(), rank, data) + else: + logger.warning( + "๐ŸšงUnexpected, Received message from %s, data: %s", remote_address, data) # Asynchronous sending may cause conflicts between P2P NCCL and @@ -284,9 +323,9 @@ def _send_sync( tensor_id: str, tensor: torch.Tensor, remote_address: typing.Optional[str] = None, - ): + ) -> bool: if remote_address is None: - return + return False if remote_address not in self.socks: self._create_connect(remote_address) @@ -306,21 +345,22 @@ def _send_sync( # self.send_store.append([tensor_id, remote_address, tensor]) # self.send_store_cv.notify() logger.warning( - "Send Tensor, Peer Out Of Memory/Threshold, %s ๐Ÿ‘‰ %s, " + "๐ŸšงSend Tensor, Peer Out Of Memory/Threshold, %s ๐Ÿ‘‰ %s, " "MyRank: %s, data: %s, tensor: %s, size: %fGB, response: %s", self.zmq_address, remote_address, rank, data, tensor.shape, tensor.element_size() * tensor.numel() / 1024**3, response.decode()) - return + return False self._send(comm, tensor.to(self.device), rank ^ 1) - logger.info("Send Tensor, %s ๐Ÿ‘‰ %s, MyRank: %s, data: %s, tensor: %s", + logger.info("๐Ÿ”ตSend Tensor, %s ๐Ÿ‘‰ %s, MyRank: %s, data: %s, tensor: %s", self.zmq_address, remote_address, rank, data, tensor.shape) + return True def _ping(self): sock = self.context.socket(zmq.DEALER) sock.setsockopt_string(zmq.IDENTITY, self.zmq_address) - logger.info("ping start, zmq_address: %s", self.zmq_address) + logger.debug("ping start, zmq_address: %s", self.zmq_address) sock.connect(f"tcp://{self.proxy_address}") data = { "type": "P" if self.config.is_kv_producer else "D", @@ -329,7 +369,6 @@ def _ping(self): } while True: sock.send(msgpack.dumps(data)) - # logger.info("ping, zmq_address: %s", self.zmq_address) time.sleep(3) def _send(self, comm, tensor: torch.Tensor, dst: int, stream=None): @@ -358,5 +397,7 @@ def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): def close(self) -> None: self._listener_thread.join() + if self.send_type == "PUT_ASYNC": + self._send_thread.join() if self._ping_thread is not None: self._ping_thread.join() From 659ead72569a3a0cd593ece60974524feaa1ab04 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 14 Apr 2025 21:47:47 +0800 Subject: [PATCH 023/155] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/p2p_connector.py | 27 +++++----- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 52 +++++++++++-------- 2 files changed, 45 insertions(+), 34 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py index 09f136ca6bf7..5834c9a96f3f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py @@ -127,15 +127,14 @@ def send_kv_caches_and_hidden_states( ip, port = self.parse_request_id(request_id, True) remote_address = ip + ":" + str(port + self.rank) - if self.p2p_nccl_pipe.send_tensor(request_id + "keys", keys, - remote_address): - if self.p2p_nccl_pipe.send_tensor(request_id + "values", - values, - remote_address): - self.p2p_nccl_pipe.send_tensor( - request_id + "hidden", - hidden_or_intermediate_states[start_pos:end_pos], - remote_address) 
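# A standalone sketch (simplified; not part of the diff) of the GET-mode buffer
# policy used by send_tensor/_listen_for_requests above: the store is an
# insertion-ordered mapping, the oldest entries are evicted once the buffer
# would exceed the threshold, and a GET hit re-inserts the tensor so it becomes
# the most recently used entry. Tensors are stood in by byte strings here.
from collections import OrderedDict


class SendBuffer:

    def __init__(self, threshold: int):
        self.store: "OrderedDict[str, bytes]" = OrderedDict()
        self.size = 0
        self.threshold = threshold

    def put(self, tensor_id: str, blob: bytes) -> None:
        while self.store and self.size + len(blob) > self.threshold:
            _, oldest = self.store.popitem(last=False)  # evict the oldest entry
            self.size -= len(oldest)
        self.store[tensor_id] = blob
        self.size += len(blob)

    def get(self, tensor_id: str):
        blob = self.store.pop(tensor_id, None)
        if blob is not None:
            self.store[tensor_id] = blob  # LRU refresh on hit
        return blob


buf = SendBuffer(threshold=8)
buf.put("req0-keys", b"1234")
buf.put("req0-values", b"5678")
buf.put("req1-keys", b"90")  # evicts "req0-keys" to stay under the threshold
assert "req0-keys" not in buf.store
assert buf.get("req0-values") == b"5678"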
+ self.p2p_nccl_pipe.send_tensor(request_id + "keys", keys, + remote_address) + self.p2p_nccl_pipe.send_tensor(request_id + "values", values, + remote_address) + self.p2p_nccl_pipe.send_tensor( + request_id + "hidden", + hidden_or_intermediate_states[start_pos:end_pos], + remote_address) logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) @@ -194,9 +193,13 @@ def recv_kv_caches_and_hidden_states( request_id = request_ids[idx] ip, port = self.parse_request_id(request_id, False) remote_address = ip + ":" + str(port + self.rank) - keys = self.p2p_nccl_pipe.recv_tensor(request_id + "keys", remote_address) - values = self.p2p_nccl_pipe.recv_tensor(request_id + "values", remote_address) - hidden = self.p2p_nccl_pipe.recv_tensor(request_id + "hidden", remote_address) + + keys = self.p2p_nccl_pipe.recv_tensor(request_id + "keys", + remote_address) + values = self.p2p_nccl_pipe.recv_tensor(request_id + "values", + remote_address) + hidden = self.p2p_nccl_pipe.recv_tensor(request_id + "hidden", + remote_address) num_computed_tokens = current_tokens.shape[0] num_computed_tokens_list.append(num_computed_tokens) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index bc1c699af16a..28c808d5d5e0 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -64,13 +64,16 @@ def __init__(self, self.poller = zmq.Poller() self.poller.register(self.router_socket, zmq.POLLIN) - # The sending type includes tree mutually exclusive options: PUT, GET, PUT_ASYNC. + # The sending type includes tree mutually exclusive options: + # PUT, GET, PUT_ASYNC. self.send_type = self.config.get_from_extra_config("send_type", "PUT") if self.send_type == "GET": - self.send_store: Dict[str, torch.Tensor] = {} # tensor_id: torch.Tensor + self.send_store: Dict[str, + torch.Tensor] = {} # tensor_id: torch.Tensor else: # PUT or PUT_ASYNC - self.send_store: Deque[List[Any]] = deque() # tensor_id: torch.Tensor + self.send_store: Deque[ + List[Any]] = deque() # tensor_id: torch.Tensor if self.send_type == "PUT_ASYNC": self._send_thread = threading.Thread(target=self._send_async, daemon=True) @@ -142,22 +145,29 @@ def send_tensor( with self.send_store_cv: self.send_store.append([tensor_id, remote_address, tensor]) self.send_store_cv.notify() - else: # GET + else: # GET with self.send_store_cv: tensor_size = tensor.element_size() * tensor.numel() - while self.buffer_size + tensor_size > self.buffer_size_threshold: + while (self.buffer_size + tensor_size + > self.buffer_size_threshold): oldest_tenser_id = next(iter(self.send_store)) oldest_tenser = self.send_store.pop(oldest_tenser_id) - oldest_tenser_size = oldest_tenser.element_size() * oldest_tenser.numel() + oldest_tenser_size = oldest_tenser.element_size( + ) * oldest_tenser.numel() self.buffer_size -= oldest_tenser_size - logger.info("โ›”[GET]Send to %s, tensor_id: %s, tensor_size: %d, buffer_size: %d, oldest_tenser_size: %d", - remote_address, tensor_id, tensor_size, self.buffer_size, oldest_tenser_size) + logger.info( + "โ›”[GET]Send to %s, tensor_id: %s, tensor_size: %d," + "buffer_size: %d, oldest_tenser_size: %d", + remote_address, tensor_id, tensor_size, + self.buffer_size, oldest_tenser_size) self.send_store[tensor_id] = tensor self.buffer_size += tensor_size - logger.info("๐Ÿ”ต[GET]Send to %s, tensor_id: %s, tensor_size: %d, buffer_size: %d(%.2f%%)", - remote_address, tensor_id, tensor_size, - self.buffer_size, 
self.buffer_size/self.buffer_size_threshold*100) + logger.info( + "๐Ÿ”ต[GET]Send to %s, tensor_id: %s, tensor_size: %d," + "buffer_size: %d(%.2f%%)", remote_address, tensor_id, + tensor_size, self.buffer_size, + self.buffer_size / self.buffer_size_threshold * 100) return True @@ -211,7 +221,8 @@ def recv_tensor( device=self.device) self._recv(comm, tensor, rank ^ 1) - logger.info("๐Ÿ”ต[GET]Recv From %s, tensor_id: %s", remote_address, tensor_id) + logger.info("๐Ÿ”ต[GET]Recv From %s, tensor_id: %s", remote_address, + tensor_id) return tensor @@ -271,8 +282,9 @@ def _listen_for_requests(self): self.router_socket.send_multipart( [remote_address, b"1"]) logger.warning( - "๐Ÿšง[PUT]Recv Tensor, Out Of Memory, %s ๐Ÿ‘ˆ %s, data: %s", - self.zmq_address, remote_address.decode(), data) + "๐Ÿšง[PUT]Recv Tensor, Out Of Memory,%s ๐Ÿ‘ˆ %s, " + "data: %s", self.zmq_address, + remote_address.decode(), data) elif data["cmd"] == "GET": tensor_id = data["tensor_id"] @@ -280,10 +292,8 @@ def _listen_for_requests(self): tensor = self.send_store.pop(tensor_id, None) if tensor is not None: data = { - "ret": - 0, - "shape": - tensor.shape, + "ret": 0, + "shape": tensor.shape, "dtype": str(tensor.dtype).replace("torch.", "") } @@ -293,12 +303,10 @@ def _listen_for_requests(self): data = {"ret": 1} self.router_socket.send_multipart( - [remote_address, - msgpack.dumps(data)]) + [remote_address, msgpack.dumps(data)]) if data["ret"] == 0: - self._send(comm, tensor.to(self.device), - rank ^ 1) + self._send(comm, tensor.to(self.device), rank ^ 1) logger.info( "๐Ÿ”ต[GET]Send Tensor, %s ๐Ÿ‘‰ %s, rank: %s, data: %s", From fd596dc7cc7774fa2d2ed2851086ec86d9289ddb Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 14 Apr 2025 22:09:01 +0800 Subject: [PATCH 024/155] send_queue Signed-off-by: Abatom --- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 28c808d5d5e0..b73e94b3636a 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -72,7 +72,7 @@ def __init__(self, torch.Tensor] = {} # tensor_id: torch.Tensor else: # PUT or PUT_ASYNC - self.send_store: Deque[ + self.send_queue: Deque[ List[Any]] = deque() # tensor_id: torch.Tensor if self.send_type == "PUT_ASYNC": self._send_thread = threading.Thread(target=self._send_async, @@ -88,6 +88,7 @@ def __init__(self, self.buffer_size_threshold = self.config.kv_buffer_size self.send_store_cv = threading.Condition() + self.send_queue_cv = threading.Condition() self.recv_store_cv = threading.Condition() self.comm_cv = threading.Condition() @@ -142,9 +143,9 @@ def send_tensor( if self.send_type == "PUT": return self._send_sync(tensor_id, tensor, remote_address) elif self.send_type == "PUT_ASYNC": - with self.send_store_cv: - self.send_store.append([tensor_id, remote_address, tensor]) - self.send_store_cv.notify() + with self.send_queue_cv: + self.send_queue.append([tensor_id, remote_address, tensor]) + self.send_queue_cv.notify() else: # GET with self.send_store_cv: tensor_size = tensor.element_size() * tensor.numel() @@ -320,10 +321,10 @@ def _listen_for_requests(self): # NCCL used in TP/PP, which can lead to deadlock issues. 
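# A minimal standalone sketch (not part of the diff) of the PUT_ASYNC path
# introduced above: callers only append to send_queue under the condition
# variable, and a daemon thread drains the queue and performs the blocking
# send, so the request thread never waits on the transfer itself.
import threading
from collections import deque

send_queue = deque()
send_queue_cv = threading.Condition()
sent = []


def send_sync(item):
    sent.append(item)  # stands in for the blocking ZMQ/NCCL send


def send_async_worker():
    while True:
        with send_queue_cv:
            while not send_queue:
                send_queue_cv.wait()
            item = send_queue.popleft()
        if item is None:  # sentinel used only to stop this sketch
            return
        send_sync(item)


worker = threading.Thread(target=send_async_worker, daemon=True)
worker.start()

for tensor_id in ("req0keys", "req0values", "req0hidden", None):
    with send_queue_cv:
        send_queue.append(tensor_id)
        send_queue_cv.notify()

worker.join()
assert sent == ["req0keys", "req0values", "req0hidden"]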
def _send_async(self): while True: - with self.send_store_cv: - while not self.send_store: - self.send_store_cv.wait() - tensor_id, remote_address, tensor = self.send_store.popleft() + with self.send_queue_cv: + while not self.send_queue: + self.send_queue_cv.wait() + tensor_id, remote_address, tensor = self.send_queue.popleft() self._send_sync(tensor_id, tensor, remote_address) def _send_sync( @@ -349,9 +350,9 @@ def _send_sync( response = sock.recv() if response != b"0": - # with self.send_store_cv: - # self.send_store.append([tensor_id, remote_address, tensor]) - # self.send_store_cv.notify() + # with self.send_queue_cv: + # self.send_queue.append([tensor_id, remote_address, tensor]) + # self.send_queue_cv.notify() logger.warning( "๐ŸšงSend Tensor, Peer Out Of Memory/Threshold, %s ๐Ÿ‘‰ %s, " "MyRank: %s, data: %s, tensor: %s, size: %fGB, response: %s", From fe81aaefa36bf2a2ab9f8fd11c18c874fffe1288 Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 15 Apr 2025 11:55:54 +0800 Subject: [PATCH 025/155] modify log Signed-off-by: Abatom --- .../kv_transfer/kv_connector/p2p_connector.py | 4 +- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 69 ++++++++++--------- 2 files changed, 40 insertions(+), 33 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py index 5834c9a96f3f..f7204ebc6b29 100644 --- a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py @@ -274,7 +274,7 @@ def recv_kv_caches_and_hidden_states( @staticmethod def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: - logger.info("parse_request_id, request_id: %s, is_prefill: %s", + logger.debug("parse_request_id, request_id: %s, is_prefill: %s", request_id, is_prefill) # Regular expression to match the string hostname and integer port if is_prefill: @@ -289,7 +289,7 @@ def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: ip = match.group(1) port = int(match.group(2)) - logger.info("parse_request_id, request_id: %s, ip: %s, port: %s", + logger.debug("parse_request_id, request_id: %s, ip: %s, port: %s", request_id, ip, str(port)) return ip, port raise ValueError( diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index b73e94b3636a..128d92bd5640 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -28,6 +28,7 @@ def __init__(self, port_offset: int = 0, library_path: Optional[str] = None) -> None: self.config = config + self.rank = port_offset self.local_rank = local_rank self.device = torch.device(f"cuda:{self.local_rank}") self.nccl = NCCLLibrary(library_path) @@ -110,7 +111,7 @@ def _create_connect(self, remote_address: typing.Optional[str] = None): sock.connect(f"tcp://{remote_address}") self.socks[remote_address] = sock if remote_address in self.comms: - logger.info("๐Ÿ‘‹comm exists, remote_address: %s, comms: %s", + logger.info("๐Ÿ‘‹comm exists, remote_address:%s, comms:%s", remote_address, self.comms) return sock, self.comms[remote_address] @@ -123,7 +124,7 @@ def _create_connect(self, remote_address: typing.Optional[str] = None): comm: ncclComm_t = self.nccl.ncclCommInitRank( 2, unique_id, rank) self.comms[remote_address] = (comm, rank) - logger.info("๐ŸคncclCommInitRank Success, %s ๐Ÿ‘‰ %s, MyRank: %s", + logger.info("๐ŸคncclCommInitRank Success, %s๐Ÿ‘‰%s, MyRank: %s", 
self.zmq_address, remote_address, rank) return self.socks[remote_address], self.comms[remote_address] @@ -157,18 +158,19 @@ def send_tensor( ) * oldest_tenser.numel() self.buffer_size -= oldest_tenser_size logger.info( - "โ›”[GET]Send to %s, tensor_id: %s, tensor_size: %d," - "buffer_size: %d, oldest_tenser_size: %d", + "โ›”[GET]Send to %s, tensor_id:%s, tensor_size:%d," + " buffer_size:%d, oldest_tenser_size:%d, rank:%d", remote_address, tensor_id, tensor_size, - self.buffer_size, oldest_tenser_size) + self.buffer_size, oldest_tenser_size, self.rank) self.send_store[tensor_id] = tensor self.buffer_size += tensor_size logger.info( - "๐Ÿ”ต[GET]Send to %s, tensor_id: %s, tensor_size: %d," - "buffer_size: %d(%.2f%%)", remote_address, tensor_id, - tensor_size, self.buffer_size, - self.buffer_size / self.buffer_size_threshold * 100) + "๐Ÿ”ต[GET]Send to %s, tensor_id:%s, tensor_size:%d, " + "buffer_size:%d(%.2f%%), rank:%d", remote_address, + tensor_id, tensor_size, self.buffer_size, + self.buffer_size / self.buffer_size_threshold * 100, + self.rank) return True @@ -187,14 +189,16 @@ def recv_tensor( if tensor is not None: self.buffer_size -= (tensor.element_size() * tensor.numel()) logger.info( - "๐Ÿ”ต[PUT]Recv From %s, tensor_id: %s, shape: %s, " - "duration: %.3fms, size: %.3fGB", remote_address, + "๐Ÿ”ต[PUT]Recv From %s, tensor_id:%s, shape:%s, " + "duration:%.3fms, size:%.3fGB, rank:%d", remote_address, tensor_id, tensor.shape, duration * 1000, - tensor.element_size() * tensor.numel() / 1024**3) + tensor.element_size() * tensor.numel() / 1024**3, + self.rank) else: logger.warning( - "๐Ÿšง[PUT]Recv From %s, tensor_id: %s, duration: %.3fms", - remote_address, tensor_id, duration * 1000) + "๐Ÿ”ด[PUT]Recv From %s, tensor_id:%s, duration:%.3fms, " + "rank:%d", remote_address, tensor_id, duration * 1000, + self.rank) return tensor # GET @@ -213,7 +217,7 @@ def recv_tensor( message = sock.recv() data = msgpack.loads(message) if data["ret"] != 0: - logger.info("๐Ÿšง[GET]Recv From %s, tensor_id: %s, ret: %d", + logger.warning("๐Ÿ”ด[GET]Recv From %s, tensor_id: %s, ret: %d", remote_address, tensor_id, data["ret"]) return None @@ -222,8 +226,11 @@ def recv_tensor( device=self.device) self._recv(comm, tensor, rank ^ 1) - logger.info("๐Ÿ”ต[GET]Recv From %s, tensor_id: %s", remote_address, - tensor_id) + logger.info("๐Ÿ”ต[GET]Recv From %s, tensor_id:%s, shape:%s, " + "size:%.3fGB, rank:%d", remote_address, + tensor_id, tensor.shape, + tensor.element_size() * tensor.numel() / 1024**3, + self.rank) return tensor @@ -233,7 +240,7 @@ def _listen_for_requests(self): if self.router_socket in socks: remote_address, message = self.router_socket.recv_multipart() data = msgpack.loads(message) - logger.debug("Received message from %s, data: %s", + logger.debug("Received message from %s, data:%s", remote_address.decode(), data) if data["cmd"] == "NEW": unique_id = self.nccl.unique_id_from_bytes( @@ -244,7 +251,7 @@ def _listen_for_requests(self): 2, unique_id, rank) self.comms[remote_address.decode()] = (comm, rank) logger.info( - "๐ŸคncclCommInitRank Success, %s ๐Ÿ‘ˆ %s, MyRank: %s", + "๐ŸคncclCommInitRank Success, %s๐Ÿ‘ˆ%s, MyRank:%s", self.zmq_address, remote_address.decode(), rank) elif data["cmd"] == "PUT": try: @@ -259,8 +266,8 @@ def _listen_for_requests(self): self.router_socket.send_multipart( [remote_address, b"2"]) logger.warning( - "๐Ÿšง[PUT]Recv Tensor, Out Of Threshold, " - "%s ๐Ÿ‘ˆ %s, data: %s", self.zmq_address, + "๐Ÿ”ด[PUT]Recv Tensor, Out Of Threshold, " + "%s๐Ÿ‘ˆ%s, data:%s", self.zmq_address, 
remote_address.decode(), data) continue @@ -275,16 +282,16 @@ def _listen_for_requests(self): self.recv_store[tensor_id] = tensor self.recv_store_cv.notify() logger.info( - "๐Ÿ”ต[PUT]Recv Tensor, %s ๐Ÿ‘ˆ %s, rank: %s, data: %s, " - "tensor: %s", self.zmq_address, + "๐Ÿ”ต[PUT]Recv Tensor, %s๐Ÿ‘ˆ%s, MyRank:%s, data:%s, " + "shape:%s", self.zmq_address, remote_address.decode(), rank, data, tensor.shape) except torch.cuda.OutOfMemoryError: self.router_socket.send_multipart( [remote_address, b"1"]) logger.warning( - "๐Ÿšง[PUT]Recv Tensor, Out Of Memory,%s ๐Ÿ‘ˆ %s, " - "data: %s", self.zmq_address, + "๐Ÿ”ด[PUT]Recv Tensor, Out Of Memory, %s๐Ÿ‘ˆ%s, " + "data:%s", self.zmq_address, remote_address.decode(), data) elif data["cmd"] == "GET": @@ -310,11 +317,11 @@ def _listen_for_requests(self): self._send(comm, tensor.to(self.device), rank ^ 1) logger.info( - "๐Ÿ”ต[GET]Send Tensor, %s ๐Ÿ‘‰ %s, rank: %s, data: %s", + "๐Ÿ”ต[GET]Send Tensor, %s๐Ÿ‘‰%s, MyRank:%s, data:%s", self.zmq_address, remote_address.decode(), rank, data) else: logger.warning( - "๐ŸšงUnexpected, Received message from %s, data: %s", + "๐ŸšงUnexpected, Received message from %s, data:%s", remote_address, data) # Asynchronous sending may cause conflicts between P2P NCCL and @@ -354,22 +361,22 @@ def _send_sync( # self.send_queue.append([tensor_id, remote_address, tensor]) # self.send_queue_cv.notify() logger.warning( - "๐ŸšงSend Tensor, Peer Out Of Memory/Threshold, %s ๐Ÿ‘‰ %s, " - "MyRank: %s, data: %s, tensor: %s, size: %fGB, response: %s", + "๐Ÿ”ดSend Tensor, Peer Out Of Memory/Threshold, %s ๐Ÿ‘‰ %s, " + "MyRank:%s, data:%s, tensor:%s, size:%fGB, response:%s", self.zmq_address, remote_address, rank, data, tensor.shape, tensor.element_size() * tensor.numel() / 1024**3, response.decode()) return False self._send(comm, tensor.to(self.device), rank ^ 1) - logger.info("๐Ÿ”ตSend Tensor, %s ๐Ÿ‘‰ %s, MyRank: %s, data: %s, tensor: %s", + logger.info("๐Ÿ”ตSend Tensor, %s๐Ÿ‘‰%s, MyRank:%s, data:%s, tensor:%s", self.zmq_address, remote_address, rank, data, tensor.shape) return True def _ping(self): sock = self.context.socket(zmq.DEALER) sock.setsockopt_string(zmq.IDENTITY, self.zmq_address) - logger.debug("ping start, zmq_address: %s", self.zmq_address) + logger.debug("ping start, zmq_address:%s", self.zmq_address) sock.connect(f"tcp://{self.proxy_address}") data = { "type": "P" if self.config.is_kv_producer else "D", From 21818fe075183c6aba2c52292df1586e1b5c4b20 Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 15 Apr 2025 12:27:25 +0800 Subject: [PATCH 026/155] fix bug for PUT_ASYNC Signed-off-by: Abatom --- .../kv_transfer/kv_connector/p2p_connector.py | 4 +-- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 26 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py index f7204ebc6b29..f60fb1c5c0b7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py @@ -275,7 +275,7 @@ def recv_kv_caches_and_hidden_states( @staticmethod def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: logger.debug("parse_request_id, request_id: %s, is_prefill: %s", - request_id, is_prefill) + request_id, is_prefill) # Regular expression to match the string hostname and integer port if is_prefill: pattern = r"___decode_addr_(.*):(\d+)" @@ -290,7 +290,7 @@ def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: 
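The routing information both sides need is carried inside the request id itself: the proxy appends address tags and parse_request_id recovers the peer's host and base port with a regular expression. A hedged, standalone version of that parsing (only the prefill-side pattern appears verbatim in the hunk above; the decode-side tag is assumed to be symmetric):

import re

def parse_peer_address(request_id: str, is_prefill: bool = True):
    # Prefill looks up its decode peer and vice versa. The exact tag
    # names must match whatever the proxy embeds in the request id.
    pattern = (r"___decode_addr_(.*):(\d+)" if is_prefill
               else r"___prefill_addr_(.*):(\d+)")
    match = re.search(pattern, request_id)
    if match is None:
        raise ValueError(f"no peer address in request id {request_id}")
    return match.group(1), int(match.group(2))

# Hypothetical id: parse_peer_address("x___decode_addr_10.0.0.2:23001_y")
# returns ("10.0.0.2", 23001).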
port = int(match.group(2)) logger.debug("parse_request_id, request_id: %s, ip: %s, port: %s", - request_id, ip, str(port)) + request_id, ip, str(port)) return ip, port raise ValueError( f"Request id {request_id} does not contain hostname and port") diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 128d92bd5640..30e620998752 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -65,6 +65,11 @@ def __init__(self, self.poller = zmq.Poller() self.poller.register(self.router_socket, zmq.POLLIN) + self.send_store_cv = threading.Condition() + self.send_queue_cv = threading.Condition() + self.recv_store_cv = threading.Condition() + self.comm_cv = threading.Condition() + # The sending type includes tree mutually exclusive options: # PUT, GET, PUT_ASYNC. self.send_type = self.config.get_from_extra_config("send_type", "PUT") @@ -88,11 +93,6 @@ def __init__(self, self.buffer_size = 0 self.buffer_size_threshold = self.config.kv_buffer_size - self.send_store_cv = threading.Condition() - self.send_queue_cv = threading.Condition() - self.recv_store_cv = threading.Condition() - self.comm_cv = threading.Condition() - self._listener_thread = threading.Thread( target=self._listen_for_requests, daemon=True) self._listener_thread.start() @@ -218,7 +218,7 @@ def recv_tensor( data = msgpack.loads(message) if data["ret"] != 0: logger.warning("๐Ÿ”ด[GET]Recv From %s, tensor_id: %s, ret: %d", - remote_address, tensor_id, data["ret"]) + remote_address, tensor_id, data["ret"]) return None tensor = torch.empty(data["shape"], @@ -226,11 +226,10 @@ def recv_tensor( device=self.device) self._recv(comm, tensor, rank ^ 1) - logger.info("๐Ÿ”ต[GET]Recv From %s, tensor_id:%s, shape:%s, " - "size:%.3fGB, rank:%d", remote_address, - tensor_id, tensor.shape, - tensor.element_size() * tensor.numel() / 1024**3, - self.rank) + logger.info( + "๐Ÿ”ต[GET]Recv From %s, tensor_id:%s, shape:%s, " + "size:%.3fGB, rank:%d", remote_address, tensor_id, tensor.shape, + tensor.element_size() * tensor.numel() / 1024**3, self.rank) return tensor @@ -317,8 +316,9 @@ def _listen_for_requests(self): self._send(comm, tensor.to(self.device), rank ^ 1) logger.info( - "๐Ÿ”ต[GET]Send Tensor, %s๐Ÿ‘‰%s, MyRank:%s, data:%s", - self.zmq_address, remote_address.decode(), rank, data) + "๐Ÿ”ต[GET]Send Tensor, %s๐Ÿ‘‰%s, " + "MyRank:%s, data:%s", self.zmq_address, + remote_address.decode(), rank, data) else: logger.warning( "๐ŸšงUnexpected, Received message from %s, data:%s", From dcb637b42d8e063569614796dc3d8cabfdabc2ec Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 15 Apr 2025 15:20:35 +0800 Subject: [PATCH 027/155] modify log Signed-off-by: Abatom --- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 30e620998752..5c2412568702 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -167,10 +167,10 @@ def send_tensor( self.buffer_size += tensor_size logger.info( "๐Ÿ”ต[GET]Send to %s, tensor_id:%s, tensor_size:%d, " - "buffer_size:%d(%.2f%%), rank:%d", remote_address, - tensor_id, tensor_size, self.buffer_size, - self.buffer_size / self.buffer_size_threshold * 100, - self.rank) + "shape:%s, rank:%d, buffer_size:%d(%.2f%%)", + remote_address, 
tensor_id, tensor_size, + self.buffer_size, tensor.shape, self.rank, + self.buffer_size / self.buffer_size_threshold * 100) return True @@ -224,12 +224,15 @@ def recv_tensor( tensor = torch.empty(data["shape"], dtype=getattr(torch, data["dtype"]), device=self.device) - self._recv(comm, tensor, rank ^ 1) + start_time = time.time() + self._recv(comm, tensor, rank ^ 1) + duration = time.time() - start_time logger.info( - "๐Ÿ”ต[GET]Recv From %s, tensor_id:%s, shape:%s, " + "๐Ÿ”ต[GET]Recv From %s, tensor_id:%s, shape:%s, duration:%.3fms, " "size:%.3fGB, rank:%d", remote_address, tensor_id, tensor.shape, - tensor.element_size() * tensor.numel() / 1024**3, self.rank) + duration * 1000, tensor.element_size() * tensor.numel() / 1024**3, + self.rank) return tensor From 5cbb299fadd294acec7ca988efe7eeffc6efb37f Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 15 Apr 2025 15:46:41 +0800 Subject: [PATCH 028/155] format Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 5c2412568702..a2479ff44c80 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -231,8 +231,8 @@ def recv_tensor( logger.info( "๐Ÿ”ต[GET]Recv From %s, tensor_id:%s, shape:%s, duration:%.3fms, " "size:%.3fGB, rank:%d", remote_address, tensor_id, tensor.shape, - duration * 1000, tensor.element_size() * tensor.numel() / 1024**3, - self.rank) + duration * 1000, + tensor.element_size() * tensor.numel() / 1024 ** 3, self.rank) return tensor From 976f51bf0af57174cfa80610e2b7f70bf6360e99 Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 15 Apr 2025 15:55:45 +0800 Subject: [PATCH 029/155] format Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index a2479ff44c80..0efb5edff050 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -232,7 +232,7 @@ def recv_tensor( "๐Ÿ”ต[GET]Recv From %s, tensor_id:%s, shape:%s, duration:%.3fms, " "size:%.3fGB, rank:%d", remote_address, tensor_id, tensor.shape, duration * 1000, - tensor.element_size() * tensor.numel() / 1024 ** 3, self.rank) + tensor.element_size() * tensor.numel() / 1024**3, self.rank) return tensor From a2470ac4fbab7ed4ea9070c7c7a26b231040c1d7 Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 15 Apr 2025 17:32:29 +0800 Subject: [PATCH 030/155] fix bug Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 0efb5edff050..ad00d82ffca3 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -169,7 +169,7 @@ def send_tensor( "๐Ÿ”ต[GET]Send to %s, tensor_id:%s, tensor_size:%d, " "shape:%s, rank:%d, buffer_size:%d(%.2f%%)", remote_address, tensor_id, tensor_size, - self.buffer_size, tensor.shape, self.rank, + tensor.shape, self.rank, self.buffer_size, self.buffer_size / self.buffer_size_threshold * 100) return True From b0facb9dd9d89462cc6fdce8e696acf0e7ca07e3 Mon Sep 
17 00:00:00 2001 From: Abatom Date: Tue, 15 Apr 2025 18:31:16 +0800 Subject: [PATCH 031/155] format Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index ad00d82ffca3..49c42ef52024 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -168,8 +168,8 @@ def send_tensor( logger.info( "๐Ÿ”ต[GET]Send to %s, tensor_id:%s, tensor_size:%d, " "shape:%s, rank:%d, buffer_size:%d(%.2f%%)", - remote_address, tensor_id, tensor_size, - tensor.shape, self.rank, self.buffer_size, + remote_address, tensor_id, tensor_size, tensor.shape, + self.rank, self.buffer_size, self.buffer_size / self.buffer_size_threshold * 100) return True From 20f2e7ac38f46a167518a2943941f06fc43ff914 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 16 Apr 2025 21:17:19 +0800 Subject: [PATCH 032/155] fix bug Signed-off-by: Abatom --- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 49c42ef52024..78b3da214ac2 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -182,9 +182,12 @@ def recv_tensor( if self.send_type == "PUT" or self.send_type == "PUT_ASYNC": start_time = time.time() with self.recv_store_cv: - if tensor_id not in self.recv_store: - self.recv_store_cv.wait(timeout=0.001) - tensor = self.recv_store.pop(tensor_id, None) + while tensor_id not in self.recv_store: + self.recv_store_cv.wait() + # TODO:Abatom, To avoid an overly large dictionary. 
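On the receive side, recv_tensor now parks on recv_store_cv until the listener thread has deposited the tensor under its id, then leaves a None placeholder behind so the stored tensor can be released without forgetting the key. A minimal sketch of that rendezvous, with illustrative names:

import threading

class TensorMailbox:
    def __init__(self):
        self._store = {}
        self._cv = threading.Condition()

    def deposit(self, tensor_id, tensor):
        # Called from the listener thread once the transfer completes.
        with self._cv:
            self._store[tensor_id] = tensor
            self._cv.notify_all()

    def wait_for(self, tensor_id):
        # Called from the model runner; blocks until the tensor arrives.
        with self._cv:
            while tensor_id not in self._store:
                self._cv.wait()
            tensor = self._store[tensor_id]
            self._store[tensor_id] = None   # keep the key, drop the payload
            return tensor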
+ # tensor = self.recv_store.pop(tensor_id) + tensor = self.recv_store[tensor_id] + self.recv_store[tensor_id] = None duration = time.time() - start_time if tensor is not None: self.buffer_size -= (tensor.element_size() * tensor.numel()) @@ -271,22 +274,22 @@ def _listen_for_requests(self): "๐Ÿ”ด[PUT]Recv Tensor, Out Of Threshold, " "%s๐Ÿ‘ˆ%s, data:%s", self.zmq_address, remote_address.decode(), data) - continue - - self.buffer_size += tensor_size - self.router_socket.send_multipart( - [remote_address, b"0"]) - comm, rank = self.comms[remote_address.decode()] - self._recv(comm, tensor, rank ^ 1) + tensor = None + else: + self.buffer_size += tensor_size + self.router_socket.send_multipart( + [remote_address, b"0"]) + comm, rank = self.comms[remote_address.decode()] + self._recv(comm, tensor, rank ^ 1) + logger.info( + "๐Ÿ”ต[PUT]Recv Tensor, %s๐Ÿ‘ˆ%s, MyRank:%s, data:%s, " + "shape:%s", self.zmq_address, + remote_address.decode(), rank, data, tensor.shape) tensor_id = data["tensor_id"] with self.recv_store_cv: self.recv_store[tensor_id] = tensor self.recv_store_cv.notify() - logger.info( - "๐Ÿ”ต[PUT]Recv Tensor, %s๐Ÿ‘ˆ%s, MyRank:%s, data:%s, " - "shape:%s", self.zmq_address, - remote_address.decode(), rank, data, tensor.shape) except torch.cuda.OutOfMemoryError: self.router_socket.send_multipart( From f1a518300d12d882729c3dec8b72a4c489d547f7 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 17 Apr 2025 11:44:10 +0800 Subject: [PATCH 033/155] fix bug Signed-off-by: Abatom --- .../kv_transfer/kv_connector/p2p_connector.py | 160 ++++-------------- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 21 +-- 2 files changed, 43 insertions(+), 138 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py index f60fb1c5c0b7..4b532b84d69d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py @@ -5,11 +5,11 @@ import torch -import vllm.envs as envs -from vllm import _custom_ops as ops from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase from vllm.distributed.kv_transfer.kv_pipe.p2p_nccl_pipe import P2pNcclPipe +from vllm.distributed.kv_transfer.kv_connector.utils import ( + model_aware_kv_ops_helper as kv_helper) from vllm.logger import init_logger from vllm.sequence import IntermediateTensors @@ -29,9 +29,7 @@ def __init__( ): self.rank = rank self.config = config.kv_transfer_config - self.tp_size = config.parallel_config.tensor_parallel_size - self.is_deepseek_mla = config.model_config.is_deepseek_mla - self.use_mla_opt = not envs.VLLM_MLA_DISABLE + self.kv_helper = kv_helper(config) assert self.config.kv_connector == "P2pConnector" @@ -52,69 +50,25 @@ def send_kv_caches_and_hidden_states( hidden_or_intermediate_states: Union[torch.Tensor, IntermediateTensors], ) -> None: - # input_tokens_tensor = model_input.input_tokens seq_lens = model_input.attn_metadata.seq_lens slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten() - num_prefill_tokens = model_input.attn_metadata.num_prefill_tokens request_ids = list(model_input.request_ids_to_seq_ids.keys()) start_layer = model_executable.model.start_layer end_layer = model_executable.model.end_layer + num_heads, head_size = self.kv_helper.get_model_args(model_executable) - model_config = model_executable.model.config - num_heads = int(model_config.num_key_value_heads / self.tp_size) - hidden_size = model_config.hidden_size - 
num_attention_heads = model_config.num_attention_heads - - # Deepseek's MLA (Multi-head Latent Attention) uses two different - # kv_cache shapes based on whether VLLM_MLA_DISABLE is set to 0. - # When VLLM_MLA_DISABLE=0 (default), forward absorb is applied, - # resulting in a kv_cache shape of [num_blks, blk_size, 1, - # kv_lora_rank + qk_rope_head_dim]. - # When VLLM_MLA_DISABLE=1, standard FA is used instead, leading - # to a kv_cache shape of [2, num_blks, blk_size, - # num_key_value_heads / tp, qk_nope_head_dim + qk_rope_head_dim]. - # For more details, see vllm/attention/backends/mla/common.py. - if self.is_deepseek_mla and self.use_mla_opt: - head_size = model_config.kv_lora_rank + \ - model_config.qk_rope_head_dim - num_heads = 1 - elif self.is_deepseek_mla and not self.use_mla_opt: - head_size = model_config.qk_nope_head_dim + \ - model_config.qk_rope_head_dim - else: - head_size = getattr(model_config, "head_dim", - int(hidden_size // num_attention_heads)) - - # query_lens contains new KV caches that are added to vLLM. - # so we will send them to decode instance - # FIXME(Kuntai): This assume that all requests are prefill. for idx, slen in enumerate(seq_lens): start_pos = sum(seq_lens[:idx]) end_pos = start_pos + slen - if start_pos >= num_prefill_tokens: - # vllm/worker/model_runner.py::_prepare_model_input_tensors: - # - input_tokens[:num_prefill_tokens] contains prefill tokens. - # - input_tokens[num_prefill_tokens:] contains decode tokens. - logger.warning("You have some decode requests while using " - "SimpleConnector. Their KVCache won't be sent.") - break - # current_tokens = input_tokens_tensor[start_pos:end_pos] - keys, values = [], [] for layer_id in range(start_layer, end_layer): kv_cache = kv_caches[layer_id - start_layer] - - if self.is_deepseek_mla and self.use_mla_opt: - key_cache = kv_cache.reshape(-1, num_heads, head_size) - value_cache = kv_cache.reshape(-1, num_heads, head_size) - else: - key_cache = kv_cache[0].reshape(-1, num_heads, head_size) - value_cache = kv_cache[1].reshape(-1, num_heads, head_size) - + key_cache, value_cache = self.kv_helper.get_kv_from_cache( + kv_cache, num_heads, head_size) current_slot_mapping = slot_mapping_flat[start_pos:end_pos] keys.append(key_cache[current_slot_mapping].unsqueeze(0)) @@ -122,14 +76,13 @@ def send_kv_caches_and_hidden_states( keys = torch.cat(keys, dim=0) values = torch.cat(values, dim=0) + kvcache = torch.stack((keys, values), dim=0) request_id = request_ids[idx] ip, port = self.parse_request_id(request_id, True) remote_address = ip + ":" + str(port + self.rank) - self.p2p_nccl_pipe.send_tensor(request_id + "keys", keys, - remote_address) - self.p2p_nccl_pipe.send_tensor(request_id + "values", values, + self.p2p_nccl_pipe.send_tensor(request_id + "kv", kvcache, remote_address) self.p2p_nccl_pipe.send_tensor( request_id + "hidden", @@ -144,29 +97,16 @@ def recv_kv_caches_and_hidden_states( kv_caches: List[torch.Tensor] ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, "ModelInputForGPUWithSamplingMetadata"]: - - # When bypass_model_exec is set to False, it means that at least for one - # request its corresponding KV cache or hidden state is missing. - # In this case we need to do prefilling to recompute missing KV cache - # and hidden states. 
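With the kv_helper refactor, a request's cache travels as one stacked tensor keyed "<request_id>kv" (plus the hidden states under "<request_id>hidden"), and the consumer splits it back per layer before writing into paged memory. A small shape check with hypothetical sizes:

import torch

num_layers, num_tokens, num_heads, head_size = 4, 16, 8, 64   # illustrative

keys = torch.randn(num_layers, num_tokens, num_heads, head_size)
values = torch.randn(num_layers, num_tokens, num_heads, head_size)
kvcache = torch.stack((keys, values), dim=0)   # [2, layers, tokens, heads, head]

# Consumer side: unpack per layer, as recv_kv_caches_and_hidden_states does.
for layer_id in range(num_layers):
    remote_k, remote_v = kvcache[0][layer_id], kvcache[1][layer_id]
    assert remote_k.shape == (num_tokens, num_heads, head_size)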
bypass_model_exec = True - - model_config = model_executable.model.config - input_tokens_tensor = model_input.input_tokens seq_lens = model_input.attn_metadata.seq_lens num_prefill_tokens = model_input.attn_metadata.num_prefill_tokens slot_mapping = model_input.attn_metadata.slot_mapping.flatten() request_ids = list(model_input.request_ids_to_seq_ids.keys()) - + start_layer = model_executable.model.start_layer + end_layer = model_executable.model.end_layer hidden_or_intermediate_states_for_one_req = [] - input_tokens_list = [] - num_computed_tokens_list = [] - start_pos_list = [] - - # enumerate different requests - # FIXME(Kuntai): This impl assumes that all requests are prefill. for idx, slen in enumerate(seq_lens): start_pos = sum(seq_lens[:idx]) end_pos = start_pos + slen @@ -178,86 +118,50 @@ def recv_kv_caches_and_hidden_states( # - input_tokens[num_prefill_tokens:] contains decode tokens. logger.warning("You should set --enable_chunked_prefill=False " "and --max_num_batched_tokens " - "should be equal to --max_seq_len_to_capture") + "should be equal to max_seq_len_to_capture") bypass_model_exec = False assert start_pos == num_prefill_tokens break current_tokens = input_tokens_tensor[start_pos:end_pos] - num_tokens = slen - - # collecting data for rebuilding the input - input_tokens_list.append(current_tokens) - start_pos_list.append(start_pos) request_id = request_ids[idx] ip, port = self.parse_request_id(request_id, False) remote_address = ip + ":" + str(port + self.rank) - keys = self.p2p_nccl_pipe.recv_tensor(request_id + "keys", + kvcache = self.p2p_nccl_pipe.recv_tensor(request_id + "kv", remote_address) - values = self.p2p_nccl_pipe.recv_tensor(request_id + "values", - remote_address) hidden = self.p2p_nccl_pipe.recv_tensor(request_id + "hidden", remote_address) - num_computed_tokens = current_tokens.shape[0] - num_computed_tokens_list.append(num_computed_tokens) - - # check if both KV cache and the hidden states are received - # If not, need to redo the forwarding to compute missing states - if not all([(num_computed_tokens == num_tokens), keys is not None, - values is not None, hidden is not None]): + if kvcache is None or hidden is None: + # didn't find any match. bypass_model_exec = False - break + continue + + num_computed_tokens = current_tokens.shape[0] # update the end position based on how many tokens are cached. 
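Note how the peer address is formed on both paths: the request id only carries the peer's base kv_port, and each tensor-parallel worker adds its own rank, so rank i of the prefill instance always talks to rank i of the decode instance. A tiny illustration with made-up values:

base_ip, base_port = "10.0.0.2", 23001   # hypothetical decode instance
tp_size = 2

peer_addresses = [f"{base_ip}:{base_port + rank}" for rank in range(tp_size)]
assert peer_addresses == ["10.0.0.2:23001", "10.0.0.2:23002"]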
end_pos = start_pos + num_computed_tokens - # put received KV caches into paged memory - for i in range(model_executable.model.start_layer, - model_executable.model.end_layer): - - kv_cache = kv_caches[i - model_executable.model.start_layer] - layer = model_executable.model.layers[i] - - if self.is_deepseek_mla and self.use_mla_opt: - layer.self_attn.attn = layer.self_attn.mla_attn - k_c_normed_k_pe = keys[ - i - model_executable.model.start_layer].to( - kv_cache.device).squeeze(1) - k_c_normed = k_c_normed_k_pe[:, :model_config.kv_lora_rank] - k_pe = k_c_normed_k_pe[:, model_config.kv_lora_rank:] - ops.concat_and_cache_mla( - k_c_normed, - k_pe, - kv_cache, - slot_mapping[start_pos:end_pos], - layer.self_attn.attn.kv_cache_dtype, - layer.self_attn.attn._k_scale, - ) - else: - key_cache, value_cache = kv_cache[0], kv_cache[1] - ops.reshape_and_cache_flash( - keys[i - model_executable.model.start_layer].to( - key_cache.device), - values[i - model_executable.model.start_layer].to( - value_cache.device), - key_cache, - value_cache, - slot_mapping[start_pos:end_pos], - layer.self_attn.attn.kv_cache_dtype, - layer.self_attn.attn._k_scale, - layer.self_attn.attn._v_scale, - ) + # call self.kv_store to get kv layer by layer + for layer_id in range(start_layer, end_layer): + layer = model_executable.model.layers[layer_id] + # get kvcache object + kv_cache = kv_caches[layer_id - start_layer] + + # get remote kvcache + remote_k, remote_v = kvcache[0][layer_id], kvcache[1][ + layer_id] + + self.kv_helper.put_kv_to_cache(model_executable, remote_k, + remote_v, layer, kv_cache, + slot_mapping, start_pos, + end_pos) hidden_or_intermediate_states_for_one_req.append(hidden) if not bypass_model_exec: - # Some of the KV cache is not retrieved - # Here we will fall back to normal model forwarding - # But optionally you can adjust model_input so that you only do - # prefilling on those tokens that are missing KV caches. logger.warning( "[rank%d]: Failed to receive all KVs and hidden " "states, redo model forwarding.", torch.distributed.get_rank()) @@ -295,5 +199,5 @@ def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: raise ValueError( f"Request id {request_id} does not contain hostname and port") - def close(self): + def close(self) -> None: self.p2p_nccl_pipe.close() diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 78b3da214ac2..5e49c23c9141 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -152,8 +152,7 @@ def send_tensor( tensor_size = tensor.element_size() * tensor.numel() while (self.buffer_size + tensor_size > self.buffer_size_threshold): - oldest_tenser_id = next(iter(self.send_store)) - oldest_tenser = self.send_store.pop(oldest_tenser_id) + _, oldest_tenser = self.send_store.popitem(last=False) oldest_tenser_size = oldest_tenser.element_size( ) * oldest_tenser.numel() self.buffer_size -= oldest_tenser_size @@ -184,10 +183,11 @@ def recv_tensor( with self.recv_store_cv: while tensor_id not in self.recv_store: self.recv_store_cv.wait() - # TODO:Abatom, To avoid an overly large dictionary. 
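In GET mode the producer-side buffer is bounded: before a new tensor is stored, the oldest entries are evicted until the new one fits under kv_buffer_size. A standalone sketch of that eviction loop (the byte accounting and names are illustrative):

from collections import OrderedDict

class BoundedSendStore:
    def __init__(self, capacity_bytes):
        self._store = OrderedDict()
        self._capacity = capacity_bytes
        self._used = 0

    def put(self, tensor_id, payload):
        size = len(payload)
        # Evict oldest entries until the new payload fits.
        while self._store and self._used + size > self._capacity:
            _, evicted = self._store.popitem(last=False)
            self._used -= len(evicted)
        self._store[tensor_id] = payload
        self._used += size

store = BoundedSendStore(capacity_bytes=8)
store.put("a", b"1234")
store.put("b", b"5678")
store.put("c", b"90")                      # evicts "a", the oldest entry
assert list(store._store) == ["b", "c"]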
- # tensor = self.recv_store.pop(tensor_id) tensor = self.recv_store[tensor_id] self.recv_store[tensor_id] = None + while len(self.recv_store) > 10000: + self.recv_store.popitem(last=False) + duration = time.time() - start_time if tensor is not None: self.buffer_size -= (tensor.element_size() * tensor.numel()) @@ -259,6 +259,7 @@ def _listen_for_requests(self): "๐ŸคncclCommInitRank Success, %s๐Ÿ‘ˆ%s, MyRank:%s", self.zmq_address, remote_address.decode(), rank) elif data["cmd"] == "PUT": + tensor_id = data["tensor_id"] try: tensor = torch.empty(data["shape"], dtype=getattr( @@ -281,24 +282,24 @@ def _listen_for_requests(self): [remote_address, b"0"]) comm, rank = self.comms[remote_address.decode()] self._recv(comm, tensor, rank ^ 1) - logger.info( + logger.debug( "๐Ÿ”ต[PUT]Recv Tensor, %s๐Ÿ‘ˆ%s, MyRank:%s, data:%s, " "shape:%s", self.zmq_address, remote_address.decode(), rank, data, tensor.shape) - tensor_id = data["tensor_id"] - with self.recv_store_cv: - self.recv_store[tensor_id] = tensor - self.recv_store_cv.notify() - except torch.cuda.OutOfMemoryError: self.router_socket.send_multipart( [remote_address, b"1"]) + tensor = None logger.warning( "๐Ÿ”ด[PUT]Recv Tensor, Out Of Memory, %s๐Ÿ‘ˆ%s, " "data:%s", self.zmq_address, remote_address.decode(), data) + with self.recv_store_cv: + self.recv_store[tensor_id] = tensor + self.recv_store_cv.notify() + elif data["cmd"] == "GET": tensor_id = data["tensor_id"] with self.send_store_cv: From 1c7485763d8f9012c52cd937ae5810c1f0fa8718 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 17 Apr 2025 11:58:39 +0800 Subject: [PATCH 034/155] format Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_connector/p2p_connector.py | 7 +++---- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 7 ++++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py index 4b532b84d69d..e59d3227f8ea 100644 --- a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py @@ -7,9 +7,9 @@ from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase -from vllm.distributed.kv_transfer.kv_pipe.p2p_nccl_pipe import P2pNcclPipe from vllm.distributed.kv_transfer.kv_connector.utils import ( model_aware_kv_ops_helper as kv_helper) +from vllm.distributed.kv_transfer.kv_pipe.p2p_nccl_pipe import P2pNcclPipe from vllm.logger import init_logger from vllm.sequence import IntermediateTensors @@ -130,7 +130,7 @@ def recv_kv_caches_and_hidden_states( remote_address = ip + ":" + str(port + self.rank) kvcache = self.p2p_nccl_pipe.recv_tensor(request_id + "kv", - remote_address) + remote_address) hidden = self.p2p_nccl_pipe.recv_tensor(request_id + "hidden", remote_address) @@ -151,8 +151,7 @@ def recv_kv_caches_and_hidden_states( kv_cache = kv_caches[layer_id - start_layer] # get remote kvcache - remote_k, remote_v = kvcache[0][layer_id], kvcache[1][ - layer_id] + remote_k, remote_v = kvcache[0][layer_id], kvcache[1][layer_id] self.kv_helper.put_kv_to_cache(model_executable, remote_k, remote_v, layer, kv_cache, diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 5e49c23c9141..e0bc514a17cf 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -283,9 +283,10 @@ def _listen_for_requests(self): 
comm, rank = self.comms[remote_address.decode()] self._recv(comm, tensor, rank ^ 1) logger.debug( - "๐Ÿ”ต[PUT]Recv Tensor, %s๐Ÿ‘ˆ%s, MyRank:%s, data:%s, " - "shape:%s", self.zmq_address, - remote_address.decode(), rank, data, tensor.shape) + "๐Ÿ”ต[PUT]Recv Tensor, %s๐Ÿ‘ˆ%s, MyRank:%s, " + "data:%s, shape:%s", self.zmq_address, + remote_address.decode(), rank, data, + tensor.shape) except torch.cuda.OutOfMemoryError: self.router_socket.send_multipart( From 41b0ae657ae192440a02b8f4a74e636be0353d36 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 17 Apr 2025 12:13:10 +0800 Subject: [PATCH 035/155] rm popitem Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index e0bc514a17cf..388824c0c8ca 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -152,7 +152,8 @@ def send_tensor( tensor_size = tensor.element_size() * tensor.numel() while (self.buffer_size + tensor_size > self.buffer_size_threshold): - _, oldest_tenser = self.send_store.popitem(last=False) + oldest_tenser_id = next(iter(self.send_store)) + oldest_tenser = self.send_store.pop(oldest_tenser_id) oldest_tenser_size = oldest_tenser.element_size( ) * oldest_tenser.numel() self.buffer_size -= oldest_tenser_size @@ -186,7 +187,7 @@ def recv_tensor( tensor = self.recv_store[tensor_id] self.recv_store[tensor_id] = None while len(self.recv_store) > 10000: - self.recv_store.popitem(last=False) + self.recv_store.pop(next(iter(self.recv_store))) duration = time.time() - start_time if tensor is not None: From ec364e5b4873a50c0d3872041ef5d53a36ce6b3c Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 17 Apr 2025 20:04:00 +0800 Subject: [PATCH 036/155] rm disagg_prefill_xpyd.sh Signed-off-by: Abatom --- .../disagg_xpyd/disagg_prefill_xpyd.sh | 108 ------------------ 1 file changed, 108 deletions(-) delete mode 100644 examples/online_serving/disagg_xpyd/disagg_prefill_xpyd.sh diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_xpyd.sh b/examples/online_serving/disagg_xpyd/disagg_prefill_xpyd.sh deleted file mode 100644 index 6918d9f3ac9b..000000000000 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_xpyd.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/bin/bash -set -xe - -# Trap the SIGINT signal (triggered by Ctrl+C) -trap 'cleanup' INT - -# Cleanup function -cleanup() { - echo "Caught Ctrl+C, cleaning up..." - # Cleanup commands - pgrep python | xargs kill -9 - pkill -f python - echo "Cleanup complete. Exiting." 
- exit 0 -} - -export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') - -python3 disagg_prefill_proxy_xpyd.py & - -MODEL_NAME=${HF_MODEL_NAME:-meta-llama/Meta-Llama-3.1-8B-Instruct} - -## 2P2D, TP=1 -## prefilling instance, which is the KV producer -#CUDA_VISIBLE_DEVICES=4 vllm serve $MODEL_NAME \ -# --host 0.0.0.0 \ -# --port 20001 \ -# --served-model-name base_model \ -# --max-model-len 8192 \ -# --gpu-memory-utilization 0.8 \ -# --kv-transfer-config \ -# '{"kv_connector":"P2pConnector","kv_role":"kv_producer","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20001"}}' & -# -## prefilling instance, which is the KV producer -#CUDA_VISIBLE_DEVICES=5 vllm serve $MODEL_NAME \ -# --host 0.0.0.0 \ -# --port 20002 \ -# --served-model-name base_model \ -# --max-model-len 8192 \ -# --gpu-memory-utilization 0.8 \ -# --kv-transfer-config \ -# '{"kv_connector":"P2pConnector","kv_role":"kv_producer","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20002"}}' & -# -## decoding instance, which is the KV consumer -#CUDA_VISIBLE_DEVICES=6 vllm serve $MODEL_NAME \ -# --host 0.0.0.0 \ -# --port 20003 \ -# --served-model-name base_model \ -# --max-model-len 8192 \ -# --gpu-memory-utilization 0.8 \ -# --kv-transfer-config \ -# '{"kv_connector":"P2pConnector","kv_role":"kv_consumer","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20003"}}' & -# -## decoding instance, which is the KV consumer -#CUDA_VISIBLE_DEVICES=7 vllm serve $MODEL_NAME \ -# --host 0.0.0.0 \ -# --port 20004 \ -# --served-model-name base_model \ -# --max-model-len 8192 \ -# --gpu-memory-utilization 0.8 \ -# --kv-transfer-config \ -# '{"kv_connector":"P2pConnector","kv_role":"kv_consumer","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20004"}}' & - - -# 2P2D, TP=2 -# prefilling instance, which is the KV producer -CUDA_VISIBLE_DEVICES=0,1 vllm serve $MODEL_NAME \ - --host 0.0.0.0 \ - --port 20001 \ - --tensor-parallel-size 2 \ - --served-model-name base_model \ - --max-model-len 8192 \ - --gpu-memory-utilization 0.8 \ - --kv-transfer-config \ - '{"kv_connector":"P2pConnector","kv_role":"kv_producer","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20001"}}' & - -# prefilling instance, which is the KV producer -CUDA_VISIBLE_DEVICES=2,3 vllm serve $MODEL_NAME \ - --host 0.0.0.0 \ - --port 20002 \ - --tensor-parallel-size 2 \ - --served-model-name base_model \ - --max-model-len 8192 \ - --gpu-memory-utilization 0.8 \ - --kv-transfer-config \ - '{"kv_connector":"P2pConnector","kv_role":"kv_producer","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20002"}}' & - -# decoding instance, which is the KV consumer -CUDA_VISIBLE_DEVICES=4,5 vllm serve $MODEL_NAME \ - --host 0.0.0.0 \ - --port 20003 \ - --tensor-parallel-size 2 \ - --served-model-name base_model \ - --max-model-len 8192 \ - --gpu-memory-utilization 0.8 \ - --kv-transfer-config \ - '{"kv_connector":"P2pConnector","kv_role":"kv_consumer","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20003"}}' & - -# decoding instance, which is the KV consumer -CUDA_VISIBLE_DEVICES=6,7 vllm serve $MODEL_NAME \ - --host 0.0.0.0 \ - --port 20004 \ - --tensor-parallel-size 2 \ - --served-model-name base_model \ - --max-model-len 8192 \ - 
--gpu-memory-utilization 0.8 \ - --kv-transfer-config \ - '{"kv_connector":"P2pConnector","kv_role":"kv_consumer","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"0.0.0.0","proxy_port":"30001","http_port":"20004"}}' & \ No newline at end of file From ed2fbb67db6c92a4410ddaec2418d58bd7231d1b Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 18 Apr 2025 10:44:10 +0800 Subject: [PATCH 037/155] V1 Signed-off-by: Abatom --- .../{ => v1}/p2p_nccl_connector.py | 216 +++++++++++------- 1 file changed, 128 insertions(+), 88 deletions(-) rename vllm/distributed/kv_transfer/kv_connector/{ => v1}/p2p_nccl_connector.py (53%) diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py similarity index 53% rename from vllm/distributed/kv_transfer/kv_connector/p2p_nccl_connector.py rename to vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index a66580a86080..7d4ef634c859 100644 --- a/vllm/distributed/kv_transfer/kv_connector/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -11,13 +11,12 @@ KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.distributed.kv_transfer.kv_pipe.p2p_nccl_pipe import P2pNcclPipe from vllm.logger import init_logger +from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext - from vllm.v1.core.kv_cache_manager import KVCacheManager - from vllm.v1.core.kv_cache_utils import KVCacheBlock from vllm.v1.request import Request logger = init_logger(__name__) @@ -34,24 +33,20 @@ class ReqMeta: # Is store or load is_store: bool - ## Blocks allocated by the scheduler (no-longer needed) - # block_ids: torch.Tensor - @staticmethod - def from_request(request: "Request", block_size: int, - is_store: bool) -> "ReqMeta": - valid_num_tokens = align_to_block_size(len(request.prompt_token_ids), - block_size) - token_ids = torch.tensor(request.prompt_token_ids)[:valid_num_tokens] - block_ids = torch.tensor(request.block_ids) - num_blocks = block_ids.shape[0] + def make_meta(request_id: srt, token_ids: list[int], block_ids: list[int], + block_size: int, is_store: bool) -> "ReqMeta": + valid_num_tokens = align_to_block_size(len(token_ids), block_size) + token_ids_tensor = torch.tensor(token_ids)[:valid_num_tokens] + block_ids_tensor = torch.tensor(block_ids) + num_blocks = block_ids_tensor.shape[0] block_offsets = torch.arange(0, block_size) slot_mapping = block_offsets.reshape((1, block_size)) + \ - block_ids.reshape((num_blocks, 1)) * block_size + block_ids_tensor.reshape((num_blocks, 1)) * block_size slot_mapping = slot_mapping.flatten()[:valid_num_tokens] return ReqMeta( - request_id=request.request_id, - token_ids=token_ids, + request_id=request_id, + token_ids=token_ids_tensor, slot_mapping=slot_mapping, is_store=is_store, ) @@ -65,28 +60,25 @@ def __init__(self): self.requests = [] def add_request( - self, - request: "Request", - block_size: int, - is_store: bool, + self, + request_id: srt, + token_ids: list[int], + block_ids: list[int], + block_size: int, + is_store: bool, ) -> None: self.requests.append( - ReqMeta.from_request(request, block_size, is_store)) + ReqMeta.make_meta(request_id, token_ids, block_ids, block_size, is_store)) class P2pNcclConnector(KVConnectorBase_V1): - def __init__(self, rank: Optional[int], local_rank: 
Optional[int], - config: "VllmConfig", role: KVConnectorRole): - super().__init__( - rank=rank, - local_rank=local_rank, - config=config, - role=role, - ) - self._block_size = config.cache_config.block_size - self._requests_need_load: list[str] = [] - logger.info(config.kv_transfer_config) + def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole, + rank: Optional[int], local_rank: Optional[int]): + super().__init__(vllm_config=vllm_config, role=role) + self._block_size = vllm_config.cache_config.block_size + self._requests_need_load: dict[str, Request] = {} + self.config = vllm_config.kv_transfer_config self.p2p_nccl_pipe = P2pNcclPipe( local_rank=local_rank, # type: ignore @@ -108,34 +100,45 @@ def start_load_kv(self, forward_context: "ForwardContext", The number of elements in kv_caches and layer_names should be the same. """ + attn_metadata = forward_context.attn_metadata def inject_kv_into_layer( - dst_kv_cache_layer: torch.Tensor, - src_kv_cache: torch.Tensor, - slot_mapping: torch.Tensor, + dst_kv_cache_layer: torch.Tensor, + src_kv_cache: torch.Tensor, + slot_mapping: torch.Tensor, ) -> None: """Inject the KV cache into the layer. Args: dst_kv_cache_layer (torch.Tensor): the destination KV cache - layer. In shape [2, num_pages, page_size, xxx]. + layer. In shape [2, num_pages, page_size, xxx] if not + using MLA, [num_pages, page_size, xxx] otherwise. src_kv_cache (torch.Tensor): the source KV cache. In shape - [2, num_tokens, xxx]. + [2, num_tokens, xxx] if not using MLA, [num_tokens, xxx] + otherwise. slot_mapping (torch.Tensor): the slot mapping. In shape [num_tokens]. """ dst_kv_cache_layer_shape = dst_kv_cache_layer.shape - num_pages = dst_kv_cache_layer_shape[1] - page_size = dst_kv_cache_layer_shape[2] - dst_kv_cache_layer = dst_kv_cache_layer.reshape( - 2, num_pages * page_size, -1) - dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache - dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) + if isinstance(attn_metadata, MLACommonMetadata): + num_pages = dst_kv_cache_layer_shape[0] + page_size = dst_kv_cache_layer_shape[1] + dst_kv_cache_layer = dst_kv_cache_layer.reshape( + num_pages * page_size, -1) + dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache + dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) + else: + num_pages = dst_kv_cache_layer_shape[1] + page_size = dst_kv_cache_layer_shape[2] + dst_kv_cache_layer = dst_kv_cache_layer.reshape( + 2, num_pages * page_size, -1) + dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache + dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) # Get the metadata metadata: KVConnectorMetadata = \ self._get_connector_metadata() - assert isinstance(metadata, P2pNcclConnectorMetadata) + assert isinstance(metadata, SharedStorageConnectorMetadata) if metadata is None: logger.warning( @@ -179,7 +182,7 @@ def wait_for_layer_load(self, layer_name: str) -> None: def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, attn_metadata: "AttentionMetadata", **kwargs) -> None: - """Start saving the a layer of KV cache from vLLM's paged buffer + """Start saving the KV cache of the layer from vLLM's paged buffer to the connector. Args: @@ -191,19 +194,24 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, """ def extract_kv_from_layer( - layer: torch.Tensor, - slot_mapping: torch.Tensor, + layer: torch.Tensor, + slot_mapping: torch.Tensor, ) -> torch.Tensor: """Extract the KV cache from the layer. - Assume the shape of the layer is (2, num_pages, page_size, xxx). 
+ Assume the shape of the layer is (2, num_pages, page_size, xxx) + if MLA is not used, and (num_pages, page_size, xxx) otherwise. """ + if isinstance(attn_metadata, MLACommonMetadata): + num_pages, page_size = layer.shape[0], layer.shape[1] + return layer.reshape(num_pages * page_size, -1)[slot_mapping, + ...] num_pages, page_size = layer.shape[1], layer.shape[2] return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, - ...] + ...] connector_metadata = self._get_connector_metadata() - assert isinstance(connector_metadata, P2pNcclConnectorMetadata) + assert isinstance(connector_metadata, SharedStorageConnectorMetadata) for request in connector_metadata.requests: if request.is_store: request_id = request.request_id @@ -217,61 +225,93 @@ def extract_kv_from_layer( def wait_for_save(self): return - def get_external_prefix_cache_blocks( - self, - request: "Request", - computed_blocks: list["KVCacheBlock"], - num_computed_tokens: int, - kv_cache_manager: "KVCacheManager", - ) -> list["KVCacheBlock"]: + def get_num_new_matched_tokens( + self, + request: "Request", + num_computed_tokens: int, + ) -> int: """ - Get the external prefix cache blocks from the connector. - - This function may change the state of the connector, which will - be used by `attach_connector_meta` later. - - This function will also allocate/free the blocks dynamically when - there is remote cache hit. + Get number of new tokens that can be loaded from the + external KV cache beyond the num_computed_tokens. Args: request (Request): the request object. - computed_blocks (list[KVCacheBlock]): the 'local' computed blocks. - num_computed_tokens (int): the number of 'local' computed tokens. - kv_cache_manager (KVCacheManager): the KV cache manager to - allocate/free the blocks if needed. + num_computed_tokens (int): the number of locally + computed tokens for this request Returns: - The updated list of the computed blocks (appended with the remote - cached blocks) + the number of tokens that can be loaded from the + external KV cache beyond what is already computed. """ - return computed_blocks - def attach_connector_meta( - self, scheduler_output: SchedulerOutput) -> SchedulerOutput: - """Attach the connector metadata to the request object. + return 0 - This function should NOT modify other fields in the scheduler_output - except the `kv_connector_metadata` field. + def update_state_after_alloc(self, request: "Request", + num_external_tokens: int): + """ + Update KVConnector state after block allocation. + + If blocks were allocated, add to _requests_need_load, + such that we load the KVs in the next forward pass. + """ + if num_external_tokens > 0: + self._requests_need_load[request.request_id] = request + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput, + ) -> KVConnectorMetadata: + """Build the connector metadata for this step. + + This function should NOT modify any fields in the scheduler_output. Also, calling this function will reset the state of the connector. Args: scheduler_output (SchedulerOutput): the scheduler output object. """ - meta = P2pNcclConnectorMetadata() - for request in scheduler_output.scheduled_new_reqs: - # T^T, why there is both req_id and request_id???? 
- if request.req_id in self._requests_need_load: - meta.add_request(request, self._block_size, is_store=False) + meta = SharedStorageConnectorMetadata() + + total_need_load = 0 + for new_req in scheduler_output.scheduled_new_reqs: + if new_req.req_id in self._requests_need_load: + meta.add_request(token_ids=new_req.prompt_token_ids, + block_ids=new_req.block_ids, + block_size=self._block_size, + is_store=False) + total_need_load += 1 else: - # NOTE: here, we set the store and load being exclusive, - # but in LMCache use case, a single request can have both - # store and load status - if not self.found_match_for_request(request): - meta.add_request(request, self._block_size, is_store=True) - scheduler_output.kv_connector_metadata = meta - + meta.add_request(token_ids=new_req.prompt_token_ids, + block_ids=new_req.block_ids, + block_size=self._block_size, + is_store=True) + + for cached_req in scheduler_output.scheduled_cached_reqs: + # NOTE(rob): here we rely on the resumed requests being + # the first N requests in the list scheduled_cache_reqs. + if not cached_req.resumed_from_preemption: + break + if cached_req.req_id in self._requests_need_load: + # NOTE(rob): cached_req_data does not have the full + # list of token ids (only new tokens). So we look it + # up in the actual request object. + request = self._requests_need_load[cached_req.req_id] + total_tokens = (len(cached_req.new_token_ids) + + cached_req.num_computed_tokens) + token_ids = request.all_token_ids[:total_tokens] + + # NOTE(rob): For resumed req, new_block_ids is all + # of the block_ids for the request. + block_ids = cached_req.new_block_ids + + meta.add_request(token_ids=token_ids, + block_ids=block_ids, + block_size=self._block_size, + is_store=False) + total_need_load += 1 + + assert total_need_load == len(self._requests_need_load) self._requests_need_load.clear() - return scheduler_output + return meta @staticmethod def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: From 616ce481d7b2a22a3bff47bbbb36dff0ec203ae5 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 18 Apr 2025 11:01:30 +0800 Subject: [PATCH 038/155] bugfix and format Signed-off-by: Abatom --- .../kv_connector/v1/p2p_nccl_connector.py | 48 ++++++++++--------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index 7d4ef634c859..3f457e816a81 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -34,7 +34,7 @@ class ReqMeta: is_store: bool @staticmethod - def make_meta(request_id: srt, token_ids: list[int], block_ids: list[int], + def make_meta(request_id: str, token_ids: list[int], block_ids: list[int], block_size: int, is_store: bool) -> "ReqMeta": valid_num_tokens = align_to_block_size(len(token_ids), block_size) token_ids_tensor = torch.tensor(token_ids)[:valid_num_tokens] @@ -60,15 +60,16 @@ def __init__(self): self.requests = [] def add_request( - self, - request_id: srt, - token_ids: list[int], - block_ids: list[int], - block_size: int, - is_store: bool, + self, + request_id: str, + token_ids: list[int], + block_ids: list[int], + block_size: int, + is_store: bool, ) -> None: self.requests.append( - ReqMeta.make_meta(request_id, token_ids, block_ids, block_size, is_store)) + ReqMeta.make_meta(request_id, token_ids, block_ids, block_size, + is_store)) class 
P2pNcclConnector(KVConnectorBase_V1): @@ -79,6 +80,7 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole, self._block_size = vllm_config.cache_config.block_size self._requests_need_load: dict[str, Request] = {} self.config = vllm_config.kv_transfer_config + self.rank = rank self.p2p_nccl_pipe = P2pNcclPipe( local_rank=local_rank, # type: ignore @@ -103,9 +105,9 @@ def start_load_kv(self, forward_context: "ForwardContext", attn_metadata = forward_context.attn_metadata def inject_kv_into_layer( - dst_kv_cache_layer: torch.Tensor, - src_kv_cache: torch.Tensor, - slot_mapping: torch.Tensor, + dst_kv_cache_layer: torch.Tensor, + src_kv_cache: torch.Tensor, + slot_mapping: torch.Tensor, ) -> None: """Inject the KV cache into the layer. @@ -138,7 +140,7 @@ def inject_kv_into_layer( # Get the metadata metadata: KVConnectorMetadata = \ self._get_connector_metadata() - assert isinstance(metadata, SharedStorageConnectorMetadata) + assert isinstance(metadata, P2pNcclConnectorMetadata) if metadata is None: logger.warning( @@ -194,8 +196,8 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, """ def extract_kv_from_layer( - layer: torch.Tensor, - slot_mapping: torch.Tensor, + layer: torch.Tensor, + slot_mapping: torch.Tensor, ) -> torch.Tensor: """Extract the KV cache from the layer. @@ -205,13 +207,13 @@ def extract_kv_from_layer( if isinstance(attn_metadata, MLACommonMetadata): num_pages, page_size = layer.shape[0], layer.shape[1] return layer.reshape(num_pages * page_size, -1)[slot_mapping, - ...] + ...] num_pages, page_size = layer.shape[1], layer.shape[2] return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, - ...] + ...] connector_metadata = self._get_connector_metadata() - assert isinstance(connector_metadata, SharedStorageConnectorMetadata) + assert isinstance(connector_metadata, P2pNcclConnectorMetadata) for request in connector_metadata.requests: if request.is_store: request_id = request.request_id @@ -226,9 +228,9 @@ def wait_for_save(self): return def get_num_new_matched_tokens( - self, - request: "Request", - num_computed_tokens: int, + self, + request: "Request", + num_computed_tokens: int, ) -> int: """ Get number of new tokens that can be loaded from the @@ -258,8 +260,8 @@ def update_state_after_alloc(self, request: "Request", self._requests_need_load[request.request_id] = request def build_connector_meta( - self, - scheduler_output: SchedulerOutput, + self, + scheduler_output: SchedulerOutput, ) -> KVConnectorMetadata: """Build the connector metadata for this step. @@ -269,7 +271,7 @@ def build_connector_meta( Args: scheduler_output (SchedulerOutput): the scheduler output object. 
""" - meta = SharedStorageConnectorMetadata() + meta = P2pNcclConnectorMetadata() total_need_load = 0 for new_req in scheduler_output.scheduled_new_reqs: From 49336a27be2f54855a6839753e1778c5419e58f2 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 18 Apr 2025 11:14:27 +0800 Subject: [PATCH 039/155] bugfix Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p_nccl_connector.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index 3f457e816a81..c6b4164d999a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -276,13 +276,15 @@ def build_connector_meta( total_need_load = 0 for new_req in scheduler_output.scheduled_new_reqs: if new_req.req_id in self._requests_need_load: - meta.add_request(token_ids=new_req.prompt_token_ids, + meta.add_request(request_id=new_req.req_id, + token_ids=new_req.prompt_token_ids, block_ids=new_req.block_ids, block_size=self._block_size, is_store=False) total_need_load += 1 else: - meta.add_request(token_ids=new_req.prompt_token_ids, + meta.add_request(request_id=new_req.req_id, + token_ids=new_req.prompt_token_ids, block_ids=new_req.block_ids, block_size=self._block_size, is_store=True) @@ -305,7 +307,8 @@ def build_connector_meta( # of the block_ids for the request. block_ids = cached_req.new_block_ids - meta.add_request(token_ids=token_ids, + meta.add_request(request_id=cached_req.req_id, + token_ids=token_ids, block_ids=block_ids, block_size=self._block_size, is_store=False) From 3d8d7b6e7388218cbe0e431e842bee481ea60567 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 18 Apr 2025 11:30:04 +0800 Subject: [PATCH 040/155] bugfix Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p_nccl_connector.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index c6b4164d999a..e6bccd1de2f1 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -75,7 +75,7 @@ def add_request( class P2pNcclConnector(KVConnectorBase_V1): def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole, - rank: Optional[int], local_rank: Optional[int]): + rank: int, local_rank: int): super().__init__(vllm_config=vllm_config, role=role) self._block_size = vllm_config.cache_config.block_size self._requests_need_load: dict[str, Request] = {} @@ -83,10 +83,10 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole, self.rank = rank self.p2p_nccl_pipe = P2pNcclPipe( - local_rank=local_rank, # type: ignore + local_rank=local_rank, config=self.config, hostname="", - port_offset=rank, # type: ignore + port_offset=rank, ) def start_load_kv(self, forward_context: "ForwardContext", From 0b9a2ac6f07dd881e016ac84e0a181cb5d313812 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 18 Apr 2025 11:36:46 +0800 Subject: [PATCH 041/155] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p_nccl_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index e6bccd1de2f1..75c21f01f1c9 100644 --- 
a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -2,7 +2,7 @@ import re from dataclasses import dataclass -from typing import TYPE_CHECKING, Optional, Tuple +from typing import TYPE_CHECKING, Tuple import torch From e3f858f11ff217b8a2db51e14bd955bf9c82d3cb Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 19 Apr 2025 18:22:43 +0800 Subject: [PATCH 042/155] add rank and local_rank Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_connector/factory.py | 12 ++++++------ vllm/distributed/kv_transfer/kv_connector/v1/base.py | 8 +++++++- .../kv_connector/v1/p2p_nccl_connector.py | 9 ++++++--- .../kv_connector/v1/shared_storage_connector.py | 6 +++++- vllm/distributed/kv_transfer/kv_transfer_state.py | 5 ++++- 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 03a0f2998ee5..f90f4cca8633 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -49,11 +49,11 @@ def create_connector_v0(cls, rank: int, local_rank: int, return connector_cls(rank, local_rank, config) @classmethod - def create_connector_v1( - cls, - config: "VllmConfig", - role: KVConnectorRole, - ) -> KVConnectorBase_V1: + def create_connector_v1(cls, + config: "VllmConfig", + role: KVConnectorRole, + rank: int = 0, + local_rank: int = 0) -> KVConnectorBase_V1: if not envs.VLLM_USE_V1: raise ValueError("Attempting to initialize a V1 Connector, " f"but found {envs.VLLM_USE_V1=}") @@ -70,7 +70,7 @@ def create_connector_v1( # - Co-locate with worker process # - Should only be used inside the forward context & attention layer # We build separately to enforce strict separation - return connector_cls(config, role) + return connector_cls(config, role, rank, local_rank) # Register various connectors here. diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 95967d2ca919..1835a1bf1078 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -54,13 +54,19 @@ class KVConnectorMetadata: class KVConnectorBase_V1(ABC): - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): + def __init__(self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + rank: int = 0, + local_rank: int = 0): logger.warning( "Initializing KVConnectorBase_V1. 
This API is experimental and " "subject to change in the future as we iterate the design.") self._connector_metadata = KVConnectorMetadata() self._vllm_config = vllm_config self._role = role + self._rank = rank + self._local_rank = local_rank @property def role(self) -> KVConnectorRole: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index 75c21f01f1c9..ad92e248b6c0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -74,8 +74,11 @@ def add_request( class P2pNcclConnector(KVConnectorBase_V1): - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole, - rank: int, local_rank: int): + def __init__(self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + rank: int = 0, + local_rank: int = 0): super().__init__(vllm_config=vllm_config, role=role) self._block_size = vllm_config.cache_config.block_size self._requests_need_load: dict[str, Request] = {} @@ -218,7 +221,7 @@ def extract_kv_from_layer( if request.is_store: request_id = request.request_id ip, port = self.parse_request_id(request_id, True) - remote_address = ip + ":" + str(port + self.rank) + remote_address = ip + ":" + str(port + self._rank) kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) self.p2p_nccl_pipe.send_tensor(request_id + "-" + layer_name, diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 1d2040784e6c..f8b9c44708f0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -73,7 +73,11 @@ class SharedStorageConnector(KVConnectorBase_V1): # It does extra work which will overwrite the existing prefix-cache in GPU # - to remove the overhead, need to add some "mask" in the ReqMeta class - def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): + def __init__(self, + vllm_config: "VllmConfig", + role: KVConnectorRole, + rank: int = 0, + local_rank: int = 0): super().__init__(vllm_config=vllm_config, role=role) self._block_size = vllm_config.cache_config.block_size self._requests_need_load: dict[str, Request] = {} diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py index 25d2f2cf5c6e..96fb4bcf814b 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_state.py +++ b/vllm/distributed/kv_transfer/kv_transfer_state.py @@ -61,7 +61,10 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: and _KV_CONNECTOR_AGENT is None): if envs.VLLM_USE_V1: _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector_v1( - config=vllm_config, role=KVConnectorRole.WORKER) + config=vllm_config, + role=KVConnectorRole.WORKER, + rank=get_world_group().rank, + local_rank=get_world_group().local_rank) else: _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector_v0( rank=get_world_group().rank, From e8b8f3606b4150d2b33dab6ba47075b9400b8095 Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 22 Apr 2025 12:11:51 +0800 Subject: [PATCH 043/155] bugfix Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p_nccl_connector.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py 
b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index ad92e248b6c0..d1d56b4c65da 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -79,7 +79,8 @@ def __init__(self, role: KVConnectorRole, rank: int = 0, local_rank: int = 0): - super().__init__(vllm_config=vllm_config, role=role) + super().__init__(vllm_config=vllm_config, role=role, + rank=rank, local_rank=local_rank) self._block_size = vllm_config.cache_config.block_size self._requests_need_load: dict[str, Request] = {} self.config = vllm_config.kv_transfer_config @@ -90,7 +91,7 @@ def __init__(self, config=self.config, hostname="", port_offset=rank, - ) + ) if role == KVConnectorRole.WORKER else None def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: @@ -323,7 +324,7 @@ def build_connector_meta( @staticmethod def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: - logger.info("parse_request_id, request_id: %s, is_prefill: %s", + logger.debug("parse_request_id, request_id: %s, is_prefill: %s", request_id, is_prefill) # Regular expression to match the string hostname and integer port if is_prefill: @@ -338,7 +339,7 @@ def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: ip = match.group(1) port = int(match.group(2)) - logger.info("parse_request_id, request_id: %s, ip: %s, port: %s", + logger.debug("parse_request_id, request_id: %s, ip: %s, port: %s", request_id, ip, str(port)) return ip, port raise ValueError( From 8623e3cd3187c95fac0f49045f258ded86cfa196 Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 22 Apr 2025 21:04:37 +0800 Subject: [PATCH 044/155] runnable for V1 Signed-off-by: Abatom --- .../kv_connector/v1/p2p_nccl_connector.py | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index d1d56b4c65da..96d1c3b5e662 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -30,12 +30,10 @@ class ReqMeta: token_ids: torch.Tensor # Slot mappings, should have the same length as token_ids slot_mapping: torch.Tensor - # Is store or load - is_store: bool @staticmethod def make_meta(request_id: str, token_ids: list[int], block_ids: list[int], - block_size: int, is_store: bool) -> "ReqMeta": + block_size: int) -> "ReqMeta": valid_num_tokens = align_to_block_size(len(token_ids), block_size) token_ids_tensor = torch.tensor(token_ids)[:valid_num_tokens] block_ids_tensor = torch.tensor(block_ids) @@ -48,7 +46,6 @@ def make_meta(request_id: str, token_ids: list[int], block_ids: list[int], request_id=request_id, token_ids=token_ids_tensor, slot_mapping=slot_mapping, - is_store=is_store, ) @@ -65,11 +62,9 @@ def add_request( token_ids: list[int], block_ids: list[int], block_size: int, - is_store: bool, ) -> None: self.requests.append( - ReqMeta.make_meta(request_id, token_ids, block_ids, block_size, - is_store)) + ReqMeta.make_meta(request_id, token_ids, block_ids, block_size)) class P2pNcclConnector(KVConnectorBase_V1): @@ -85,6 +80,7 @@ def __init__(self, self._requests_need_load: dict[str, Request] = {} self.config = vllm_config.kv_transfer_config self.rank = rank + self.is_producer = self.config.is_kv_producer self.p2p_nccl_pipe = P2pNcclPipe( local_rank=local_rank, @@ -160,7 +156,7 @@ 
def inject_kv_into_layer( # Load the KV for each request each layer for request in metadata.requests: - if request.is_store: + if self.is_producer: continue logger.info("Inject KV cache of %d tokens to the paged memory", len(request.slot_mapping)) @@ -219,7 +215,7 @@ def extract_kv_from_layer( connector_metadata = self._get_connector_metadata() assert isinstance(connector_metadata, P2pNcclConnectorMetadata) for request in connector_metadata.requests: - if request.is_store: + if self.is_producer: request_id = request.request_id ip, port = self.parse_request_id(request_id, True) remote_address = ip + ":" + str(port + self._rank) @@ -260,7 +256,7 @@ def update_state_after_alloc(self, request: "Request", If blocks were allocated, add to _requests_need_load, such that we load the KVs in the next forward pass. """ - if num_external_tokens > 0: + if not self.is_producer: self._requests_need_load[request.request_id] = request def build_connector_meta( @@ -283,15 +279,14 @@ def build_connector_meta( meta.add_request(request_id=new_req.req_id, token_ids=new_req.prompt_token_ids, block_ids=new_req.block_ids, - block_size=self._block_size, - is_store=False) + block_size=self._block_size) total_need_load += 1 else: - meta.add_request(request_id=new_req.req_id, - token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids, - block_size=self._block_size, - is_store=True) + if self.is_producer: + meta.add_request(request_id=new_req.req_id, + token_ids=new_req.prompt_token_ids, + block_ids=new_req.block_ids, + block_size=self._block_size) for cached_req in scheduler_output.scheduled_cached_reqs: # NOTE(rob): here we rely on the resumed requests being @@ -314,8 +309,7 @@ def build_connector_meta( meta.add_request(request_id=cached_req.req_id, token_ids=token_ids, block_ids=block_ids, - block_size=self._block_size, - is_store=False) + block_size=self._block_size) total_need_load += 1 assert total_need_load == len(self._requests_need_load) From d11388a30ca28dab1cf853b4339dac7b70b69bde Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 22 Apr 2025 22:25:51 +0800 Subject: [PATCH 045/155] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p_nccl_connector.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index 96d1c3b5e662..2b913cf3980a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -74,8 +74,10 @@ def __init__(self, role: KVConnectorRole, rank: int = 0, local_rank: int = 0): - super().__init__(vllm_config=vllm_config, role=role, - rank=rank, local_rank=local_rank) + super().__init__(vllm_config=vllm_config, + role=role, + rank=rank, + local_rank=local_rank) self._block_size = vllm_config.cache_config.block_size self._requests_need_load: dict[str, Request] = {} self.config = vllm_config.kv_transfer_config @@ -245,7 +247,6 @@ def get_num_new_matched_tokens( the number of tokens that can be loaded from the external KV cache beyond what is already computed. 
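The slot_mapping built in ReqMeta.make_meta is what both inject_kv_into_layer and extract_kv_from_layer index with: each scheduled block contributes block_size consecutive slots starting at block_id * block_size. A small self-contained illustration of that mapping (the block size, block ids and token count here are made up for the example, not taken from the patch):

    import torch

    block_size = 4
    block_ids = torch.tensor([7, 2])    # pages assigned to the request
    num_tokens = 6                      # the last page is only partially used

    offsets = torch.arange(block_size)  # [0, 1, 2, 3]
    slot_mapping = (offsets.reshape(1, block_size)
                    + block_ids.reshape(-1, 1) * block_size).flatten()[:num_tokens]
    print(slot_mapping.tolist())        # [28, 29, 30, 31, 8, 9]

    # extract_kv_from_layer flattens the paged layer the same way, so the KV of
    # this request is layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping]
    # (or the single-component variant for MLA layouts).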
""" - return 0 def update_state_after_alloc(self, request: "Request", @@ -319,7 +320,7 @@ def build_connector_meta( @staticmethod def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: logger.debug("parse_request_id, request_id: %s, is_prefill: %s", - request_id, is_prefill) + request_id, is_prefill) # Regular expression to match the string hostname and integer port if is_prefill: pattern = r"___decode_addr_(.*):(\d+)" @@ -334,7 +335,7 @@ def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: port = int(match.group(2)) logger.debug("parse_request_id, request_id: %s, ip: %s, port: %s", - request_id, ip, str(port)) + request_id, ip, str(port)) return ip, port raise ValueError( f"Request id {request_id} does not contain hostname and port") From 17e9905900bc92983e118600e1f2f9221e15ea47 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 23 Apr 2025 11:27:34 +0800 Subject: [PATCH 046/155] rm valid_num_tokens Signed-off-by: Abatom --- .../kv_connector/v1/p2p_nccl_connector.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index 2b913cf3980a..c6530c2f9a3a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -34,14 +34,13 @@ class ReqMeta: @staticmethod def make_meta(request_id: str, token_ids: list[int], block_ids: list[int], block_size: int) -> "ReqMeta": - valid_num_tokens = align_to_block_size(len(token_ids), block_size) - token_ids_tensor = torch.tensor(token_ids)[:valid_num_tokens] + token_ids_tensor = torch.tensor(token_ids) block_ids_tensor = torch.tensor(block_ids) num_blocks = block_ids_tensor.shape[0] block_offsets = torch.arange(0, block_size) slot_mapping = block_offsets.reshape((1, block_size)) + \ block_ids_tensor.reshape((num_blocks, 1)) * block_size - slot_mapping = slot_mapping.flatten()[:valid_num_tokens] + slot_mapping = slot_mapping.flatten() return ReqMeta( request_id=request_id, token_ids=token_ids_tensor, @@ -160,8 +159,6 @@ def inject_kv_into_layer( for request in metadata.requests: if self.is_producer: continue - logger.info("Inject KV cache of %d tokens to the paged memory", - len(request.slot_mapping)) for layer_name in forward_context.no_compile_layers: attn_layer = forward_context.no_compile_layers[layer_name] kv_cache_layer = attn_layer.kv_cache[ \ @@ -173,6 +170,9 @@ def inject_kv_into_layer( inject_kv_into_layer(kv_cache_layer, kv_cache, request.slot_mapping) + logger.info("Inject KV cache of %d tokens to the paged memory, %s", + len(request.slot_mapping), request.request_id) + def wait_for_layer_load(self, layer_name: str) -> None: """Blocking until the KV for a specific layer is loaded into vLLM's paged buffer. @@ -339,9 +339,3 @@ def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: return ip, port raise ValueError( f"Request id {request_id} does not contain hostname and port") - - -def align_to_block_size(num_tokens: int, block_size) -> int: - """Align the number of tokens to the block size. 
- """ - return (num_tokens - 1) // block_size * block_size From e13094b6bd1ca7017f5224c4cfc1bc12decf0c5c Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 23 Apr 2025 18:14:34 +0800 Subject: [PATCH 047/155] wait_for_save Signed-off-by: Abatom --- .../kv_connector/v1/p2p_nccl_connector.py | 3 ++- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index c6530c2f9a3a..17fb08965504 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -227,7 +227,8 @@ def extract_kv_from_layer( kv_cache, remote_address) def wait_for_save(self): - return + if self.is_producer: + self.p2p_nccl_pipe.wait_for_sent() def get_num_new_matched_tokens( self, diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 388824c0c8ca..de86fe885e85 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -283,7 +283,7 @@ def _listen_for_requests(self): [remote_address, b"0"]) comm, rank = self.comms[remote_address.decode()] self._recv(comm, tensor, rank ^ 1) - logger.debug( + logger.info( "๐Ÿ”ต[PUT]Recv Tensor, %s๐Ÿ‘ˆ%s, MyRank:%s, " "data:%s, shape:%s", self.zmq_address, remote_address.decode(), rank, data, @@ -341,8 +341,21 @@ def _send_async(self): while not self.send_queue: self.send_queue_cv.wait() tensor_id, remote_address, tensor = self.send_queue.popleft() + if not self.send_queue: + self.send_queue_cv.notify() self._send_sync(tensor_id, tensor, remote_address) + def wait_for_sent(self): + if self.send_type == "PUT_ASYNC": + start_time = time.time() + with self.send_queue_cv: + while self.send_queue: + self.send_queue_cv.wait() + duration = time.time() - start_time + logger.info( + "๐Ÿšง[PUT_ASYNC]It took %.3fms to wait for the send_queue" + " to be empty, rank:%d", duration * 1000, self.rank) + def _send_sync( self, tensor_id: str, From d96ecc36c66a09e7d70e7c729bfee9647b6c2962 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 23 Apr 2025 20:54:31 +0800 Subject: [PATCH 048/155] inject_kv_into_layer Signed-off-by: Abatom --- .../kv_connector/v1/p2p_nccl_connector.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index 17fb08965504..6879195b1996 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -122,21 +122,17 @@ def inject_kv_into_layer( slot_mapping (torch.Tensor): the slot mapping. In shape [num_tokens]. """ - dst_kv_cache_layer_shape = dst_kv_cache_layer.shape + dst_shape = dst_kv_cache_layer.shape if isinstance(attn_metadata, MLACommonMetadata): - num_pages = dst_kv_cache_layer_shape[0] - page_size = dst_kv_cache_layer_shape[1] dst_kv_cache_layer = dst_kv_cache_layer.reshape( - num_pages * page_size, -1) - dst_kv_cache_layer[slot_mapping, ...] 
= src_kv_cache - dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) + -1, dst_shape[-1]) + dst_kv_cache_layer[slot_mapping] = src_kv_cache + dst_kv_cache_layer = dst_kv_cache_layer.reshape(dst_shape) else: - num_pages = dst_kv_cache_layer_shape[1] - page_size = dst_kv_cache_layer_shape[2] dst_kv_cache_layer = dst_kv_cache_layer.reshape( - 2, num_pages * page_size, -1) - dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache - dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) + 2, -1, dst_shape[-1]) + dst_kv_cache_layer[:, slot_mapping] = src_kv_cache + dst_kv_cache_layer = dst_kv_cache_layer.reshape(dst_shape) # Get the metadata metadata: KVConnectorMetadata = \ From eaaf50cdc2336e8cc275889c7911eb0446d6e148 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 24 Apr 2025 11:15:11 +0800 Subject: [PATCH 049/155] get_num_new_matched_tokens Signed-off-by: Abatom --- .../kv_connector/v1/p2p_nccl_connector.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index 6879195b1996..b4f605f843cc 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -244,7 +244,17 @@ def get_num_new_matched_tokens( the number of tokens that can be loaded from the external KV cache beyond what is already computed. """ - return 0 + if self.is_producer: + return 0 + + num_external_tokens = ( + len(request.prompt_token_ids) - 1 - num_computed_tokens) + + logger.info("๐Ÿ’num_external_tokens:%d, num_prompt_tokens:%d, " + "num_computed_tokens:%d", num_external_tokens, + len(request.prompt_token_ids), num_computed_tokens) + + return num_external_tokens def update_state_after_alloc(self, request: "Request", num_external_tokens: int): @@ -254,7 +264,7 @@ def update_state_after_alloc(self, request: "Request", If blocks were allocated, add to _requests_need_load, such that we load the KVs in the next forward pass. 
""" - if not self.is_producer: + if not self.is_producer and num_external_tokens > 0: self._requests_need_load[request.request_id] = request def build_connector_meta( From 6a2af6c49e7d6f34f4f3217abf8157c00dee3d62 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 24 Apr 2025 16:11:35 +0800 Subject: [PATCH 050/155] make_meta Signed-off-by: Abatom --- .../kv_connector/v1/p2p_nccl_connector.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index b4f605f843cc..2cb99b7243eb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -34,13 +34,14 @@ class ReqMeta: @staticmethod def make_meta(request_id: str, token_ids: list[int], block_ids: list[int], block_size: int) -> "ReqMeta": + valid_num_tokens = len(token_ids) token_ids_tensor = torch.tensor(token_ids) block_ids_tensor = torch.tensor(block_ids) num_blocks = block_ids_tensor.shape[0] block_offsets = torch.arange(0, block_size) slot_mapping = block_offsets.reshape((1, block_size)) + \ - block_ids_tensor.reshape((num_blocks, 1)) * block_size - slot_mapping = slot_mapping.flatten() + block_ids_tensor.reshape((num_blocks, 1)) * block_size + slot_mapping = slot_mapping.flatten()[:valid_num_tokens] return ReqMeta( request_id=request_id, token_ids=token_ids_tensor, @@ -122,17 +123,21 @@ def inject_kv_into_layer( slot_mapping (torch.Tensor): the slot mapping. In shape [num_tokens]. """ - dst_shape = dst_kv_cache_layer.shape + dst_kv_cache_layer_shape = dst_kv_cache_layer.shape if isinstance(attn_metadata, MLACommonMetadata): + num_pages = dst_kv_cache_layer_shape[0] + page_size = dst_kv_cache_layer_shape[1] dst_kv_cache_layer = dst_kv_cache_layer.reshape( - -1, dst_shape[-1]) - dst_kv_cache_layer[slot_mapping] = src_kv_cache - dst_kv_cache_layer = dst_kv_cache_layer.reshape(dst_shape) + num_pages * page_size, -1) + dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache + dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) else: + num_pages = dst_kv_cache_layer_shape[1] + page_size = dst_kv_cache_layer_shape[2] dst_kv_cache_layer = dst_kv_cache_layer.reshape( - 2, -1, dst_shape[-1]) - dst_kv_cache_layer[:, slot_mapping] = src_kv_cache - dst_kv_cache_layer = dst_kv_cache_layer.reshape(dst_shape) + 2, num_pages * page_size, -1) + dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache + dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) # Get the metadata metadata: KVConnectorMetadata = \ From 7d0f562c73664320c03b5bd4c266e9253c479486 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 24 Apr 2025 16:50:04 +0800 Subject: [PATCH 051/155] format Signed-off-by: Abatom --- .../kv_connector/v1/p2p_nccl_connector.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index 2cb99b7243eb..641b0b0aba4d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -104,6 +104,8 @@ def start_load_kv(self, forward_context: "ForwardContext", The number of elements in kv_caches and layer_names should be the same. 
""" + assert self.p2p_nccl_pipe is not None + attn_metadata = forward_context.attn_metadata def inject_kv_into_layer( @@ -197,6 +199,7 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, attn_metadata (AttentionMetadata): the attention metadata. **kwargs: additional arguments for the save operation. """ + assert self.p2p_nccl_pipe is not None def extract_kv_from_layer( layer: torch.Tensor, @@ -229,6 +232,7 @@ def extract_kv_from_layer( def wait_for_save(self): if self.is_producer: + assert self.p2p_nccl_pipe is not None self.p2p_nccl_pipe.wait_for_sent() def get_num_new_matched_tokens( @@ -252,12 +256,12 @@ def get_num_new_matched_tokens( if self.is_producer: return 0 - num_external_tokens = ( - len(request.prompt_token_ids) - 1 - num_computed_tokens) - - logger.info("๐Ÿ’num_external_tokens:%d, num_prompt_tokens:%d, " - "num_computed_tokens:%d", num_external_tokens, - len(request.prompt_token_ids), num_computed_tokens) + num_external_tokens = (len(request.prompt_token_ids) - 1 - + num_computed_tokens) + logger.info( + "๐Ÿ’num_external_tokens:%d, num_prompt_tokens:%d, " + "num_computed_tokens:%d", num_external_tokens, + len(request.prompt_token_ids), num_computed_tokens) return num_external_tokens From ca9724a1afc12bcd957f481a637ee776b3e3b3dc Mon Sep 17 00:00:00 2001 From: Abatom Date: Sun, 27 Apr 2025 10:39:25 +0800 Subject: [PATCH 052/155] add send_stream and recv_stream Signed-off-by: Abatom --- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index de86fe885e85..7495804b0c9f 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -68,7 +68,9 @@ def __init__(self, self.send_store_cv = threading.Condition() self.send_queue_cv = threading.Condition() self.recv_store_cv = threading.Condition() - self.comm_cv = threading.Condition() + + self.send_stream = torch.cuda.Stream() + self.recv_stream = torch.cuda.Stream() # The sending type includes tree mutually exclusive options: # PUT, GET, PUT_ASYNC. 
@@ -230,7 +232,7 @@ def recv_tensor( device=self.device) start_time = time.time() - self._recv(comm, tensor, rank ^ 1) + self._recv(comm, tensor, rank ^ 1, self.recv_stream) duration = time.time() - start_time logger.info( "๐Ÿ”ต[GET]Recv From %s, tensor_id:%s, shape:%s, duration:%.3fms, " @@ -282,7 +284,8 @@ def _listen_for_requests(self): self.router_socket.send_multipart( [remote_address, b"0"]) comm, rank = self.comms[remote_address.decode()] - self._recv(comm, tensor, rank ^ 1) + self._recv(comm, tensor, rank ^ 1, + self.recv_stream) logger.info( "๐Ÿ”ต[PUT]Recv Tensor, %s๐Ÿ‘ˆ%s, MyRank:%s, " "data:%s, shape:%s", self.zmq_address, @@ -322,7 +325,8 @@ def _listen_for_requests(self): [remote_address, msgpack.dumps(data)]) if data["ret"] == 0: - self._send(comm, tensor.to(self.device), rank ^ 1) + self._send(comm, tensor.to(self.device), rank ^ 1, + self.send_stream) logger.info( "๐Ÿ”ต[GET]Send Tensor, %s๐Ÿ‘‰%s, " @@ -390,7 +394,7 @@ def _send_sync( response.decode()) return False - self._send(comm, tensor.to(self.device), rank ^ 1) + self._send(comm, tensor.to(self.device), rank ^ 1, self.send_stream) logger.info("๐Ÿ”ตSend Tensor, %s๐Ÿ‘‰%s, MyRank:%s, data:%s, tensor:%s", self.zmq_address, remote_address, rank, data, tensor.shape) return True @@ -416,7 +420,7 @@ def _send(self, comm, tensor: torch.Tensor, dst: int, stream=None): if stream is None: stream = current_stream() - with self.comm_cv: + with torch.cuda.stream(stream): self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), dst, comm, cudaStream_t(stream.cuda_stream)) @@ -428,7 +432,7 @@ def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): if stream is None: stream = current_stream() - with self.comm_cv: + with torch.cuda.stream(stream): self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, comm, cudaStream_t(stream.cuda_stream)) From 8d413596227e2cc3b2464b7754a198b7c9b6e16d Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 28 Apr 2025 10:14:53 +0800 Subject: [PATCH 053/155] Each NCCL connects to a stream. 
Signed-off-by: Abatom --- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 7495804b0c9f..a72bb40e5cf9 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -87,10 +87,12 @@ def __init__(self, daemon=True) self._send_thread.start() - self.recv_store: Dict[str, - torch.Tensor] = {} # tensor_id: torch.Tensor - self.socks: Dict[str, Any] = {} # remote_address: client socket - self.comms: Dict[str, Any] = {} # remote_address: (ncclComm_t, rank) + # tensor_id: torch.Tensor + self.recv_store: Dict[str, torch.Tensor] = {} + # remote_address: client socket + self.socks: Dict[str, Any] = {} + # remote_address: (ncclComm_t, rank, stream) + self.comms: Dict[str, Any] = {} self.buffer_size = 0 self.buffer_size_threshold = self.config.kv_buffer_size @@ -125,7 +127,8 @@ def _create_connect(self, remote_address: typing.Optional[str] = None): rank = 0 comm: ncclComm_t = self.nccl.ncclCommInitRank( 2, unique_id, rank) - self.comms[remote_address] = (comm, rank) + stream = torch.cuda.Stream() + self.comms[remote_address] = (comm, rank, stream) logger.info("๐ŸคncclCommInitRank Success, %s๐Ÿ‘‰%s, MyRank: %s", self.zmq_address, remote_address, rank) @@ -215,7 +218,7 @@ def recv_tensor( self._create_connect(remote_address) sock = self.socks[remote_address] - comm, rank = self.comms[remote_address] + comm, rank, stream = self.comms[remote_address] data = {"cmd": "GET", "tensor_id": tensor_id} sock.send(msgpack.dumps(data)) @@ -232,7 +235,7 @@ def recv_tensor( device=self.device) start_time = time.time() - self._recv(comm, tensor, rank ^ 1, self.recv_stream) + self._recv(comm, tensor, rank ^ 1, stream) duration = time.time() - start_time logger.info( "๐Ÿ”ต[GET]Recv From %s, tensor_id:%s, shape:%s, duration:%.3fms, " @@ -257,7 +260,8 @@ def _listen_for_requests(self): rank = 1 comm: ncclComm_t = self.nccl.ncclCommInitRank( 2, unique_id, rank) - self.comms[remote_address.decode()] = (comm, rank) + stream = torch.cuda.Stream() + self.comms[remote_address.decode()] = (comm, rank, stream) logger.info( "๐ŸคncclCommInitRank Success, %s๐Ÿ‘ˆ%s, MyRank:%s", self.zmq_address, remote_address.decode(), rank) @@ -283,9 +287,8 @@ def _listen_for_requests(self): self.buffer_size += tensor_size self.router_socket.send_multipart( [remote_address, b"0"]) - comm, rank = self.comms[remote_address.decode()] - self._recv(comm, tensor, rank ^ 1, - self.recv_stream) + comm, rank, stream = self.comms[remote_address.decode()] + self._recv(comm, tensor, rank ^ 1, stream) logger.info( "๐Ÿ”ต[PUT]Recv Tensor, %s๐Ÿ‘ˆ%s, MyRank:%s, " "data:%s, shape:%s", self.zmq_address, @@ -337,8 +340,6 @@ def _listen_for_requests(self): "๐ŸšงUnexpected, Received message from %s, data:%s", remote_address, data) - # Asynchronous sending may cause conflicts between P2P NCCL and - # NCCL used in TP/PP, which can lead to deadlock issues. 
def _send_async(self): while True: with self.send_queue_cv: @@ -372,7 +373,7 @@ def _send_sync( self._create_connect(remote_address) sock = self.socks[remote_address] - comm, rank = self.comms[remote_address] + comm, rank, stream = self.comms[remote_address] data = { "cmd": "PUT", "tensor_id": tensor_id, @@ -394,7 +395,7 @@ def _send_sync( response.decode()) return False - self._send(comm, tensor.to(self.device), rank ^ 1, self.send_stream) + self._send(comm, tensor.to(self.device), rank ^ 1, stream) logger.info("๐Ÿ”ตSend Tensor, %s๐Ÿ‘‰%s, MyRank:%s, data:%s, tensor:%s", self.zmq_address, remote_address, rank, data, tensor.shape) return True From 1ad957950ffc1552af5abda78c03d88ddb67945b Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 28 Apr 2025 11:52:36 +0800 Subject: [PATCH 054/155] bugfix for GET and revert Each NCCL connects to a stream. Signed-off-by: Abatom --- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index a72bb40e5cf9..f9e4503e0d6f 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -87,12 +87,10 @@ def __init__(self, daemon=True) self._send_thread.start() - # tensor_id: torch.Tensor - self.recv_store: Dict[str, torch.Tensor] = {} - # remote_address: client socket - self.socks: Dict[str, Any] = {} - # remote_address: (ncclComm_t, rank, stream) - self.comms: Dict[str, Any] = {} + self.recv_store: Dict[str, + torch.Tensor] = {} # tensor_id: torch.Tensor + self.socks: Dict[str, Any] = {} # remote_address: client socket + self.comms: Dict[str, Any] = {} # remote_address: (ncclComm_t, rank) self.buffer_size = 0 self.buffer_size_threshold = self.config.kv_buffer_size @@ -127,8 +125,7 @@ def _create_connect(self, remote_address: typing.Optional[str] = None): rank = 0 comm: ncclComm_t = self.nccl.ncclCommInitRank( 2, unique_id, rank) - stream = torch.cuda.Stream() - self.comms[remote_address] = (comm, rank, stream) + self.comms[remote_address] = (comm, rank) logger.info("๐ŸคncclCommInitRank Success, %s๐Ÿ‘‰%s, MyRank: %s", self.zmq_address, remote_address, rank) @@ -218,7 +215,7 @@ def recv_tensor( self._create_connect(remote_address) sock = self.socks[remote_address] - comm, rank, stream = self.comms[remote_address] + comm, rank = self.comms[remote_address] data = {"cmd": "GET", "tensor_id": tensor_id} sock.send(msgpack.dumps(data)) @@ -235,7 +232,7 @@ def recv_tensor( device=self.device) start_time = time.time() - self._recv(comm, tensor, rank ^ 1, stream) + self._recv(comm, tensor, rank ^ 1, self.recv_stream) duration = time.time() - start_time logger.info( "๐Ÿ”ต[GET]Recv From %s, tensor_id:%s, shape:%s, duration:%.3fms, " @@ -260,8 +257,7 @@ def _listen_for_requests(self): rank = 1 comm: ncclComm_t = self.nccl.ncclCommInitRank( 2, unique_id, rank) - stream = torch.cuda.Stream() - self.comms[remote_address.decode()] = (comm, rank, stream) + self.comms[remote_address.decode()] = (comm, rank) logger.info( "๐ŸคncclCommInitRank Success, %s๐Ÿ‘ˆ%s, MyRank:%s", self.zmq_address, remote_address.decode(), rank) @@ -287,8 +283,9 @@ def _listen_for_requests(self): self.buffer_size += tensor_size self.router_socket.send_multipart( [remote_address, b"0"]) - comm, rank, stream = self.comms[remote_address.decode()] - self._recv(comm, tensor, rank ^ 1, stream) + comm, rank = self.comms[remote_address.decode()] + self._recv(comm, 
tensor, rank ^ 1, + self.recv_stream) logger.info( "๐Ÿ”ต[PUT]Recv Tensor, %s๐Ÿ‘ˆ%s, MyRank:%s, " "data:%s, shape:%s", self.zmq_address, @@ -328,6 +325,7 @@ def _listen_for_requests(self): [remote_address, msgpack.dumps(data)]) if data["ret"] == 0: + comm, rank = self.comms[remote_address.decode()] self._send(comm, tensor.to(self.device), rank ^ 1, self.send_stream) @@ -373,7 +371,7 @@ def _send_sync( self._create_connect(remote_address) sock = self.socks[remote_address] - comm, rank, stream = self.comms[remote_address] + comm, rank = self.comms[remote_address] data = { "cmd": "PUT", "tensor_id": tensor_id, @@ -395,7 +393,7 @@ def _send_sync( response.decode()) return False - self._send(comm, tensor.to(self.device), rank ^ 1, stream) + self._send(comm, tensor.to(self.device), rank ^ 1, self.send_stream) logger.info("๐Ÿ”ตSend Tensor, %s๐Ÿ‘‰%s, MyRank:%s, data:%s, tensor:%s", self.zmq_address, remote_address, rank, data, tensor.shape) return True From 13fa8b6993e0cbb14414c80380de48a0e25f5b93 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 30 Apr 2025 14:29:35 +0800 Subject: [PATCH 055/155] add mem pool Signed-off-by: Abatom --- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 35 +- .../kv_transfer/tensor_memory_pool.py | 343 ++++++++++++++++++ 2 files changed, 365 insertions(+), 13 deletions(-) create mode 100644 vllm/distributed/kv_transfer/tensor_memory_pool.py diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index f9e4503e0d6f..e954fa30d528 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -14,6 +14,8 @@ from vllm.config import KVTransferConfig from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum) +from vllm.distributed.kv_transfer.tensor_memory_pool import ( + CudaPinnedMemoryPool) from vllm.utils import current_stream, get_ip logger = logging.getLogger(__name__) @@ -72,6 +74,8 @@ def __init__(self, self.send_stream = torch.cuda.Stream() self.recv_stream = torch.cuda.Stream() + self.pool = CudaPinnedMemoryPool(max_block_size=100 * 1024**3) # 100GB + # The sending type includes tree mutually exclusive options: # PUT, GET, PUT_ASYNC. 
self.send_type = self.config.get_from_extra_config("send_type", "PUT") @@ -87,8 +91,8 @@ def __init__(self, daemon=True) self._send_thread.start() - self.recv_store: Dict[str, - torch.Tensor] = {} # tensor_id: torch.Tensor + # tensor_id: torch.Tensor/(addr, dtype, shape) + self.recv_store: Dict[str, Any] = {} self.socks: Dict[str, Any] = {} # remote_address: client socket self.comms: Dict[str, Any] = {} # remote_address: (ncclComm_t, rank) @@ -191,9 +195,15 @@ def recv_tensor( while len(self.recv_store) > 10000: self.recv_store.pop(next(iter(self.recv_store))) - duration = time.time() - start_time if tensor is not None: - self.buffer_size -= (tensor.element_size() * tensor.numel()) + if isinstance(tensor, tuple): + addr, dtype, shape = tensor + tensor = self.pool.load_tensor(addr, dtype, shape) + self.pool.free(addr) + else: + self.buffer_size -= (tensor.element_size() * + tensor.numel()) + duration = time.time() - start_time logger.info( "๐Ÿ”ต[PUT]Recv From %s, tensor_id:%s, shape:%s, " "duration:%.3fms, size:%.3fGB, rank:%d", remote_address, @@ -201,6 +211,7 @@ def recv_tensor( tensor.element_size() * tensor.numel() / 1024**3, self.rank) else: + duration = time.time() - start_time logger.warning( "๐Ÿ”ด[PUT]Recv From %s, tensor_id:%s, duration:%.3fms, " "rank:%d", remote_address, tensor_id, duration * 1000, @@ -268,24 +279,22 @@ def _listen_for_requests(self): dtype=getattr( torch, data["dtype"]), device=self.device) - + self.router_socket.send_multipart( + [remote_address, b"0"]) + comm, rank = self.comms[remote_address.decode()] + self._recv(comm, tensor, rank ^ 1, self.recv_stream) tensor_size = tensor.element_size() * tensor.numel() if (self.buffer_size + tensor_size > self.buffer_size_threshold): - self.router_socket.send_multipart( - [remote_address, b"2"]) + # Store Tensor in memory pool + addr = self.pool.store_tensor(tensor) + tensor = (addr, tensor.dtype, tensor.shape) logger.warning( "๐Ÿ”ด[PUT]Recv Tensor, Out Of Threshold, " "%s๐Ÿ‘ˆ%s, data:%s", self.zmq_address, remote_address.decode(), data) - tensor = None else: self.buffer_size += tensor_size - self.router_socket.send_multipart( - [remote_address, b"0"]) - comm, rank = self.comms[remote_address.decode()] - self._recv(comm, tensor, rank ^ 1, - self.recv_stream) logger.info( "๐Ÿ”ต[PUT]Recv Tensor, %s๐Ÿ‘ˆ%s, MyRank:%s, " "data:%s, shape:%s", self.zmq_address, diff --git a/vllm/distributed/kv_transfer/tensor_memory_pool.py b/vllm/distributed/kv_transfer/tensor_memory_pool.py new file mode 100644 index 000000000000..7f2e6951fa10 --- /dev/null +++ b/vllm/distributed/kv_transfer/tensor_memory_pool.py @@ -0,0 +1,343 @@ +# SPDX-License-Identifier: Apache-2.0 + +import atexit +import ctypes +import math +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, TypeVar + +import tensor_store_load_mem as cuda_kernels +import torch + +# Define type variable +T = TypeVar('T') + + +@dataclass +class MemoryBlock: + size: int + addr: int + is_free: bool = True + tensor: Optional[torch.Tensor] = None + buddy: Optional['MemoryBlock'] = None + prev: Optional['MemoryBlock'] = None + next: Optional['MemoryBlock'] = None + + +class PinnedMemoryPool: + + def __init__(self, max_block_size: int, min_block_size: int = 512): + """ + Initialize pinned memory pool + :param max_block_size: Maximum block size (bytes) + :param min_block_size: Minimum block size (bytes), default is 512 bytes + """ + if max_block_size <= 0 or min_block_size <= 0: + raise ValueError("Block sizes must be positive") + if max_block_size < 
min_block_size: + raise ValueError( + "Max block size must be greater than min block size") + + # Ensure block sizes are powers of two + self.max_block_size = self._round_to_power_of_two(max_block_size) + self.min_block_size = self._round_to_power_of_two(min_block_size) + + # Initialize buddy system free lists + self.free_lists: Dict[int, List[MemoryBlock]] = {} + self.allocated_blocks: Dict[int, MemoryBlock] = { + } # Address to block mapping + + # Create free lists for largest blocks + self._initialize_free_lists() + + # Allocate actual pinned memory + self._allocate_pinned_memory() + + # Register cleanup function + atexit.register(self.cleanup) + + def _round_to_power_of_two(self, size: int) -> int: + """Round size to nearest power of two""" + return 1 << (size - 1).bit_length() + + def _initialize_free_lists(self): + """Initialize free lists""" + size = self.max_block_size + while size >= self.min_block_size: + self.free_lists[size] = [] + size = size // 2 + + def _allocate_pinned_memory(self): + """Allocate pinned memory""" + # Use PyTorch to allocate pinned memory + self.base_tensor = torch.empty(self.max_block_size // 4, + dtype=torch.float32, + pin_memory=True) + + # Get raw pointer address + self.base_address = self.base_tensor.data_ptr() + + # Create largest memory block + initial_block = MemoryBlock(size=self.max_block_size, + addr=self.base_address) + self.free_lists[self.max_block_size].append(initial_block) + + def allocate(self, size: int) -> int: + """ + Allocate memory + :param size: Required size (bytes) + :return: Allocated memory address + """ + if size <= 0: + raise ValueError("Allocation size must be positive") + + # Calculate minimum required block size + required_size = self._round_to_power_of_two( + max(size, self.min_block_size)) + + # Check if we have a large enough block + if required_size > self.max_block_size: + raise MemoryError("Requested size exceeds maximum block size") + + # Find suitable block in free lists + current_size = required_size + while current_size <= self.max_block_size: + if self.free_lists[current_size]: + # Found suitable block + block = self.free_lists[current_size].pop() + self._split_block(block, required_size) + block.is_free = False + self.allocated_blocks[block.addr] = block + return block.addr + current_size *= 2 + + # No suitable block found + raise MemoryError("Insufficient memory") + + def _split_block(self, block: MemoryBlock, required_size: int): + """ + Split memory block until reaching required size + """ + while (block.size > required_size + and block.size // 2 >= self.min_block_size): + # Create buddy block + buddy_size = block.size // 2 + buddy_addr = block.addr + buddy_size + + buddy = MemoryBlock(size=buddy_size, addr=buddy_addr) + block.size = buddy_size + + # Set buddy relationship + block.buddy = buddy + buddy.buddy = block + + # Add buddy to free list + self.free_lists[buddy_size].append(buddy) + + def free(self, addr: int): + """ + Free memory + :param addr: Memory address to free + """ + if addr not in self.allocated_blocks: + raise ValueError("Invalid address to free") + + block = self.allocated_blocks.pop(addr) + block.is_free = True + + # Try to merge buddy blocks + self._merge_buddies(block) + + def _merge_buddies(self, block: MemoryBlock): + """ + Attempt to merge buddy blocks + """ + while block.buddy and block.buddy.is_free: + # Get buddy + buddy = block.buddy + + # Remove buddy from free list + self.free_lists[buddy.size].remove(buddy) + + # Determine merged block address (take smaller address of the two) + 
merged_addr = min(block.addr, buddy.addr) + merged_size = block.size * 2 + + # Create merged block + merged_block = MemoryBlock(size=merged_size, addr=merged_addr) + + # Set new block's buddy relationship + if merged_block.size < self.max_block_size: + # Find new block's buddy + buddy_offset = merged_size if merged_addr % ( + 2 * merged_size) == 0 else -merged_size + buddy_addr = merged_addr + buddy_offset + + # Look for potential buddy in free lists + for existing_block in self.free_lists[merged_size]: + if existing_block.addr == buddy_addr: + merged_block.buddy = existing_block + existing_block.buddy = merged_block + break + + # Add merged block to free list + self.free_lists[merged_size].append(merged_block) + + # Update current block to merged block + block = merged_block + + def store_tensor(self, tensor: torch.Tensor) -> int: + """ + Store Tensor in memory pool + :param tensor: CUDA Tensor to store + :return: Stored memory address + """ + if not tensor.is_cuda: + raise ValueError("Only CUDA tensors can be stored") + + # Calculate required size (bytes) + size = tensor.element_size() * tensor.numel() + + # Allocate memory + addr = self.allocate(size) + + # Get block + block = self.allocated_blocks[addr] + + # Create pinned CPU Tensor view + dtype_size = tensor.element_size() + num_elements = size // dtype_size + cpu_tensor = torch.frombuffer(ctypes.cast( + block.addr, ctypes.POINTER(ctypes.c_byte)), + count=num_elements, + dtype=tensor.dtype) + + # Asynchronously copy data to pinned memory + with torch.cuda.stream(torch.cuda.Stream()): + cpu_tensor.copy_(tensor, non_blocking=True) + + # Save Tensor metadata + block.tensor = tensor + + return addr + + def load_tensor(self, addr: int, dtype: torch.dtype, + shape: Tuple[int, ...]) -> torch.Tensor: + """ + Load Tensor from memory pool + :param addr: Stored memory address + :param dtype: Tensor data type + :param shape: Tensor shape + :return: Recovered CUDA Tensor + """ + if addr not in self.allocated_blocks: + raise ValueError("Invalid address to load") + + block = self.allocated_blocks[addr] + + # Calculate element size and count + dtype_size = torch.tensor([], dtype=dtype).element_size() + num_elements = math.prod(shape) + required_size = dtype_size * num_elements + + if required_size > block.size: + raise ValueError("Requested tensor size exceeds block size") + + # Create CUDA Tensor + cuda_tensor = torch.empty(shape, dtype=dtype, device='cuda') + + # Create pinned CPU Tensor view + cpu_tensor = torch.frombuffer(ctypes.cast( + block.addr, ctypes.POINTER(ctypes.c_byte)), + count=num_elements, + dtype=dtype) + + # Asynchronously copy data to CUDA + with torch.cuda.stream(torch.cuda.Stream()): + cuda_tensor.copy_(cpu_tensor[:num_elements], non_blocking=True) + + return cuda_tensor + + def cleanup(self): + """Clean up all resources""" + self.free_lists.clear() + self.allocated_blocks.clear() + if hasattr(self, 'base_tensor'): + del self.base_tensor + + def __del__(self): + self.cleanup() + + +class CudaPinnedMemoryPool(PinnedMemoryPool): + + def __init__(self, max_block_size: int, min_block_size: int = 512): + super().__init__(max_block_size, min_block_size) + + def store_tensor(self, tensor: torch.Tensor) -> int: + """Store Tensor using CUDA kernel""" + if not tensor.is_cuda: + raise ValueError("Only CUDA tensors can be stored") + + # Calculate required size (bytes) + size = tensor.element_size() * tensor.numel() + + # Allocate memory (ensure enough space) + addr = self.allocate(size) + block = self.allocated_blocks[addr] + + # Verify 
allocated size is sufficient + if block.size < size: + self.free(addr) + raise MemoryError( + f"Allocated block size {block.size} is smaller than " + f"required size {size}") + + # Create pinned CPU Tensor view + try: + # Use ctypes to create correctly sized buffer + buffer = (ctypes.c_byte * block.size).from_address(block.addr) + cpu_tensor = torch.frombuffer(buffer, + dtype=tensor.dtype, + count=tensor.numel()) + except ValueError as e: + self.free(addr) + raise MemoryError(f"Failed to create tensor view: {e}") from e + + # Use CUDA kernel to copy data + cuda_kernels.store_tensor(tensor, cpu_tensor) + + # Synchronize to ensure copy completes + torch.cuda.synchronize() + + block.tensor = tensor + return addr + + def load_tensor(self, addr: int, dtype: torch.dtype, + shape: Tuple[int, ...]) -> torch.Tensor: + """Load Tensor using CUDA kernel""" + if addr not in self.allocated_blocks: + raise ValueError("Invalid address to load") + + block = self.allocated_blocks[addr] + num_elements = math.prod(shape) + dtype_size = torch.tensor([], dtype=dtype).element_size() + required_size = num_elements * dtype_size + + if required_size > block.size: + raise ValueError("Requested tensor size exceeds block size") + + # Create CUDA Tensor + cuda_tensor = torch.empty(shape, dtype=dtype, device='cuda') + + # Create pinned CPU Tensor view + buffer = (ctypes.c_byte * block.size).from_address(block.addr) + cpu_tensor = torch.frombuffer(buffer, dtype=dtype, count=num_elements) + + # Use CUDA kernel to copy data + cuda_kernels.load_tensor(cpu_tensor, cuda_tensor) + + # Synchronize to ensure copy completes + torch.cuda.synchronize() + + return cuda_tensor From d715d6b68fe958e5d67cdcb217d6ec5ad1641e22 Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 6 May 2025 21:08:45 +0800 Subject: [PATCH 056/155] improve mempool Signed-off-by: Abatom --- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 5 +- .../kv_transfer/tensor_memory_pool.py | 267 ++++-------------- 2 files changed, 56 insertions(+), 216 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index e954fa30d528..d6eb58b7c070 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -15,7 +15,7 @@ from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum) from vllm.distributed.kv_transfer.tensor_memory_pool import ( - CudaPinnedMemoryPool) + TensorMemoryPool) from vllm.utils import current_stream, get_ip logger = logging.getLogger(__name__) @@ -74,7 +74,7 @@ def __init__(self, self.send_stream = torch.cuda.Stream() self.recv_stream = torch.cuda.Stream() - self.pool = CudaPinnedMemoryPool(max_block_size=100 * 1024**3) # 100GB + self.pool = TensorMemoryPool(max_block_size=100 * 1024**3) # 100GB # The sending type includes tree mutually exclusive options: # PUT, GET, PUT_ASYNC. 
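With TensorMemoryPool wired into the pipe, a PUT that would exceed kv_buffer_size is no longer dropped: the received tensor is copied out to the pinned host pool and recv_store keeps an (addr, dtype, shape) tuple instead of the tensor, which recv_tensor later turns back into a CUDA tensor via load_tensor. A rough sketch of that spill/reload round trip with a toy pool standing in for the buddy allocator (the names and dict-based bookkeeping below are illustrative, not the actual implementation):

    import torch

    class HostSpillPool:
        """Toy stand-in for TensorMemoryPool: pinned host blocks keyed by address."""

        def __init__(self) -> None:
            self._blocks: dict[int, torch.Tensor] = {}
            self._next_addr = 0

        def store_tensor(self, tensor: torch.Tensor) -> int:
            cpu = torch.empty(tensor.shape, dtype=tensor.dtype,
                              device="cpu", pin_memory=True)
            cpu.copy_(tensor)                        # blocking copy for simplicity
            addr, self._next_addr = self._next_addr, self._next_addr + 1
            self._blocks[addr] = cpu
            return addr

        def load_tensor(self, addr: int, dtype: torch.dtype,
                        shape: torch.Size) -> torch.Tensor:
            cpu = self._blocks.pop(addr)             # block is freed on reload
            return cpu.view(shape).to(device="cuda", dtype=dtype)

    pool = HostSpillPool()
    recv_store: dict[str, object] = {}

    def on_put(tensor_id: str, tensor: torch.Tensor, over_budget: bool) -> None:
        # Spill to pinned host memory when the GPU buffer budget is exhausted,
        # otherwise keep the tensor on-device, mirroring _listen_for_requests.
        recv_store[tensor_id] = ((pool.store_tensor(tensor), tensor.dtype,
                                  tensor.shape) if over_budget else tensor)

    def on_get(tensor_id: str) -> torch.Tensor:
        # Mirrors recv_tensor: unwrap spilled entries, pass through GPU ones.
        entry = recv_store.pop(tensor_id)
        if isinstance(entry, tuple):
            addr, dtype, shape = entry
            return pool.load_tensor(addr, dtype, shape)
        return entry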
@@ -199,7 +199,6 @@ def recv_tensor( if isinstance(tensor, tuple): addr, dtype, shape = tensor tensor = self.pool.load_tensor(addr, dtype, shape) - self.pool.free(addr) else: self.buffer_size -= (tensor.element_size() * tensor.numel()) diff --git a/vllm/distributed/kv_transfer/tensor_memory_pool.py b/vllm/distributed/kv_transfer/tensor_memory_pool.py index 7f2e6951fa10..a76c33cfd4cc 100644 --- a/vllm/distributed/kv_transfer/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/tensor_memory_pool.py @@ -4,13 +4,10 @@ import ctypes import math from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple, TypeVar +from typing import Dict, Optional, Tuple -import tensor_store_load_mem as cuda_kernels import torch - -# Define type variable -T = TypeVar('T') +import tensor_store_load_mem as cuda_kernels @dataclass @@ -20,302 +17,140 @@ class MemoryBlock: is_free: bool = True tensor: Optional[torch.Tensor] = None buddy: Optional['MemoryBlock'] = None - prev: Optional['MemoryBlock'] = None - next: Optional['MemoryBlock'] = None -class PinnedMemoryPool: - +class TensorMemoryPool: def __init__(self, max_block_size: int, min_block_size: int = 512): - """ - Initialize pinned memory pool - :param max_block_size: Maximum block size (bytes) - :param min_block_size: Minimum block size (bytes), default is 512 bytes - """ if max_block_size <= 0 or min_block_size <= 0: raise ValueError("Block sizes must be positive") if max_block_size < min_block_size: - raise ValueError( - "Max block size must be greater than min block size") + raise ValueError("Max block size must be greater than min block size") - # Ensure block sizes are powers of two self.max_block_size = self._round_to_power_of_two(max_block_size) self.min_block_size = self._round_to_power_of_two(min_block_size) - # Initialize buddy system free lists - self.free_lists: Dict[int, List[MemoryBlock]] = {} - self.allocated_blocks: Dict[int, MemoryBlock] = { - } # Address to block mapping + self.free_lists: Dict[int, Dict[int, MemoryBlock]] = {} + self.allocated_blocks: Dict[int, MemoryBlock] = {} - # Create free lists for largest blocks self._initialize_free_lists() - - # Allocate actual pinned memory self._allocate_pinned_memory() - # Register cleanup function atexit.register(self.cleanup) + self.store_stream = torch.cuda.Stream() + self.load_stream = torch.cuda.Stream() + def _round_to_power_of_two(self, size: int) -> int: - """Round size to nearest power of two""" return 1 << (size - 1).bit_length() def _initialize_free_lists(self): - """Initialize free lists""" size = self.max_block_size while size >= self.min_block_size: - self.free_lists[size] = [] - size = size // 2 + self.free_lists[size] = {} + size //= 2 def _allocate_pinned_memory(self): - """Allocate pinned memory""" - # Use PyTorch to allocate pinned memory - self.base_tensor = torch.empty(self.max_block_size // 4, - dtype=torch.float32, - pin_memory=True) - - # Get raw pointer address + self.base_tensor = torch.empty(self.max_block_size // 4, dtype=torch.float32, pin_memory=True) self.base_address = self.base_tensor.data_ptr() - - # Create largest memory block - initial_block = MemoryBlock(size=self.max_block_size, - addr=self.base_address) - self.free_lists[self.max_block_size].append(initial_block) + initial_block = MemoryBlock(size=self.max_block_size, addr=self.base_address) + self.free_lists[self.max_block_size][initial_block.addr] = initial_block def allocate(self, size: int) -> int: - """ - Allocate memory - :param size: Required size (bytes) - :return: 
Allocated memory address - """ if size <= 0: raise ValueError("Allocation size must be positive") - # Calculate minimum required block size - required_size = self._round_to_power_of_two( - max(size, self.min_block_size)) - - # Check if we have a large enough block + required_size = self._round_to_power_of_two(max(size, self.min_block_size)) if required_size > self.max_block_size: raise MemoryError("Requested size exceeds maximum block size") - # Find suitable block in free lists current_size = required_size while current_size <= self.max_block_size: if self.free_lists[current_size]: - # Found suitable block - block = self.free_lists[current_size].pop() + addr, block = self.free_lists[current_size].popitem() self._split_block(block, required_size) block.is_free = False self.allocated_blocks[block.addr] = block return block.addr current_size *= 2 - # No suitable block found raise MemoryError("Insufficient memory") def _split_block(self, block: MemoryBlock, required_size: int): - """ - Split memory block until reaching required size - """ - while (block.size > required_size - and block.size // 2 >= self.min_block_size): - # Create buddy block + while block.size > required_size and block.size // 2 >= self.min_block_size: buddy_size = block.size // 2 buddy_addr = block.addr + buddy_size buddy = MemoryBlock(size=buddy_size, addr=buddy_addr) block.size = buddy_size - # Set buddy relationship block.buddy = buddy buddy.buddy = block - # Add buddy to free list - self.free_lists[buddy_size].append(buddy) + self.free_lists[buddy_size][buddy.addr] = buddy def free(self, addr: int): - """ - Free memory - :param addr: Memory address to free - """ if addr not in self.allocated_blocks: raise ValueError("Invalid address to free") block = self.allocated_blocks.pop(addr) block.is_free = True - - # Try to merge buddy blocks self._merge_buddies(block) def _merge_buddies(self, block: MemoryBlock): - """ - Attempt to merge buddy blocks - """ - while block.buddy and block.buddy.is_free: - # Get buddy + MAX_MERGE_DEPTH = 20 + depth = 0 + + while block.buddy and block.buddy.is_free and depth < MAX_MERGE_DEPTH: buddy = block.buddy - # Remove buddy from free list - self.free_lists[buddy.size].remove(buddy) + if buddy.addr in self.free_lists[buddy.size]: + del self.free_lists[buddy.size][buddy.addr] + else: + break - # Determine merged block address (take smaller address of the two) merged_addr = min(block.addr, buddy.addr) merged_size = block.size * 2 - - # Create merged block merged_block = MemoryBlock(size=merged_size, addr=merged_addr) - # Set new block's buddy relationship if merged_block.size < self.max_block_size: - # Find new block's buddy - buddy_offset = merged_size if merged_addr % ( - 2 * merged_size) == 0 else -merged_size + buddy_offset = merged_size if merged_addr % (2 * merged_size) == 0 else -merged_size buddy_addr = merged_addr + buddy_offset + existing_buddy = self.free_lists[merged_size].get(buddy_addr) + if existing_buddy: + merged_block.buddy = existing_buddy + existing_buddy.buddy = merged_block - # Look for potential buddy in free lists - for existing_block in self.free_lists[merged_size]: - if existing_block.addr == buddy_addr: - merged_block.buddy = existing_block - existing_block.buddy = merged_block - break - - # Add merged block to free list - self.free_lists[merged_size].append(merged_block) - - # Update current block to merged block + self.free_lists[merged_size][merged_block.addr] = merged_block block = merged_block + depth += 1 def store_tensor(self, tensor: torch.Tensor) -> int: - """ 
- Store Tensor in memory pool - :param tensor: CUDA Tensor to store - :return: Stored memory address - """ - if not tensor.is_cuda: - raise ValueError("Only CUDA tensors can be stored") - - # Calculate required size (bytes) - size = tensor.element_size() * tensor.numel() - - # Allocate memory - addr = self.allocate(size) - - # Get block - block = self.allocated_blocks[addr] - - # Create pinned CPU Tensor view - dtype_size = tensor.element_size() - num_elements = size // dtype_size - cpu_tensor = torch.frombuffer(ctypes.cast( - block.addr, ctypes.POINTER(ctypes.c_byte)), - count=num_elements, - dtype=tensor.dtype) - - # Asynchronously copy data to pinned memory - with torch.cuda.stream(torch.cuda.Stream()): - cpu_tensor.copy_(tensor, non_blocking=True) - - # Save Tensor metadata - block.tensor = tensor - - return addr - - def load_tensor(self, addr: int, dtype: torch.dtype, - shape: Tuple[int, ...]) -> torch.Tensor: - """ - Load Tensor from memory pool - :param addr: Stored memory address - :param dtype: Tensor data type - :param shape: Tensor shape - :return: Recovered CUDA Tensor - """ - if addr not in self.allocated_blocks: - raise ValueError("Invalid address to load") - - block = self.allocated_blocks[addr] - - # Calculate element size and count - dtype_size = torch.tensor([], dtype=dtype).element_size() - num_elements = math.prod(shape) - required_size = dtype_size * num_elements - - if required_size > block.size: - raise ValueError("Requested tensor size exceeds block size") - - # Create CUDA Tensor - cuda_tensor = torch.empty(shape, dtype=dtype, device='cuda') - - # Create pinned CPU Tensor view - cpu_tensor = torch.frombuffer(ctypes.cast( - block.addr, ctypes.POINTER(ctypes.c_byte)), - count=num_elements, - dtype=dtype) - - # Asynchronously copy data to CUDA - with torch.cuda.stream(torch.cuda.Stream()): - cuda_tensor.copy_(cpu_tensor[:num_elements], non_blocking=True) - - return cuda_tensor - - def cleanup(self): - """Clean up all resources""" - self.free_lists.clear() - self.allocated_blocks.clear() - if hasattr(self, 'base_tensor'): - del self.base_tensor - - def __del__(self): - self.cleanup() - - -class CudaPinnedMemoryPool(PinnedMemoryPool): - - def __init__(self, max_block_size: int, min_block_size: int = 512): - super().__init__(max_block_size, min_block_size) - - def store_tensor(self, tensor: torch.Tensor) -> int: - """Store Tensor using CUDA kernel""" if not tensor.is_cuda: raise ValueError("Only CUDA tensors can be stored") - # Calculate required size (bytes) size = tensor.element_size() * tensor.numel() - - # Allocate memory (ensure enough space) addr = self.allocate(size) block = self.allocated_blocks[addr] - # Verify allocated size is sufficient if block.size < size: self.free(addr) - raise MemoryError( - f"Allocated block size {block.size} is smaller than " - f"required size {size}") + raise MemoryError(f"Allocated block size {block.size} is smaller than required size {size}") - # Create pinned CPU Tensor view try: - # Use ctypes to create correctly sized buffer buffer = (ctypes.c_byte * block.size).from_address(block.addr) - cpu_tensor = torch.frombuffer(buffer, - dtype=tensor.dtype, - count=tensor.numel()) + cpu_tensor = torch.frombuffer(buffer, dtype=tensor.dtype, count=tensor.numel()) except ValueError as e: self.free(addr) - raise MemoryError(f"Failed to create tensor view: {e}") from e + raise MemoryError(f"Failed to create tensor view: {e}") - # Use CUDA kernel to copy data - cuda_kernels.store_tensor(tensor, cpu_tensor) - - # Synchronize to ensure copy 
completes - torch.cuda.synchronize() + with torch.cuda.stream(self.store_stream): + cuda_kernels.store_tensor(tensor, cpu_tensor) + self.store_stream.synchronize() block.tensor = tensor return addr - def load_tensor(self, addr: int, dtype: torch.dtype, - shape: Tuple[int, ...]) -> torch.Tensor: - """Load Tensor using CUDA kernel""" + def load_tensor(self, addr: int, dtype: torch.dtype, shape: Tuple[int, ...]) -> torch.Tensor: if addr not in self.allocated_blocks: raise ValueError("Invalid address to load") @@ -327,17 +162,23 @@ def load_tensor(self, addr: int, dtype: torch.dtype, if required_size > block.size: raise ValueError("Requested tensor size exceeds block size") - # Create CUDA Tensor - cuda_tensor = torch.empty(shape, dtype=dtype, device='cuda') - - # Create pinned CPU Tensor view buffer = (ctypes.c_byte * block.size).from_address(block.addr) cpu_tensor = torch.frombuffer(buffer, dtype=dtype, count=num_elements) - # Use CUDA kernel to copy data - cuda_kernels.load_tensor(cpu_tensor, cuda_tensor) + cuda_tensor = torch.empty(shape, dtype=dtype, device='cuda') - # Synchronize to ensure copy completes - torch.cuda.synchronize() + with torch.cuda.stream(self.load_stream): + cuda_kernels.load_tensor(cpu_tensor, cuda_tensor) + self.load_stream.synchronize() + self.free(addr) return cuda_tensor + + def cleanup(self): + self.free_lists.clear() + self.allocated_blocks.clear() + if hasattr(self, 'base_tensor'): + del self.base_tensor + + def __del__(self): + self.cleanup() From 3259540411b8c0f37229fbb52c59cea5630b94f9 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 7 May 2025 13:07:15 +0800 Subject: [PATCH 057/155] bugfix Signed-off-by: Abatom --- .../kv_transfer/tensor_memory_pool.py | 45 ++++++------------- 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/vllm/distributed/kv_transfer/tensor_memory_pool.py b/vllm/distributed/kv_transfer/tensor_memory_pool.py index a76c33cfd4cc..49b62bb95893 100644 --- a/vllm/distributed/kv_transfer/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/tensor_memory_pool.py @@ -14,9 +14,6 @@ class MemoryBlock: size: int addr: int - is_free: bool = True - tensor: Optional[torch.Tensor] = None - buddy: Optional['MemoryBlock'] = None class TensorMemoryPool: @@ -54,6 +51,7 @@ def _allocate_pinned_memory(self): self.base_address = self.base_tensor.data_ptr() initial_block = MemoryBlock(size=self.max_block_size, addr=self.base_address) self.free_lists[self.max_block_size][initial_block.addr] = initial_block + print("TensorMemoryPool, base_address:", self.base_address, self.base_address % self.max_block_size) def allocate(self, size: int) -> int: if size <= 0: @@ -66,9 +64,8 @@ def allocate(self, size: int) -> int: current_size = required_size while current_size <= self.max_block_size: if self.free_lists[current_size]: - addr, block = self.free_lists[current_size].popitem() + _, block = self.free_lists[current_size].popitem() self._split_block(block, required_size) - block.is_free = False self.allocated_blocks[block.addr] = block return block.addr current_size *= 2 @@ -83,9 +80,6 @@ def _split_block(self, block: MemoryBlock, required_size: int): buddy = MemoryBlock(size=buddy_size, addr=buddy_addr) block.size = buddy_size - block.buddy = buddy - buddy.buddy = block - self.free_lists[buddy_size][buddy.addr] = buddy def free(self, addr: int): @@ -93,37 +87,27 @@ def free(self, addr: int): raise ValueError("Invalid address to free") block = self.allocated_blocks.pop(addr) - block.is_free = True self._merge_buddies(block) def _merge_buddies(self, 
block: MemoryBlock): - MAX_MERGE_DEPTH = 20 + MAX_MERGE_DEPTH = 30 depth = 0 - while block.buddy and block.buddy.is_free and depth < MAX_MERGE_DEPTH: - buddy = block.buddy - - if buddy.addr in self.free_lists[buddy.size]: + while depth < MAX_MERGE_DEPTH: + buddy_offset = block.size if (block.addr - self.base_address) % (2 * block.size) == 0 else -block.size + buddy_addr = block.addr + buddy_offset + buddy = self.free_lists[block.size].get(buddy_addr) + if buddy: del self.free_lists[buddy.size][buddy.addr] + merged_addr = min(block.addr, buddy.addr) + merged_size = block.size * 2 + merged_block = MemoryBlock(size=merged_size, addr=merged_addr) + block = merged_block + depth += 1 else: + self.free_lists[block.size][block.addr] = block break - merged_addr = min(block.addr, buddy.addr) - merged_size = block.size * 2 - merged_block = MemoryBlock(size=merged_size, addr=merged_addr) - - if merged_block.size < self.max_block_size: - buddy_offset = merged_size if merged_addr % (2 * merged_size) == 0 else -merged_size - buddy_addr = merged_addr + buddy_offset - existing_buddy = self.free_lists[merged_size].get(buddy_addr) - if existing_buddy: - merged_block.buddy = existing_buddy - existing_buddy.buddy = merged_block - - self.free_lists[merged_size][merged_block.addr] = merged_block - block = merged_block - depth += 1 - def store_tensor(self, tensor: torch.Tensor) -> int: if not tensor.is_cuda: raise ValueError("Only CUDA tensors can be stored") @@ -147,7 +131,6 @@ def store_tensor(self, tensor: torch.Tensor) -> int: cuda_kernels.store_tensor(tensor, cpu_tensor) self.store_stream.synchronize() - block.tensor = tensor return addr def load_tensor(self, addr: int, dtype: torch.dtype, shape: Tuple[int, ...]) -> torch.Tensor: From f5250012582a2f614278c4942bac5244e6fc4697 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 7 May 2025 20:51:57 +0800 Subject: [PATCH 058/155] torch.cuda.Event Signed-off-by: Abatom --- .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 14 ++++++++------ .../kv_transfer/tensor_memory_pool.py | 18 +++++++++++------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index d6eb58b7c070..4aec921bf57b 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -198,17 +198,19 @@ def recv_tensor( if tensor is not None: if isinstance(tensor, tuple): addr, dtype, shape = tensor - tensor = self.pool.load_tensor(addr, dtype, shape) + tensor = self.pool.load_tensor(addr, dtype, shape, + self.device) else: + addr = 0 self.buffer_size -= (tensor.element_size() * tensor.numel()) duration = time.time() - start_time logger.info( "๐Ÿ”ต[PUT]Recv From %s, tensor_id:%s, shape:%s, " - "duration:%.3fms, size:%.3fGB, rank:%d", remote_address, - tensor_id, tensor.shape, duration * 1000, + "duration:%.3fms, size:%.3fGB, addr:%d, rank:%d", + remote_address, tensor_id, tensor.shape, duration * 1000, tensor.element_size() * tensor.numel() / 1024**3, - self.rank) + addr, self.rank) else: duration = time.time() - start_time logger.warning( @@ -290,8 +292,8 @@ def _listen_for_requests(self): tensor = (addr, tensor.dtype, tensor.shape) logger.warning( "๐Ÿ”ด[PUT]Recv Tensor, Out Of Threshold, " - "%s๐Ÿ‘ˆ%s, data:%s", self.zmq_address, - remote_address.decode(), data) + "%s๐Ÿ‘ˆ%s, data:%s, addr:%d", self.zmq_address, + remote_address.decode(), data, addr) else: self.buffer_size += tensor_size logger.info( diff --git 
a/vllm/distributed/kv_transfer/tensor_memory_pool.py b/vllm/distributed/kv_transfer/tensor_memory_pool.py index 49b62bb95893..8b6acd6d3007 100644 --- a/vllm/distributed/kv_transfer/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/tensor_memory_pool.py @@ -101,12 +101,11 @@ def _merge_buddies(self, block: MemoryBlock): del self.free_lists[buddy.size][buddy.addr] merged_addr = min(block.addr, buddy.addr) merged_size = block.size * 2 - merged_block = MemoryBlock(size=merged_size, addr=merged_addr) - block = merged_block + block = MemoryBlock(size=merged_size, addr=merged_addr) depth += 1 else: - self.free_lists[block.size][block.addr] = block break + self.free_lists[block.size][block.addr] = block def store_tensor(self, tensor: torch.Tensor) -> int: if not tensor.is_cuda: @@ -127,13 +126,15 @@ def store_tensor(self, tensor: torch.Tensor) -> int: self.free(addr) raise MemoryError(f"Failed to create tensor view: {e}") + event = torch.cuda.Event() with torch.cuda.stream(self.store_stream): cuda_kernels.store_tensor(tensor, cpu_tensor) - self.store_stream.synchronize() + event.record() + event.wait() return addr - def load_tensor(self, addr: int, dtype: torch.dtype, shape: Tuple[int, ...]) -> torch.Tensor: + def load_tensor(self, addr: int, dtype: torch.dtype, shape: Tuple[int, ...], device) -> torch.Tensor: if addr not in self.allocated_blocks: raise ValueError("Invalid address to load") @@ -148,11 +149,14 @@ def load_tensor(self, addr: int, dtype: torch.dtype, shape: Tuple[int, ...]) -> buffer = (ctypes.c_byte * block.size).from_address(block.addr) cpu_tensor = torch.frombuffer(buffer, dtype=dtype, count=num_elements) - cuda_tensor = torch.empty(shape, dtype=dtype, device='cuda') + cuda_tensor = torch.empty(shape, dtype=dtype, device=device) + event = torch.cuda.Event() with torch.cuda.stream(self.load_stream): cuda_kernels.load_tensor(cpu_tensor, cuda_tensor) - self.load_stream.synchronize() + event.record() + event.wait() + self.free(addr) return cuda_tensor From 28ef7ccfefe5a7d2b3d8c69aff52eebde43cbfa0 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 10 May 2025 15:45:34 +0800 Subject: [PATCH 059/155] load_stream.synchronize and store_stream.synchronize Signed-off-by: Abatom --- vllm/distributed/kv_transfer/tensor_memory_pool.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/tensor_memory_pool.py b/vllm/distributed/kv_transfer/tensor_memory_pool.py index 8b6acd6d3007..518333fbcb40 100644 --- a/vllm/distributed/kv_transfer/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/tensor_memory_pool.py @@ -126,11 +126,9 @@ def store_tensor(self, tensor: torch.Tensor) -> int: self.free(addr) raise MemoryError(f"Failed to create tensor view: {e}") - event = torch.cuda.Event() with torch.cuda.stream(self.store_stream): cuda_kernels.store_tensor(tensor, cpu_tensor) - event.record() - event.wait() + self.store_stream.synchronize() return addr @@ -151,11 +149,9 @@ def load_tensor(self, addr: int, dtype: torch.dtype, shape: Tuple[int, ...], dev cuda_tensor = torch.empty(shape, dtype=dtype, device=device) - event = torch.cuda.Event() with torch.cuda.stream(self.load_stream): cuda_kernels.load_tensor(cpu_tensor, cuda_tensor) - event.record() - event.wait() + self.load_stream.synchronize() self.free(addr) From fab1d3363a4c2e4a004f31cc1d9b2b9fa776bd22 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 10 May 2025 16:26:48 +0800 Subject: [PATCH 060/155] stream.synchronize Signed-off-by: Abatom --- 
vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index 4aec921bf57b..ee7398415ad8 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -433,6 +433,7 @@ def _send(self, comm, tensor: torch.Tensor, dst: int, stream=None): self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), dst, comm, cudaStream_t(stream.cuda_stream)) + stream.synchronize() def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): assert tensor.device == self.device, ( @@ -445,6 +446,7 @@ def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, comm, cudaStream_t(stream.cuda_stream)) + stream.synchronize() def close(self) -> None: self._listener_thread.join() From 2400d0b2186fba4cbc9ee13c36d504856eacdc84 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sun, 11 May 2025 21:25:37 +0800 Subject: [PATCH 061/155] build_connector_meta Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p_nccl_connector.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index 641b0b0aba4d..24fae164a36e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -298,12 +298,6 @@ def build_connector_meta( block_ids=new_req.block_ids, block_size=self._block_size) total_need_load += 1 - else: - if self.is_producer: - meta.add_request(request_id=new_req.req_id, - token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids, - block_size=self._block_size) for cached_req in scheduler_output.scheduled_cached_reqs: # NOTE(rob): here we rely on the resumed requests being From 6eab9df8e61c7389a3a1f6acea43a0a8663e7754 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 12 May 2025 13:10:45 +0800 Subject: [PATCH 062/155] bugfix Signed-off-by: Abatom --- .../kv_connector/v1/p2p_nccl_connector.py | 11 +++++++++++ .../kv_transfer/kv_pipe/p2p_nccl_pipe.py | 13 +++++++++++++ 2 files changed, 24 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index 24fae164a36e..384012cf5503 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -170,6 +170,11 @@ def inject_kv_into_layer( kv_cache = self.p2p_nccl_pipe.recv_tensor(request.request_id + "-" + layer_name) + if kv_cache is None: + logger.warning("๐Ÿšงsrc_kv_cache is None, %s", + request.request_id) + continue + inject_kv_into_layer(kv_cache_layer, kv_cache, request.slot_mapping) @@ -298,6 +303,12 @@ def build_connector_meta( block_ids=new_req.block_ids, block_size=self._block_size) total_need_load += 1 + else: + if self.is_producer: + meta.add_request(request_id=new_req.req_id, + token_ids=new_req.prompt_token_ids, + block_ids=new_req.block_ids, + block_size=self._block_size) for cached_req in scheduler_output.scheduled_cached_reqs: # NOTE(rob): here we rely on the resumed requests being diff --git 
a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index ee7398415ad8..d39b8dfb3ac1 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -429,12 +429,19 @@ def _send(self, comm, tensor: torch.Tensor, dst: int, stream=None): if stream is None: stream = current_stream() + start_time = time.time() with torch.cuda.stream(stream): self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), dst, comm, cudaStream_t(stream.cuda_stream)) stream.synchronize() + duration = time.time() - start_time + logger.info( + "๐Ÿ•Nccl Send Tensor, shape:%s, duration:%.3fms, size:%.3fGB, " + "rank:%d", tensor.shape, duration * 1000, + tensor.element_size() * tensor.numel() / 1024 ** 3, dst) + def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " @@ -442,11 +449,17 @@ def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): if stream is None: stream = current_stream() + start_time = time.time() with torch.cuda.stream(stream): self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, comm, cudaStream_t(stream.cuda_stream)) stream.synchronize() + duration = time.time() - start_time + logger.info( + "๐Ÿ•Nccl Recv Tensor, shape:%s, duration:%.3fms, size:%.3fGB, " + "rank:%d", tensor.shape, duration * 1000, + tensor.element_size() * tensor.numel() / 1024 ** 3, src) def close(self) -> None: self._listener_thread.join() From acfa2ac82b440d762787a3f994bedfa95719d26b Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 12 May 2025 15:36:25 +0800 Subject: [PATCH 063/155] bugfix Signed-off-by: Abatom --- .../kv_connector/v1/p2p_nccl_connector.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index 384012cf5503..7fc7fd372122 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -131,14 +131,26 @@ def inject_kv_into_layer( page_size = dst_kv_cache_layer_shape[1] dst_kv_cache_layer = dst_kv_cache_layer.reshape( num_pages * page_size, -1) - dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache + num_token = src_kv_cache.shape[0] + if len(slot_mapping) == num_token: + dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache + else: + dst_kv_cache_layer[slot_mapping[:num_token], ...] = src_kv_cache + logger.warning("๐Ÿšงsrc_kv_cache does not match, num_slot:%d, num_token:%d", len(slot_mapping), num_token) + dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) else: num_pages = dst_kv_cache_layer_shape[1] page_size = dst_kv_cache_layer_shape[2] dst_kv_cache_layer = dst_kv_cache_layer.reshape( 2, num_pages * page_size, -1) - dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache + num_token = src_kv_cache.shape[1] + if len(slot_mapping) == num_token: + dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache + else: + dst_kv_cache_layer[:, slot_mapping[:num_token],...] 
= src_kv_cache + logger.warning("๐Ÿšงsrc_kv_cache does not match, num_slot:%d, num_token:%d", len(slot_mapping), num_token) + dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) # Get the metadata From bb596f7ce279fa612d65314a775c127b44b3a627 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 12 May 2025 18:15:54 +0800 Subject: [PATCH 064/155] proxy add round robin Signed-off-by: Abatom --- .../disagg_xpyd/disagg_prefill_proxy_xpyd.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index 47d03d7e9734..448ba6064625 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -11,6 +11,7 @@ import zmq from quart import Quart, make_response, request +count = 0 prefill_instances: dict[str, str] = {} # http_address: zmq_address decode_instances: dict[str, str] = {} # http_address: zmq_address @@ -102,21 +103,27 @@ async def handle_request(): # change max_tokens = 1 to let it only do prefill prefill_request['max_tokens'] = 1 + global count global prefill_instances global prefill_cv with prefill_cv: - prefill_addr, prefill_zmq_addr = random.choice( - list(prefill_instances.items())) - print("handle_request, prefill_addr: %s, zmq_addr: %s", - prefill_addr, prefill_zmq_addr) + # prefill_addr, prefill_zmq_addr = random.choice( + # list(prefill_instances.items())) + prefill_list = list(prefill_instances.items()) + prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)] global decode_instances global decode_cv with decode_cv: - decode_addr, decode_zmq_addr = random.choice( - list(decode_instances.items())) - print("handle_request, decode_addr: %s, zmq_addr: %s", decode_addr, - decode_zmq_addr) + # decode_addr, decode_zmq_addr = random.choice( + # list(decode_instances.items())) + decode_list = list(decode_instances.items()) + decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)] + + print(f"handle_request count: {count}, [HTTP:{prefill_addr}, " + f"ZMQ:{prefill_zmq_addr}] ๐Ÿ‘‰ [HTTP:{decode_addr}, " + f"ZMQ:{decode_zmq_addr}]") + count += 1 request_id = ( f"___prefill_addr_{prefill_zmq_addr}___decode_addr_{decode_zmq_addr}_{random_uuid()}" From 7b710f44c63f53fb085a90584d16e644db48589a Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 14 May 2025 14:02:08 +0800 Subject: [PATCH 065/155] mem_pool_size Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py index d39b8dfb3ac1..86d94dbf43e0 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py @@ -74,7 +74,8 @@ def __init__(self, self.send_stream = torch.cuda.Stream() self.recv_stream = torch.cuda.Stream() - self.pool = TensorMemoryPool(max_block_size=100 * 1024**3) # 100GB + mem_pool_size = self.config.get_from_extra_config("mem_pool_size", 128) + self.pool = TensorMemoryPool(max_block_size=mem_pool_size * 1024**3) # GB # The sending type includes tree mutually exclusive options: # PUT, GET, PUT_ASYNC. 
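
A side note on the pool being sized by this patch: TensorMemoryPool (reworked in patch 057 above) is a buddy allocator over a single pinned host buffer, so the "mem_pool_size" read from the connector's extra config becomes that allocator's max_block_size. The standalone sketch below restates, purely for illustration, the two rules the allocator relies on — power-of-two size rounding and the buddy-address computation relative to the pool's base address (the latter matching the buddy_offset formula introduced in patch 057). It is not part of the patch, and the helper names here are invented for the sketch.

    def round_to_power_of_two(size: int) -> int:
        # Smallest power of two >= size, which is what the pool's rounding
        # step needs to produce before a block can be split or merged.
        return 1 << (size - 1).bit_length()

    def buddy_addr(addr: int, size: int, base: int) -> int:
        # A block's buddy sits at +size or -size depending on whether the
        # block's offset from the pool base is an even multiple of 2*size.
        offset = addr - base
        return addr + size if offset % (2 * size) == 0 else addr - size

    # Toy pool: base address 0, one 1024-byte block split into two 512-byte halves.
    assert round_to_power_of_two(600) == 1024
    assert buddy_addr(0, 512, 0) == 512   # first half's buddy is the second half
    assert buddy_addr(512, 512, 0) == 0   # and the reverse

With the default of 128, the pool pre-allocates mem_pool_size * 1024**3 bytes of pinned memory (128 GiB); deployments that spill KV caches to host memory only rarely can lower this through the connector's extra config rather than patching the constant, which is the point of this commit.
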
From 942e1d584f88f75a2df9ca4d708442adb69c110a Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 14 May 2025 14:39:12 +0800 Subject: [PATCH 066/155] add tensor_mem_pool Signed-off-by: Abatom --- csrc/ops.h | 2 + csrc/tensor_store_load_mem.cu | 100 ++++++++++++++++++ csrc/torch_bindings.cpp | 5 + vllm/_custom_ops.py | 8 ++ .../kv_transfer/tensor_memory_pool.py | 6 +- 5 files changed, 118 insertions(+), 3 deletions(-) create mode 100644 csrc/tensor_store_load_mem.cu diff --git a/csrc/ops.h b/csrc/ops.h index 1dfd2e067e85..f471dfd80cc6 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -308,3 +308,5 @@ std::tuple allocate_shared_buffer_and_handle( int64_t size); int64_t open_mem_handle(torch::Tensor& mem_handle); void free_shared_buffer(int64_t buffer); +void store_tensor(torch::Tensor device_tensor, torch::Tensor host_tensor); +void load_tensor(torch::Tensor host_tensor, torch::Tensor device_tensor); diff --git a/csrc/tensor_store_load_mem.cu b/csrc/tensor_store_load_mem.cu new file mode 100644 index 000000000000..7cdfd54cb3d1 --- /dev/null +++ b/csrc/tensor_store_load_mem.cu @@ -0,0 +1,100 @@ +#include +#include + +// Template-based CUDA kernel: Copy from device memory to pinned host memory +template +__global__ void store_kernel(const scalar_t* device_ptr, scalar_t* host_ptr, size_t num_elements) { + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_elements) { + host_ptr[idx] = device_ptr[idx]; + } +} + +// Templated CUDA kernel: Copy from pinned host memory to device memory +template +__global__ void load_kernel(const scalar_t* host_ptr, scalar_t* device_ptr, size_t num_elements) { + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_elements) { + device_ptr[idx] = host_ptr[idx]; + } +} + +// Templated wrapper function: Store Tensor to pinned memory +template +void store_tensor_impl(torch::Tensor device_tensor, torch::Tensor host_tensor) { + const auto num_elements = device_tensor.numel(); + const int threads = 256; + const int blocks = (num_elements + threads - 1) / threads; + + auto device_ptr = device_tensor.data_ptr(); + auto host_ptr = host_tensor.data_ptr(); + + store_kernel<<>>( + device_ptr, host_ptr, num_elements); +} + +// Templated wrapper function: Load Tensor from pinned memory +template +void load_tensor_impl(torch::Tensor host_tensor, torch::Tensor device_tensor) { + const auto num_elements = host_tensor.numel(); + const int threads = 256; + const int blocks = (num_elements + threads - 1) / threads; + + auto host_ptr = host_tensor.data_ptr(); + auto device_ptr = device_tensor.data_ptr(); + + load_kernel<<>>( + host_ptr, device_ptr, num_elements); +} + +// Type-dispatched wrapper function +void store_tensor(torch::Tensor device_tensor, torch::Tensor host_tensor) { + // Validate arguments + AT_ASSERT(device_tensor.is_cuda(), "Input tensor must be a CUDA tensor"); + AT_ASSERT(host_tensor.is_pinned(), "Output tensor must be pinned memory"); + AT_ASSERT(device_tensor.numel() == host_tensor.numel(), "Tensors must have same number of elements"); + AT_ASSERT(device_tensor.dtype() == host_tensor.dtype(), "Tensors must have same dtype"); + + // Type-based dispatch to different implementations + switch (device_tensor.scalar_type()) { + case torch::kFloat: + store_tensor_impl(device_tensor, host_tensor); + break; + case torch::kHalf: + store_tensor_impl(device_tensor, host_tensor); + break; + case torch::kBFloat16: + store_tensor_impl(device_tensor, host_tensor); + break; + default: + AT_ERROR("Unsupported data type: ", 
device_tensor.scalar_type()); + } +} + +void load_tensor(torch::Tensor host_tensor, torch::Tensor device_tensor) { + // Validate arguments + AT_ASSERT(device_tensor.is_cuda(), "Output tensor must be a CUDA tensor"); + AT_ASSERT(host_tensor.is_pinned(), "Input tensor must be pinned memory"); + AT_ASSERT(device_tensor.numel() == host_tensor.numel(), "Tensors must have same number of elements"); + AT_ASSERT(device_tensor.dtype() == host_tensor.dtype(), "Tensors must have same dtype"); + + // Type-based dispatch to different implementations + switch (host_tensor.scalar_type()) { + case torch::kFloat: + load_tensor_impl(host_tensor, device_tensor); + break; + case torch::kHalf: + load_tensor_impl(host_tensor, device_tensor); + break; + case torch::kBFloat16: + load_tensor_impl(host_tensor, device_tensor); + break; + default: + AT_ERROR("Unsupported data type: ", host_tensor.scalar_type()); + } +} + +// PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { +// m.def("store_tensor", &store_tensor, "Store CUDA tensor to pinned memory (supports float32, float16, bfloat16)"); +// m.def("load_tensor", &load_tensor, "Load CUDA tensor from pinned memory (supports float32, float16, bfloat16)"); +// } diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 7ca40a5e7827..47d8b9752062 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -665,4 +665,9 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { custom_ar.def("free_shared_buffer", &free_shared_buffer); } +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _mem_pool), mem_pool) { + mem_pool.def("store_tensor", &store_tensor); + mem_pool.def("load_tensor", &load_tensor); +} + REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 0206d4552c8b..60aadcc1bd48 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1493,6 +1493,14 @@ def free_shared_buffer(ptr: int) -> None: torch.ops._C_custom_ar.free_shared_buffer(ptr) +def store_tensor(device_tensor: torch.Tensor, host_tensor: torch.Tensor): + torch.ops._C_mem_pool.store_tensor(device_tensor, host_tensor) + + +def load_tensor(host_tensor: torch.Tensor, device_tensor: torch.Tensor): + torch.ops._C_mem_pool.load_tensor(host_tensor, device_tensor) + + def get_flash_mla_metadata( cache_seqlens: torch.Tensor, num_heads_per_head_k: int, diff --git a/vllm/distributed/kv_transfer/tensor_memory_pool.py b/vllm/distributed/kv_transfer/tensor_memory_pool.py index 518333fbcb40..685d36f19e99 100644 --- a/vllm/distributed/kv_transfer/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/tensor_memory_pool.py @@ -7,7 +7,7 @@ from typing import Dict, Optional, Tuple import torch -import tensor_store_load_mem as cuda_kernels +from vllm import _custom_ops as ops @dataclass @@ -127,7 +127,7 @@ def store_tensor(self, tensor: torch.Tensor) -> int: raise MemoryError(f"Failed to create tensor view: {e}") with torch.cuda.stream(self.store_stream): - cuda_kernels.store_tensor(tensor, cpu_tensor) + ops.store_tensor(tensor, cpu_tensor) self.store_stream.synchronize() return addr @@ -150,7 +150,7 @@ def load_tensor(self, addr: int, dtype: torch.dtype, shape: Tuple[int, ...], dev cuda_tensor = torch.empty(shape, dtype=dtype, device=device) with torch.cuda.stream(self.load_stream): - cuda_kernels.load_tensor(cpu_tensor, cuda_tensor) + ops.load_tensor(cpu_tensor, cuda_tensor) self.load_stream.synchronize() self.free(addr) From 49387ff1169f904391d72d74203839771bf16a50 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 16 May 2025 
11:37:01 +0800 Subject: [PATCH 067/155] rm v0 Signed-off-by: Abatom --- .../kv_transfer/kv_connector/factory.py | 4 - .../kv_transfer/kv_connector/p2p_connector.py | 202 ------------------ .../kv_connector/v1/p2p_nccl_connector.py | 16 +- .../p2p_nccl_pipe.py => p2p_nccl_transfer.py} | 2 +- 4 files changed, 9 insertions(+), 215 deletions(-) delete mode 100644 vllm/distributed/kv_transfer/kv_connector/p2p_connector.py rename vllm/distributed/kv_transfer/{kv_pipe/p2p_nccl_pipe.py => p2p_nccl_transfer.py} (99%) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 33e80af7a7ce..2b8bda35a8e4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -97,10 +97,6 @@ def create_connector_v1(cls, "vllm.distributed.kv_transfer.kv_connector.mooncake_store_connector", "MooncakeStoreConnector") -KVConnectorFactory.register_connector( - "P2pConnector", "vllm.distributed.kv_transfer.kv_connector.p2p_connector", - "P2pConnector") - KVConnectorFactory.register_connector( "SharedStorageConnector", "vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector", diff --git a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py b/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py deleted file mode 100644 index e59d3227f8ea..000000000000 --- a/vllm/distributed/kv_transfer/kv_connector/p2p_connector.py +++ /dev/null @@ -1,202 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import re -from typing import TYPE_CHECKING, List, Tuple, Union - -import torch - -from vllm.config import VllmConfig -from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase -from vllm.distributed.kv_transfer.kv_connector.utils import ( - model_aware_kv_ops_helper as kv_helper) -from vllm.distributed.kv_transfer.kv_pipe.p2p_nccl_pipe import P2pNcclPipe -from vllm.logger import init_logger -from vllm.sequence import IntermediateTensors - -if TYPE_CHECKING: - from vllm.worker.model_runner import ModelInputForGPUWithSamplingMetadata - -logger = init_logger(__name__) - - -class P2pConnector(KVConnectorBase): - - def __init__( - self, - rank: int, - local_rank: int, - config: VllmConfig, - ): - self.rank = rank - self.config = config.kv_transfer_config - self.kv_helper = kv_helper(config) - - assert self.config.kv_connector == "P2pConnector" - - self.lookup_buffer_size = self.config.kv_buffer_size - - self.p2p_nccl_pipe = P2pNcclPipe( - local_rank=local_rank, - config=self.config, - hostname="", - port_offset=rank, - ) - - def send_kv_caches_and_hidden_states( - self, - model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor], - hidden_or_intermediate_states: Union[torch.Tensor, - IntermediateTensors], - ) -> None: - # input_tokens_tensor = model_input.input_tokens - seq_lens = model_input.attn_metadata.seq_lens - slot_mapping_flat = model_input.attn_metadata.slot_mapping.flatten() - request_ids = list(model_input.request_ids_to_seq_ids.keys()) - start_layer = model_executable.model.start_layer - end_layer = model_executable.model.end_layer - num_heads, head_size = self.kv_helper.get_model_args(model_executable) - - for idx, slen in enumerate(seq_lens): - start_pos = sum(seq_lens[:idx]) - end_pos = start_pos + slen - - # current_tokens = input_tokens_tensor[start_pos:end_pos] - keys, values = [], [] - - for layer_id in range(start_layer, end_layer): - kv_cache = kv_caches[layer_id - start_layer] - 
key_cache, value_cache = self.kv_helper.get_kv_from_cache( - kv_cache, num_heads, head_size) - current_slot_mapping = slot_mapping_flat[start_pos:end_pos] - - keys.append(key_cache[current_slot_mapping].unsqueeze(0)) - values.append(value_cache[current_slot_mapping].unsqueeze(0)) - - keys = torch.cat(keys, dim=0) - values = torch.cat(values, dim=0) - kvcache = torch.stack((keys, values), dim=0) - - request_id = request_ids[idx] - ip, port = self.parse_request_id(request_id, True) - remote_address = ip + ":" + str(port + self.rank) - - self.p2p_nccl_pipe.send_tensor(request_id + "kv", kvcache, - remote_address) - self.p2p_nccl_pipe.send_tensor( - request_id + "hidden", - hidden_or_intermediate_states[start_pos:end_pos], - remote_address) - - logger.debug("[rank%d]: KV send DONE.", torch.distributed.get_rank()) - - def recv_kv_caches_and_hidden_states( - self, model_executable: torch.nn.Module, - model_input: "ModelInputForGPUWithSamplingMetadata", - kv_caches: List[torch.Tensor] - ) -> Tuple[Union[torch.Tensor, IntermediateTensors], bool, - "ModelInputForGPUWithSamplingMetadata"]: - bypass_model_exec = True - input_tokens_tensor = model_input.input_tokens - seq_lens = model_input.attn_metadata.seq_lens - num_prefill_tokens = model_input.attn_metadata.num_prefill_tokens - slot_mapping = model_input.attn_metadata.slot_mapping.flatten() - request_ids = list(model_input.request_ids_to_seq_ids.keys()) - start_layer = model_executable.model.start_layer - end_layer = model_executable.model.end_layer - hidden_or_intermediate_states_for_one_req = [] - - for idx, slen in enumerate(seq_lens): - start_pos = sum(seq_lens[:idx]) - end_pos = start_pos + slen - - if start_pos >= num_prefill_tokens: - # This can happen during inflight batching. See: - # vllm/worker/model_runner.py::_prepare_model_input_tensors: - # - input_tokens[:num_prefill_tokens] contains prefill tokens. - # - input_tokens[num_prefill_tokens:] contains decode tokens. - logger.warning("You should set --enable_chunked_prefill=False " - "and --max_num_batched_tokens " - "should be equal to max_seq_len_to_capture") - bypass_model_exec = False - assert start_pos == num_prefill_tokens - break - - current_tokens = input_tokens_tensor[start_pos:end_pos] - - request_id = request_ids[idx] - ip, port = self.parse_request_id(request_id, False) - remote_address = ip + ":" + str(port + self.rank) - - kvcache = self.p2p_nccl_pipe.recv_tensor(request_id + "kv", - remote_address) - hidden = self.p2p_nccl_pipe.recv_tensor(request_id + "hidden", - remote_address) - - if kvcache is None or hidden is None: - # didn't find any match. - bypass_model_exec = False - continue - - num_computed_tokens = current_tokens.shape[0] - - # update the end position based on how many tokens are cached. 
- end_pos = start_pos + num_computed_tokens - - # call self.kv_store to get kv layer by layer - for layer_id in range(start_layer, end_layer): - layer = model_executable.model.layers[layer_id] - # get kvcache object - kv_cache = kv_caches[layer_id - start_layer] - - # get remote kvcache - remote_k, remote_v = kvcache[0][layer_id], kvcache[1][layer_id] - - self.kv_helper.put_kv_to_cache(model_executable, remote_k, - remote_v, layer, kv_cache, - slot_mapping, start_pos, - end_pos) - - hidden_or_intermediate_states_for_one_req.append(hidden) - - if not bypass_model_exec: - logger.warning( - "[rank%d]: Failed to receive all KVs and hidden " - "states, redo model forwarding.", torch.distributed.get_rank()) - hidden_or_intermediate_states = None - - else: - logger.debug( - "[rank%d]: Successfully received all KVs and hidden " - "states, skip model forwarding.", torch.distributed.get_rank()) - hidden_or_intermediate_states = torch.cat( - hidden_or_intermediate_states_for_one_req, dim=0) - - return hidden_or_intermediate_states, bypass_model_exec, model_input - - @staticmethod - def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: - logger.debug("parse_request_id, request_id: %s, is_prefill: %s", - request_id, is_prefill) - # Regular expression to match the string hostname and integer port - if is_prefill: - pattern = r"___decode_addr_(.*):(\d+)" - else: - pattern = r"___prefill_addr_(.*):(\d+)___" - - # Use re.search to find the pattern in the request_id - match = re.search(pattern, request_id) - if match: - # Extract the ranks - ip = match.group(1) - port = int(match.group(2)) - - logger.debug("parse_request_id, request_id: %s, ip: %s, port: %s", - request_id, ip, str(port)) - return ip, port - raise ValueError( - f"Request id {request_id} does not contain hostname and port") - - def close(self) -> None: - self.p2p_nccl_pipe.close() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py index 7fc7fd372122..3ec1bebf16af 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py @@ -9,7 +9,7 @@ from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) -from vllm.distributed.kv_transfer.kv_pipe.p2p_nccl_pipe import P2pNcclPipe +from vllm.distributed.kv_transfer.p2p_nccl_transfer import P2pNcclTransfer from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput @@ -84,7 +84,7 @@ def __init__(self, self.rank = rank self.is_producer = self.config.is_kv_producer - self.p2p_nccl_pipe = P2pNcclPipe( + self.p2p_nccl_transfer = P2pNcclTransfer( local_rank=local_rank, config=self.config, hostname="", @@ -104,7 +104,7 @@ def start_load_kv(self, forward_context: "ForwardContext", The number of elements in kv_caches and layer_names should be the same. 
""" - assert self.p2p_nccl_pipe is not None + assert self.p2p_nccl_transfer is not None attn_metadata = forward_context.attn_metadata @@ -179,7 +179,7 @@ def inject_kv_into_layer( kv_cache_layer = attn_layer.kv_cache[ \ forward_context.virtual_engine] - kv_cache = self.p2p_nccl_pipe.recv_tensor(request.request_id + + kv_cache = self.p2p_nccl_transfer.recv_tensor(request.request_id + "-" + layer_name) if kv_cache is None: @@ -216,7 +216,7 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, attn_metadata (AttentionMetadata): the attention metadata. **kwargs: additional arguments for the save operation. """ - assert self.p2p_nccl_pipe is not None + assert self.p2p_nccl_transfer is not None def extract_kv_from_layer( layer: torch.Tensor, @@ -244,13 +244,13 @@ def extract_kv_from_layer( remote_address = ip + ":" + str(port + self._rank) kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) - self.p2p_nccl_pipe.send_tensor(request_id + "-" + layer_name, + self.p2p_nccl_transfer.send_tensor(request_id + "-" + layer_name, kv_cache, remote_address) def wait_for_save(self): if self.is_producer: - assert self.p2p_nccl_pipe is not None - self.p2p_nccl_pipe.wait_for_sent() + assert self.p2p_nccl_transfer is not None + self.p2p_nccl_transfer.wait_for_sent() def get_num_new_matched_tokens( self, diff --git a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py b/vllm/distributed/kv_transfer/p2p_nccl_transfer.py similarity index 99% rename from vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py rename to vllm/distributed/kv_transfer/p2p_nccl_transfer.py index 86d94dbf43e0..c158ff4d6833 100644 --- a/vllm/distributed/kv_transfer/kv_pipe/p2p_nccl_pipe.py +++ b/vllm/distributed/kv_transfer/p2p_nccl_transfer.py @@ -21,7 +21,7 @@ logger = logging.getLogger(__name__) -class P2pNcclPipe: +class P2pNcclTransfer: def __init__(self, local_rank: int, From 80e26b9038c9dd47b0e80773e2d69eff97494cc5 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 16 May 2025 12:27:28 +0800 Subject: [PATCH 068/155] rename Signed-off-by: Abatom --- .../kv_transfer/kv_connector/factory.py | 2 +- .../kv_transfer/kv_connector/v1/p2p/__init__.py | 0 .../v1/{ => p2p}/p2p_nccl_connector.py | 16 ++++++++-------- .../v1/p2p/p2p_nccl_engine.py} | 4 ++-- .../v1/p2p}/tensor_memory_pool.py | 0 5 files changed, 11 insertions(+), 11 deletions(-) create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py rename vllm/distributed/kv_transfer/kv_connector/v1/{ => p2p}/p2p_nccl_connector.py (96%) rename vllm/distributed/kv_transfer/{p2p_nccl_transfer.py => kv_connector/v1/p2p/p2p_nccl_engine.py} (99%) rename vllm/distributed/kv_transfer/{ => kv_connector/v1/p2p}/tensor_memory_pool.py (100%) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index d2b68f7b0215..d9441e57376c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -104,7 +104,7 @@ def create_connector_v1(cls, KVConnectorFactory.register_connector( "P2pNcclConnector", - "vllm.distributed.kv_transfer.kv_connector.v1.p2p_nccl_connector", + "vllm.distributed.kv_transfer.kv_connector.v1.p2p.p2p_nccl_connector", "P2pNcclConnector") KVConnectorFactory.register_connector( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git 
a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py similarity index 96% rename from vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py rename to vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 3ec1bebf16af..d3aa5236a66d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -9,7 +9,7 @@ from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) -from vllm.distributed.kv_transfer.p2p_nccl_transfer import P2pNcclTransfer +from vllm.distributed.kv_transfer.kv_connector.v1.p2p.p2p_nccl_engine import P2pNcclEngine from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput @@ -84,7 +84,7 @@ def __init__(self, self.rank = rank self.is_producer = self.config.is_kv_producer - self.p2p_nccl_transfer = P2pNcclTransfer( + self.p2p_nccl_engine = P2pNcclEngine( local_rank=local_rank, config=self.config, hostname="", @@ -104,7 +104,7 @@ def start_load_kv(self, forward_context: "ForwardContext", The number of elements in kv_caches and layer_names should be the same. """ - assert self.p2p_nccl_transfer is not None + assert self.p2p_nccl_engine is not None attn_metadata = forward_context.attn_metadata @@ -179,7 +179,7 @@ def inject_kv_into_layer( kv_cache_layer = attn_layer.kv_cache[ \ forward_context.virtual_engine] - kv_cache = self.p2p_nccl_transfer.recv_tensor(request.request_id + + kv_cache = self.p2p_nccl_engine.recv_tensor(request.request_id + "-" + layer_name) if kv_cache is None: @@ -216,7 +216,7 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, attn_metadata (AttentionMetadata): the attention metadata. **kwargs: additional arguments for the save operation. 
""" - assert self.p2p_nccl_transfer is not None + assert self.p2p_nccl_engine is not None def extract_kv_from_layer( layer: torch.Tensor, @@ -244,13 +244,13 @@ def extract_kv_from_layer( remote_address = ip + ":" + str(port + self._rank) kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) - self.p2p_nccl_transfer.send_tensor(request_id + "-" + layer_name, + self.p2p_nccl_engine.send_tensor(request_id + "-" + layer_name, kv_cache, remote_address) def wait_for_save(self): if self.is_producer: - assert self.p2p_nccl_transfer is not None - self.p2p_nccl_transfer.wait_for_sent() + assert self.p2p_nccl_engine is not None + self.p2p_nccl_engine.wait_for_sent() def get_num_new_matched_tokens( self, diff --git a/vllm/distributed/kv_transfer/p2p_nccl_transfer.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py similarity index 99% rename from vllm/distributed/kv_transfer/p2p_nccl_transfer.py rename to vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index c158ff4d6833..47be9f5a4cb1 100644 --- a/vllm/distributed/kv_transfer/p2p_nccl_transfer.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -14,14 +14,14 @@ from vllm.config import KVTransferConfig from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum) -from vllm.distributed.kv_transfer.tensor_memory_pool import ( +from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import ( TensorMemoryPool) from vllm.utils import current_stream, get_ip logger = logging.getLogger(__name__) -class P2pNcclTransfer: +class P2pNcclEngine: def __init__(self, local_rank: int, diff --git a/vllm/distributed/kv_transfer/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py similarity index 100% rename from vllm/distributed/kv_transfer/tensor_memory_pool.py rename to vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py From a9c56747120978dd472d90d657bce5734804a79f Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 16 May 2025 15:00:30 +0800 Subject: [PATCH 069/155] format Signed-off-by: Abatom --- csrc/tensor_store_load_mem.cu | 126 +++++++++--------- .../disagg_xpyd/disagg_prefill_proxy_xpyd.py | 7 +- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 23 ++-- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 11 +- .../kv_connector/v1/p2p/tensor_memory_pool.py | 36 +++-- 5 files changed, 114 insertions(+), 89 deletions(-) diff --git a/csrc/tensor_store_load_mem.cu b/csrc/tensor_store_load_mem.cu index 7cdfd54cb3d1..5cf18adc8921 100644 --- a/csrc/tensor_store_load_mem.cu +++ b/csrc/tensor_store_load_mem.cu @@ -3,95 +3,97 @@ // Template-based CUDA kernel: Copy from device memory to pinned host memory template -__global__ void store_kernel(const scalar_t* device_ptr, scalar_t* host_ptr, size_t num_elements) { - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_elements) { - host_ptr[idx] = device_ptr[idx]; - } +__global__ void store_kernel(const scalar_t* device_ptr, scalar_t* host_ptr, + size_t num_elements) { + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_elements) { + host_ptr[idx] = device_ptr[idx]; + } } // Templated CUDA kernel: Copy from pinned host memory to device memory template -__global__ void load_kernel(const scalar_t* host_ptr, scalar_t* device_ptr, size_t num_elements) { - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_elements) { - device_ptr[idx] = 
host_ptr[idx]; - } +__global__ void load_kernel(const scalar_t* host_ptr, scalar_t* device_ptr, + size_t num_elements) { + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_elements) { + device_ptr[idx] = host_ptr[idx]; + } } // Templated wrapper function: Store Tensor to pinned memory template void store_tensor_impl(torch::Tensor device_tensor, torch::Tensor host_tensor) { - const auto num_elements = device_tensor.numel(); - const int threads = 256; - const int blocks = (num_elements + threads - 1) / threads; + const auto num_elements = device_tensor.numel(); + const int threads = 256; + const int blocks = (num_elements + threads - 1) / threads; - auto device_ptr = device_tensor.data_ptr(); - auto host_ptr = host_tensor.data_ptr(); + auto device_ptr = device_tensor.data_ptr(); + auto host_ptr = host_tensor.data_ptr(); - store_kernel<<>>( - device_ptr, host_ptr, num_elements); + store_kernel<<>>( + device_ptr, host_ptr, num_elements); } // Templated wrapper function: Load Tensor from pinned memory template void load_tensor_impl(torch::Tensor host_tensor, torch::Tensor device_tensor) { - const auto num_elements = host_tensor.numel(); - const int threads = 256; - const int blocks = (num_elements + threads - 1) / threads; + const auto num_elements = host_tensor.numel(); + const int threads = 256; + const int blocks = (num_elements + threads - 1) / threads; - auto host_ptr = host_tensor.data_ptr(); - auto device_ptr = device_tensor.data_ptr(); + auto host_ptr = host_tensor.data_ptr(); + auto device_ptr = device_tensor.data_ptr(); - load_kernel<<>>( - host_ptr, device_ptr, num_elements); + load_kernel<<>>( + host_ptr, device_ptr, num_elements); } // Type-dispatched wrapper function void store_tensor(torch::Tensor device_tensor, torch::Tensor host_tensor) { - // Validate arguments - AT_ASSERT(device_tensor.is_cuda(), "Input tensor must be a CUDA tensor"); - AT_ASSERT(host_tensor.is_pinned(), "Output tensor must be pinned memory"); - AT_ASSERT(device_tensor.numel() == host_tensor.numel(), "Tensors must have same number of elements"); - AT_ASSERT(device_tensor.dtype() == host_tensor.dtype(), "Tensors must have same dtype"); + // Validate arguments + AT_ASSERT(device_tensor.is_cuda(), "Input tensor must be a CUDA tensor"); + AT_ASSERT(host_tensor.is_pinned(), "Output tensor must be pinned memory"); + AT_ASSERT(device_tensor.numel() == host_tensor.numel(), "Tensors must have same number of elements"); + AT_ASSERT(device_tensor.dtype() == host_tensor.dtype(), "Tensors must have same dtype"); - // Type-based dispatch to different implementations - switch (device_tensor.scalar_type()) { - case torch::kFloat: - store_tensor_impl(device_tensor, host_tensor); - break; - case torch::kHalf: - store_tensor_impl(device_tensor, host_tensor); - break; - case torch::kBFloat16: - store_tensor_impl(device_tensor, host_tensor); - break; - default: - AT_ERROR("Unsupported data type: ", device_tensor.scalar_type()); - } + // Type-based dispatch to different implementations + switch (device_tensor.scalar_type()) { + case torch::kFloat: + store_tensor_impl(device_tensor, host_tensor); + break; + case torch::kHalf: + store_tensor_impl(device_tensor, host_tensor); + break; + case torch::kBFloat16: + store_tensor_impl(device_tensor, host_tensor); + break; + default: + AT_ERROR("Unsupported data type: ", device_tensor.scalar_type()); + } } void load_tensor(torch::Tensor host_tensor, torch::Tensor device_tensor) { - // Validate arguments - AT_ASSERT(device_tensor.is_cuda(), "Output tensor must be a 
CUDA tensor"); - AT_ASSERT(host_tensor.is_pinned(), "Input tensor must be pinned memory"); - AT_ASSERT(device_tensor.numel() == host_tensor.numel(), "Tensors must have same number of elements"); - AT_ASSERT(device_tensor.dtype() == host_tensor.dtype(), "Tensors must have same dtype"); + // Validate arguments + AT_ASSERT(device_tensor.is_cuda(), "Output tensor must be a CUDA tensor"); + AT_ASSERT(host_tensor.is_pinned(), "Input tensor must be pinned memory"); + AT_ASSERT(device_tensor.numel() == host_tensor.numel(), "Tensors must have same number of elements"); + AT_ASSERT(device_tensor.dtype() == host_tensor.dtype(), "Tensors must have same dtype"); - // Type-based dispatch to different implementations - switch (host_tensor.scalar_type()) { - case torch::kFloat: - load_tensor_impl(host_tensor, device_tensor); - break; - case torch::kHalf: - load_tensor_impl(host_tensor, device_tensor); - break; - case torch::kBFloat16: - load_tensor_impl(host_tensor, device_tensor); - break; - default: - AT_ERROR("Unsupported data type: ", host_tensor.scalar_type()); - } + // Type-based dispatch to different implementations + switch (host_tensor.scalar_type()) { + case torch::kFloat: + load_tensor_impl(host_tensor, device_tensor); + break; + case torch::kHalf: + load_tensor_impl(host_tensor, device_tensor); + break; + case torch::kBFloat16: + load_tensor_impl(host_tensor, device_tensor); + break; + default: + AT_ERROR("Unsupported data type: ", host_tensor.scalar_type()); + } } // PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index 448ba6064625..210dd546c898 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import os -import random import socket import threading import uuid @@ -110,7 +109,8 @@ async def handle_request(): # prefill_addr, prefill_zmq_addr = random.choice( # list(prefill_instances.items())) prefill_list = list(prefill_instances.items()) - prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)] + prefill_addr, prefill_zmq_addr = prefill_list[count % + len(prefill_list)] global decode_instances global decode_cv @@ -118,7 +118,8 @@ async def handle_request(): # decode_addr, decode_zmq_addr = random.choice( # list(decode_instances.items())) decode_list = list(decode_instances.items()) - decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)] + decode_addr, decode_zmq_addr = decode_list[count % + len(decode_list)] print(f"handle_request count: {count}, [HTTP:{prefill_addr}, " f"ZMQ:{prefill_zmq_addr}] ๐Ÿ‘‰ [HTTP:{decode_addr}, " diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index d3aa5236a66d..080b3bde42b7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -9,7 +9,8 @@ from vllm.config import VllmConfig from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) -from vllm.distributed.kv_transfer.kv_connector.v1.p2p.p2p_nccl_engine import P2pNcclEngine +from vllm.distributed.kv_transfer.kv_connector.v1.p2p.p2p_nccl_engine import ( + P2pNcclEngine) from vllm.logger import init_logger from 
vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput @@ -135,8 +136,11 @@ def inject_kv_into_layer( if len(slot_mapping) == num_token: dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache else: - dst_kv_cache_layer[slot_mapping[:num_token], ...] = src_kv_cache - logger.warning("๐Ÿšงsrc_kv_cache does not match, num_slot:%d, num_token:%d", len(slot_mapping), num_token) + dst_kv_cache_layer[slot_mapping[:num_token], + ...] = src_kv_cache + logger.warning( + "๐Ÿšงsrc_kv_cache does not match, num_slot:%d, num_token:%d", + len(slot_mapping), num_token) dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) else: @@ -148,8 +152,11 @@ def inject_kv_into_layer( if len(slot_mapping) == num_token: dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache else: - dst_kv_cache_layer[:, slot_mapping[:num_token],...] = src_kv_cache - logger.warning("๐Ÿšงsrc_kv_cache does not match, num_slot:%d, num_token:%d", len(slot_mapping), num_token) + dst_kv_cache_layer[:, slot_mapping[:num_token], + ...] = src_kv_cache + logger.warning( + "๐Ÿšงsrc_kv_cache does not match, num_slot:%d, num_token:%d", + len(slot_mapping), num_token) dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) @@ -179,8 +186,8 @@ def inject_kv_into_layer( kv_cache_layer = attn_layer.kv_cache[ \ forward_context.virtual_engine] - kv_cache = self.p2p_nccl_engine.recv_tensor(request.request_id + - "-" + layer_name) + kv_cache = self.p2p_nccl_engine.recv_tensor( + request.request_id + "-" + layer_name) if kv_cache is None: logger.warning("๐Ÿšงsrc_kv_cache is None, %s", @@ -245,7 +252,7 @@ def extract_kv_from_layer( kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) self.p2p_nccl_engine.send_tensor(request_id + "-" + layer_name, - kv_cache, remote_address) + kv_cache, remote_address) def wait_for_save(self): if self.is_producer: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 47be9f5a4cb1..6358f781d203 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -75,7 +75,8 @@ def __init__(self, self.recv_stream = torch.cuda.Stream() mem_pool_size = self.config.get_from_extra_config("mem_pool_size", 128) - self.pool = TensorMemoryPool(max_block_size=mem_pool_size * 1024**3) # GB + self.pool = TensorMemoryPool(max_block_size=mem_pool_size * + 1024**3) # GB # The sending type includes tree mutually exclusive options: # PUT, GET, PUT_ASYNC. 
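Note on the knobs read above: the engine sizes its pinned-memory pool from the "mem_pool_size" extra-config key (in GB) and picks the transfer mode from "send_type". A minimal configuration sketch follows; the connector registration name and exact field spellings are assumptions inferred from this series, not a verified CLI.

    # Sketch only -- field names follow the config reads in this series
    # (kv_buffer_size, kv_port, get_from_extra_config) and are unverified.
    kv_transfer_config = {
        "kv_connector": "P2pNcclConnector",  # assumed registration name
        "kv_role": "kv_producer",            # "kv_consumer" on the decode side
        "kv_buffer_size": 8e9,               # send-buffer threshold, in bytes
        "kv_port": 21001,                    # base port; each rank adds port_offset
        "kv_connector_extra_config": {
            "send_type": "PUT_ASYNC",        # one of PUT, GET, PUT_ASYNC
            "mem_pool_size": 128,            # pinned host pool size, in GB
        },
    }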
@@ -210,8 +211,8 @@ def recv_tensor( "๐Ÿ”ต[PUT]Recv From %s, tensor_id:%s, shape:%s, " "duration:%.3fms, size:%.3fGB, addr:%d, rank:%d", remote_address, tensor_id, tensor.shape, duration * 1000, - tensor.element_size() * tensor.numel() / 1024**3, - addr, self.rank) + tensor.element_size() * tensor.numel() / 1024**3, addr, + self.rank) else: duration = time.time() - start_time logger.warning( @@ -441,7 +442,7 @@ def _send(self, comm, tensor: torch.Tensor, dst: int, stream=None): logger.info( "๐Ÿ•Nccl Send Tensor, shape:%s, duration:%.3fms, size:%.3fGB, " "rank:%d", tensor.shape, duration * 1000, - tensor.element_size() * tensor.numel() / 1024 ** 3, dst) + tensor.element_size() * tensor.numel() / 1024**3, dst) def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): assert tensor.device == self.device, ( @@ -460,7 +461,7 @@ def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): logger.info( "๐Ÿ•Nccl Recv Tensor, shape:%s, duration:%.3fms, size:%.3fGB, " "rank:%d", tensor.shape, duration * 1000, - tensor.element_size() * tensor.numel() / 1024 ** 3, src) + tensor.element_size() * tensor.numel() / 1024**3, src) def close(self) -> None: self._listener_thread.join() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index 685d36f19e99..0eca2db7cf45 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -4,9 +4,10 @@ import ctypes import math from dataclasses import dataclass -from typing import Dict, Optional, Tuple +from typing import Dict, Tuple import torch + from vllm import _custom_ops as ops @@ -21,7 +22,8 @@ def __init__(self, max_block_size: int, min_block_size: int = 512): if max_block_size <= 0 or min_block_size <= 0: raise ValueError("Block sizes must be positive") if max_block_size < min_block_size: - raise ValueError("Max block size must be greater than min block size") + raise ValueError( + "Max block size must be greater than min block size") self.max_block_size = self._round_to_power_of_two(max_block_size) self.min_block_size = self._round_to_power_of_two(min_block_size) @@ -47,17 +49,23 @@ def _initialize_free_lists(self): size //= 2 def _allocate_pinned_memory(self): - self.base_tensor = torch.empty(self.max_block_size // 4, dtype=torch.float32, pin_memory=True) + self.base_tensor = torch.empty(self.max_block_size // 4, + dtype=torch.float32, + pin_memory=True) self.base_address = self.base_tensor.data_ptr() - initial_block = MemoryBlock(size=self.max_block_size, addr=self.base_address) - self.free_lists[self.max_block_size][initial_block.addr] = initial_block - print("TensorMemoryPool, base_address:", self.base_address, self.base_address % self.max_block_size) + initial_block = MemoryBlock(size=self.max_block_size, + addr=self.base_address) + self.free_lists[self.max_block_size][ + initial_block.addr] = initial_block + print("TensorMemoryPool, base_address:", self.base_address, + self.base_address % self.max_block_size) def allocate(self, size: int) -> int: if size <= 0: raise ValueError("Allocation size must be positive") - required_size = self._round_to_power_of_two(max(size, self.min_block_size)) + required_size = self._round_to_power_of_two( + max(size, self.min_block_size)) if required_size > self.max_block_size: raise MemoryError("Requested size exceeds maximum block size") @@ -94,7 +102,8 @@ def _merge_buddies(self, block: 
MemoryBlock): depth = 0 while depth < MAX_MERGE_DEPTH: - buddy_offset = block.size if (block.addr - self.base_address) % (2 * block.size) == 0 else -block.size + buddy_offset = block.size if (block.addr - self.base_address) % ( + 2 * block.size) == 0 else -block.size buddy_addr = block.addr + buddy_offset buddy = self.free_lists[block.size].get(buddy_addr) if buddy: @@ -117,11 +126,15 @@ def store_tensor(self, tensor: torch.Tensor) -> int: if block.size < size: self.free(addr) - raise MemoryError(f"Allocated block size {block.size} is smaller than required size {size}") + raise MemoryError( + f"Allocated block size {block.size} is smaller than required size {size}" + ) try: buffer = (ctypes.c_byte * block.size).from_address(block.addr) - cpu_tensor = torch.frombuffer(buffer, dtype=tensor.dtype, count=tensor.numel()) + cpu_tensor = torch.frombuffer(buffer, + dtype=tensor.dtype, + count=tensor.numel()) except ValueError as e: self.free(addr) raise MemoryError(f"Failed to create tensor view: {e}") @@ -132,7 +145,8 @@ def store_tensor(self, tensor: torch.Tensor) -> int: return addr - def load_tensor(self, addr: int, dtype: torch.dtype, shape: Tuple[int, ...], device) -> torch.Tensor: + def load_tensor(self, addr: int, dtype: torch.dtype, + shape: Tuple[int, ...], device) -> torch.Tensor: if addr not in self.allocated_blocks: raise ValueError("Invalid address to load") From 3c67c9e61c8edd1a7d6da1b86075b0d08afd2810 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 16 May 2025 15:40:02 +0800 Subject: [PATCH 070/155] bugfix Signed-off-by: Abatom --- csrc/tensor_store_load_mem.cu | 22 ++++++++++++------- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 8 +++---- .../kv_connector/v1/p2p/tensor_memory_pool.py | 2 +- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/csrc/tensor_store_load_mem.cu b/csrc/tensor_store_load_mem.cu index 5cf18adc8921..b34bc3dcd037 100644 --- a/csrc/tensor_store_load_mem.cu +++ b/csrc/tensor_store_load_mem.cu @@ -31,8 +31,9 @@ void store_tensor_impl(torch::Tensor device_tensor, torch::Tensor host_tensor) { auto device_ptr = device_tensor.data_ptr(); auto host_ptr = host_tensor.data_ptr(); - store_kernel<<>>( - device_ptr, host_ptr, num_elements); + store_kernel + <<>>( + device_ptr, host_ptr, num_elements); } // Templated wrapper function: Load Tensor from pinned memory @@ -45,8 +46,9 @@ void load_tensor_impl(torch::Tensor host_tensor, torch::Tensor device_tensor) { auto host_ptr = host_tensor.data_ptr(); auto device_ptr = device_tensor.data_ptr(); - load_kernel<<>>( - host_ptr, device_ptr, num_elements); + load_kernel + <<>>( + host_ptr, device_ptr, num_elements); } // Type-dispatched wrapper function @@ -54,8 +56,10 @@ void store_tensor(torch::Tensor device_tensor, torch::Tensor host_tensor) { // Validate arguments AT_ASSERT(device_tensor.is_cuda(), "Input tensor must be a CUDA tensor"); AT_ASSERT(host_tensor.is_pinned(), "Output tensor must be pinned memory"); - AT_ASSERT(device_tensor.numel() == host_tensor.numel(), "Tensors must have same number of elements"); - AT_ASSERT(device_tensor.dtype() == host_tensor.dtype(), "Tensors must have same dtype"); + AT_ASSERT(device_tensor.numel() == host_tensor.numel(), + "Tensors must have same number of elements"); + AT_ASSERT(device_tensor.dtype() == host_tensor.dtype(), + "Tensors must have same dtype"); // Type-based dispatch to different implementations switch (device_tensor.scalar_type()) { @@ -77,8 +81,10 @@ void load_tensor(torch::Tensor host_tensor, torch::Tensor device_tensor) { // Validate arguments 
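For orientation, a rough round trip through the buddy-allocated pinned pool above might look like the sketch below. It is illustrative only: it assumes the TensorMemoryPool methods shown in this series (store_tensor returning an address, load_tensor taking addr/dtype/shape/device, an explicit free) and a build with the store_tensor/load_tensor CUDA ops compiled in.

    # Illustrative sketch, not part of the patch.
    import torch
    from vllm.tensor_memory_pool import TensorMemoryPool  # path after the later move in this series

    pool = TensorMemoryPool(max_block_size=1 * 1024**3)    # 1 GB of pinned host memory

    kv = torch.randn(2, 16, 8, 128, dtype=torch.float16, device="cuda")
    addr = pool.store_tensor(kv)   # D2H copy into a pinned block, returns its address
    restored = pool.load_tensor(addr, kv.dtype, tuple(kv.shape), kv.device)
    pool.free(addr)                # assumption: the caller releases the block explicitly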
AT_ASSERT(device_tensor.is_cuda(), "Output tensor must be a CUDA tensor"); AT_ASSERT(host_tensor.is_pinned(), "Input tensor must be pinned memory"); - AT_ASSERT(device_tensor.numel() == host_tensor.numel(), "Tensors must have same number of elements"); - AT_ASSERT(device_tensor.dtype() == host_tensor.dtype(), "Tensors must have same dtype"); + AT_ASSERT(device_tensor.numel() == host_tensor.numel(), + "Tensors must have same number of elements"); + AT_ASSERT(device_tensor.dtype() == host_tensor.dtype(), + "Tensors must have same dtype"); // Type-based dispatch to different implementations switch (host_tensor.scalar_type()) { diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 080b3bde42b7..d61210d568ed 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -252,7 +252,7 @@ def extract_kv_from_layer( kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) self.p2p_nccl_engine.send_tensor(request_id + "-" + layer_name, - kv_cache, remote_address) + kv_cache, remote_address) def wait_for_save(self): if self.is_producer: @@ -263,7 +263,7 @@ def get_num_new_matched_tokens( self, request: "Request", num_computed_tokens: int, - ) -> int: + ) -> tuple[int, bool]: """ Get number of new tokens that can be loaded from the external KV cache beyond the num_computed_tokens. @@ -278,7 +278,7 @@ def get_num_new_matched_tokens( external KV cache beyond what is already computed. """ if self.is_producer: - return 0 + return 0, False num_external_tokens = (len(request.prompt_token_ids) - 1 - num_computed_tokens) @@ -287,7 +287,7 @@ def get_num_new_matched_tokens( "num_computed_tokens:%d", num_external_tokens, len(request.prompt_token_ids), num_computed_tokens) - return num_external_tokens + return num_external_tokens, True def update_state_after_alloc(self, request: "Request", num_external_tokens: int): diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index 0eca2db7cf45..e246e00cc651 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -103,7 +103,7 @@ def _merge_buddies(self, block: MemoryBlock): while depth < MAX_MERGE_DEPTH: buddy_offset = block.size if (block.addr - self.base_address) % ( - 2 * block.size) == 0 else -block.size + 2 * block.size) == 0 else -block.size buddy_addr = block.addr + buddy_offset buddy = self.free_lists[block.size].get(buddy_addr) if buddy: From b6b52d2a94e2c872d3a2da9c6a73c68bf1a45560 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 16 May 2025 15:55:52 +0800 Subject: [PATCH 071/155] format Signed-off-by: Abatom --- csrc/tensor_store_load_mem.cu | 16 ++++++++++------ .../kv_connector/v1/p2p/p2p_nccl_connector.py | 4 +--- .../kv_connector/v1/p2p/tensor_memory_pool.py | 1 + 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/csrc/tensor_store_load_mem.cu b/csrc/tensor_store_load_mem.cu index b34bc3dcd037..bb7672623360 100644 --- a/csrc/tensor_store_load_mem.cu +++ b/csrc/tensor_store_load_mem.cu @@ -32,8 +32,8 @@ void store_tensor_impl(torch::Tensor device_tensor, torch::Tensor host_tensor) { auto host_ptr = host_tensor.data_ptr(); store_kernel - <<>>( - device_ptr, host_ptr, num_elements); + <<>>( + device_ptr, host_ptr, 
num_elements); } // Templated wrapper function: Load Tensor from pinned memory @@ -47,8 +47,8 @@ void load_tensor_impl(torch::Tensor host_tensor, torch::Tensor device_tensor) { auto device_ptr = device_tensor.data_ptr(); load_kernel - <<>>( - host_ptr, device_ptr, num_elements); + <<>>( + host_ptr, device_ptr, num_elements); } // Type-dispatched wrapper function @@ -103,6 +103,10 @@ void load_tensor(torch::Tensor host_tensor, torch::Tensor device_tensor) { } // PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { -// m.def("store_tensor", &store_tensor, "Store CUDA tensor to pinned memory (supports float32, float16, bfloat16)"); -// m.def("load_tensor", &load_tensor, "Load CUDA tensor from pinned memory (supports float32, float16, bfloat16)"); +// m.def("store_tensor", &store_tensor, +// "Store CUDA tensor to pinned memory +// (supports float32, float16, bfloat16)"); +// m.def("load_tensor", &load_tensor, +// "Load CUDA tensor from pinned memory +// (supports float32, float16, bfloat16)"); // } diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index d61210d568ed..8f9f7557c3d0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -290,12 +290,10 @@ def get_num_new_matched_tokens( return num_external_tokens, True def update_state_after_alloc(self, request: "Request", + blocks: "KVCacheBlocks", num_external_tokens: int): """ Update KVConnector state after block allocation. - - If blocks were allocated, add to _requests_need_load, - such that we load the KVs in the next forward pass. """ if not self.is_producer and num_external_tokens > 0: self._requests_need_load[request.request_id] = request diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index e246e00cc651..ac81b4ee1342 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -18,6 +18,7 @@ class MemoryBlock: class TensorMemoryPool: + def __init__(self, max_block_size: int, min_block_size: int = 512): if max_block_size <= 0 or min_block_size <= 0: raise ValueError("Block sizes must be positive") From 540ab4e116504c6a896c469c4d3f75b17882672a Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 16 May 2025 16:04:35 +0800 Subject: [PATCH 072/155] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 8f9f7557c3d0..28de06d5c18e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -18,6 +18,7 @@ if TYPE_CHECKING: from vllm.attention.backends.abstract import AttentionMetadata from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks from vllm.v1.request import Request logger = init_logger(__name__) From a0b23a6a4886412c6ebd28734a3a69af5ce7f157 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 16 May 2025 17:56:00 +0800 Subject: [PATCH 073/155] format Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 12 ++++++------ 
.../kv_connector/v1/p2p/p2p_nccl_engine.py | 17 ++++++++--------- .../v1/p2p => }/tensor_memory_pool.py | 17 +++++++++-------- 3 files changed, 23 insertions(+), 23 deletions(-) rename vllm/{distributed/kv_transfer/kv_connector/v1/p2p => }/tensor_memory_pool.py (93%) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 28de06d5c18e..d8fac7d50991 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -2,7 +2,7 @@ import re from dataclasses import dataclass -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING import torch @@ -140,8 +140,8 @@ def inject_kv_into_layer( dst_kv_cache_layer[slot_mapping[:num_token], ...] = src_kv_cache logger.warning( - "๐Ÿšงsrc_kv_cache does not match, num_slot:%d, num_token:%d", - len(slot_mapping), num_token) + "๐Ÿšงsrc_kv_cache does not match, num_slot:%d, " + "num_token:%d", len(slot_mapping), num_token) dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) else: @@ -156,8 +156,8 @@ def inject_kv_into_layer( dst_kv_cache_layer[:, slot_mapping[:num_token], ...] = src_kv_cache logger.warning( - "๐Ÿšงsrc_kv_cache does not match, num_slot:%d, num_token:%d", - len(slot_mapping), num_token) + "๐Ÿšงsrc_kv_cache does not match, num_slot:%d, " + "num_token:%d", len(slot_mapping), num_token) dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) @@ -357,7 +357,7 @@ def build_connector_meta( return meta @staticmethod - def parse_request_id(request_id: str, is_prefill=True) -> Tuple[str, int]: + def parse_request_id(request_id: str, is_prefill=True) -> tuple[str, int]: logger.debug("parse_request_id, request_id: %s, is_prefill: %s", request_id, is_prefill) # Regular expression to match the string hostname and integer port diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 6358f781d203..1cacd6667a6a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -5,7 +5,7 @@ import time import typing from collections import deque -from typing import Any, Deque, Dict, List, Optional +from typing import Any, Optional import msgpack import torch @@ -14,8 +14,7 @@ from vllm.config import KVTransferConfig from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum) -from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import ( - TensorMemoryPool) +from vllm.tensor_memory_pool import TensorMemoryPool from vllm.utils import current_stream, get_ip logger = logging.getLogger(__name__) @@ -82,21 +81,21 @@ def __init__(self, # PUT, GET, PUT_ASYNC. 
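For context on the hunk that follows: send_store holds outgoing tensors in GET mode (apparently until the peer pulls them), while PUT and PUT_ASYNC push them; PUT_ASYNC drains send_queue from a daemon thread. A generic queue-plus-worker sketch, purely illustrative and not the engine's actual NCCL send path:

    # Minimal producer/worker sketch mirroring the PUT_ASYNC structures below;
    # the real engine sends each tensor over a per-peer NCCL communicator.
    import threading
    from collections import deque

    send_queue: deque = deque()
    cv = threading.Condition()

    def _send_async():
        while True:
            with cv:
                while not send_queue:
                    cv.wait()
                tensor_id, remote_address, tensor = send_queue.popleft()
            # real engine: look up or create the comm for remote_address,
            # NCCL-send `tensor`, then update the in-flight buffer accounting

    threading.Thread(target=_send_async, daemon=True).start()

    def send_tensor_put_async(tensor_id, tensor, remote_address):
        with cv:
            send_queue.append((tensor_id, remote_address, tensor))
            cv.notify()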
self.send_type = self.config.get_from_extra_config("send_type", "PUT") if self.send_type == "GET": - self.send_store: Dict[str, + self.send_store: dict[str, torch.Tensor] = {} # tensor_id: torch.Tensor else: # PUT or PUT_ASYNC - self.send_queue: Deque[ - List[Any]] = deque() # tensor_id: torch.Tensor + # tensor_id: torch.Tensor + self.send_queue: deque[list[Any]] = deque() if self.send_type == "PUT_ASYNC": self._send_thread = threading.Thread(target=self._send_async, daemon=True) self._send_thread.start() # tensor_id: torch.Tensor/(addr, dtype, shape) - self.recv_store: Dict[str, Any] = {} - self.socks: Dict[str, Any] = {} # remote_address: client socket - self.comms: Dict[str, Any] = {} # remote_address: (ncclComm_t, rank) + self.recv_store: dict[str, Any] = {} + self.socks: dict[str, Any] = {} # remote_address: client socket + self.comms: dict[str, Any] = {} # remote_address: (ncclComm_t, rank) self.buffer_size = 0 self.buffer_size_threshold = self.config.kv_buffer_size diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/tensor_memory_pool.py similarity index 93% rename from vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py rename to vllm/tensor_memory_pool.py index ac81b4ee1342..7b19e1d3c99e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/tensor_memory_pool.py @@ -4,7 +4,6 @@ import ctypes import math from dataclasses import dataclass -from typing import Dict, Tuple import torch @@ -29,8 +28,8 @@ def __init__(self, max_block_size: int, min_block_size: int = 512): self.max_block_size = self._round_to_power_of_two(max_block_size) self.min_block_size = self._round_to_power_of_two(min_block_size) - self.free_lists: Dict[int, Dict[int, MemoryBlock]] = {} - self.allocated_blocks: Dict[int, MemoryBlock] = {} + self.free_lists: dict[int, dict[int, MemoryBlock]] = {} + self.allocated_blocks: dict[int, MemoryBlock] = {} self._initialize_free_lists() self._allocate_pinned_memory() @@ -82,7 +81,8 @@ def allocate(self, size: int) -> int: raise MemoryError("Insufficient memory") def _split_block(self, block: MemoryBlock, required_size: int): - while block.size > required_size and block.size // 2 >= self.min_block_size: + while (block.size > required_size and + block.size // 2 >= self.min_block_size): buddy_size = block.size // 2 buddy_addr = block.addr + buddy_size @@ -128,7 +128,8 @@ def store_tensor(self, tensor: torch.Tensor) -> int: if block.size < size: self.free(addr) raise MemoryError( - f"Allocated block size {block.size} is smaller than required size {size}" + f"Allocated block size {block.size} is smaller than " + f"required size {size}" ) try: @@ -136,9 +137,9 @@ def store_tensor(self, tensor: torch.Tensor) -> int: cpu_tensor = torch.frombuffer(buffer, dtype=tensor.dtype, count=tensor.numel()) - except ValueError as e: + except ValueError as err: self.free(addr) - raise MemoryError(f"Failed to create tensor view: {e}") + raise MemoryError(f"Failed to create tensor view: {err}") from err with torch.cuda.stream(self.store_stream): ops.store_tensor(tensor, cpu_tensor) @@ -147,7 +148,7 @@ def store_tensor(self, tensor: torch.Tensor) -> int: return addr def load_tensor(self, addr: int, dtype: torch.dtype, - shape: Tuple[int, ...], device) -> torch.Tensor: + shape: tuple[int, ...], device) -> torch.Tensor: if addr not in self.allocated_blocks: raise ValueError("Invalid address to load") From 92c2ab8377b3911c120cfeb840465c2bd0e37401 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 16 May 
2025 18:07:10 +0800 Subject: [PATCH 074/155] format Signed-off-by: Abatom --- vllm/tensor_memory_pool.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/tensor_memory_pool.py b/vllm/tensor_memory_pool.py index 7b19e1d3c99e..510592fb7eed 100644 --- a/vllm/tensor_memory_pool.py +++ b/vllm/tensor_memory_pool.py @@ -81,8 +81,8 @@ def allocate(self, size: int) -> int: raise MemoryError("Insufficient memory") def _split_block(self, block: MemoryBlock, required_size: int): - while (block.size > required_size and - block.size // 2 >= self.min_block_size): + while (block.size > required_size + and block.size // 2 >= self.min_block_size): buddy_size = block.size // 2 buddy_addr = block.addr + buddy_size @@ -129,8 +129,7 @@ def store_tensor(self, tensor: torch.Tensor) -> int: self.free(addr) raise MemoryError( f"Allocated block size {block.size} is smaller than " - f"required size {size}" - ) + f"required size {size}") try: buffer = (ctypes.c_byte * block.size).from_address(block.addr) From ccba1d6f37955df20f998298a223ff1fa7fe2e9c Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 20 May 2025 20:47:38 +0800 Subject: [PATCH 075/155] get_num_new_matched_tokens Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index d8fac7d50991..234d76dd9302 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -288,7 +288,7 @@ def get_num_new_matched_tokens( "num_computed_tokens:%d", num_external_tokens, len(request.prompt_token_ids), num_computed_tokens) - return num_external_tokens, True + return num_external_tokens, False def update_state_after_alloc(self, request: "Request", blocks: "KVCacheBlocks", From 2840981d25864601b073f649253ccc64da5f22f1 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 13:31:57 +0800 Subject: [PATCH 076/155] bugfix for undefined symbol Signed-off-by: Abatom --- csrc/torch_bindings.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 40e3ab6341f5..c115478ad47a 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -704,8 +704,10 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _mem_pool), mem_pool) { - mem_pool.def("store_tensor", &store_tensor); - mem_pool.def("load_tensor", &load_tensor); + mem_pool.def("store_tensor(Tensor device_tensor, Tensor host_tensor) -> ()"); + mem_pool.impl("store_tensor", torch::kCUDA, &store_tensor); + mem_pool.def("load_tensor(Tensor host_tensor, Tensor device_tensor) -> ()"); + mem_pool.impl("load_tensor", torch::kCUDA, &load_tensor); } REGISTER_EXTENSION(TORCH_EXTENSION_NAME) From db544da9f9b236dd3c9461304fee44f7f11a7807 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 14:40:27 +0800 Subject: [PATCH 077/155] store_tensor and load_tensor Signed-off-by: Abatom --- csrc/torch_bindings.cpp | 14 +++++++------- vllm/_custom_ops.py | 16 ++++++++-------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index c115478ad47a..4d184e2c5647 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -602,6 +602,13 @@ 
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) -> Tensor"); // conditionally compiled so impl in source file #endif + + // Store Tensor to pinned memory + ops.def("store_tensor(Tensor device_tensor, Tensor host_tensor) -> ()"); + ops.impl("store_tensor", torch::kCUDA, &store_tensor); + // Load Tensor from pinned memory + ops.def("load_tensor(Tensor host_tensor, Tensor device_tensor) -> ()"); + ops.impl("load_tensor", torch::kCUDA, &load_tensor); } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { @@ -703,11 +710,4 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { custom_ar.def("free_shared_buffer", &free_shared_buffer); } -TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _mem_pool), mem_pool) { - mem_pool.def("store_tensor(Tensor device_tensor, Tensor host_tensor) -> ()"); - mem_pool.impl("store_tensor", torch::kCUDA, &store_tensor); - mem_pool.def("load_tensor(Tensor host_tensor, Tensor device_tensor) -> ()"); - mem_pool.impl("load_tensor", torch::kCUDA, &load_tensor); -} - REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index fa5ec404c62b..6905e59dde79 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1663,14 +1663,6 @@ def free_shared_buffer(ptr: int) -> None: torch.ops._C_custom_ar.free_shared_buffer(ptr) -def store_tensor(device_tensor: torch.Tensor, host_tensor: torch.Tensor): - torch.ops._C_mem_pool.store_tensor(device_tensor, host_tensor) - - -def load_tensor(host_tensor: torch.Tensor, device_tensor: torch.Tensor): - torch.ops._C_mem_pool.load_tensor(host_tensor, device_tensor) - - def get_flash_mla_metadata( cache_seqlens: torch.Tensor, num_heads_per_head_k: int, @@ -1742,3 +1734,11 @@ def cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor, torch.ops._C.cutlass_mla_decode(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale) return out + + +def store_tensor(device_tensor: torch.Tensor, host_tensor: torch.Tensor): + torch.ops._C.store_tensor(device_tensor, host_tensor) + + +def load_tensor(host_tensor: torch.Tensor, device_tensor: torch.Tensor): + torch.ops._C.load_tensor(host_tensor, device_tensor) From bb3b6175e578423ca01d785bc66161060206f52f Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 14:48:13 +0800 Subject: [PATCH 078/155] add ! Signed-off-by: Abatom --- csrc/torch_bindings.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 4d184e2c5647..57d2e7f54a65 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -604,10 +604,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { #endif // Store Tensor to pinned memory - ops.def("store_tensor(Tensor device_tensor, Tensor host_tensor) -> ()"); + ops.def("store_tensor(Tensor device_tensor, Tensor! host_tensor) -> ()"); ops.impl("store_tensor", torch::kCUDA, &store_tensor); // Load Tensor from pinned memory - ops.def("load_tensor(Tensor host_tensor, Tensor device_tensor) -> ()"); + ops.def("load_tensor(Tensor host_tensor, Tensor! 
device_tensor) -> ()"); ops.impl("load_tensor", torch::kCUDA, &load_tensor); } From 9665cd24a056baa636d439a7f4c67ed714e87a46 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 15:35:16 +0800 Subject: [PATCH 079/155] add tensor_store_load_mem.cu to CMakeLists.txt Signed-off-by: Abatom --- CMakeLists.txt | 1 + csrc/tensor_store_load_mem.cu | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a6c54be9530b..7fb0f2d142bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -245,6 +245,7 @@ set(VLLM_EXT_SRC "csrc/cuda_utils_kernels.cu" "csrc/prepare_inputs/advance_step.cu" "csrc/custom_all_reduce.cu" + "csrc/tensor_store_load_mem.cu.cu" "csrc/torch_bindings.cpp") if(VLLM_GPU_LANG STREQUAL "CUDA") diff --git a/csrc/tensor_store_load_mem.cu b/csrc/tensor_store_load_mem.cu index bb7672623360..c5b65e68e9ba 100644 --- a/csrc/tensor_store_load_mem.cu +++ b/csrc/tensor_store_load_mem.cu @@ -23,7 +23,7 @@ __global__ void load_kernel(const scalar_t* host_ptr, scalar_t* device_ptr, // Templated wrapper function: Store Tensor to pinned memory template -void store_tensor_impl(torch::Tensor device_tensor, torch::Tensor host_tensor) { +void store_tensor_impl(torch::Tensor& device_tensor, torch::Tensor& host_tensor) { const auto num_elements = device_tensor.numel(); const int threads = 256; const int blocks = (num_elements + threads - 1) / threads; @@ -38,7 +38,7 @@ void store_tensor_impl(torch::Tensor device_tensor, torch::Tensor host_tensor) { // Templated wrapper function: Load Tensor from pinned memory template -void load_tensor_impl(torch::Tensor host_tensor, torch::Tensor device_tensor) { +void load_tensor_impl(torch::Tensor& host_tensor, torch::Tensor& device_tensor) { const auto num_elements = host_tensor.numel(); const int threads = 256; const int blocks = (num_elements + threads - 1) / threads; @@ -52,7 +52,7 @@ void load_tensor_impl(torch::Tensor host_tensor, torch::Tensor device_tensor) { } // Type-dispatched wrapper function -void store_tensor(torch::Tensor device_tensor, torch::Tensor host_tensor) { +void store_tensor(torch::Tensor& device_tensor, torch::Tensor& host_tensor) { // Validate arguments AT_ASSERT(device_tensor.is_cuda(), "Input tensor must be a CUDA tensor"); AT_ASSERT(host_tensor.is_pinned(), "Output tensor must be pinned memory"); @@ -77,7 +77,7 @@ void store_tensor(torch::Tensor device_tensor, torch::Tensor host_tensor) { } } -void load_tensor(torch::Tensor host_tensor, torch::Tensor device_tensor) { +void load_tensor(torch::Tensor& host_tensor, torch::Tensor& device_tensor) { // Validate arguments AT_ASSERT(device_tensor.is_cuda(), "Output tensor must be a CUDA tensor"); AT_ASSERT(host_tensor.is_pinned(), "Input tensor must be pinned memory"); From c792a55d710b75dfc0436b6ee7530a4439150459 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 15:39:09 +0800 Subject: [PATCH 080/155] bugfix for CMakeLists.txt Signed-off-by: Abatom --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7fb0f2d142bb..bc4944f31a97 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -245,7 +245,7 @@ set(VLLM_EXT_SRC "csrc/cuda_utils_kernels.cu" "csrc/prepare_inputs/advance_step.cu" "csrc/custom_all_reduce.cu" - "csrc/tensor_store_load_mem.cu.cu" + "csrc/tensor_store_load_mem.cu" "csrc/torch_bindings.cpp") if(VLLM_GPU_LANG STREQUAL "CUDA") From d02ecabd57b67d1eb6ec88ccd8ab1ebd9f1b59ef Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 15:49:31 +0800 
Subject: [PATCH 081/155] bugfix for cmake Signed-off-by: Abatom --- csrc/ops.h | 4 ++-- csrc/torch_bindings.cpp | 16 +++++++++------- vllm/_custom_ops.py | 4 ++-- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/csrc/ops.h b/csrc/ops.h index d893370c0dc3..646abb724ac0 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -345,5 +345,5 @@ std::tuple allocate_shared_buffer_and_handle( int64_t size); int64_t open_mem_handle(torch::Tensor& mem_handle); void free_shared_buffer(int64_t buffer); -void store_tensor(torch::Tensor device_tensor, torch::Tensor host_tensor); -void load_tensor(torch::Tensor host_tensor, torch::Tensor device_tensor); +void store_tensor(torch::Tensor& device_tensor, torch::Tensor& host_tensor); +void load_tensor(torch::Tensor& host_tensor, torch::Tensor& device_tensor); diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 57d2e7f54a65..78f6be635ebf 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -602,13 +602,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) -> Tensor"); // conditionally compiled so impl in source file #endif - - // Store Tensor to pinned memory - ops.def("store_tensor(Tensor device_tensor, Tensor! host_tensor) -> ()"); - ops.impl("store_tensor", torch::kCUDA, &store_tensor); - // Load Tensor from pinned memory - ops.def("load_tensor(Tensor host_tensor, Tensor! device_tensor) -> ()"); - ops.impl("load_tensor", torch::kCUDA, &load_tensor); } TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { @@ -710,4 +703,13 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { custom_ar.def("free_shared_buffer", &free_shared_buffer); } +TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _mem_pool), mem_pool) { + // Store Tensor to pinned memory + mem_pool.def("store_tensor(Tensor device_tensor, Tensor! host_tensor) -> ()"); + mem_pool.impl("store_tensor", torch::kCUDA, &store_tensor); + // Load Tensor from pinned memory + mem_pool.def("load_tensor(Tensor host_tensor, Tensor! 
device_tensor) -> ()"); + mem_pool.impl("load_tensor", torch::kCUDA, &load_tensor); +} + REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 6905e59dde79..3e7ae3c0e31e 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1737,8 +1737,8 @@ def cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor, def store_tensor(device_tensor: torch.Tensor, host_tensor: torch.Tensor): - torch.ops._C.store_tensor(device_tensor, host_tensor) + torch.ops._C_mem_pool.store_tensor(device_tensor, host_tensor) def load_tensor(host_tensor: torch.Tensor, device_tensor: torch.Tensor): - torch.ops._C.load_tensor(host_tensor, device_tensor) + torch.ops._C_mem_pool.load_tensor(host_tensor, device_tensor) From d33a40462531cf04100be29441d41068d7454965 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 15:59:02 +0800 Subject: [PATCH 082/155] #include Signed-off-by: Abatom --- csrc/tensor_store_load_mem.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/csrc/tensor_store_load_mem.cu b/csrc/tensor_store_load_mem.cu index c5b65e68e9ba..0b8579c19e5b 100644 --- a/csrc/tensor_store_load_mem.cu +++ b/csrc/tensor_store_load_mem.cu @@ -1,3 +1,4 @@ +#include #include #include From 2c4996549cd6cf6d4fc4e8e004f72c3aeb92917d Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 16:40:07 +0800 Subject: [PATCH 083/155] fix make error Signed-off-by: Abatom --- csrc/tensor_store_load_mem.cu | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/csrc/tensor_store_load_mem.cu b/csrc/tensor_store_load_mem.cu index 0b8579c19e5b..33d047367f2b 100644 --- a/csrc/tensor_store_load_mem.cu +++ b/csrc/tensor_store_load_mem.cu @@ -1,6 +1,6 @@ -#include -#include -#include +#include +#include +#include // Template-based CUDA kernel: Copy from device memory to pinned host memory template @@ -55,11 +55,11 @@ void load_tensor_impl(torch::Tensor& host_tensor, torch::Tensor& device_tensor) // Type-dispatched wrapper function void store_tensor(torch::Tensor& device_tensor, torch::Tensor& host_tensor) { // Validate arguments - AT_ASSERT(device_tensor.is_cuda(), "Input tensor must be a CUDA tensor"); - AT_ASSERT(host_tensor.is_pinned(), "Output tensor must be pinned memory"); - AT_ASSERT(device_tensor.numel() == host_tensor.numel(), + TORCH_CHECK(device_tensor.is_cuda(), "Input tensor must be a CUDA tensor"); + TORCH_CHECK(host_tensor.is_pinned(), "Output tensor must be pinned memory"); + TORCH_CHECK(device_tensor.numel() == host_tensor.numel(), "Tensors must have same number of elements"); - AT_ASSERT(device_tensor.dtype() == host_tensor.dtype(), + TORCH_CHECK(device_tensor.dtype() == host_tensor.dtype(), "Tensors must have same dtype"); // Type-based dispatch to different implementations @@ -74,17 +74,17 @@ void store_tensor(torch::Tensor& device_tensor, torch::Tensor& host_tensor) { store_tensor_impl(device_tensor, host_tensor); break; default: - AT_ERROR("Unsupported data type: ", device_tensor.scalar_type()); + TORCH_CHECK("Unsupported data type: ", device_tensor.scalar_type()); } } void load_tensor(torch::Tensor& host_tensor, torch::Tensor& device_tensor) { // Validate arguments - AT_ASSERT(device_tensor.is_cuda(), "Output tensor must be a CUDA tensor"); - AT_ASSERT(host_tensor.is_pinned(), "Input tensor must be pinned memory"); - AT_ASSERT(device_tensor.numel() == host_tensor.numel(), + TORCH_CHECK(device_tensor.is_cuda(), "Output tensor must be a CUDA tensor"); + TORCH_CHECK(host_tensor.is_pinned(), "Input tensor must be 
pinned memory"); + TORCH_CHECK(device_tensor.numel() == host_tensor.numel(), "Tensors must have same number of elements"); - AT_ASSERT(device_tensor.dtype() == host_tensor.dtype(), + TORCH_CHECK(device_tensor.dtype() == host_tensor.dtype(), "Tensors must have same dtype"); // Type-based dispatch to different implementations @@ -99,15 +99,6 @@ void load_tensor(torch::Tensor& host_tensor, torch::Tensor& device_tensor) { load_tensor_impl(host_tensor, device_tensor); break; default: - AT_ERROR("Unsupported data type: ", host_tensor.scalar_type()); + TORCH_CHECK("Unsupported data type: ", host_tensor.scalar_type()); } } - -// PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { -// m.def("store_tensor", &store_tensor, -// "Store CUDA tensor to pinned memory -// (supports float32, float16, bfloat16)"); -// m.def("load_tensor", &load_tensor, -// "Load CUDA tensor from pinned memory -// (supports float32, float16, bfloat16)"); -// } From ed94529c6485253310b2b926ab63c979285c1d69 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 16:51:47 +0800 Subject: [PATCH 084/155] fix make error Signed-off-by: Abatom --- csrc/tensor_store_load_mem.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/csrc/tensor_store_load_mem.cu b/csrc/tensor_store_load_mem.cu index 33d047367f2b..a54babc865eb 100644 --- a/csrc/tensor_store_load_mem.cu +++ b/csrc/tensor_store_load_mem.cu @@ -1,5 +1,6 @@ #include #include +#include #include // Template-based CUDA kernel: Copy from device memory to pinned host memory From b9ac67724abfa22aee86e264a2ad793afc773650 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 17:07:00 +0800 Subject: [PATCH 085/155] format Signed-off-by: Abatom --- csrc/tensor_store_load_mem.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/csrc/tensor_store_load_mem.cu b/csrc/tensor_store_load_mem.cu index a54babc865eb..59c48cea9045 100644 --- a/csrc/tensor_store_load_mem.cu +++ b/csrc/tensor_store_load_mem.cu @@ -25,7 +25,8 @@ __global__ void load_kernel(const scalar_t* host_ptr, scalar_t* device_ptr, // Templated wrapper function: Store Tensor to pinned memory template -void store_tensor_impl(torch::Tensor& device_tensor, torch::Tensor& host_tensor) { +void store_tensor_impl(torch::Tensor& device_tensor, + torch::Tensor& host_tensor) { const auto num_elements = device_tensor.numel(); const int threads = 256; const int blocks = (num_elements + threads - 1) / threads; @@ -40,7 +41,8 @@ void store_tensor_impl(torch::Tensor& device_tensor, torch::Tensor& host_tensor) // Templated wrapper function: Load Tensor from pinned memory template -void load_tensor_impl(torch::Tensor& host_tensor, torch::Tensor& device_tensor) { +void load_tensor_impl(torch::Tensor& host_tensor, + torch::Tensor& device_tensor) { const auto num_elements = host_tensor.numel(); const int threads = 256; const int blocks = (num_elements + threads - 1) / threads; @@ -59,9 +61,9 @@ void store_tensor(torch::Tensor& device_tensor, torch::Tensor& host_tensor) { TORCH_CHECK(device_tensor.is_cuda(), "Input tensor must be a CUDA tensor"); TORCH_CHECK(host_tensor.is_pinned(), "Output tensor must be pinned memory"); TORCH_CHECK(device_tensor.numel() == host_tensor.numel(), - "Tensors must have same number of elements"); + "Tensors must have same number of elements"); TORCH_CHECK(device_tensor.dtype() == host_tensor.dtype(), - "Tensors must have same dtype"); + "Tensors must have same dtype"); // Type-based dispatch to different implementations switch (device_tensor.scalar_type()) { @@ -84,9 +86,9 @@ void 
load_tensor(torch::Tensor& host_tensor, torch::Tensor& device_tensor) { TORCH_CHECK(device_tensor.is_cuda(), "Output tensor must be a CUDA tensor"); TORCH_CHECK(host_tensor.is_pinned(), "Input tensor must be pinned memory"); TORCH_CHECK(device_tensor.numel() == host_tensor.numel(), - "Tensors must have same number of elements"); + "Tensors must have same number of elements"); TORCH_CHECK(device_tensor.dtype() == host_tensor.dtype(), - "Tensors must have same dtype"); + "Tensors must have same dtype"); // Type-based dispatch to different implementations switch (host_tensor.scalar_type()) { From 684d070666e03d90a620ba459bf2042b45b72de1 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 17:47:45 +0800 Subject: [PATCH 086/155] bugfix & format Signed-off-by: Abatom --- csrc/tensor_store_load_mem.cu | 8 ++++---- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/csrc/tensor_store_load_mem.cu b/csrc/tensor_store_load_mem.cu index 59c48cea9045..449b18552813 100644 --- a/csrc/tensor_store_load_mem.cu +++ b/csrc/tensor_store_load_mem.cu @@ -61,9 +61,9 @@ void store_tensor(torch::Tensor& device_tensor, torch::Tensor& host_tensor) { TORCH_CHECK(device_tensor.is_cuda(), "Input tensor must be a CUDA tensor"); TORCH_CHECK(host_tensor.is_pinned(), "Output tensor must be pinned memory"); TORCH_CHECK(device_tensor.numel() == host_tensor.numel(), - "Tensors must have same number of elements"); + "Tensors must have same number of elements"); TORCH_CHECK(device_tensor.dtype() == host_tensor.dtype(), - "Tensors must have same dtype"); + "Tensors must have same dtype"); // Type-based dispatch to different implementations switch (device_tensor.scalar_type()) { @@ -86,9 +86,9 @@ void load_tensor(torch::Tensor& host_tensor, torch::Tensor& device_tensor) { TORCH_CHECK(device_tensor.is_cuda(), "Output tensor must be a CUDA tensor"); TORCH_CHECK(host_tensor.is_pinned(), "Input tensor must be pinned memory"); TORCH_CHECK(device_tensor.numel() == host_tensor.numel(), - "Tensors must have same number of elements"); + "Tensors must have same number of elements"); TORCH_CHECK(device_tensor.dtype() == host_tensor.dtype(), - "Tensors must have same dtype"); + "Tensors must have same dtype"); // Type-based dispatch to different implementations switch (host_tensor.scalar_type()) { diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 1cacd6667a6a..86ce231455e1 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -36,7 +36,7 @@ def __init__(self, if not hostname: hostname = get_ip() - port = self.config.kv_port + port_offset + port = int(self.config.kv_port) + port_offset if port == 0: raise ValueError("Port cannot be 0") self._hostname = hostname From 152b140074df8971ef9cb4b79c3d8fa72448de93 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 19:51:39 +0800 Subject: [PATCH 087/155] float(self.config.kv_buffer_size) Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 86ce231455e1..fd9f4e87b1c9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ 
b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -98,7 +98,7 @@ def __init__(self, self.comms: dict[str, Any] = {} # remote_address: (ncclComm_t, rank) self.buffer_size = 0 - self.buffer_size_threshold = self.config.kv_buffer_size + self.buffer_size_threshold = float(self.config.kv_buffer_size) self._listener_thread = threading.Thread( target=self._listen_for_requests, daemon=True) From 1ba522800facbf885a181e05a25220b8a477392e Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 20:42:57 +0800 Subject: [PATCH 088/155] add log Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index fd9f4e87b1c9..c0a43461092e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -110,6 +110,13 @@ def __init__(self, daemon=True) self._ping_thread.start() + logger.info("๐Ÿ’ฏP2pNcclEngine init, rank:%d, local_rank:%d, " + "http_address:%s, zmq_address:%s, proxy_address:%s, " + "send_type:%s, buffer_size_threshold:%.2f", self.rank, + self.local_rank, self.http_address, self.zmq_address, + self.proxy_address, self.send_type, + self.buffer_size_threshold) + def _create_connect(self, remote_address: typing.Optional[str] = None): assert remote_address is not None if remote_address not in self.socks: From 65e5f43d111f4aed030a89df9131295b4469dea9 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 21 May 2025 20:59:24 +0800 Subject: [PATCH 089/155] format Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index c0a43461092e..739fc6924b5f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -110,12 +110,12 @@ def __init__(self, daemon=True) self._ping_thread.start() - logger.info("๐Ÿ’ฏP2pNcclEngine init, rank:%d, local_rank:%d, " - "http_address:%s, zmq_address:%s, proxy_address:%s, " - "send_type:%s, buffer_size_threshold:%.2f", self.rank, - self.local_rank, self.http_address, self.zmq_address, - self.proxy_address, self.send_type, - self.buffer_size_threshold) + logger.info( + "๐Ÿ’ฏP2pNcclEngine init, rank:%d, local_rank:%d, http_address:%s" + "zmq_address:%s, proxy_address:%s, send_type:%s, buffer_size_" + "threshold:%.2f", self.rank, self.local_rank, self.http_address, + self.zmq_address, self.proxy_address, self.send_type, + self.buffer_size_threshold) def _create_connect(self, remote_address: typing.Optional[str] = None): assert remote_address is not None From 28aa69389799b65886dc29b45cff4c2b19ccff73 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 22 May 2025 10:57:52 +0800 Subject: [PATCH 090/155] change - to # Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 234d76dd9302..86ef3212520c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ 
b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -188,7 +188,7 @@ def inject_kv_into_layer( forward_context.virtual_engine] kv_cache = self.p2p_nccl_engine.recv_tensor( - request.request_id + "-" + layer_name) + request.request_id + "#" + layer_name) if kv_cache is None: logger.warning("๐Ÿšงsrc_kv_cache is None, %s", @@ -252,7 +252,7 @@ def extract_kv_from_layer( remote_address = ip + ":" + str(port + self._rank) kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) - self.p2p_nccl_engine.send_tensor(request_id + "-" + layer_name, + self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, kv_cache, remote_address) def wait_for_save(self): @@ -352,6 +352,10 @@ def build_connector_meta( block_size=self._block_size) total_need_load += 1 + for finished_req in scheduler_output.finished_req_ids: + # TODO: Abatom + break + assert total_need_load == len(self._requests_need_load) self._requests_need_load.clear() return meta From 8512639d2926d24fdd5df4276be885468baedced Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 22 May 2025 16:12:41 +0800 Subject: [PATCH 091/155] add get_finished & request_finished Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 67 +++++++++++++++++-- 1 file changed, 60 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 86ef3212520c..c4bb0550481e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -2,7 +2,7 @@ import re from dataclasses import dataclass -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Optional import torch @@ -93,6 +93,10 @@ def __init__(self, port_offset=rank, ) if role == KVConnectorRole.WORKER else None + # ============================== + # Worker-side methods + # ============================== + def start_load_kv(self, forward_context: "ForwardContext", **kwargs) -> None: """Start loading the KV cache from the connector buffer to vLLM's @@ -109,6 +113,10 @@ def start_load_kv(self, forward_context: "ForwardContext", assert self.p2p_nccl_engine is not None attn_metadata = forward_context.attn_metadata + if attn_metadata is None: + logger.warning( + "In connector.start_load_kv, but the attn_metadata is None") + return def inject_kv_into_layer( dst_kv_cache_layer: torch.Tensor, @@ -172,12 +180,6 @@ def inject_kv_into_layer( ) return - attn_metadata = forward_context.attn_metadata - if attn_metadata is None: - logger.warning( - "In connector.start_load_kv, but the attn_metadata is None") - return - # Load the KV for each request each layer for request in metadata.requests: if self.is_producer: @@ -260,6 +262,28 @@ def wait_for_save(self): assert self.p2p_nccl_engine is not None self.p2p_nccl_engine.wait_for_sent() + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[Optional[set[str]], Optional[set[str]]]: + """ + Notifies worker-side connector ids of requests that have + finished generating tokens. + + Returns: + ids of requests that have finished asynchronous transfer, + tuple of (sending/saving ids, recving/loading ids). + The finished saves/sends req ids must belong to a set provided in a + call to this method (this call or a prior one). 
+ """ + + logger.debug("๐Ÿžget_finished, finished_req_ids:%s", finished_req_ids) + + return None, None + + # ============================== + # Scheduler-side methods + # ============================== + def get_num_new_matched_tokens( self, request: "Request", @@ -311,6 +335,10 @@ def build_connector_meta( Args: scheduler_output (SchedulerOutput): the scheduler output object. """ + + logger.debug( + "๐Ÿžbuild_connector_meta, scheduler_output:%s", scheduler_output) + meta = P2pNcclConnectorMetadata() total_need_load = 0 @@ -360,6 +388,31 @@ def build_connector_meta( self._requests_need_load.clear() return meta + def request_finished( + self, + request: "Request", + block_ids: list[int], + ) -> tuple[bool, Optional[dict[str, Any]]]: + """ + Called when a request has finished, before its blocks are freed. + + Returns: + True if the request is being saved/sent asynchronously and blocks + should not be freed until the request_id is returned from + get_finished(). + Optional KVTransferParams to be included in the request outputs + returned by the engine. + """ + + logger.debug("๐Ÿžrequest_finished, request_id:%s, block_ids:%s", + request.request_id, block_ids) + + return False, None + + # ============================== + # Static methods + # ============================== + @staticmethod def parse_request_id(request_id: str, is_prefill=True) -> tuple[str, int]: logger.debug("parse_request_id, request_id: %s, is_prefill: %s", From b7d73752eef1231d88a497d21bcb64d479423606 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 22 May 2025 16:25:47 +0800 Subject: [PATCH 092/155] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index c4bb0550481e..bd0d065ca8e3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -336,8 +336,8 @@ def build_connector_meta( scheduler_output (SchedulerOutput): the scheduler output object. 
""" - logger.debug( - "๐Ÿžbuild_connector_meta, scheduler_output:%s", scheduler_output) + logger.debug("๐Ÿžbuild_connector_meta, scheduler_output:%s", + scheduler_output) meta = P2pNcclConnectorMetadata() From ad815aca7a3e013abc255730ff94370a092e8bf3 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 23 May 2025 18:43:02 +0800 Subject: [PATCH 093/155] bugfix for chunked prefill Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 104 +++++++++++++----- 1 file changed, 78 insertions(+), 26 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index bd0d065ca8e3..7644218734cb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -44,6 +44,13 @@ def make_meta(request_id: str, token_ids: list[int], block_ids: list[int], slot_mapping = block_offsets.reshape((1, block_size)) + \ block_ids_tensor.reshape((num_blocks, 1)) * block_size slot_mapping = slot_mapping.flatten()[:valid_num_tokens] + + logger.info( + "๐ŸžP2pNcclConnector make_meta, request_id:%s, token_ids:%s, " + "valid_num_tokens:%d, block_ids:%s, num_blocks:%d, block_size:%d, " + "slot_mapping:%s", request_id, token_ids, valid_num_tokens, + block_ids, num_blocks, block_size, slot_mapping.tolist()) + return ReqMeta( request_id=request_id, token_ids=token_ids_tensor, @@ -85,6 +92,7 @@ def __init__(self, self.config = vllm_config.kv_transfer_config self.rank = rank self.is_producer = self.config.is_kv_producer + self.chunked_prefill: dict[str, Any] = {} self.p2p_nccl_engine = P2pNcclEngine( local_rank=local_rank, @@ -110,6 +118,11 @@ def start_load_kv(self, forward_context: "ForwardContext", The number of elements in kv_caches and layer_names should be the same. """ + + # Only consumer/decode loads KV Cache + if self.is_producer: + return + assert self.p2p_nccl_engine is not None attn_metadata = forward_context.attn_metadata @@ -122,6 +135,7 @@ def inject_kv_into_layer( dst_kv_cache_layer: torch.Tensor, src_kv_cache: torch.Tensor, slot_mapping: torch.Tensor, + request_id: str, ) -> None: """Inject the KV cache into the layer. @@ -134,6 +148,7 @@ def inject_kv_into_layer( otherwise. slot_mapping (torch.Tensor): the slot mapping. In shape [num_tokens]. + request_id (str): request id for log """ dst_kv_cache_layer_shape = dst_kv_cache_layer.shape if isinstance(attn_metadata, MLACommonMetadata): @@ -149,7 +164,8 @@ def inject_kv_into_layer( ...] = src_kv_cache logger.warning( "๐Ÿšงsrc_kv_cache does not match, num_slot:%d, " - "num_token:%d", len(slot_mapping), num_token) + "num_token:%d, request_id:%s", len(slot_mapping), + num_token, request_id) dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) else: @@ -165,7 +181,8 @@ def inject_kv_into_layer( ...] 
= src_kv_cache logger.warning( "๐Ÿšงsrc_kv_cache does not match, num_slot:%d, " - "num_token:%d", len(slot_mapping), num_token) + "num_token:%d, request_id:%s", len(slot_mapping), + num_token, request_id) dst_kv_cache_layer.reshape(dst_kv_cache_layer_shape) @@ -182,8 +199,6 @@ def inject_kv_into_layer( # Load the KV for each request each layer for request in metadata.requests: - if self.is_producer: - continue for layer_name in forward_context.no_compile_layers: attn_layer = forward_context.no_compile_layers[layer_name] kv_cache_layer = attn_layer.kv_cache[ \ @@ -198,7 +213,7 @@ def inject_kv_into_layer( continue inject_kv_into_layer(kv_cache_layer, kv_cache, - request.slot_mapping) + request.slot_mapping, request.request_id) logger.info("Inject KV cache of %d tokens to the paged memory, %s", len(request.slot_mapping), request.request_id) @@ -226,6 +241,11 @@ def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, attn_metadata (AttentionMetadata): the attention metadata. **kwargs: additional arguments for the save operation. """ + + # Only producer/prefill saves KV Cache + if not self.is_producer: + return + assert self.p2p_nccl_engine is not None def extract_kv_from_layer( @@ -248,14 +268,13 @@ def extract_kv_from_layer( connector_metadata = self._get_connector_metadata() assert isinstance(connector_metadata, P2pNcclConnectorMetadata) for request in connector_metadata.requests: - if self.is_producer: - request_id = request.request_id - ip, port = self.parse_request_id(request_id, True) - remote_address = ip + ":" + str(port + self._rank) - kv_cache = extract_kv_from_layer(kv_layer, - request.slot_mapping) - self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, - kv_cache, remote_address) + request_id = request.request_id + ip, port = self.parse_request_id(request_id, True) + remote_address = ip + ":" + str(port + self._rank) + kv_cache = extract_kv_from_layer(kv_layer, + request.slot_mapping) + self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, + kv_cache, remote_address) def wait_for_save(self): if self.is_producer: @@ -276,7 +295,7 @@ def get_finished( call to this method (this call or a prior one). """ - logger.debug("๐Ÿžget_finished, finished_req_ids:%s", finished_req_ids) + logger.info("๐Ÿžget_finished, finished_req_ids:%s", finished_req_ids) return None, None @@ -336,27 +355,64 @@ def build_connector_meta( scheduler_output (SchedulerOutput): the scheduler output object. 
""" - logger.debug("๐Ÿžbuild_connector_meta, scheduler_output:%s", + logger.info("๐Ÿžbuild_connector_meta, scheduler_output:%s", scheduler_output) meta = P2pNcclConnectorMetadata() total_need_load = 0 for new_req in scheduler_output.scheduled_new_reqs: + if self.is_producer: + num_scheduled_tokens = ( + scheduler_output.num_scheduled_tokens)[new_req.req_id] + num_tokens = num_scheduled_tokens + new_req.num_computed_tokens + # the request's prompt is chunked prefill + if num_tokens < len(new_req.prompt_token_ids): + # 'CachedRequestData' has no attribute 'prompt_token_ids' + self.chunked_prefill[new_req.req_id] = (new_req.block_ids, new_req.prompt_token_ids) + logger.info( + "๐Ÿžbuild_connector_meta, chunked prefill, request_id:%s," + "num_scheduled_tokens:%d, num_prompt_tokens:%d, " + "num_computed_tokens:%d, num_tokens:%d", new_req.req_id, + num_scheduled_tokens, len(new_req.prompt_token_ids), + new_req.num_computed_tokens, num_tokens) + continue + # the request's prompt is not chunked prefill + meta.add_request(request_id=new_req.req_id, + token_ids=new_req.prompt_token_ids, + block_ids=new_req.block_ids, + block_size=self._block_size) + continue if new_req.req_id in self._requests_need_load: meta.add_request(request_id=new_req.req_id, token_ids=new_req.prompt_token_ids, block_ids=new_req.block_ids, block_size=self._block_size) total_need_load += 1 - else: - if self.is_producer: - meta.add_request(request_id=new_req.req_id, - token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids, - block_size=self._block_size) for cached_req in scheduler_output.scheduled_cached_reqs: + if self.is_producer: + num_scheduled_tokens = ( + scheduler_output.num_scheduled_tokens)[cached_req.req_id] + num_tokens = num_scheduled_tokens + cached_req.num_computed_tokens + assert cached_req.req_id in self.chunked_prefill + block_ids = (self.chunked_prefill[cached_req.req_id][0] + + cached_req.new_block_ids) + prompt_token_ids = self.chunked_prefill[cached_req.req_id][1] + logger.info("๐Ÿžbuild_connector_meta, cached_req, request_id:%s, num_scheduled_tokens:%d, num_prompt_tokens:%d", + cached_req.req_id, num_scheduled_tokens, len(prompt_token_ids)) + # the request's prompt is chunked prefill again + if num_tokens < len(prompt_token_ids): + self.chunked_prefill[cached_req.req_id] = (block_ids, prompt_token_ids) + continue + # the request's prompt is all prefilled finally + meta.add_request(request_id=cached_req.req_id, + token_ids=prompt_token_ids, + block_ids=block_ids, + block_size=self._block_size) + self.chunked_prefill.pop(cached_req.req_id, None) + continue + # NOTE(rob): here we rely on the resumed requests being # the first N requests in the list scheduled_cache_reqs. if not cached_req.resumed_from_preemption: @@ -380,10 +436,6 @@ def build_connector_meta( block_size=self._block_size) total_need_load += 1 - for finished_req in scheduler_output.finished_req_ids: - # TODO: Abatom - break - assert total_need_load == len(self._requests_need_load) self._requests_need_load.clear() return meta @@ -404,7 +456,7 @@ def request_finished( returned by the engine. 
""" - logger.debug("๐Ÿžrequest_finished, request_id:%s, block_ids:%s", + logger.info("๐Ÿžrequest_finished, request_id:%s, block_ids:%s", request.request_id, block_ids) return False, None From 1db37295ae38f9c9dfb42487e5add8a76f2b9ea9 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 23 May 2025 19:47:42 +0800 Subject: [PATCH 094/155] format Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 7644218734cb..ddc40a684146 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -271,8 +271,7 @@ def extract_kv_from_layer( request_id = request.request_id ip, port = self.parse_request_id(request_id, True) remote_address = ip + ":" + str(port + self._rank) - kv_cache = extract_kv_from_layer(kv_layer, - request.slot_mapping) + kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, kv_cache, remote_address) @@ -356,7 +355,7 @@ def build_connector_meta( """ logger.info("๐Ÿžbuild_connector_meta, scheduler_output:%s", - scheduler_output) + scheduler_output) meta = P2pNcclConnectorMetadata() @@ -369,12 +368,14 @@ def build_connector_meta( # the request's prompt is chunked prefill if num_tokens < len(new_req.prompt_token_ids): # 'CachedRequestData' has no attribute 'prompt_token_ids' - self.chunked_prefill[new_req.req_id] = (new_req.block_ids, new_req.prompt_token_ids) + self.chunked_prefill[new_req.req_id] = ( + new_req.block_ids, new_req.prompt_token_ids) logger.info( - "๐Ÿžbuild_connector_meta, chunked prefill, request_id:%s," - "num_scheduled_tokens:%d, num_prompt_tokens:%d, " - "num_computed_tokens:%d, num_tokens:%d", new_req.req_id, - num_scheduled_tokens, len(new_req.prompt_token_ids), + "๐Ÿžbuild_connector_meta, chunked prefill, " + "request_id:%s, num_scheduled_tokens:%d, " + "num_prompt_tokens:%d, num_computed_tokens:%d, " + "num_tokens:%d", new_req.req_id, num_scheduled_tokens, + len(new_req.prompt_token_ids), new_req.num_computed_tokens, num_tokens) continue # the request's prompt is not chunked prefill @@ -399,11 +400,15 @@ def build_connector_meta( block_ids = (self.chunked_prefill[cached_req.req_id][0] + cached_req.new_block_ids) prompt_token_ids = self.chunked_prefill[cached_req.req_id][1] - logger.info("๐Ÿžbuild_connector_meta, cached_req, request_id:%s, num_scheduled_tokens:%d, num_prompt_tokens:%d", - cached_req.req_id, num_scheduled_tokens, len(prompt_token_ids)) + logger.info( + "๐Ÿžbuild_connector_meta, cached_req, request_id:%s, " + "num_scheduled_tokens:%d, num_prompt_tokens:%d", + cached_req.req_id, num_scheduled_tokens, + len(prompt_token_ids)) # the request's prompt is chunked prefill again if num_tokens < len(prompt_token_ids): - self.chunked_prefill[cached_req.req_id] = (block_ids, prompt_token_ids) + self.chunked_prefill[cached_req.req_id] = ( + block_ids, prompt_token_ids) continue # the request's prompt is all prefilled finally meta.add_request(request_id=cached_req.req_id, @@ -457,7 +462,7 @@ def request_finished( """ logger.info("๐Ÿžrequest_finished, request_id:%s, block_ids:%s", - request.request_id, block_ids) + request.request_id, block_ids) return False, None From 1caf9377def00a6b23936938ed6d76bee1747ece Mon Sep 17 00:00:00 2001 
From: Abatom Date: Fri, 23 May 2025 19:58:09 +0800 Subject: [PATCH 095/155] format & debug log Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index ddc40a684146..5a117270c0d2 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -45,7 +45,7 @@ def make_meta(request_id: str, token_ids: list[int], block_ids: list[int], block_ids_tensor.reshape((num_blocks, 1)) * block_size slot_mapping = slot_mapping.flatten()[:valid_num_tokens] - logger.info( + logger.debug( "๐ŸžP2pNcclConnector make_meta, request_id:%s, token_ids:%s, " "valid_num_tokens:%d, block_ids:%s, num_blocks:%d, block_size:%d, " "slot_mapping:%s", request_id, token_ids, valid_num_tokens, @@ -294,7 +294,7 @@ def get_finished( call to this method (this call or a prior one). """ - logger.info("๐Ÿžget_finished, finished_req_ids:%s", finished_req_ids) + logger.debug("๐Ÿžget_finished, finished_req_ids:%s", finished_req_ids) return None, None @@ -354,8 +354,8 @@ def build_connector_meta( scheduler_output (SchedulerOutput): the scheduler output object. """ - logger.info("๐Ÿžbuild_connector_meta, scheduler_output:%s", - scheduler_output) + logger.debug("๐Ÿžbuild_connector_meta, scheduler_output:%s", + scheduler_output) meta = P2pNcclConnectorMetadata() @@ -370,7 +370,7 @@ def build_connector_meta( # 'CachedRequestData' has no attribute 'prompt_token_ids' self.chunked_prefill[new_req.req_id] = ( new_req.block_ids, new_req.prompt_token_ids) - logger.info( + logger.debug( "๐Ÿžbuild_connector_meta, chunked prefill, " "request_id:%s, num_scheduled_tokens:%d, " "num_prompt_tokens:%d, num_computed_tokens:%d, " @@ -395,12 +395,13 @@ def build_connector_meta( if self.is_producer: num_scheduled_tokens = ( scheduler_output.num_scheduled_tokens)[cached_req.req_id] - num_tokens = num_scheduled_tokens + cached_req.num_computed_tokens + num_tokens = (num_scheduled_tokens + + cached_req.num_computed_tokens) assert cached_req.req_id in self.chunked_prefill block_ids = (self.chunked_prefill[cached_req.req_id][0] + cached_req.new_block_ids) prompt_token_ids = self.chunked_prefill[cached_req.req_id][1] - logger.info( + logger.debug( "๐Ÿžbuild_connector_meta, cached_req, request_id:%s, " "num_scheduled_tokens:%d, num_prompt_tokens:%d", cached_req.req_id, num_scheduled_tokens, @@ -461,8 +462,8 @@ def request_finished( returned by the engine. 
""" - logger.info("๐Ÿžrequest_finished, request_id:%s, block_ids:%s", - request.request_id, block_ids) + logger.debug("๐Ÿžrequest_finished, request_id:%s, block_ids:%s", + request.request_id, block_ids) return False, None From b1937e924e38afa9ec71e772b3e010561394f6ce Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 24 May 2025 15:38:24 +0800 Subject: [PATCH 096/155] bugfix for Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 5a117270c0d2..75d5ec0c23ab 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -369,7 +369,7 @@ def build_connector_meta( if num_tokens < len(new_req.prompt_token_ids): # 'CachedRequestData' has no attribute 'prompt_token_ids' self.chunked_prefill[new_req.req_id] = ( - new_req.block_ids, new_req.prompt_token_ids) + new_req.block_ids[0], new_req.prompt_token_ids) logger.debug( "๐Ÿžbuild_connector_meta, chunked prefill, " "request_id:%s, num_scheduled_tokens:%d, " @@ -381,13 +381,13 @@ def build_connector_meta( # the request's prompt is not chunked prefill meta.add_request(request_id=new_req.req_id, token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids, + block_ids=new_req.block_ids[0], block_size=self._block_size) continue if new_req.req_id in self._requests_need_load: meta.add_request(request_id=new_req.req_id, token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids, + block_ids=new_req.block_ids[0], block_size=self._block_size) total_need_load += 1 @@ -399,7 +399,7 @@ def build_connector_meta( cached_req.num_computed_tokens) assert cached_req.req_id in self.chunked_prefill block_ids = (self.chunked_prefill[cached_req.req_id][0] + - cached_req.new_block_ids) + cached_req.new_block_ids[0]) prompt_token_ids = self.chunked_prefill[cached_req.req_id][1] logger.debug( "๐Ÿžbuild_connector_meta, cached_req, request_id:%s, " @@ -434,7 +434,7 @@ def build_connector_meta( # NOTE(rob): For resumed req, new_block_ids is all # of the block_ids for the request. 
- block_ids = cached_req.new_block_ids + block_ids = cached_req.new_block_ids[0] meta.add_request(request_id=cached_req.req_id, token_ids=token_ids, From fa2f13087e52402a07da2a56b520081e1d9e62f3 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 24 May 2025 20:57:07 +0800 Subject: [PATCH 097/155] 1/N preemption Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 75d5ec0c23ab..4c65b5036876 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -398,8 +398,10 @@ def build_connector_meta( num_tokens = (num_scheduled_tokens + cached_req.num_computed_tokens) assert cached_req.req_id in self.chunked_prefill - block_ids = (self.chunked_prefill[cached_req.req_id][0] + - cached_req.new_block_ids[0]) + block_ids = cached_req.new_block_ids[0] + if not cached_req.resumed_from_preemption: + block_ids = (self.chunked_prefill[cached_req.req_id][0] + + block_ids) prompt_token_ids = self.chunked_prefill[cached_req.req_id][1] logger.debug( "๐Ÿžbuild_connector_meta, cached_req, request_id:%s, " @@ -465,6 +467,8 @@ def request_finished( logger.debug("๐Ÿžrequest_finished, request_id:%s, block_ids:%s", request.request_id, block_ids) + self.chunked_prefill.pop(request.request_id, None) + return False, None # ============================== From 4f7947636c6688320d8d3f1c2b4c72be3cf80794 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 24 May 2025 21:26:29 +0800 Subject: [PATCH 098/155] Clear the buffer upon request completion(1/N) Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 4 +--- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 4c65b5036876..6f211bfd5896 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -294,9 +294,7 @@ def get_finished( call to this method (this call or a prior one). """ - logger.debug("๐Ÿžget_finished, finished_req_ids:%s", finished_req_ids) - - return None, None + return self.p2p_nccl_engine.get_finished(finished_req_ids) # ============================== # Scheduler-side methods diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 739fc6924b5f..e042521df689 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -416,6 +416,24 @@ def _send_sync( self.zmq_address, remote_address, rank, data, tensor.shape) return True + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[Optional[set[str]], Optional[set[str]]]: + """ + Notifies worker-side connector ids of requests that have + finished generating tokens. + + Returns: + ids of requests that have finished asynchronous transfer, + tuple of (sending/saving ids, recving/loading ids). + The finished saves/sends req ids must belong to a set provided in a + call to this method (this call or a prior one). 
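The docstring above is the heart of the asynchronous-transfer contract: a finished request's blocks stay allocated until its id comes back from get_finished, so the engine can keep streaming KV out of those blocks in the background. A toy polling loop, purely illustrative and not vLLM's runner code, showing how a caller would consume the two returned sets:

    # Illustrative only: poll the connector and release blocks once the
    # asynchronous send has completed for a request.
    def drain_finished(connector, pending_send: set[str], free_blocks) -> None:
        done_send, done_recv = connector.get_finished(set())
        for req_id in done_send or set():
            pending_send.discard(req_id)
            free_blocks(req_id)   # safe to free only after the async send completed
        # done_recv would analogously unblock requests waiting on loaded KV.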
+ """ + + logger.debug("๐Ÿžget_finished, finished_req_ids:%s", finished_req_ids) + + return None, None + def _ping(self): sock = self.context.socket(zmq.DEALER) sock.setsockopt_string(zmq.IDENTITY, self.zmq_address) From e41d815e194d3caebdd7112dfe78e027aa7b6804 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 24 May 2025 22:12:13 +0800 Subject: [PATCH 099/155] Clear the buffer upon request completion(2/N) Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 2 ++ .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 1 + vllm/tensor_memory_pool.py | 2 -- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 6f211bfd5896..795bc930f516 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -294,6 +294,8 @@ def get_finished( call to this method (this call or a prior one). """ + assert self.p2p_nccl_engine is not None + return self.p2p_nccl_engine.get_finished(finished_req_ids) # ============================== diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index e042521df689..730efc8f80a0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -208,6 +208,7 @@ def recv_tensor( addr, dtype, shape = tensor tensor = self.pool.load_tensor(addr, dtype, shape, self.device) + self.pool.free(addr) else: addr = 0 self.buffer_size -= (tensor.element_size() * diff --git a/vllm/tensor_memory_pool.py b/vllm/tensor_memory_pool.py index 510592fb7eed..31857bbedaf3 100644 --- a/vllm/tensor_memory_pool.py +++ b/vllm/tensor_memory_pool.py @@ -168,8 +168,6 @@ def load_tensor(self, addr: int, dtype: torch.dtype, ops.load_tensor(cpu_tensor, cuda_tensor) self.load_stream.synchronize() - self.free(addr) - return cuda_tensor def cleanup(self): From eefd2672ddca358e8762922974d53fbd0f748683 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 26 May 2025 12:14:08 +0800 Subject: [PATCH 100/155] get_finished Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/base.py | 2 +- .../kv_connector/v1/multi_connector.py | 2 +- .../kv_connector/v1/nixl_connector.py | 3 +- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 8 +- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 77 +++++++++++++++++-- vllm/v1/worker/gpu_model_runner.py | 2 +- 6 files changed, 79 insertions(+), 15 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 44bfd300a3d9..0e61a18dcc12 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -182,7 +182,7 @@ def wait_for_save(self): pass def get_finished( - self, finished_req_ids: set[str] + self, finished_req_ids: set[str], **kwargs ) -> tuple[Optional[set[str]], Optional[set[str]]]: """ Notifies worker-side connector ids of requests that have diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 0aabb260fd3d..e2b4cae2d46b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ 
-101,7 +101,7 @@ def wait_for_save(self): c.wait_for_save() def get_finished( - self, finished_req_ids: set[str] + self, finished_req_ids: set[str], **kwargs ) -> tuple[Optional[set[str]], Optional[set[str]]]: finished_sending: set[str] = set() finished_recving: set[str] = set() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 6303d77ad305..763a11e8e744 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -140,7 +140,8 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): self.connector_worker.register_kv_caches(kv_caches) def get_finished(self, - finished_req_ids: set[str]) -> tuple[set[str], set[str]]: + finished_req_ids: set[str], + **kwargs) -> tuple[set[str], set[str]]: """Get the finished recving and sending requests.""" assert self.connector_worker is not None return self.connector_worker.get_finished() diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 795bc930f516..21bf7baab4d8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -281,7 +281,7 @@ def wait_for_save(self): self.p2p_nccl_engine.wait_for_sent() def get_finished( - self, finished_req_ids: set[str] + self, finished_req_ids: set[str], **kwargs ) -> tuple[Optional[set[str]], Optional[set[str]]]: """ Notifies worker-side connector ids of requests that have @@ -296,7 +296,9 @@ def get_finished( assert self.p2p_nccl_engine is not None - return self.p2p_nccl_engine.get_finished(finished_req_ids) + forward_context: "ForwardContext" = kwargs.get("forward_context") + return self.p2p_nccl_engine.get_finished( + finished_req_ids, forward_context) # ============================== # Scheduler-side methods @@ -330,7 +332,7 @@ def get_num_new_matched_tokens( "num_computed_tokens:%d", num_external_tokens, len(request.prompt_token_ids), num_computed_tokens) - return num_external_tokens, False + return num_external_tokens, self.load_kv_async def update_state_after_alloc(self, request: "Request", blocks: "KVCacheBlocks", diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 730efc8f80a0..682c1da09f4b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -81,12 +81,13 @@ def __init__(self, # PUT, GET, PUT_ASYNC. 
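Every transfer in this engine is a per-layer tensor keyed by the id the connector builds when it calls send_tensor, request_id + "#" + layer_name, and the send_type selected here only changes how that tensor travels: PUT pushes it synchronously, PUT_ASYNC hands it to a background send thread via send_queue, and GET parks it in send_store until the peer asks for it. A small sketch of the id convention and of recovering the owning request from a tensor id; the helper names are illustrative.

    def tensor_id_for(request_id: str, layer_name: str) -> str:
        return f"{request_id}#{layer_name}"

    def request_id_of(tensor_id: str) -> str:
        # Request ids contain no '#', so the first field is the request id.
        return tensor_id.split("#")[0]

    per_request: dict[str, set[str]] = {}

    def mark_transferred(tensor_id: str) -> None:
        per_request.setdefault(request_id_of(tensor_id), set()).add(tensor_id)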
self.send_type = self.config.get_from_extra_config("send_type", "PUT") if self.send_type == "GET": - self.send_store: dict[str, - torch.Tensor] = {} # tensor_id: torch.Tensor + # tensor_id: torch.Tensor + self.send_store: dict[str, torch.Tensor] = {} else: # PUT or PUT_ASYNC # tensor_id: torch.Tensor self.send_queue: deque[list[Any]] = deque() + self.send_request_id_to_tensor_ids: dict[str, set[str]] = {} if self.send_type == "PUT_ASYNC": self._send_thread = threading.Thread(target=self._send_async, daemon=True) @@ -94,6 +95,7 @@ def __init__(self, # tensor_id: torch.Tensor/(addr, dtype, shape) self.recv_store: dict[str, Any] = {} + self.recv_request_id_to_tensor_ids: dict[str, set[str]] = {} self.socks: dict[str, Any] = {} # remote_address: client socket self.comms: dict[str, Any] = {} # remote_address: (ncclComm_t, rank) @@ -199,16 +201,16 @@ def recv_tensor( while tensor_id not in self.recv_store: self.recv_store_cv.wait() tensor = self.recv_store[tensor_id] - self.recv_store[tensor_id] = None - while len(self.recv_store) > 10000: - self.recv_store.pop(next(iter(self.recv_store))) + # self.recv_store[tensor_id] = None + # while len(self.recv_store) > 10000: + # self.recv_store.pop(next(iter(self.recv_store))) if tensor is not None: if isinstance(tensor, tuple): addr, dtype, shape = tensor tensor = self.pool.load_tensor(addr, dtype, shape, self.device) - self.pool.free(addr) + # self.pool.free(addr) else: addr = 0 self.buffer_size -= (tensor.element_size() * @@ -322,6 +324,7 @@ def _listen_for_requests(self): with self.recv_store_cv: self.recv_store[tensor_id] = tensor + self._have_received_tensor_id(tensor_id) self.recv_store_cv.notify() elif data["cmd"] == "GET": @@ -337,12 +340,14 @@ def _listen_for_requests(self): } # LRU self.send_store[tensor_id] = tensor + self._have_sent_tensor_id(tensor_id) else: data = {"ret": 1} self.router_socket.send_multipart( [remote_address, msgpack.dumps(data)]) + rank = -1 if data["ret"] == 0: comm, rank = self.comms[remote_address.decode()] self._send(comm, tensor.to(self.device), rank ^ 1, @@ -357,6 +362,20 @@ def _listen_for_requests(self): "๐ŸšงUnexpected, Received message from %s, data:%s", remote_address, data) + def _have_sent_tensor_id(self, tensor_id: str): + request_id = tensor_id.split('#')[0] + if request_id not in self.send_request_id_to_tensor_ids: + self.send_request_id_to_tensor_ids[request_id] = set() + self.send_request_id_to_tensor_ids[request_id].add( + tensor_id) + + def _have_received_tensor_id(self, tensor_id: str): + request_id = tensor_id.split('#')[0] + if request_id not in self.recv_request_id_to_tensor_ids: + self.recv_request_id_to_tensor_ids[request_id] = set() + self.recv_request_id_to_tensor_ids[request_id].add( + tensor_id) + def _send_async(self): while True: with self.send_queue_cv: @@ -413,12 +432,16 @@ def _send_sync( return False self._send(comm, tensor.to(self.device), rank ^ 1, self.send_stream) + + if self.send_type == "PUT_ASYNC": + self._have_sent_tensor_id(tensor_id) + logger.info("๐Ÿ”ตSend Tensor, %s๐Ÿ‘‰%s, MyRank:%s, data:%s, tensor:%s", self.zmq_address, remote_address, rank, data, tensor.shape) return True def get_finished( - self, finished_req_ids: set[str] + self, finished_req_ids: set[str], forward_context: "ForwardContext" ) -> tuple[Optional[set[str]], Optional[set[str]]]: """ Notifies worker-side connector ids of requests that have @@ -433,7 +456,45 @@ def get_finished( logger.debug("๐Ÿžget_finished, finished_req_ids:%s", finished_req_ids) - return None, None + # Clear the buffer upon request 
completion. + for request_id in finished_req_ids: + for layer_name in forward_context.no_compile_layers: + tensor_id = request_id + "#" + layer_name + if tensor_id in self.recv_store: + with self.recv_store_cv: + tensor = self.recv_store.pop(tensor_id, None) + self.recv_request_id_to_tensor_ids.discard(request_id) + self.send_request_id_to_tensor_ids.discard(request_id) + addr = 0 + if isinstance(tensor, tuple): + addr, _, _ = tensor + self.pool.free(addr) + logger.debug( + "๐Ÿžget_finished, remove tensor_id:%s, addr:%d", + tensor_id, addr) + + num_layers = len(forward_context.no_compile_layers) + + # Retrieve requests that have already sent the KV cache. + finished_sending: set[str] = set() + if self.send_type == "PUT_ASYNC" or self.send_type == "GET": + for request_id in self.send_request_id_to_tensor_ids: + if (len(self.send_request_id_to_tensor_ids[request_id]) == + num_layers): + finished_sending.add(request_id) + + # Retrieve requests that have already received the KV cache. + finished_recving: set[str] = set() + for request_id in self.recv_request_id_to_tensor_ids: + if (len(self.recv_request_id_to_tensor_ids[request_id]) == + num_layers): + finished_recving.add(request_id) + + logger.debug( + "๐Ÿžget_finished, finished_sending:%s, finished_recving:%s", + finished_sending, finished_recving) + + return finished_sending or None, finished_recving or None def _ping(self): sock = self.context.socket(zmq.DEALER) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 42847e2f8c36..3d3b57548fd8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1479,7 +1479,7 @@ def get_finished_kv_transfers( ) -> tuple[Optional[set[str]], Optional[set[str]]]: if has_kv_transfer_group(): return get_kv_transfer_group().get_finished( - scheduler_output.finished_req_ids) + scheduler_output.finished_req_ids, get_forward_context()) return None, None def generate_draft_token_ids( From 1a0bcba259b5ea8ac0eda1cb2cd41933729ae983 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 26 May 2025 12:17:12 +0800 Subject: [PATCH 101/155] bugfix for get_finished Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 21bf7baab4d8..c538f3f715d4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -332,7 +332,7 @@ def get_num_new_matched_tokens( "num_computed_tokens:%d", num_external_tokens, len(request.prompt_token_ids), num_computed_tokens) - return num_external_tokens, self.load_kv_async + return num_external_tokens, True def update_state_after_alloc(self, request: "Request", blocks: "KVCacheBlocks", From f5876c8e793fc4ad708cfff1da3ab94a7f0324d0 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 26 May 2025 13:52:55 +0800 Subject: [PATCH 102/155] bugfix for get_finished and format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/base.py | 4 +-- .../kv_connector/v1/multi_connector.py | 4 +-- .../kv_connector/v1/nixl_connector.py | 3 +-- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 10 +++---- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 27 ++++++++++--------- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py 
b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 0e61a18dcc12..b40356c3ba69 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -182,8 +182,8 @@ def wait_for_save(self): pass def get_finished( - self, finished_req_ids: set[str], **kwargs - ) -> tuple[Optional[set[str]], Optional[set[str]]]: + self, finished_req_ids: set[str], + **kwargs) -> tuple[Optional[set[str]], Optional[set[str]]]: """ Notifies worker-side connector ids of requests that have finished generating tokens. diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index e2b4cae2d46b..286f182101c7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -101,8 +101,8 @@ def wait_for_save(self): c.wait_for_save() def get_finished( - self, finished_req_ids: set[str], **kwargs - ) -> tuple[Optional[set[str]], Optional[set[str]]]: + self, finished_req_ids: set[str], + **kwargs) -> tuple[Optional[set[str]], Optional[set[str]]]: finished_sending: set[str] = set() finished_recving: set[str] = set() for c in self._connectors: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 763a11e8e744..873758cbaf26 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -139,8 +139,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): assert self.connector_worker is not None self.connector_worker.register_kv_caches(kv_caches) - def get_finished(self, - finished_req_ids: set[str], + def get_finished(self, finished_req_ids: set[str], **kwargs) -> tuple[set[str], set[str]]: """Get the finished recving and sending requests.""" assert self.connector_worker is not None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index c538f3f715d4..877d5a482e95 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -281,8 +281,8 @@ def wait_for_save(self): self.p2p_nccl_engine.wait_for_sent() def get_finished( - self, finished_req_ids: set[str], **kwargs - ) -> tuple[Optional[set[str]], Optional[set[str]]]: + self, finished_req_ids: set[str], + **kwargs) -> tuple[Optional[set[str]], Optional[set[str]]]: """ Notifies worker-side connector ids of requests that have finished generating tokens. 
@@ -296,9 +296,9 @@ def get_finished( assert self.p2p_nccl_engine is not None - forward_context: "ForwardContext" = kwargs.get("forward_context") - return self.p2p_nccl_engine.get_finished( - finished_req_ids, forward_context) + forward_context: ForwardContext = kwargs.get("forward_context") + return self.p2p_nccl_engine.get_finished(finished_req_ids, + forward_context) # ============================== # Scheduler-side methods diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 682c1da09f4b..6bc0f4081b54 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -17,6 +17,9 @@ from vllm.tensor_memory_pool import TensorMemoryPool from vllm.utils import current_stream, get_ip +if TYPE_CHECKING: + from vllm.forward_context import ForwardContext + logger = logging.getLogger(__name__) @@ -366,15 +369,13 @@ def _have_sent_tensor_id(self, tensor_id: str): request_id = tensor_id.split('#')[0] if request_id not in self.send_request_id_to_tensor_ids: self.send_request_id_to_tensor_ids[request_id] = set() - self.send_request_id_to_tensor_ids[request_id].add( - tensor_id) + self.send_request_id_to_tensor_ids[request_id].add(tensor_id) def _have_received_tensor_id(self, tensor_id: str): request_id = tensor_id.split('#')[0] if request_id not in self.recv_request_id_to_tensor_ids: self.recv_request_id_to_tensor_ids[request_id] = set() - self.recv_request_id_to_tensor_ids[request_id].add( - tensor_id) + self.recv_request_id_to_tensor_ids[request_id].add(tensor_id) def _send_async(self): while True: @@ -463,15 +464,16 @@ def get_finished( if tensor_id in self.recv_store: with self.recv_store_cv: tensor = self.recv_store.pop(tensor_id, None) - self.recv_request_id_to_tensor_ids.discard(request_id) - self.send_request_id_to_tensor_ids.discard(request_id) + self.send_request_id_to_tensor_ids.pop(request_id, + None) + self.recv_request_id_to_tensor_ids.pop(request_id, + None) addr = 0 if isinstance(tensor, tuple): addr, _, _ = tensor self.pool.free(addr) - logger.debug( - "๐Ÿžget_finished, remove tensor_id:%s, addr:%d", - tensor_id, addr) + logger.debug("๐Ÿžget_finished, remove tensor_id:%s, addr:%d", + tensor_id, addr) num_layers = len(forward_context.no_compile_layers) @@ -486,13 +488,12 @@ def get_finished( # Retrieve requests that have already received the KV cache. 
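The completion test in get_finished is deliberately simple: once a request has one recorded tensor id per attention layer (num_layers comes from forward_context.no_compile_layers), its KV cache has been fully sent or fully received. A minimal sketch of that check, with illustrative names:

    def finished_requests(per_request: dict[str, set[str]],
                          num_layers: int) -> set[str]:
        # A request is complete once every layer's tensor id has been recorded.
        return {req_id for req_id, ids in per_request.items()
                if len(ids) == num_layers}

    # e.g. with 2 layers:
    # {"req-1": {"req-1#a", "req-1#b"}, "req-2": {"req-2#a"}} -> {"req-1"}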
finished_recving: set[str] = set() for request_id in self.recv_request_id_to_tensor_ids: - if (len(self.recv_request_id_to_tensor_ids[request_id]) == + if (len(self.recv_request_id_to_tensor_ids[request_id]) == num_layers): finished_recving.add(request_id) - logger.debug( - "๐Ÿžget_finished, finished_sending:%s, finished_recving:%s", - finished_sending, finished_recving) + logger.debug("๐Ÿžget_finished, finished_sending:%s, finished_recving:%s", + finished_sending, finished_recving) return finished_sending or None, finished_recving or None From 1d371668a103b626c1772598a04d06f088863618 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 26 May 2025 13:59:47 +0800 Subject: [PATCH 103/155] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 6bc0f4081b54..0a44007d9da9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -5,7 +5,7 @@ import time import typing from collections import deque -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional import msgpack import torch @@ -464,10 +464,10 @@ def get_finished( if tensor_id in self.recv_store: with self.recv_store_cv: tensor = self.recv_store.pop(tensor_id, None) - self.send_request_id_to_tensor_ids.pop(request_id, - None) - self.recv_request_id_to_tensor_ids.pop(request_id, - None) + self.send_request_id_to_tensor_ids.pop( + request_id, None) + self.recv_request_id_to_tensor_ids.pop( + request_id, None) addr = 0 if isinstance(tensor, tuple): addr, _, _ = tensor From 1b2935200c2092cdc4977f489b34e38036215618 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 26 May 2025 14:07:37 +0800 Subject: [PATCH 104/155] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 0a44007d9da9..8425e67681f8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -370,7 +370,7 @@ def _have_sent_tensor_id(self, tensor_id: str): if request_id not in self.send_request_id_to_tensor_ids: self.send_request_id_to_tensor_ids[request_id] = set() self.send_request_id_to_tensor_ids[request_id].add(tensor_id) - + def _have_received_tensor_id(self, tensor_id: str): request_id = tensor_id.split('#')[0] if request_id not in self.recv_request_id_to_tensor_ids: From 43ed2ad4d333f6c964eea2e244361b29b1076d06 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 26 May 2025 18:24:27 +0800 Subject: [PATCH 105/155] Support release KV Cache after sending is completed Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 35 ++++++++++++------- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 35 ++++++++++--------- vllm/v1/worker/gpu_model_runner.py | 3 +- 3 files changed, 42 insertions(+), 31 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 877d5a482e95..100f40ec26b3 100644 --- 
a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -88,7 +88,7 @@ def __init__(self, rank=rank, local_rank=local_rank) self._block_size = vllm_config.cache_config.block_size - self._requests_need_load: dict[str, Request] = {} + self._requests_need_load: dict[str, Any] = {} self.config = vllm_config.kv_transfer_config self.rank = rank self.is_producer = self.config.is_kv_producer @@ -332,7 +332,7 @@ def get_num_new_matched_tokens( "num_computed_tokens:%d", num_external_tokens, len(request.prompt_token_ids), num_computed_tokens) - return num_external_tokens, True + return num_external_tokens, False def update_state_after_alloc(self, request: "Request", blocks: "KVCacheBlocks", @@ -341,7 +341,8 @@ def update_state_after_alloc(self, request: "Request", Update KVConnector state after block allocation. """ if not self.is_producer and num_external_tokens > 0: - self._requests_need_load[request.request_id] = request + self._requests_need_load[request.request_id] = ( + request, blocks.get_block_ids()[0]) def build_connector_meta( self, @@ -361,7 +362,6 @@ def build_connector_meta( meta = P2pNcclConnectorMetadata() - total_need_load = 0 for new_req in scheduler_output.scheduled_new_reqs: if self.is_producer: num_scheduled_tokens = ( @@ -391,7 +391,7 @@ def build_connector_meta( token_ids=new_req.prompt_token_ids, block_ids=new_req.block_ids[0], block_size=self._block_size) - total_need_load += 1 + self._requests_need_load.pop(new_req.req_id) for cached_req in scheduler_output.scheduled_cached_reqs: if self.is_producer: @@ -431,7 +431,7 @@ def build_connector_meta( # NOTE(rob): cached_req_data does not have the full # list of token ids (only new tokens). So we look it # up in the actual request object. - request = self._requests_need_load[cached_req.req_id] + request, _ = self._requests_need_load.pop(cached_req.req_id) total_tokens = (len(cached_req.new_token_ids) + cached_req.num_computed_tokens) token_ids = request.all_token_ids[:total_tokens] @@ -444,9 +444,18 @@ def build_connector_meta( token_ids=token_ids, block_ids=block_ids, block_size=self._block_size) - total_need_load += 1 - assert total_need_load == len(self._requests_need_load) + # Requests loaded asynchronously are not in the scheduler_output. 
+ # for request_id in self._requests_need_load: + # request, block_ids = self._requests_need_load[request_id] + # meta.add_request(request_id=request.request_id, + # token_ids=request.prompt_token_ids, + # block_ids=block_ids, + # block_size=self._block_size) + + logger.debug("๐Ÿžbuild_connector_meta, _requests_need_load:%s", + self._requests_need_load) + self._requests_need_load.clear() return meta @@ -471,7 +480,7 @@ def request_finished( self.chunked_prefill.pop(request.request_id, None) - return False, None + return True, None # ============================== # Static methods @@ -479,8 +488,6 @@ def request_finished( @staticmethod def parse_request_id(request_id: str, is_prefill=True) -> tuple[str, int]: - logger.debug("parse_request_id, request_id: %s, is_prefill: %s", - request_id, is_prefill) # Regular expression to match the string hostname and integer port if is_prefill: pattern = r"___decode_addr_(.*):(\d+)" @@ -494,8 +501,10 @@ def parse_request_id(request_id: str, is_prefill=True) -> tuple[str, int]: ip = match.group(1) port = int(match.group(2)) - logger.debug("parse_request_id, request_id: %s, ip: %s, port: %s", - request_id, ip, str(port)) + logger.debug( + "parse_request_id, request_id: %s, ip: %s, port: %s, " + "is_prefill:%s", request_id, ip, str(port), is_prefill) + return ip, port raise ValueError( f"Request id {request_id} does not contain hostname and port") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 8425e67681f8..82db556dc254 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -383,20 +383,21 @@ def _send_async(self): while not self.send_queue: self.send_queue_cv.wait() tensor_id, remote_address, tensor = self.send_queue.popleft() - if not self.send_queue: - self.send_queue_cv.notify() + # if not self.send_queue: + # self.send_queue_cv.notify() self._send_sync(tensor_id, tensor, remote_address) def wait_for_sent(self): - if self.send_type == "PUT_ASYNC": - start_time = time.time() - with self.send_queue_cv: - while self.send_queue: - self.send_queue_cv.wait() - duration = time.time() - start_time - logger.info( - "๐Ÿšง[PUT_ASYNC]It took %.3fms to wait for the send_queue" - " to be empty, rank:%d", duration * 1000, self.rank) + return + # if self.send_type == "PUT_ASYNC": + # start_time = time.time() + # with self.send_queue_cv: + # while self.send_queue: + # self.send_queue_cv.wait() + # duration = time.time() - start_time + # logger.info( + # "๐Ÿšง[PUT_ASYNC]It took %.3fms to wait for the send_queue" + # " to be empty, rank:%d", duration * 1000, self.rank) def _send_sync( self, @@ -484,13 +485,13 @@ def get_finished( if (len(self.send_request_id_to_tensor_ids[request_id]) == num_layers): finished_sending.add(request_id) - + # Retrieve requests that have already received the KV cache. 
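parse_request_id works because the proxy embeds both ZMQ addresses directly in the request id, so a worker can locate its peer without any extra signalling. A self-contained illustration of the round trip; build_request_id is a stand-in for the proxy's formatting, and only the prefill-side lookup (the one save_kv_layer uses) is exercised here.

    import re
    import uuid

    def build_request_id(prefill_zmq: str, decode_zmq: str) -> str:
        return (f"___prefill_addr_{prefill_zmq}___decode_addr_"
                f"{decode_zmq}_{uuid.uuid4().hex}")

    def peer_of(request_id: str, is_prefill: bool = True) -> tuple[str, int]:
        pattern = (r"___decode_addr_(.*):(\d+)" if is_prefill
                   else r"___prefill_addr_(.*):(\d+)")
        match = re.search(pattern, request_id)
        if match is None:
            raise ValueError(f"no peer address embedded in {request_id!r}")
        return match.group(1), int(match.group(2))

    rid = build_request_id("10.0.0.1:21001", "10.0.0.2:22001")
    assert peer_of(rid, is_prefill=True) == ("10.0.0.2", 22001)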
- finished_recving: set[str] = set() - for request_id in self.recv_request_id_to_tensor_ids: - if (len(self.recv_request_id_to_tensor_ids[request_id]) == - num_layers): - finished_recving.add(request_id) + # finished_recving: set[str] = set() + # for request_id in self.recv_request_id_to_tensor_ids: + # if (len(self.recv_request_id_to_tensor_ids[request_id]) == + # num_layers): + # finished_recving.add(request_id) logger.debug("๐Ÿžget_finished, finished_sending:%s, finished_recving:%s", finished_sending, finished_recving) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 3d3b57548fd8..2fd51c14d308 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1479,7 +1479,8 @@ def get_finished_kv_transfers( ) -> tuple[Optional[set[str]], Optional[set[str]]]: if has_kv_transfer_group(): return get_kv_transfer_group().get_finished( - scheduler_output.finished_req_ids, get_forward_context()) + scheduler_output.finished_req_ids, + forward_context=get_forward_context()) return None, None def generate_draft_token_ids( From 38ae6c35c7f84a9fcf54a7be30f312c3f703e1aa Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 26 May 2025 19:34:54 +0800 Subject: [PATCH 106/155] Clear the buffer upon request completion(3/N) Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 2 +- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 37 +++++++++---------- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 100f40ec26b3..efe61bee4133 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -480,7 +480,7 @@ def request_finished( self.chunked_prefill.pop(request.request_id, None) - return True, None + return False, None # ============================== # Static methods diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 82db556dc254..30b3db286fe4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -383,21 +383,20 @@ def _send_async(self): while not self.send_queue: self.send_queue_cv.wait() tensor_id, remote_address, tensor = self.send_queue.popleft() - # if not self.send_queue: - # self.send_queue_cv.notify() + if not self.send_queue: + self.send_queue_cv.notify() self._send_sync(tensor_id, tensor, remote_address) def wait_for_sent(self): - return - # if self.send_type == "PUT_ASYNC": - # start_time = time.time() - # with self.send_queue_cv: - # while self.send_queue: - # self.send_queue_cv.wait() - # duration = time.time() - start_time - # logger.info( - # "๐Ÿšง[PUT_ASYNC]It took %.3fms to wait for the send_queue" - # " to be empty, rank:%d", duration * 1000, self.rank) + if self.send_type == "PUT_ASYNC": + start_time = time.time() + with self.send_queue_cv: + while self.send_queue: + self.send_queue_cv.wait() + duration = time.time() - start_time + logger.info( + "๐Ÿšง[PUT_ASYNC]It took %.3fms to wait for the send_queue" + " to be empty, rank:%d", duration * 1000, self.rank) def _send_sync( self, @@ -476,18 +475,18 @@ def get_finished( logger.debug("๐Ÿžget_finished, remove tensor_id:%s, addr:%d", tensor_id, addr) - num_layers = 
len(forward_context.no_compile_layers) + # num_layers = len(forward_context.no_compile_layers) # Retrieve requests that have already sent the KV cache. finished_sending: set[str] = set() - if self.send_type == "PUT_ASYNC" or self.send_type == "GET": - for request_id in self.send_request_id_to_tensor_ids: - if (len(self.send_request_id_to_tensor_ids[request_id]) == - num_layers): - finished_sending.add(request_id) + # if self.send_type == "PUT_ASYNC" or self.send_type == "GET": + # for request_id in self.send_request_id_to_tensor_ids: + # if (len(self.send_request_id_to_tensor_ids[request_id]) == + # num_layers): + # finished_sending.add(request_id) # Retrieve requests that have already received the KV cache. - # finished_recving: set[str] = set() + finished_recving: set[str] = set() # for request_id in self.recv_request_id_to_tensor_ids: # if (len(self.recv_request_id_to_tensor_ids[request_id]) == # num_layers): From bcd3dd9657157bb0bfda82784214f348a5905321 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 28 May 2025 14:58:59 +0800 Subject: [PATCH 107/155] Fix the issue of inaccurate results caused by preemption Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index efe61bee4133..3167773c1e2d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -329,8 +329,12 @@ def get_num_new_matched_tokens( num_computed_tokens) logger.info( "๐Ÿ’num_external_tokens:%d, num_prompt_tokens:%d, " - "num_computed_tokens:%d", num_external_tokens, - len(request.prompt_token_ids), num_computed_tokens) + "num_computed_tokens:%d, request_id:%s", num_external_tokens, + len(request.prompt_token_ids), num_computed_tokens, + request.request_id) + + if num_external_tokens < 0: + num_external_tokens = 0 return num_external_tokens, False @@ -428,18 +432,21 @@ def build_connector_meta( if not cached_req.resumed_from_preemption: break if cached_req.req_id in self._requests_need_load: - # NOTE(rob): cached_req_data does not have the full - # list of token ids (only new tokens). So we look it - # up in the actual request object. request, _ = self._requests_need_load.pop(cached_req.req_id) - total_tokens = (len(cached_req.new_token_ids) + - cached_req.num_computed_tokens) + total_tokens = cached_req.num_computed_tokens + 1 token_ids = request.all_token_ids[:total_tokens] # NOTE(rob): For resumed req, new_block_ids is all # of the block_ids for the request. 
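The preemption fix rests on one observation: a request resumed after preemption had its blocks freed, so the consumer rebuilds the load metadata from scratch, covering every already-computed token plus the token scheduled in this step (hence num_computed_tokens + 1) and the full block list that new_block_ids carries for resumed requests. A small sketch of that reconstruction, with an illustrative helper name:

    def resumed_load_meta(all_token_ids: list[int], num_computed_tokens: int,
                          new_block_ids: list[list[int]]):
        # For a request resumed from preemption, new_block_ids holds the full
        # block list again, and the KV to load covers the computed prefix plus
        # the token scheduled in this step.
        token_ids = all_token_ids[:num_computed_tokens + 1]
        return token_ids, new_block_ids[0]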
block_ids = cached_req.new_block_ids[0] + logger.debug( + "๐Ÿžbuild_connector_meta, req_id:%s, total_tokens:%d, " + "num_computed_tokens:%d, token_ids:%s, num_token_ids:%d, " + "block_ids:%s, num_block_ids:%d", cached_req.req_id, + total_tokens, cached_req.num_computed_tokens, token_ids, + len(token_ids), block_ids, len(block_ids)) + meta.add_request(request_id=cached_req.req_id, token_ids=token_ids, block_ids=block_ids, From 4b15e270739f0e4928324c1180b4ee2039dda249 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 28 May 2025 15:17:58 +0800 Subject: [PATCH 108/155] format Signed-off-by: Abatom --- .../disagg_xpyd/disagg_prefill_proxy_xpyd.py | 62 +++++++++---------- 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index 210dd546c898..d33e93d89f5e 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -32,17 +32,17 @@ def _listen_for_register(poller, router_socket): global prefill_instances global prefill_cv with prefill_cv: - prefill_instances[ - data["http_address"]] = data["zmq_address"] + prefill_instances[data["http_address"]] = data["zmq_address"] elif data["type"] == "D": global decode_instances global decode_cv with decode_cv: - decode_instances[ - data["http_address"]] = data["zmq_address"] + decode_instances[data["http_address"]] = data["zmq_address"] else: - print("Unexpected, Received message from %s, data: %s", - remote_address, data) + print( + "Unexpected, Received message from %s, data: %s", + remote_address, + data) def start_service_discovery(hostname, port): @@ -58,9 +58,9 @@ def start_service_discovery(hostname, port): poller = zmq.Poller() poller.register(router_socket, zmq.POLLIN) - _listener_thread = threading.Thread(target=_listen_for_register, - args=[poller, router_socket], - daemon=True) + _listener_thread = threading.Thread( + target=_listen_for_register, args=[poller, router_socket], daemon=True + ) _listener_thread.start() return _listener_thread @@ -78,29 +78,27 @@ async def forward_request(url, data, request_id): async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "X-Request-Id": request_id + "X-Request-Id": request_id, } - async with session.post(url=url, json=data, - headers=headers) as response: + async with session.post(url=url, json=data, headers=headers) as response: if response.status == 200: # if response.headers.get('Transfer-Encoding') == 'chunked': if True: - async for chunk_bytes in response.content.iter_chunked( - 1024): + async for chunk_bytes in response.content.iter_chunked(1024): yield chunk_bytes else: content = await response.read() yield content -@app.route('/v1/completions', methods=['POST']) +@app.route("/v1/completions", methods=["POST"]) async def handle_request(): try: original_request_data = await request.get_json() prefill_request = original_request_data.copy() # change max_tokens = 1 to let it only do prefill - prefill_request['max_tokens'] = 1 + prefill_request["max_tokens"] = 1 global count global prefill_instances @@ -109,8 +107,7 @@ async def handle_request(): # prefill_addr, prefill_zmq_addr = random.choice( # list(prefill_instances.items())) prefill_list = list(prefill_instances.items()) - prefill_addr, prefill_zmq_addr = prefill_list[count % - len(prefill_list)] + prefill_addr, 
prefill_zmq_addr = prefill_list[count % len(prefill_list)] global decode_instances global decode_cv @@ -118,26 +115,27 @@ async def handle_request(): # decode_addr, decode_zmq_addr = random.choice( # list(decode_instances.items())) decode_list = list(decode_instances.items()) - decode_addr, decode_zmq_addr = decode_list[count % - len(decode_list)] + decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)] - print(f"handle_request count: {count}, [HTTP:{prefill_addr}, " - f"ZMQ:{prefill_zmq_addr}] ๐Ÿ‘‰ [HTTP:{decode_addr}, " - f"ZMQ:{decode_zmq_addr}]") + print( + f"handle_request count: {count}, [HTTP:{prefill_addr}, " + f"ZMQ:{prefill_zmq_addr}] ๐Ÿ‘‰ [HTTP:{decode_addr}, " + f"ZMQ:{decode_zmq_addr}]" + ) count += 1 - request_id = ( - f"___prefill_addr_{prefill_zmq_addr}___decode_addr_{decode_zmq_addr}_{random_uuid()}" - ) + request_id = (f"___prefill_addr_{prefill_zmq_addr}___decode_addr_{decode_zmq_addr}_{random_uuid()}") # finish prefill - async for _ in forward_request(f'http://{prefill_addr}/v1/completions', - prefill_request, request_id): + async for _ in forward_request( + f'http://{prefill_addr}/v1/completions', prefill_request, request_id + ): continue # return decode - generator = forward_request(f'http://{decode_addr}/v1/completions', - original_request_data, request_id) + generator = forward_request( + f'http://{decode_addr}/v1/completions', original_request_data, request_id + ) response = await make_response(generator) response.timeout = None @@ -152,7 +150,7 @@ async def handle_request(): print("".join(traceback.format_exception(*exc_info))) -if __name__ == '__main__': +if __name__ == "__main__": t = start_service_discovery("0.0.0.0", 30001) - app.run(host='0.0.0.0', port=10001) + app.run(host="0.0.0.0", port=10001) t.join() From bcc608fde5632e640b806d5f1f23afd2f5c66508 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 28 May 2025 15:27:33 +0800 Subject: [PATCH 109/155] format Signed-off-by: Abatom --- .../disagg_xpyd/disagg_prefill_proxy_xpyd.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index d33e93d89f5e..4864b45f9993 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -42,7 +42,8 @@ def _listen_for_register(poller, router_socket): print( "Unexpected, Received message from %s, data: %s", remote_address, - data) + data, + ) def start_service_discovery(hostname, port): @@ -124,17 +125,17 @@ async def handle_request(): ) count += 1 - request_id = (f"___prefill_addr_{prefill_zmq_addr}___decode_addr_{decode_zmq_addr}_{random_uuid()}") + request_id = f"___prefill_addr_{prefill_zmq_addr}___decode_addr_{decode_zmq_addr}_{random_uuid()}" # finish prefill async for _ in forward_request( - f'http://{prefill_addr}/v1/completions', prefill_request, request_id + f"http://{prefill_addr}/v1/completions", prefill_request, request_id ): continue # return decode generator = forward_request( - f'http://{decode_addr}/v1/completions', original_request_data, request_id + f"http://{decode_addr}/v1/completions", original_request_data, request_id ) response = await make_response(generator) response.timeout = None @@ -144,6 +145,7 @@ async def handle_request(): except Exception as e: import sys import traceback + exc_info = sys.exc_info() print("Error occurred in disagg prefill proxy server") print(e) From 
5c964909140327d2b67c1e5fb5f187862049e034 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 28 May 2025 15:37:23 +0800 Subject: [PATCH 110/155] format Signed-off-by: Abatom --- .../online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index 4864b45f9993..8bc12f692094 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -125,7 +125,8 @@ async def handle_request(): ) count += 1 - request_id = f"___prefill_addr_{prefill_zmq_addr}___decode_addr_{decode_zmq_addr}_{random_uuid()}" + request_id = (f"___prefill_addr_{prefill_zmq_addr}___decode_addr_" + f"{decode_zmq_addr}_{random_uuid()}") # finish prefill async for _ in forward_request( From af150dd369b016e00898984e4a1bb649c9d08f9a Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 28 May 2025 15:43:46 +0800 Subject: [PATCH 111/155] format Signed-off-by: Abatom --- .../online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index 8bc12f692094..b82f9383a1f8 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -125,8 +125,10 @@ async def handle_request(): ) count += 1 - request_id = (f"___prefill_addr_{prefill_zmq_addr}___decode_addr_" - f"{decode_zmq_addr}_{random_uuid()}") + request_id = ( + f"___prefill_addr_{prefill_zmq_addr}___decode_addr_" + f"{decode_zmq_addr}_{random_uuid()}" + ) # finish prefill async for _ in forward_request( From a7ccdca46947397b6a7a891df9e1dc0b8f62eff9 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 28 May 2025 17:09:37 +0800 Subject: [PATCH 112/155] bugfix for KVCacheManager.get_block_ids Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 2 +- vllm/v1/core/kv_cache_manager.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 3167773c1e2d..fdf228ec6cb0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -327,7 +327,7 @@ def get_num_new_matched_tokens( num_external_tokens = (len(request.prompt_token_ids) - 1 - num_computed_tokens) - logger.info( + logger.debug( "๐Ÿ’num_external_tokens:%d, num_prompt_tokens:%d, " "num_computed_tokens:%d, request_id:%s", num_external_tokens, len(request.prompt_token_ids), num_computed_tokens, diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index da18ece7555a..e65a56e4234f 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -366,6 +366,7 @@ def take_events(self) -> list[KVCacheEvent]: def get_block_ids(self, request_id: str) -> list[list[int]]: """Get the block ids of a request.""" - assert request_id in self.single_type_manager.req_to_blocks + if request_id not in self.single_type_manager.req_to_blocks: + return [[]] return KVCacheBlocks(self.single_type_manager.req_to_blocks[request_id] ).get_block_ids() From 
4707612c883b1210b065ffcba2385685b337604f Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 29 May 2025 10:17:50 +0800 Subject: [PATCH 113/155] get_block_ids Signed-off-by: Abatom --- vllm/v1/core/kv_cache_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index e65a56e4234f..4ad03ef2c57d 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -367,6 +367,6 @@ def take_events(self) -> list[KVCacheEvent]: def get_block_ids(self, request_id: str) -> list[list[int]]: """Get the block ids of a request.""" if request_id not in self.single_type_manager.req_to_blocks: - return [[]] + return KVCacheBlocks.create_empty().get_block_ids() return KVCacheBlocks(self.single_type_manager.req_to_blocks[request_id] ).get_block_ids() From 066b34792342af63446466783c57aa4254d56613 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 29 May 2025 16:18:17 +0800 Subject: [PATCH 114/155] use copy_ Signed-off-by: Abatom --- vllm/tensor_memory_pool.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/vllm/tensor_memory_pool.py b/vllm/tensor_memory_pool.py index 31857bbedaf3..b9c8d9442709 100644 --- a/vllm/tensor_memory_pool.py +++ b/vllm/tensor_memory_pool.py @@ -36,9 +36,6 @@ def __init__(self, max_block_size: int, min_block_size: int = 512): atexit.register(self.cleanup) - self.store_stream = torch.cuda.Stream() - self.load_stream = torch.cuda.Stream() - def _round_to_power_of_two(self, size: int) -> int: return 1 << (size - 1).bit_length() @@ -135,14 +132,12 @@ def store_tensor(self, tensor: torch.Tensor) -> int: buffer = (ctypes.c_byte * block.size).from_address(block.addr) cpu_tensor = torch.frombuffer(buffer, dtype=tensor.dtype, - count=tensor.numel()) + count=tensor.numel()).reshape(tensor.shape) except ValueError as err: self.free(addr) raise MemoryError(f"Failed to create tensor view: {err}") from err - with torch.cuda.stream(self.store_stream): - ops.store_tensor(tensor, cpu_tensor) - self.store_stream.synchronize() + cpu_tensor.copy_(tensor) return addr @@ -160,13 +155,13 @@ def load_tensor(self, addr: int, dtype: torch.dtype, raise ValueError("Requested tensor size exceeds block size") buffer = (ctypes.c_byte * block.size).from_address(block.addr) - cpu_tensor = torch.frombuffer(buffer, dtype=dtype, count=num_elements) + cpu_tensor = torch.frombuffer(buffer, + dtype=dtype, + count=num_elements).reshape(shape) cuda_tensor = torch.empty(shape, dtype=dtype, device=device) - with torch.cuda.stream(self.load_stream): - ops.load_tensor(cpu_tensor, cuda_tensor) - self.load_stream.synchronize() + cuda_tensor.copy_(cpu_tensor) return cuda_tensor From e384e7d273349969ec4159bc0e27c98c49f68614 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 29 May 2025 16:29:15 +0800 Subject: [PATCH 115/155] format Signed-off-by: Abatom --- vllm/tensor_memory_pool.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm/tensor_memory_pool.py b/vllm/tensor_memory_pool.py index b9c8d9442709..a4ca13d6661c 100644 --- a/vllm/tensor_memory_pool.py +++ b/vllm/tensor_memory_pool.py @@ -7,8 +7,6 @@ import torch -from vllm import _custom_ops as ops - @dataclass class MemoryBlock: @@ -132,7 +130,8 @@ def store_tensor(self, tensor: torch.Tensor) -> int: buffer = (ctypes.c_byte * block.size).from_address(block.addr) cpu_tensor = torch.frombuffer(buffer, dtype=tensor.dtype, - count=tensor.numel()).reshape(tensor.shape) + count=tensor.numel()).reshape( + tensor.shape) 
except ValueError as err: self.free(addr) raise MemoryError(f"Failed to create tensor view: {err}") from err @@ -155,8 +154,7 @@ def load_tensor(self, addr: int, dtype: torch.dtype, raise ValueError("Requested tensor size exceeds block size") buffer = (ctypes.c_byte * block.size).from_address(block.addr) - cpu_tensor = torch.frombuffer(buffer, - dtype=dtype, + cpu_tensor = torch.frombuffer(buffer, dtype=dtype, count=num_elements).reshape(shape) cuda_tensor = torch.empty(shape, dtype=dtype, device=device) From 5cc96861090aea320feeb5b9cd741f9f10982e4e Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 29 May 2025 16:33:36 +0800 Subject: [PATCH 116/155] remove tensor_store_load_mem.cu Signed-off-by: Abatom --- CMakeLists.txt | 1 - csrc/ops.h | 2 - csrc/tensor_store_load_mem.cu | 107 ---------------------------------- csrc/torch_bindings.cpp | 9 --- vllm/_custom_ops.py | 10 +--- 5 files changed, 1 insertion(+), 128 deletions(-) delete mode 100644 csrc/tensor_store_load_mem.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 2dcf7a62c6f2..6a1ed588749a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -251,7 +251,6 @@ set(VLLM_EXT_SRC "csrc/cuda_utils_kernels.cu" "csrc/prepare_inputs/advance_step.cu" "csrc/custom_all_reduce.cu" - "csrc/tensor_store_load_mem.cu" "csrc/torch_bindings.cpp") if(VLLM_GPU_LANG STREQUAL "CUDA") diff --git a/csrc/ops.h b/csrc/ops.h index 646abb724ac0..7044b4588b81 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -345,5 +345,3 @@ std::tuple allocate_shared_buffer_and_handle( int64_t size); int64_t open_mem_handle(torch::Tensor& mem_handle); void free_shared_buffer(int64_t buffer); -void store_tensor(torch::Tensor& device_tensor, torch::Tensor& host_tensor); -void load_tensor(torch::Tensor& host_tensor, torch::Tensor& device_tensor); diff --git a/csrc/tensor_store_load_mem.cu b/csrc/tensor_store_load_mem.cu deleted file mode 100644 index 449b18552813..000000000000 --- a/csrc/tensor_store_load_mem.cu +++ /dev/null @@ -1,107 +0,0 @@ -#include -#include -#include -#include - -// Template-based CUDA kernel: Copy from device memory to pinned host memory -template -__global__ void store_kernel(const scalar_t* device_ptr, scalar_t* host_ptr, - size_t num_elements) { - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_elements) { - host_ptr[idx] = device_ptr[idx]; - } -} - -// Templated CUDA kernel: Copy from pinned host memory to device memory -template -__global__ void load_kernel(const scalar_t* host_ptr, scalar_t* device_ptr, - size_t num_elements) { - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < num_elements) { - device_ptr[idx] = host_ptr[idx]; - } -} - -// Templated wrapper function: Store Tensor to pinned memory -template -void store_tensor_impl(torch::Tensor& device_tensor, - torch::Tensor& host_tensor) { - const auto num_elements = device_tensor.numel(); - const int threads = 256; - const int blocks = (num_elements + threads - 1) / threads; - - auto device_ptr = device_tensor.data_ptr(); - auto host_ptr = host_tensor.data_ptr(); - - store_kernel - <<>>( - device_ptr, host_ptr, num_elements); -} - -// Templated wrapper function: Load Tensor from pinned memory -template -void load_tensor_impl(torch::Tensor& host_tensor, - torch::Tensor& device_tensor) { - const auto num_elements = host_tensor.numel(); - const int threads = 256; - const int blocks = (num_elements + threads - 1) / threads; - - auto host_ptr = host_tensor.data_ptr(); - auto device_ptr = device_tensor.data_ptr(); - - load_kernel - <<>>( - host_ptr, device_ptr, 
num_elements); -} - -// Type-dispatched wrapper function -void store_tensor(torch::Tensor& device_tensor, torch::Tensor& host_tensor) { - // Validate arguments - TORCH_CHECK(device_tensor.is_cuda(), "Input tensor must be a CUDA tensor"); - TORCH_CHECK(host_tensor.is_pinned(), "Output tensor must be pinned memory"); - TORCH_CHECK(device_tensor.numel() == host_tensor.numel(), - "Tensors must have same number of elements"); - TORCH_CHECK(device_tensor.dtype() == host_tensor.dtype(), - "Tensors must have same dtype"); - - // Type-based dispatch to different implementations - switch (device_tensor.scalar_type()) { - case torch::kFloat: - store_tensor_impl(device_tensor, host_tensor); - break; - case torch::kHalf: - store_tensor_impl(device_tensor, host_tensor); - break; - case torch::kBFloat16: - store_tensor_impl(device_tensor, host_tensor); - break; - default: - TORCH_CHECK("Unsupported data type: ", device_tensor.scalar_type()); - } -} - -void load_tensor(torch::Tensor& host_tensor, torch::Tensor& device_tensor) { - // Validate arguments - TORCH_CHECK(device_tensor.is_cuda(), "Output tensor must be a CUDA tensor"); - TORCH_CHECK(host_tensor.is_pinned(), "Input tensor must be pinned memory"); - TORCH_CHECK(device_tensor.numel() == host_tensor.numel(), - "Tensors must have same number of elements"); - TORCH_CHECK(device_tensor.dtype() == host_tensor.dtype(), - "Tensors must have same dtype"); - - // Type-based dispatch to different implementations - switch (host_tensor.scalar_type()) { - case torch::kFloat: - load_tensor_impl(host_tensor, device_tensor); - break; - case torch::kHalf: - load_tensor_impl(host_tensor, device_tensor); - break; - case torch::kBFloat16: - load_tensor_impl(host_tensor, device_tensor); - break; - default: - TORCH_CHECK("Unsupported data type: ", host_tensor.scalar_type()); - } -} diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 78f6be635ebf..4eda1aaccc6b 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -703,13 +703,4 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { custom_ar.def("free_shared_buffer", &free_shared_buffer); } -TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _mem_pool), mem_pool) { - // Store Tensor to pinned memory - mem_pool.def("store_tensor(Tensor device_tensor, Tensor! host_tensor) -> ()"); - mem_pool.impl("store_tensor", torch::kCUDA, &store_tensor); - // Load Tensor from pinned memory - mem_pool.def("load_tensor(Tensor host_tensor, Tensor! 
device_tensor) -> ()"); - mem_pool.impl("load_tensor", torch::kCUDA, &load_tensor); -} - REGISTER_EXTENSION(TORCH_EXTENSION_NAME) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 1f024b6cb30c..91a037e7fcb3 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1739,12 +1739,4 @@ def cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor, scale: float) -> torch.Tensor: torch.ops._C.cutlass_mla_decode(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale) - return out - - -def store_tensor(device_tensor: torch.Tensor, host_tensor: torch.Tensor): - torch.ops._C_mem_pool.store_tensor(device_tensor, host_tensor) - - -def load_tensor(host_tensor: torch.Tensor, device_tensor: torch.Tensor): - torch.ops._C_mem_pool.load_tensor(host_tensor, device_tensor) + return out \ No newline at end of file From 6825ed7c9134f3e6011c0008329031ccfd9e22e8 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 29 May 2025 16:34:33 +0800 Subject: [PATCH 117/155] remove tensor_store_load_mem.cu Signed-off-by: Abatom --- vllm/_custom_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 91a037e7fcb3..3c8e6b95ce76 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1739,4 +1739,4 @@ def cutlass_mla_decode(out: torch.Tensor, q_nope: torch.Tensor, scale: float) -> torch.Tensor: torch.ops._C.cutlass_mla_decode(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale) - return out \ No newline at end of file + return out From a805c4279cd9e693fd6a6852c77e9c69146a170c Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 29 May 2025 17:08:35 +0800 Subject: [PATCH 118/155] format Signed-off-by: Abatom --- .../online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py | 4 ---- vllm/distributed/kv_transfer/kv_connector/factory.py | 1 - 2 files changed, 5 deletions(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index b82f9383a1f8..50916dd5a026 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -105,16 +105,12 @@ async def handle_request(): global prefill_instances global prefill_cv with prefill_cv: - # prefill_addr, prefill_zmq_addr = random.choice( - # list(prefill_instances.items())) prefill_list = list(prefill_instances.items()) prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)] global decode_instances global decode_cv with decode_cv: - # decode_addr, decode_zmq_addr = random.choice( - # list(decode_instances.items())) decode_list = list(decode_instances.items()) decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)] diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index e092a6097060..dc9f122d3aaa 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -85,7 +85,6 @@ def create_connector_v1(cls, # Register various connectors here. # The registration should not be done in each individual file, as we want to # only load the files corresponding to the current connector. 
- KVConnectorFactory.register_connector( "PyNcclConnector", "vllm.distributed.kv_transfer.kv_connector.simple_connector", From afa7552e99e73866dc1e7ea829e1f2a6dcebdfee Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 29 May 2025 17:34:41 +0800 Subject: [PATCH 119/155] Misc Signed-off-by: Abatom --- .../device_communicators/pynccl_wrapper.py | 13 ------------- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 12 +++--------- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/vllm/distributed/device_communicators/pynccl_wrapper.py b/vllm/distributed/device_communicators/pynccl_wrapper.py index 8fd0e9d8603d..f3d069d828a6 100644 --- a/vllm/distributed/device_communicators/pynccl_wrapper.py +++ b/vllm/distributed/device_communicators/pynccl_wrapper.py @@ -272,22 +272,9 @@ def ncclGetUniqueId(self) -> ncclUniqueId: return unique_id def unique_id_from_bytes(self, data: bytes) -> ncclUniqueId: - """ - Reconstructs an `ncclUniqueId` object from bytes data. - - Args: - data: Must be a 128-byte data block (matching NCCL's unique_id). - - Returns: - ncclUniqueId: The reconstructed NCCL Unique ID object. - - Raises: - ValueError: If the input data length is not 128 bytes. - """ if len(data) != 128: raise ValueError( f"Expected 128 bytes for ncclUniqueId, got {len(data)} bytes") - unique_id = ncclUniqueId() ctypes.memmove(ctypes.addressof(unique_id.internal), data, 128) return unique_id diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 30b3db286fe4..9d66a550fcec 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -204,16 +204,12 @@ def recv_tensor( while tensor_id not in self.recv_store: self.recv_store_cv.wait() tensor = self.recv_store[tensor_id] - # self.recv_store[tensor_id] = None - # while len(self.recv_store) > 10000: - # self.recv_store.pop(next(iter(self.recv_store))) if tensor is not None: if isinstance(tensor, tuple): addr, dtype, shape = tensor tensor = self.pool.load_tensor(addr, dtype, shape, self.device) - # self.pool.free(addr) else: addr = 0 self.buffer_size -= (tensor.element_size() * @@ -421,9 +417,6 @@ def _send_sync( response = sock.recv() if response != b"0": - # with self.send_queue_cv: - # self.send_queue.append([tensor_id, remote_address, tensor]) - # self.send_queue_cv.notify() logger.warning( "๐Ÿ”ดSend Tensor, Peer Out Of Memory/Threshold, %s ๐Ÿ‘‰ %s, " "MyRank:%s, data:%s, tensor:%s, size:%fGB, response:%s", @@ -437,8 +430,9 @@ def _send_sync( if self.send_type == "PUT_ASYNC": self._have_sent_tensor_id(tensor_id) - logger.info("๐Ÿ”ตSend Tensor, %s๐Ÿ‘‰%s, MyRank:%s, data:%s, tensor:%s", - self.zmq_address, remote_address, rank, data, tensor.shape) + logger.info( + "๐Ÿ”ตSend Tensor, %s๐Ÿ‘‰%s, MyRank:%s, data:%s", self.zmq_address, + remote_address, rank, data) return True def get_finished( From 47ee59acd17c930883ad1cdfe3e76aa696880007 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 29 May 2025 17:45:05 +0800 Subject: [PATCH 120/155] Misc Signed-off-by: Abatom --- .../online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py index 50916dd5a026..73f2caaa0dbd 100644 --- a/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py +++ 
b/examples/online_serving/disagg_xpyd/disagg_prefill_proxy_xpyd.py @@ -26,8 +26,6 @@ def _listen_for_register(poller, router_socket): # data: {"type": "P", "http_address": "ip:port", # "zmq_address": "ip:port"} data = msgpack.loads(message) - # print("Received message from %s, data: %s", - # remote_address.decode(), data) if data["type"] == "P": global prefill_instances global prefill_cv @@ -83,7 +81,6 @@ async def forward_request(url, data, request_id): } async with session.post(url=url, json=data, headers=headers) as response: if response.status == 200: - # if response.headers.get('Transfer-Encoding') == 'chunked': if True: async for chunk_bytes in response.content.iter_chunked(1024): yield chunk_bytes From 135e906f86c5bb42387d1e0ce3590c40092b9040 Mon Sep 17 00:00:00 2001 From: Abatom Date: Thu, 29 May 2025 18:09:20 +0800 Subject: [PATCH 121/155] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 9d66a550fcec..308e8f43e229 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -430,9 +430,8 @@ def _send_sync( if self.send_type == "PUT_ASYNC": self._have_sent_tensor_id(tensor_id) - logger.info( - "๐Ÿ”ตSend Tensor, %s๐Ÿ‘‰%s, MyRank:%s, data:%s", self.zmq_address, - remote_address, rank, data) + logger.info("๐Ÿ”ตSend Tensor, %s๐Ÿ‘‰%s, MyRank:%s, data:%s", + self.zmq_address, remote_address, rank, data) return True def get_finished( From 8af556a5902af50c56a830ccd63bd93356430a90 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 31 May 2025 12:43:23 +0800 Subject: [PATCH 122/155] get_finished Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index 286f182101c7..c8841fe3279e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -106,7 +106,7 @@ def get_finished( finished_sending: set[str] = set() finished_recving: set[str] = set() for c in self._connectors: - sending, recving = c.get_finished(finished_req_ids) + sending, recving = c.get_finished(finished_req_ids, kwargs) if not recving and not sending: continue # Aggregate finished recving request ids. 
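Note on the get_finished() call changed in [PATCH 122/155] above and corrected in [PATCH 123/155] below: passing `kwargs` as a plain positional argument binds the whole dict to a second positional parameter, whereas `**kwargs` unpacks it into keyword arguments, which is what a `get_finished(self, finished_req_ids, **kwargs)` signature expects. A minimal sketch of the difference, using a hypothetical connector class rather than the actual vLLM connector:

    class DemoConnector:
        def get_finished(self, finished_req_ids, **kwargs):
            # kwargs receives keyword arguments such as forward_context=...
            return set(), set()

    c = DemoConnector()
    extra = {"forward_context": None}
    c.get_finished(set(), **extra)  # OK: dict unpacked into keyword arguments
    c.get_finished(set(), extra)    # TypeError: takes 2 positional arguments but 3 were given
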
From 5c1c552544d859991fe954315c9ec4fd90c6bd4c Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 31 May 2025 13:36:49 +0800 Subject: [PATCH 123/155] get_finished Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index c8841fe3279e..c39fa7589908 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -106,7 +106,7 @@ def get_finished( finished_sending: set[str] = set() finished_recving: set[str] = set() for c in self._connectors: - sending, recving = c.get_finished(finished_req_ids, kwargs) + sending, recving = c.get_finished(finished_req_ids, **kwargs) if not recving and not sending: continue # Aggregate finished recving request ids. From 18d40b17b75306e9027d3ec5b54c2cbc510390bd Mon Sep 17 00:00:00 2001 From: Abatom Date: Sun, 1 Jun 2025 11:31:10 +0800 Subject: [PATCH 124/155] use get_world_group().local_rank Signed-off-by: Abatom --- .../kv_transfer/kv_connector/factory.py | 12 ++++++------ .../kv_transfer/kv_connector/v1/base.py | 11 ++++------- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 18 +++++------------- .../kv_transfer/kv_transfer_state.py | 5 +---- 4 files changed, 16 insertions(+), 30 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 31233edc150e..f0a397bf9ec6 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -49,11 +49,11 @@ def create_connector_v0(cls, rank: int, local_rank: int, return connector_cls(rank, local_rank, config) @classmethod - def create_connector_v1(cls, - config: "VllmConfig", - role: KVConnectorRole, - rank: int = 0, - local_rank: int = 0) -> KVConnectorBase_V1: + def create_connector_v1( + cls, + config: "VllmConfig", + role: KVConnectorRole, + ) -> KVConnectorBase_V1: if not envs.VLLM_USE_V1: raise ValueError("Attempting to initialize a V1 Connector, " f"but found {envs.VLLM_USE_V1=}") @@ -80,7 +80,7 @@ def create_connector_v1(cls, # - Co-locate with worker process # - Should only be used inside the forward context & attention layer # We build separately to enforce strict separation - return connector_cls(config, role, rank, local_rank) + return connector_cls(config, role) # Register various connectors here. diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index b40356c3ba69..c3b9a8dd9e6f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -26,6 +26,7 @@ import torch +from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput @@ -57,19 +58,15 @@ class KVConnectorMetadata: class KVConnectorBase_V1(ABC): - def __init__(self, - vllm_config: "VllmConfig", - role: KVConnectorRole, - rank: int = 0, - local_rank: int = 0): + def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): logger.warning( "Initializing KVConnectorBase_V1. 
This API is experimental and " "subject to change in the future as we iterate the design.") self._connector_metadata = KVConnectorMetadata() self._vllm_config = vllm_config self._role = role - self._rank = rank - self._local_rank = local_rank + self._rank = get_world_group().rank + self._local_rank = get_world_group().local_rank @property def role(self) -> KVConnectorRole: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index fdf228ec6cb0..eedd94d7f25f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 -import re from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Optional +import regex as re import torch from vllm.config import VllmConfig @@ -78,27 +78,19 @@ def add_request( class P2pNcclConnector(KVConnectorBase_V1): - def __init__(self, - vllm_config: "VllmConfig", - role: KVConnectorRole, - rank: int = 0, - local_rank: int = 0): - super().__init__(vllm_config=vllm_config, - role=role, - rank=rank, - local_rank=local_rank) + def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): + super().__init__(vllm_config=vllm_config, role=role) self._block_size = vllm_config.cache_config.block_size self._requests_need_load: dict[str, Any] = {} self.config = vllm_config.kv_transfer_config - self.rank = rank self.is_producer = self.config.is_kv_producer self.chunked_prefill: dict[str, Any] = {} self.p2p_nccl_engine = P2pNcclEngine( - local_rank=local_rank, + local_rank=self._local_rank, config=self.config, hostname="", - port_offset=rank, + port_offset=self._rank, ) if role == KVConnectorRole.WORKER else None # ============================== diff --git a/vllm/distributed/kv_transfer/kv_transfer_state.py b/vllm/distributed/kv_transfer/kv_transfer_state.py index 96fb4bcf814b..25d2f2cf5c6e 100644 --- a/vllm/distributed/kv_transfer/kv_transfer_state.py +++ b/vllm/distributed/kv_transfer/kv_transfer_state.py @@ -61,10 +61,7 @@ def ensure_kv_transfer_initialized(vllm_config: "VllmConfig") -> None: and _KV_CONNECTOR_AGENT is None): if envs.VLLM_USE_V1: _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector_v1( - config=vllm_config, - role=KVConnectorRole.WORKER, - rank=get_world_group().rank, - local_rank=get_world_group().local_rank) + config=vllm_config, role=KVConnectorRole.WORKER) else: _KV_CONNECTOR_AGENT = KVConnectorFactory.create_connector_v0( rank=get_world_group().rank, From 3003a10ed8e787ec7aba37ee3f2936934bb0578f Mon Sep 17 00:00:00 2001 From: Abatom Date: Sun, 1 Jun 2025 13:01:46 +0800 Subject: [PATCH 125/155] modify log Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 308e8f43e229..ec77137cfb5b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -116,7 +116,7 @@ def __init__(self, self._ping_thread.start() logger.info( - "๐Ÿ’ฏP2pNcclEngine init, rank:%d, local_rank:%d, http_address:%s" + "๐Ÿ’ฏP2pNcclEngine init, rank:%d, local_rank:%d, http_address:%s, " "zmq_address:%s, proxy_address:%s, send_type:%s, buffer_size_" "threshold:%.2f", self.rank, 
self.local_rank, self.http_address, self.zmq_address, self.proxy_address, self.send_type, From c71b97fc6fff095affe7f0cee5f29df4248d7e83 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sun, 1 Jun 2025 13:22:50 +0800 Subject: [PATCH 126/155] bugfix Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 3 --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 6 ++++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index c3b9a8dd9e6f..cc781c7cc4dc 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -26,7 +26,6 @@ import torch -from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput @@ -65,8 +64,6 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): self._connector_metadata = KVConnectorMetadata() self._vllm_config = vllm_config self._role = role - self._rank = get_world_group().rank - self._local_rank = get_world_group().local_rank @property def role(self) -> KVConnectorRole: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index eedd94d7f25f..d2a3faf19e84 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -11,6 +11,7 @@ KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) from vllm.distributed.kv_transfer.kv_connector.v1.p2p.p2p_nccl_engine import ( P2pNcclEngine) +from vllm.distributed.parallel_state import get_world_group from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput @@ -86,6 +87,11 @@ def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): self.is_producer = self.config.is_kv_producer self.chunked_prefill: dict[str, Any] = {} + self._rank = get_world_group().rank \ + if role == KVConnectorRole.WORKER else 0 + self._local_rank = get_world_group().local_rank \ + if role == KVConnectorRole.WORKER else 0 + self.p2p_nccl_engine = P2pNcclEngine( local_rank=self._local_rank, config=self.config, From f6460c328dabb0246fdc113b044c9f2cc8d78d90 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sun, 1 Jun 2025 14:17:04 +0800 Subject: [PATCH 127/155] SharedStorageConnector.__init__ Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/shared_storage_connector.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 239ae0f4b8f1..0421a65a2c81 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -74,11 +74,7 @@ class SharedStorageConnector(KVConnectorBase_V1): # It does extra work which will overwrite the existing prefix-cache in GPU # - to remove the overhead, need to add some "mask" in the ReqMeta class - def __init__(self, - vllm_config: "VllmConfig", - role: KVConnectorRole, - rank: int = 0, - local_rank: int = 0): + def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): super().__init__(vllm_config=vllm_config, role=role) 
self._block_size = vllm_config.cache_config.block_size self._requests_need_load: dict[str, Request] = {} From bc5872bc6e21ef83baf4ceffaddb11ffc4c902f2 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 2 Jun 2025 10:23:04 +0800 Subject: [PATCH 128/155] get_forward_context() Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index d2a3faf19e84..408257c702c8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -12,6 +12,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.p2p.p2p_nccl_engine import ( P2pNcclEngine) from vllm.distributed.parallel_state import get_world_group +from vllm.forward_context import get_forward_context from vllm.logger import init_logger from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.core.sched.output import SchedulerOutput @@ -294,7 +295,7 @@ def get_finished( assert self.p2p_nccl_engine is not None - forward_context: ForwardContext = kwargs.get("forward_context") + forward_context: ForwardContext = get_forward_context() return self.p2p_nccl_engine.get_finished(finished_req_ids, forward_context) From 4e362034c80e35a77861e1104e29932287bd26a2 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 2 Jun 2025 11:38:14 +0800 Subject: [PATCH 129/155] forward_context Signed-off-by: Abatom --- vllm/distributed/kv_transfer/kv_connector/v1/base.py | 4 ++-- .../kv_transfer/kv_connector/v1/multi_connector.py | 6 +++--- .../kv_transfer/kv_connector/v1/nixl_connector.py | 4 ++-- vllm/v1/worker/gpu_model_runner.py | 3 +-- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index cc781c7cc4dc..bc9258e9d07b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -176,8 +176,8 @@ def wait_for_save(self): pass def get_finished( - self, finished_req_ids: set[str], - **kwargs) -> tuple[Optional[set[str]], Optional[set[str]]]: + self, finished_req_ids: set[str] + ) -> tuple[Optional[set[str]], Optional[set[str]]]: """ Notifies worker-side connector ids of requests that have finished generating tokens. diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index c39fa7589908..0aabb260fd3d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -101,12 +101,12 @@ def wait_for_save(self): c.wait_for_save() def get_finished( - self, finished_req_ids: set[str], - **kwargs) -> tuple[Optional[set[str]], Optional[set[str]]]: + self, finished_req_ids: set[str] + ) -> tuple[Optional[set[str]], Optional[set[str]]]: finished_sending: set[str] = set() finished_recving: set[str] = set() for c in self._connectors: - sending, recving = c.get_finished(finished_req_ids, **kwargs) + sending, recving = c.get_finished(finished_req_ids) if not recving and not sending: continue # Aggregate finished recving request ids. 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 66708e789741..f02434aeb5ca 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -139,8 +139,8 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): assert self.connector_worker is not None self.connector_worker.register_kv_caches(kv_caches) - def get_finished(self, finished_req_ids: set[str], - **kwargs) -> tuple[set[str], set[str]]: + def get_finished(self, + finished_req_ids: set[str]) -> tuple[set[str], set[str]]: """Get the finished recving and sending requests.""" assert self.connector_worker is not None return self.connector_worker.get_finished() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 66fdbc4bd2d7..60425a4e1581 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1498,8 +1498,7 @@ def get_finished_kv_transfers( ) -> tuple[Optional[set[str]], Optional[set[str]]]: if has_kv_transfer_group(): return get_kv_transfer_group().get_finished( - scheduler_output.finished_req_ids, - forward_context=get_forward_context()) + scheduler_output.finished_req_ids) return None, None def generate_draft_token_ids( From bbf257c55f31ed4e621525f8644367f87e2eaa49 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 4 Jun 2025 10:14:16 +0800 Subject: [PATCH 130/155] clean code Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index ec77137cfb5b..45939a34b9cb 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -76,8 +76,8 @@ def __init__(self, self.send_stream = torch.cuda.Stream() self.recv_stream = torch.cuda.Stream() - mem_pool_size = self.config.get_from_extra_config("mem_pool_size", 128) - self.pool = TensorMemoryPool(max_block_size=mem_pool_size * + mem_pool_size_gb = self.config.get_from_extra_config("mem_pool_size_gb", 128) + self.pool = TensorMemoryPool(max_block_size=mem_pool_size_gb * 1024**3) # GB # The sending type includes tree mutually exclusive options: @@ -468,22 +468,11 @@ def get_finished( logger.debug("๐Ÿžget_finished, remove tensor_id:%s, addr:%d", tensor_id, addr) - # num_layers = len(forward_context.no_compile_layers) - - # Retrieve requests that have already sent the KV cache. + # TODO:Retrieve requests that have already sent the KV cache. finished_sending: set[str] = set() - # if self.send_type == "PUT_ASYNC" or self.send_type == "GET": - # for request_id in self.send_request_id_to_tensor_ids: - # if (len(self.send_request_id_to_tensor_ids[request_id]) == - # num_layers): - # finished_sending.add(request_id) - # Retrieve requests that have already received the KV cache. + # TODO:Retrieve requests that have already received the KV cache. 
finished_recving: set[str] = set() - # for request_id in self.recv_request_id_to_tensor_ids: - # if (len(self.recv_request_id_to_tensor_ids[request_id]) == - # num_layers): - # finished_recving.add(request_id) logger.debug("๐Ÿžget_finished, finished_sending:%s, finished_recving:%s", finished_sending, finished_recving) From 6b5547d4a17b11ea511013043eba3b30c60f6333 Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 4 Jun 2025 10:21:51 +0800 Subject: [PATCH 131/155] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 45939a34b9cb..003b8252010b 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -76,7 +76,8 @@ def __init__(self, self.send_stream = torch.cuda.Stream() self.recv_stream = torch.cuda.Stream() - mem_pool_size_gb = self.config.get_from_extra_config("mem_pool_size_gb", 128) + mem_pool_size_gb = self.config.get_from_extra_config( + "mem_pool_size_gb", 128) self.pool = TensorMemoryPool(max_block_size=mem_pool_size_gb * 1024**3) # GB From 37c537469003ecfb87bf9081d93bf19d2ff0e2bd Mon Sep 17 00:00:00 2001 From: Abatom Date: Wed, 4 Jun 2025 16:46:40 +0800 Subject: [PATCH 132/155] set_p2p_nccl_context Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 51 ++++++++++++++++--- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 003b8252010b..ddd3b0479263 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 import logging +import os import threading import time import typing from collections import deque +from contextlib import contextmanager from typing import TYPE_CHECKING, Any, Optional import msgpack @@ -23,6 +25,36 @@ logger = logging.getLogger(__name__) +@contextmanager +def set_p2p_nccl_context(num_chennels: str): + original_values: dict[str, Any] = {} + env_vars = [ + 'NCCL_MAX_NCHANNELS', + 'NCCL_MIN_NCHANNELS', + 'NCCL_CUMEM_ENABLE', + 'NCCL_BUFFSIZE', + 'NCCL_PROTO', # LL,LL128,SIMPLE + 'NCCL_ALGO', # RING,TREE + ] + + for var in env_vars: + original_values[var] = os.environ.get(var) + + logger.info("set_p2p_nccl_context, original_values: %s", original_values) + + try: + os.environ['NCCL_MAX_NCHANNELS'] = num_chennels + os.environ['NCCL_MIN_NCHANNELS'] = num_chennels + os.environ['NCCL_CUMEM_ENABLE'] = '1' + yield + finally: + for var in env_vars: + if original_values[var] is not None: + os.environ[var] = original_values[var] + else: + os.environ.pop(var, None) + + class P2pNcclEngine: def __init__(self, @@ -106,6 +138,9 @@ def __init__(self, self.buffer_size = 0 self.buffer_size_threshold = float(self.config.kv_buffer_size) + self.nccl_num_chennels = self.config.get_from_extra_config( + "nccl_num_chennels", "16") + self._listener_thread = threading.Thread( target=self._listen_for_requests, daemon=True) self._listener_thread.start() @@ -119,9 +154,9 @@ def __init__(self, logger.info( "๐Ÿ’ฏP2pNcclEngine init, rank:%d, local_rank:%d, http_address:%s, " "zmq_address:%s, proxy_address:%s, send_type:%s, buffer_size_" 
- "threshold:%.2f", self.rank, self.local_rank, self.http_address, - self.zmq_address, self.proxy_address, self.send_type, - self.buffer_size_threshold) + "threshold:%.2f, nccl_num_chennels:%s", self.rank, self.local_rank, + self.http_address, self.zmq_address, self.proxy_address, + self.send_type, self.buffer_size_threshold, self.nccl_num_chennels) def _create_connect(self, remote_address: typing.Optional[str] = None): assert remote_address is not None @@ -141,8 +176,9 @@ def _create_connect(self, remote_address: typing.Optional[str] = None): with torch.cuda.device(self.device): rank = 0 - comm: ncclComm_t = self.nccl.ncclCommInitRank( - 2, unique_id, rank) + with set_p2p_nccl_context(self.nccl_num_chennels): + comm: ncclComm_t = self.nccl.ncclCommInitRank( + 2, unique_id, rank) self.comms[remote_address] = (comm, rank) logger.info("๐ŸคncclCommInitRank Success, %s๐Ÿ‘‰%s, MyRank: %s", self.zmq_address, remote_address, rank) @@ -278,8 +314,9 @@ def _listen_for_requests(self): bytes(data["unique_id"])) with torch.cuda.device(self.device): rank = 1 - comm: ncclComm_t = self.nccl.ncclCommInitRank( - 2, unique_id, rank) + with set_p2p_nccl_context(self.nccl_num_chennels): + comm: ncclComm_t = self.nccl.ncclCommInitRank( + 2, unique_id, rank) self.comms[remote_address.decode()] = (comm, rank) logger.info( "๐ŸคncclCommInitRank Success, %s๐Ÿ‘ˆ%s, MyRank:%s", From 1a2ffe5de529b6e91c15c6fd460b26081f0f0707 Mon Sep 17 00:00:00 2001 From: Abatom Date: Sat, 7 Jun 2025 09:26:07 +0800 Subject: [PATCH 133/155] mem_pool_size_gb(32) Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index ddd3b0479263..23feb0ab33f8 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -109,8 +109,8 @@ def __init__(self, self.recv_stream = torch.cuda.Stream() mem_pool_size_gb = self.config.get_from_extra_config( - "mem_pool_size_gb", 128) - self.pool = TensorMemoryPool(max_block_size=mem_pool_size_gb * + "mem_pool_size_gb", 32) + self.pool = TensorMemoryPool(max_block_size=int(mem_pool_size_gb) * 1024**3) # GB # The sending type includes tree mutually exclusive options: From 17a83f9151e77f9b28dc8b24d3c78fc1ae18902b Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 14:08:16 +0800 Subject: [PATCH 134/155] nccl_num_chennels=8 Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 23feb0ab33f8..6cc3c783c842 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -139,7 +139,7 @@ def __init__(self, self.buffer_size_threshold = float(self.config.kv_buffer_size) self.nccl_num_chennels = self.config.get_from_extra_config( - "nccl_num_chennels", "16") + "nccl_num_chennels", "8") self._listener_thread = threading.Thread( target=self._listen_for_requests, daemon=True) From b4416a402fb96901891e3883331ded0f78c687ff Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 14:24:10 +0800 Subject: [PATCH 135/155] MemoryError->ValueError Signed-off-by: Abatom 
--- vllm/tensor_memory_pool.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/tensor_memory_pool.py b/vllm/tensor_memory_pool.py index a4ca13d6661c..abcf7ac6495d 100644 --- a/vllm/tensor_memory_pool.py +++ b/vllm/tensor_memory_pool.py @@ -62,7 +62,7 @@ def allocate(self, size: int) -> int: required_size = self._round_to_power_of_two( max(size, self.min_block_size)) if required_size > self.max_block_size: - raise MemoryError("Requested size exceeds maximum block size") + raise ValueError("Requested size exceeds maximum block size") current_size = required_size while current_size <= self.max_block_size: @@ -73,7 +73,7 @@ def allocate(self, size: int) -> int: return block.addr current_size *= 2 - raise MemoryError("Insufficient memory") + raise ValueError("Insufficient memory") def _split_block(self, block: MemoryBlock, required_size: int): while (block.size > required_size @@ -122,7 +122,7 @@ def store_tensor(self, tensor: torch.Tensor) -> int: if block.size < size: self.free(addr) - raise MemoryError( + raise ValueError( f"Allocated block size {block.size} is smaller than " f"required size {size}") @@ -134,7 +134,7 @@ def store_tensor(self, tensor: torch.Tensor) -> int: tensor.shape) except ValueError as err: self.free(addr) - raise MemoryError(f"Failed to create tensor view: {err}") from err + raise ValueError(f"Failed to create tensor view: {err}") from err cpu_tensor.copy_(tensor) From a3e7337548e806c59ec919bd4af88e77b1bd34de Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 14:26:37 +0800 Subject: [PATCH 136/155] log level Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 408257c702c8..b5ca13ce8cdc 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -214,8 +214,8 @@ def inject_kv_into_layer( inject_kv_into_layer(kv_cache_layer, kv_cache, request.slot_mapping, request.request_id) - logger.info("Inject KV cache of %d tokens to the paged memory, %s", - len(request.slot_mapping), request.request_id) + logger.debug("Inject KV cache of %d tokens to the paged memory, %s", + len(request.slot_mapping), request.request_id) def wait_for_layer_load(self, layer_name: str) -> None: """Blocking until the KV for a specific layer is loaded into vLLM's From ad7f14a806d2ac64beee8ecfadd6d67553207813 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 14:40:26 +0800 Subject: [PATCH 137/155] TensorMemoryPool Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py | 5 +++-- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 5 +++-- .../kv_transfer/kv_connector/v1/p2p}/tensor_memory_pool.py | 6 ++++-- 3 files changed, 10 insertions(+), 6 deletions(-) rename vllm/{ => distributed/kv_transfer/kv_connector/v1/p2p}/tensor_memory_pool.py (96%) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index b5ca13ce8cdc..014617790bf7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -214,8 +214,9 @@ def inject_kv_into_layer( inject_kv_into_layer(kv_cache_layer, 
kv_cache, request.slot_mapping, request.request_id) - logger.debug("Inject KV cache of %d tokens to the paged memory, %s", - len(request.slot_mapping), request.request_id) + logger.debug( + "Inject KV cache of %d tokens to the paged memory, %s", + len(request.slot_mapping), request.request_id) def wait_for_layer_load(self, layer_name: str) -> None: """Blocking until the KV for a specific layer is loaded into vLLM's diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 6cc3c783c842..fc3d794ddb0f 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -16,7 +16,7 @@ from vllm.config import KVTransferConfig from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum) -from vllm.tensor_memory_pool import TensorMemoryPool +from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import TensorMemoryPool from vllm.utils import current_stream, get_ip if TYPE_CHECKING: @@ -24,6 +24,7 @@ logger = logging.getLogger(__name__) +DEFAULT_MEM_POOL_SIZE_GB = 32 @contextmanager def set_p2p_nccl_context(num_chennels: str): @@ -109,7 +110,7 @@ def __init__(self, self.recv_stream = torch.cuda.Stream() mem_pool_size_gb = self.config.get_from_extra_config( - "mem_pool_size_gb", 32) + "mem_pool_size_gb", DEFAULT_MEM_POOL_SIZE_GB) self.pool = TensorMemoryPool(max_block_size=int(mem_pool_size_gb) * 1024**3) # GB diff --git a/vllm/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py similarity index 96% rename from vllm/tensor_memory_pool.py rename to vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index abcf7ac6495d..9d4b813c2c56 100644 --- a/vllm/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -4,9 +4,11 @@ import ctypes import math from dataclasses import dataclass +from vllm.logger import init_logger import torch +logger = init_logger(__name__) @dataclass class MemoryBlock: @@ -52,8 +54,8 @@ def _allocate_pinned_memory(self): addr=self.base_address) self.free_lists[self.max_block_size][ initial_block.addr] = initial_block - print("TensorMemoryPool, base_address:", self.base_address, - self.base_address % self.max_block_size) + logger.debug("TensorMemoryPool, base_address:", self.base_address, + self.base_address % self.max_block_size) def allocate(self, size: int) -> int: if size <= 0: From 3c30c02a5961a740e5ec20f36ef5fe76ec886020 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 15:05:46 +0800 Subject: [PATCH 138/155] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 4 +++- .../kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index fc3d794ddb0f..b82f49db007a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -16,7 +16,8 @@ from vllm.config import KVTransferConfig from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum) -from 
vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import TensorMemoryPool +from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import ( + TensorMemoryPool) from vllm.utils import current_stream, get_ip if TYPE_CHECKING: @@ -26,6 +27,7 @@ DEFAULT_MEM_POOL_SIZE_GB = 32 + @contextmanager def set_p2p_nccl_context(num_chennels: str): original_values: dict[str, Any] = {} diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index 9d4b813c2c56..9840c112788c 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -4,12 +4,14 @@ import ctypes import math from dataclasses import dataclass -from vllm.logger import init_logger import torch +from vllm.logger import init_logger + logger = init_logger(__name__) + @dataclass class MemoryBlock: size: int From babdaa260e6cefeffadcc88cf9b9c7e250738b42 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 15:40:34 +0800 Subject: [PATCH 139/155] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index b82f49db007a..91f7f202295d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -16,7 +16,7 @@ from vllm.config import KVTransferConfig from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum) -from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import ( +from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import ( # noqa: E501 TensorMemoryPool) from vllm.utils import current_stream, get_ip From fea91abc65e97dc0e7f3f5ea5bce06eaf5541470 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 15:54:09 +0800 Subject: [PATCH 140/155] add the docstring for TensorMemoryPool Signed-off-by: Abatom --- .../kv_connector/v1/p2p/tensor_memory_pool.py | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index 9840c112788c..b1703667cb85 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -18,8 +18,46 @@ class MemoryBlock: addr: int +"""A memory pool for managing pinned host memory allocations for tensors. + +This class implements a buddy allocation system to efficiently manage pinned host +memory for tensor storage. It supports allocation, deallocation, and tensor +storage/retrieval operations. 
+ +Key Features: +- Uses power-of-two block sizes for efficient buddy allocation +- Supports splitting and merging of memory blocks +- Provides methods to store CUDA tensors in pinned host memory +- Allows loading tensors from pinned memory back to device +- Automatically cleans up memory on destruction + +Attributes: + max_block_size (int): Maximum block size (rounded to nearest power of two) + min_block_size (int): Minimum block size (rounded to nearest power of two) + free_lists (dict): Dictionary of free memory blocks by size + allocated_blocks (dict): Dictionary of currently allocated blocks + base_tensor (torch.Tensor): Base pinned memory tensor + base_address (int): Base memory address of the pinned memory region + +Example: + >>> pool = TensorMemoryPool(max_block_size=1024*1024) + >>> tensor = torch.randn(100, device='cuda') + >>> addr = pool.store_tensor(tensor) + >>> loaded_tensor = pool.load_tensor(addr, tensor.dtype, tensor.shape, 'cuda') + >>> pool.free(addr) +""" + class TensorMemoryPool: + """Initializes the memory pool with given size constraints. + + Args: + max_block_size (int): Maximum size of memory blocks to manage + min_block_size (int, optional): Minimum size of memory blocks to manage. + Defaults to 512. + Raises: + ValueError: If block sizes are invalid or max_block_size < min_block_size + """ def __init__(self, max_block_size: int, min_block_size: int = 512): if max_block_size <= 0 or min_block_size <= 0: raise ValueError("Block sizes must be positive") @@ -60,6 +98,17 @@ def _allocate_pinned_memory(self): self.base_address % self.max_block_size) def allocate(self, size: int) -> int: + """Allocates a memory block of at least the requested size. + + Args: + size (int): Minimum size of memory to allocate + + Returns: + int: Address of the allocated memory block + + Raises: + ValueError: If size is invalid or insufficient memory is available + """ if size <= 0: raise ValueError("Allocation size must be positive") @@ -91,6 +140,14 @@ def _split_block(self, block: MemoryBlock, required_size: int): self.free_lists[buddy_size][buddy.addr] = buddy def free(self, addr: int): + """Frees an allocated memory block. + + Args: + addr (int): Address of the block to free + + Raises: + ValueError: If address is invalid or not allocated + """ if addr not in self.allocated_blocks: raise ValueError("Invalid address to free") @@ -117,6 +174,17 @@ def _merge_buddies(self, block: MemoryBlock): self.free_lists[block.size][block.addr] = block def store_tensor(self, tensor: torch.Tensor) -> int: + """Stores a CUDA tensor in pinned host memory. + + Args: + tensor (torch.Tensor): CUDA tensor to store + + Returns: + int: Address where the tensor is stored + + Raises: + ValueError: If tensor is not on CUDA or allocation fails + """ if not tensor.is_cuda: raise ValueError("Only CUDA tensors can be stored") @@ -146,6 +214,20 @@ def store_tensor(self, tensor: torch.Tensor) -> int: def load_tensor(self, addr: int, dtype: torch.dtype, shape: tuple[int, ...], device) -> torch.Tensor: + """Loads a tensor from pinned host memory to the specified device. 
+ + Args: + addr (int): Address where tensor is stored + dtype (torch.dtype): Data type of the tensor + shape (tuple[int, ...]): Shape of the tensor + device: Target device for the loaded tensor + + Returns: + torch.Tensor: The loaded tensor on the specified device + + Raises: + ValueError: If address is invalid or sizes don't match + """ if addr not in self.allocated_blocks: raise ValueError("Invalid address to load") @@ -168,6 +250,7 @@ def load_tensor(self, addr: int, dtype: torch.dtype, return cuda_tensor def cleanup(self): + """Cleans up all memory resources and resets the pool state.""" self.free_lists.clear() self.allocated_blocks.clear() if hasattr(self, 'base_tensor'): From a33e149730c6d9f76abdf6eaad7c9430bc93f066 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 16:05:17 +0800 Subject: [PATCH 141/155] format Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- .../kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 91f7f202295d..d319bc8b98c9 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -16,7 +16,7 @@ from vllm.config import KVTransferConfig from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum) -from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import ( # noqa: E501 +from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import ( # noqa: E501 TensorMemoryPool) from vllm.utils import current_stream, get_ip diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index b1703667cb85..561ccbecc8f3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -47,6 +47,7 @@ class MemoryBlock: >>> pool.free(addr) """ + class TensorMemoryPool: """Initializes the memory pool with given size constraints. 
@@ -58,6 +59,7 @@ class TensorMemoryPool: Raises: ValueError: If block sizes are invalid or max_block_size < min_block_size """ + def __init__(self, max_block_size: int, min_block_size: int = 512): if max_block_size <= 0 or min_block_size <= 0: raise ValueError("Block sizes must be positive") From c8558d37d3761e0d69c927823bf94689611a3290 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 16:24:15 +0800 Subject: [PATCH 142/155] num_chennels->num_channels Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index d319bc8b98c9..160e753d619d 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -29,7 +29,7 @@ @contextmanager -def set_p2p_nccl_context(num_chennels: str): +def set_p2p_nccl_context(num_channels: str): original_values: dict[str, Any] = {} env_vars = [ 'NCCL_MAX_NCHANNELS', @@ -46,8 +46,8 @@ def set_p2p_nccl_context(num_chennels: str): logger.info("set_p2p_nccl_context, original_values: %s", original_values) try: - os.environ['NCCL_MAX_NCHANNELS'] = num_chennels - os.environ['NCCL_MIN_NCHANNELS'] = num_chennels + os.environ['NCCL_MAX_NCHANNELS'] = num_channels + os.environ['NCCL_MIN_NCHANNELS'] = num_channels os.environ['NCCL_CUMEM_ENABLE'] = '1' yield finally: @@ -141,8 +141,8 @@ def __init__(self, self.buffer_size = 0 self.buffer_size_threshold = float(self.config.kv_buffer_size) - self.nccl_num_chennels = self.config.get_from_extra_config( - "nccl_num_chennels", "8") + self.nccl_num_channels = self.config.get_from_extra_config( + "nccl_num_channels", "8") self._listener_thread = threading.Thread( target=self._listen_for_requests, daemon=True) @@ -157,9 +157,9 @@ def __init__(self, logger.info( "๐Ÿ’ฏP2pNcclEngine init, rank:%d, local_rank:%d, http_address:%s, " "zmq_address:%s, proxy_address:%s, send_type:%s, buffer_size_" - "threshold:%.2f, nccl_num_chennels:%s", self.rank, self.local_rank, + "threshold:%.2f, nccl_num_channels:%s", self.rank, self.local_rank, self.http_address, self.zmq_address, self.proxy_address, - self.send_type, self.buffer_size_threshold, self.nccl_num_chennels) + self.send_type, self.buffer_size_threshold, self.nccl_num_channels) def _create_connect(self, remote_address: typing.Optional[str] = None): assert remote_address is not None @@ -179,7 +179,7 @@ def _create_connect(self, remote_address: typing.Optional[str] = None): with torch.cuda.device(self.device): rank = 0 - with set_p2p_nccl_context(self.nccl_num_chennels): + with set_p2p_nccl_context(self.nccl_num_channels): comm: ncclComm_t = self.nccl.ncclCommInitRank( 2, unique_id, rank) self.comms[remote_address] = (comm, rank) @@ -317,7 +317,7 @@ def _listen_for_requests(self): bytes(data["unique_id"])) with torch.cuda.device(self.device): rank = 1 - with set_p2p_nccl_context(self.nccl_num_chennels): + with set_p2p_nccl_context(self.nccl_num_channels): comm: ncclComm_t = self.nccl.ncclCommInitRank( 2, unique_id, rank) self.comms[remote_address.decode()] = (comm, rank) @@ -458,7 +458,7 @@ def _send_sync( response = sock.recv() if response != b"0": - logger.warning( + logger.error( "๐Ÿ”ดSend Tensor, Peer Out Of Memory/Threshold, %s ๐Ÿ‘‰ %s, " "MyRank:%s, data:%s, tensor:%s, size:%fGB, response:%s", self.zmq_address, remote_address, rank, data, 
tensor.shape, From 58eeac7d74d85c78de684b5dca8e0122858d8204 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 16:45:02 +0800 Subject: [PATCH 143/155] format Signed-off-by: Abatom --- .../kv_connector/v1/p2p/tensor_memory_pool.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py index 561ccbecc8f3..303619a3fdd0 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/tensor_memory_pool.py @@ -20,9 +20,9 @@ class MemoryBlock: """A memory pool for managing pinned host memory allocations for tensors. -This class implements a buddy allocation system to efficiently manage pinned host -memory for tensor storage. It supports allocation, deallocation, and tensor -storage/retrieval operations. +This class implements a buddy allocation system to efficiently manage pinned +host memory for tensor storage. It supports allocation, deallocation, and +tensor storage/retrieval operations. Key Features: - Uses power-of-two block sizes for efficient buddy allocation @@ -43,7 +43,8 @@ class MemoryBlock: >>> pool = TensorMemoryPool(max_block_size=1024*1024) >>> tensor = torch.randn(100, device='cuda') >>> addr = pool.store_tensor(tensor) - >>> loaded_tensor = pool.load_tensor(addr, tensor.dtype, tensor.shape, 'cuda') + >>> loaded_tensor = pool.load_tensor(addr, tensor.dtype, + ... tensor.shape, 'cuda') >>> pool.free(addr) """ @@ -53,11 +54,12 @@ class TensorMemoryPool: Args: max_block_size (int): Maximum size of memory blocks to manage - min_block_size (int, optional): Minimum size of memory blocks to manage. - Defaults to 512. + min_block_size (int, optional): Minimum size of memory blocks + to manage. Defaults to 512. Raises: - ValueError: If block sizes are invalid or max_block_size < min_block_size + ValueError: If block sizes are invalid or max_block_size is less + than min_block_size """ def __init__(self, max_block_size: int, min_block_size: int = 512): From d063d895925ead7b3f8e5feef3ad794f47577815 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 18:19:07 +0800 Subject: [PATCH 144/155] Only symmetric TP is supported Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 014617790bf7..4836e4307ff7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -155,6 +155,8 @@ def inject_kv_into_layer( page_size = dst_kv_cache_layer_shape[1] dst_kv_cache_layer = dst_kv_cache_layer.reshape( num_pages * page_size, -1) + self.check_tensors_except_dim( + dst_kv_cache_layer, src_kv_cache, 0) num_token = src_kv_cache.shape[0] if len(slot_mapping) == num_token: dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache @@ -172,6 +174,8 @@ def inject_kv_into_layer( page_size = dst_kv_cache_layer_shape[2] dst_kv_cache_layer = dst_kv_cache_layer.reshape( 2, num_pages * page_size, -1) + self.check_tensors_except_dim( + dst_kv_cache_layer, src_kv_cache, 1) num_token = src_kv_cache.shape[1] if len(slot_mapping) == num_token: dst_kv_cache_layer[:, slot_mapping, ...] 
= src_kv_cache @@ -515,3 +519,13 @@ def parse_request_id(request_id: str, is_prefill=True) -> tuple[str, int]: return ip, port raise ValueError( f"Request id {request_id} does not contain hostname and port") + + @staticmethod + def check_tensors_except_dim(tensor1, tensor2, dim): + shape1 = tensor1.size() + shape2 = tensor2.size() + + if len(shape1) != len(shape2) or not all(s1 == s2 for i, (s1, s2) in enumerate(zip(shape1, shape2)) if i != dim): + raise NotImplementedError( + "Currently, only symmetric TP is supported. Asymmetric TP, PP," + "and others will be supported in future PRs.") \ No newline at end of file From 5118c060e1ce1f9cdb480a0239744325f326ee50 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 18:29:09 +0800 Subject: [PATCH 145/155] format Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 4836e4307ff7..60a0be160db7 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -155,8 +155,8 @@ def inject_kv_into_layer( page_size = dst_kv_cache_layer_shape[1] dst_kv_cache_layer = dst_kv_cache_layer.reshape( num_pages * page_size, -1) - self.check_tensors_except_dim( - dst_kv_cache_layer, src_kv_cache, 0) + self.check_tensors_except_dim(dst_kv_cache_layer, src_kv_cache, + 0) num_token = src_kv_cache.shape[0] if len(slot_mapping) == num_token: dst_kv_cache_layer[slot_mapping, ...] = src_kv_cache @@ -174,8 +174,8 @@ def inject_kv_into_layer( page_size = dst_kv_cache_layer_shape[2] dst_kv_cache_layer = dst_kv_cache_layer.reshape( 2, num_pages * page_size, -1) - self.check_tensors_except_dim( - dst_kv_cache_layer, src_kv_cache, 1) + self.check_tensors_except_dim(dst_kv_cache_layer, src_kv_cache, + 1) num_token = src_kv_cache.shape[1] if len(slot_mapping) == num_token: dst_kv_cache_layer[:, slot_mapping, ...] = src_kv_cache @@ -525,7 +525,9 @@ def check_tensors_except_dim(tensor1, tensor2, dim): shape1 = tensor1.size() shape2 = tensor2.size() - if len(shape1) != len(shape2) or not all(s1 == s2 for i, (s1, s2) in enumerate(zip(shape1, shape2)) if i != dim): + if len(shape1) != len(shape2) or not all( + s1 == s2 + for i, (s1, s2) in enumerate(zip(shape1, shape2)) if i != dim): raise NotImplementedError( "Currently, only symmetric TP is supported. Asymmetric TP, PP," - "and others will be supported in future PRs.") \ No newline at end of file + "and others will be supported in future PRs.") From fd38521d005df184f81f27b6fb053f8f31a13754 Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 21:51:30 +0800 Subject: [PATCH 146/155] add doc Signed-off-by: Abatom --- docs/design/v1/p2p_nccl_connector.md | 238 +++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) create mode 100644 docs/design/v1/p2p_nccl_connector.md diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md new file mode 100644 index 000000000000..78bb323d25de --- /dev/null +++ b/docs/design/v1/p2p_nccl_connector.md @@ -0,0 +1,238 @@ +# Install vLLM + +```shell +# Enter the home directory or your working directory. +cd /home + +# Download the installation package, and I will update the commit-id in time. You can directly copy the command. 
+wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + +# Download the code repository. +git clone -b xpyd-v1 https://github.com/Abatom/vllm.git +cd vllm + +# Set the installation package path. +export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + +# installation +pip install -e . -v +``` + +# Run xPyD + +## Instructions +- The following examples are run on an A800 (80GB) device, using the Meta-Llama-3.1-8B-Instruct model. +- Pay attention to the setting of the `kv_buffer_size` (in bytes). The empirical value is 10% of the GPU memory size. This is related to the kvcache size. If it is too small, the GPU memory buffer for temporarily storing the received kvcache will overflow, causing the kvcache to be stored in the tensor memory pool, which increases latency. If it is too large, the kvcache available for inference will be reduced, leading to a smaller batch size and decreased throughput. +- For Prefill instances, the `kv_buffer_size` can be set to 1, as Prefill currently does not need to receive kvcache. +- The `--port` must be consistent with the `http_port` in the `--kv-transfer-config`. +- `PUT_ASYNC` offers the best performance and should be prioritized. +- You may need to modify the `kv_buffer_size` and `port` in the following commands (if there is a conflict). +- The `disagg_prefill_proxy_xpyd.py` script will use port 10001 (for receiving client requests) and port 30001 (for receiving service discovery from P and D instances). +- The node running the proxy must have `quart` installed. +- Supports multiple nodes; you just need to modify the `proxy_ip` and `proxy_port` in `--kv-transfer-config`. +- In the following examples, it is assumed that **the proxy's IP is 10.0.1.1**. + +## Run 1P3D + +### Proxy (e.g. 10.0.1.1) + +```shell +cd {your vllm directory}/examples/online_serving/disagg_xpyd/ +python3 disagg_prefill_proxy_xpyd.py & +``` +### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) + +```shell +VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \ + --host 0.0.0.0 \ + --port 20005 \ + --tensor-parallel-size 1 \ + --seed 1024 \ + --served-model-name base_model \ + --dtype float16 \ + --max-model-len 10000 \ + --max-num-batched-tokens 10000 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & +``` +### Decode1 (e.g. 10.0.1.3 or 10.0.1.1) + +```shell +VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \ + --host 0.0.0.0 \ + --port 20009 \ + --tensor-parallel-size 1 \ + --seed 1024 \ + --served-model-name base_model \ + --dtype float16 \ + --max-model-len 10000 \ + --max-num-batched-tokens 10000 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --gpu-memory-utilization 0.7 \ + --kv-transfer-config \ + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & +``` +### Decode2 (e.g. 
10.0.1.4 or 10.0.1.1) + +```shell +VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \ + --host 0.0.0.0 \ + --port 20003 \ + --tensor-parallel-size 1 \ + --seed 1024 \ + --served-model-name base_model \ + --dtype float16 \ + --max-model-len 10000 \ + --max-num-batched-tokens 10000 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --gpu-memory-utilization 0.7 \ + --kv-transfer-config \ + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & +``` +### Decode3 (e.g. 10.0.1.5 or 10.0.1.1) + +```shell +VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \ + --host 0.0.0.0 \ + --port 20008 \ + --tensor-parallel-size 1 \ + --seed 1024 \ + --served-model-name base_model \ + --dtype float16 \ + --max-model-len 10000 \ + --max-num-batched-tokens 10000 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --gpu-memory-utilization 0.7 \ + --kv-transfer-config \ + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & +``` + +## Run 3P1D + +### Proxy (e.g. 10.0.1.1) + +```shell +cd {your vllm directory}/examples/online_serving/disagg_xpyd/ +python3 disagg_prefill_proxy_xpyd.py & +``` +### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) + +```shell +VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \ + --host 0.0.0.0 \ + --port 20005 \ + --tensor-parallel-size 1 \ + --seed 1024 \ + --served-model-name base_model \ + --dtype float16 \ + --max-model-len 10000 \ + --max-num-batched-tokens 10000 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & +``` + +### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1) + +```shell +VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \ + --host 0.0.0.0 \ + --port 20009 \ + --tensor-parallel-size 1 \ + --seed 1024 \ + --served-model-name base_model \ + --dtype float16 \ + --max-model-len 10000 \ + --max-num-batched-tokens 10000 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & +``` + +### Prefill3 (e.g. 
10.0.1.4 or 10.0.1.1) + +```shell +VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \ + --host 0.0.0.0 \ + --port 20003 \ + --tensor-parallel-size 1 \ + --seed 1024 \ + --served-model-name base_model \ + --dtype float16 \ + --max-model-len 10000 \ + --max-num-batched-tokens 10000 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --gpu-memory-utilization 0.9 \ + --kv-transfer-config \ + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & +``` + +### Decode1 (e.g. 10.0.1.5 or 10.0.1.1) + +```shell +VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \ + --host 0.0.0.0 \ + --port 20008 \ + --tensor-parallel-size 1 \ + --seed 1024 \ + --served-model-name base_model \ + --dtype float16 \ + --max-model-len 10000 \ + --max-num-batched-tokens 10000 \ + --max-num-seqs 256 \ + --trust-remote-code \ + --gpu-memory-utilization 0.7 \ + --kv-transfer-config \ + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & +``` + +# Single request + +```shell +curl -X POST -s http://10.0.1.1:10001/v1/completions \ +-H "Content-Type: application/json" \ +-d '{ + "model": "base_model", + "prompt": "San Francisco is a", + "max_tokens": 10, + "temperature": 0 +}' +``` + +# Benchmark + +```shell +python3 benchmark_serving.py \ + --backend vllm \ + --model base_model \ + --tokenizer auto \ + --dataset-name "random" \ + --host 10.0.1.1 \ + --port 10001 \ + --random-input-len 1024 \ + --random-output-len 1024 \ + --ignore-eos \ + --burstiness 100 \ + --percentile-metrics "ttft,tpot,itl,e2el" \ + --metric-percentiles "90,95,99" \ + --seed $(date +%s) \ + --trust-remote-code \ + --request-rate 3 \ + --num-prompts 1000 +``` + +# Shut down +```shell +pgrep python | xargs kill -9 && pkill -f python +``` From 4fd68a94f7aeae8b2541e0ba873da1c59eb87c1e Mon Sep 17 00:00:00 2001 From: Abatom Date: Fri, 13 Jun 2025 23:20:24 +0800 Subject: [PATCH 147/155] doc Signed-off-by: Abatom --- docs/design/v1/p2p_nccl_connector.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md index 78bb323d25de..934f82bf01c1 100644 --- a/docs/design/v1/p2p_nccl_connector.md +++ b/docs/design/v1/p2p_nccl_connector.md @@ -23,10 +23,10 @@ pip install -e . -v ## Instructions - The following examples are run on an A800 (80GB) device, using the Meta-Llama-3.1-8B-Instruct model. - Pay attention to the setting of the `kv_buffer_size` (in bytes). The empirical value is 10% of the GPU memory size. This is related to the kvcache size. If it is too small, the GPU memory buffer for temporarily storing the received kvcache will overflow, causing the kvcache to be stored in the tensor memory pool, which increases latency. If it is too large, the kvcache available for inference will be reduced, leading to a smaller batch size and decreased throughput. -- For Prefill instances, the `kv_buffer_size` can be set to 1, as Prefill currently does not need to receive kvcache. -- The `--port` must be consistent with the `http_port` in the `--kv-transfer-config`. -- `PUT_ASYNC` offers the best performance and should be prioritized. 
+- For Prefill instances, when using non-GET mode, the `kv_buffer_size` can be set to 1, as Prefill currently does not need to receive kvcache. However, when using GET mode, a larger `kv_buffer_size` is required because it needs to store the kvcache sent to the D instance. - You may need to modify the `kv_buffer_size` and `port` in the following commands (if there is a conflict). +- `PUT_ASYNC` offers the best performance and should be prioritized. +- The `--port` must be consistent with the `http_port` in the `--kv-transfer-config`. - The `disagg_prefill_proxy_xpyd.py` script will use port 10001 (for receiving client requests) and port 30001 (for receiving service discovery from P and D instances). - The node running the proxy must have `quart` installed. - Supports multiple nodes; you just need to modify the `proxy_ip` and `proxy_port` in `--kv-transfer-config`. @@ -40,6 +40,7 @@ pip install -e . -v cd {your vllm directory}/examples/online_serving/disagg_xpyd/ python3 disagg_prefill_proxy_xpyd.py & ``` + ### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1) ```shell @@ -58,6 +59,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & ``` + ### Decode1 (e.g. 10.0.1.3 or 10.0.1.1) ```shell @@ -76,6 +78,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & ``` + ### Decode2 (e.g. 10.0.1.4 or 10.0.1.1) ```shell @@ -94,6 +97,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & ``` + ### Decode3 (e.g. 10.0.1.5 or 10.0.1.1) ```shell @@ -121,6 +125,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \ cd {your vllm directory}/examples/online_serving/disagg_xpyd/ python3 disagg_prefill_proxy_xpyd.py & ``` + ### Prefill1 (e.g. 
10.0.1.2 or 10.0.1.1) ```shell @@ -233,6 +238,7 @@ python3 benchmark_serving.py \ ``` # Shut down + ```shell pgrep python | xargs kill -9 && pkill -f python ``` From 95bd8c8f99ff53c733f0a8a6735de15e5c42a547 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 16 Jun 2025 14:08:24 +0800 Subject: [PATCH 148/155] clean up lots of logs Signed-off-by: Abatom --- .../kv_connector/v1/p2p/p2p_nccl_connector.py | 52 ------------------- .../kv_connector/v1/p2p/p2p_nccl_engine.py | 52 +------------------ 2 files changed, 1 insertion(+), 103 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py index 60a0be160db7..a47deaf91272 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py @@ -47,12 +47,6 @@ def make_meta(request_id: str, token_ids: list[int], block_ids: list[int], block_ids_tensor.reshape((num_blocks, 1)) * block_size slot_mapping = slot_mapping.flatten()[:valid_num_tokens] - logger.debug( - "๐ŸžP2pNcclConnector make_meta, request_id:%s, token_ids:%s, " - "valid_num_tokens:%d, block_ids:%s, num_blocks:%d, block_size:%d, " - "slot_mapping:%s", request_id, token_ids, valid_num_tokens, - block_ids, num_blocks, block_size, slot_mapping.tolist()) - return ReqMeta( request_id=request_id, token_ids=token_ids_tensor, @@ -126,8 +120,6 @@ def start_load_kv(self, forward_context: "ForwardContext", attn_metadata = forward_context.attn_metadata if attn_metadata is None: - logger.warning( - "In connector.start_load_kv, but the attn_metadata is None") return def inject_kv_into_layer( @@ -195,9 +187,6 @@ def inject_kv_into_layer( assert isinstance(metadata, P2pNcclConnectorMetadata) if metadata is None: - logger.warning( - "In connector.start_load_kv, but the connector metadata is None" - ) return # Load the KV for each request each layer @@ -218,10 +207,6 @@ def inject_kv_into_layer( inject_kv_into_layer(kv_cache_layer, kv_cache, request.slot_mapping, request.request_id) - logger.debug( - "Inject KV cache of %d tokens to the paged memory, %s", - len(request.slot_mapping), request.request_id) - def wait_for_layer_load(self, layer_name: str) -> None: """Blocking until the KV for a specific layer is loaded into vLLM's paged buffer. @@ -331,11 +316,6 @@ def get_num_new_matched_tokens( num_external_tokens = (len(request.prompt_token_ids) - 1 - num_computed_tokens) - logger.debug( - "๐Ÿ’num_external_tokens:%d, num_prompt_tokens:%d, " - "num_computed_tokens:%d, request_id:%s", num_external_tokens, - len(request.prompt_token_ids), num_computed_tokens, - request.request_id) if num_external_tokens < 0: num_external_tokens = 0 @@ -365,9 +345,6 @@ def build_connector_meta( scheduler_output (SchedulerOutput): the scheduler output object. 
""" - logger.debug("๐Ÿžbuild_connector_meta, scheduler_output:%s", - scheduler_output) - meta = P2pNcclConnectorMetadata() for new_req in scheduler_output.scheduled_new_reqs: @@ -380,13 +357,6 @@ def build_connector_meta( # 'CachedRequestData' has no attribute 'prompt_token_ids' self.chunked_prefill[new_req.req_id] = ( new_req.block_ids[0], new_req.prompt_token_ids) - logger.debug( - "๐Ÿžbuild_connector_meta, chunked prefill, " - "request_id:%s, num_scheduled_tokens:%d, " - "num_prompt_tokens:%d, num_computed_tokens:%d, " - "num_tokens:%d", new_req.req_id, num_scheduled_tokens, - len(new_req.prompt_token_ids), - new_req.num_computed_tokens, num_tokens) continue # the request's prompt is not chunked prefill meta.add_request(request_id=new_req.req_id, @@ -413,11 +383,6 @@ def build_connector_meta( block_ids = (self.chunked_prefill[cached_req.req_id][0] + block_ids) prompt_token_ids = self.chunked_prefill[cached_req.req_id][1] - logger.debug( - "๐Ÿžbuild_connector_meta, cached_req, request_id:%s, " - "num_scheduled_tokens:%d, num_prompt_tokens:%d", - cached_req.req_id, num_scheduled_tokens, - len(prompt_token_ids)) # the request's prompt is chunked prefill again if num_tokens < len(prompt_token_ids): self.chunked_prefill[cached_req.req_id] = ( @@ -444,13 +409,6 @@ def build_connector_meta( # of the block_ids for the request. block_ids = cached_req.new_block_ids[0] - logger.debug( - "๐Ÿžbuild_connector_meta, req_id:%s, total_tokens:%d, " - "num_computed_tokens:%d, token_ids:%s, num_token_ids:%d, " - "block_ids:%s, num_block_ids:%d", cached_req.req_id, - total_tokens, cached_req.num_computed_tokens, token_ids, - len(token_ids), block_ids, len(block_ids)) - meta.add_request(request_id=cached_req.req_id, token_ids=token_ids, block_ids=block_ids, @@ -464,9 +422,6 @@ def build_connector_meta( # block_ids=block_ids, # block_size=self._block_size) - logger.debug("๐Ÿžbuild_connector_meta, _requests_need_load:%s", - self._requests_need_load) - self._requests_need_load.clear() return meta @@ -486,9 +441,6 @@ def request_finished( returned by the engine. 
""" - logger.debug("๐Ÿžrequest_finished, request_id:%s, block_ids:%s", - request.request_id, block_ids) - self.chunked_prefill.pop(request.request_id, None) return False, None @@ -512,10 +464,6 @@ def parse_request_id(request_id: str, is_prefill=True) -> tuple[str, int]: ip = match.group(1) port = int(match.group(2)) - logger.debug( - "parse_request_id, request_id: %s, ip: %s, port: %s, " - "is_prefill:%s", request_id, ip, str(port), is_prefill) - return ip, port raise ValueError( f"Request id {request_id} does not contain hostname and port") diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 160e753d619d..1d827b68662e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -224,7 +224,7 @@ def send_tensor( self.send_store[tensor_id] = tensor self.buffer_size += tensor_size - logger.info( + logger.debug( "๐Ÿ”ต[GET]Send to %s, tensor_id:%s, tensor_size:%d, " "shape:%s, rank:%d, buffer_size:%d(%.2f%%)", remote_address, tensor_id, tensor_size, tensor.shape, @@ -251,16 +251,8 @@ def recv_tensor( tensor = self.pool.load_tensor(addr, dtype, shape, self.device) else: - addr = 0 self.buffer_size -= (tensor.element_size() * tensor.numel()) - duration = time.time() - start_time - logger.info( - "๐Ÿ”ต[PUT]Recv From %s, tensor_id:%s, shape:%s, " - "duration:%.3fms, size:%.3fGB, addr:%d, rank:%d", - remote_address, tensor_id, tensor.shape, duration * 1000, - tensor.element_size() * tensor.numel() / 1024**3, addr, - self.rank) else: duration = time.time() - start_time logger.warning( @@ -293,14 +285,7 @@ def recv_tensor( dtype=getattr(torch, data["dtype"]), device=self.device) - start_time = time.time() self._recv(comm, tensor, rank ^ 1, self.recv_stream) - duration = time.time() - start_time - logger.info( - "๐Ÿ”ต[GET]Recv From %s, tensor_id:%s, shape:%s, duration:%.3fms, " - "size:%.3fGB, rank:%d", remote_address, tensor_id, tensor.shape, - duration * 1000, - tensor.element_size() * tensor.numel() / 1024**3, self.rank) return tensor @@ -310,8 +295,6 @@ def _listen_for_requests(self): if self.router_socket in socks: remote_address, message = self.router_socket.recv_multipart() data = msgpack.loads(message) - logger.debug("Received message from %s, data:%s", - remote_address.decode(), data) if data["cmd"] == "NEW": unique_id = self.nccl.unique_id_from_bytes( bytes(data["unique_id"])) @@ -347,11 +330,6 @@ def _listen_for_requests(self): remote_address.decode(), data, addr) else: self.buffer_size += tensor_size - logger.info( - "๐Ÿ”ต[PUT]Recv Tensor, %s๐Ÿ‘ˆ%s, MyRank:%s, " - "data:%s, shape:%s", self.zmq_address, - remote_address.decode(), rank, data, - tensor.shape) except torch.cuda.OutOfMemoryError: self.router_socket.send_multipart( @@ -387,16 +365,10 @@ def _listen_for_requests(self): self.router_socket.send_multipart( [remote_address, msgpack.dumps(data)]) - rank = -1 if data["ret"] == 0: comm, rank = self.comms[remote_address.decode()] self._send(comm, tensor.to(self.device), rank ^ 1, self.send_stream) - - logger.info( - "๐Ÿ”ต[GET]Send Tensor, %s๐Ÿ‘‰%s, " - "MyRank:%s, data:%s", self.zmq_address, - remote_address.decode(), rank, data) else: logger.warning( "๐ŸšงUnexpected, Received message from %s, data:%s", @@ -471,8 +443,6 @@ def _send_sync( if self.send_type == "PUT_ASYNC": self._have_sent_tensor_id(tensor_id) - logger.info("๐Ÿ”ตSend Tensor, %s๐Ÿ‘‰%s, MyRank:%s, data:%s", - 
self.zmq_address, remote_address, rank, data) return True def get_finished( @@ -489,8 +459,6 @@ def get_finished( call to this method (this call or a prior one). """ - logger.debug("๐Ÿžget_finished, finished_req_ids:%s", finished_req_ids) - # Clear the buffer upon request completion. for request_id in finished_req_ids: for layer_name in forward_context.no_compile_layers: @@ -506,8 +474,6 @@ def get_finished( if isinstance(tensor, tuple): addr, _, _ = tensor self.pool.free(addr) - logger.debug("๐Ÿžget_finished, remove tensor_id:%s, addr:%d", - tensor_id, addr) # TODO:Retrieve requests that have already sent the KV cache. finished_sending: set[str] = set() @@ -515,9 +481,6 @@ def get_finished( # TODO:Retrieve requests that have already received the KV cache. finished_recving: set[str] = set() - logger.debug("๐Ÿžget_finished, finished_sending:%s, finished_recving:%s", - finished_sending, finished_recving) - return finished_sending or None, finished_recving or None def _ping(self): @@ -541,19 +504,12 @@ def _send(self, comm, tensor: torch.Tensor, dst: int, stream=None): if stream is None: stream = current_stream() - start_time = time.time() with torch.cuda.stream(stream): self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), dst, comm, cudaStream_t(stream.cuda_stream)) stream.synchronize() - duration = time.time() - start_time - logger.info( - "๐Ÿ•Nccl Send Tensor, shape:%s, duration:%.3fms, size:%.3fGB, " - "rank:%d", tensor.shape, duration * 1000, - tensor.element_size() * tensor.numel() / 1024**3, dst) - def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): assert tensor.device == self.device, ( f"this nccl communicator is created to work on {self.device}, " @@ -561,17 +517,11 @@ def _recv(self, comm, tensor: torch.Tensor, src: int, stream=None): if stream is None: stream = current_stream() - start_time = time.time() with torch.cuda.stream(stream): self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(), ncclDataTypeEnum.from_torch(tensor.dtype), src, comm, cudaStream_t(stream.cuda_stream)) stream.synchronize() - duration = time.time() - start_time - logger.info( - "๐Ÿ•Nccl Recv Tensor, shape:%s, duration:%.3fms, size:%.3fGB, " - "rank:%d", tensor.shape, duration * 1000, - tensor.element_size() * tensor.numel() / 1024**3, src) def close(self) -> None: self._listener_thread.join() From b96369f9b2b8984a91bd0979e51dfddeec25f98b Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 16 Jun 2025 14:22:03 +0800 Subject: [PATCH 149/155] log level Signed-off-by: Abatom --- .../kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py index 1d827b68662e..81f7a2525896 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_engine.py @@ -403,7 +403,7 @@ def wait_for_sent(self): while self.send_queue: self.send_queue_cv.wait() duration = time.time() - start_time - logger.info( + logger.debug( "๐Ÿšง[PUT_ASYNC]It took %.3fms to wait for the send_queue" " to be empty, rank:%d", duration * 1000, self.rank) From 938dbdb9435a411f94a0a2c9196ff1c65d422a02 Mon Sep 17 00:00:00 2001 From: Abatom Date: Mon, 16 Jun 2025 15:31:49 +0800 Subject: [PATCH 150/155] update md Signed-off-by: Abatom --- docs/design/v1/p2p_nccl_connector.md | 10 +++++++++- 1 file 
changed, 9 insertions(+), 1 deletion(-) diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md index 934f82bf01c1..65d828ea5ec2 100644 --- a/docs/design/v1/p2p_nccl_connector.md +++ b/docs/design/v1/p2p_nccl_connector.md @@ -56,6 +56,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \ --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ + --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & ``` @@ -75,6 +76,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \ --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ + --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & ``` @@ -94,6 +96,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \ --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ + --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & ``` @@ -113,6 +116,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \ --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ + --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & ``` @@ -141,6 +145,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \ --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ + --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & ``` @@ -160,6 +165,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \ --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ + --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & ``` @@ -179,6 +185,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \ --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.9 \ + --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & ``` @@ -198,6 +205,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm 
serve {your model directory} \ --max-num-seqs 256 \ --trust-remote-code \ --gpu-memory-utilization 0.7 \ + --disable-log-request \ --kv-transfer-config \ '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & ``` @@ -221,7 +229,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \ python3 benchmark_serving.py \ --backend vllm \ --model base_model \ - --tokenizer auto \ + --tokenizer meta-llama/Llama-3.1-8B-Instruct \ --dataset-name "random" \ --host 10.0.1.1 \ --port 10001 \ From 2dea3d04180b1290850f12d99ad612177d9b9568 Mon Sep 17 00:00:00 2001 From: Abatom Date: Tue, 17 Jun 2025 08:18:22 +0800 Subject: [PATCH 151/155] doc Signed-off-by: Abatom --- docs/design/v1/p2p_nccl_connector.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md index 65d828ea5ec2..d3410290c5bc 100644 --- a/docs/design/v1/p2p_nccl_connector.md +++ b/docs/design/v1/p2p_nccl_connector.md @@ -58,7 +58,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \ --gpu-memory-utilization 0.9 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ### Decode1 (e.g. 10.0.1.3 or 10.0.1.1) @@ -78,7 +78,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \ --gpu-memory-utilization 0.7 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ### Decode2 (e.g. 10.0.1.4 or 10.0.1.1) @@ -98,7 +98,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \ --gpu-memory-utilization 0.7 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ### Decode3 (e.g. 
10.0.1.5 or 10.0.1.1) @@ -118,7 +118,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \ --gpu-memory-utilization 0.7 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ## Run 3P1D @@ -147,7 +147,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \ --gpu-memory-utilization 0.9 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1) @@ -167,7 +167,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \ --gpu-memory-utilization 0.9 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1) @@ -187,7 +187,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \ --gpu-memory-utilization 0.9 \ --disable-log-request \ --kv-transfer-config \ - '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 & + '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 & ``` ### Decode1 (e.g. 
10.0.1.5 or 10.0.1.1)
@@ -207,7 +207,7 @@ VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
     --gpu-memory-utilization 0.7 \
     --disable-log-request \
     --kv-transfer-config \
-    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC"}}' > /var/vllm.log 2>&1 &
+    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
 ```

 # Single request

From c7210ea3a0617270ef8ebb4ba6b37920057464d5 Mon Sep 17 00:00:00 2001
From: Abatom
Date: Wed, 18 Jun 2025 09:56:00 +0800
Subject: [PATCH 152/155] Add Detailed Design

Signed-off-by: Abatom
---
 docs/design/v1/p2p_nccl_connector.md | 61 ++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md
index d3410290c5bc..ce7fc8fab1b7 100644
--- a/docs/design/v1/p2p_nccl_connector.md
+++ b/docs/design/v1/p2p_nccl_connector.md
@@ -1,3 +1,64 @@
+An implementation of xPyD with dynamic scaling based on point-to-point communication, partly inspired by Dynamo.
+
+# Detailed Design
+
+## Overall Process
+As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:
+
+1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
+2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
+3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
+4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
+5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
+6. During **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
+7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**.
+
+![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7)
+
+## Proxy/Router (Demo)
+
+A simple HTTP service acts as the entry point for client requests and starts a background thread to listen for P/D instances reporting their HTTP IP and PORT, as well as ZMQ IP and PORT. It maintains a dictionary of `http_addr -> zmq_addr`. The `http_addr` is the IP:PORT for the vLLM instance's request, while the `zmq_addr` is the address for KV cache handshake and metadata reception.
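As a concrete illustration, for the 1P3D deployment used in the examples in this document, the registry would conceptually hold entries like the following (a minimal sketch; the variable names are illustrative and not the proxy's actual identifiers):

```python
# Hypothetical snapshot of the proxy's service-discovery registry after one
# prefill and three decode instances (the 1P3D example) have registered.
# Keys are http_addr (the instance's "ip:port" for /v1/completions requests);
# values are zmq_addr (the "ip:kv_port" address used for the KV cache
# handshake and metadata).
prefill_registry = {
    "10.0.1.2:20005": "10.0.1.2:21001",
}
decode_registry = {
    "10.0.1.3:20009": "10.0.1.3:22001",
    "10.0.1.4:20003": "10.0.1.4:23001",
    "10.0.1.5:20008": "10.0.1.5:24001",
}
```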
+
+The Proxy/Router is responsible for selecting 1P1D based on the characteristics of the client request, such as the prompt, and generating a corresponding `request_id`, for example:
+
+```
+cmpl-___prefill_addr_10.0.1.2:21001___decode_addr_10.0.1.3:22001_93923d63113b4b338973f24d19d4bf11-0
+```
+
+Currently, to quickly verify whether xPyD can work, a round-robin selection of 1P1D is used. In the future, it is planned to use a trie combined with the load status of instances to select appropriate P and D instances.
+
+Each P/D instance periodically sends a heartbeat packet to the Proxy/Router (currently every 3 seconds) to register (i.e., report `http_addr -> zmq_addr`) and keep the connection alive. If an instance crashes and fails to send a ping for a certain period of time, the Proxy/Router will remove the timed-out instance (this feature has not yet been developed).
+
+## KV Cache Transfer Methods
+
+There are three methods for KVcache transfer: PUT, GET, and PUT_ASYNC. These methods can be specified using the `--kv-transfer-config` and `kv_connector_extra_config` parameters, specifically through the `send_type` field. Both PUT and PUT_ASYNC involve the P instance actively sending KVcache to the D instance. The difference is that PUT is a synchronous transfer method that blocks the main process, while PUT_ASYNC is an asynchronous transfer method. PUT_ASYNC uses a dedicated thread for sending KVcache, which means it does not block the main process. In contrast, the GET method involves the P instance saving the KVcache to the memory buffer after computing the prefill. The D instance then actively retrieves the computed KVcache from the P instance once it has allocated space for the KVcache.
+
+Experimental results have shown that the performance of these methods, from highest to lowest, is as follows: PUT_ASYNC → GET → PUT.
+
+## P2P Communication via ZMQ & NCCL
+
+As long as the address of the counterpart is known, point-to-point KV cache transfer (using NCCL) can be performed, without being constrained by rank and world size, in order to support dynamic scaling (expansion and contraction) of instances with PD disaggregation. This means that adding or removing P/D instances does not require a full system restart.
+
+Each P/D instance only needs to create a single `P2pNcclEngine` instance. This instance maintains a ZMQ Server, which runs a dedicated thread to listen on the `zmq_addr` address and receive control-flow requests from other instances. These requests include requests to establish an NCCL connection and requests to send KVcache metadata (such as tensor shapes and data types). However, this control channel does not actually transmit the KVcache data itself.
+
+When a P instance and a D instance transmit KVcache for the first time, they need to establish a ZMQ connection and an NCCL group. For subsequent KVcache transmissions, this ZMQ connection and NCCL group are reused. The NCCL group consists of only two ranks, meaning the world size is equal to 2. This design is intended to support dynamic scaling, which means that adding or removing P/D instances does not require a full system restart. As long as the address of the counterpart is known, point-to-point KVcache transmission can be performed, without being restricted by rank or world size.
+
+## NCCL Group Topology
+
+Currently, only symmetric TP (Tensor Parallelism) methods are supported for KVcache transmission. Asymmetric TP and PP (Pipeline Parallelism) methods will be supported in the future.
Figure 2 illustrates the 1P2D setup, where each instance has a TP (Tensor Parallelism) degree of 2. There are a total of 7 NCCL groups: three vLLM instances each have one NCCL group with TP=2. Additionally, the 0th GPU card of the P instance establishes an NCCL group with the 0th GPU card of each D instance. Similarly, the 1st GPU card of the P instance establishes an NCCL group with the 1st GPU card of each D instance.
+
+![image2](https://github.com/user-attachments/assets/837e61d6-365e-4cbf-8640-6dd7ab295b36)
+
+Each NCCL group occupies a certain amount of GPU memory buffer for communication, the size of which is primarily influenced by the `NCCL_MAX_NCHANNELS` environment variable. When `NCCL_MAX_NCHANNELS=16`, an NCCL group typically occupies 100MB, while when `NCCL_MAX_NCHANNELS=8`, it usually takes up 52MB. For large-scale xPyD configurations—such as DeepSeek's 96P144D—this implementation is currently not feasible. Moving forward, we are considering using RDMA for point-to-point communication and are also keeping an eye on UCCL.
+
+## GPU Memory Buffer and Tensor Memory Pool
+
+The trade-off in the size of the memory buffer is as follows: For P instances, the memory buffer is not required in PUT and PUT_ASYNC modes, but it is necessary in GET mode. For D instances, a memory buffer is needed in all three modes. The memory buffer for D instances should not be too large. Similarly, for P instances in GET mode, the memory buffer should also not be too large. The memory buffer of D instances is used to temporarily store KVcache sent by P instances. If it is too large, it will reduce the KVcache space available for normal inference by D instances, thereby decreasing the inference batch size and ultimately leading to a reduction in output throughput. The size of the memory buffer is configured by the parameter `kv_buffer_size`, measured in bytes, and is typically set to 5%~10% of the memory size.
+
+If the `--max-num-seqs` parameter for P instances is set to a large value, due to the large batch size, P instances will generate a large amount of KVcache simultaneously. This may exceed the capacity of the memory buffer of D instances, resulting in KVcache loss. Once KVcache is lost, D instances need to recompute Prefill, which is equivalent to performing Prefill twice. Consequently, the time-to-first-token (TTFT) will significantly increase, leading to degraded performance.
+
+To address the above issues, I have designed and developed a local Tensor memory pool for storing KVcache, inspired by the buddy system used in Linux memory modules. Since the memory is sufficiently large, typically in the TB range on servers, there is no need to consider prefix caching or using block-based designs to reuse memory, thereby saving space. When the memory buffer is insufficient, KVcache can be directly stored in the Tensor memory pool, and D instances can subsequently retrieve KVcache from it. The read and write speed is that of PCIe, with PCIe 4.0 having a speed of approximately 21 GB/s, which is usually faster than the Prefill speed; otherwise, solutions like Mooncake and lmcache would not be necessary. The Tensor memory pool acts as a flood diversion area, typically unused except during sudden traffic surges. In the worst-case scenario, my solution performs no worse than the normal situation with a Cache store.
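To make the pool's role concrete, the sketch below mirrors the usage example from the `TensorMemoryPool` docstring added earlier in this PR; the block size is illustrative, and in the connector the stored tensors are received KV-cache blocks rather than random data:

```python
import torch

from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import (
    TensorMemoryPool)

# Pinned-host buddy allocator; block sizes are rounded to powers of two.
pool = TensorMemoryPool(max_block_size=1024 * 1024)  # bytes

kv = torch.randn(100, device='cuda')   # stand-in for a received KV cache tensor
addr = pool.store_tensor(kv)           # spill the CUDA tensor into pinned host memory
restored = pool.load_tensor(addr, kv.dtype, kv.shape, 'cuda')  # bring it back to the GPU
pool.free(addr)                        # return the block to the pool's free lists
```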
+
 # Install vLLM

```shell
From cad61b815f4f6f2b1b8156bf252c2fc68c846e7d Mon Sep 17 00:00:00 2001
From: Abatom
Date: Wed, 18 Jun 2025 09:57:05 +0800
Subject: [PATCH 153/155] Add Test data

Signed-off-by: Abatom
---
 docs/design/v1/p2p_nccl_connector.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md
index ce7fc8fab1b7..06703f3de614 100644
--- a/docs/design/v1/p2p_nccl_connector.md
+++ b/docs/design/v1/p2p_nccl_connector.md
@@ -311,3 +311,27 @@ python3 benchmark_serving.py \
 ```shell
 pgrep python | xargs kill -9 && pkill -f python
 ```
+
+# Test data
+
+## **Scenario 1**: 1K input & 1K output tokens, E2E P99 latency ~20s
+- **1P5D (6×A800) vs vLLM (1×A800)**:
+  - Throughput ↑7.2% (1085 → 6979/6)
+  - ITL (P99) ↓81.3% (120ms → 22.9ms)
+  - TTFT (P99) ↑26.8% (175ms → 222ms)
+  - TPOT: No change
+
+- **1P6D (7×A800) vs vLLM (1×A800)**:
+  - Throughput ↑9.6% (1085 → 8329/7)
+  - ITL (P99) ↓81.0% (120ms → 22.7ms)
+  - TTFT (P99) ↑210% (175ms →543ms)
+  - TPOT: No change
+
+## **Scenario 2**: 1K input & 200 output tokens, E2E P99 latency ~4s
+- **1P1D (2×A800) vs vLLM (1×A800)**:
+  - Throughput ↑37.4% (537 → 1476/2)
+  - ITL (P99) ↓81.8% (127ms → 23.1ms)
+  - TTFT (P99) ↑41.8% (160ms → 227ms)
+  - TPOT: No change
+
+![testdata](https://github.com/user-attachments/assets/f791bfc7-9f3d-4e5c-9171-a42f9f4da627)
\ No newline at end of file

From 4fd955a1125835d650c1f727b9b23899444942d4 Mon Sep 17 00:00:00 2001
From: Abatom
Date: Wed, 18 Jun 2025 10:06:03 +0800
Subject: [PATCH 154/155] format

Signed-off-by: Abatom
---
 docs/design/v1/p2p_nccl_connector.md | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md
index 06703f3de614..b4ba12d571c8 100644
--- a/docs/design/v1/p2p_nccl_connector.md
+++ b/docs/design/v1/p2p_nccl_connector.md
@@ -314,24 +314,24 @@ pgrep python | xargs kill -9 && pkill -f python
 
 # Test data
 
-## **Scenario 1**: 1K input & 1K output tokens, E2E P99 latency ~20s
-- **1P5D (6×A800) vs vLLM (1×A800)**:
-  - Throughput ↑7.2% (1085 → 6979/6)
-  - ITL (P99) ↓81.3% (120ms → 22.9ms)
+## **Scenario 1**: 1K input & 1K output tokens, E2E P99 latency ~20s
+- **1P5D (6×A800) vs vLLM (1×A800)**:
+  - Throughput ↑7.2% (1085 → 6979/6)
+  - ITL (P99) ↓81.3% (120ms → 22.9ms)
   - TTFT (P99) ↑26.8% (175ms → 222ms)
-  - TPOT: No change
-
-- **1P6D (7×A800) vs vLLM (1×A800)**:
-  - Throughput ↑9.6% (1085 → 8329/7)
-  - ITL (P99) ↓81.0% (120ms → 22.7ms)
-  - TTFT (P99) ↑210% (175ms →543ms)
-  - TPOT: No change
-
-## **Scenario 2**: 1K input & 200 output tokens, E2E P99 latency ~4s
-- **1P1D (2×A800) vs vLLM (1×A800)**:
-  - Throughput ↑37.4% (537 → 1476/2)
-  - ITL (P99) ↓81.8% (127ms → 23.1ms)
-  - TTFT (P99) ↑41.8% (160ms → 227ms)
   - TPOT: No change
 
-![testdata](https://github.com/user-attachments/assets/f791bfc7-9f3d-4e5c-9171-a42f9f4da627)
\ No newline at end of file
+- **1P6D (7×A800) vs vLLM (1×A800)**:
+  - Throughput ↑9.6% (1085 → 8329/7)
+  - ITL (P99) ↓81.0% (120ms → 22.7ms)
+  - TTFT (P99) ↑210% (175ms →543ms)
+  - TPOT: No change
+
+## **Scenario 2**: 1K input & 200 output tokens, E2E P99 latency ~4s
+- **1P1D (2×A800) vs vLLM (1×A800)**:
+  - Throughput ↑37.4% (537 → 1476/2)
+  - ITL (P99) ↓81.8% (127ms → 23.1ms)
+  - TTFT (P99) ↑41.8% (160ms → 227ms)
+  - TPOT: No change
+
+![testdata](https://github.com/user-attachments/assets/f791bfc7-9f3d-4e5c-9171-a42f9f4da627)

From 9d52eeaa3e207bde842fcb4b038a485ab0f61b34 Mon Sep 17 00:00:00 2001
From: Abatom
Date: Wed, 18 Jun 2025 10:12:59 +0800
Subject: [PATCH 155/155] format

Signed-off-by: Abatom
---
 docs/design/v1/p2p_nccl_connector.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/design/v1/p2p_nccl_connector.md b/docs/design/v1/p2p_nccl_connector.md
index b4ba12d571c8..c24b53763709 100644
--- a/docs/design/v1/p2p_nccl_connector.md
+++ b/docs/design/v1/p2p_nccl_connector.md
@@ -318,7 +318,7 @@ pgrep python | xargs kill -9 && pkill -f python
 - **1P5D (6×A800) vs vLLM (1×A800)**:
   - Throughput ↑7.2% (1085 → 6979/6)
   - ITL (P99) ↓81.3% (120ms → 22.9ms)
-  - TTFT (P99) ↑26.8% (175ms → 222ms)
+  - TTFT (P99) ↑26.8% (175ms → 222ms)
   - TPOT: No change
 
 - **1P6D (7×A800) vs vLLM (1×A800)**: