From 4bb98f2190aaf408cb063df5184829fb54ee5f81 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Thu, 26 Sep 2024 07:45:30 -0700 Subject: [PATCH 001/199] [Misc] Update config loading for Qwen2-VL and remove Granite (#8837) --- docs/source/models/supported_models.rst | 11 +- vllm/model_executor/models/granite.py | 2 +- vllm/model_executor/models/qwen2_vl.py | 5 +- vllm/transformers_utils/config.py | 12 +- vllm/transformers_utils/configs/__init__.py | 8 +- vllm/transformers_utils/configs/granite.py | 199 -------------------- vllm/transformers_utils/configs/qwen2vl.py | 131 +++++++++++++ 7 files changed, 144 insertions(+), 224 deletions(-) delete mode 100644 vllm/transformers_utils/configs/granite.py create mode 100644 vllm/transformers_utils/configs/qwen2vl.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index c807617a2c10..c41903f84910 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -280,7 +280,7 @@ Multimodal Language Models - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. - * - :code:`Qwen2VLForConditionalGeneration` - - Qwen2-VL (see note) + - Qwen2-VL - Image\ :sup:`+` / Video\ :sup:`+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - @@ -297,15 +297,6 @@ Multimodal Language Models For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 -.. note:: - For :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now. - This can be installed by running the following command: - - .. code-block:: bash - - pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830 - ----- If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. 
Otherwise, please refer to :ref:`Adding a New Model ` and :ref:`Enabling Multimodal Inputs ` diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 5f365bbc3067..d4853fd79009 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -25,6 +25,7 @@ import torch from torch import nn +from transformers import GraniteConfig from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig, LoRAConfig @@ -48,7 +49,6 @@ default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.granite import GraniteConfig from vllm.utils import is_hip from .interfaces import SupportsLoRA diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 889ebc6c2e1f..f895e693b710 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -31,12 +31,9 @@ import torch.nn.functional as F from einops import rearrange, repeat from PIL import Image -from transformers import Qwen2VLConfig from transformers.image_utils import (get_image_size, infer_channel_dimension_format, to_numpy_array) -from transformers.models.qwen2_vl.configuration_qwen2_vl import ( - Qwen2VLVisionConfig) from transformers.models.qwen2_vl.image_processing_qwen2_vl import ( make_batched_images, make_batched_videos, smart_resize) @@ -66,6 +63,8 @@ from vllm.multimodal.image import cached_get_image_processor from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors, SequenceData +from vllm.transformers_utils.configs.qwen2vl import (Qwen2VLConfig, + Qwen2VLVisionConfig) from vllm.transformers_utils.processor import get_processor from vllm.utils import is_cpu diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 3871c0cb8b81..0f20e8d0c821 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -20,10 +20,10 @@ # yapf: disable from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig, EAGLEConfig, ExaoneConfig, - GraniteConfig, InternVLChatConfig, - JAISConfig, MedusaConfig, - MllamaConfig, MLPSpeculatorConfig, - MPTConfig, NemotronConfig, + InternVLChatConfig, JAISConfig, + MedusaConfig, MllamaConfig, + MLPSpeculatorConfig, MPTConfig, + NemotronConfig, Qwen2VLConfig, RWConfig, SolarConfig, UltravoxConfig) # yapf: enable @@ -57,9 +57,7 @@ "nemotron": NemotronConfig, "solar": SolarConfig, "ultravox": UltravoxConfig, - # Granite can be removed from here once we have upgraded to - # transformers 4.45+ - "granite": GraniteConfig, + "qwen2_vl": Qwen2VLConfig, **_CONFIG_REGISTRY_OVERRIDE_HF } diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index d5b13adb58a0..462cd964325d 100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -6,7 +6,6 @@ # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the # `FalconConfig` class from the official HuggingFace transformers library. 
from vllm.transformers_utils.configs.falcon import RWConfig -from vllm.transformers_utils.configs.granite import GraniteConfig from vllm.transformers_utils.configs.internvl import InternVLChatConfig from vllm.transformers_utils.configs.jais import JAISConfig from vllm.transformers_utils.configs.medusa import MedusaConfig @@ -14,6 +13,8 @@ from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig +from vllm.transformers_utils.configs.qwen2vl import (Qwen2VLConfig, + Qwen2VLVisionConfig) from vllm.transformers_utils.configs.solar import SolarConfig from vllm.transformers_utils.configs.ultravox import UltravoxConfig @@ -32,7 +33,6 @@ "NemotronConfig", "SolarConfig", "UltravoxConfig", - # Granite can be removed from here once we have upgraded to - # transformers 4.45+ - "GraniteConfig", + "Qwen2VLConfig", + "Qwen2VLVisionConfig", ] diff --git a/vllm/transformers_utils/configs/granite.py b/vllm/transformers_utils/configs/granite.py deleted file mode 100644 index c12838be5d38..000000000000 --- a/vllm/transformers_utils/configs/granite.py +++ /dev/null @@ -1,199 +0,0 @@ -# coding=utf-8 -# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Granite model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_rope_utils import rope_config_validation -from transformers.utils import logging - -logger = logging.get_logger(__name__) - - -class GraniteConfig(PretrainedConfig): - r""" - This is the configuration class to store the configuration of - a [`GraniteModel`]. It is used to instantiate an Granite - model according to the specified arguments, defining the model architecture. - Instantiating a configuration with the defaults will yield a similar - configuration to that of the Granite-3B. - - Configuration objects inherit from [`PretrainedConfig`] and can be used to - control the model outputs. Read the documentation from [`PretrainedConfig`] - for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 32000): - Vocabulary size of the Granite model. Defines the number of - different tokens that can be represented by the `inputs_ids` - passed when calling [`GraniteModel`] - hidden_size (`int`, *optional*, defaults to 4096): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 11008): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 32): - Number of hidden layers in the Transformer decoder. 
- num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the - Transformer decoder. - num_key_value_heads (`int`, *optional*): - This is the number of key_value heads that should be used to - implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi - Head Attention (MHA), if `num_key_value_heads=1` the model will use - Multi Query Attention (MQA) otherwise GQA is used. When converting - a multi-head checkpoint to a GQA checkpoint, each group key and - value head should be constructed by meanpooling all the original - heads within that group. For more details checkout - [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not - specified, will default to `num_attention_heads`. - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the - decoder. - max_position_embeddings (`int`, *optional*, defaults to 2048): - The maximum sequence length that this model might ever be used with. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for - initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values - attentions (not used by all models). Only relevant if - `config.is_decoder=True`. - pad_token_id (`int`, *optional*): - Padding token id. - bos_token_id (`int`, *optional*, defaults to 1): - Beginning of stream token id. - eos_token_id (`int`, *optional*, defaults to 2): - End of stream token id. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether to tie weight embeddings - rope_theta (`float`, *optional*, defaults to 10000.0): - The base period of the RoPE embeddings. - rope_scaling (`Dict`, *optional*): - Dictionary containing the scaling configuration for the RoPE - embeddings. Currently supports two scaling strategies: linear and - dynamic. Their scaling factor must be a float greater than 1. The - expected format is - `{"type": strategy name, "factor": scaling factor}`. - When using this flag, don't update `max_position_embeddings` to - the expected new maximum. See the following thread for more - information on how these scaling strategies behave: - https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. - This is an experimental feature, subject to breaking API changes - in future versions. - attention_bias (`bool`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value and output - projection layers during self-attention. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - mlp_bias (`bool`, *optional*, defaults to `False`): - Whether to use a bias in up_proj, down_proj and gate_proj layers - in the MLP layers. 
- embedding_multiplier (`float`, *optional*, defaults to 1.0): - embedding multiplier - logits_scaling (`float`, *optional*, defaults to 1.0): - divisor for output logits - residual_multiplier (`float`, *optional*, defaults to 1.0): - residual multiplier - attention_multiplier (`float`, *optional*, defaults to 1.0): - attention multiplier - - ```python - >>> from transformers import GraniteModel, GraniteConfig - - >>> # Initializing a Granite granite-3b style configuration - >>> configuration = GraniteConfig() - - >>> # Initializing a model from the granite-7b style configuration - >>> model = GraniteModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "granite" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vocab_size=32000, - hidden_size=4096, - intermediate_size=11008, - num_hidden_layers=32, - num_attention_heads=32, - num_key_value_heads=None, - hidden_act="silu", - max_position_embeddings=2048, - initializer_range=0.02, - rms_norm_eps=1e-6, - use_cache=True, - pad_token_id=None, - bos_token_id=1, - eos_token_id=2, - tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, - attention_bias=False, - attention_dropout=0.0, - mlp_bias=False, - embedding_multiplier=1.0, - logits_scaling=1.0, - residual_multiplier=1.0, - attention_multiplier=1.0, - **kwargs, - ): - self.vocab_size = vocab_size - self.max_position_embeddings = max_position_embeddings - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - - # for backward compatibility - if num_key_value_heads is None: - num_key_value_heads = num_attention_heads - - self.num_key_value_heads = num_key_value_heads - self.hidden_act = hidden_act - self.initializer_range = initializer_range - self.rms_norm_eps = rms_norm_eps - self.use_cache = use_cache - self.rope_theta = rope_theta - self.rope_scaling = rope_scaling - self.attention_bias = attention_bias - self.attention_dropout = attention_dropout - self.mlp_bias = mlp_bias - - self.embedding_multiplier = embedding_multiplier - self.logits_scaling = logits_scaling - self.residual_multiplier = residual_multiplier - self.attention_multiplier = attention_multiplier - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - - rope_config_validation(self) diff --git a/vllm/transformers_utils/configs/qwen2vl.py b/vllm/transformers_utils/configs/qwen2vl.py new file mode 100644 index 000000000000..92dd962790bc --- /dev/null +++ b/vllm/transformers_utils/configs/qwen2vl.py @@ -0,0 +1,131 @@ +# coding=utf-8 +# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Qwen2VL model configuration""" + +import os +from typing import Union + +from transformers import PretrainedConfig + + +class Qwen2VLVisionConfig(PretrainedConfig): + model_type = "qwen2_vl" + + def __init__( + self, + depth=32, + embed_dim=1280, + hidden_size=3584, + hidden_act="quick_gelu", + mlp_ratio=4, + num_heads=16, + in_channels=3, + patch_size=14, + spatial_merge_size=2, + temporal_patch_size=2, + **kwargs, + ): + super().__init__(**kwargs) + + self.depth = depth + self.embed_dim = embed_dim + self.hidden_size = hidden_size + self.hidden_act = hidden_act + self.mlp_ratio = mlp_ratio + self.num_heads = num_heads + self.in_channels = in_channels + self.patch_size = patch_size + self.spatial_merge_size = spatial_merge_size + self.temporal_patch_size = temporal_patch_size + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, + os.PathLike], + **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict( + pretrained_model_name_or_path, **kwargs) + + if config_dict.get("model_type") == "qwen2_vl": + config_dict = config_dict["vision_config"] + + return cls.from_dict(config_dict, **kwargs) + + +class Qwen2VLConfig(PretrainedConfig): + + def __init__( + self, + vocab_size=152064, + hidden_size=8192, + intermediate_size=29568, + num_hidden_layers=80, + num_attention_heads=64, + num_key_value_heads=8, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-05, + use_cache=True, + tie_word_embeddings=False, + rope_theta=1000000.0, + use_sliding_window=False, + sliding_window=4096, + max_window_layers=80, + attention_dropout=0.0, + vision_config=None, + rope_scaling=None, + **kwargs, + ): + if isinstance(vision_config, dict): + self.vision_config = Qwen2VLVisionConfig(**vision_config) + elif vision_config is None: + self.vision_config = Qwen2VLVisionConfig() + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.max_window_layers = max_window_layers + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + self.rope_scaling = rope_scaling + + # NOTE: the following section from original transformers config + # for Qwen2-VL is commented out to address rope config loading issue + # + # if self.rope_scaling is not None and "type" in self.rope_scaling: + # if self.rope_scaling["type"] == "mrope": + # self.rope_scaling["type"] = "default" + # self.rope_scaling["rope_type"] = self.rope_scaling["type"] + # rope_config_validation(self) + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) From f70bccac75a0aecc0a5fc934859158a3e1f019a5 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 26 Sep 2024 13:07:18 -0400 Subject: [PATCH 002/199] [Build/CI] Upgrade to gcc 10 in the base build Docker image (#8814) --- Dockerfile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Dockerfile b/Dockerfile index 6bb4bd032c39..0b06c74fc58c 
100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,6 +27,14 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version +# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 +# as it was causing spam when compiling the CUTLASS kernels +RUN apt-get install -y gcc-10 g++-10 +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 +RUN < Date: Thu, 26 Sep 2024 14:02:52 -0400 Subject: [PATCH 003/199] [Docs] Add README to the build docker image (#8825) --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 0b06c74fc58c..872b1bc47054 100644 --- a/Dockerfile +++ b/Dockerfile @@ -75,6 +75,7 @@ COPY csrc csrc COPY setup.py setup.py COPY cmake cmake COPY CMakeLists.txt CMakeLists.txt +COPY README.md README.md COPY requirements-common.txt requirements-common.txt COPY requirements-cuda.txt requirements-cuda.txt COPY pyproject.toml pyproject.toml From 68988d4e0d8765901c51f07f9bfbda58f35f6f63 Mon Sep 17 00:00:00 2001 From: fyuan1316 Date: Fri, 27 Sep 2024 02:04:39 +0800 Subject: [PATCH 004/199] [CI/Build] Fix missing ci dependencies (#8834) --- .github/workflows/scripts/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index cd617e9f19fb..cda0c28c75c2 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -8,7 +8,7 @@ PATH=${cuda_home}/bin:$PATH LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH # Install requirements -$python_executable -m pip install wheel packaging +$python_executable -m pip install wheel packaging 'setuptools-scm>=8' $python_executable -m pip install -r requirements-cuda.txt # Limit the number of parallel jobs to avoid OOM From 70de39f6b46f6b90aecba52358825127a50b3921 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 26 Sep 2024 13:19:04 -0700 Subject: [PATCH 005/199] [misc][installation] build from source without compilation (#8818) --- docs/source/getting_started/installation.rst | 34 ++++++++++-- python_only_dev.py | 54 ++++++++++++++++++++ 2 files changed, 85 insertions(+), 3 deletions(-) create mode 100644 python_only_dev.py diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index afae6e655602..bdde3e933b18 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -58,13 +58,41 @@ You can install vLLM using pip: $ # export VLLM_COMMIT=... $ # pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl +Build from source (without compilation) +--------------------------------------- + +If you want to develop vLLM, and you only need to change the Python code, you can build vLLM without compilation. + +The first step is to follow the previous instructions to install the latest vLLM wheel: + +.. code-block:: console + + $ export VLLM_VERSION=0.6.1.post1 + $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl + +After verifying that the installation is successful, we have a script for you to copy and link directories, so that you can edit the Python code directly: + +.. 
code-block:: console + + $ git clone https://github.com/vllm-project/vllm.git + $ cd vllm + $ python python_only_dev.py + +It will: + +- Find the installed vLLM in the current environment. +- Copy built files to the current directory. +- Rename the installed vLLM +- Symbolically link the current directory to the installed vLLM. + +This way, you can edit the Python code in the current directory, and the changes will be reflected in the installed vLLM. .. _build_from_source: -Build from source ------------------ +Build from source (with compilation) +------------------------------------ -You can also build and install vLLM from source: +If you need to touch the C++ or CUDA code, you need to build vLLM from source: .. code-block:: console diff --git a/python_only_dev.py b/python_only_dev.py new file mode 100644 index 000000000000..d84122280a3c --- /dev/null +++ b/python_only_dev.py @@ -0,0 +1,54 @@ +# enable python only development +# copy compiled files to the current directory directly + +import os +import shutil +import subprocess +import sys + +# cannot directly `import vllm` , because it will try to +# import from the current directory +output = subprocess.run([sys.executable, "-m", "pip", "show", "vllm"], + capture_output=True) + +assert output.returncode == 0, "vllm is not installed" + +text = output.stdout.decode("utf-8") + +package_path = None +for line in text.split("\n"): + if line.startswith("Location: "): + package_path = line.split(": ")[1] + break + +assert package_path is not None, "could not find package path" + +cwd = os.getcwd() + +assert cwd != package_path, "should not import from the current directory" + +files_to_copy = [ + "vllm/_C.abi3.so", + "vllm/_core_C.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", + "vllm/vllm_flash_attn/flash_attn_interface.py", + "vllm/vllm_flash_attn/__init__.py", + # "vllm/_version.py", # not available in nightly wheels yet +] + +for file in files_to_copy: + src = os.path.join(package_path, file) + dst = file + print(f"Copying {src} to {dst}") + shutil.copyfile(src, dst) + +pre_built_vllm_path = os.path.join(package_path, "vllm") +tmp_path = os.path.join(package_path, "vllm_pre_built") +current_vllm_path = os.path.join(cwd, "vllm") + +print(f"Renaming {pre_built_vllm_path} to {tmp_path}") +os.rename(pre_built_vllm_path, tmp_path) + +print(f"linking {current_vllm_path} to {pre_built_vllm_path}") +os.symlink(current_vllm_path, pre_built_vllm_path) From d9cfbc891e2e1d62d74c7aae93bde436a29bd574 Mon Sep 17 00:00:00 2001 From: "Kevin H. 
Luu" Date: Thu, 26 Sep 2024 15:02:16 -0700 Subject: [PATCH 006/199] [ci] Soft fail Entrypoints, Samplers, LoRA, Decoder-only VLM (#8872) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ea8b3d46f1b3..b4226a3ca574 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -83,6 +83,7 @@ steps: - label: Entrypoints Test # 20min working_dir: "/vllm-workspace/tests" + soft_fail: true fast_check: true mirror_hardwares: [amd] source_file_dependencies: @@ -177,6 +178,7 @@ steps: - pytest -v -s prefix_caching - label: Samplers Test # 18min + soft_fail: true source_file_dependencies: - vllm/model_executor/layers - vllm/sampling_metadata.py @@ -204,6 +206,7 @@ steps: - label: LoRA Test %N # 30min each mirror_hardwares: [amd] + soft_fail: true source_file_dependencies: - vllm/lora - tests/lora @@ -308,6 +311,7 @@ steps: - pytest -v -s models/decoder_only/language - label: Decoder-only Multi-Modal Models Test # 56min + soft_fail: true #mirror_hardwares: [amd] source_file_dependencies: - vllm/ From 93d364da3406f5523e5e4772ffbc3c72dac7bbf4 Mon Sep 17 00:00:00 2001 From: Pernekhan Utemuratov Date: Thu, 26 Sep 2024 15:47:00 -0700 Subject: [PATCH 007/199] [Bugfix] Include encoder prompts len to non-stream api usage response (#8861) --- vllm/entrypoints/openai/serving_chat.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 94076ea3a51d..254671ef4486 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -726,6 +726,8 @@ async def chat_completion_full_generator( assert final_res.prompt_token_ids is not None num_prompt_tokens = len(final_res.prompt_token_ids) + if final_res.encoder_prompt_token_ids is not None: + num_prompt_tokens += len(final_res.encoder_prompt_token_ids) num_generated_tokens = sum( len(output.token_ids) for output in final_res.outputs) usage = UsageInfo( From b28d2104dea6ba80c0f1f6c4596b5703d7ef923d Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 26 Sep 2024 19:18:14 -0400 Subject: [PATCH 008/199] [Misc] Change dummy profiling and BOS fallback warns to log once (#8820) --- vllm/inputs/preprocess.py | 14 ++++++++------ vllm/inputs/registry.py | 8 ++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index bee3d1ed75cb..6d54a07e92cc 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -8,6 +8,7 @@ from vllm.lora.request import LoRARequest from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup +from vllm.utils import print_warning_once from .data import (EncoderDecoderLLMInputs, LLMInputs, PromptInputs, SingletonPromptInputs) @@ -71,20 +72,21 @@ def get_decoder_start_token_id(self) -> Optional[int]: ''' if not self.is_encoder_decoder_model(): - logger.warning("Using None for decoder start token id because " - "this is not an encoder/decoder model.") + print_warning_once("Using None for decoder start token id because " + "this is not an encoder/decoder model.") return None if (self.model_config is None or self.model_config.hf_config is None): - logger.warning("Using None for decoder start token id because " - "model config is not available.") + print_warning_once("Using None for decoder start token id because " + "model config is not 
available.") return None dec_start_token_id = getattr(self.model_config.hf_config, 'decoder_start_token_id', None) if dec_start_token_id is None: - logger.warning("Falling back on for decoder start token id " - "because decoder start token id is not available.") + print_warning_once("Falling back on for decoder start token " + "id because decoder start token id is not " + "available.") dec_start_token_id = self.get_bos_token_id() return dec_start_token_id diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 159d958ebf67..e494ee122430 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -9,7 +9,7 @@ from typing_extensions import TypeVar from vllm.logger import init_logger -from vllm.utils import get_allowed_kwarg_only_overrides +from vllm.utils import get_allowed_kwarg_only_overrides, print_warning_once from .data import LLMInputs @@ -235,9 +235,9 @@ def dummy_data_for_profiling( num_tokens = seq_data.prompt_token_ids if len(num_tokens) < seq_len: if is_encoder_data: - logger.warning( - "Expected at least %d dummy encoder tokens for profiling, " - "but found %d tokens instead.", seq_len, len(num_tokens)) + print_warning_once( + f"Expected at least {seq_len} dummy encoder tokens for " + f"profiling, but found {len(num_tokens)} tokens instead.") else: raise AssertionError( f"Expected at least {seq_len} dummy tokens for profiling, " From e2f6f26e8636b8a23e5c0cda533a70c40ade01ec Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 26 Sep 2024 19:18:26 -0400 Subject: [PATCH 009/199] [Bugfix] Fix print_warning_once's line info (#8867) --- vllm/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/utils.py b/vllm/utils.py index b73e3b9bbf68..a0d2a7e50fc6 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -744,7 +744,8 @@ def create_kv_caches_with_random( @lru_cache def print_warning_once(msg: str) -> None: - logger.warning(msg) + # Set the stacklevel to 2 to print the caller's line info + logger.warning(msg, stacklevel=2) @lru_cache(maxsize=None) From ee2da3e9efb38add804e2023d47e9f42f38bd638 Mon Sep 17 00:00:00 2001 From: Chirag Jain Date: Fri, 27 Sep 2024 04:53:17 +0530 Subject: [PATCH 010/199] fix validation: Only set tool_choice `auto` if at least one tool is provided (#8568) --- ...est_chat_completion_request_validations.py | 71 +++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 2 +- 2 files changed, 72 insertions(+), 1 deletion(-) create mode 100644 tests/tool_use/test_chat_completion_request_validations.py diff --git a/tests/tool_use/test_chat_completion_request_validations.py b/tests/tool_use/test_chat_completion_request_validations.py new file mode 100644 index 000000000000..3d0fe8f06089 --- /dev/null +++ b/tests/tool_use/test_chat_completion_request_validations.py @@ -0,0 +1,71 @@ +import pytest + +from vllm.entrypoints.openai.protocol import ChatCompletionRequest + + +def test_chat_completion_request_with_no_tools(): + # tools key is not present + request = ChatCompletionRequest.model_validate({ + 'messages': [{ + 'role': 'user', + 'content': 'Hello' + }], + 'model': + 'facebook/opt-125m', + }) + assert request.tool_choice == 'none' + + # tools key is None + request = ChatCompletionRequest.model_validate({ + 'messages': [{ + 'role': 'user', + 'content': 'Hello' + }], + 'model': + 'facebook/opt-125m', + 'tools': + None + }) + assert request.tool_choice == 'none' + + # tools key present but empty + request = ChatCompletionRequest.model_validate({ + 'messages': [{ + 'role': 'user', + 'content': 'Hello' + }], + 
'model': + 'facebook/opt-125m', + 'tools': [] + }) + assert request.tool_choice == 'none' + + +def test_chat_completion_request_with_tool_choice_but_no_tools(): + with pytest.raises(ValueError, + match="When using `tool_choice`, `tools` must be set."): + ChatCompletionRequest.model_validate({ + 'messages': [{ + 'role': 'user', + 'content': 'Hello' + }], + 'model': + 'facebook/opt-125m', + 'tool_choice': + 'auto' + }) + + with pytest.raises(ValueError, + match="When using `tool_choice`, `tools` must be set."): + ChatCompletionRequest.model_validate({ + 'messages': [{ + 'role': 'user', + 'content': 'Hello' + }], + 'model': + 'facebook/opt-125m', + 'tool_choice': + 'auto', + 'tools': + None + }) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 40d27f984fba..646aa4537999 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -386,7 +386,7 @@ def check_tool_usage(cls, data): # if "tool_choice" is not specified but tools are provided, # default to "auto" tool_choice - if "tool_choice" not in data and "tools" in data: + if "tool_choice" not in data and data.get("tools"): data["tool_choice"] = "auto" # if "tool_choice" is specified -- validation From 71d21c73abfb9b12ea402ce6b11c1b8e31eddf4c Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 26 Sep 2024 19:23:45 -0400 Subject: [PATCH 011/199] [Bugfix] Fixup advance_step.cu warning (#8815) --- csrc/prepare_inputs/advance_step.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index a9d08ca0dc14..1f3f4710735e 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -211,7 +211,7 @@ void advance_step_flashinfer( printf(" num_seqs = %d\n", num_seqs); printf(" num_queries = %d\n", num_queries); printf(" block_size = %d\n", block_size); - printf(" block_tables.stride(0) = %d\n", block_tables.stride(0)); + printf(" block_tables.stride(0) = %zu\n", block_tables.stride(0)); } // Verify all tensors verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong); @@ -303,4 +303,4 @@ void advance_step_flashinfer( num_seqs, num_queries, block_size, input_tokens, sampled_token_ids, input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices, paged_kv_indptr, paged_kv_last_page_len, block_table_bound); -} \ No newline at end of file +} From 4b377d6febed7ddd964f1b96079d7e78c231325e Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Fri, 27 Sep 2024 00:46:43 +0100 Subject: [PATCH 012/199] [BugFix] Fix test breakages from transformers 4.45 upgrade (#8829) --- .buildkite/test-pipeline.yaml | 9 +++---- tests/conftest.py | 1 - tests/distributed/test_pipeline_parallel.py | 7 ----- tests/engine/test_custom_executor.py | 8 +++--- tests/entrypoints/openai/test_serving_chat.py | 6 +++++ tests/lora/test_tokenizer_group.py | 4 +-- .../decoder_only/language/test_granite.py | 4 --- .../vision_language/test_llava_next_video.py | 5 ---- .../vision_language/test_llava_onevision.py | 13 ++++------ tests/models/test_registry.py | 6 ----- tests/samplers/test_sampler.py | 18 ++++++++++--- vllm/entrypoints/openai/serving_chat.py | 4 +-- vllm/transformers_utils/tokenizer.py | 26 ++++++++++++++++++- 13 files changed, 62 insertions(+), 49 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b4226a3ca574..d9dcacf5d991 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -83,7 +83,6 @@ 
steps: - label: Entrypoints Test # 20min working_dir: "/vllm-workspace/tests" - soft_fail: true fast_check: true mirror_hardwares: [amd] source_file_dependencies: @@ -96,7 +95,8 @@ steps: - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests @@ -178,7 +178,6 @@ steps: - pytest -v -s prefix_caching - label: Samplers Test # 18min - soft_fail: true source_file_dependencies: - vllm/model_executor/layers - vllm/sampling_metadata.py @@ -206,7 +205,6 @@ steps: - label: LoRA Test %N # 30min each mirror_hardwares: [amd] - soft_fail: true source_file_dependencies: - vllm/lora - tests/lora @@ -311,7 +309,6 @@ steps: - pytest -v -s models/decoder_only/language - label: Decoder-only Multi-Modal Models Test # 56min - soft_fail: true #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -463,7 +460,7 @@ steps: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details - pytest -v -s distributed/test_custom_all_reduce.py - - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus - pytest -v -s -x lora/test_mixtral.py - label: LM Eval Large Models # optional diff --git a/tests/conftest.py b/tests/conftest.py index 354862e3579a..db71d8bc3af1 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -699,7 +699,6 @@ def generate_w_logprobs( if videos is not None: for i, video in enumerate(videos): inputs[i]["multi_modal_data"] = {"video": video} - print(f"[INPUTS!!!!]: {inputs}, {sampling_params}") req_outputs = self.model.generate(inputs, sampling_params=sampling_params) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 280a8abdd13a..9fd1368cc2b5 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -8,8 +8,6 @@ import os import pytest -from packaging import version -from transformers import __version__ as transformers_version from vllm.logger import init_logger @@ -49,11 +47,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, pytest.skip("Skipping multi-node pipeline parallel test for " "multiprocessing distributed backend") - # Skip tests that require transformers>=4.45.0 - if "Qwen2-VL" in MODEL_NAME and version.parse( - transformers_version) < version.parse("4.45.0.dev0"): - pytest.skip("This test requires transformers>=4.45.0") - pp_args = [ # use half precision for speed and memory savings in CI environment "--dtype", diff --git a/tests/engine/test_custom_executor.py b/tests/engine/test_custom_executor.py index bff0fc99ed02..bbabb936e92b 100644 --- a/tests/engine/test_custom_executor.py +++ b/tests/engine/test_custom_executor.py @@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model): @pytest.mark.parametrize("model", ["facebook/opt-125m"]) -def test_custom_executor(model, tmpdir): +def test_custom_executor(model, 
tmp_path): cwd = os.path.abspath(".") - os.chdir(tmpdir) + os.chdir(tmp_path) try: assert not os.path.exists(".marker") @@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir): @pytest.mark.parametrize("model", ["facebook/opt-125m"]) -def test_custom_executor_async(model, tmpdir): +def test_custom_executor_async(model, tmp_path): cwd = os.path.abspath(".") - os.chdir(tmpdir) + os.chdir(tmp_path) try: assert not os.path.exists(".marker") diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index db31745cc102..ec550fe82c70 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -15,6 +15,11 @@ BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)] +@dataclass +class MockHFConfig: + model_type: str = "any" + + @dataclass class MockModelConfig: tokenizer = MODEL_NAME @@ -24,6 +29,7 @@ class MockModelConfig: tokenizer_revision = None embedding_mode = False multimodal_config = MultiModalConfig() + hf_config = MockHFConfig() @dataclass diff --git a/tests/lora/test_tokenizer_group.py b/tests/lora/test_tokenizer_group.py index 2dcad23c2b54..daa39b2a3dba 100644 --- a/tests/lora/test_tokenizer_group.py +++ b/tests/lora/test_tokenizer_group.py @@ -41,7 +41,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type): lora_request) -def test_get_lora_tokenizer(sql_lora_files, tmpdir): +def test_get_lora_tokenizer(sql_lora_files, tmp_path): lora_request = None tokenizer = get_lora_tokenizer(lora_request) assert not tokenizer @@ -50,6 +50,6 @@ def test_get_lora_tokenizer(sql_lora_files, tmpdir): tokenizer = get_lora_tokenizer(lora_request) assert tokenizer.get_added_vocab() - lora_request = LoRARequest("1", 1, str(tmpdir)) + lora_request = LoRARequest("1", 1, str(tmp_path)) tokenizer = get_lora_tokenizer(lora_request) assert not tokenizer diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index e5c5ce4a8f74..0b71f0d49c70 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -3,7 +3,6 @@ Run `pytest tests/models/test_granite.py`. 
""" import pytest -import transformers from ...utils import check_logprobs_close @@ -12,9 +11,6 @@ ] -# GraniteForCausalLM will be in transformers >= 4.45 -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="granite model test requires transformers >= 4.45") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) diff --git a/tests/models/decoder_only/vision_language/test_llava_next_video.py b/tests/models/decoder_only/vision_language/test_llava_next_video.py index d477bcc71361..7b7b23c783e2 100644 --- a/tests/models/decoder_only/vision_language/test_llava_next_video.py +++ b/tests/models/decoder_only/vision_language/test_llava_next_video.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple, Type, overload import pytest -import transformers from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer from vllm.multimodal.utils import (rescale_video_size, resize_video, @@ -158,8 +157,6 @@ def run_test( ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "size_factors", @@ -203,8 +200,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors, ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "sizes", diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index d1bffddde59a..978631feacb8 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -1,7 +1,6 @@ from typing import List, Optional, Tuple, Type, overload import pytest -import transformers from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, BatchEncoding) @@ -166,8 +165,6 @@ def process(hf_inputs: BatchEncoding): ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "size_factors", @@ -211,8 +208,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors, ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "sizes", @@ -259,7 +254,9 @@ def run_image_test( # max_model_len should be greater than image_feature_size with vllm_runner(model, dtype=dtype, - max_model_len=32768, + max_num_seqs=1, + max_model_len=16384, + gpu_memory_utilization=0.98, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=True, @@ -305,8 +302,8 @@ def process(hf_inputs: BatchEncoding): ) -@pytest.mark.skipif(transformers.__version__ < "4.45", - reason="Waiting for next transformers release") +# FIXME: Swap to a smaller model for this architecture +@pytest.mark.skip(reason="Model OOMing on CI") @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index 4b9a1ca44c0d..b058e2755c24 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -1,15 +1,9 @@ import pytest -import 
transformers from vllm.model_executor.models import _MODELS, ModelRegistry @pytest.mark.parametrize("model_cls", _MODELS) def test_registry_imports(model_cls): - if (model_cls in ("LlavaOnevisionForConditionalGeneration", - "Qwen2VLForConditionalGeneration") - and transformers.__version__ < "4.45"): - pytest.skip("Waiting for next transformers release") - # Ensure all model classes can be imported successfully ModelRegistry.resolve_model_cls([model_cls]) diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py index 308b708feab7..3342a336a4ef 100644 --- a/tests/samplers/test_sampler.py +++ b/tests/samplers/test_sampler.py @@ -1,5 +1,6 @@ import itertools import random +from dataclasses import dataclass from typing import Dict, List, Optional, Tuple from unittest.mock import Mock, patch @@ -596,8 +597,19 @@ def test_sampler_top_k_top_p(seed: int, device: str): generation_config = GenerationConfig(top_k=top_k, top_p=top_p, do_sample=True) - warpers = generation_model._get_logits_warper(generation_config, device) - assert len(warpers) == 2 # top_p and top_k + + @dataclass + class MockConfig: + is_encoder_decoder: bool = False + + generation_model.config = MockConfig() # needed by the following method + generation_model._prepare_special_tokens(generation_config, device=device) + processors = generation_model._get_logits_processor(generation_config, + None, + None, + None, [], + device=device) + assert len(processors) == 2 # top_p and top_k seq_group_metadata_list: List[SequenceGroupMetadata] = [] seq_lens: List[int] = [] @@ -639,7 +651,7 @@ def mock_sample(probs, *args, **kwargs): assert sample_probs is not None - hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone()) + hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone()) hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5) assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 254671ef4486..8b51fc804ad9 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -152,13 +152,13 @@ async def create_chat_completion( **(request.chat_template_kwargs or {}), ) except Exception as e: - logger.error("Error in applying chat template from request: %s", e) + logger.exception("Error in applying chat template from request") return self.create_error_response(str(e)) try: mm_data = await mm_data_future except Exception as e: - logger.error("Error in loading multi-modal data: %s", e) + logger.exception("Error in loading multi-modal data") return self.create_error_response(str(e)) # validation for OpenAI tools diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 2a2d74382e37..e3b244d06660 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -1,6 +1,7 @@ import os import warnings from pathlib import Path +from types import MethodType from typing import Optional, Union import huggingface_hub @@ -152,6 +153,29 @@ def get_tokenizer( else: raise e + # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324 + if type(tokenizer).__name__ in ("ChatGLMTokenizer", + "ChatGLM4Tokenizer"): + assert isinstance(tokenizer, PreTrainedTokenizer) + orig_pad = tokenizer._pad + + # Patch _pad method to accept `padding_side` + def _pad( + self: PreTrainedTokenizer, + *args, + 
padding_side: Optional[str] = None, + **kwargs, + ): + if (padding_side is not None + and padding_side != self.padding_side): + msg = ("`padding_side` argument is not supported by " + "ChatGLMTokenizer and will be ignored.") + warnings.warn(msg, stacklevel=2) + + return orig_pad(*args, **kwargs) + + tokenizer._pad = MethodType(_pad, tokenizer) + if not isinstance(tokenizer, PreTrainedTokenizerFast): logger.warning( "Using a slow tokenizer. This might cause a significant " @@ -167,7 +191,7 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args, return None try: tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs) - except OSError as e: + except Exception as e: # No tokenizer was found in the LoRA folder, # use base model tokenizer logger.warning( From 1b49148e474d4d18731e159ea0460145ae52e220 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 27 Sep 2024 07:54:09 +0800 Subject: [PATCH 013/199] [Installation] Allow lower versions of FastAPI to maintain Ray 2.9 compatibility (#8764) --- requirements-common.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index 2fc89c026901..a9596878a0f8 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -7,8 +7,8 @@ py-cpuinfo transformers >= 4.45.0 # Required for Llama 3.2. tokenizers >= 0.19.1 # Required for Llama 3. protobuf # Required by LlamaTokenizer. -fastapi < 0.113.0; python_version < '3.9' -fastapi >= 0.114.1; python_version >= '3.9' +fastapi >= 0.107.0, < 0.113.0; python_version < '3.9' +fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9' aiohttp openai >= 1.40.0 # Ensure modern openai package (ensure types module present) uvicorn[standard] From 344cd2b6f4c22bf278cff96066001d216ec1fe82 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Thu, 26 Sep 2024 21:01:42 -0300 Subject: [PATCH 014/199] [Feature] Add support for Llama 3.1 and 3.2 tool use (#8343) Signed-off-by: Max de Bayser --- .../serving/openai_compatible_server.md | 26 +- .../tool_chat_template_llama3.1_json.jinja | 94 ++++++ .../tool_chat_template_llama3.2_json.jinja | 93 ++++++ tests/tool_use/test_chat_completions.py | 17 +- tests/tool_use/test_parallel_tool_calls.py | 18 +- tests/tool_use/utils.py | 71 ++++- vllm/entrypoints/openai/cli_args.py | 2 +- vllm/entrypoints/openai/serving_chat.py | 3 + .../openai/tool_parsers/__init__.py | 6 +- .../openai/tool_parsers/llama_tool_parser.py | 273 ++++++++++++++++++ 10 files changed, 576 insertions(+), 27 deletions(-) create mode 100644 examples/tool_chat_template_llama3.1_json.jinja create mode 100644 examples/tool_chat_template_llama3.2_json.jinja create mode 100644 vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index eb4ea0fb5655..e0eba7f09bd6 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -157,10 +157,10 @@ vLLM will use guided decoding to ensure the response matches the tool parameter To enable this feature, you should set the following flags: * `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. -* `--tool-call-parser` -- select the tool parser to use - currently either `hermes` or `mistral`. 
Additional tool parsers +* `--tool-call-parser` -- select the tool parser to use - currently either `hermes`, `mistral` or `llama3_json`. Additional tool parsers will continue to be added in the future. * `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages -that contain previously generated tool calls. Hermes and Mistral models have tool-compatible chat templates in their +that contain previously generated tool calls. Hermes, Mistral and Llama models have tool-compatible chat templates in their `tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat template configured in the `tokenizer_config.json`. In this case, it will be used per the `transformers` specification. More on this [here](https://huggingface.co/docs/transformers/en/chat_templating#why-do-some-models-have-multiple-templates) from HuggingFace; and you can find an example of this in a `tokenizer_config.json` [here](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B/blob/main/tokenizer_config.json) @@ -197,3 +197,25 @@ when tools are provided, that results in much better reliability when working wi Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` + +#### Llama Models +Supported models: +* `meta-llama/Meta-Llama-3.1-8B-Instruct` +* `meta-llama/Meta-Llama-3.1-70B-Instruct` +* `meta-llama/Meta-Llama-3.1-405B-Instruct` +* `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8` + +The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). +Other tool calling formats like the built in python tool calling or custom tool calling are not supported. + +Known issues: +1. Parallel tool calls are not supported. +2. The model can generate parameters with a wrong format, such as generating + an array serialized as string instead of an array. + +The `tool_chat_template_llama3_json.jinja` file contains the "official" Llama chat template, but tweaked so that +it works better with vLLM. + +Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` + + diff --git a/examples/tool_chat_template_llama3.1_json.jinja b/examples/tool_chat_template_llama3.1_json.jinja new file mode 100644 index 000000000000..c24a7e51335e --- /dev/null +++ b/examples/tool_chat_template_llama3.1_json.jinja @@ -0,0 +1,94 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {#- Llama 3.1 doesn't pass all tests if the tools are in the system prompt #} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. 
Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping %} + {{- message.content | tojson }} + {%- else %} + {{- { "output": message.content } | tojson }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/examples/tool_chat_template_llama3.2_json.jinja b/examples/tool_chat_template_llama3.2_json.jinja new file mode 100644 index 000000000000..7e24777726a3 --- /dev/null +++ b/examples/tool_chat_template_llama3.2_json.jinja @@ -0,0 +1,93 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = false %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. When you receive a tool call response, use the output to format an answer to the original user question." %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {{- "<|eot_id|>" }} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping %} + {{- message.content | tojson }} + {%- else %} + {{- { "output": message.content } | tojson }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py index 038ff81d2b67..8e7cb9f5d3d9 100644 --- a/tests/tool_use/test_chat_completions.py +++ b/tests/tool_use/test_chat_completions.py @@ -3,18 +3,20 @@ import openai import pytest -from .utils import MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL +from .utils import (MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL, ServerConfig, + ensure_system_prompt) # test: make sure chat completions without tools provided work even when tools # are enabled. This makes sure tool call chat templates work, AND that the tool # parser stream processing doesn't change the output of the model. 
@pytest.mark.asyncio -async def test_chat_completion_without_tools(client: openai.AsyncOpenAI): +async def test_chat_completion_without_tools(client: openai.AsyncOpenAI, + server_config: ServerConfig): models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=MESSAGES_WITHOUT_TOOLS, + messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, max_tokens=150, model=model_name, @@ -34,7 +36,7 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI): # make the same request, streaming stream = await client.chat.completions.create( - messages=MESSAGES_WITHOUT_TOOLS, + messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, max_tokens=150, model=model_name, @@ -77,11 +79,12 @@ async def test_chat_completion_without_tools(client: openai.AsyncOpenAI): # tools, to make sure we can still get normal chat completion responses # and that they won't be parsed as tools @pytest.mark.asyncio -async def test_chat_completion_with_tools(client: openai.AsyncOpenAI): +async def test_chat_completion_with_tools(client: openai.AsyncOpenAI, + server_config: ServerConfig): models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( - messages=MESSAGES_WITHOUT_TOOLS, + messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, max_tokens=150, model=model_name, @@ -102,7 +105,7 @@ async def test_chat_completion_with_tools(client: openai.AsyncOpenAI): # make the same request, streaming stream = await client.chat.completions.create( - messages=MESSAGES_WITHOUT_TOOLS, + messages=ensure_system_prompt(MESSAGES_WITHOUT_TOOLS, server_config), temperature=0, max_tokens=150, model=model_name, diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py index b03b5a2075a6..ed7ac8afe1b4 100644 --- a/tests/tool_use/test_parallel_tool_calls.py +++ b/tests/tool_use/test_parallel_tool_calls.py @@ -6,7 +6,7 @@ from .utils import (MESSAGES_ASKING_FOR_PARALLEL_TOOLS, MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, SEARCH_TOOL, - WEATHER_TOOL) + WEATHER_TOOL, ServerConfig) # test: getting the model to generate parallel tool calls (streaming/not) @@ -14,7 +14,13 @@ # may be added in the future. e.g. llama 3.1 models are not designed to support # parallel tool calls. 
@pytest.mark.asyncio -async def test_parallel_tool_calls(client: openai.AsyncOpenAI): +async def test_parallel_tool_calls(client: openai.AsyncOpenAI, + server_config: ServerConfig): + + if not server_config.get("supports_parallel", True): + pytest.skip("The {} model doesn't support parallel tool calls".format( + server_config["model"])) + models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( @@ -136,7 +142,13 @@ async def test_parallel_tool_calls(client: openai.AsyncOpenAI): # test: providing parallel tool calls back to the model to get a response # (streaming/not) @pytest.mark.asyncio -async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI): +async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI, + server_config: ServerConfig): + + if not server_config.get("supports_parallel", True): + pytest.skip("The {} model doesn't support parallel tool calls".format( + server_config["model"])) + models = await client.models.list() model_name: str = models.data[0].id chat_completion = await client.chat.completions.create( diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index e447469e3341..1a840f8a51c9 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -1,4 +1,5 @@ -from typing import Dict, List +from copy import deepcopy +from typing import Any, Dict, List, Optional from openai.types.chat import (ChatCompletionMessageParam, ChatCompletionToolParam) @@ -7,9 +8,30 @@ from tests.utils import VLLM_PATH -class ServerConfig(TypedDict): +class ServerConfig(TypedDict, total=False): model: str arguments: List[str] + system_prompt: Optional[str] + supports_parallel: Optional[bool] + + +def patch_system_prompt(messages: List[Dict[str, Any]], + system_prompt: str) -> List[Dict[str, Any]]: + new_messages = deepcopy(messages) + if new_messages[0]["role"] == "system": + new_messages[0]["content"] = system_prompt + else: + new_messages.insert(0, {"role": "system", "content": system_prompt}) + return new_messages + + +def ensure_system_prompt(messages: List[Dict[str, Any]], + config: ServerConfig) -> List[Dict[str, Any]]: + prompt = config.get("system_prompt") + if prompt: + return patch_system_prompt(messages, prompt) + else: + return messages # universal args for all models go here. also good if you need to test locally @@ -23,7 +45,33 @@ class ServerConfig(TypedDict): "arguments": [ "--tool-call-parser", "hermes", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja") - ] + ], + "system_prompt": + "You are a helpful assistant with access to tools. If a tool" + " that you have would be helpful to answer a user query, " + "call the tool. Otherwise, answer the user's query directly " + "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " + "to the user's question - just respond to it normally." 
+ }, + "llama": { + "model": + "meta-llama/Meta-Llama-3.1-8B-Instruct", + "arguments": [ + "--tool-call-parser", "llama3_json", "--chat-template", + str(VLLM_PATH / "examples/tool_chat_template_llama3.1_json.jinja") + ], + "supports_parallel": + False, + }, + "llama3.2": { + "model": + "meta-llama/Llama-3.2-3B-Instruct", + "arguments": [ + "--tool-call-parser", "llama3_json", "--chat-template", + str(VLLM_PATH / "examples/tool_chat_template_llama3.2_json.jinja") + ], + "supports_parallel": + False, }, "mistral": { "model": @@ -32,7 +80,13 @@ class ServerConfig(TypedDict): "--tool-call-parser", "mistral", "--chat-template", str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"), "--ignore-patterns=\"consolidated.safetensors\"" - ] + ], + "system_prompt": + "You are a helpful assistant with access to tools. If a tool" + " that you have would be helpful to answer a user query, " + "call the tool. Otherwise, answer the user's query directly " + "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " + "to the user's question - just respond to it normally." } } @@ -97,15 +151,6 @@ class ServerConfig(TypedDict): } MESSAGES_WITHOUT_TOOLS: List[ChatCompletionMessageParam] = [{ - "role": - "system", - "content": - "You are a helpful assistant with access to tools. If a tool" - " that you have would be helpful to answer a user query, " - "call the tool. Otherwise, answer the user's query directly " - "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " - "to the user's question - just respond to it normally." -}, { "role": "user", "content": diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 9d3071a97fbe..446769a277f5 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -193,7 +193,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( "--tool-call-parser", type=str, - choices=["mistral", "hermes"], + choices=["mistral", "hermes", "llama3_json"], default=None, help= "Select the tool call parser depending on the model that you're using." 
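Putting the pieces above together, end-to-end use of the new parser looks roughly like the sketch below. It assumes a server launched with the Llama 3.1 flags recommended in the docs earlier; the `get_current_weather` tool schema, port, and prompt are illustrative only and are not part of this patch.

```python
# Assumed server invocation (mirrors the recommended flags above):
#   vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct \
#       --enable-auto-tool-choice --tool-call-parser llama3_json \
#       --chat-template examples/tool_chat_template_llama3.1_json.jinja
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

# Illustrative tool definition; any OpenAI-style function schema works here.
tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "What's the weather in Vienna?"}],
    tools=tools,
    tool_choice="auto",
)

# With the llama3_json parser, the JSON emitted by the model is surfaced as an
# OpenAI-style tool call instead of plain text content.
tool_call = response.choices[0].message.tool_calls[0].function
print(tool_call.name, tool_call.arguments)
```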
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 8b51fc804ad9..e95ef3f39c8a 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -30,6 +30,7 @@ PromptAdapterPath, TextTokensPrompt) from vllm.entrypoints.openai.tool_parsers import (Hermes2ProToolParser, + Llama3JsonToolParser, MistralToolParser, ToolParser) from vllm.inputs import TokensPrompt @@ -85,6 +86,8 @@ def __init__(self, self.tool_parser = MistralToolParser elif tool_parser == "hermes": self.tool_parser = Hermes2ProToolParser + elif tool_parser == "llama3_json": + self.tool_parser = Llama3JsonToolParser else: raise TypeError("Error: --enable-auto-tool-choice requires " "--tool-call-parser") diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 5d5d53784fed..0069a2b8044b 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,5 +1,9 @@ from .abstract_tool_parser import ToolParser from .hermes_tool_parser import Hermes2ProToolParser +from .llama_tool_parser import Llama3JsonToolParser from .mistral_tool_parser import MistralToolParser -__all__ = ["ToolParser", "Hermes2ProToolParser", "MistralToolParser"] \ No newline at end of file +__all__ = [ + "ToolParser", "Hermes2ProToolParser", "MistralToolParser", + "Llama3JsonToolParser" +] diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py new file mode 100644 index 000000000000..f98dca16674d --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py @@ -0,0 +1,273 @@ +import json +import re +from json import JSONDecodeError, JSONDecoder +from typing import Dict, List, Sequence, Union + +import partial_json_parser +from partial_json_parser.core.options import Allow +from transformers import PreTrainedTokenizerBase + +from vllm.entrypoints.openai.protocol import (DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser) +from vllm.entrypoints.openai.tool_parsers.utils import find_common_prefix +from vllm.logger import init_logger +from vllm.utils import random_uuid + +logger = init_logger(__name__) + + +# partial_json_parser doesn't support extra data and +# JSONDecorder.raw_decode doesn't support partial JSON +def partial_json_loads(input_str, flags): + try: + return (partial_json_parser.loads(input_str, flags), len(input_str)) + except JSONDecodeError as e: + if "Extra data" in e.msg: + dec = JSONDecoder() + return dec.raw_decode(input_str) + else: + raise + + +def is_complete_json(input_str): + try: + json.loads(input_str) + return True + except JSONDecodeError: + return False + + +class Llama3JsonToolParser(ToolParser): + """ + Tool call parser for Llama 3.1 models intended for use with the + examples/tool_chat_template_llama.jinja template. 
+ + Used when --enable-auto-tool-choice --tool-call-parser mistral are all set + """ + + def __init__(self, tokenizer: PreTrainedTokenizerBase): + super().__init__(tokenizer) + + # initialize properties used for state when parsing tool calls in + # streaming mode + self.prev_tool_call_arr: List[Dict] = [] + self.current_tool_id: int = -1 + self.current_tool_name_sent: bool = False + self.streamed_args_for_tool: List[str] = [ + ] # map what has been streamed for each tool so far to a list + self.bot_token = "<|python_tag|>" + self.bot_token_id = tokenizer.encode(self.bot_token, + add_special_tokens=False)[0] + self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL) + + def extract_tool_calls(self, + model_output: str) -> ExtractedToolCallInformation: + """ + Extract the tool calls from a complete model response. + """ + # case -- if a tool call token is not present, return a text response + if not (model_output.startswith(self.bot_token) + or model_output.startswith('{')): + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + try: + # load the JSON, and then use it to build the Function and + # Tool Call + dec = JSONDecoder() + function_call_arr = [] + + # depending on the prompt format the Llama model may or may not + # prefix the output with the <|python_tag|> token + start_idx = len(self.bot_token) if model_output.startswith( + self.bot_token) else 0 + while start_idx < len(model_output): + (obj, end_idx) = dec.raw_decode(model_output[start_idx:]) + start_idx += end_idx + len('; ') + function_call_arr.append(obj) + + tool_calls: List[ToolCall] = [ + ToolCall( + type="function", + function=FunctionCall( + name=raw_function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(raw_function_call["arguments"] \ + if "arguments" in raw_function_call \ + else raw_function_call["parameters"]))) + for raw_function_call in function_call_arr + ] + + # get any content before the tool call + ret = ExtractedToolCallInformation(tools_called=True, + tool_calls=tool_calls, + content=None) + return ret + + except Exception as e: + logger.error("Error in extracting tool call from response: %s", e) + print("ERROR", e) + # return information to just treat the tool call as regular JSON + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + + if not (current_text.startswith(self.bot_token) + or current_text.startswith('{')): + return DeltaMessage(content=delta_text) + + # bit mask flags for partial JSON parsing. If the name hasn't been + # sent yet, don't allow sending + # an incomplete string since OpenAI only ever (as far as I have + # seen) allows sending the entire tool/ function name at once. 
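        # (Illustrative aside, not part of the original patch: under Allow.ALL,
        # partial_json_parser.loads('{"name": "get_curre', Allow.ALL) completes
        # the truncated string and yields {"name": "get_curre"}, whereas with
        # Allow.ALL & ~Allow.STR the incomplete string cannot be completed, so
        # a half-generated function name is never streamed to the client.)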
+ flags = Allow.ALL if self.current_tool_name_sent \ + else Allow.ALL & ~Allow.STR + try: + tool_call_arr = [] + is_complete = [] + try: + # depending on the prompt format the Llama model may or may not + # prefix the output with the <|python_tag|> token + start_idx = len(self.bot_token) if current_text.startswith( + self.bot_token) else 0 + while start_idx < len(current_text): + (obj, + end_idx) = partial_json_loads(current_text[start_idx:], + flags) + is_complete.append( + is_complete_json(current_text[start_idx:start_idx + + end_idx])) + start_idx += end_idx + len('; ') + # depending on the prompt Llama can use + # either arguments or parameters + if "parameters" in obj: + assert "arguments" not in obj, \ + "model generated both parameters and arguments" + obj["arguments"] = obj["parameters"] + tool_call_arr.append(obj) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug('not enough tokens to parse into JSON yet') + return None + + # select as the current tool call the one we're on the state at + current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + if len(tool_call_arr) > 0 else {} + + # case -- if no tokens have been streamed for the tool, e.g. + # only the array brackets, stream nothing + if len(tool_call_arr) == 0: + return None + + # case: we are starting a new tool in the array + # -> array has > 0 length AND length has moved past cursor + elif (len(tool_call_arr) > 0 + and len(tool_call_arr) > self.current_tool_id + 1): + + # if we're moving on to a new call, first make sure we + # haven't missed anything in the previous one that was + # auto-generated due to JSON completions, but wasn't + # streamed to the client yet. + if self.current_tool_id >= 0: + cur_arguments = current_tool_call.get("arguments") + if cur_arguments: + cur_args_json = json.dumps(cur_arguments) + sent = len( + self.streamed_args_for_tool[self.current_tool_id]) + argument_diff = cur_args_json[sent:] + + logger.debug("got arguments diff: %s", argument_diff) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff). 
+ model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + else: + delta = None + else: + delta = None + # re-set stuff pertaining to progress in the current tool + self.current_tool_id = len(tool_call_arr) - 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + logger.debug("starting on new tool %d", self.current_tool_id) + return delta + + # if the current tool name hasn't been sent, send if available + # - otherwise send nothing + elif not self.current_tool_name_sent: + function_name = current_tool_call.get("name") + if function_name: + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + type="function", + id=f"chatcmpl-tool-{random_uuid()}", + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True)) + ]) + self.current_tool_name_sent = True + else: + delta = None + + # now we know we're on the same tool call and we're streaming + # arguments + else: + cur_arguments = current_tool_call.get("arguments") + delta = None + + if cur_arguments: + sent = len( + self.streamed_args_for_tool[self.current_tool_id]) + cur_args_json = json.dumps(cur_arguments) + prev_arguments = self.prev_tool_call_arr[ + self.current_tool_id].get("arguments") + + argument_diff = None + if is_complete[self.current_tool_id]: + argument_diff = cur_args_json[sent:] + elif prev_arguments: + prev_args_json = json.dumps(prev_arguments) + if cur_args_json != prev_args_json: + + prefix = find_common_prefix( + prev_args_json, cur_args_json) + argument_diff = prefix[sent:] + + if argument_diff is not None: + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff). + model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + + self.prev_tool_call_arr = tool_call_arr + return delta + + except Exception as e: + logger.error("Error trying to handle streaming tool call: %s", e) + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None From 3b00b9c26c91e9f9ada12975b613555698054e39 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 27 Sep 2024 11:35:15 +0800 Subject: [PATCH 015/199] [Core] rename`PromptInputs` and `inputs` (#8876) --- benchmarks/benchmark_latency.py | 8 +- .../dev/multimodal/multimodal_index.rst | 2 +- .../dev/offline_inference/llm_inputs.rst | 2 +- docs/source/models/vlm.rst | 2 +- tests/async_engine/test_async_llm_engine.py | 8 +- tests/entrypoints/llm/test_encode.py | 34 ------ tests/entrypoints/llm/test_generate.py | 37 ------ tests/mq_llm_engine/test_error_handling.py | 12 +- tests/mq_llm_engine/utils.py | 2 +- vllm/__init__.py | 4 +- vllm/engine/async_llm_engine.py | 110 +++++++++++++++--- vllm/engine/llm_engine.py | 52 +++++++-- vllm/engine/multiprocessing/__init__.py | 61 +++++++++- vllm/engine/multiprocessing/client.py | 95 ++++++++++++--- vllm/engine/multiprocessing/engine.py | 2 +- vllm/engine/protocol.py | 8 +- vllm/entrypoints/llm.py | 68 +++++------ vllm/inputs/__init__.py | 20 +++- vllm/inputs/data.py | 53 +++++---- vllm/inputs/parse.py | 22 ++-- vllm/inputs/preprocess.py | 86 +++++++------- 21 files changed, 440 insertions(+), 248 deletions(-) diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index a39d1cf842f0..eadf994cacd3 100644 --- a/benchmarks/benchmark_latency.py +++ b/benchmarks/benchmark_latency.py @@ -11,7 +11,7 @@ from vllm import LLM, SamplingParams 
from vllm.engine.arg_utils import DEVICE_OPTIONS, EngineArgs -from vllm.inputs import PromptInputs +from vllm.inputs import PromptType from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.utils import FlexibleArgumentParser @@ -61,7 +61,7 @@ def main(args: argparse.Namespace): dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_inputs: List[PromptInputs] = [{ + dummy_prompts: List[PromptType] = [{ "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] @@ -74,13 +74,13 @@ def run_to_completion(profile_dir: Optional[str] = None): ], on_trace_ready=torch.profiler.tensorboard_trace_handler( str(profile_dir))) as p: - llm.generate(dummy_inputs, + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) print(p.key_averages()) else: start_time = time.perf_counter() - llm.generate(dummy_inputs, + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) end_time = time.perf_counter() diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst index 241b2ccd0991..e112b43aade5 100644 --- a/docs/source/dev/multimodal/multimodal_index.rst +++ b/docs/source/dev/multimodal/multimodal_index.rst @@ -8,7 +8,7 @@ Multi-Modality vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package. Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models ` -via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptInputs`. +via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`. Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities by following :ref:`this guide `. diff --git a/docs/source/dev/offline_inference/llm_inputs.rst b/docs/source/dev/offline_inference/llm_inputs.rst index 9adf82d43f3e..0d47281db485 100644 --- a/docs/source/dev/offline_inference/llm_inputs.rst +++ b/docs/source/dev/offline_inference/llm_inputs.rst @@ -1,7 +1,7 @@ LLM Inputs ========== -.. autodata:: vllm.inputs.PromptInputs +.. autodata:: vllm.inputs.PromptType .. autoclass:: vllm.inputs.TextPrompt :show-inheritance: diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 08db89166504..ca5b125369c8 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -27,7 +27,7 @@ The :class:`~vllm.LLM` class can be instantiated in much the same way as languag We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow the above snippet. Specifically, ``image_feature_size`` can no longer be specified as we now calculate that internally for each model. -To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`: +To pass an image to the model, note the following in :class:`vllm.inputs.PromptType`: * ``prompt``: The prompt should follow the format that is documented on HuggingFace. * ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. 
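The renamed `PromptType` is also what carries multi-modal inputs at the API surface. Below is a minimal sketch of the pattern the doc hunk above describes, assuming a LLaVA-style model; the model name, image path, and prompt format are illustrative assumptions rather than part of this change.

```python
from PIL import Image

from vllm import LLM, SamplingParams

# Any vision-language model supported by vLLM would do; LLaVA is just an example.
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image = Image.open("example.jpg")

# A single prompt of type vllm.inputs.PromptType: the text prompt plus the
# multi-modal data, keyed by modality.
outputs = llm.generate(
    {
        "prompt": "USER: <image>\nWhat is shown in this image?\nASSISTANT:",
        "multi_modal_data": {"image": image},
    },
    SamplingParams(temperature=0.0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
```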
diff --git a/tests/async_engine/test_async_llm_engine.py b/tests/async_engine/test_async_llm_engine.py index 6cae76f74603..1903a7582dc8 100644 --- a/tests/async_engine/test_async_llm_engine.py +++ b/tests/async_engine/test_async_llm_engine.py @@ -86,17 +86,19 @@ class MockAsyncLLMEngine(AsyncLLMEngine): @pytest.mark.asyncio async def test_new_requests_event(): + params = SamplingParams() + engine = MockAsyncLLMEngine() engine.start_background_loop() await asyncio.sleep(0.01) assert engine.engine.step_calls == 0 - await engine.add_request("1", "", None) + await engine.add_request("1", "", params) await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 1 assert engine.engine.step_calls == 1 - await engine.add_request("2", "", None) + await engine.add_request("2", "", params) engine.engine.generate("2") await asyncio.sleep(0) await asyncio.sleep(0) @@ -111,7 +113,7 @@ async def test_new_requests_event(): await asyncio.sleep(0.001) assert engine.engine.step_calls == old_step_calls - await engine.add_request("3", "", None) + await engine.add_request("3", "", params) await asyncio.sleep(0.01) assert engine.engine.add_request_calls == 3 assert engine.engine.step_calls == old_step_calls + 1 diff --git a/tests/entrypoints/llm/test_encode.py b/tests/entrypoints/llm/test_encode.py index d1056a049050..1885f2e168d8 100644 --- a/tests/entrypoints/llm/test_encode.py +++ b/tests/entrypoints/llm/test_encode.py @@ -49,21 +49,6 @@ def assert_outputs_equal(o1: List[EmbeddingRequestOutput], assert [o.outputs for o in o1] == [o.outputs for o in o2] -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize('prompt', PROMPTS) -def test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt): - pooling_params = PoolingParams() - - with pytest.warns(DeprecationWarning, match="'prompts'"): - v1_output = llm.encode(prompts=prompt, pooling_params=pooling_params) - - v2_output = llm.encode(prompt, pooling_params=pooling_params) - assert_outputs_equal(v1_output, v2_output) - - v2_output = llm.encode({"prompt": prompt}, pooling_params=pooling_params) - assert_outputs_equal(v1_output, v2_output) - - @pytest.mark.skip_global_cleanup @pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, @@ -79,25 +64,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, assert_outputs_equal(v1_output, v2_output) -@pytest.mark.skip_global_cleanup -def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM): - pooling_params = PoolingParams() - - with pytest.warns(DeprecationWarning, match="'prompts'"): - v1_output = llm.encode(prompts=PROMPTS, pooling_params=pooling_params) - - v2_output = llm.encode(PROMPTS, pooling_params=pooling_params) - assert_outputs_equal(v1_output, v2_output) - - v2_output = llm.encode( - [{ - "prompt": p - } for p in PROMPTS], - pooling_params=pooling_params, - ) - assert_outputs_equal(v1_output, v2_output) - - @pytest.mark.skip_global_cleanup def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): pooling_params = PoolingParams() diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index cd989225e248..6543c4bb1b58 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -47,23 +47,6 @@ def assert_outputs_equal(o1: List[RequestOutput], o2: List[RequestOutput]): assert [o.outputs for o in o1] == [o.outputs for o in o2] -@pytest.mark.skip_global_cleanup -@pytest.mark.parametrize('prompt', PROMPTS) -def 
test_v1_v2_api_consistency_single_prompt_string(llm: LLM, prompt): - sampling_params = SamplingParams(temperature=0.0, top_p=1.0) - - with pytest.warns(DeprecationWarning, match="'prompts'"): - v1_output = llm.generate(prompts=prompt, - sampling_params=sampling_params) - - v2_output = llm.generate(prompt, sampling_params=sampling_params) - assert_outputs_equal(v1_output, v2_output) - - v2_output = llm.generate({"prompt": prompt}, - sampling_params=sampling_params) - assert_outputs_equal(v1_output, v2_output) - - @pytest.mark.skip_global_cleanup @pytest.mark.parametrize('prompt_token_ids', TOKEN_IDS) def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, @@ -79,26 +62,6 @@ def test_v1_v2_api_consistency_single_prompt_tokens(llm: LLM, assert_outputs_equal(v1_output, v2_output) -@pytest.mark.skip_global_cleanup -def test_v1_v2_api_consistency_multi_prompt_string(llm: LLM): - sampling_params = SamplingParams(temperature=0.0, top_p=1.0) - - with pytest.warns(DeprecationWarning, match="'prompts'"): - v1_output = llm.generate(prompts=PROMPTS, - sampling_params=sampling_params) - - v2_output = llm.generate(PROMPTS, sampling_params=sampling_params) - assert_outputs_equal(v1_output, v2_output) - - v2_output = llm.generate( - [{ - "prompt": p - } for p in PROMPTS], - sampling_params=sampling_params, - ) - assert_outputs_equal(v1_output, v2_output) - - @pytest.mark.skip_global_cleanup def test_v1_v2_api_consistency_multi_prompt_tokens(llm: LLM): sampling_params = SamplingParams(temperature=0.0, top_p=1.0) diff --git a/tests/mq_llm_engine/test_error_handling.py b/tests/mq_llm_engine/test_error_handling.py index 76b2f494d5b2..616a15a1328d 100644 --- a/tests/mq_llm_engine/test_error_handling.py +++ b/tests/mq_llm_engine/test_error_handling.py @@ -61,7 +61,7 @@ async def test_evil_forward(tmp_socket): # Throws an error in first forward pass. with pytest.raises(RAISED_ERROR): - async for _ in client.generate(inputs="Hello my name is", + async for _ in client.generate(prompt="Hello my name is", sampling_params=SamplingParams(), request_id=uuid.uuid4()): pass @@ -69,7 +69,7 @@ async def test_evil_forward(tmp_socket): # Engine is errored, should get ENGINE_DEAD_ERROR. with pytest.raises(MQEngineDeadError): - async for _ in client.generate(inputs="Hello my name is", + async for _ in client.generate(prompt="Hello my name is", sampling_params=SamplingParams(), request_id=uuid.uuid4()): pass @@ -118,7 +118,7 @@ async def test_failed_health_check(tmp_socket): # Generate call should throw ENGINE_DEAD_ERROR with pytest.raises(MQEngineDeadError): - async for _ in client.generate(inputs="Hello my name is", + async for _ in client.generate(prompt="Hello my name is", sampling_params=SamplingParams(), request_id=uuid.uuid4()): pass @@ -160,7 +160,7 @@ async def test_failed_abort(tmp_socket): # with reference to the original KeyError("foo") with pytest.raises(MQEngineDeadError) as execinfo: async for _ in client.generate( - inputs="Hello my name is", + prompt="Hello my name is", sampling_params=SamplingParams(max_tokens=10), request_id=uuid.uuid4()): pass @@ -183,7 +183,7 @@ async def test_bad_request(tmp_socket): # Invalid request should fail, but not crash the server. with pytest.raises(ValueError): - async for _ in client.generate(inputs="Hello my name is", + async for _ in client.generate(prompt="Hello my name is", sampling_params=SamplingParams(), request_id="abcd-1", lora_request=LoRARequest( @@ -192,7 +192,7 @@ async def test_bad_request(tmp_socket): pass # This request should be okay. 
- async for _ in client.generate(inputs="Hello my name is", + async for _ in client.generate(prompt="Hello my name is", sampling_params=SamplingParams(), request_id="abcd-2"): pass diff --git a/tests/mq_llm_engine/utils.py b/tests/mq_llm_engine/utils.py index e27fd7792341..3ffa126070ca 100644 --- a/tests/mq_llm_engine/utils.py +++ b/tests/mq_llm_engine/utils.py @@ -20,7 +20,7 @@ async def generate( count = 0 async for out in client.generate( request_id=request_id, - inputs="Hello my name is Robert and", + prompt="Hello my name is Robert and", sampling_params=SamplingParams(max_tokens=num_tokens, temperature=0)): diff --git a/vllm/__init__.py b/vllm/__init__.py index 90363b3e49b7..8f477ea84756 100644 --- a/vllm/__init__.py +++ b/vllm/__init__.py @@ -5,7 +5,7 @@ from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.llm import LLM from vllm.executor.ray_utils import initialize_ray_cluster -from vllm.inputs import PromptInputs, TextPrompt, TokensPrompt +from vllm.inputs import PromptType, TextPrompt, TokensPrompt from vllm.model_executor.models import ModelRegistry from vllm.outputs import (CompletionOutput, EmbeddingOutput, EmbeddingRequestOutput, RequestOutput) @@ -19,7 +19,7 @@ "__version_tuple__", "LLM", "ModelRegistry", - "PromptInputs", + "PromptType", "TextPrompt", "TokensPrompt", "SamplingParams", diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 34e7e05341f0..54c5af2fe366 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -2,8 +2,8 @@ import time import weakref from functools import partial -from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List, - Mapping, Optional, Set, Tuple, Type, Union) +from typing import (Any, AsyncGenerator, Callable, Coroutine, Dict, Iterable, + List, Mapping, Optional, Set, Tuple, Type, Union, overload) from weakref import ReferenceType import vllm.envs as envs @@ -17,7 +17,7 @@ from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutorAsync from vllm.executor.ray_utils import initialize_ray_cluster -from vllm.inputs import PromptInputs +from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput @@ -28,7 +28,7 @@ from vllm.sequence import ExecuteModelRequest from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.usage.usage_lib import UsageContext -from vllm.utils import weak_bind +from vllm.utils import deprecate_kwargs, weak_bind logger = init_logger(__name__) ENGINE_ITERATION_TIMEOUT_S = envs.VLLM_ENGINE_ITERATION_TIMEOUT_S @@ -402,17 +402,54 @@ async def stop_remote_worker_execution_loop_async(self) -> None: """Stop the remote worker execution loop.""" await self.model_executor.stop_remote_worker_execution_loop_async() + @overload # DEPRECATED async def add_request_async( self, request_id: str, - inputs: PromptInputs, + *, + inputs: PromptType, params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> None: + ... 
+ + @overload + async def add_request_async( + self, + request_id: str, + prompt: PromptType, + params: Union[SamplingParams, PoolingParams], + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> None: + ... + + @deprecate_kwargs( + "inputs", + additional_message="Please use the 'prompt' parameter instead.", + ) + async def add_request_async( + self, + request_id: str, + prompt: Optional[PromptType] = None, + params: Optional[Union[SamplingParams, PoolingParams]] = None, + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + *, + inputs: Optional[PromptType] = None, # DEPRECATED ) -> None: """Async version of :meth:`add_request`.""" + if inputs is not None: + prompt = inputs + assert prompt is not None and params is not None + if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") @@ -420,7 +457,7 @@ async def add_request_async( arrival_time = time.time() preprocessed_inputs = await self.input_preprocessor.preprocess_async( - inputs, + prompt, request_id=request_id, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, @@ -774,16 +811,55 @@ async def run_engine_loop(engine_ref: ReferenceType): # This method does not need to be async, but kept that way # for backwards compatibility. - async def add_request( + @overload # DEPRECATED + def add_request( self, request_id: str, - inputs: PromptInputs, + *, + inputs: PromptType, params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> Coroutine[None, None, AsyncGenerator[Union[ + RequestOutput, EmbeddingRequestOutput], None]]: + ... + + @overload + def add_request( + self, + request_id: str, + prompt: PromptType, + params: Union[SamplingParams, PoolingParams], + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> Coroutine[None, None, AsyncGenerator[Union[ + RequestOutput, EmbeddingRequestOutput], None]]: + ... 
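The `@overload` plus `deprecate_kwargs` combination used above is the migration pattern applied throughout this commit: the typed overloads document both the legacy keyword-only spelling and the new positional one, while the decorator warns whenever the deprecated keyword is passed. The sketch below is a simplified, self-contained approximation for illustration, not the actual `vllm.utils.deprecate_kwargs` implementation (the real helper also supports an `is_deprecated` gate, as seen later in this patch).

```python
import warnings
from functools import wraps
from typing import Callable, Optional


def deprecate_kwargs(*names: str, additional_message: str = ""):
    """Rough stand-in for vllm.utils.deprecate_kwargs (illustrative only)."""

    def wrapper(fn: Callable) -> Callable:

        @wraps(fn)
        def inner(*args, **kwargs):
            for name in names:
                if kwargs.get(name) is not None:
                    warnings.warn(
                        f"The keyword argument {name!r} is deprecated. "
                        f"{additional_message}",
                        DeprecationWarning,
                        stacklevel=2)
            return fn(*args, **kwargs)

        return inner

    return wrapper


@deprecate_kwargs(
    "inputs",
    additional_message="Please use the 'prompt' parameter instead.")
def add_request(request_id: str,
                prompt: Optional[str] = None,
                *,
                inputs: Optional[str] = None) -> str:
    if inputs is not None:  # keep honoring the legacy spelling
        prompt = inputs
    assert prompt is not None
    return f"{request_id}: {prompt}"


add_request("1", "Hello my name is")         # new spelling, no warning
add_request("2", inputs="Hello my name is")  # old spelling, DeprecationWarning
```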
+ + @deprecate_kwargs( + "inputs", + additional_message="Please use the 'prompt' parameter instead.", + ) + async def add_request( + self, + request_id: str, + prompt: Optional[PromptType] = None, + params: Optional[Union[SamplingParams, PoolingParams]] = None, + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + *, + inputs: Optional[PromptType] = None, # DEPRECATED ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + if inputs is not None: + prompt = inputs + assert prompt is not None and params is not None + if not self.is_running: if self.start_engine_loop: self.start_background_loop() @@ -797,7 +873,7 @@ async def add_request( stream = self._request_tracker.add_request( request_id, verbose=self.log_requests, - inputs=inputs, + prompt=prompt, params=params, arrival_time=arrival_time or time.time(), lora_request=lora_request, @@ -808,7 +884,7 @@ async def add_request( async def generate( self, - inputs: PromptInputs, + prompt: PromptType, sampling_params: SamplingParams, request_id: str, lora_request: Optional[LoRARequest] = None, @@ -822,8 +898,7 @@ async def generate( from the LLMEngine to the caller. Args: - inputs: The inputs to the LLM. See - :class:`~vllm.inputs.PromptInputs` + prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType` for more details about the format of each input. sampling_params: The sampling parameters of the request. request_id: The unique id of the request. @@ -881,7 +956,7 @@ async def generate( """ async for output in await self.add_request( request_id, - inputs, + prompt, sampling_params, lora_request=lora_request, trace_headers=trace_headers, @@ -891,7 +966,7 @@ async def generate( async def encode( self, - inputs: PromptInputs, + prompt: PromptType, pooling_params: PoolingParams, request_id: str, lora_request: Optional[LoRARequest] = None, @@ -904,8 +979,7 @@ async def encode( from the LLMEngine to the caller. Args: - inputs: The inputs to the LLM. See - :class:`~vllm.inputs.PromptInputs` + prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType` for more details about the format of each input. pooling_params: The pooling parameters of the request. request_id: The unique id of the request. 
@@ -959,7 +1033,7 @@ async def encode( """ async for output in await self.add_request( request_id, - inputs, + prompt, pooling_params, lora_request=lora_request, trace_headers=trace_headers, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 768ac69c3692..487255cb6b59 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -6,7 +6,7 @@ from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Deque, Dict, Iterable, List, Mapping, NamedTuple, Optional) from typing import Sequence as GenericSequence -from typing import Set, Type, Union +from typing import Set, Type, Union, overload import torch from typing_extensions import TypeVar @@ -29,7 +29,7 @@ from vllm.executor.gpu_executor import GPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs, - InputRegistry, LLMInputs, PromptInputs) + InputRegistry, LLMInputs, PromptType) from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -51,7 +51,7 @@ BaseTokenizerGroup, init_tokenizer_from_configs) from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message) -from vllm.utils import Counter, Device, weak_bind +from vllm.utils import Counter, Device, deprecate_kwargs, weak_bind from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -689,16 +689,51 @@ def _add_processed_request( def stop_remote_worker_execution_loop(self) -> None: self.model_executor.stop_remote_worker_execution_loop() + @overload # DEPRECATED def add_request( self, request_id: str, - inputs: PromptInputs, + *, + inputs: PromptType, params: Union[SamplingParams, PoolingParams], arrival_time: Optional[float] = None, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, + ) -> None: + ... + + @overload + def add_request( + self, + request_id: str, + prompt: PromptType, + params: Union[SamplingParams, PoolingParams], + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + ) -> None: + ... + + @deprecate_kwargs( + "inputs", + additional_message="Please use the 'prompt' parameter instead.", + ) + def add_request( + self, + request_id: str, + prompt: Optional[PromptType] = None, + params: Optional[Union[SamplingParams, PoolingParams]] = None, + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + *, + inputs: Optional[PromptType] = None, # DEPRECATED ) -> None: """Add a request to the engine's request pool. @@ -708,8 +743,7 @@ def add_request( Args: request_id: The unique ID of the request. - inputs: The inputs to the LLM. See - :class:`~vllm.inputs.PromptInputs` + prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType` for more details about the format of each input. params: Parameters for sampling or pooling. :class:`~vllm.SamplingParams` for text generation. @@ -744,6 +778,10 @@ def add_request( >>> # continue the request processing >>> ... 
""" + if inputs is not None: + prompt = inputs + assert prompt is not None and params is not None + if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") @@ -756,7 +794,7 @@ def add_request( arrival_time = time.time() preprocessed_inputs = self.input_preprocessor.preprocess( - inputs, + prompt, request_id=request_id, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, diff --git a/vllm/engine/multiprocessing/__init__.py b/vllm/engine/multiprocessing/__init__.py index 1603189979a2..6d6d7895b210 100644 --- a/vllm/engine/multiprocessing/__init__.py +++ b/vllm/engine/multiprocessing/__init__.py @@ -1,13 +1,14 @@ from dataclasses import dataclass from enum import Enum -from typing import List, Mapping, Optional, Union +from typing import List, Mapping, Optional, Union, overload from vllm import PoolingParams -from vllm.inputs import PromptInputs +from vllm.inputs import PromptType from vllm.lora.request import LoRARequest from vllm.outputs import RequestOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams +from vllm.utils import deprecate_kwargs VLLM_RPC_SUCCESS_STR = "SUCCESS" @@ -23,13 +24,67 @@ class MQEngineDeadError(RuntimeError): @dataclass class RPCProcessRequest: - inputs: PromptInputs + prompt: PromptType params: Union[SamplingParams, PoolingParams] request_id: str lora_request: Optional[LoRARequest] = None trace_headers: Optional[Mapping[str, str]] = None prompt_adapter_request: Optional[PromptAdapterRequest] = None + @overload # DEPRECATED + def __init__( + self, + *, + inputs: PromptType, + params: Union[SamplingParams, PoolingParams], + request_id: str, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> None: + ... + + @overload + def __init__( + self, + prompt: PromptType, + params: Union[SamplingParams, PoolingParams], + request_id: str, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> None: + ... 
+ + @deprecate_kwargs( + "inputs", + additional_message="Please use the 'prompt' parameter instead.", + ) + def __init__( + self, + prompt: Optional[PromptType] = None, + params: Optional[Union[SamplingParams, PoolingParams]] = None, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + *, + inputs: Optional[PromptType] = None, # DEPRECATED + ) -> None: + if inputs is not None: + prompt = inputs + assert (prompt is not None and params is not None + and request_id is not None) + + super().__init__() + + self.prompt = prompt + self.params = params + self.request_id = request_id + self.lora_request = lora_request + self.trace_headers = trace_headers + self.prompt_adapter_request = prompt_adapter_request + @dataclass class RPCError: diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 0ee56f7bf840..700e65000e05 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -3,7 +3,7 @@ import pickle from contextlib import contextmanager, suppress from typing import (Any, AsyncGenerator, Dict, Iterator, Mapping, Optional, - Union) + Union, overload) import cloudpickle import zmq @@ -25,13 +25,14 @@ RPCUProfileRequest) # yapf: enable from vllm.envs import VLLM_RPC_TIMEOUT -from vllm.inputs import PromptInputs +from vllm.inputs import PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import EmbeddingRequestOutput, RequestOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +from vllm.utils import deprecate_kwargs logger = init_logger(__name__) @@ -367,14 +368,45 @@ def errored(self) -> bool: def dead_error(self) -> BaseException: return ENGINE_DEAD_ERROR(self._errored_with) + @overload # DEPRECATED def generate( self, - inputs: PromptInputs, + *, + inputs: PromptType, sampling_params: SamplingParams, request_id: str, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> AsyncGenerator[RequestOutput, None]: + ... + + @overload + def generate( + self, + prompt: PromptType, + sampling_params: SamplingParams, + request_id: str, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + ) -> AsyncGenerator[RequestOutput, None]: + ... + + @deprecate_kwargs( + "inputs", + additional_message="Please use the 'prompt' parameter instead.", + ) + def generate( + self, + prompt: Optional[PromptType] = None, + sampling_params: Optional[SamplingParams] = None, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + *, + inputs: Optional[PromptType] = None # DEPRECATED ) -> AsyncGenerator[RequestOutput, None]: """Generate outputs for a request. @@ -383,8 +415,7 @@ def generate( from the LLMEngine to the caller. Args: - inputs: The inputs to the LLM. See - :class:`~vllm.inputs.PromptInputs` + prompt: The prompt to the LLM. 
See :class:`~vllm.inputs.PromptType` for more details about the format of each input. sampling_params: The sampling parameters of the request. request_id: The unique id of the request. @@ -393,17 +424,51 @@ def generate( prompt_adapter_request: Prompt Adapter request to use for generation, if any. """ - return self._process_request(inputs, sampling_params, request_id, + if inputs is not None: + prompt = inputs + assert (prompt is not None and sampling_params is not None + and request_id is not None) + + return self._process_request(prompt, sampling_params, request_id, lora_request, trace_headers, prompt_adapter_request) + @overload # DEPRECATED def encode( self, - inputs: PromptInputs, + *, + inputs: PromptType, pooling_params: PoolingParams, request_id: str, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, + ) -> AsyncGenerator[EmbeddingRequestOutput, None]: + ... + + @overload + def encode( + self, + prompt: PromptType, + pooling_params: PoolingParams, + request_id: str, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + ) -> AsyncGenerator[EmbeddingRequestOutput, None]: + ... + + @deprecate_kwargs( + "inputs", + additional_message="Please use the 'prompt' parameter instead.", + ) + def encode( + self, + prompt: Optional[PromptType] = None, + pooling_params: Optional[PoolingParams] = None, + request_id: Optional[str] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + *, + inputs: Optional[PromptType] = None # DEPRECATED ) -> AsyncGenerator[EmbeddingRequestOutput, None]: """Generate outputs for a request from an embedding model. @@ -412,8 +477,7 @@ def encode( from the LLMEngine to the caller. Args: - inputs: The inputs to the LLM. See - :class:`~vllm.inputs.PromptInputs` + prompt: The prompt to the LLM. See :class:`~vllm.inputs.PromptType` for more details about the format of each input. pooling_params: The pooling parameters of the request. request_id: The unique id of the request. @@ -424,12 +488,17 @@ def encode( The output `EmbeddingRequestOutput` objects from the LLMEngine for the request. 
""" - return self._process_request(inputs, pooling_params, request_id, + if inputs is not None: + prompt = inputs + assert (prompt is not None and pooling_params is not None + and request_id is not None) + + return self._process_request(prompt, pooling_params, request_id, lora_request, trace_headers) async def _process_request( self, - inputs: PromptInputs, + prompt: PromptType, params: Union[SamplingParams, PoolingParams], request_id: str, lora_request: Optional[LoRARequest] = None, @@ -462,7 +531,7 @@ async def _process_request( request_bytes = pickle.dumps( RPCProcessRequest( - inputs=inputs, + prompt=prompt, params=params, request_id=request_id, lora_request=lora_request, diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 1b2e7ccf8664..eecca82cd2f7 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -278,7 +278,7 @@ def _handle_process_request(self, request: RPCProcessRequest): try: self.engine.add_request( request_id=request_id, - inputs=request.inputs, + prompt=request.prompt, params=request.params, lora_request=request.lora_request, trace_headers=request.trace_headers, diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 70444faa670a..d0bbeb357b50 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -3,7 +3,7 @@ from vllm.config import DecodingConfig, ModelConfig from vllm.core.scheduler import SchedulerOutputs -from vllm.inputs.data import PromptInputs +from vllm.inputs.data import PromptType from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput from vllm.outputs import EmbeddingRequestOutput, RequestOutput @@ -35,19 +35,19 @@ def dead_error(self) -> BaseException: def generate( self, - inputs: PromptInputs, + prompt: PromptType, sampling_params: SamplingParams, request_id: str, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None ) -> AsyncGenerator[RequestOutput, None]: - """Generates outputs for a request""" + """Generate outputs for a request.""" ... def encode( self, - inputs: PromptInputs, + prompt: PromptType, pooling_params: PoolingParams, request_id: str, lora_request: Optional[LoRARequest] = None, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 77ae7b088398..f4943cb38da4 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -12,7 +12,7 @@ apply_hf_chat_template, apply_mistral_chat_template, parse_chat_messages) -from vllm.inputs import PromptInputs, TextPrompt, TokensPrompt +from vllm.inputs import PromptType, TextPrompt, TokensPrompt from vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -293,8 +293,8 @@ def generate( @overload def generate( self, - inputs: Union[PromptInputs, Sequence[PromptInputs]], - /, # We may enable `inputs` keyword after removing the old API + prompts: Union[PromptType, Sequence[PromptType]], + /, *, sampling_params: Optional[Union[SamplingParams, Sequence[SamplingParams]]] = None, @@ -304,14 +304,13 @@ def generate( ... 
@deprecate_kwargs( - "prompts", "prompt_token_ids", is_deprecated=lambda: LLM.DEPRECATE_LEGACY, - additional_message="Please use the 'inputs' parameter instead.", + additional_message="Please use the 'prompts' parameter instead.", ) def generate( self, - prompts: Union[Union[PromptInputs, Sequence[PromptInputs]], + prompts: Union[Union[PromptType, Sequence[PromptType]], Optional[Union[str, List[str]]]] = None, sampling_params: Optional[Union[SamplingParams, Sequence[SamplingParams]]] = None, @@ -330,7 +329,9 @@ def generate( into a single list and pass it to this method. Args: - inputs: A list of inputs to generate completions for. + prompts: The prompts to the LLM. You may pass a sequence of prompts + for batch inference. See :class:`~vllm.inputs.PromptType` + for more details about the format of each prompts. sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. When it is a single value, it is applied to every prompt. @@ -358,12 +359,13 @@ def generate( "models (XForCausalLM, XForConditionalGeneration).") if prompt_token_ids is not None: - inputs = self._convert_v1_inputs( + parsed_prompts = self._convert_v1_inputs( prompts=cast(Optional[Union[str, List[str]]], prompts), prompt_token_ids=prompt_token_ids, ) else: - inputs = cast(Union[PromptInputs, Sequence[PromptInputs]], prompts) + parsed_prompts = cast(Union[PromptType, Sequence[PromptType]], + prompts) if isinstance(guided_options_request, dict): if len(guided_options_request) > 1: @@ -378,7 +380,7 @@ def generate( sampling_params = SamplingParams() self._validate_and_add_requests( - inputs=inputs, + prompts=parsed_prompts, params=sampling_params, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, @@ -648,8 +650,8 @@ def encode( @overload def encode( self, - inputs: Union[PromptInputs, Sequence[PromptInputs]], - /, # We may enable `inputs` keyword after removing the old API + prompts: Union[PromptType, Sequence[PromptType]], + /, *, pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, @@ -659,14 +661,13 @@ def encode( ... @deprecate_kwargs( - "prompts", "prompt_token_ids", is_deprecated=lambda: LLM.DEPRECATE_LEGACY, - additional_message="Please use the 'inputs' parameter instead.", + additional_message="Please use the 'prompts' parameter instead.", ) def encode( self, - prompts: Union[Union[PromptInputs, Sequence[PromptInputs]], + prompts: Union[Union[PromptType, Sequence[PromptType]], Optional[Union[str, List[str]]]] = None, pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None, @@ -682,9 +683,9 @@ def encode( into a single list and pass it to this method. Args: - inputs: The inputs to the LLM. You may pass a sequence of inputs for - batch inference. See :class:`~vllm.inputs.PromptInputs` - for more details about the format of each input. + prompts: The prompts to the LLM. You may pass a sequence of prompts + for batch inference. See :class:`~vllm.inputs.PromptType` + for more details about the format of each prompts. pooling_params: The pooling parameters for pooling. If None, we use the default pooling parameters. use_tqdm: Whether to use tqdm to display the progress bar. 
@@ -707,19 +708,20 @@ def encode( ) if prompt_token_ids is not None: - inputs = self._convert_v1_inputs( + parsed_prompts = self._convert_v1_inputs( prompts=cast(Optional[Union[str, List[str]]], prompts), prompt_token_ids=prompt_token_ids, ) else: - inputs = cast(Union[PromptInputs, Sequence[PromptInputs]], prompts) + parsed_prompts = cast(Union[PromptType, Sequence[PromptType]], + prompts) if pooling_params is None: # Use default pooling params. pooling_params = PoolingParams() self._validate_and_add_requests( - inputs=inputs, + prompts=parsed_prompts, params=pooling_params, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, @@ -763,9 +765,9 @@ def _convert_v1_inputs( raise ValueError("Either prompts or prompt_token_ids must be " "provided.") - inputs: List[PromptInputs] = [] + parsed_prompts: List[PromptType] = [] for i in range(num_requests): - item: PromptInputs + item: PromptType if prompts is not None: item = TextPrompt(prompt=prompts[i]) @@ -774,13 +776,13 @@ def _convert_v1_inputs( else: raise AssertionError - inputs.append(item) + parsed_prompts.append(item) - return inputs + return parsed_prompts def _validate_and_add_requests( self, - inputs: Union[PromptInputs, Sequence[PromptInputs]], + prompts: Union[PromptType, Sequence[PromptType]], params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams, Sequence[PoolingParams]], lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]], @@ -788,11 +790,11 @@ def _validate_and_add_requests( guided_options: Optional[GuidedDecodingRequest] = None, priority: Optional[List[int]] = None, ) -> None: - if isinstance(inputs, (str, dict)): + if isinstance(prompts, (str, dict)): # Convert a single prompt to a list. - inputs = [inputs] + prompts = [prompts] - num_requests = len(inputs) + num_requests = len(prompts) if isinstance(params, list) and len(params) != num_requests: raise ValueError("The lengths of prompts and params " "must be the same.") @@ -809,9 +811,9 @@ def _validate_and_add_requests( sp.output_kind = RequestOutputKind.FINAL_ONLY # Add requests to the engine. 
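The ``_validate_and_add_requests`` hunk above keeps the existing batching convention: a lone ``str`` or dict prompt becomes a batch of one, and the params argument may be a single object or a per-prompt list of the same length. A standalone sketch of that normalization (the ``normalize`` helper is hypothetical and written only to mirror the checks in the hunk; ``params`` stands in for ``SamplingParams``/``PoolingParams``):

.. code-block:: python

    def normalize(prompts, params):
        # A bare string or dict prompt is wrapped into a one-element batch,
        # mirroring the isinstance(prompts, (str, dict)) check above.
        if isinstance(prompts, (str, dict)):
            prompts = [prompts]
        if isinstance(params, list) and len(params) != len(prompts):
            raise ValueError("The lengths of prompts and params must be the same.")
        for i, prompt in enumerate(prompts):
            yield prompt, params[i] if isinstance(params, list) else params

    assert len(list(normalize("hi", None))) == 1          # single prompt, shared params
    assert len(list(normalize(["a", "b"], [1, 2]))) == 2  # per-prompt params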
- for i, request_inputs in enumerate(inputs): + for i, prompt in enumerate(prompts): self._add_request( - request_inputs, + prompt, params[i] if isinstance(params, Sequence) else params, lora_request=lora_request[i] if isinstance( lora_request, Sequence) else lora_request, @@ -821,7 +823,7 @@ def _validate_and_add_requests( def _add_request( self, - inputs: PromptInputs, + prompt: PromptType, params: Union[SamplingParams, PoolingParams], lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -830,7 +832,7 @@ def _add_request( request_id = str(next(self.request_counter)) self.llm_engine.add_request( request_id, - inputs, + prompt, params, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 0b08e9691f91..a8c8672cb5fe 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,5 +1,5 @@ from .data import (EncoderDecoderLLMInputs, ExplicitEncoderDecoderPrompt, - LLMInputs, PromptInputs, SingletonPromptInputs, TextPrompt, + LLMInputs, PromptType, SingletonPrompt, TextPrompt, TokensPrompt, build_explicit_enc_dec_prompt, to_enc_dec_tuple_list, zip_enc_dec_prompts) from .registry import InputContext, InputRegistry @@ -16,8 +16,8 @@ __all__ = [ "TextPrompt", "TokensPrompt", - "PromptInputs", - "SingletonPromptInputs", + "PromptType", + "SingletonPrompt", "ExplicitEncoderDecoderPrompt", "LLMInputs", "EncoderDecoderLLMInputs", @@ -28,3 +28,17 @@ "InputContext", "InputRegistry", ] + + +def __getattr__(name: str): + if name == "PromptInput": + import warnings + + msg = ("PromptInput has been renamed to PromptType. " + "The original name will be removed in an upcoming version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return PromptType + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index a71e9a7b5db6..dfbcf9526487 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -33,7 +33,7 @@ class TokensPrompt(TypedDict): """ -SingletonPromptInputs = Union[str, TextPrompt, TokensPrompt] +SingletonPrompt = Union[str, TextPrompt, TokensPrompt] """ Set of possible schemas for a single LLM input: @@ -46,7 +46,7 @@ class TokensPrompt(TypedDict): the user desires to express both the encoder & decoder prompts explicitly, i.e. :class:`ExplicitEncoderDecoderPrompt` -A prompt of type :class:`SingletonPromptInputs` may be employed +A prompt of type :class:`SingletonPrompt` may be employed as (1) input to a decoder-only model, (2) input to the encoder of an encoder/decoder model, in the scenario where the decoder-prompt is not specified explicitly, or @@ -55,33 +55,32 @@ class TokensPrompt(TypedDict): """ _T1_co = TypeVar("_T1_co", - bound=SingletonPromptInputs, - default=SingletonPromptInputs, + bound=SingletonPrompt, + default=SingletonPrompt, covariant=True) _T2_co = TypeVar("_T2_co", - bound=SingletonPromptInputs, - default=SingletonPromptInputs, + bound=SingletonPrompt, + default=SingletonPrompt, covariant=True) # TODO: Make fields ReadOnly once mypy supports it class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): - """Represents an encoder/decoder model input prompt, - comprising an explicit encoder prompt and a - decoder prompt. + """ + Represents an encoder/decoder model input prompt, + comprising an explicit encoder prompt and a decoder prompt. 
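The module-level ``__getattr__`` hooks added to ``vllm/inputs/__init__.py`` above (and to ``vllm/inputs/data.py`` further below) rely on PEP 562: attribute lookup on a module falls back to ``__getattr__`` only when the name is not actually defined, so ``PromptInput`` keeps resolving to ``PromptType`` while emitting a warning. A minimal sketch of the mechanism, assuming a toy module in which ``PromptType`` is just a stand-in:

.. code-block:: python

    # toy_inputs.py -- illustrative module, not part of the diff
    import warnings

    PromptType = str  # stand-in for the real Union type

    def __getattr__(name: str):
        if name == "PromptInput":
            warnings.warn(
                "PromptInput has been renamed to PromptType. "
                "The original name will be removed in an upcoming version.",
                DeprecationWarning,
                stacklevel=2,
            )
            return PromptType
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

With this in place, ``from toy_inputs import PromptInput`` still works but warns, while ``toy_inputs.PromptType`` resolves normally and never reaches ``__getattr__``.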
- The encoder and decoder prompts, respectively, - may formatted according to any of the - :class:`SingletonPromptInputs` schemas, and are not - required to have the same schema. + The encoder and decoder prompts, respectively, may be formatted + according to any of the :class:`SingletonPrompt` schemas, + and are not required to have the same schema. Only the encoder prompt may have multi-modal data. Note that an :class:`ExplicitEncoderDecoderPrompt` may not be used as an input to a decoder-only model, - and that the `encoder_prompt` and `decoder_prompt` + and that the :code:`encoder_prompt` and :code:`decoder_prompt` fields of this data structure themselves must be - :class:`SingletonPromptInputs` instances. + :class:`SingletonPrompt` instances. """ encoder_prompt: _T1_co @@ -89,7 +88,7 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): decoder_prompt: Optional[_T2_co] -PromptInputs = Union[SingletonPromptInputs, ExplicitEncoderDecoderPrompt] +PromptType = Union[SingletonPrompt, ExplicitEncoderDecoderPrompt] """ Set of possible schemas for an LLM input, including both decoder-only and encoder/decoder input types: @@ -146,12 +145,8 @@ class EncoderDecoderLLMInputs(LLMInputs): """ -_T1 = TypeVar("_T1", - bound=SingletonPromptInputs, - default=SingletonPromptInputs) -_T2 = TypeVar("_T2", - bound=SingletonPromptInputs, - default=SingletonPromptInputs) +_T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt) +_T2 = TypeVar("_T2", bound=SingletonPrompt, default=SingletonPrompt) def build_explicit_enc_dec_prompt( @@ -182,3 +177,17 @@ def to_enc_dec_tuple_list( return [(enc_dec_prompt["encoder_prompt"], enc_dec_prompt["decoder_prompt"]) for enc_dec_prompt in enc_dec_prompts] + + +def __getattr__(name: str): + if name == "PromptInput": + import warnings + + msg = ("PromptInput has been renamed to PromptType. 
" + "The original name will be removed in an upcoming version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return PromptType + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index ac9d355c64c8..e5fa1e418427 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -5,7 +5,7 @@ from vllm.utils import is_list_of from .data import (EncoderDecoderLLMInputs, ExplicitEncoderDecoderPrompt, - LLMInputs, PromptInputs, SingletonPromptInputs, TextPrompt, + LLMInputs, PromptType, SingletonPrompt, TextPrompt, TokensPrompt) @@ -81,23 +81,23 @@ class ParsedTokensPrompt(TypedDict): def parse_singleton_prompt( - inputs: SingletonPromptInputs, + prompt: SingletonPrompt, ) -> Union[ParsedStrPrompt, ParsedTextPrompt, ParsedTokensPrompt]: - if isinstance(inputs, str): - return ParsedStrPrompt(type="str", content=inputs) - elif isinstance(inputs, dict): - if "prompt_token_ids" in inputs: + if isinstance(prompt, str): + return ParsedStrPrompt(type="str", content=prompt) + elif isinstance(prompt, dict): + if "prompt_token_ids" in prompt: return ParsedTokensPrompt(type="tokens", - content=inputs) # type: ignore - elif "prompt" in inputs: - return ParsedTextPrompt(type="text", content=inputs) + content=prompt) # type: ignore + elif "prompt" in prompt: + return ParsedTextPrompt(type="text", content=prompt) raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt") def is_explicit_encoder_decoder_prompt( - inputs: PromptInputs) -> TypeIs[ExplicitEncoderDecoderPrompt]: - return isinstance(inputs, dict) and "encoder_prompt" in inputs + prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]: + return isinstance(prompt, dict) and "encoder_prompt" in prompt def is_valid_encoder_decoder_llm_inputs( diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 6d54a07e92cc..d4474a10f542 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -10,8 +10,8 @@ from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.utils import print_warning_once -from .data import (EncoderDecoderLLMInputs, LLMInputs, PromptInputs, - SingletonPromptInputs) +from .data import (EncoderDecoderLLMInputs, LLMInputs, PromptType, + SingletonPrompt) from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt if TYPE_CHECKING: @@ -209,7 +209,7 @@ async def _tokenize_prompt_async( def _extract_prompt_components( self, - inputs: SingletonPromptInputs, + prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, ) -> PromptComponents: @@ -219,7 +219,7 @@ def _extract_prompt_components( Arguments: * request_id - * inputs: single encoder or decoder input prompt + * prompt: single encoder or decoder input prompt * lora_request: this is only valid for decoder prompts Returns: @@ -229,24 +229,24 @@ def _extract_prompt_components( * multi_modal_data ''' - parsed = parse_singleton_prompt(inputs) + parsed = parse_singleton_prompt(prompt) if parsed["type"] == "str": - prompt = parsed["content"] + prompt_text = parsed["content"] prompt_token_ids = self._tokenize_prompt( - prompt, + prompt_text, request_id=request_id, lora_request=lora_request, ) multi_modal_data = None elif parsed["type"] == "tokens": - prompt = None + prompt_text = None prompt_token_ids = parsed["content"]["prompt_token_ids"] multi_modal_data = parsed["content"].get("multi_modal_data") elif parsed["type"] == "text": - prompt = parsed["content"]["prompt"] + prompt_text = 
parsed["content"]["prompt"] prompt_token_ids = self._tokenize_prompt( - prompt, + prompt_text, request_id=request_id, lora_request=lora_request, ) @@ -254,33 +254,33 @@ def _extract_prompt_components( else: assert_never(parsed) - return prompt, prompt_token_ids, multi_modal_data + return prompt_text, prompt_token_ids, multi_modal_data async def _extract_prompt_components_async( self, - inputs: SingletonPromptInputs, + prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, ) -> PromptComponents: """Async version of :meth:`_extract_prompt_components`.""" - parsed = parse_singleton_prompt(inputs) + parsed = parse_singleton_prompt(prompt) if parsed["type"] == "str": - prompt = parsed["content"] + prompt_text = parsed["content"] prompt_token_ids = await self._tokenize_prompt_async( - prompt, + prompt_text, request_id=request_id, lora_request=lora_request, ) multi_modal_data = None elif parsed["type"] == "tokens": - prompt = None + prompt_text = None prompt_token_ids = parsed["content"]["prompt_token_ids"] multi_modal_data = parsed["content"].get("multi_modal_data") elif parsed["type"] == "text": - prompt = parsed["content"]["prompt"] + prompt_text = parsed["content"]["prompt"] prompt_token_ids = await self._tokenize_prompt_async( - prompt, + prompt_text, request_id=request_id, lora_request=lora_request, ) @@ -288,7 +288,7 @@ async def _extract_prompt_components_async( else: assert_never(parsed) - return prompt, prompt_token_ids, multi_modal_data + return prompt_text, prompt_token_ids, multi_modal_data def _build_enc_dec_llm_inputs( self, @@ -321,7 +321,7 @@ def _build_enc_dec_llm_inputs( def _process_encoder_decoder_prompt( self, - inputs: PromptInputs, + prompt: PromptType, request_id: str, ) -> EncoderDecoderLLMInputs: ''' @@ -349,7 +349,7 @@ def _process_encoder_decoder_prompt( Arguments: - * inputs: an input prompt + * prompt: an input prompt * request_id Returns: @@ -360,13 +360,13 @@ def _process_encoder_decoder_prompt( encoder_comps: PromptComponents decoder_comps: DecoderPromptComponents - if is_explicit_encoder_decoder_prompt(inputs): + if is_explicit_encoder_decoder_prompt(prompt): encoder_comps = self._extract_prompt_components( - inputs["encoder_prompt"], + prompt["encoder_prompt"], request_id=request_id, ) - if (decoder_input := inputs["decoder_prompt"]) is None: + if (decoder_input := prompt["decoder_prompt"]) is None: decoder_comps = None, None, None else: decoder_comps = self._extract_prompt_components( @@ -375,7 +375,7 @@ def _process_encoder_decoder_prompt( ) else: encoder_comps = self._extract_prompt_components( - inputs, + prompt, request_id=request_id, ) @@ -385,20 +385,20 @@ def _process_encoder_decoder_prompt( async def _process_encoder_decoder_prompt_async( self, - inputs: PromptInputs, + prompt: PromptType, request_id: str, ) -> EncoderDecoderLLMInputs: """Async version of :meth:`_process_encoder_decoder_prompt`.""" encoder_comps: PromptComponents decoder_comps: DecoderPromptComponents - if is_explicit_encoder_decoder_prompt(inputs): + if is_explicit_encoder_decoder_prompt(prompt): encoder_task = self._extract_prompt_components_async( - inputs["encoder_prompt"], + prompt["encoder_prompt"], request_id=request_id, ) - if (decoder_input := inputs["decoder_prompt"]) is None: + if (decoder_input := prompt["decoder_prompt"]) is None: encoder_comps = await encoder_task decoder_comps = None, None, None else: @@ -411,7 +411,7 @@ async def _process_encoder_decoder_prompt_async( encoder_task, decoder_task) else: encoder_comps = await 
self._extract_prompt_components_async( - inputs, + prompt, request_id=request_id, ) @@ -435,7 +435,7 @@ def _build_decoder_only_llm_inputs( def _process_decoder_only_prompt( self, - inputs: SingletonPromptInputs, + prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -446,7 +446,7 @@ def _process_decoder_only_prompt( Arguments: - * inputs: input prompt + * prompt: input prompt * request_id * lora_request * prompt_adapter_request @@ -457,7 +457,7 @@ def _process_decoder_only_prompt( ''' prompt_comps = self._extract_prompt_components( - inputs, + prompt, request_id=request_id, lora_request=lora_request, ) @@ -469,14 +469,14 @@ def _process_decoder_only_prompt( async def _process_decoder_only_prompt_async( self, - inputs: SingletonPromptInputs, + prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> LLMInputs: """Async version of :meth:`_process_decoder_only_prompt`.""" prompt_comps = await self._extract_prompt_components_async( - inputs, + prompt, request_id=request_id, lora_request=lora_request, ) @@ -488,7 +488,7 @@ async def _process_decoder_only_prompt_async( def preprocess( self, - inputs: PromptInputs, + prompt: PromptType, request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -498,17 +498,17 @@ def preprocess( # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder return self._process_encoder_decoder_prompt( - inputs, + prompt, request_id=request_id, ) - if is_explicit_encoder_decoder_prompt(inputs): + if is_explicit_encoder_decoder_prompt(prompt): raise ValueError("Cannot pass encoder-decoder prompt " "to decoder-only models") # Decoder-only operation return self._process_decoder_only_prompt( - inputs, + prompt, request_id=request_id, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, @@ -516,7 +516,7 @@ def preprocess( async def preprocess_async( self, - inputs: PromptInputs, + prompt: PromptType, request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, @@ -526,17 +526,17 @@ async def preprocess_async( # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder return await self._process_encoder_decoder_prompt_async( - inputs, + prompt, request_id=request_id, ) - if is_explicit_encoder_decoder_prompt(inputs): + if is_explicit_encoder_decoder_prompt(prompt): raise ValueError("Cannot pass encoder-decoder prompt " "to decoder-only models") # Decoder-only operation return await self._process_decoder_only_prompt_async( - inputs, + prompt, request_id=request_id, lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, From dc4e3df5c23282b2ebaead95f179c25c9d7ec4d8 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 27 Sep 2024 00:26:38 -0700 Subject: [PATCH 016/199] [misc] fix collect env (#8894) --- collect_env.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/collect_env.py b/collect_env.py index c5cd8c315e74..ae7f97f35525 100644 --- a/collect_env.py +++ b/collect_env.py @@ -267,13 +267,23 @@ def get_neuron_sdk_version(run_lambda): def get_vllm_version(): + version = "" try: import vllm - return vllm.__version__ + "@" + vllm.__commit__ + version = vllm.__version__ except Exception: - # old version of vllm does not 
have __commit__ - return 'N/A' - + pass + commit = "" + try: + import vllm + commit = vllm.__commit__ + except Exception: + pass + if version != "" and commit != "": + return f"{version}@{commit}" + if version == "" and commit == "": + return "N/A" + return version or commit def summarize_vllm_build_flags(): # This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc. From 0e088750af2e8035c07d356b56c03393cfb56004 Mon Sep 17 00:00:00 2001 From: Peter Pan Date: Fri, 27 Sep 2024 16:13:25 +0800 Subject: [PATCH 017/199] [MISC] Fix invalid escape sequence '\' (#8830) Signed-off-by: Peter Pan --- benchmarks/benchmark_serving.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index a407a263120b..bbe712223a53 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -1,4 +1,4 @@ -"""Benchmark online serving throughput. +r"""Benchmark online serving throughput. On the server side, run one of the following commands: vLLM OpenAI API server @@ -963,4 +963,4 @@ def main(args: argparse.Namespace): ) args = parser.parse_args() - main(args) \ No newline at end of file + main(args) From 6d792d2f31b2cfb335d1a4a7c45fe4ce143c203a Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 27 Sep 2024 16:15:58 +0800 Subject: [PATCH 018/199] [Bugfix][VLM] Fix Fuyu batching inference with `max_num_seqs>1` (#8892) --- .../decoder_only/vision_language/test_fuyu.py | 6 +-- vllm/model_executor/models/fuyu.py | 51 +++++++++++++------ 2 files changed, 37 insertions(+), 20 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_fuyu.py b/tests/models/decoder_only/vision_language/test_fuyu.py index 94b8431424db..7827ecb19a74 100644 --- a/tests/models/decoder_only/vision_language/test_fuyu.py +++ b/tests/models/decoder_only/vision_language/test_fuyu.py @@ -65,8 +65,8 @@ def run_test( # max_model_len should be greater than image_feature_size with vllm_runner(model, - max_model_len=2560, - max_num_seqs=1, + max_model_len=2048, + max_num_seqs=2, dtype=dtype, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, @@ -80,8 +80,6 @@ def run_test( ] with hf_runner(model, dtype=dtype) as hf_model: - hf_model.model.get_output_embeddings = lambda: \ - hf_model.model.language_model.get_output_embeddings() eos_token_id = hf_model.processor.tokenizer.eos_token_id hf_outputs_per_image = [ hf_model.generate_greedy_logprobs_limit(prompts, diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index d50f4fb9e6ed..9f4dca78d435 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -42,7 +42,7 @@ SequenceData) from .interfaces import SupportsMultiModal -from .utils import merge_multimodal_embeddings +from .utils import flatten_bn, merge_multimodal_embeddings # Cannot find the following 2 numbers from hf config. 
_IMAGE_TOKEN_ID = 71011 @@ -165,7 +165,7 @@ def input_processor_for_fuyu(ctx: InputContext, llm_inputs: LLMInputs): model_config.model) model_image_input = _fuyu_image_preprocess(image_processor, image_data) - image_patches = torch.stack([ + image_patches = torch.cat([ image_patch[0] for image_patch in model_image_input["image_patches"] ]) @@ -210,7 +210,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object): ]) # image has been processed with prompt in input processor - return MultiModalInputs({"image_patches": data}) + return MultiModalInputs({"pixel_values": data}) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) @@ -242,23 +242,42 @@ def __init__(self, cache_config=cache_config, quant_config=quant_config) + def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: + + h = w = self.config.patch_size + num_channels = self.config.num_channels + expected_dims = num_channels * h * w + + def _validate_shape(d: torch.Tensor): + actual_dims = d.size(-1) + + if actual_dims != expected_dims: + expected_expr = str(expected_dims) + raise ValueError( + "The expected shape of pixel values per image per batch " + f" per patch is {expected_expr}. " + f"You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data.to(self.vision_embed_tokens.weight.dtype) + def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[FuyuImagePixelInputs]: - image_patches = kwargs.pop("image_patches", None) + pixel_values = kwargs.pop("pixel_values", None) - if isinstance(image_patches, torch.Tensor): - # Remove the N dimension until multiple images are supported. - image_patches = image_patches.squeeze(1) + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of image patches. 
" + f"Got type: {type(pixel_values)}") + + return FuyuImagePixelInputs( + type="pixel_values", + data=self._validate_pixel_values( + flatten_bn(pixel_values, concat=True)), + ) - expected_feature_size = self.image_feature_size - if image_patches.size(-1) != expected_feature_size: - raise ValueError( - f"Expected image patches to have the last dimension of " - f"{expected_feature_size}, got {image_patches.size(-1)}") - image_patches = image_patches.to( - self.vision_embed_tokens.weight.dtype) - return FuyuImagePixelInputs(type="pixel_values", - data=image_patches) return None def _process_image_input( From 8df2dc3c8812c0abb97ce3e2913411d88524e59f Mon Sep 17 00:00:00 2001 From: Brittany <24945384+bvrockwell@users.noreply.github.com> Date: Fri, 27 Sep 2024 01:16:55 -0700 Subject: [PATCH 019/199] [TPU] Update pallas.py to support trillium (#8871) --- vllm/attention/backends/pallas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index 83fdef16ef5c..a8a78d41c666 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -130,7 +130,7 @@ def __init__( assert tpu_type is not None tpu_type = tpu_type.lower() - if "lite" not in tpu_type: + if (("lite" not in tpu_type) and ("v6" not in tpu_type)): if self.num_kv_heads % 2 == 0: self.megacore_mode = "kv_head" else: From a9b15c606fea67a072416ea0ea115261a2756058 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 27 Sep 2024 08:11:32 -0700 Subject: [PATCH 020/199] [torch.compile] use empty tensor instead of None for profiling (#8875) --- tests/kernels/test_encoder_decoder_attn.py | 8 ++++++-- vllm/attention/backends/blocksparse_attn.py | 6 ++++-- vllm/attention/backends/flash_attn.py | 6 ++++-- vllm/attention/backends/flashinfer.py | 6 +++--- vllm/attention/backends/ipex_attn.py | 9 ++++++--- vllm/attention/backends/pallas.py | 12 +++++++----- vllm/attention/backends/rocm_flash_attn.py | 6 ++++-- vllm/attention/backends/torch_sdpa.py | 9 ++++++--- vllm/attention/backends/xformers.py | 8 +++++--- vllm/worker/embedding_model_runner.py | 8 +++++++- vllm/worker/enc_dec_model_runner.py | 8 +++++++- vllm/worker/model_runner.py | 8 +++++++- vllm/worker/tpu_model_runner.py | 4 ++-- vllm/worker/tpu_worker.py | 10 +++++++++- vllm/worker/xpu_model_runner.py | 8 +++++++- 15 files changed, 84 insertions(+), 32 deletions(-) diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index b550a7fdd84f..6b979d0558c4 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -136,7 +136,9 @@ class that Attention will automatically select when it is constructed. 
) if test_pt.num_blocks is None or test_pt.num_heads is None: # Caller does not require a KV cache - return TestResources(scale, attn_backend, attn, None) + return TestResources( + scale, attn_backend, attn, + torch.tensor([], dtype=torch.float32, device=CUDA_DEVICE)) # Construct KV cache kv_cache = make_kv_cache(test_pt.num_blocks, @@ -620,7 +622,9 @@ def _run_encoder_attention_test( return attn.forward(packed_qkv.query, packed_qkv.key, packed_qkv.value, - None, + torch.tensor([], + dtype=torch.float32, + device=packed_qkv.query.device), attn_metadata, attn_type=attn_type) diff --git a/vllm/attention/backends/blocksparse_attn.py b/vllm/attention/backends/blocksparse_attn.py index d84a40890ebb..656cfd124ab4 100644 --- a/vllm/attention/backends/blocksparse_attn.py +++ b/vllm/attention/backends/blocksparse_attn.py @@ -357,6 +357,8 @@ def forward( key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + NOTE: kv_cache will be an empty tensor with shape [0] + for profiling run. attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] @@ -373,7 +375,7 @@ def forward( key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) - if kv_cache is not None: + if kv_cache.numel() > 0: key_cache, value_cache = PagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) @@ -399,7 +401,7 @@ def forward( # When block_tables are not filled, it means q and k are the # prompt, and they have the same length. - assert kv_cache is None \ + assert kv_cache.numel() == 0 \ or prefill_meta.block_tables is None \ or prefill_meta.block_tables.numel() == 0, \ "Does not support prefix-enabled attention." diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 084e8113cd42..22d07c0a4f68 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -665,6 +665,8 @@ def forward( key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + NOTE: kv_cache will be an empty tensor with shape [0] + for profiling run. attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] @@ -685,7 +687,7 @@ def forward( key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) - if kv_cache is not None: + if kv_cache.numel() > 0: key_cache = kv_cache[0] value_cache = kv_cache[1] @@ -722,7 +724,7 @@ def forward( if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. 
- if (kv_cache is None or prefill_meta.block_tables is None + if (kv_cache.numel() == 0 or prefill_meta.block_tables is None or prefill_meta.block_tables.numel() == 0): # normal attention # When block_tables are not filled, it means q and k are the diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 3a602fbfbbc0..784cff0d9878 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -746,7 +746,7 @@ def forward( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: Optional[torch.Tensor], + kv_cache: torch.Tensor, attn_metadata: FlashInferMetadata, k_scale: float = 1.0, v_scale: float = 1.0, @@ -770,7 +770,7 @@ def forward( if attn_metadata.num_decode_tokens > 0: assert attn_metadata.num_prefill_tokens == 0, ( "Chunked prefill is not supported with flashinfer yet.") - if kv_cache is not None: + if kv_cache.numel() > 0: # Use the same reshape and cache kernel as flash attention. ops.reshape_and_cache_flash( key, @@ -796,7 +796,7 @@ def forward( # when kv_cache is not provided. # This happens when vllm runs the profiling to # determine the number of blocks. - if kv_cache is None: + if kv_cache.numel() == 0: output = torch.ops.vllm.flash_attn_varlen_func( q=query, k=key, diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 113a2788eacd..7398732ddfc9 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -167,7 +167,7 @@ def forward( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: Optional[torch.Tensor], + kv_cache: torch.Tensor, attn_metadata: IpexAttnMetadata, # type: ignore k_scale: float = 1.0, v_scale: float = 1.0, @@ -180,6 +180,8 @@ def forward( key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + NOTE: kv_cache will be an empty tensor with shape [0] + for profiling run. attn_metadata: Metadata for attention. 
Returns: shape = [num_tokens, num_heads * head_size] @@ -196,7 +198,7 @@ def forward( key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) - if kv_cache is not None: + if kv_cache.numel() > 0: key_cache, value_cache = self.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) ipex_ops.reshape_and_cache( @@ -212,7 +214,8 @@ def forward( if attn_metadata.is_prompt: assert attn_metadata.seq_lens is not None - if (kv_cache is None or attn_metadata.block_tables.numel() == 0): + if (kv_cache.numel() == 0 + or attn_metadata.block_tables.numel() == 0): if self.num_kv_heads != self.num_heads: key = key.repeat_interleave(self.num_queries_per_kv, dim=1) value = value.repeat_interleave(self.num_queries_per_kv, diff --git a/vllm/attention/backends/pallas.py b/vllm/attention/backends/pallas.py index a8a78d41c666..86716602985a 100644 --- a/vllm/attention/backends/pallas.py +++ b/vllm/attention/backends/pallas.py @@ -143,7 +143,7 @@ def forward( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: Tuple[Optional[torch.Tensor], Optional[torch.Tensor]], + kv_cache: Tuple[torch.Tensor, torch.Tensor], attn_metadata: PallasMetadata, k_scale: float = 1.0, v_scale: float = 1.0, @@ -155,8 +155,10 @@ def forward( query: shape = [batch_size, seq_len, num_heads * head_size] key: shape = [batch_size, seq_len, num_kv_heads * head_size] value: shape = [batch_size, seq_len, num_kv_heads * head_size] - key_cache = [num_kv_heads, num_blocks, block_size, head_size] - value_cache = [num_kv_heads, num_blocks, block_size, head_size] + kv_cache[0] = [num_kv_heads, num_blocks, block_size, head_size] + kv_cache[1] = [num_kv_heads, num_blocks, block_size, head_size] + NOTE: kv_cache[0] and kv_cache[1] will be an empty tensor + with shape [0] for profiling run. attn_metadata: Metadata for attention. Returns: shape = [batch_size, seq_len, num_heads * head_size] @@ -173,7 +175,7 @@ def forward( value = value.view(batch_size, seq_len, self.num_kv_heads, self.head_size) - if kv_cache[0] is not None: + if kv_cache[0].numel() > 0: slot_mapping = attn_metadata.slot_mapping key_cache, value_cache = kv_cache write_to_kv_cache(key, value, key_cache, value_cache, slot_mapping) @@ -205,7 +207,7 @@ def forward( output = output.permute(0, 2, 1, 3) else: # Decoding run. - assert kv_cache is not None + assert kv_cache[0].numel() > 0 pages_per_compute_block = 16 # TODO(woosuk): Tune this value. if self.megacore_mode == "batch" and batch_size % 2 != 0: diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 5560f44be419..5ee3c3b69cf3 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -396,6 +396,8 @@ def forward( key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + NOTE: kv_cache will be an empty tensor with shape [0] + for profiling run. attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] @@ -412,7 +414,7 @@ def forward( key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) - if kv_cache is not None: + if kv_cache.numel() > 0: key_cache, value_cache = PagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) @@ -449,7 +451,7 @@ def forward( if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. 
assert prefill_meta.seq_lens is not None - if kv_cache is None or prefill_meta.block_tables.numel() == 0: + if kv_cache.numel() == 0 or prefill_meta.block_tables.numel() == 0: # triton attention # When block_tables are not filled, it means q and k are the # prompt, and they have the same length. diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index 8a1f8f2930c8..2a215331704c 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -151,7 +151,7 @@ def forward( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - kv_cache: Optional[torch.Tensor], + kv_cache: torch.Tensor, attn_metadata: TorchSDPAMetadata, # type: ignore k_scale: float = 1.0, v_scale: float = 1.0, @@ -164,6 +164,8 @@ def forward( key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + NOTE: kv_cache will be an empty tensor with shape [0] + for profiling run. attn_metadata: Metadata for attention. Returns: shape = [num_tokens, num_heads * head_size] @@ -180,7 +182,7 @@ def forward( key = key.view(-1, self.num_kv_heads, self.head_size) value = value.view(-1, self.num_kv_heads, self.head_size) - if kv_cache is not None: + if kv_cache.numel() > 0: key_cache, value_cache = PagedAttention.split_kv_cache( kv_cache, self.num_kv_heads, self.head_size) PagedAttention.write_to_paged_cache(key, value, key_cache, @@ -191,7 +193,8 @@ def forward( if attn_metadata.is_prompt: assert attn_metadata.seq_lens is not None - if (kv_cache is None or attn_metadata.block_tables.numel() == 0): + if (kv_cache.numel() == 0 + or attn_metadata.block_tables.numel() == 0): if self.num_kv_heads != self.num_heads: key = key.repeat_interleave(self.num_queries_per_kv, dim=1) value = value.repeat_interleave(self.num_queries_per_kv, diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index e073d616bf01..143fa6ee7dea 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -445,7 +445,7 @@ def forward( query: torch.Tensor, key: Optional[torch.Tensor], value: Optional[torch.Tensor], - kv_cache: Optional[torch.Tensor], + kv_cache: torch.Tensor, attn_metadata: "XFormersMetadata", k_scale: float = 1.0, v_scale: float = 1.0, @@ -489,6 +489,8 @@ def forward( key: shape = [num_tokens, num_kv_heads * head_size] value: shape = [num_tokens, num_kv_heads * head_size] kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size] + NOTE: kv_cache will be an empty tensor with shape [0] + for profiling run. attn_metadata: Metadata for attention. attn_type: Select attention type, between encoder attention, decoder self-attention, or encoder/decoder cross- @@ -522,7 +524,7 @@ def forward( # which KV cache memory-mapping & which # seqlen datastructures we utilize - if (attn_type != AttentionType.ENCODER and kv_cache is not None): + if (attn_type != AttentionType.ENCODER and kv_cache.numel() > 0): # KV-cache during decoder-self- or # encoder-decoder-cross-attention, but not # during encoder attention. @@ -588,7 +590,7 @@ def forward( if prefill_meta := attn_metadata.prefill_metadata: # Prompt run. - if kv_cache is None or prefill_meta.block_tables.numel() == 0: + if kv_cache.numel() == 0 or prefill_meta.block_tables.numel() == 0: # normal attention. # block tables are empty if the prompt does not have a cached # prefix. 
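A recurring pattern across the attention backends in this commit: ``None`` placeholders for the KV cache become empty tensors, and ``is None`` checks become ``numel() == 0`` checks, so that Dynamo traces the argument by reference instead of specializing the graph on the literal ``None``. A minimal standalone sketch of the sentinel convention (``attention_forward`` is a toy stand-in; real cache handling is elided):

.. code-block:: python

    import torch

    def attention_forward(query: torch.Tensor, kv_cache: torch.Tensor) -> torch.Tensor:
        # During profiling the caller passes torch.tensor([]) instead of None,
        # so the "no cache yet" case is detected via numel() rather than `is None`.
        if kv_cache.numel() == 0:
            return query  # profiling run: nothing has been cached
        return query      # real run: cache lookups elided in this sketch

    num_layers = 4
    # dtype does not matter for the placeholder; float32 is a safe default
    kv_caches = [torch.tensor([], dtype=torch.float32) for _ in range(num_layers)]
    attention_forward(torch.randn(2, 8), kv_caches[0])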
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index 0121f5da79f1..5c5d20a51e7d 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -97,7 +97,13 @@ def execute_model( model_executable = self.model num_layers = self.model_config.get_num_layers(self.parallel_config) - kv_caches = [None] * num_layers + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value ``None``. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + ] * num_layers execute_model_kwargs = { "input_ids": diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index bd716ac3e7ec..3bb4e28c6e1b 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -340,7 +340,13 @@ def profile_run(self) -> None: # Run the model with the dummy inputs. num_layers = self.model_config.get_num_layers(self.parallel_config) - kv_caches = [None] * num_layers + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value ``None``. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + ] * num_layers finished_requests_ids = [seq.request_id for seq in seqs] model_input = self.prepare_model_input( seqs, finished_requests_ids=finished_requests_ids) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 0a90f767567d..8c2e6c2d721b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1223,7 +1223,13 @@ def profile_run(self) -> None: # Run the model with the dummy inputs. num_layers = self.model_config.get_num_layers(self.parallel_config) - kv_caches = [None] * num_layers + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value ``None``. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + ] * num_layers finished_requests_ids = [seq.request_id for seq in seqs] model_input = self.prepare_model_input( seqs, finished_requests_ids=finished_requests_ids) diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 575769ca1aa4..2472ac25aee4 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -714,7 +714,7 @@ def forward( t: torch.Tensor, p: torch.Tensor, num_samples: int, - kv_caches: List[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]], + kv_caches: List[Tuple[torch.Tensor, torch.Tensor]], ) -> torch.Tensor: """Executes the forward pass of the model and samples the next token. @@ -745,7 +745,7 @@ def forward( ) # Skip this in memory profiling at initialization. - if kv_caches[0][0] is not None: + if kv_caches[0][0].numel() > 0: # index_copy_(slot_mapping) only works when the inserted dimension # is 0. However, the KV cache in the Pallas backend has the shape # [num_kv_heads, num_blocks, block_size, head_size]. 
To make it diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 9e0c522cee45..fe819b9f4b3a 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -115,7 +115,15 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: head_size = self.model_config.get_head_size() num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) - kv_caches = [(None, None) for _ in range(num_layers)] + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value ``None``. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + kv_caches = [(torch.tensor([], dtype=torch.float32, + device=self.device), + torch.tensor([], dtype=torch.float32, + device=self.device)) + for _ in range(num_layers)] self.model_runner._dummy_run( batch_size=1, seq_len=self.scheduler_config.max_num_batched_tokens, diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index d3c763c995b3..8282736cf479 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -464,7 +464,13 @@ def profile_run(self) -> None: # Run the model with the dummy inputs. num_layers = self.model_config.get_num_layers(self.parallel_config) - kv_caches = [None] * num_layers + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value ``None``. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + ] * num_layers finished_requests_ids = [seq.request_id for seq in seqs] model_input = self.prepare_model_input( seqs, finished_requests_ids=finished_requests_ids) From 172d1cd27634e9e7adc9cb9feac73552cfae1b24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Fri, 27 Sep 2024 14:25:10 -0400 Subject: [PATCH 021/199] [Kernel] AQ AZP 4/4: Integrate asymmetric quantization to linear method (#7271) --- ...Instruct-INT8-compressed-tensors-asym.yaml | 11 ++++ .../lm-eval-harness/configs/models-small.txt | 1 + .../test_lm_eval_correctness.py | 7 ++- tests/quantization/test_compressed_tensors.py | 36 +++++++++--- .../compressed_tensors/compressed_tensors.py | 16 ++++-- .../schemes/compressed_tensors_w8a8_int8.py | 55 ++++++++++++++++++- .../layers/quantization/utils/w8a8_utils.py | 19 ++++++- 7 files changed, 124 insertions(+), 21 deletions(-) create mode 100644 .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml new file mode 100644 index 000000000000..0ecfc01ef049 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1 +model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.764 + - name: "exact_match,flexible-extract" + value: 0.764 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt 
b/.buildkite/lm-eval-harness/configs/models-small.txt index 064883859218..64a0f428587a 100644 --- a/.buildkite/lm-eval-harness/configs/models-small.txt +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -1,6 +1,7 @@ Meta-Llama-3-8B-Instruct.yaml Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml +Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml Minitron-4B-Base-FP8.yaml diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py index aa0b1b096b9c..afc935c1a931 100644 --- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -49,10 +49,15 @@ def test_lm_eval_correctness(): results = launch_lm_eval(eval_config) # Confirm scores match ground truth. + success = True for task in eval_config["tasks"]: for metric in task["metrics"]: ground_truth = metric["value"] measured_value = results["results"][task["name"]][metric["name"]] print(f'{task["name"]} | {metric["name"]}: ' f'ground_truth={ground_truth} | measured={measured_value}') - assert numpy.isclose(ground_truth, measured_value, rtol=RTOL) + success = success and numpy.isclose( + ground_truth, measured_value, rtol=RTOL) + + # Assert at the end, print all scores even on failure for debugging. + assert success diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 627b2abaabcf..5cdb8a8e8228 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -2,6 +2,7 @@ Run `pytest tests/quantization/test_compressed_tensors.py`. 
""" +from typing import Optional import pytest import torch @@ -14,14 +15,16 @@ QuantizationType) -@pytest.mark.parametrize("model_args", [ - ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor", - QuantizationType.INT, 2560), - ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel", - QuantizationType.INT, 2560), -]) +@pytest.mark.parametrize( + "model_args", + [("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", "tensor", + QuantizationType.INT, 2560, True), + ("nm-testing/tinyllama-oneshot-w8-channel-a8-tensor", "channel", + QuantizationType.INT, 2560, True), + ("nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama", "tensor", + QuantizationType.INT, 2560, False)]) def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): - model_path, strategy, quant_type, shape_0 = model_args + model_path, strategy, quant_type, shape_0, is_symmetric = model_args with vllm_runner(model_path, enforce_eager=True) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] @@ -31,6 +34,18 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args): gate_up_proj = layer.mlp.gate_up_proj down_proj = layer.mlp.down_proj + # assert zp for symmetric and asymmetric cases + def zp_valid(zp: Optional[torch.Tensor]): + if is_symmetric: + return zp is None + + return zp is not None and zp.dtype is torch.int32 + + assert zp_valid(qkv_proj.input_zero_point) + assert zp_valid(o_proj.input_zero_point) + assert zp_valid(gate_up_proj.input_zero_point) + assert zp_valid(down_proj.input_zero_point) + assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod) assert isinstance(gate_up_proj.quant_method, @@ -69,9 +84,12 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner): @pytest.mark.parametrize("model_args", [ ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2", "tensor"), + ("nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym", "tensor"), ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2", "channel"), + ("nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym", + "channel"), ]) -def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner, model_args): +def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args): model_path, strategy = model_args with vllm_runner(model_path, dtype=torch.float16) as llm: model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 @@ -160,4 +178,4 @@ def test_compressed_tensors_kv_cache(vllm_runner): model_path = "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme" with vllm_runner(model_path, kv_cache_dtype="fp8") as llm: output = llm.generate_greedy("Hello world!", max_tokens=20) - assert output \ No newline at end of file + assert output diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 362feeef2e33..abb18d31b5a8 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -138,10 +138,11 @@ def _is_static_tensor_w8a8(self, weight_quant: BaseModel, or weight_quant.strategy == QuantizationStrategy.CHANNEL.value) is_tensor = (weight_strategy and input_quant.strategy == QuantizationStrategy.TENSOR.value) - is_symmetric = 
weight_quant.symmetric and input_quant.symmetric is_static = not weight_quant.dynamic and not input_quant.dynamic - return is_8_bits and is_tensor and is_symmetric and is_static + # Both symmetric and asymmetric input quantization supported. + # Only symmetric weight quantization supported. + return is_8_bits and is_tensor and weight_quant.symmetric and is_static def _is_dynamic_token_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: @@ -151,10 +152,11 @@ def _is_dynamic_token_w8a8(self, weight_quant: BaseModel, or weight_quant.strategy == QuantizationStrategy.CHANNEL.value) is_token = (weight_strategy and input_quant.strategy == QuantizationStrategy.TOKEN.value) - is_symmetric = weight_quant.symmetric and input_quant.symmetric is_dynamic = not weight_quant.dynamic and input_quant.dynamic - return is_8_bits and is_token and is_symmetric and is_dynamic + # Both symmetric and asymmetric input quantization supported. + # Only symmetric weight quantization supported. + return is_8_bits and is_token and weight_quant.symmetric and is_dynamic def _is_fp8_w8a8(self, weight_quant: BaseModel, input_quant: BaseModel) -> bool: @@ -265,12 +267,14 @@ def _get_scheme_from_parts( if self._is_static_tensor_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Int8( strategy=weight_quant.strategy, - is_static_input_scheme=True) + is_static_input_scheme=True, + input_symmetric=input_quant.symmetric) if self._is_dynamic_token_w8a8(weight_quant, input_quant): return CompressedTensorsW8A8Int8( strategy=weight_quant.strategy, - is_static_input_scheme=False) + is_static_input_scheme=False, + input_symmetric=input_quant.symmetric) raise NotImplementedError( "No compressed-tensors compatible scheme was found.") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 078380f15929..245a35c8783a 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -3,6 +3,7 @@ import torch from torch.nn import Parameter +from vllm.logger import init_logger from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( CompressedTensorsScheme) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( @@ -14,12 +15,16 @@ ModelWeightParameter, PerTensorScaleParameter) +logger = init_logger(__name__) + class CompressedTensorsW8A8Int8(CompressedTensorsScheme): - def __init__(self, strategy: str, is_static_input_scheme: bool): + def __init__(self, strategy: str, is_static_input_scheme: bool, + input_symmetric: bool): self.strategy = strategy self.is_static_input_scheme = is_static_input_scheme + self.input_symmetric = input_symmetric @classmethod def get_min_capability(cls) -> int: @@ -46,10 +51,43 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: requires_grad=False) # INPUT SCALE if self.is_static_input_scheme: - layer.input_scale = Parameter(layer.input_scale.max(), - requires_grad=False) + if self.input_symmetric: + layer.input_scale = Parameter(layer.input_scale.max(), + requires_grad=False) + layer.input_zero_point = None + else: + # reconstruct the ranges + int8_traits = torch.iinfo(torch.int8) + azps = layer.input_zero_point.to(dtype=torch.int32) + range_max = (layer.input_scale * + (int8_traits.max - azps)).max() + 
range_min = (layer.input_scale * + (int8_traits.min - azps)).min() + + scale = (range_max - range_min) / (int8_traits.max - + int8_traits.min) + layer.input_scale = Parameter(scale, requires_grad=False) + + # AZP loaded as int8 but used as int32 + azp = (int8_traits.min - + range_min / scale).to(dtype=torch.int32) + layer.input_zero_point = Parameter(azp, requires_grad=False) + else: layer.input_scale = None + layer.input_zero_point = None + + # azp_adj is the AZP adjustment term, used to account for weights. + # It does not depend on scales or azp, so it is the same for + # static and dynamic quantization. + # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md + # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md + if not self.input_symmetric: + layer.azp_adj = layer.weight.sum(dim=0, + keepdim=True, + dtype=torch.int32) + else: + layer.azp_adj = None def create_weights(self, layer: torch.nn.Module, output_partition_sizes: List[int], @@ -90,6 +128,15 @@ def create_weights(self, layer: torch.nn.Module, weight_loader=weight_loader) layer.register_parameter("input_scale", input_scale) + if not self.input_symmetric: + # Note: compressed-tensors stores the zp using the same dtype + # as the weights + # AZP loaded as int8 but used as int32 + input_zero_point = BasevLLMParameter( + data=torch.empty(1, dtype=torch.int8), + weight_loader=weight_loader) + layer.register_parameter("input_zero_point", input_zero_point) + def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: @@ -97,4 +144,6 @@ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, weight=layer.weight, weight_scale=layer.weight_scale, input_scale=layer.input_scale, + input_zero_point=layer.input_zero_point, + azp_adj=layer.azp_adj, bias=bias) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index fb263d121fe5..fb18f2b72389 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -191,13 +191,28 @@ def apply_int8_linear( weight: torch.Tensor, weight_scale: torch.Tensor, input_scale: Optional[torch.Tensor] = None, + input_zero_point: Optional[torch.Tensor] = None, + azp_adj: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None, ): # ops.scaled_int8_quant supports both dynamic and static quant. # * dynamic, layer.input_scale is None and x_scale computed from x. # * static, layer.input_scale is scalar and x_scale is input_scale. 
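To make the static-asymmetric branch above concrete: the loaded per-tensor ``(scale, zero_point)`` values are collapsed into a single int8 range by reconstructing each pair's real-valued minimum and maximum and then re-deriving one scale and one zero point that cover their union. A small numeric sketch of the same arithmetic (the scale and zero-point values below are made up):

.. code-block:: python

    import torch

    int8 = torch.iinfo(torch.int8)
    scales = torch.tensor([0.020, 0.015])             # example input scales
    azps = torch.tensor([-12, 7], dtype=torch.int32)  # example zero points

    # real-valued range each (scale, zero_point) pair can represent
    range_max = (scales * (int8.max - azps)).max()
    range_min = (scales * (int8.min - azps)).min()

    # single scale and zero point covering the union of those ranges
    scale = (range_max - range_min) / (int8.max - int8.min)
    azp = (int8.min - range_min / scale).to(torch.int32)
    print(scale.item(), azp.item())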
- x_q, x_scale, _ = ops.scaled_int8_quant(input, input_scale) - + symmetric = azp_adj is None + x_q, x_scale, x_zp = ops.scaled_int8_quant(input, + input_scale, + input_zero_point, + symmetric=symmetric) + + if x_zp is not None: + return ops.cutlass_scaled_mm_azp(x_q, + weight, + scale_a=x_scale, + scale_b=weight_scale, + out_dtype=input.dtype, + azp_adj=azp_adj, + azp=x_zp, + bias=bias) return ops.cutlass_scaled_mm(x_q, weight, scale_a=x_scale, From c5d55356f9d2b2075ac53cf20453358c1e2b7bde Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 27 Sep 2024 15:12:34 -0400 Subject: [PATCH 022/199] [Bugfix] fix for deepseek w4a16 (#8906) Co-authored-by: mgoin --- .../model_executor/layers/quantization/kernels/marlin.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/kernels/marlin.py b/vllm/model_executor/layers/quantization/kernels/marlin.py index 5b4bba76ee0c..6969583d6d47 100644 --- a/vllm/model_executor/layers/quantization/kernels/marlin.py +++ b/vllm/model_executor/layers/quantization/kernels/marlin.py @@ -38,10 +38,11 @@ def can_implement(cls, "Marlin, supported group sizes are: "\ f"{MARLIN_SUPPORTED_GROUP_SIZES}" - return check_marlin_supports_shape(c.partition_weight_shape[0], - c.partition_weight_shape[1], - c.full_weight_shape[1], - c.group_size) + return check_marlin_supports_shape( + c.partition_weight_shape[1], # out_features + c.partition_weight_shape[0], # in_features + c.full_weight_shape[0], # in_features + c.group_size) # note assumes that # `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0} From c2ec430ab5713d0626c1a7809718ef6c4eebf389 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Fri, 27 Sep 2024 16:32:07 -0400 Subject: [PATCH 023/199] [Core] Multi-Step + Single Step Prefills via Chunked Prefill code path (#8378) Co-authored-by: Varun Sundar Rabindranath --- csrc/prepare_inputs/advance_step.cu | 2 +- .../multi_step/test_correctness_async_llm.py | 9 + tests/multi_step/test_correctness_llm.py | 4 + vllm/attention/backends/flash_attn.py | 32 +++- vllm/attention/backends/flashinfer.py | 20 +- vllm/config.py | 13 +- vllm/core/block/block_table.py | 13 +- vllm/core/block_manager_v1.py | 7 +- vllm/core/block_manager_v2.py | 5 +- vllm/core/embedding_model_block_manager.py | 4 +- vllm/core/interfaces.py | 4 +- vllm/core/scheduler.py | 134 ++++++++++---- vllm/engine/arg_utils.py | 10 +- vllm/engine/async_llm_engine.py | 9 +- vllm/engine/llm_engine.py | 130 +++++++++++-- vllm/engine/output_processor/multi_step.py | 1 + vllm/sequence.py | 46 ++++- vllm/worker/multi_step_model_runner.py | 175 +++++++++++++++--- vllm/worker/multi_step_worker.py | 5 +- 19 files changed, 514 insertions(+), 109 deletions(-) diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index 1f3f4710735e..195eb27dee74 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -52,7 +52,7 @@ __global__ void advance_step_flashattn_kernel( slot_mapping_ptr[cur_query_id] = slot_num; } -inline void verify_tensor(std::string const& name, torch::Tensor& t, +inline void verify_tensor(std::string const& name, torch::Tensor const& t, int64_t const size_0, int64_t const size_1, c10::ScalarType const type) { bool size_0_cond = true; diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index a75a671e57f7..615549f2134a 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ 
b/tests/multi_step/test_correctness_async_llm.py @@ -37,6 +37,7 @@ @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("is_async", [True]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) +@pytest.mark.parametrize("enable_chunked_prefill", [True, False]) @pytest.mark.asyncio async def test_multi_step( example_prompts, @@ -49,6 +50,7 @@ async def test_multi_step( is_async: bool, num_logprobs: Optional[int], attention_backend: str, + enable_chunked_prefill: bool, monkeypatch, ) -> None: """Test vLLM engine with multi-step scheduling in an OpenAI-protocol @@ -74,6 +76,10 @@ async def test_multi_step( num_logprobs: corresponds to the `logprobs` argument to the OpenAI completions endpoint; `None` -> no logprobs """ + if enable_chunked_prefill and \ + (pp_size > 1 or attention_backend != "FLASH_ATTN"): + pytest.skip("Multi-step with Chunked-Prefill only supports" + "PP=1 and FLASH_ATTN backend") override_backend_env_variable(monkeypatch, attention_backend) @@ -93,6 +99,9 @@ async def test_multi_step( if eager_mode: ms_server_args.append("--enforce-eager") + if enable_chunked_prefill: + ms_server_args.append("--enable-chunked-prefill") + distributed_args = [ "--tensor-parallel-size", str(tp_size), diff --git a/tests/multi_step/test_correctness_llm.py b/tests/multi_step/test_correctness_llm.py index c5dc81cc2562..ff413e8e2da3 100644 --- a/tests/multi_step/test_correctness_llm.py +++ b/tests/multi_step/test_correctness_llm.py @@ -16,6 +16,7 @@ @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("tp_size", [1]) +@pytest.mark.parametrize("enable_chunked_prefill", [False, True]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("enforce_eager", [True]) @pytest.mark.parametrize("num_scheduler_steps", NUM_SCHEDULER_STEPS) @@ -28,6 +29,7 @@ def test_multi_step_llm( model: str, dtype: str, tp_size: int, + enable_chunked_prefill: bool, max_tokens: int, enforce_eager: int, num_scheduler_steps: int, @@ -51,6 +53,7 @@ def test_multi_step_llm( model: model under test (same for single- and multi-step engines) dtype: tensor datatype for engine to utilize tp_size: degree of tensor-parallelism + enable_chunked_prefill: chunked-prefill on/off max_tokens: the maximum number of tokens to generate enforce_eager num_scheduler_steps: for multi-step scheduling, GPU-side steps per @@ -73,6 +76,7 @@ def test_multi_step_llm( gpu_memory_utilization=0.7, tensor_parallel_size=tp_size, use_v2_block_manager=True, + enable_chunked_prefill=enable_chunked_prefill, num_scheduler_steps=num_scheduler_steps, ) as vllm_model: vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens) diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 22d07c0a4f68..43ca6c9ff160 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -342,9 +342,13 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: ) return self._cached_decode_metadata - def advance_step(self, model_input: "ModelInputForGPUWithSamplingMetadata", + def advance_step(self, + model_input: "ModelInputForGPUWithSamplingMetadata", sampled_token_ids: Optional[torch.Tensor], - block_size: int, num_seqs: int, num_queries: int): + block_size: int, + num_seqs: int, + num_queries: int, + turn_prefills_into_decodes: bool = False): """ Update metadata in-place to advance one decode step. 
""" @@ -355,6 +359,23 @@ def advance_step(self, model_input: "ModelInputForGPUWithSamplingMetadata", assert num_seqs > num_queries assert self.use_cuda_graph + if turn_prefills_into_decodes: + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes. This update reflects that + # conversion. + assert self.num_decode_tokens + self.num_prefills == num_seqs + self.num_decode_tokens += self.num_prefills + self.num_prefills = 0 + self.num_prefill_tokens = 0 + self.max_prefill_seq_len = 0 + self.max_query_len = 1 + + self.slot_mapping = self.slot_mapping[:num_seqs] + else: + assert self.seq_lens is not None + assert self.max_decode_seq_len == max(self.seq_lens) + assert self.num_prefills == 0 assert self.num_prefill_tokens == 0 assert self.num_decode_tokens == num_seqs @@ -366,7 +387,6 @@ def advance_step(self, model_input: "ModelInputForGPUWithSamplingMetadata", assert self.seq_lens_tensor.shape == (num_seqs, ) assert self.max_query_len == 1 assert self.max_prefill_seq_len == 0 - assert self.max_decode_seq_len == max(self.seq_lens) assert self.query_start_loc is not None assert self.query_start_loc.shape == (num_queries + 1, ) @@ -706,8 +726,10 @@ def forward( num_prefill_tokens = attn_metadata.num_prefill_tokens num_decode_tokens = attn_metadata.num_decode_tokens - assert key.shape[0] == num_prefill_tokens + num_decode_tokens - assert value.shape[0] == num_prefill_tokens + num_decode_tokens + assert key.shape[0] == num_prefill_tokens + num_decode_tokens, \ + f"key : {key.shape} : #prefill tokens {num_prefill_tokens} : #decode tokens {num_decode_tokens}" # noqa + assert value.shape[0] == num_prefill_tokens + num_decode_tokens, \ + f"value : {value.shape} : #prefill toks {num_prefill_tokens} : #decode toks {num_decode_tokens}" # noqa # Query for decode. KV is not needed because it is already cached. decode_query = query[num_prefill_tokens:] diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index 784cff0d9878..a64bf34596f9 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -410,18 +410,22 @@ def decode_metadata(self) -> Optional["FlashInferMetadata"]: return self - def advance_step( - self, - model_input: "ModelInputForGPUWithSamplingMetadata", - sampled_token_ids: Optional[torch.Tensor], - block_size: int, - num_seqs: int, - num_queries: int, - ): + def advance_step(self, + model_input: "ModelInputForGPUWithSamplingMetadata", + sampled_token_ids: Optional[torch.Tensor], + block_size: int, + num_seqs: int, + num_queries: int, + turn_prefills_into_decodes: bool = False): """ Update metadata in-place to advance one decode step. """ + assert not turn_prefills_into_decodes, \ + ("Chunked prefill is not supported with flashinfer yet." + "turn_prefills_into_decodes is a Multi-Step + Chunked-Prefill " + "specific parameter.") + assert num_seqs > 0 assert num_queries > 0 assert model_input.attn_metadata is not None diff --git a/vllm/config.py b/vllm/config.py index 108badf150c8..3139c5a08bfb 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -983,9 +983,16 @@ def __init__(self, policy: str = "fcfs") -> None: if max_num_batched_tokens is None: if enable_chunked_prefill: - # It is the values that have the best balance between ITL - # and TTFT on A100. Note it is not optimized for throughput. 
- max_num_batched_tokens = 512 + if num_scheduler_steps > 1: + # Multi-step Chunked-Prefill doesn't allow prompt-chunking + # for now. Have max_num_batched_tokens set to max_model_len + # so we don't reject sequences on account of a short + # max_num_batched_tokens. + max_num_batched_tokens = max(max_model_len, 2048) + else: + # It is the values that have the best balance between ITL + # and TTFT on A100. Note it is not optimized for throughput. + max_num_batched_tokens = 512 else: # If max_model_len is too short, use 2048 as the default value # for higher throughput. diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py index c002dd1397f9..a9f4bd871dfd 100644 --- a/vllm/core/block/block_table.py +++ b/vllm/core/block/block_table.py @@ -55,9 +55,12 @@ def __init__( self._num_full_slots = self._get_num_token_ids() @staticmethod - def get_num_required_blocks(token_ids: List[int], block_size: int) -> int: + def get_num_required_blocks(token_ids: List[int], + block_size: int, + num_lookahead_slots: int = 0) -> int: """Calculates the minimum number of blocks required to store a given - sequence of token IDs. + sequence of token IDs along with any look-ahead slots that may be + required (like in multi-step + chunked-prefill). This assumes worst-case scenario, where every block requires a new allocation (e.g. ignoring prefix caching). @@ -66,12 +69,14 @@ def get_num_required_blocks(token_ids: List[int], block_size: int) -> int: token_ids (List[int]): The sequence of token IDs to be stored. block_size (int): The maximum number of tokens that can be stored in a single block. + num_lookahead_slots (int): look-ahead slots that the sequence may + require. Returns: int: The minimum number of blocks required to store the given - sequence of token IDs. + sequence of token IDs along with any required look-ahead slots. """ - return cdiv(len(token_ids), block_size) + return cdiv(len(token_ids) + num_lookahead_slots, block_size) def allocate(self, token_ids: List[int], diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py index 24ab9eb66194..a1f96707a6b5 100644 --- a/vllm/core/block_manager_v1.py +++ b/vllm/core/block_manager_v1.py @@ -281,10 +281,15 @@ def __init__( def _get_seq_num_required_blocks(self, seq: Optional[Sequence]) -> int: return 0 if seq is None else seq.n_blocks - def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: + def can_allocate(self, + seq_group: SequenceGroup, + num_lookahead_slots: int = 0) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. + assert (num_lookahead_slots == 0 + ), "lookahead allocation not supported in BlockSpaceManagerV1" + check_no_caching_or_swa_for_blockmgr_encdec(self, seq_group) self_num_required_blocks = self._get_seq_num_required_blocks( diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py index 54818c7e3e9a..bb78b1e1c913 100644 --- a/vllm/core/block_manager_v2.py +++ b/vllm/core/block_manager_v2.py @@ -107,7 +107,9 @@ def __init__( self._last_access_blocks_tracker = LastAccessBlocksTracker( self.block_allocator) - def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: + def can_allocate(self, + seq_group: SequenceGroup, + num_lookahead_slots: int = 0) -> AllocStatus: # FIXME(woosuk): Here we assume that all sequences in the group share # the same prompt. This may not be true for preempted sequences. 
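(Editorial aside, not part of the patches: the lookahead-aware sizing introduced in the block_table.py hunk above is just a ceiling division over the prompt tokens plus any lookahead slots. A minimal self-contained sketch of that calculation, using a local cdiv stand-in for vllm.utils.cdiv and assuming nothing beyond what the hunk shows:)

    def cdiv(a: int, b: int) -> int:
        # ceiling division
        return -(a // -b)

    def get_num_required_blocks(token_ids, block_size, num_lookahead_slots=0):
        # Worst case: every prompt token plus every lookahead slot needs space,
        # rounded up to whole blocks (prefix caching ignored).
        return cdiv(len(token_ids) + num_lookahead_slots, block_size)

    # e.g. 13 prompt tokens + 7 lookahead slots with block_size=16 -> 2 blocks
    assert get_num_required_blocks(list(range(13)), 16, 7) == 2

(This is only an illustration of the formula in the patch; the real code lives in BlockTable.get_num_required_blocks.)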
@@ -117,6 +119,7 @@ def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: num_required_blocks = BlockTable.get_num_required_blocks( seq.get_token_ids(), block_size=self.block_size, + num_lookahead_slots=num_lookahead_slots, ) if seq_group.is_encoder_decoder(): diff --git a/vllm/core/embedding_model_block_manager.py b/vllm/core/embedding_model_block_manager.py index c47d7d8dfb07..476e043ecc52 100644 --- a/vllm/core/embedding_model_block_manager.py +++ b/vllm/core/embedding_model_block_manager.py @@ -21,7 +21,9 @@ def __init__( ) -> None: pass - def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: + def can_allocate(self, + seq_group: SequenceGroup, + num_lookahead_slots: int = 0) -> AllocStatus: # Always return OK for dummy purposes return AllocStatus.OK diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py index 96f8dd851b2f..634671158730 100644 --- a/vllm/core/interfaces.py +++ b/vllm/core/interfaces.py @@ -44,7 +44,9 @@ def get_block_space_manager_class(version: str): raise ValueError(f"Unknown version {version=}") @abstractmethod - def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus: + def can_allocate(self, + seq_group: SequenceGroup, + num_lookahead_slots: int = 0) -> AllocStatus: pass @abstractmethod diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index 873decff37c1..5b7587d15084 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -522,7 +522,7 @@ def _schedule_running( ret.swapped_out.clear() ret.num_lookahead_slots = self._get_num_lookahead_slots( - is_prefill=False) + is_prefill=False, enable_chunking=enable_chunking) ret.decode_seq_groups_list.clear() ret.prefill_seq_groups_list.clear() @@ -561,7 +561,7 @@ def _schedule_running( # NOTE(woosuk): Preemption happens only when there is no available # slot to keep all the sequence groups in the RUNNING state. - while not self._can_append_slots(seq_group): + while not self._can_append_slots(seq_group, enable_chunking): budget.subtract_num_batched_tokens(seq_group.request_id, num_running_tokens) num_running_seqs = seq_group.get_max_num_running_seqs() @@ -611,7 +611,7 @@ def _schedule_running( if not cont_loop: break else: - self._append_slots(seq_group, blocks_to_copy) + self._append_slots(seq_group, blocks_to_copy, enable_chunking) is_prefill = seq_group.is_prefill() scheduled_seq_group: ScheduledSequenceGroup = \ @@ -684,7 +684,8 @@ def _schedule_swapped( # If the sequence group cannot be swapped in, stop. 
is_prefill = seq_group.is_prefill() alloc_status = self.block_manager.can_swap_in( - seq_group, self._get_num_lookahead_slots(is_prefill)) + seq_group, + self._get_num_lookahead_slots(is_prefill, enable_chunking)) if alloc_status == AllocStatus.LATER: break elif alloc_status == AllocStatus.NEVER: @@ -727,7 +728,7 @@ def _schedule_swapped( curr_loras.add(lora_int_id) swapped_queue.popleft() self._swap_in(seq_group, blocks_to_swap_in) - self._append_slots(seq_group, blocks_to_copy) + self._append_slots(seq_group, blocks_to_copy, enable_chunking) is_prefill = seq_group.is_prefill() if is_prefill: prefill_seq_groups.append( @@ -747,12 +748,13 @@ def _schedule_swapped( blocks_to_swap_in=blocks_to_swap_in, blocks_to_copy=blocks_to_copy, num_lookahead_slots=self._get_num_lookahead_slots( - is_prefill=False), + is_prefill=False, enable_chunking=enable_chunking), infeasible_seq_groups=infeasible_seq_groups, ) def _get_prompt_limit(self, seq_group: SequenceGroup) -> int: - if self.scheduler_config.chunked_prefill_enabled: + if self.scheduler_config.chunked_prefill_enabled and \ + not self.scheduler_config.is_multi_step: prompt_limit = self.scheduler_config.max_model_len else: prompt_limit = min(self.scheduler_config.max_model_len, @@ -899,15 +901,21 @@ def _schedule_prefills( waiting_queue.popleft() continue + num_lookahead_slots: int = 0 + if self.scheduler_config.is_multi_step and enable_chunking: + num_lookahead_slots = self._get_num_lookahead_slots( + True, enable_chunking) + # If the sequence group cannot be allocated, stop. - can_allocate = self.block_manager.can_allocate(seq_group) + can_allocate = self.block_manager.can_allocate( + seq_group, num_lookahead_slots=num_lookahead_slots) if can_allocate == AllocStatus.LATER: break elif can_allocate == AllocStatus.NEVER: logger.warning( - "Input prompt (%d tokens) is too long" - " and exceeds the capacity of block_manager", - num_new_tokens) + "Input prompt (%d tokens) + lookahead slots (%d) is " + "too long and exceeds the capacity of block_manager", + num_new_tokens, num_lookahead_slots) for seq in waiting_seqs: seq.status = SequenceStatus.FINISHED_IGNORED ignored_seq_groups.append(seq_group) @@ -939,9 +947,24 @@ def _schedule_prefills( curr_loras.add(lora_int_id) waiting_queue.popleft() self._allocate_and_set_running(seq_group) - seq_group.init_multi_step( - num_scheduler_steps=self._get_num_lookahead_slots( - is_prefill=True) + 1) + + if enable_chunking and self.scheduler_config.is_multi_step: + blocks_to_copy: List[Tuple[int, int]] = [] + # init_multi_step_from_lookahead_slots happens in append_slots + self._append_slots(seq_group, blocks_to_copy, enable_chunking) + # This assert will trip when a copy-on-write happens. This is + # not a concern as the very first sequence-group block + # allocation happens above. Still, we have the assert to + # catch any edge-cases. + assert not blocks_to_copy + else: + seq_group.init_multi_step_from_lookahead_slots( + num_lookahead_slots, + num_scheduler_steps=self.scheduler_config. 
+ num_scheduler_steps, + is_multi_step=self.scheduler_config.is_multi_step, + enable_chunking=enable_chunking) + seq_groups.append( ScheduledSequenceGroup(seq_group=seq_group, token_chunk_size=num_new_tokens)) @@ -956,7 +979,8 @@ def _schedule_prefills( return SchedulerPrefillOutputs( seq_groups=seq_groups, ignored_seq_groups=ignored_seq_groups, - num_lookahead_slots=self._get_num_lookahead_slots(is_prefill=True)) + num_lookahead_slots=self._get_num_lookahead_slots( + is_prefill=True, enable_chunking=enable_chunking)) def _schedule_default(self) -> SchedulerOutputs: """Schedule queued requests. @@ -1153,7 +1177,8 @@ def _schedule(self) -> SchedulerOutputs: else: return self._schedule_default() - def _can_append_slots(self, seq_group: SequenceGroup) -> bool: + def _can_append_slots(self, seq_group: SequenceGroup, + enable_chunking: bool) -> bool: """Determine whether or not we have enough space in the KV cache to continue generation of the sequence group. """ @@ -1164,13 +1189,17 @@ def _can_append_slots(self, seq_group: SequenceGroup) -> bool: self.artificial_preempt_cnt -= 1 return False - # Appending slots only occurs in decoding. - is_prefill = False + is_prefill = seq_group.is_prefill() + num_lookahead_slots = self._get_num_lookahead_slots( + is_prefill, enable_chunking) + + if is_prefill and num_lookahead_slots > 0: + # Appending prefill slots only happens multi-step and + # chunked-prefill are enabled together. + assert self.scheduler_config.is_multi_step and enable_chunking return self.block_manager.can_append_slots( - seq_group=seq_group, - num_lookahead_slots=self._get_num_lookahead_slots(is_prefill), - ) + seq_group=seq_group, num_lookahead_slots=num_lookahead_slots) def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool: no_beam_search = seq_group.sampling_params is None or ( @@ -1186,7 +1215,7 @@ def schedule( # such as self.running, self.swapped, and self.waiting. scheduler_start_time = time.perf_counter() - scheduler_outputs = self._schedule() + scheduler_outputs: SchedulerOutputs = self._schedule() now = time.time() if not self.cache_config.enable_prefix_caching: @@ -1383,11 +1412,10 @@ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None: for seq in seq_group.get_seqs(status=SequenceStatus.WAITING): seq.status = SequenceStatus.RUNNING - def _append_slots( - self, - seq_group: SequenceGroup, - blocks_to_copy: List[Tuple[int, int]], - ) -> None: + def _append_slots(self, + seq_group: SequenceGroup, + blocks_to_copy: List[Tuple[int, int]], + enable_chunking: bool = False) -> None: """Appends new slots to the sequences in the given sequence group. Args: @@ -1398,11 +1426,25 @@ def _append_slots( int is the destination block index. This list is updated with the new source and destination block indices for the appended slots. + enable_chunking (bool): True if chunked prefill is enabled. 
""" - num_lookahead_slots = self._get_num_lookahead_slots(is_prefill=False) - seq_group.init_multi_step(num_scheduler_steps=num_lookahead_slots + 1) - - for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING): + is_prefill: bool = seq_group.is_prefill() + num_lookahead_slots: int = self._get_num_lookahead_slots( + is_prefill, enable_chunking) + + seq_group.init_multi_step_from_lookahead_slots( + num_lookahead_slots, + num_scheduler_steps=self.scheduler_config.num_scheduler_steps, + is_multi_step=self.scheduler_config.is_multi_step, + enable_chunking=enable_chunking) + + seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING + if self.scheduler_config.is_multi_step and enable_chunking: + # In multi-step chunked-prefill any sequence type can have + # slots appended. + seq_status = None + + for seq in seq_group.get_seqs(status=seq_status): cows = self.block_manager.append_slots(seq, num_lookahead_slots) if len(cows) > 0: blocks_to_copy.extend(cows) @@ -1513,16 +1555,32 @@ def _passed_delay(self, now: float) -> bool: passed_delay = True return passed_delay - def _get_num_lookahead_slots(self, is_prefill: bool) -> int: + def _get_num_lookahead_slots(self, is_prefill: bool, + enable_chunking: bool) -> int: """The number of slots to allocate per sequence per step, beyond known token ids. Speculative decoding uses these slots to store KV activations of tokens which may or may not be accepted. Speculative decoding does not yet support prefill, so we do not perform lookahead allocation for prefill. + + When chunking is enabled with multi-step, we allocate lookahead slots + for the prefills for when the prefills turn into decodes in the first + step. """ if is_prefill: - return 0 + if self.scheduler_config.is_multi_step and enable_chunking: + # num_lookahead_slots was introduced in the context of decodes, + # in Speculative Decoding. + # When the num_scheduler_steps is 8, say, then the + # num_lookahead_slots is 7. Meaning, we are doing a 1-step of + # decode anyways and we wish to do 7 more. + # + # "lookaheads" for prefills, is introduced in support for + # Chunked-Prefill in Multi-Step. + return self.scheduler_config.num_lookahead_slots + 1 + else: + return 0 return self.scheduler_config.num_lookahead_slots @@ -1565,6 +1623,16 @@ def _get_num_new_tokens(self, seq_group: SequenceGroup, if remaining_token_budget < num_new_tokens: num_new_tokens = (remaining_token_budget // block_size) * block_size + elif self.scheduler_config.is_multi_step: + if num_new_tokens > self._get_prompt_limit(seq_group): + # If the seq_group is in prompt-stage, pass the + # num_new_tokens as-is so the caller can ignore + # the sequence. 
+ pass + else: + num_new_tokens = 0 \ + if num_new_tokens > remaining_token_budget \ + else num_new_tokens else: num_new_tokens = min(num_new_tokens, remaining_token_budget) return num_new_tokens diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 0d4559e37742..0efb0cbbf8be 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -980,9 +980,13 @@ def create_engine_config(self) -> EngineConfig: if speculative_config is not None: raise ValueError("Speculative decoding is not supported with " "multi-step (--num-scheduler-steps > 1)") - if self.enable_chunked_prefill: - raise ValueError("Chunked prefill is not supported with " - "multi-step (--num-scheduler-steps > 1)") + if self.enable_chunked_prefill and self.enable_prefix_caching: + raise ValueError("Multi-Step is not supported with " + "both Chunked-Prefill and Prefix-Caching " + "enabled together.") + if self.enable_chunked_prefill and self.pipeline_parallel_size > 1: + raise ValueError("Multi-Step Chunked-Prefill is not supported " + "for pipeline-parallel-size > 1") # make sure num_lookahead_slots is set the higher value depending on # if we are using speculative decoding or multi-step diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 54c5af2fe366..3361fdefc960 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -363,11 +363,18 @@ async def step_async( self.cached_scheduler_outputs[ virtual_engine] = SchedulerOutputState() + # is_first_step_output is True only when the num_steps of all + # the sequences are 1. When the num_steps > 1, + # multi_step_model_runner does the first-step output append. + is_first_step_output: bool = False if not seq_group_metadata_list \ + else seq_group_metadata_list[0].state.num_steps == 1 + ctx.append_output(outputs=outputs, seq_group_metadata_list=seq_group_metadata_list, scheduler_outputs=scheduler_outputs, is_async=allow_async_output_proc, - is_last_step=True) + is_last_step=True, + is_first_step_output=is_first_step_output) if outputs and allow_async_output_proc: assert len( diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 487255cb6b59..19f88ac3e7c5 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -90,6 +90,12 @@ class OutputData(NamedTuple): scheduler_outputs: SchedulerOutputs is_async: bool is_last_step: bool + # Indicates if this output is from the first step of the + # multi-step. When multi-step is disabled, this is always + # set to True. + # is_first_step_output is invalid when `outputs` has + # outputs from multiple steps. 
+ is_first_step_output: Optional[bool] skip: List[int] @@ -108,13 +114,15 @@ def __init__(self, multi_step_stream_outputs: bool = False): def append_output(self, outputs: List[SamplerOutput], seq_group_metadata_list: List[SequenceGroupMetadata], scheduler_outputs: SchedulerOutputs, is_async: bool, - is_last_step: bool): + is_last_step: bool, + is_first_step_output: Optional[bool]): self.output_queue.append( OutputData(outputs=outputs, seq_group_metadata_list=seq_group_metadata_list, scheduler_outputs=scheduler_outputs, is_async=is_async, is_last_step=is_last_step, + is_first_step_output=is_first_step_output, skip=[])) @@ -237,9 +245,10 @@ def __init__( "quantization_param_path=%s, device_config=%s, " "decoding_config=%r, observability_config=%r, " "seed=%d, served_model_name=%s, use_v2_block_manager=%s, " - "num_scheduler_steps=%d, multi_step_stream_outputs=%s, " - "enable_prefix_caching=%s, use_async_output_proc=%s, " - "use_cached_outputs=%s, mm_processor_kwargs=%s)", + "num_scheduler_steps=%d, chunked_prefill_enabled=%s, " + "multi_step_stream_outputs=%s, enable_prefix_caching=%s, " + "use_async_output_proc=%s, use_cached_outputs=%s, " + "mm_processor_kwargs=%s)", VLLM_VERSION, model_config.model, speculative_config, @@ -270,6 +279,7 @@ def __init__( model_config.served_model_name, scheduler_config.use_v2_block_manager, scheduler_config.num_scheduler_steps, + scheduler_config.chunked_prefill_enabled, scheduler_config.multi_step_stream_outputs, cache_config.enable_prefix_caching, model_config.use_async_output_proc, @@ -957,8 +967,66 @@ def _process_model_outputs(self, ctx: The virtual engine context to work on request_id: If provided, then only this request is going to be processed - """ + + def update_prefill_num_computed_tokens( + seq_group: SequenceGroup, + seq_group_meta: SequenceGroupMetadata, num_outputs: int, + is_first_step_output: Optional[bool]) -> None: + """ + When multi-step and chunked-prefill are enabled together, the + prefill sequence scheduled for multi-step execution turns into + decodes in the first step itself. This function accounts + for that conversion. + + seq_group: SequenceGroup - A prefill seq_group + seq_group_meta: SequenceGroupMetadata - Metadata of the given + prefill seq_group + num_outputs: int - number of output tokens being processed for the + given seq_group + is_first_step_output: Optional[bool] - + If multi-step is enabled and num_outputs is 1, this value + indicates if this output belongs to the first step in the + multi-step. + If multi-step is enabled and num_outputs > 1, this value + must be None, as num_outputs > 1 indicates that outputs from + all the steps in multi-step are submitted in a single burst. + When multi-step is disabled, this value is always True. + """ + + assert seq_group_meta.is_prompt + + token_chunk_size = seq_group_meta.token_chunk_size + + if num_outputs == 1: + assert is_first_step_output is not None + + if seq_group_meta.state.num_steps == 1: + assert is_first_step_output is True + seq_group.update_num_computed_tokens(token_chunk_size) + return + + # multi-step prefill is only supported when multi-step is + # enabled with chunked prefill + assert self.scheduler_config.is_multi_step and \ + self.scheduler_config.chunked_prefill_enabled + if is_first_step_output is True: + # This sequence is a prompt during the first step only.
+ seq_group.update_num_computed_tokens(token_chunk_size) + return + + assert is_first_step_output is None + + # multi-step prefill is only supported when multi-step is + # enabled with chunked prefill. Outputs from all the steps are + # submitted in a single burst. + assert self.scheduler_config.is_multi_step and \ + self.scheduler_config.chunked_prefill_enabled + assert num_outputs == seq_group_meta.state.num_steps, \ + f"#outputs {len(outputs)} - num steps {seq_group_meta.state.num_steps}" #noqa + # This sequence is a prompt during the first step only. + seq_group.update_num_computed_tokens(token_chunk_size) + now = time.time() if len(ctx.output_queue) == 0: @@ -969,20 +1037,27 @@ def _process_model_outputs(self, # When we process only one request, no pop is required # (since later we will process all of the rest) (outputs, seq_group_metadata_list, scheduler_outputs, is_async, - is_last_step, skip) = ctx.output_queue[0] + is_last_step, is_first_step_output, skip) = ctx.output_queue[0] else: (outputs, seq_group_metadata_list, scheduler_outputs, is_async, - is_last_step, skip) = ctx.output_queue.popleft() + is_last_step, is_first_step_output, + skip) = ctx.output_queue.popleft() # Sanity check assert len(seq_group_metadata_list) == len( scheduler_outputs.scheduled_seq_groups) - # Organize outputs by [step][sequence group] instead of - # [sequence group][step]. - if len(outputs) > 1: + has_multiple_outputs: bool = len(outputs) > 1 + if has_multiple_outputs: + assert self.scheduler_config.is_multi_step or \ + self.speculative_config + # Organize outputs by [step][sequence group] instead of + # [sequence group][step]. outputs_by_sequence_group = create_output_by_sequence_group( outputs, num_seq_groups=len(seq_group_metadata_list)) + # We have outputs for multiple steps submitted in a single burst, + # so invalidate is_first_step_output. + is_first_step_output = None else: outputs_by_sequence_group = outputs @@ -1018,14 +1093,17 @@ def _process_model_outputs(self, finished_before.append(i) continue - if len(outputs) > 1: + if has_multiple_outputs: output = outputs_by_sequence_group[i] else: output = [outputs_by_sequence_group[0][i]] - if not is_async: - seq_group.update_num_computed_tokens( - scheduled_seq_group.token_chunk_size) + if not is_async and seq_group_meta.is_prompt: + # Updates for all decodes happen when we actually append the + # token ids to the seq in process_outputs. + update_prefill_num_computed_tokens(seq_group, seq_group_meta, + len(output), + is_first_step_output) if outputs: for o in outputs: @@ -1159,8 +1237,18 @@ def _advance_to_next_step( if seq_group.is_finished(): continue - seq_group.update_num_computed_tokens( - seq_group_metadata.token_chunk_size) + if seq_group_metadata.is_prompt: + if self.scheduler_config.is_multi_step and \ + self.scheduler_config.chunked_prefill_enabled: + # Prompts are scheduled in multi-step only when + # chunking is enabled. These prompts turn into + # decodes after the very first step. Therefore, + # we skip the update to the num_computed_tokens + # here. 
+ pass + else: + seq_group.update_num_computed_tokens( + seq_group_metadata.token_chunk_size) if seq_group_metadata.do_sample: assert len(sequence_group_outputs.samples) == 1, ( @@ -1172,6 +1260,7 @@ def _advance_to_next_step( assert len(seq_group.seqs) == 1 seq = seq_group.seqs[0] seq.append_token_id(sample.output_token, sample.logprobs) + seq_group.update_num_computed_tokens(1) def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: """Performs one decoding iteration and returns newly generated results. @@ -1324,12 +1413,19 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]: if self.scheduler_config.is_multi_step: self.cached_scheduler_outputs[0] = SchedulerOutputState() + # is_first_step_output is True only when the num_steps of all + # the sequences are 1. When the num_steps > 1, + # multi_step_model_runner does the first-step output append. + is_first_step_output: bool = False if not seq_group_metadata_list \ + else seq_group_metadata_list[0].state.num_steps == 1 + # Add results to the output_queue ctx.append_output(outputs=outputs, seq_group_metadata_list=seq_group_metadata_list, scheduler_outputs=scheduler_outputs, is_async=allow_async_output_proc, - is_last_step=True) + is_last_step=True, + is_first_step_output=is_first_step_output) if outputs and allow_async_output_proc: assert len(outputs) == 1, ( diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 31c2bbc8e712..cd5cfe5485f2 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -170,6 +170,7 @@ def _process_seq_outputs(self, seq: Sequence, token_id=output_token_id, logprobs=output_logprob, ) + seq.data.update_num_computed_tokens(1) self._process_decode_and_stop(seq, sampling_params) diff --git a/vllm/sequence.py b/vllm/sequence.py index 49a198df045b..781bcedde2b5 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -743,10 +743,35 @@ def prompt_adapter_num_virtual_tokens(self) -> int: return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens\ if self.prompt_adapter_request else 0 - def init_multi_step(self, num_scheduler_steps: int) -> None: - self.state.num_steps = num_scheduler_steps + def init_multi_step(self, num_steps: int) -> None: + self.state.num_steps = num_steps self.state.current_step = 0 + def init_multi_step_from_lookahead_slots(self, num_lookahead_slots: int, + num_scheduler_steps: int, + is_multi_step: bool, + enable_chunking: bool) -> None: + + if not is_multi_step: + self.init_multi_step(num_steps=num_scheduler_steps) + return + + # Multi-Step case + is_prefill = self.is_prefill() + + # The asserts below reflect the expectations of the current system. + if is_prefill and enable_chunking: + assert num_lookahead_slots == num_scheduler_steps + self.init_multi_step(num_steps=num_lookahead_slots) + else: + is_decode: bool = not is_prefill + # If it is a prefill, num_lookahead_slots must be 0 + assert num_lookahead_slots == 0 or is_decode + # If it is a decode, num_lookahead_slots + 1 must match + # the scheduler steps. + assert num_lookahead_slots + 1 == num_scheduler_steps or is_prefill + self.init_multi_step(num_steps=num_lookahead_slots + 1) + def get_last_latency(self, now: float) -> Optional[float]: """Sets the last token time for Request level timings.""" # If still in prefill phase, raise Error. 
@@ -1010,6 +1035,20 @@ def prompt_adapter_num_virtual_tokens(self) -> int: return self.prompt_adapter_request.prompt_adapter_num_virtual_tokens \ if self.prompt_adapter_request else 0 + # Multi-Step Chunked-Prefill property + @property + def is_single_step_prompt(self) -> bool: + # do_sample is true, only when the token_chunk_size matches the + # num_uncomputed_tokens of the sequence. This indicates that + # the prompt will finish processing in a single `execute_model` + # step. + return self.is_prompt and self.do_sample + + def get_first_seq_id(self) -> int: + # This is an efficient way of fetching the seq_id when + # we know this SequenceGroup has only one sequence. + return next(iter(self.seq_data)) + def apply_delta(self, sequence_group_metadata_delta: SequenceGroupMetadataDelta): for id, delta in sequence_group_metadata_delta.seq_data_delta.items(): @@ -1022,7 +1061,8 @@ def apply_delta(self, def finish_step(self) -> None: assert self.state is not None - assert self.state.current_step < self.state.num_steps + assert self.state.current_step < self.state.num_steps, \ + f"current step {self.state.current_step}, num_steps {self.state.num_steps}" # noqa self.state.current_step += 1 diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index c7295f872f70..4c57a37c8787 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -14,7 +14,7 @@ get_pythonized_sample_results) from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors, Logprob, SequenceGroupMetadata, SequenceOutput) -from vllm.utils import PyObjectCache +from vllm.utils import PyObjectCache, async_tensor_h2d from vllm.worker.model_runner import (GPUModelRunnerBase, ModelInputForGPUWithSamplingMetadata) from vllm.worker.model_runner_base import ( @@ -30,6 +30,14 @@ logger = init_logger(__name__) MULTI_STEP_ATTENTION_BACKENDS = ["flash-attn", "rocm-flash-attn", "flashinfer"] +MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["flash-attn"] + +def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ + -> List[str]: + if chunked_prefill_enabled: + return MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS + else: + return MULTI_STEP_ATTENTION_BACKENDS def seq_output_builder(): @@ -144,11 +152,13 @@ class StatefulModelInput(BroadcastableModelInput): is_multi_step: bool = True is_last_step: bool = False is_first_multi_step: bool = False + base_output_proc_callback: Optional[Callable] = None # ping-pong data structures for multi-step to wait on the previous step step_cuda_events: List[torch.cuda.Event] = field( default_factory=lambda: [torch.cuda.Event(blocking=True)] * 2) num_seqs: int = -1 num_queries: int = -1 + num_single_step_prefills: int = 0 def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: assert self.frozen_model_input is not None @@ -161,6 +171,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: 'is_first_multi_step': self.is_first_multi_step, 'num_seqs': self.num_seqs, 'num_queries': self.num_queries, + 'num_single_step_prefills': self.num_single_step_prefills, } tensor_dict.update(new_tensor_dict) return tensor_dict @@ -209,6 +220,81 @@ def add_sampler_output(self, sampled_token_ids=sampled_token_ids, pythonized=False)) + def maybe_advance_sampling_metadata(self, device: str, pin_memory: bool): + """ + sampling_metadata.selected_token_indices is constructed for the + first-step in Multi-Step. 
However, when chunked-prefill is enabled with + multi-step, the scheduled prompts are fully processed in the + first-step and are processed as decodes in the rest of the steps. + This function updates the sampling_metadata.selected_token_indices + to account for this conversion. + + Example: + Let 2 prompts and 2 decodes be scheduled together. Let the + num-tokens to process for the 2 prompts be 5 and 8 respectively. + + In that case, sampling_metadata.selected_token_indices will be + [4, 12, 13, 14] as it is constructed for the first-step in + multi-step. + However, the prompts turn into decodes after the first-step + and the num-tokens for the previously-prompt sequences will + be 1 and 1 as they are decodes now. The selected_token_indices + must be updated to [0,1,2,3]. + """ + assert self.current_step == 1 and self.num_single_step_prefills > 0 + if not get_pp_group().is_last_rank: + return + + assert self.frozen_model_input is not None + assert self.frozen_model_input.sampling_metadata is not None + self.frozen_model_input.sampling_metadata.selected_token_indices = \ + async_tensor_h2d(list(range(self.num_queries)), + dtype=torch.long, + target_device=device, + pin_memory=pin_memory) + + def maybe_advance_frozen_model_input(self, device: str, pin_memory: bool): + """ + Advancing the data structures of StatefulModelInput::frozen_model_input + is only required when prefills are scheduled with decodes to run in + multi-step. This advancement/correction is required to account for + the conversion of Prefills to Decodes after the first multi-step. + """ + if self.current_step != 1 or self.num_single_step_prefills == 0: + return + + assert self.frozen_model_input is not None + fmi = self.frozen_model_input + + # Truncate input_tokens + assert fmi.input_tokens is not None + assert fmi.input_tokens.shape[0] >= self.num_seqs + fmi_new_input_tokens: torch.Tensor = fmi.input_tokens[:self.num_seqs] + + # Update frozen_model_input::input_positions. + assert fmi.input_positions is not None + assert fmi.input_positions.shape[0] >= self.num_seqs + fmi_new_input_positions: torch.Tensor = fmi.input_positions[:self. + num_seqs] + + # Assert unsupported + assert fmi.lora_mapping is None + assert fmi.lora_requests is not None + assert len(fmi.lora_requests) == 0 + assert fmi.attn_metadata is not None + assert fmi.prompt_adapter_mapping is None + assert fmi.prompt_adapter_requests is not None + assert len(fmi.prompt_adapter_requests) == 0 + assert fmi.multi_modal_kwargs is not None + assert len(fmi.multi_modal_kwargs) == 0 + + self.frozen_model_input = dataclasses.replace( + self.frozen_model_input, + input_tokens=fmi_new_input_tokens, + input_positions=fmi_new_input_positions) + + self.maybe_advance_sampling_metadata(device, pin_memory) + # MutableModelInputForGPUWithMultiStepMetadata is not subclass of # ModelInputForGPU but it wraps the actual input dataclass and adds multi-step @@ -220,6 +306,19 @@ class MultiStepModelRunner(GPUModelRunnerBase[StatefulModelInput]): def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs): super().__init__(*args, **kwargs) + # Check attention backend support.
+ supported_attention_backends: List[str] = \ + _get_supported_attention_backends( + self.scheduler_config.chunked_prefill_enabled) + if self.attn_backend.get_name() not in supported_attention_backends: + ms_config_str: str = "Multi-Step + Chunked-Prefill" \ + if self.scheduler_config.chunked_prefill_enabled \ + else "Multi-Step" + raise ValueError( + f"{ms_config_str} not supported for attention backend: " + f"{self.attn_backend.get_name()}. Set VLLM_ATTENTION_BACKEND " + f"to a value from {supported_attention_backends}.") + # uses the base model runner to execute the model and wraps it with # multi-step logic self._base_model_runner: GPUModelRunnerBase = base_model_runner @@ -248,14 +347,25 @@ def prepare_model_input( virtual_engine: int = 0, finished_requests_ids: Optional[List[str]] = None ) -> StatefulModelInput: - frozen_model_input = self._base_model_runner.prepare_model_input( - seq_group_metadata_list, virtual_engine, finished_requests_ids) + frozen_model_input: ModelInputForGPUWithSamplingMetadata = \ + self._base_model_runner.prepare_model_input( + seq_group_metadata_list, + virtual_engine, + finished_requests_ids) + + assert frozen_model_input.query_lens is not None + assert frozen_model_input.seq_lens is not None + assert frozen_model_input.attn_metadata is not None + num_queries = len(frozen_model_input.query_lens) + num_seqs = len(frozen_model_input.seq_lens) + num_single_step_prefills = frozen_model_input.attn_metadata.num_prefills model_input = StatefulModelInput( frozen_model_input=frozen_model_input, - num_seqs=len(frozen_model_input.seq_lens), - num_queries=len(frozen_model_input.query_lens), - ) + num_seqs=num_seqs, + num_queries=num_queries, + num_single_step_prefills=num_single_step_prefills) + return model_input def _async_process_outputs(self, model_input: StatefulModelInput, @@ -265,7 +375,7 @@ def _async_process_outputs(self, model_input: StatefulModelInput, output_proc_callback() cont = True - for model_output in model_input.cached_outputs: + for step_num, model_output in enumerate(model_input.cached_outputs): if not model_output.pythonized: model_output.maybe_pythonize(model_input, self._copy_stream, self.pinned_sampled_token_ids) @@ -276,7 +386,8 @@ def _async_process_outputs(self, model_input: StatefulModelInput, seq_group_metadata_list=ctx.seq_group_metadata_list, scheduler_outputs=ctx.scheduler_outputs, is_async=False, - is_last_step=False) + is_last_step=False, + is_first_step_output=step_num == 0) output_proc_callback() else: @@ -292,9 +403,8 @@ def _final_process_outputs(self, model_input: StatefulModelInput, has_async_callback = output_proc_callback is not None outputs = [] - for output_id in range(len(model_input.cached_outputs)): - output = model_input.cached_outputs[output_id] - is_last_step = output_id == len(model_input.cached_outputs) - 1 + for step_num, output in enumerate(model_input.cached_outputs): + is_last_step = step_num == len(model_input.cached_outputs) - 1 # For non-async case: # -- We simply add the outputs @@ -323,7 +433,8 @@ def _final_process_outputs(self, model_input: StatefulModelInput, seq_group_metadata_list, scheduler_outputs=ctx.scheduler_outputs, is_async=False, - is_last_step=False) + is_last_step=False, + is_first_step_output=step_num == 0) else: outputs.append(output.sampler_output) else: @@ -389,18 +500,27 @@ def execute_model( model_input = self._advance_step( model_input, model_input.cached_outputs[-1].sampler_output) - output_proc_callback = None + # frozen_model_input may have been updated + frozen_model_input = 
model_input.frozen_model_input + assert frozen_model_input is not None + + if model_input.base_output_proc_callback is None: + assert frozen_model_input is not None + model_input.base_output_proc_callback = \ + frozen_model_input.async_callback + if frozen_model_input.async_callback is not None: - output_proc_callback = frozen_model_input.async_callback - assert output_proc_callback is not None + assert model_input.base_output_proc_callback is not None async_callback = functools.partial( self._async_process_outputs, model_input=model_input, - output_proc_callback=output_proc_callback) + output_proc_callback=model_input.base_output_proc_callback) - frozen_model_input = dataclasses.replace( # type: ignore + model_input.frozen_model_input = dataclasses.replace( # type: ignore model_input.frozen_model_input, async_callback=async_callback) + # Update the local instance + frozen_model_input = model_input.frozen_model_input assert frozen_model_input is not None # Execute the model @@ -455,8 +575,8 @@ def execute_model( # Pythonize the output and block if needed since it is the last step if model_input.is_last_step: - outputs = self._final_process_outputs(model_input, - output_proc_callback) + outputs = self._final_process_outputs( + model_input, model_input.base_output_proc_callback) self.pythonization_cache.reset() return outputs @@ -484,11 +604,14 @@ def _update_sampling_metadata(self, sampling_metadata, num_seqs, def _advance_step(self, model_input: StatefulModelInput, out: SamplerOutput) -> StatefulModelInput: - if self.attn_backend.get_name() not in MULTI_STEP_ATTENTION_BACKENDS: - raise ValueError( - f"Multi-step not supported for attention backend: " - f"{self.attn_backend.get_name()}. Set VLLM_ATTENTION_BACKEND " - f"to a value from {MULTI_STEP_ATTENTION_BACKENDS}.") + + model_input.maybe_advance_frozen_model_input(self.device, + self.pin_memory) + frozen_model_input = model_input.frozen_model_input + assert frozen_model_input is not None + assert frozen_model_input.input_tokens is not None + assert frozen_model_input.input_tokens.shape[0] == model_input.num_seqs + assert frozen_model_input.attn_metadata is not None sampled_token_ids = model_input.cached_outputs[-1].sampled_token_ids num_seqs = model_input.num_seqs @@ -498,13 +621,15 @@ def _advance_step(self, model_input: StatefulModelInput, attn_metadata = frozen_model_input.attn_metadata assert attn_metadata is not None + turn_prefills_into_decodes: bool = model_input.current_step == 1 and \ + model_input.num_single_step_prefills != 0 attn_metadata.advance_step( frozen_model_input, sampled_token_ids, self.block_size, num_seqs, num_queries, - ) + turn_prefills_into_decodes=turn_prefills_into_decodes) return model_input diff --git a/vllm/worker/multi_step_worker.py b/vllm/worker/multi_step_worker.py index 562285f828cc..bf66f32d7d24 100644 --- a/vllm/worker/multi_step_worker.py +++ b/vllm/worker/multi_step_worker.py @@ -76,8 +76,9 @@ def _get_driver_input_and_broadcast( frozen_model_input = model_input.frozen_model_input assert frozen_model_input is not None assert frozen_model_input.attn_metadata is not None - # clear the cached decode metadata so that it can be recomputed on - # the workers + # clear the cached metadata so that it can be recomputed on + # the workers. 
+ frozen_model_input.attn_metadata._cached_prefill_metadata = None frozen_model_input.attn_metadata._cached_decode_metadata = None model_input.is_first_multi_step = is_first_multi_step From 18e60d7d1394541b48bf48b0a57a546a93607ac2 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 27 Sep 2024 14:27:56 -0700 Subject: [PATCH 024/199] [misc][distributed] add VLLM_SKIP_P2P_CHECK flag (#8911) --- .../distributed/device_communicators/custom_all_reduce.py | 4 ++++ vllm/envs.py | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index d239d645edc1..c95192a5a1bc 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -28,6 +28,10 @@ def _can_p2p(rank: int, world_size: int) -> bool: for i in range(world_size): if i == rank: continue + if envs.VLLM_SKIP_P2P_CHECK: + logger.info( + "Skipping P2P check and trusting the driver's P2P report.") + return torch.cuda.can_device_access_peer(rank, i) if not gpu_p2p_access_check(rank, i): return False return True diff --git a/vllm/envs.py b/vllm/envs.py index 705d858e71a6..7cbffc83a625 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -63,6 +63,7 @@ VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_ALLOW_DEPRECATED_BEAM_SEARCH: bool = False + VLLM_SKIP_P2P_CHECK: bool = False def get_default_cache_root(): @@ -423,6 +424,13 @@ def get_default_config_root(): lambda: (os.environ.get("VLLM_ALLOW_RUNTIME_LORA_UPDATING", "0").strip().lower() in ("1", "true")), + + # By default, vLLM will check the peer-to-peer capability itself, + # in case of broken drivers. See https://github.com/vllm-project/vllm/blob/a9b15c606fea67a072416ea0ea115261a2756058/vllm/distributed/device_communicators/custom_all_reduce_utils.py#L101-L108 for details. # noqa + # If this env var is set to 1, vLLM will skip the peer-to-peer check, + # and trust the driver's peer-to-peer capability report. + "VLLM_SKIP_P2P_CHECK": + lambda: os.getenv("VLLM_SKIP_P2P_CHECK", "0") == "1", } # end-env-vars-definition From bd429f2b75f3622fabaf9c9470ca2e921f6f56ca Mon Sep 17 00:00:00 2001 From: Sebastian Schoennenbeck Date: Sat, 28 Sep 2024 00:07:10 +0200 Subject: [PATCH 025/199] [Core] Priority-based scheduling in async engine (#8850) --- vllm/engine/async_llm_engine.py | 25 +++++++++++++++++++++++-- vllm/engine/llm_engine.py | 2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 3361fdefc960..7778732dd8be 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -420,6 +420,7 @@ async def add_request_async( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, ) -> None: ... @@ -433,6 +434,7 @@ async def add_request_async( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, ) -> None: ... 
@@ -449,6 +451,7 @@ async def add_request_async( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, *, inputs: Optional[PromptType] = None, # DEPRECATED ) -> None: @@ -460,6 +463,9 @@ async def add_request_async( if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") + if priority != 0 and not self.scheduler_config.policy == "priority": + raise ValueError(f"Got priority {priority} but " + "Priority scheduling is not enabled.") if arrival_time is None: arrival_time = time.time() @@ -479,6 +485,7 @@ async def add_request_async( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, trace_headers=trace_headers, + priority=priority, ) async def check_health_async(self) -> None: @@ -829,6 +836,7 @@ def add_request( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, ) -> Coroutine[None, None, AsyncGenerator[Union[ RequestOutput, EmbeddingRequestOutput], None]]: ... @@ -843,6 +851,7 @@ def add_request( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, ) -> Coroutine[None, None, AsyncGenerator[Union[ RequestOutput, EmbeddingRequestOutput], None]]: ... @@ -860,6 +869,7 @@ async def add_request( lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, *, inputs: Optional[PromptType] = None, # DEPRECATED ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: @@ -877,6 +887,11 @@ async def add_request( "error that caused the background loop to stop " "(AsyncEngineDeadError).") + if (priority != 0 + and not self.engine.scheduler_config.policy == "priority"): + raise ValueError(f"Got priority {priority} but " + "Priority scheduling is not enabled.") + stream = self._request_tracker.add_request( request_id, verbose=self.log_requests, @@ -885,7 +900,9 @@ async def add_request( arrival_time=arrival_time or time.time(), lora_request=lora_request, trace_headers=trace_headers, - prompt_adapter_request=prompt_adapter_request) + prompt_adapter_request=prompt_adapter_request, + priority=priority, + ) return stream.generator() @@ -896,7 +913,8 @@ async def generate( request_id: str, lora_request: Optional[LoRARequest] = None, trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, ) -> AsyncGenerator[RequestOutput, None]: """Generate outputs for a request. @@ -913,6 +931,8 @@ async def generate( trace_headers: OpenTelemetry trace headers. prompt_adapter_request: Prompt Adapter request to use for generation, if any. + priority: The priority of the request. + Only applicable with priority scheduling. 
Yields: The output `RequestOutput` objects from the LLMEngine @@ -968,6 +988,7 @@ async def generate( lora_request=lora_request, trace_headers=trace_headers, prompt_adapter_request=prompt_adapter_request, + priority=priority, ): yield LLMEngine.validate_output(output, RequestOutput) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 19f88ac3e7c5..e3cd822f648f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -796,7 +796,7 @@ def add_request( raise ValueError(f"Got lora_request {lora_request} but LoRA is " "not enabled!") - if priority > 0 and not self.scheduler_config.policy == "priority": + if priority != 0 and not self.scheduler_config.policy == "priority": raise ValueError(f"Got priority {priority} but " "Priority scheduling is not enabled.") From d86f6b2afb006ea4b4b14a49a58f64bf3b952de6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 27 Sep 2024 22:10:44 -0700 Subject: [PATCH 026/199] [misc] fix wheel name (#8919) --- .buildkite/release-pipeline.yaml | 5 +++-- docs/source/getting_started/installation.rst | 20 ++++++++++++-------- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 416fe344a36e..e72138e29dd6 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -8,8 +8,9 @@ steps: - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" # rename the files to change linux -> manylinux1 - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done" - - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/" - - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/" + - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" env: DOCKER_BUILDKIT: "1" diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index bdde3e933b18..622983e494b9 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -48,15 +48,20 @@ You can install vLLM using pip: .. note:: - vLLM also publishes a subset of wheels (Python 3.10, 3.11 with CUDA 12) for every commit since v0.5.3. You can download them with the following command: + vLLM also publishes wheels for Linux running on x86 platform with cuda 12 for every commit since v0.5.3. You can download and install them with the following command: .. code-block:: console - $ export VLLM_VERSION=0.6.1.post1 # vLLM's main branch version is currently set to latest released tag - $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl - $ # You can also access a specific commit - $ # export VLLM_COMMIT=... 
- $ # pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl + $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch + $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + + You can also just download the latest wheel by running: + + .. code-block:: console + + $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl + + Note that the wheels are built with Python 3.8 abi (see `PEP 425 `_ for more details about abi), so they are compatible with Python 3.8 and later. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual version of wheels is contained in the wheel metadata. Build from source (without compilation) --------------------------------------- @@ -67,8 +72,7 @@ The first step is to follow the previous instructions to install the latest vLLM .. code-block:: console - $ export VLLM_VERSION=0.6.1.post1 - $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl + $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl After verifying that the installation is successful, we have a script for you to copy and link directories, so that you can edit the Python code directly: From 260024a3749fb6856625dfee28560a98a92dd339 Mon Sep 17 00:00:00 2001 From: Tyler Titsworth Date: Fri, 27 Sep 2024 23:45:50 -0700 Subject: [PATCH 027/199] [Bugfix][Intel] Fix XPU Dockerfile Build (#7824) Signed-off-by: tylertitsworth Co-authored-by: youkaichao --- .buildkite/run-xpu-test.sh | 2 +- .dockerignore | 4 +++- Dockerfile.xpu | 47 ++++++++++++++++++++++++++++++------- requirements-common.txt | 2 +- requirements-xpu.txt | 8 +++++-- setup.py | 2 ++ vllm/platforms/__init__.py | 12 ++++++++++ vllm/platforms/interface.py | 4 ++++ vllm/platforms/xpu.py | 20 ++++++++++++++++ 9 files changed, 87 insertions(+), 14 deletions(-) create mode 100644 vllm/platforms/xpu.py diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index 22a7e76937a7..6ffa66d5ef3d 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -11,4 +11,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py +docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py diff --git a/.dockerignore b/.dockerignore index 79fa088fa809..17ed0d97c88b 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,4 +1,6 @@ -vllm/*.so +/.github/ /.venv /build dist +Dockerfile* +vllm/*.so diff --git a/Dockerfile.xpu b/Dockerfile.xpu index 8471edd16e4b..83db341556ea 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -1,4 +1,4 @@ -FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 +FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] 
https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ @@ -7,20 +7,49 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ chmod 644 /usr/share/keyrings/intel-graphics.gpg -RUN apt-get update -y && \ - apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1 - -COPY ./ /workspace/vllm +RUN apt-get update -y && \ + apt-get install -y --no-install-recommends --fix-missing \ + curl \ + ffmpeg \ + git \ + libsndfile1 \ + libsm6 \ + libxext6 \ + libgl1 \ + lsb-release \ + numactl \ + python3 \ + python3-dev \ + python3-pip \ + # vim \ + wget WORKDIR /workspace/vllm +COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt +COPY requirements-common.txt /workspace/vllm/requirements-common.txt RUN --mount=type=cache,target=/root/.cache/pip \ - pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \ - cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \ - -r requirements-xpu.txt + pip install --no-cache-dir \ + --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \ + -r requirements-xpu.txt + +COPY ./ /workspace/vllm + +ENV VLLM_TARGET_DEVICE=xpu RUN --mount=type=cache,target=/root/.cache/pip \ --mount=type=bind,source=.git,target=.git \ - VLLM_TARGET_DEVICE=xpu python3 setup.py install + python3 setup.py install CMD ["/bin/bash"] + +FROM vllm-base AS vllm-openai + +# install additional dependencies for openai api server +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install accelerate hf_transfer 'modelscope!=1.15.0' + +ENV VLLM_USAGE_SOURCE production-docker-image \ + TRITON_XPU_PROFILE 1 + +ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/requirements-common.txt b/requirements-common.txt index a9596878a0f8..855169aae5fd 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -1,7 +1,7 @@ psutil sentencepiece # Required for LLaMA tokenizer. numpy < 2.0.0 -requests +requests >= 2.26.0 tqdm py-cpuinfo transformers >= 4.45.0 # Required for Llama 3.2. diff --git a/requirements-xpu.txt b/requirements-xpu.txt index 9b21845e084d..ce83a178c618 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -1,9 +1,13 @@ # Common dependencies -r requirements-common.txt -setuptools < 70.0.0 # IPEX's torch have some dependency. to be removed. 
- ray >= 2.9 +cmake>=3.26 +ninja +packaging +setuptools-scm>=8 +wheel +jinja2 # Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ torch == 2.3.1+cxx11.abi intel-extension-for-pytorch == 2.3.110+xpu diff --git a/setup.py b/setup.py index 8ef759f5245f..26ed33f89745 100644 --- a/setup.py +++ b/setup.py @@ -415,6 +415,8 @@ def _read_requirements(filename: str) -> List[str]: for line in requirements: if line.startswith("-r "): resolved_requirements += _read_requirements(line.split()[1]) + elif line.startswith("--"): + continue else: resolved_requirements.append(line) return resolved_requirements diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py index a483614d067e..c648862b2d75 100644 --- a/vllm/platforms/__init__.py +++ b/vllm/platforms/__init__.py @@ -42,6 +42,15 @@ except Exception: pass +is_xpu = False + +try: + import torch + if hasattr(torch, 'xpu') and torch.xpu.is_available(): + is_xpu = True +except Exception: + pass + is_cpu = False try: from importlib.metadata import version @@ -60,6 +69,9 @@ elif is_rocm: from .rocm import RocmPlatform current_platform = RocmPlatform() +elif is_xpu: + from .xpu import XPUPlatform + current_platform = XPUPlatform() elif is_cpu: from .cpu import CpuPlatform current_platform = CpuPlatform() diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 360590d7d5eb..7d3de706d14f 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -8,6 +8,7 @@ class PlatformEnum(enum.Enum): CUDA = enum.auto() ROCM = enum.auto() TPU = enum.auto() + XPU = enum.auto() CPU = enum.auto() UNSPECIFIED = enum.auto() @@ -41,6 +42,9 @@ def is_rocm(self) -> bool: def is_tpu(self) -> bool: return self._enum == PlatformEnum.TPU + def is_xpu(self) -> bool: + return self._enum == PlatformEnum.XPU + def is_cpu(self) -> bool: return self._enum == PlatformEnum.CPU diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py new file mode 100644 index 000000000000..e0f98d745b5e --- /dev/null +++ b/vllm/platforms/xpu.py @@ -0,0 +1,20 @@ +import torch + +from .interface import DeviceCapability, Platform, PlatformEnum + + +class XPUPlatform(Platform): + _enum = PlatformEnum.XPU + + @staticmethod + def get_device_capability(device_id: int = 0) -> DeviceCapability: + return DeviceCapability(major=int( + torch.xpu.get_device_capability(device_id)['version'].split('.') + [0]), + minor=int( + torch.xpu.get_device_capability(device_id) + ['version'].split('.')[1])) + + @staticmethod + def get_device_name(device_id: int = 0) -> str: + return torch.xpu.get_device_name(device_id) From b0298aa8cc4a54bde659e57271778630785abc9b Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 28 Sep 2024 16:11:25 +0800 Subject: [PATCH 028/199] [Misc] Remove vLLM patch of `BaichuanTokenizer` (#8921) --- vllm/transformers_utils/tokenizer.py | 16 +- .../transformers_utils/tokenizers/__init__.py | 5 +- .../transformers_utils/tokenizers/baichuan.py | 255 ------------------ 3 files changed, 3 insertions(+), 273 deletions(-) delete mode 100644 vllm/transformers_utils/tokenizers/baichuan.py diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index e3b244d06660..85c339df4a76 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -11,8 +11,7 @@ from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.transformers_utils.tokenizers import (BaichuanTokenizer, - MistralTokenizer) 
+from vllm.transformers_utils.tokenizers import MistralTokenizer from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import make_async @@ -139,19 +138,6 @@ def get_tokenizer( raise RuntimeError(err_msg) from e else: raise e - except AttributeError as e: - if "BaichuanTokenizer" in str(e): - # This is for the error "'BaichuanTokenizer' object has no - # attribute 'sp_model'". - tokenizer = BaichuanTokenizer.from_pretrained( - tokenizer_name, - *args, - trust_remote_code=trust_remote_code, - revision=revision, - **kwargs, - ) - else: - raise e # NOTE: We can remove this after https://github.com/THUDM/ChatGLM3/issues/1324 if type(tokenizer).__name__ in ("ChatGLMTokenizer", diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py index 9433f2d48f6f..5f437d414e18 100644 --- a/vllm/transformers_utils/tokenizers/__init__.py +++ b/vllm/transformers_utils/tokenizers/__init__.py @@ -1,4 +1,3 @@ -from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer -from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer +from .mistral import MistralTokenizer -__all__ = ["BaichuanTokenizer", "MistralTokenizer"] +__all__ = ["MistralTokenizer"] diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py deleted file mode 100644 index 76daabc41e0a..000000000000 --- a/vllm/transformers_utils/tokenizers/baichuan.py +++ /dev/null @@ -1,255 +0,0 @@ -# Adapted from -# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py -# This includes a fix suggested in -# https://github.com/vllm-project/vllm/issues/1403#issuecomment-1767503058 -# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved. - -import os -from shutil import copyfile -from typing import Any, Dict, List, Optional, Tuple - -import sentencepiece as spm -from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer -from transformers.utils import logging - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"} - -PRETRAINED_VOCAB_FILES_MAP = { # type: ignore - "vocab_file": {}, - "tokenizer_file": {}, -} -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {} # type: ignore - - -class BaichuanTokenizer(PreTrainedTokenizer): - """ - Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding. - - Args: - vocab_file (`str`): - Path to the vocabulary file. 
- """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids", "attention_mask"] - - def __init__( - self, - vocab_file, - unk_token="", - bos_token="", - eos_token="", - pad_token=None, - sp_model_kwargs: Optional[Dict[str, Any]] = None, - add_bos_token=True, - add_eos_token=False, - clean_up_tokenization_spaces=False, - **kwargs, - ): - self.sp_model_kwargs = ({} if sp_model_kwargs is None else - sp_model_kwargs) - bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False) - if isinstance(bos_token, str) else bos_token) - eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False) - if isinstance(eos_token, str) else eos_token) - unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False) - if isinstance(unk_token, str) else unk_token) - pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False) - if isinstance(pad_token, str) else pad_token) - self.vocab_file = vocab_file - self.add_bos_token = add_bos_token - self.add_eos_token = add_eos_token - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) - super().__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, - pad_token=pad_token, - add_bos_token=add_bos_token, - add_eos_token=add_eos_token, - sp_model_kwargs=self.sp_model_kwargs, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - **kwargs, - ) - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.vocab_file) - - @property - def vocab_size(self): - """Returns vocab size""" - return self.sp_model.get_piece_size() - - def get_vocab(self): - """Returns vocab as a dict""" - vocab = { - self.convert_ids_to_tokens(i): i - for i in range(self.vocab_size) - } - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text): - """Returns a tokenized string.""" - return self.sp_model.encode(text, out_type=str) - - def _convert_token_to_id(self, token): - """Converts a token (str) in an id using the vocab.""" - return self.sp_model.piece_to_id(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - token = self.sp_model.IdToPiece(index) - return token - - def convert_tokens_to_string(self, tokens: List[str]): - """Converts a sequence of tokens (string) in a single string.""" - current_sub_tokens: List[str] = [] - out_string = "" - prev_is_special = False - for i, token in enumerate(tokens): - # make sure that special tokens are not decoded using - # sentencepiece model - if token in self.all_special_tokens: - if not prev_is_special and i != 0: - out_string += " " - out_string += self.sp_model.decode(current_sub_tokens) + token - prev_is_special = True - current_sub_tokens = [] - else: - current_sub_tokens.append(token) - prev_is_special = False - out_string += self.sp_model.decode(current_sub_tokens) - return out_string - - def save_vocabulary(self, - save_directory, - filename_prefix: Optional[str] = None) -> Tuple[str]: - """ - Save the vocabulary and special tokens file to a directory. - - Args: - save_directory (`str`): - The directory in which to save the vocabulary. - - Returns: - `Tuple(str)`: Paths to the files saved. 
- """ - if not os.path.isdir(save_directory): - raise ValueError(f"Vocabulary path ({save_directory}) " - "should be a directory") - - out_vocab_file = os.path.join( - save_directory, - (filename_prefix + "-" if filename_prefix else "") + - VOCAB_FILES_NAMES["vocab_file"], - ) - - if os.path.abspath(self.vocab_file) != os.path.abspath( - out_vocab_file) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, out_vocab_file) - elif not os.path.isfile(self.vocab_file): - with open(out_vocab_file, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - return (out_vocab_file, ) - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - bos_token_id = [self.bos_token_id] if self.add_bos_token else [] - eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - - output = bos_token_id + token_ids_0 + eos_token_id - - if token_ids_1 is not None: - output = output + bos_token_id + token_ids_1 + eos_token_id - - return output - - def get_special_tokens_mask( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False, - ) -> List[int]: - """ - Retrieve sequence ids from a token list that has no special tokens - added. This method is called when adding - special tokens using the tokenizer `prepare_for_model` method. - - Args: - token_ids_0 (`List[int]`): - List of IDs. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - already_has_special_tokens (`bool`, *optional*, defaults to - `False`): - Whether or not the token list is already formatted with - special tokens for the model. - - Returns: - `List[int]`: A list of integers in the range [0, 1]: - 1 for a special token, 0 for a sequence token. - """ - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, - token_ids_1=token_ids_1, - already_has_special_tokens=True, - ) - - bos_token_id = [1] if self.add_bos_token else [] - eos_token_id = [1] if self.add_eos_token else [] - - if token_ids_1 is None: - return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + - bos_token_id + ([0] * len(token_ids_1)) + eos_token_id) - - def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: - """ - Creates a mask from the two sequences passed to be used in a - sequence-pair classification task. An ALBERT - sequence pair mask has the following format: - - ``` - 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence | - ``` - - if token_ids_1 is None, only returns the first portion of the mask (0s). - - Args: - token_ids_0 (`List[int]`): - List of ids. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. - - Returns: - `List[int]`: List of [token type IDs](../glossary#token-type-ids) - according to the given sequence(s). 
- """ - bos_token_id = [self.bos_token_id] if self.add_bos_token else [] - eos_token_id = [self.eos_token_id] if self.add_eos_token else [] - - output = [0] * len(bos_token_id + token_ids_0 + eos_token_id) - - if token_ids_1 is not None: - output += [1] * len(bos_token_id + token_ids_1 + eos_token_id) - - return output From 39d3f8d94fd2691b70ee809e7565402f8a061c6b Mon Sep 17 00:00:00 2001 From: tastelikefeet <58414341+tastelikefeet@users.noreply.github.com> Date: Sat, 28 Sep 2024 23:24:12 +0800 Subject: [PATCH 029/199] [Bugfix] Fix code for downloading models from modelscope (#8443) --- vllm/transformers_utils/__init__.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py index e69de29bb2d1..74ca396276c3 100644 --- a/vllm/transformers_utils/__init__.py +++ b/vllm/transformers_utils/__init__.py @@ -0,0 +1,17 @@ +from vllm.envs import VLLM_USE_MODELSCOPE + +if VLLM_USE_MODELSCOPE: + # Patch here, before each import happens + import modelscope + from packaging import version + + # patch_hub begins from modelscope>=1.18.1 + if version.parse(modelscope.__version__) <= version.parse('1.18.0'): + raise ImportError( + 'Using vLLM with ModelScope needs modelscope>=1.18.1, please ' + 'install by `pip install modelscope>=1.18.1`') + + from modelscope.utils.hf_util import patch_hub + + # Patch hub to download models from modelscope to speed up. + patch_hub() From 19d02ff93812fb6a28f0f1a0a0f9233e9388d616 Mon Sep 17 00:00:00 2001 From: Varun Sundar Rabindranath Date: Sat, 28 Sep 2024 11:52:46 -0400 Subject: [PATCH 030/199] [Bugfix] Fix PP for Multi-Step (#8887) --- .../multi_step/test_correctness_async_llm.py | 82 +++++++++++++++++++ tests/utils.py | 38 ++++++--- vllm/engine/output_processor/multi_step.py | 3 + vllm/worker/model_runner.py | 10 ++- vllm/worker/multi_step_model_runner.py | 12 ++- 5 files changed, 130 insertions(+), 15 deletions(-) diff --git a/tests/multi_step/test_correctness_async_llm.py b/tests/multi_step/test_correctness_async_llm.py index 615549f2134a..000c923ef3e6 100644 --- a/tests/multi_step/test_correctness_async_llm.py +++ b/tests/multi_step/test_correctness_async_llm.py @@ -142,3 +142,85 @@ async def test_multi_step( name_0="hf", name_1="vllm", ) + + +@pytest.mark.parametrize(("tp_size, pp_size"), [ + (1, 2), +]) +@pytest.mark.asyncio +async def test_multi_step_pp_smoke( + tp_size: int, + pp_size: int, + monkeypatch, +) -> None: + """ + Smoke test for the vLLM engine with multi-step scheduling in an + OpenAI-protocol client/server environment. + + This tests compares the outputs between multi-step scheduling and + single-step scheduling. Notably, this test lets the engines generate + more tokens (default is 5) and test for an exact match over all the + tokens. + + Args: + tp_size: degree of tensor-parallelism + pp_size: degree of pipeline-parallelism + eager_mode + """ + + model = "JackFram/llama-160m" + num_scheduler_steps = 8 + attention_backend = "FLASH_ATTN" + max_num_seqs = 3 + + override_backend_env_variable(monkeypatch, attention_backend) + + # Prompt from the ShareGPT dataset + prompts = [ + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + "in the jtbd context whats a push?", # codespell:ignore + ] + # Use varying max_tokens to introduce scheduling randomness. 
+ max_tokens = [10 * i for i in range(1, len(prompts) + 1)] + assert len(prompts) == len(max_tokens) + + test_args = [ + "--tensor-parallel-size", + str(tp_size), "--pipeline-parallel-size", + str(pp_size), "--max-num-seqs", + str(max_num_seqs) + ] + + server_args = DEFAULT_SERVER_ARGS + test_args + ms_server_args = DEFAULT_SERVER_ARGS + \ + ["--num-scheduler-steps", f"{num_scheduler_steps}"] + \ + test_args + + # Spin up client/server & issue completion API requests. + # Default `max_wait_seconds` is 240 but was empirically + # was raised 3x to 720 *just for this test* due to + # observed timeouts in GHA CI + ref_completions = await completions_with_server_args( + prompts=prompts, + model_name=model, + server_cli_args=server_args, + num_logprobs=None, + max_wait_seconds=5 * 240, + max_tokens=max_tokens) + + test_completions = await completions_with_server_args( + prompts=prompts, + model_name=model, + server_cli_args=ms_server_args, + num_logprobs=None, + max_wait_seconds=5 * 240, + max_tokens=max_tokens) + + # Assert multi-step scheduling produces identical tokens + # to single-step scheduling. + ref_generations = get_client_text_generations(ref_completions) + test_generations = get_client_text_generations(test_completions) + + assert ref_generations == test_generations diff --git a/tests/utils.py b/tests/utils.py index 43825e813836..3eff77f396e1 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,3 +1,4 @@ +import asyncio import functools import os import signal @@ -7,7 +8,7 @@ import warnings from contextlib import contextmanager from pathlib import Path -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Union import openai import pytest @@ -476,7 +477,8 @@ async def completions_with_server_args( server_cli_args: List[str], num_logprobs: Optional[int], max_wait_seconds: int = 240, -) -> Completion: + max_tokens: Union[int, list] = 5, +) -> List[Completion]: '''Construct a remote OpenAI server, obtain an async client to the server & invoke the completions API to obtain completions. @@ -487,37 +489,49 @@ async def completions_with_server_args( num_logprobs: Number of logprobs to report (or `None`) max_wait_seconds: timeout interval for bringing up server. Default: 240sec + max_tokens: max_tokens value for each of the given input prompts. + if only one max_token value is given, the same value is used + for all the prompts. Returns: OpenAI Completion instance ''' + if isinstance(max_tokens, int): + max_tokens = [max_tokens] * len(prompts) + + assert len(max_tokens) == len(prompts) + outputs = None max_wait_seconds = 240 * 3 # 240 is default with RemoteOpenAIServer(model_name, server_cli_args, max_wait_seconds=max_wait_seconds) as server: client = server.get_async_client() - outputs = await client.completions.create(model=model_name, - prompt=prompts, - temperature=0, - stream=False, - max_tokens=5, - logprobs=num_logprobs) + outputs = [ client.completions.create(model=model_name, + prompt=[p], + temperature=0, + stream=False, + max_tokens=max_tok, + logprobs=num_logprobs) \ + for p, max_tok in zip(prompts, max_tokens) ] + outputs = await asyncio.gather(*outputs) + assert outputs is not None, "Completion API call failed." return outputs -def get_client_text_generations(completions: Completion) -> List[str]: +def get_client_text_generations(completions: List[Completion]) -> List[str]: '''Extract generated tokens from the output of a request made to an Open-AI-protocol completions endpoint. 
''' - return [x.text for x in completions.choices] + assert all([len(x.choices) == 1 for x in completions]) + return [x.choices[0].text for x in completions] def get_client_text_logprob_generations( - completions: Completion) -> List[TextTextLogprobs]: + completions: List[Completion]) -> List[TextTextLogprobs]: '''Operates on the output of a request made to an Open-AI-protocol completions endpoint; obtains top-rank logprobs for each token in each :class:`SequenceGroup` @@ -526,4 +540,4 @@ def get_client_text_logprob_generations( text = ''.join(text_generations) return [(text_generations, text, (None if x.logprobs is None else x.logprobs.top_logprobs)) - for x in completions.choices] + for completion in completions for x in completion.choices] diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index cd5cfe5485f2..6dac3619580b 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -97,6 +97,9 @@ def process_outputs(self, assert len(seqs) == 1, ( "Beam search not supported in multi-step decoding.") seq = seqs[0] + seq_id = seq.seq_id + assert all( + [seq_id == output.samples[0].parent_seq_id for output in outputs]) if is_async: # Async case: We process tokens one by one. Here, we know the token diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 8c2e6c2d721b..4ac67a5fade8 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1007,8 +1007,16 @@ def __init__( # Used to cache python objects self.inter_data_cache: Dict[int, PyObjectCache] = {} + + # Using the PythonizationCache in Pipeline-Parallel clobbers the + # SequenceGroupToSample object. In Pipeline-Parallel, we have + # more than 1 Scheduler, resulting in a potential back-to-back + # prepare_model_inputs() call. This clobbers the cached + # SequenceGroupToSample objects, as we reset the cache during + # every prepare_model_inputs() call. self.sampling_metadata_cache: SamplingMetadataCache = \ - SamplingMetadataCache() + SamplingMetadataCache() \ + if self.parallel_config.pipeline_parallel_size == 1 else None def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py index 4c57a37c8787..12aa473525c1 100644 --- a/vllm/worker/multi_step_model_runner.py +++ b/vllm/worker/multi_step_model_runner.py @@ -326,7 +326,14 @@ def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs): self.is_multi_step = self.scheduler_config.is_multi_step self.pinned_sampled_token_ids: Optional[torch.Tensor] = None - self.pythonization_cache = PythonizationCache() + # Using the PythonizationCache in Pipeline-Parallel clobbers the + # SequenceOutput and CompletionSequenceGroupOutput object. + # When cache-reset happens at the last step of a multi-step + # execution, there may be other on-going single-step/multi-step + # executions. The current caching implementation does not check + # for this. 
+ self.pythonization_cache = PythonizationCache() \ + if self.parallel_config.pipeline_parallel_size == 1 else None @functools.cached_property def _copy_stream(self): @@ -577,7 +584,8 @@ def execute_model( if model_input.is_last_step: outputs = self._final_process_outputs( model_input, model_input.base_output_proc_callback) - self.pythonization_cache.reset() + if self.pythonization_cache: + self.pythonization_cache.reset() return outputs # should be [SamplerOutput] From e1a3f5e831a467b2867a66e0e56ac0f70ed44394 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 29 Sep 2024 00:54:35 +0800 Subject: [PATCH 031/199] [CI/Build] Update models tests & examples (#8874) Co-authored-by: Roger Wang --- .buildkite/test-pipeline.yaml | 51 +++--- examples/offline_inference_vision_language.py | 28 ++-- ...e_inference_vision_language_multi_image.py | 13 +- tests/conftest.py | 84 +++++----- .../vision_language/test_llava_onevision.py | 29 ++-- .../vision_language/test_minicpmv.py | 2 +- .../vision_language/test_phi3v.py | 2 +- .../decoder_only/vision_language/test_qwen.py | 2 +- .../vision_language/test_broadcast.py | 35 ++++ .../vision_language/test_mllama.py | 153 ++++++++---------- tests/models/utils.py | 9 +- vllm/inputs/registry.py | 12 +- .../layers/quantization/utils/w8a8_utils.py | 3 +- 13 files changed, 239 insertions(+), 184 deletions(-) create mode 100644 tests/models/encoder_decoder/vision_language/test_broadcast.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index d9dcacf5d991..bb42b5f29a72 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -9,6 +9,7 @@ # label(str): the name of the test. emoji allowed. # fast_check(bool): whether to run this on each commit on fastcheck pipeline. # fast_check_only(bool): run this test on fastcheck pipeline only +# optional(bool): never run this test by default (i.e. need to unblock manually) # command(str): the single command to run for tests. incompatible with commands. # commands(list): the list of commands to run for test. incompatbile with command. # mirror_hardwares(list): the list of hardwares to run the test on as well. 
currently only supports [amd] @@ -39,7 +40,7 @@ steps: # Check API reference (if it fails, you may have missing mock imports) - grep \"sig sig-object py\" build/html/dev/sampling_params.html -- label: Async Engine, Inputs, Utils, Worker Test # 15min +- label: Async Engine, Inputs, Utils, Worker Test # 24min fast_check: true source_file_dependencies: - vllm/ @@ -81,7 +82,7 @@ steps: commands: - pytest -v -s core -- label: Entrypoints Test # 20min +- label: Entrypoints Test # 40min working_dir: "/vllm-workspace/tests" fast_check: true mirror_hardwares: [amd] @@ -151,7 +152,7 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization -- label: Examples Test # 12min +- label: Examples Test # 15min working_dir: "/vllm-workspace/examples" #mirror_hardwares: [amd] source_file_dependencies: @@ -169,7 +170,7 @@ steps: - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference_encoder_decoder.py -- label: Prefix Caching Test # 7min +- label: Prefix Caching Test # 9min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -177,7 +178,7 @@ steps: commands: - pytest -v -s prefix_caching -- label: Samplers Test # 18min +- label: Samplers Test # 36min source_file_dependencies: - vllm/model_executor/layers - vllm/sampling_metadata.py @@ -193,7 +194,7 @@ steps: - tests/test_logits_processor command: pytest -v -s test_logits_processor.py -- label: Speculative decoding tests # 22min +- label: Speculative decoding tests # 30min source_file_dependencies: - vllm/spec_decode - tests/spec_decode @@ -203,7 +204,7 @@ steps: - pytest -v -s spec_decode/e2e/test_multistep_correctness.py - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py -- label: LoRA Test %N # 30min each +- label: LoRA Test %N # 15min each mirror_hardwares: [amd] source_file_dependencies: - vllm/lora @@ -211,7 +212,7 @@ steps: command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py parallelism: 4 -- label: "PyTorch Fullgraph Smoke Test" +- label: "PyTorch Fullgraph Smoke Test" # 9min fast_check: true source_file_dependencies: - vllm/ @@ -219,14 +220,14 @@ steps: commands: - pytest -v -s compile/test_full_graph_smoke.py -- label: "PyTorch Fullgraph Test" +- label: "PyTorch Fullgraph Test" # 18min source_file_dependencies: - vllm/ - tests/compile commands: - pytest -v -s compile/test_full_graph.py -- label: Kernels Test %N # 30min each +- label: Kernels Test %N # 1h each mirror_hardwares: [amd] source_file_dependencies: - csrc/ @@ -256,7 +257,7 @@ steps: - pip install aiohttp - bash run-benchmarks.sh -- label: Quantization Test # 15min +- label: Quantization Test # 33min source_file_dependencies: - csrc/ - vllm/model_executor/layers/quantization @@ -300,7 +301,7 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/*.py --ignore=models/test_oot_registration.py -- label: Decoder-only Language Models Test # 1h3min +- label: Decoder-only Language Models Test # 1h36min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -308,7 +309,7 @@ steps: commands: - pytest -v -s models/decoder_only/language -- label: Decoder-only Multi-Modal Models Test # 56min +- label: Decoder-only Multi-Modal Models Test # 1h31min #mirror_hardwares: [amd] 
source_file_dependencies: - vllm/ @@ -318,15 +319,25 @@ steps: - pytest -v -s models/decoder_only/audio_language - pytest -v -s models/decoder_only/vision_language -- label: Other Models Test # 5min +- label: Other Models Test # 6min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/embedding/language - tests/models/encoder_decoder/language + - tests/models/encoder_decoder/vision_language commands: - pytest -v -s models/embedding/language - pytest -v -s models/encoder_decoder/language + - pytest -v -s models/encoder_decoder/vision_language + +- label: Custom Models Test + #mirror_hardwares: [amd] + optional: true + commands: + # PR authors can temporarily add commands below to test individual models + # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* ##### 1 GPU test ##### ##### multi gpus test ##### @@ -359,7 +370,7 @@ steps: - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed' -- label: Distributed Tests (2 GPUs) # 28min +- label: Distributed Tests (2 GPUs) # 40min #mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 2 @@ -376,14 +387,16 @@ steps: - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus # Avoid importing model tests that cause CUDA reinitialization error - - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus + - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus + - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus + - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py -- label: Multi-step Tests (4 GPUs) # 21min +- label: Multi-step Tests (4 GPUs) # 36min working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -401,7 +414,7 @@ steps: - pytest -v -s multi_step/test_correctness_async_llm.py - pytest -v -s multi_step/test_correctness_llm.py -- label: Pipeline Parallelism Test # 23min +- label: Pipeline Parallelism Test # 45min working_dir: "/vllm-workspace/tests" num_gpus: 4 source_file_dependencies: @@ -427,7 +440,7 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s -x lora/test_long_context.py -- label: Weight Loading Multiple GPU Test +- label: Weight Loading Multiple GPU Test # 33min working_dir: "/vllm-workspace/tests" num_gpus: 2 source_file_dependencies: diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 6d34621a8a9b..b94ef537d783 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -12,6 +12,10 @@ from vllm.assets.video import VideoAsset from vllm.utils import FlexibleArgumentParser +# NOTE: The default 
`max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. + # LLaVA-1.5 def run_llava(question, modality): @@ -19,7 +23,7 @@ def run_llava(question, modality): prompt = f"USER: \n{question}\nASSISTANT:" - llm = LLM(model="llava-hf/llava-1.5-7b-hf") + llm = LLM(model="llava-hf/llava-1.5-7b-hf", max_model_len=4096) stop_token_ids = None return llm, prompt, stop_token_ids @@ -57,7 +61,7 @@ def run_llava_onevision(question, modality): <|im_start|>assistant\n" llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf", - max_model_len=32768) + max_model_len=16384) stop_token_ids = None return llm, prompt, stop_token_ids @@ -67,7 +71,7 @@ def run_fuyu(question, modality): assert modality == "image" prompt = f"{question}\n" - llm = LLM(model="adept/fuyu-8b") + llm = LLM(model="adept/fuyu-8b", max_model_len=2048, max_num_seqs=2) stop_token_ids = None return llm, prompt, stop_token_ids @@ -99,7 +103,8 @@ def run_phi3v(question, modality): llm = LLM( model="microsoft/Phi-3-vision-128k-instruct", trust_remote_code=True, - max_num_seqs=5, + max_model_len=4096, + max_num_seqs=2, mm_processor_kwargs={"num_crops": 16}, ) stop_token_ids = None @@ -122,7 +127,7 @@ def run_chameleon(question, modality): assert modality == "image" prompt = f"{question}" - llm = LLM(model="facebook/chameleon-7b") + llm = LLM(model="facebook/chameleon-7b", max_model_len=4096) stop_token_ids = None return llm, prompt, stop_token_ids @@ -145,6 +150,8 @@ def run_minicpmv(question, modality): trust_remote_code=True) llm = LLM( model=model_name, + max_model_len=4096, + max_num_seqs=2, trust_remote_code=True, ) # NOTE The stop_token_ids are different for various versions of MiniCPM-V @@ -177,7 +184,7 @@ def run_internvl(question, modality): llm = LLM( model=model_name, trust_remote_code=True, - max_num_seqs=5, + max_model_len=4096, ) tokenizer = AutoTokenizer.from_pretrained(model_name, @@ -215,7 +222,8 @@ def run_qwen_vl(question, modality): llm = LLM( model="Qwen/Qwen-VL", trust_remote_code=True, - max_num_seqs=5, + max_model_len=1024, + max_num_seqs=2, ) prompt = f"{question}Picture 1: \n" @@ -229,8 +237,10 @@ def run_qwen2_vl(question, modality): model_name = "Qwen/Qwen2-VL-7B-Instruct" + # Tested on L40 llm = LLM( model=model_name, + max_model_len=8192, max_num_seqs=5, ) @@ -252,10 +262,10 @@ def run_mllama(question, modality): # max_model_len (131072) for this model may cause OOM. # You may lower either to run this example on lower-end GPUs. - # The configuration below has been confirmed to launch on a - # single H100 GPU. + # The configuration below has been confirmed to launch on a single L40 GPU. llm = LLM( model=model_name, + max_model_len=4096, max_num_seqs=16, enforce_eager=True, ) diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 8c5f1a7b7af0..1e99c02234d0 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -28,12 +28,18 @@ class ModelRequestData(NamedTuple): chat_template: Optional[str] +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. 
+ + def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: model_name = "Qwen/Qwen-VL-Chat" llm = LLM( model=model_name, trust_remote_code=True, - max_num_seqs=5, + max_model_len=1024, + max_num_seqs=2, limit_mm_per_prompt={"image": len(image_urls)}, ) placeholders = "".join(f"Picture {i}: \n" @@ -83,6 +89,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: model="microsoft/Phi-3.5-vision-instruct", trust_remote_code=True, max_model_len=4096, + max_num_seqs=2, limit_mm_per_prompt={"image": len(image_urls)}, mm_processor_kwargs={"num_crops": 4}, ) @@ -106,7 +113,6 @@ def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: llm = LLM( model=model_name, trust_remote_code=True, - max_num_seqs=5, max_model_len=4096, limit_mm_per_prompt={"image": len(image_urls)}, ) @@ -148,10 +154,11 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: model_name = "Qwen/Qwen2-VL-7B-Instruct" + # Tested on L40 llm = LLM( model=model_name, - max_num_seqs=5, max_model_len=32768 if process_vision_info is None else 4096, + max_num_seqs=5, limit_mm_per_prompt={"image": len(image_urls)}, ) diff --git a/tests/conftest.py b/tests/conftest.py index db71d8bc3af1..45dc5e8323ca 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -246,17 +246,14 @@ def video_assets() -> _VideoAssets: class HfRunner: - def wrap_device(self, input: _T) -> _T: - if not is_cpu(): - # Check if the input is already on the GPU - if hasattr(input, 'device') and input.device.type == "cuda": - return input # Already on GPU, no need to move - return input.to("cuda") - else: - # Check if the input is already on the CPU - if hasattr(input, 'device') and input.device.type == "cpu": - return input # Already on CPU, no need to move - return input.to("cpu") + def wrap_device(self, input: _T, device: Optional[str] = None) -> _T: + if device is None: + return self.wrap_device(input, "cpu" if is_cpu() else "cuda") + + if hasattr(input, "device") and input.device.type == device: + return input + + return input.to(device) def __init__( self, @@ -333,7 +330,7 @@ def generate( inputs = self.postprocess_inputs(inputs) output_ids = self.model.generate( - **self.wrap_device(inputs), + **self.wrap_device(inputs, device=self.model.device.type), use_cache=True, **kwargs, ) @@ -406,7 +403,7 @@ def generate_greedy_logprobs( inputs = self.postprocess_inputs(inputs) output = self.model.generate( - **self.wrap_device(inputs), + **self.wrap_device(inputs, device=self.model.device.type), use_cache=True, do_sample=False, max_new_tokens=max_tokens, @@ -414,40 +411,39 @@ def generate_greedy_logprobs( return_dict_in_generate=True, **kwargs, ) - seq_logprobs: List[torch.Tensor] = [] - for hidden_states in output.hidden_states: - last_hidden_states = hidden_states[-1][0] - logits = torch.matmul( - last_hidden_states, - self.model.get_output_embeddings().weight.t(), - ) - if self.model.get_output_embeddings().bias is not None: - logits += self.model.get_output_embeddings( - ).bias.unsqueeze(0) - logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) - seq_logprobs.append(logprobs) + seq_logprobs = self._hidden_states_to_seq_logprobs( + output.hidden_states) all_logprobs.append(seq_logprobs) return all_logprobs - def _hidden_states_to_logprobs( + def _hidden_states_to_seq_logprobs( self, - hidden_states, - num_logprobs, - ) -> Tuple[List[Dict[int, float]], int]: + hidden_states: Tuple[Tuple[torch.Tensor, ...], ...], + ) -> List[torch.Tensor]: + output_embeddings = 
self.model.get_output_embeddings() + seq_logprobs: List[torch.Tensor] = [] - output_len = len(hidden_states) for _, hidden_state in enumerate(hidden_states): last_hidden_states = hidden_state[-1][0] logits = torch.matmul( - last_hidden_states, - self.model.get_output_embeddings().weight.t(), + last_hidden_states.to(output_embeddings.weight.device), + output_embeddings.weight.t(), ) - if getattr(self.model.get_output_embeddings(), "bias", - None) is not None: - logits += self.model.get_output_embeddings().bias.unsqueeze(0) + if getattr(output_embeddings, "bias", None) is not None: + logits += output_embeddings.bias.unsqueeze(0) logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) seq_logprobs.append(logprobs) + return seq_logprobs + + def _hidden_states_to_logprobs( + self, + hidden_states: Tuple[Tuple[torch.Tensor, ...], ...], + num_logprobs: int, + ) -> Tuple[List[Dict[int, float]], int]: + seq_logprobs = self._hidden_states_to_seq_logprobs(hidden_states) + output_len = len(hidden_states) + # convert to dict seq_logprobs_lst: List[Dict[int, float]] = [] for tok_idx, tok_logprobs in enumerate(seq_logprobs): @@ -500,7 +496,7 @@ def generate_greedy_logprobs_limit( inputs = self.postprocess_inputs(inputs) output = self.model.generate( - **self.wrap_device(inputs), + **self.wrap_device(inputs, device=self.model.device.type), use_cache=True, do_sample=False, max_new_tokens=max_tokens, @@ -543,12 +539,20 @@ def generate_encoder_decoder_greedy_logprobs_limit( for (encoder_prompt, decoder_prompt) in to_enc_dec_tuple_list(encoder_decoder_prompts): + encoder_input_ids = self.wrap_device( - self.tokenizer(encoder_prompt, return_tensors="pt").input_ids) - decoder_input_ids = ( - None if decoder_prompt is None else self.wrap_device( + self.tokenizer(encoder_prompt, return_tensors="pt").input_ids, + device=self.model.device.type, + ) + + if decoder_prompt is None: + decoder_input_ids = None + else: + decoder_input_ids = self.wrap_device( self.tokenizer(decoder_prompt, - return_tensors="pt").input_ids)) + return_tensors="pt").input_ids, + device=self.model.device.type, + ) output = self.model.generate( encoder_input_ids, diff --git a/tests/models/decoder_only/vision_language/test_llava_onevision.py b/tests/models/decoder_only/vision_language/test_llava_onevision.py index 978631feacb8..2c4cd3fb8529 100644 --- a/tests/models/decoder_only/vision_language/test_llava_onevision.py +++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py @@ -16,8 +16,7 @@ # Video test HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ "sample_demo_1": - "<|im_start|>user