From 3d8149a895b87e1bd46ccbd6fe90da54bb7acce4 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Fri, 27 Sep 2024 15:10:23 +0000
Subject: [PATCH 1/2] Directly use compressed-tensors for checkpoint definitions

---
 requirements-common.txt                       |   1 +
 requirements-test.txt                         |   1 -
 tests/quantization/test_compressed_tensors.py |   3 +-
 .../compressed_tensors/compressed_tensors.py  |   7 +-
 .../compressed_tensors_moe.py                 |   4 +-
 .../schemes/compressed_tensors_w8a16_fp8.py   |   3 +-
 .../schemes/compressed_tensors_w8a8_int8.py   |   3 +-
 .../quantization/compressed_tensors/utils.py  | 102 +-----------------
 8 files changed, 13 insertions(+), 111 deletions(-)
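
Note (kept outside the diff, where `git am` ignores it): a quick sanity-check
sketch, assuming only that the pinned compressed-tensors==0.6.0 is installed.
It imports the definitions through the same paths the diffs below switch to,
and asserts the defaults and enum values that the vendored code deleted from
vllm/model_executor/layers/quantization/compressed_tensors/utils.py
hard-coded; if the upstream package diverged from the vendored copy, these
assertions would flag it.

    # Sanity-check sketch (assumption: compressed-tensors==0.6.0 installed).
    from compressed_tensors import CompressionFormat
    from compressed_tensors.config import CompressionFormat as ConfigFormat
    from compressed_tensors.quantization import (ActivationOrdering,
                                                 QuantizationArgs,
                                                 QuantizationStrategy,
                                                 QuantizationType)

    # The two import paths used across the diffs resolve to the same class.
    assert CompressionFormat is ConfigFormat

    # Defaults match the vendored QuantizationArgs model deleted below.
    args = QuantizationArgs()
    assert args.num_bits == 8
    assert args.type == QuantizationType.INT
    assert args.dynamic is False

    # Enum values match the strings vLLM compares against.
    assert CompressionFormat.int_quantized.value == "int-quantized"
    assert QuantizationStrategy.TENSOR.value == "tensor"
    assert ActivationOrdering.GROUP.value == "group"
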
diff --git a/requirements-common.txt b/requirements-common.txt
index a9596878a0f8..73e90447e9df 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -31,3 +31,4 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
+compressed-tensors == 0.6.0 # required for compressed-tensors
diff --git a/requirements-test.txt b/requirements-test.txt
index 9c6fadb88865..1ccff94ee709 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -17,7 +17,6 @@ requests
 ray[adag]==2.35
 sentence-transformers # required for embedding
 soundfile # required for audio test
-compressed-tensors==0.4.0 # required for compressed-tensors
 timm # required for internvl test
 transformers_stream_generator # required for qwen-vl test
 matplotlib # required for qwen-vl test
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 627b2abaabcf..acb9bd72179f 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -5,13 +5,12 @@
 
 import pytest
 import torch
+from compressed_tensors.quantization import QuantizationType
 
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24,
     CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationType)
 
 
 @pytest.mark.parametrize("model_args", [
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 362feeef2e33..f677a0b706f4 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -2,6 +2,10 @@
 
 import torch
 from pydantic import BaseModel
+from compressed_tensors.config import CompressionFormat
+from compressed_tensors.quantization import (QuantizationArgs,
+                                             QuantizationStrategy,
+                                             QuantizationType)
 
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
@@ -16,8 +20,7 @@
     CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
     CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    CompressionFormat, QuantizationArgs, QuantizationStrategy,
-    QuantizationType, find_matched_target, is_activation_quantization_format,
+    find_matched_target, is_activation_quantization_format,
     should_ignore_layer)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.platforms import current_platform
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index 6666a4bf1f26..d65834181095 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -3,14 +3,14 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors import CompressionFormat
+from compressed_tensors.quantization import QuantizationStrategy
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEMethodBase,
                                                   FusedMoeWeightScaleSupported)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     WNA16_SUPPORTED_BITS)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    CompressionFormat, QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     all_close_1d, normalize_e4m3fn_to_e4m3fnuz, per_tensor_dequantize)
 from vllm.model_executor.utils import set_weight_attrs
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
index 3d55d55cc390..1671a23d77c6 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py
@@ -1,11 +1,10 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
     apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index 078380f15929..f00218581854 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -2,11 +2,10 @@
 
 import torch
 from torch.nn import Parameter
+from compressed_tensors.quantization import QuantizationStrategy
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_int8_linear, convert_to_channelwise)
 from vllm.model_executor.parameter import (BasevLLMParameter,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index fc531b9d666e..a74eaef5efde 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -1,111 +1,13 @@
 import re
-from enum import Enum
-from typing import Any, Dict, Iterable, Optional, Union
+from typing import Iterable, Optional
 
-from pydantic import BaseModel, Field, field_validator
+from compressed_tensors import CompressionFormat
 from torch.nn import Module
 
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     FUSED_LAYER_NAME_MAPPING)
 
 
-class CompressionFormat(Enum):
-    dense = "dense"
-    sparse_bitmask = "sparse-bitmask"
-    naive_quantized = "naive-quantized"
-    float_quantized = "float-quantized"
-    int_quantized = "int-quantized"
-    pack_quantized = "pack-quantized"
-    marlin_24 = "marlin-24"
-
-
-class QuantizationType(str, Enum):
-    """
-    Enum storing quantization type options
-    """
-
-    INT = "int"
-    FLOAT = "float"
-
-
-class QuantizationStrategy(str, Enum):
-    """
-    Enum storing quantization strategy options
-    """
-
-    TENSOR = "tensor"
-    CHANNEL = "channel"
-    GROUP = "group"
-    BLOCK = "block"
-    TOKEN = "token"
-
-
-class ActivationOrdering(str, Enum):
-    """
-    Enum storing strategies for activation ordering
-
-    Group: reorder groups and weight\n
-    Weight: only reorder weight, not groups. Slightly lower latency and
-    accuracy compared to group actorder\n
-    """
-
-    GROUP = "group"
-    WEIGHT = "weight"
-
-
-class QuantizationArgs(BaseModel):
-    """
-    User facing arguments used to define a quantization config
-    for weights or activations
-
-    :param num_bits: quantization bit depth
-    :param type: dtype to quantized to, either int or float
-    :param symmetric: whether or not quantization scale is symmetric
-    :param strategy: string determining the scope of scale/zero-point to apply
-    :param group_size: group length to use for the group strategy
-    :param block_structure: 2d block structure to use for the block
-        strategy, must be of the format "2x4", "8x16", etc.
-    :param dynamic: set True to perform dynamic quantization -
-        values will not be calibrated during calibration phase,
-        instead during inference new quantization ranges will be
-        observed with every sample. Defaults to False for static
-        quantization. Note that enabling dynamic quantization
-        will change the default observer to a memoryless one
-    :param actorder: whether to apply group quantization in decreasing order of
-        activation. Defaults to None for arbitrary ordering
-    """
-
-    num_bits: int = 8
-    type: QuantizationType = QuantizationType.INT
-    symmetric: bool = True
-    group_size: Optional[int] = None
-    strategy: Optional[QuantizationStrategy] = None
-    block_structure: Optional[str] = None
-    dynamic: bool = False
-    actorder: Union[ActivationOrdering, bool, None] = None
-    observer: str = Field(
-        default="minmax",
-        description=("The class to use to compute the quantization param - "
-                     "scale and zero-point'"),
-    )
-    observer_kwargs: Dict[str, Any] = Field(
-        default_factory=dict,
-        description=
-        ("optional dict of kwargs to be passed directly to torch quantization "
-         "Observers constructor excluding quantization range or symmetry"),
-    )
-
-    @field_validator("actorder", mode="before")
-    def validate_actorder(cls, value) -> Optional[ActivationOrdering]:
-        if isinstance(value, bool):
-            return ActivationOrdering.GROUP if value else None
-
-        if isinstance(value, str):
-            return ActivationOrdering(value.lower())
-
-        return value
-
-
 def is_activation_quantization_format(format: str) -> bool:
     _ACTIVATION_QUANTIZATION_FORMATS = [
         CompressionFormat.naive_quantized.value,

From 97c00c6e9a44e8c4e755c03cef601dc263ca71f7 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Fri, 27 Sep 2024 15:14:29 +0000
Subject: [PATCH 2/2] Format

---
 .../quantization/compressed_tensors/compressed_tensors.py      | 2 +-
 .../compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py  | 3 +--
 .../compressed_tensors/schemes/compressed_tensors_w8a8_int8.py | 2 +-
 .../compressed_tensors/schemes/compressed_tensors_wNa16.py     | 3 +--
 4 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index f677a0b706f4..dffc278a9f91 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,11 +1,11 @@
 from typing import Any, Dict, List, Optional, cast
 
 import torch
-from pydantic import BaseModel
 from compressed_tensors.config import CompressionFormat
 from compressed_tensors.quantization import (QuantizationArgs,
                                              QuantizationStrategy,
                                              QuantizationType)
+from pydantic import BaseModel
 
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
index 5931ec36c97d..7270b302ef96 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py
@@ -1,12 +1,11 @@
 from typing import Callable, List, Optional
 
 import torch
+from compressed_tensors.quantization import QuantizationStrategy
 from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     apply_fp8_linear, cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz,
     requantize_with_max_scale)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
index f00218581854..5597dc888b7b 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py
@@ -1,8 +1,8 @@
 from typing import Callable, List, Optional
 
 import torch
-from torch.nn import Parameter
 from compressed_tensors.quantization import QuantizationStrategy
+from torch.nn import Parameter
 
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
index cb65557be8f9..a51573801778 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py
@@ -1,12 +1,11 @@
 from typing import Callable, List, Optional, Set
 
 import torch
+from compressed_tensors.quantization import ActivationOrdering
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
     CompressedTensorsScheme)
-from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
-    ActivationOrdering)
 from vllm.model_executor.layers.quantization.kernels import (
     MPLinearLayerConfig, choose_mp_linear_kernel)
 from vllm.model_executor.layers.quantization.utils.marlin_utils import (