Commit c39ec2a

Merge pull request vllm-project#2 from ri938/add_awq_improvements
Add awq improvements
2 parents 5bd5ed6 + 2f97151 commit c39ec2a

File tree: 7 files changed, +64 -112 lines changed

vllm/awq_quantization/qmodule.py

Lines changed: 6 additions & 53 deletions
@@ -32,67 +32,20 @@ def __init__(self, w_bit, group_size, in_features, out_features, bias, dev):
         self.out_features = out_features
         self.w_bit = w_bit
         self.group_size = group_size if group_size != -1 else in_features
+
         # quick sanity check (make sure aligment)
         assert self.in_features % self.group_size == 0
         assert out_features % (32 // self.w_bit) == 0
 
-        self.register_buffer('qweight', torch.zeros((in_features, out_features // (32 // self.w_bit)), dtype=torch.int32, device=dev))
-        self.register_buffer('qzeros', torch.zeros((in_features // self.group_size, out_features // (32 // self.w_bit)), dtype=torch.int32, device=dev))
-        self.register_buffer('scales', torch.zeros((in_features // self.group_size, out_features), dtype=torch.float16, device=dev))
+        self.register_buffer('qweight', torch.empty((in_features, out_features // (32 // self.w_bit)), dtype=torch.int32, device=dev))
+        self.register_buffer('qzeros', torch.empty((in_features // self.group_size, out_features // (32 // self.w_bit)), dtype=torch.int32, device=dev))
+        self.register_buffer('scales', torch.empty((in_features // self.group_size, out_features), dtype=torch.float16, device=dev))
+
         if bias:
-            self.register_buffer('bias', torch.zeros((out_features), dtype=torch.float16, device=dev))
+            self.register_buffer('bias', torch.empty((out_features), dtype=torch.float16, device=dev))
         else:
             self.bias = None
 
-    @classmethod
-    def from_linear(cls, linear, w_bit, group_size, init_only=False, scales=None, zeros=None):
-        awq_linear = cls(w_bit, group_size, linear.in_features, linear.out_features, linear.bias is not None, linear.weight.device)
-        if init_only:  # just prepare for loading sd
-            return awq_linear
-
-        # need scales and zeros info for real quantization
-        assert scales is not None and zeros is not None
-        scale_zeros = zeros * scales
-
-        awq_linear.scales = scales.clone().half()
-        if linear.bias is not None:
-            awq_linear.bias = linear.bias.clone().half()
-
-        pack_num = 32 // awq_linear.w_bit
-
-        intweight = []
-        for idx in range(awq_linear.in_features):
-            intweight.append(torch.round((linear.weight.data[:, idx] + scale_zeros[idx // group_size]) / awq_linear.scales[idx // group_size]).to(torch.int)[:, None])
-        intweight = torch.cat(intweight, dim=1)
-        intweight = intweight.t().contiguous()
-        intweight = intweight.to(dtype=torch.int32)
-        qweight = torch.zeros((intweight.shape[0], intweight.shape[1] // 32 * awq_linear.w_bit), dtype=torch.int32, device=intweight.device)
-
-        for col in range(intweight.shape[1] // pack_num):
-            if awq_linear.w_bit == 4:
-                order_map = [0, 2, 4, 6, 1, 3, 5, 7]
-            else:
-                raise NotImplementedError("Only 4-bit are supported for now.")
-            for i in range(pack_num):
-                qweight_col = intweight[:, col * pack_num + order_map[i]]
-                qweight[:, col] |= qweight_col << (i * awq_linear.w_bit)
-        awq_linear.qweight = qweight
-
-        zeros = zeros.to(dtype=torch.int32)
-        qzeros = torch.zeros((zeros.shape[0], zeros.shape[1] // 32 * awq_linear.w_bit), dtype=torch.int32, device=zeros.device)
-
-        for col in range(zeros.shape[1] // pack_num):
-            if awq_linear.w_bit == 4:
-                order_map = [0, 2, 4, 6, 1, 3, 5, 7]
-            else:
-                raise NotImplementedError("Only 4-bit are supported for now.")
-            for i in range(pack_num):
-                qzero_col = zeros[:, col * pack_num + order_map[i]]
-                qzeros[:, col] |= qzero_col << (i * awq_linear.w_bit)
-        awq_linear.qzeros = qzeros
-
-        return awq_linear
-
     @torch.no_grad()
     def forward(self, x):
         out_shape = x.shape[:-1] + (self.out_features, )
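The substantive changes above are the switch from torch.zeros to torch.empty for the qweight/qzeros/scales/bias buffers (presumably because they are always overwritten when the quantized checkpoint is loaded, so zero-filling them is wasted work) and the removal of the offline from_linear packing helper from this module. For reference, here is a minimal sketch of the 4-bit packing that helper performed, following the layout the removed code used; the function name is illustrative and not part of this commit:

    import torch

    def pack_int4(intweight: torch.Tensor, w_bit: int = 4) -> torch.Tensor:
        # Eight 4-bit values are packed into each int32 word; columns are
        # interleaved via [0, 2, 4, 6, 1, 3, 5, 7] to match the AWQ kernel layout.
        assert w_bit == 4, "only 4-bit packing is sketched here"
        pack_num = 32 // w_bit
        order_map = [0, 2, 4, 6, 1, 3, 5, 7]
        packed = torch.zeros((intweight.shape[0], intweight.shape[1] // pack_num),
                             dtype=torch.int32, device=intweight.device)
        for col in range(packed.shape[1]):
            for i in range(pack_num):
                val = intweight[:, col * pack_num + order_map[i]]
                packed[:, col] |= val << (i * w_bit)
        return packed

    # One row of eight int4 values collapses into a single int32 word.
    row = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=torch.int32)
    print(pack_int4(row))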

vllm/config.py

Lines changed: 41 additions & 36 deletions
@@ -12,6 +12,34 @@
 _GB = 1 << 30
 
 
+class QuantizationConfig:
+    """Quantization settings
+
+    Args:
+        method: The quantization method to apply
+        bits: How many bits the linear layers are quantized to
+        group_size: What size the weights were quantized in groups of
+    """
+
+    def __init__(
+        self,
+        method: str,
+        bits: Optional[int] = 4,
+        group_size: Optional[int] = 128
+    ) -> None:
+        self.method = method
+        self.bits = bits
+        self.group_size = group_size
+        self._verify()
+
+    def _verify(self) -> None:
+        allowed_methods = ['awq']
+        if self.method not in allowed_methods:
+            raise ValueError(
+                f"Unknown quantization method ({self.method})"
+                f" must be from choice of {allowed_methods}")
+
+
 class ModelConfig:
     """Configuration for the model.
 
@@ -31,6 +59,7 @@ class ModelConfig:
             will use FP16 precision for FP32 and FP16 models, and BF16 precision
             for BF16 models.
         seed: Random seed for reproducibility.
+        quantization_config: Optional quantization settings
     """
 
     def __init__(
@@ -44,6 +73,7 @@ def __init__(
         use_dummy_weights: bool,
         dtype: str,
         seed: int,
+        quantization_config: Optional[QuantizationConfig] = None
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -53,6 +83,7 @@ def __init__(
         self.use_np_weights = use_np_weights
         self.use_dummy_weights = use_dummy_weights
         self.seed = seed
+        self.quantization_config = quantization_config
 
         self.hf_config = get_config(model, trust_remote_code)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
@@ -86,6 +117,9 @@ def verify_with_parallel_config(
                 "must be divisible by pipeline parallel size "
                 f"({pipeline_parallel_size}).")
 
+        if self.quantization_config and tensor_parallel_size > 1:
+            raise NotImplementedError("Quantization does not currently support tensor parallelism")
+
     def get_hidden_size(self) -> int:
         return self.hf_config.hidden_size
 
@@ -140,6 +174,13 @@ def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
         total_num_hidden_layers = self.hf_config.num_hidden_layers
         return total_num_hidden_layers // parallel_config.pipeline_parallel_size
 
+    def get_quantization_method(self):
+        if self.quantization_config is None:
+            method = None
+        else:
+            method = self.quantization_config.method
+        return method
+
 
 class CacheConfig:
     """Configuration for the KV cache.
@@ -295,39 +336,3 @@ def _get_and_verify_dtype(
             f"of at least 8.0. Your {gpu_name} GPU has compute capability "
             f"{compute_capability[0]}.{compute_capability[1]}.")
     return torch_dtype
-
-
-class QuantizationConfig:
-    """Quantization settings
-
-    Args:
-        method: The quantization method to apply
-        bits: How many bits the linear layers are quantized to
-        group_size: What size the weights were quantized in groups of
-    """
-
-    def __init__(
-        self,
-        method: str,
-        bits: Optional[int] = 4,
-        group_size: Optional[int] = 128
-    ) -> None:
-        self.method = method
-        self.bits = bits
-        self.group_size = group_size
-
-        self._verify()
-
-    def _verify(self) -> None:
-        allowed_methods = ['awq']
-        if self.method not in allowed_methods:
-            raise ValueError(
-                f"Unknown quantization method ({self.method})"
-                f" must be from choice of {allowed_methods}")
-
-    def verify_with_parallel_config(self, parallel_config: "ParallelConfig") -> None:
-        tensor_parallel_size = parallel_config.tensor_parallel_size
-
-        if self.method is not None and tensor_parallel_size > 1:
-            raise NotImplementedError(
-                "Quantization does not currently support tensor parallelism")

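QuantizationConfig moves to the top of the file and drops its own verify_with_parallel_config method: the tensor-parallel check now lives on ModelConfig.verify_with_parallel_config, and get_quantization_method exposes the method string to callers. A minimal usage sketch, assuming the import path shown in the diff:

    from vllm.config import QuantizationConfig

    cfg = QuantizationConfig(method="awq")       # defaults: bits=4, group_size=128
    print(cfg.method, cfg.bits, cfg.group_size)  # awq 4 128

    try:
        QuantizationConfig(method="gptq")        # anything outside ['awq'] raises
    except ValueError as err:
        print(err)
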
vllm/engine/arg_utils.py

Lines changed: 3 additions & 3 deletions
@@ -152,11 +152,12 @@ def create_engine_configs(
         self,
     ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
         # Initialize the configs.
+        quantization_config = QuantizationConfig(self.quantization) if self.quantization else None
         model_config = ModelConfig(self.model, self.tokenizer,
                                    self.tokenizer_mode, self.trust_remote_code,
                                    self.download_dir, self.use_np_weights,
                                    self.use_dummy_weights, self.dtype,
-                                   self.seed)
+                                   self.seed, quantization_config)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space)
@@ -166,8 +167,7 @@ def create_engine_configs(
         scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                            self.max_num_seqs,
                                            model_config.get_max_model_len())
-        quantization_config = QuantizationConfig(self.quantization) if self.quantization else None
-        return model_config, cache_config, parallel_config, scheduler_config, quantization_config
+        return model_config, cache_config, parallel_config, scheduler_config
 
 
 @dataclass
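A rough usage sketch of the resulting flow: the quantization settings now travel inside ModelConfig instead of being returned as a fifth value from create_engine_configs. The model path below is a placeholder, and it is assumed here that EngineArgs exposes the quantization attribute this method reads:

    from vllm.engine.arg_utils import EngineArgs

    args = EngineArgs(model="path/to/awq-model", quantization="awq")
    model_config, cache_config, parallel_config, scheduler_config = (
        args.create_engine_configs())
    print(model_config.get_quantization_method())  # "awq"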

vllm/engine/llm_engine.py

Lines changed: 2 additions & 10 deletions
@@ -4,7 +4,7 @@
 from typing import Any, List, Optional, Tuple, TYPE_CHECKING
 
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, QuantizationConfig)
+                         SchedulerConfig)
 from vllm.core.scheduler import Scheduler
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.ray_utils import initialize_cluster, ray, RayWorker
@@ -55,7 +55,6 @@ class LLMEngine:
         stage_devices: The list of devices for each stage. Each stage is a list
             of (rank, node_resource, device) tuples.
         log_stats: Whether to log statistics.
-        quantization_config: Optional settings related to using quantized layers
     """
 
     def __init__(
@@ -64,7 +63,6 @@ def __init__(
         cache_config: CacheConfig,
         parallel_config: ParallelConfig,
         scheduler_config: SchedulerConfig,
-        quantization_config: Optional[QuantizationConfig],
        distributed_init_method: str,
        placement_group: Optional["PlacementGroup"],
        log_stats: bool,
@@ -80,15 +78,14 @@ def __init__(
             f"download_dir={model_config.download_dir!r}, "
             f"use_np_weights={model_config.use_np_weights}, "
             f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
-            f"quantization_method={getattr(quantization_config, 'method', None)}, "
+            f"quantization_method={model_config.get_quantization_method()}, "
             f"seed={model_config.seed})")
         # TODO(woosuk): Print more configs in debug mode.
 
         self.model_config = model_config
         self.cache_config = cache_config
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
-        self.quantization_config = quantization_config
         self.log_stats = log_stats
         self._verify_args()
 
@@ -132,7 +129,6 @@ def _init_workers(self, distributed_init_method: str):
             self.scheduler_config,
             0,
             distributed_init_method,
-            quantization_config=self.quantization_config
         )
         self.workers.append(worker)
         self._run_workers(
@@ -171,7 +167,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup"):
                 scheduler_config,
                 None,
                 None,
-                self.quantization_config
             ))
         self._run_workers(
             "init_model",
@@ -182,9 +177,6 @@ def _verify_args(self) -> None:
         self.model_config.verify_with_parallel_config(self.parallel_config)
         self.cache_config.verify_with_parallel_config(self.parallel_config)
 
-        if self.quantization_config is not None:
-            self.quantization_config.verify_with_parallel_config(self.parallel_config)
-
     def _init_cache(self) -> None:
         """Profiles the memory usage and initializes the KV cache."""
         # Get the maximum number of blocks that can be allocated on GPU and CPU.

vllm/model_executor/model_loader.py

Lines changed: 9 additions & 5 deletions
@@ -5,7 +5,7 @@
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import ModelConfig, QuantizationConfig
+from vllm.config import ModelConfig
 from vllm.model_executor.models import * # pylint: disable=wildcard-import
 from vllm.model_executor.weight_utils import initialize_dummy_weights
 
@@ -39,16 +39,19 @@ def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
         f"Supported architectures: {list(_MODEL_REGISTRY.keys())}")
 
 
-def get_model(model_config: ModelConfig, quantization_config: QuantizationConfig) -> nn.Module:
+def _supports_quantization(model_class):
+    return model_class is LlamaForCausalLM
+
+
+def get_model(model_config: ModelConfig) -> nn.Module:
     model_class = _get_model_architecture(model_config.hf_config)
     torch.set_default_dtype(model_config.dtype)
 
     # Create a model instance.
     # The weights will be initialized as empty tensors.
 
-    # TODO: better way to do this
-    if model_class is LlamaForCausalLM:
-        model = model_class(model_config.hf_config, quantization_config)
+    if _supports_quantization(model_class):
+        model = model_class(model_config.hf_config, model_config.quantization_config)
     else:
         model = model_class(model_config.hf_config)
 
@@ -62,4 +65,5 @@ def get_model(model_config: ModelConfig, quantization_config: QuantizationConfig
     model.load_weights(model_config.model, model_config.download_dir,
                        model_config.use_np_weights)
     model = model.cuda()
+
     return model.eval()
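The new _supports_quantization helper replaces the inline TODO and keeps the Llama-only special case in one place; get_model now pulls the quantization settings straight from model_config. If more architectures gained quantized variants later, the helper could become a set lookup, as in this hypothetical sketch (only LlamaForCausalLM is actually supported by this commit):

    # LlamaForCausalLM comes from the wildcard import of vllm.model_executor.models.
    _QUANTIZABLE_MODEL_CLASSES = {LlamaForCausalLM}

    def _supports_quantization(model_class) -> bool:
        return model_class in _QUANTIZABLE_MODEL_CLASSES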

vllm/model_executor/models/llama.py

Lines changed: 1 addition & 1 deletion
@@ -147,7 +147,7 @@ def get_quantized_layer(in_features, out_features, quant_config):
         in_features=in_features,
         out_features=out_features,
         bias=None,
-        dev=0 ## TODO: fix this without large spike in memory
+        dev=torch.cuda.current_device()
     )
     return layer
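Replacing the hard-coded dev=0 with torch.cuda.current_device() places each quantized layer's buffers on whichever GPU the calling worker has made current, rather than always allocating on GPU 0, which was the memory-spike concern in the removed TODO. A minimal illustration (the CPU fallback is an addition for machines without CUDA, not part of the diff):

    import torch

    dev = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
    scales = torch.empty((64, 4096), dtype=torch.float16, device=dev)
    print(scales.device)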

vllm/worker/worker.py

Lines changed: 2 additions & 4 deletions
@@ -6,7 +6,7 @@
 import torch.distributed
 
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
-                         SchedulerConfig, QuantizationConfig)
+                         SchedulerConfig)
 from vllm.model_executor import get_model, InputMetadata, set_random_seed
 from vllm.model_executor.parallel_utils.parallel_state import (
     initialize_model_parallel)
@@ -31,12 +31,10 @@ def __init__(
         scheduler_config: SchedulerConfig,
         rank: Optional[int] = None,
         distributed_init_method: Optional[str] = None,
-        quantization_config: Optional[QuantizationConfig] = None
     ) -> None:
         self.model_config = model_config
         self.parallel_config = parallel_config
         self.scheduler_config = scheduler_config
-        self.quantization_config = quantization_config
         self.rank = rank
         self.distributed_init_method = distributed_init_method
 
@@ -66,7 +64,7 @@ def init_model(self):
 
         # Initialize the model.
         set_random_seed(self.model_config.seed)
-        self.model = get_model(self.model_config, self.quantization_config)
+        self.model = get_model(self.model_config)
 
     @torch.inference_mode()
     def profile_num_available_blocks(
