Commit d55e28b

Added asymmetric integration to linear layers
1 parent f2bd246 commit d55e28b

3 files changed: +47 -8 lines changed

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py

Lines changed: 6 additions & 4 deletions
@@ -138,7 +138,7 @@ def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
             or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
         is_tensor = (weight_strategy and input_quant.strategy
                      == QuantizationStrategy.TENSOR.value)
-        is_symmetric = weight_quant.symmetric and input_quant.symmetric
+        is_symmetric = weight_quant.symmetric
         is_static = not weight_quant.dynamic and not input_quant.dynamic
 
         return is_8_bits and is_tensor and is_symmetric and is_static
@@ -151,7 +151,7 @@ def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
             or weight_quant.strategy == QuantizationStrategy.CHANNEL.value)
         is_token = (weight_strategy and input_quant.strategy
                     == QuantizationStrategy.TOKEN.value)
-        is_symmetric = weight_quant.symmetric and input_quant.symmetric
+        is_symmetric = weight_quant.symmetric
         is_dynamic = not weight_quant.dynamic and input_quant.dynamic
 
         return is_8_bits and is_token and is_symmetric and is_dynamic
@@ -265,12 +265,14 @@ def _get_scheme_from_parts(
         if self._is_static_tensor_w8a8(weight_quant, input_quant):
             return CompressedTensorsW8A8Int8(
                 strategy=weight_quant.strategy,
-                is_static_input_scheme=True)
+                is_static_input_scheme=True,
+                input_symmetric=input_quant.symmetric)
 
         if self._is_dynamic_token_w8a8(weight_quant, input_quant):
             return CompressedTensorsW8A8Int8(
                 strategy=weight_quant.strategy,
-                is_static_input_scheme=False)
+                is_static_input_scheme=False,
+                input_symmetric=input_quant.symmetric)
 
         raise NotImplementedError(
             "No compressed-tensors compatible scheme was found.")

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py

Lines changed: 24 additions & 2 deletions
@@ -17,9 +17,11 @@
 
 class CompressedTensorsW8A8Int8(CompressedTensorsScheme):
 
-    def __init__(self, strategy: str, is_static_input_scheme: bool):
+    def __init__(self, strategy: str, is_static_input_scheme: bool,
+                 input_symmetric: bool):
         self.strategy = strategy
         self.is_static_input_scheme = is_static_input_scheme
+        self.input_symmetric = input_symmetric
 
     @classmethod
     def get_min_capability(cls) -> int:
@@ -48,8 +50,21 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if self.is_static_input_scheme:
             layer.input_scale = Parameter(layer.input_scale.max(),
                                           requires_grad=False)
+            if not self.input_symmetric:
+                layer.input_zero_point = Parameter(layer.input_zero_point,
+                                                   requires_grad=False)
+            else:
+                layer.input_zero_point = None
         else:
             layer.input_scale = None
+            layer.input_zero_point = None
+
+        if not self.input_symmetric:
+            layer.azp_adj = layer.weight.sum(dim=0,
+                                             keepdim=True,
+                                             dtype=torch.int32)
+        else:
+            layer.azp_adj = None
 
     def create_weights(self, layer: torch.nn.Module,
                        output_partition_sizes: List[int],
@@ -90,11 +105,18 @@ def create_weights(self, layer: torch.nn.Module,
                                            weight_loader=weight_loader)
             layer.register_parameter("input_scale", input_scale)
 
+            if not self.input_symmetric:
+                raise NotImplementedError(
+                    "static input asymmetric quantization not supported yet")
+                input_zero_point = Parameter(torch.zeros(1, dtype=torch.int8))
+                layer.register_parameter("input_zero_point", input_zero_point)
+
     def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
                       bias: Optional[torch.Tensor]) -> torch.Tensor:
-
         return apply_int8_linear(input=x,
                                  weight=layer.weight,
                                  weight_scale=layer.weight_scale,
                                  input_scale=layer.input_scale,
+                                 input_zero_point=layer.input_zero_point,
+                                 azp_adj=layer.azp_adj,
                                  bias=bias)
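
Why azp_adj is just a column sum: with a static activation zero point, the float activation is recovered as x_scale * (x_q - zp), so the integer GEMM needs (x_q - zp) @ W = x_q @ W - zp * W.sum(dim=0). Precomputing that per-output-channel sum once in process_weights_after_loading lets the kernel apply the correction cheaply at runtime. A small sketch checking the identity (not vLLM code; it assumes the weight is laid out as [in_features, out_features], matching the sum over dim=0 above):

    import torch

    def int_matmul(a, b):
        # exact integer matmul via broadcasting, to avoid any float rounding
        return (a.unsqueeze(-1) * b.unsqueeze(0)).sum(dim=1)

    torch.manual_seed(0)
    x_q = torch.randint(-128, 128, (4, 16), dtype=torch.int32)  # quantized activations
    w = torch.randint(-128, 128, (16, 8), dtype=torch.int32)    # int8 weight values
    zp = torch.tensor(3, dtype=torch.int32)                     # static activation zero point

    azp_adj = w.sum(dim=0, keepdim=True)                        # precomputed once per layer
    exact = int_matmul(x_q - zp, w)
    corrected = int_matmul(x_q, w) - zp * azp_adj

    assert torch.equal(exact, corrected)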

vllm/model_executor/layers/quantization/utils/w8a8_utils.py

Lines changed: 17 additions & 2 deletions
@@ -191,13 +191,28 @@ def apply_int8_linear(
     weight: torch.Tensor,
     weight_scale: torch.Tensor,
     input_scale: Optional[torch.Tensor] = None,
+    input_zero_point: Optional[torch.Tensor] = None,
+    azp_adj: Optional[torch.Tensor] = None,
     bias: Optional[torch.Tensor] = None,
 ):
     # ops.scaled_int8_quant supports both dynamic and static quant.
     # * dynamic, layer.input_scale is None and x_scale computed from x.
     # * static, layer.input_scale is scalar and x_scale is input_scale.
-    x_q, x_scale, _ = ops.scaled_int8_quant(input, input_scale)
-
+    symmetric = azp_adj is None
+    x_q, x_scale, x_zp = ops.scaled_int8_quant(input,
                                               input_scale,
                                               input_zero_point,
                                               symmetric=symmetric)
+
+    if x_zp is not None:
+        return ops.cutlass_scaled_mm_azp(x_q,
+                                         weight,
+                                         scale_a=x_scale,
+                                         scale_b=weight_scale,
+                                         out_dtype=input.dtype,
+                                         azp_adj=azp_adj,
+                                         azp=x_zp,
+                                         bias=bias)
     return ops.cutlass_scaled_mm(x_q,
                                  weight,
                                  scale_a=x_scale,
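
With this change apply_int8_linear keeps the existing symmetric path and, whenever scaled_int8_quant hands back a zero point, routes through the azp-aware CUTLASS kernel instead. A rough reference of what that path computes for per-tensor static activation quantization (an assumption for illustration, not the kernel's actual contract; int8_linear_azp_reference is not vLLM code):

    import torch

    def int8_linear_azp_reference(x_q, weight, x_scale, weight_scale, azp,
                                  azp_adj, bias=None, out_dtype=torch.float16):
        # The real kernel accumulates in int32; float64 stays exact for int8 ranges.
        acc = x_q.to(torch.float64) @ weight.to(torch.float64)
        acc = acc - azp.to(torch.float64) * azp_adj.to(torch.float64)  # zero-point correction
        out = acc * x_scale * weight_scale                             # dequantize
        if bias is not None:
            out = out + bias
        return out.to(out_dtype)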
