[AMD][ROCm]Quantization methods on ROCm; Fix _scaled_mm call #8380
Changes from 1 commit
```diff
@@ -8,10 +8,12 @@
 from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
     QuantizationStrategy)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    apply_fp8_linear, cutlass_fp8_supported, requantize_with_max_scale)
+    apply_fp8_linear, cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz,
+    requantize_with_max_scale)
 from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
                                            ModelWeightParameter,
                                            PerTensorScaleParameter)
+from vllm.utils import is_hip
 
 __all__ = ["CompressedTensorsW8A8Fp8"]
 
```
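For context (not part of the diff): ROCm GPUs such as MI300 use the FP8 e4m3fnuz format rather than e4m3fn. The same bit pattern decodes to half the value in e4m3fnuz, and the e4m3fn negative-zero pattern (0x80) decodes to NaN, so checkpoint weights quantized as e4m3fn must be adjusted before use. Below is a minimal sketch of what a helper like normalize_e4m3fn_to_e4m3fnuz presumably does under those assumptions; the PR imports the real implementation from w8a8_utils, and the body here is illustrative, not the PR's code:

```python
from typing import Optional, Tuple

import torch


def normalize_e4m3fn_to_e4m3fnuz_sketch(
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
    input_scale: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
    # Illustrative sketch only; the real helper lives in
    # vllm.model_executor.layers.quantization.utils.w8a8_utils.
    assert weight.dtype == torch.float8_e4m3fn
    # Bit pattern 0x80 is -0.0 in e4m3fn but NaN in e4m3fnuz, so clear it
    # before reinterpreting the raw bits as e4m3fnuz.
    as_int8 = weight.view(torch.int8)
    as_int8[as_int8 == -128] = 0
    weight = as_int8.view(torch.float8_e4m3fnuz)
    # The same bits decode to half the value in e4m3fnuz, so double the
    # scales to keep dequantized values unchanged.
    weight_scale = weight_scale * 2.0
    if input_scale is not None:
        input_scale = input_scale * 2.0
    return weight, weight_scale, input_scale
```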
```diff
@@ -39,16 +41,37 @@ def process_weights_after_loading(self, layer) -> None:
                 logical_widths=layer.logical_widths,
             )
 
+            if is_hip():
+                weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
+                    weight=weight,
+                    weight_scale=max_w_scale,
+                    input_scale=layer.input_scale)
+                if input_scale is not None:
+                    layer.input_scale = Parameter(input_scale,
+                                                  requires_grad=False)
+
             layer.weight = Parameter(weight.t(), requires_grad=False)
             layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
 
         # If channelwise, scales are already lined up, so just transpose.
         elif self.strategy == QuantizationStrategy.CHANNEL:
             weight = layer.weight
+
+            if is_hip():
```
Contributor (comment on the is_hip() line above):
Can we move this is_hip() check and the code that follows it to a utils file? It seems to be a similar update to the weights/scales in multiple cases.

Collaborator (Author):
We could move the input_scale adjustment into utils, as it is the one common part between the 3 quantization types that support HIP (fnuz), but IMO that would make it less explicit, because different parameters of the layer (weight, weight_scale, input_scale) would be adjusted in different places in the code.
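For illustration of the suggestion above (hypothetical; not adopted in the PR): a shared helper in a utils module could bundle the is_hip() branch with the input_scale update that recurs across strategies. The name maybe_normalize_for_rocm and its signature are assumptions, not part of the PR:

```python
from typing import Tuple

import torch
from torch.nn import Parameter

from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    normalize_e4m3fn_to_e4m3fnuz)
from vllm.utils import is_hip


def maybe_normalize_for_rocm(
    layer: torch.nn.Module,
    weight: torch.Tensor,
    weight_scale: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    # Hypothetical helper: on HIP, convert e4m3fn weight/scales to e4m3fnuz
    # and update layer.input_scale in place; otherwise pass through unchanged.
    if not is_hip():
        return weight, weight_scale
    weight, weight_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz(
        weight=weight,
        weight_scale=weight_scale,
        input_scale=layer.input_scale)
    if input_scale is not None:
        layer.input_scale = Parameter(input_scale, requires_grad=False)
    return weight, weight_scale
```

As the author notes, this would keep the input_scale update out of the calling code, at the cost of mutating layer state inside a helper while weight and weight_scale are still reassigned at the call site.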
```diff
+                weight, weight_scale, input_scale = \
+                    normalize_e4m3fn_to_e4m3fnuz(
+                        weight=weight,
+                        weight_scale=layer.weight_scale,
+                        input_scale=layer.input_scale)
+                if input_scale is not None:
+                    layer.input_scale = Parameter(input_scale,
+                                                  requires_grad=False)
+            else:
+                weight_scale = layer.weight_scale.data
+
             layer.weight = Parameter(weight.t(), requires_grad=False)
             # required by torch.compile to be torch.nn.Parameter
-            layer.weight_scale = Parameter(layer.weight_scale.data,
-                                           requires_grad=False)
+            layer.weight_scale = Parameter(weight_scale, requires_grad=False)
 
         else:
             raise ValueError(f"Unknown quantization strategy {self.strategy}")
```
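A quick sanity check (not from the PR; assumes a recent PyTorch build with CPU casts for the float8 dtypes) that reinterpreting e4m3fn bits as e4m3fnuz and doubling the scale leaves dequantized values unchanged:

```python
import torch

x = torch.randn(16)
scale = x.abs().max() / torch.finfo(torch.float8_e4m3fn).max

q = (x / scale).to(torch.float8_e4m3fn)              # per-tensor FP8 quantization
dq_fn = q.to(torch.float32) * scale                  # dequantize as e4m3fn

bits = q.view(torch.int8).clone()
bits[bits == -128] = 0                               # -0.0 in e4m3fn would be NaN in e4m3fnuz
q_fnuz = bits.view(torch.float8_e4m3fnuz)            # same bits, reinterpreted
dq_fnuz = q_fnuz.to(torch.float32) * (2.0 * scale)   # doubled scale compensates

assert torch.allclose(dq_fn, dq_fnuz)
```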