From 91e47ae9a468b449595654ccab8ffb9bf8d75270 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Wed, 11 Sep 2024 16:03:07 +0000 Subject: [PATCH 1/5] Enabling compressed-tensors and fbgemm quantization on rocm. Fixing scaled_mm required parameters --- vllm/config.py | 4 +- .../schemes/compressed_tensors_w8a8_fp8.py | 29 ++++++++++++-- .../layers/quantization/fbgemm_fp8.py | 15 ++++++- .../layers/quantization/utils/w8a8_utils.py | 39 ++++++++++--------- 4 files changed, 63 insertions(+), 24 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 4d9310af79ed..664933db0b7b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -280,7 +280,9 @@ def _parse_quant_hf_config(self): def _verify_quantization(self) -> None: supported_quantization = [*QUANTIZATION_METHODS] - rocm_supported_quantization = ["awq", "gptq", "fp8"] + rocm_supported_quantization = [ + "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8" + ] optimized_quantization_methods = [ "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", "awq_marlin", "fbgemm_fp8", "compressed_tensors", diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py index 8a3d24e2fd25..5931ec36c97d 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py @@ -8,10 +8,12 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( QuantizationStrategy) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - apply_fp8_linear, cutlass_fp8_supported, requantize_with_max_scale) + apply_fp8_linear, cutlass_fp8_supported, normalize_e4m3fn_to_e4m3fnuz, + requantize_with_max_scale) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, ModelWeightParameter, PerTensorScaleParameter) +from vllm.utils import is_hip __all__ = ["CompressedTensorsW8A8Fp8"] @@ -39,16 +41,37 @@ def process_weights_after_loading(self, layer) -> None: logical_widths=layer.logical_widths, ) + if is_hip(): + weight, max_w_scale, input_scale = normalize_e4m3fn_to_e4m3fnuz( + weight=weight, + weight_scale=max_w_scale, + input_scale=layer.input_scale) + if input_scale is not None: + layer.input_scale = Parameter(input_scale, + requires_grad=False) + layer.weight = Parameter(weight.t(), requires_grad=False) layer.weight_scale = Parameter(max_w_scale, requires_grad=False) # If channelwise, scales are already lined up, so just transpose. 
elif self.strategy == QuantizationStrategy.CHANNEL: weight = layer.weight + + if is_hip(): + weight, weight_scale, input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + weight=weight, + weight_scale=layer.weight_scale, + input_scale=layer.input_scale) + if input_scale is not None: + layer.input_scale = Parameter(input_scale, + requires_grad=False) + else: + weight_scale = layer.weight_scale.data + layer.weight = Parameter(weight.t(), requires_grad=False) # required by torch.compile to be torch.nn.Parameter - layer.weight_scale = Parameter(layer.weight_scale.data, - requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) else: raise ValueError(f"Unknown quantization strategy {self.strategy}") diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 3ccf1af9eb89..0b1f6ff68520 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -15,10 +15,11 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - apply_fp8_linear) + apply_fp8_linear, normalize_e4m3fn_to_e4m3fnuz) from vllm.model_executor.parameter import (ChannelQuantScaleParameter, ModelWeightParameter) from vllm.platforms import current_platform +from vllm.utils import is_hip logger = init_logger(__name__) @@ -127,8 +128,18 @@ def process_weights_after_loading(self, layer: Module) -> None: layer.weight = Parameter(layer.weight.data, requires_grad=False) weight = layer.weight - layer.weight = Parameter(weight.t(), requires_grad=False) + if is_hip(): + weight, weight_scale, input_scale = \ + normalize_e4m3fn_to_e4m3fnuz( + weight=weight, + weight_scale=layer.weight_scale, + input_scale=None) + if input_scale is not None: + layer.input_scale = Parameter(input_scale, requires_grad=False) + layer.weight_scale = Parameter(weight_scale, requires_grad=False) + + layer.weight = Parameter(weight.t(), requires_grad=False) if self.quant_config.use_marlin: prepare_fp8_layer_for_marlin(layer) # Activations not quantized for marlin. diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index a54e3cae73b1..791cfebd2844 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -6,11 +6,9 @@ from vllm.platforms import current_platform from vllm.utils import is_hip -# scaled_mm in pytorch on rocm has a bug that requires always -# providing scaling factor for result. This value is created -# as global value to avoid multiple tensor allocations, and -# can be removed once pytorch fixes the bug. -TORCH_SCALED_MM_SCALE_RESULT = torch.ones(1).cuda() if is_hip() else None +# Input scaling factors are no longer optional in _scaled_mm starting +# from pytorch 2.5. 
Allocating a dummy tensor to pass as input_scale +TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() if is_hip() else None def cutlass_fp8_supported() -> bool: @@ -130,19 +128,17 @@ def apply_fp8_linear( if per_tensor_weights and per_tensor_activations: # Fused GEMM_DQ - output = torch._scaled_mm( - qinput, - weight, - out_dtype=input.dtype, - scale_a=x_scale, - scale_b=weight_scale, - scale_result=TORCH_SCALED_MM_SCALE_RESULT, - bias=bias) - # Since in torch 2.5, scaled_mm only returns single value - # This should be removed when vllm-nvidia also moves to 2.5 - if is_hip(): - return torch.narrow(output, 0, 0, input.shape[0]) - return torch.narrow(output[0], 0, 0, input.shape[0]) + output = torch._scaled_mm(qinput, + weight, + out_dtype=input.dtype, + scale_a=x_scale, + scale_b=weight_scale, + bias=bias) + # A fix for discrepancy in scaled_mm which returns tuple + # for torch < 2.5 and a single value in torch >= 2.5 + if type(output) is tuple and len(output) == 2: + return torch.narrow(output[0], 0, 0, input.shape[0]) + return torch.narrow(output, 0, 0, input.shape[0]) else: # Fallback for channelwise case, where we use unfused DQ @@ -160,11 +156,18 @@ def apply_fp8_linear( # For the scaled_mm fallback case, we break this down, since it # does not support s_w being a vector. + # Making sure the dummy tensor is on the same device as the weight + global TORCH_DEVICE_IDENTITY + if TORCH_DEVICE_IDENTITY.device != weight.device: + TORCH_DEVICE_IDENTITY = TORCH_DEVICE_IDENTITY.to(weight.device) + # GEMM # This computes C = (X * W). # Output in fp32 to allow subsequent ops to happen in-place output, _ = torch._scaled_mm(qinput, weight, + scale_a=TORCH_DEVICE_IDENTITY, + scale_b=TORCH_DEVICE_IDENTITY, out_dtype=torch.float32) # Unpad (undo num_token_padding) output = torch.narrow(output, 0, 0, input.shape[0]) From a108f1d7b432b5abcd801a66e734d161c05aada9 Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Thu, 12 Sep 2024 10:42:34 -0500 Subject: [PATCH 2/5] Update run-amd-test.sh - enable tests/kernels/test_fp8_quant.py enable tests/kernels/test_fp8_quant.py --- .buildkite/run-amd-test.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index c9b72a3264e8..89eb83b901bd 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -83,7 +83,6 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_encoder_decoder_attn.py \ --ignore=kernels/test_flash_attn.py \ --ignore=kernels/test_flashinfer.py \ - --ignore=kernels/test_int8_quant.py \ --ignore=kernels/test_machete_gemm.py \ --ignore=kernels/test_mamba_ssm.py \ --ignore=kernels/test_marlin_gemm.py \ From 457c026e4d0d6fa250903f7c96de36c10b6315f7 Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Fri, 13 Sep 2024 10:14:24 -0500 Subject: [PATCH 3/5] Restore blocking of the kernels/test_int8_quant.py test --- .buildkite/run-amd-test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index dfe80ff86a21..6659440135ff 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -83,6 +83,7 @@ if [[ $commands == *" kernels "* ]]; then --ignore=kernels/test_encoder_decoder_attn.py \ --ignore=kernels/test_flash_attn.py \ --ignore=kernels/test_flashinfer.py \ + --ignore=kernels/test_int8_quant.py \ --ignore=kernels/test_machete_gemm.py \ --ignore=kernels/test_mamba_ssm.py \ 
--ignore=kernels/test_marlin_gemm.py \ From 02074f4bc9ec90ef587084bcfa70eaf668f9bcf3 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:31:28 -0400 Subject: [PATCH 4/5] Update vllm/config.py Co-authored-by: Michael Goin --- vllm/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/config.py b/vllm/config.py index 3aac5dbedcea..c7d0e725fc5b 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -281,7 +281,8 @@ def _parse_quant_hf_config(self): def _verify_quantization(self) -> None: supported_quantization = [*QUANTIZATION_METHODS] rocm_supported_quantization = [ - "awq", "gptq", "fp8", "compressed-tensors", "fbgemm_fp8" + "awq", "gptq", "fp8", "compressed_tensors", "compressed-tensors", + "fbgemm_fp8" ] optimized_quantization_methods = [ "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin", From ea368b026d602e97cd25aeb660c2e436732b1572 Mon Sep 17 00:00:00 2001 From: Gregory Shtrasberg Date: Fri, 13 Sep 2024 22:13:06 +0000 Subject: [PATCH 5/5] Adapt the alternative scaled_mm call to torch >=2.5 --- .../layers/quantization/utils/w8a8_utils.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 791cfebd2844..9fc6fa9fe793 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -164,11 +164,15 @@ def apply_fp8_linear( # GEMM # This computes C = (X * W). # Output in fp32 to allow subsequent ops to happen in-place - output, _ = torch._scaled_mm(qinput, - weight, - scale_a=TORCH_DEVICE_IDENTITY, - scale_b=TORCH_DEVICE_IDENTITY, - out_dtype=torch.float32) + output = torch._scaled_mm(qinput, + weight, + scale_a=TORCH_DEVICE_IDENTITY, + scale_b=TORCH_DEVICE_IDENTITY, + out_dtype=torch.float32) + # A fix for discrepancy in scaled_mm which returns tuple + # for torch < 2.5 and a single value in torch >= 2.5 + if type(output) is tuple and len(output) == 2: + output = output[0] # Unpad (undo num_token_padding) output = torch.narrow(output, 0, 0, input.shape[0]) x_scale = torch.narrow(x_scale, 0, 0, input.shape[0])
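
Note (not part of the patches above): the two torch behaviors this series works around can be summarized in a short, self-contained Python sketch. The names scaled_mm_compat, normalize_fn_to_fnuz_sketch and IDENTITY_SCALE are hypothetical stand-ins, not the patched code; the real helpers (TORCH_DEVICE_IDENTITY, apply_fp8_linear, normalize_e4m3fn_to_e4m3fnuz) live in vllm/model_executor/layers/quantization/utils/w8a8_utils.py.

```python
import torch

# Per-tensor "identity" scale, mirroring the TORCH_DEVICE_IDENTITY global in
# patch 1/5: from torch 2.5 onwards _scaled_mm requires scale_a/scale_b, so a
# ones(1) tensor stands in when no real per-tensor scaling is wanted.
IDENTITY_SCALE = torch.ones(1)


def scaled_mm_compat(qinput, weight, scale_a, scale_b, out_dtype, bias=None):
    # Hypothetical wrapper illustrating the version handling the series adds
    # around torch._scaled_mm at both call sites.
    output = torch._scaled_mm(qinput,
                              weight,
                              scale_a=scale_a,
                              scale_b=scale_b,
                              out_dtype=out_dtype,
                              bias=bias)
    # torch < 2.5 returns an (output, amax) tuple, torch >= 2.5 returns the
    # output tensor directly; unwrap so callers always see a single tensor.
    if isinstance(output, tuple) and len(output) == 2:
        output = output[0]
    return output


def normalize_fn_to_fnuz_sketch(weight, weight_scale, input_scale=None):
    # Rough, illustrative sketch of why normalize_e4m3fn_to_e4m3fnuz is
    # called on ROCm (the real helper also remaps the bit pattern that means
    # -0 in e4m3fn but NaN in e4m3fnuz). ROCm hardware uses the e4m3fnuz
    # format, whose value for a given bit pattern is half of the e4m3fn
    # value, so the scales are doubled to keep dequantized values unchanged.
    weight = weight.view(torch.int8).view(torch.float8_e4m3fnuz)
    weight_scale = weight_scale * 2.0
    if input_scale is not None:
        input_scale = input_scale * 2.0
    return weight, weight_scale, input_scale
```

The tuple-unwrapping shown in scaled_mm_compat is the same pattern patch 1/5 applies to the fused per-tensor path and patch 5/5 applies to the channelwise fallback path, so both _scaled_mm call sites behave identically on torch versions before and after 2.5.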