From 87160e5cf2b3f92d440f8cff2bd75e3956559e57 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Thu, 13 Jun 2024 21:48:29 +0000
Subject: [PATCH 1/3] Fix broadcast load epilogue

---
 csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
index 8f38bbf50790..877a9f5b9e5d 100644
--- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
+++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
@@ -153,7 +153,7 @@ struct Sm90RowOrScalarBroadcast {

   CUTLASS_DEVICE void
   begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) {
-    if (params.ptr_row == nullptr) {
+    if (!params.row_broadcast) {
       return;
     }

From bf129c8ae01e586daa2a2d52e9d4e5342c25f06f Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Thu, 13 Jun 2024 21:50:26 +0000
Subject: [PATCH 2/3] turn on cutlass again

---
 vllm/model_executor/layers/quantization/fp8.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index bc08bfcc32b3..c02cccd909be 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -257,9 +257,7 @@ def apply(self,
         #   If dynamic, layer.input_scale is None and x_scale computed from x.
         #   If static, layer.input_scale is scalar and x_scale is input_scale.

-        # Temporarily disable CUTLASS kernels due to an illegal memory access
-        #if bias is None and self.cutlass_fp8_supported:
-        if False:
+        if bias is None and self.cutlass_fp8_supported:
             qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)

             # Fused GEMM_DQ

From b25b3f3ba3544c6e23bffe46e60840de725993c0 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith
Date: Thu, 13 Jun 2024 22:00:09 +0000
Subject: [PATCH 3/3] format

---
 vllm/model_executor/layers/quantization/fp8.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index c02cccd909be..e89fd65813c0 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -257,7 +257,7 @@ def apply(self,
         #   If dynamic, layer.input_scale is None and x_scale computed from x.
         #   If static, layer.input_scale is scalar and x_scale is input_scale.

-        if bias is None and self.cutlass_fp8_supported:
+        if bias is None and self.cutlass_fp8_supported:
             qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)

             # Fused GEMM_DQ