From 6a414d9ee88d46b76530f8baee6690a616d00610 Mon Sep 17 00:00:00 2001 From: uvos Date: Sun, 22 Jun 2025 10:29:33 +0200 Subject: [PATCH 1/4] CUDA/HIP: optimize mmv paths taken for HIP/CDNA --- ggml/src/ggml-cuda/common.cuh | 6 +++++- ggml/src/ggml-cuda/mmv.cu | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 1369bc2d9e5..4b69771af84 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -263,7 +263,11 @@ static bool fp16_mma_hardware_available(const int cc) { } static bool bf16_mma_hardware_available(const int cc) { - return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE; + return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; +} + +static bool fp32_mma_hardware_available(const int cc) { + return GGML_CUDA_CC_IS_CDNA(cc); } // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later. 
diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index 1502e9d942f..269f648a888 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -456,6 +456,8 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ return ne11 <= 4; } return ne11 <= 3; + } else if (fp32_mma_hardware_available(cc)) { + return ne11 <= 3; } return ne11 <= 8; case GGML_TYPE_F16: @@ -468,6 +470,8 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ return src0_small && ne11 <= 3; } return ne11 <= 8; + } else if (fp16_mma_hardware_available(cc)) { + return ne11 <= 2; } return ne11 <= 8; case GGML_TYPE_BF16: @@ -480,6 +484,8 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ return src0_small && ne11 <= 3; } return ne11 <= 8; + } else if (bf16_mma_hardware_available(cc)) { + return ne11 <= 3; } return ne11 <= 8; default: From 66f4c93b2c4cfaf4d5352d6ab8a89bf534bf4e96 Mon Sep 17 00:00:00 2001 From: uvos Date: Sun, 22 Jun 2025 21:37:26 +0200 Subject: [PATCH 2/4] CUDA/HIP: use mmv instead of rocblas for batch 3-5 mmv --- ggml/src/ggml-cuda/mmv.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index 269f648a888..bea8c448196 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -471,6 +471,9 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ } return ne11 <= 8; } else if (fp16_mma_hardware_available(cc)) { + if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { + return ne11 <= 5; + } return ne11 <= 2; } return ne11 <= 8; From 70774e169ce189d896597f34bbe9df1f0c13ee5a Mon Sep 17 00:00:00 2001 From: uvos Date: Mon, 23 Jun 2025 18:26:38 +0200 Subject: [PATCH 3/4] Update ggml/src/ggml-cuda/common.cuh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/common.cuh | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 4b69771af84..f6127aeee42 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -263,7 +263,7 @@ static bool fp16_mma_hardware_available(const int cc) { } static bool bf16_mma_hardware_available(const int cc) { - return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; + return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3; } static bool fp32_mma_hardware_available(const int cc) { From 8bd52b67803636674af29efcc6dd326ceacedc26 Mon Sep 17 00:00:00 2001 From: uvos Date: Mon, 23 Jun 2025 22:39:20 +0200 Subject: [PATCH 4/4] restrict to amd --- ggml/src/ggml-cuda/mmv.cu | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index bea8c448196..e14c93516bd 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -456,8 +456,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ return ne11 <= 4; } return ne11 <= 3; - } else if (fp32_mma_hardware_available(cc)) { - return ne11 <= 3; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (fp32_mma_hardware_available(cc)) { + return ne11 <= 3; + } + return ne11 <= 8; } return ne11 <= 8; case GGML_TYPE_F16: @@ -470,11 +473,14 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ return src0_small && ne11 <= 3; } return ne11 <= 8; - } else if (fp16_mma_hardware_available(cc)) { - if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { - return ne11 <= 5; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (fp16_mma_hardware_available(cc)) { + if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) { + return ne11 <= 5; + } + return ne11 <= 2; } - return ne11 <= 2; + return ne11 <= 8; } 
return ne11 <= 8; case GGML_TYPE_BF16: @@ -487,8 +493,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_ return src0_small && ne11 <= 3; } return ne11 <= 8; - } else if (bf16_mma_hardware_available(cc)) { - return ne11 <= 3; + } else if (GGML_CUDA_CC_IS_AMD(cc)) { + if (bf16_mma_hardware_available(cc)) { + return ne11 <= 3; + } + return ne11 <= 8; } return ne11 <= 8; default: