From 9b9c5f84cd6a37749d99791f871ba81714759542 Mon Sep 17 00:00:00 2001 From: Jonas Kuebler Date: Mon, 29 Sep 2025 14:30:06 +0000 Subject: [PATCH] optimize decoding performance for headdim 128 fp8 Signed-off-by: Jonas Kuebler --- hopper/tile_size.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hopper/tile_size.h b/hopper/tile_size.h index 24c76b84c2..cede4f6232 100644 --- a/hopper/tile_size.h +++ b/hopper/tile_size.h @@ -60,7 +60,11 @@ constexpr std::tuple tile_size_fwd_sm90( } else if (headdim <= 96) { return {192, 128, true, true}; } else if (headdim <= 128) { - return {128, paged_kv_non_TMA ? 160 : (v_colmajor || (softcap && is_local) ? 192 : 224), true, true}; + if (use_one_mma_wg) { + return {64, 96, true, true}; + } else{ + return {128, paged_kv_non_TMA ? 160 : (v_colmajor || (softcap && is_local) ? 192 : 224), true, true}; + } } else if (headdim <= 192) { return {128, (paged_kv_non_TMA || softcap) && is_local ? 128 : 160, true, true}; } else {