From a4575b65280562180a2551a152ff113e1fedad78 Mon Sep 17 00:00:00 2001 From: Bangtian Liu Date: Mon, 24 Nov 2025 21:38:43 -0800 Subject: [PATCH 1/3] [tuner] add padding support along IGEMM path for conv ops Signed-off-by: Bangtian Liu --- amdsharktuner/amdsharktuner/common.py | 57 +++++++++++++++++ .../amdsharktuner/constraint_generator.py | 45 +++++++++---- .../amdsharktuner/dispatch_constraints.py | 3 - amdsharktuner/tests/common_test.py | 62 ++++++++++++++++++ .../tests/constraint_generator_test.py | 64 ++++++++++--------- 5 files changed, 184 insertions(+), 47 deletions(-) diff --git a/amdsharktuner/amdsharktuner/common.py b/amdsharktuner/amdsharktuner/common.py index f193b1393eb..76a06b25c5e 100644 --- a/amdsharktuner/amdsharktuner/common.py +++ b/amdsharktuner/amdsharktuner/common.py @@ -523,3 +523,60 @@ def get_target_info(input_module: ir.Module) -> iree_gpu.TargetInfo: target = executable_variant_op.target return iree_gpu.TargetInfo.get_gpu_target_info(target) + + +# The following two functions are from IREE side for padding utility: +# https://github.com/iree-org/iree/blob/8ae91ebb0e555e660b8a6898f6071476f7a1f20b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp#L631-L671 +def maybe_padded_bounds(original_bound: int, alignment: int) -> tuple[int, bool]: + remainder = original_bound % alignment + if remainder == 0: + return original_bound, False + return original_bound + alignment - remainder, True + + +def get_dim_bounds( + dims: list[int], + padding_can_be_expensive: bool, +) -> tuple[list[int], bool]: + result = [] + any_padding_applied = False + + for dim in dims: + if padding_can_be_expensive: + result.append(dim) + continue + + if dim > 128: + padded, was_padded = maybe_padded_bounds(dim, 128) + result.append(padded) + any_padding_applied = any_padding_applied or was_padded + elif dim > 32: + padded, was_padded = maybe_padded_bounds(dim, 32) + result.append(padded) + any_padding_applied = any_padding_applied 
or was_padded + else: + result.append(dim) + + return result, any_padding_applied + + +# Use padding logic from IREE side: +# https://github.com/iree-org/iree/blob/8ae91ebb0e555e660b8a6898f6071476f7a1f20b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp#L691-L703 +def calculate_padded_dimensions( + M: list[int], + N: list[int], + contraction_dims: ContractionDimensions, + contraction_maps: list[ir.AffineMap], +) -> tuple[list[int], list[int], bool]: + # Detect LHS transposition. Padding is disabled only when LHS is transposed. + k_dim_inner = contraction_dims.k[-1] + lhs_map = contraction_maps[0] + lhs_last_expr = lhs_map.results[-1] + lhs_dim_expr = ir.AffineDimExpr(lhs_last_expr) + + transposed_lhs = k_dim_inner != lhs_dim_expr.position + + M_padded, m_padding_applied = get_dim_bounds(M, transposed_lhs) + N_padded, n_padding_applied = get_dim_bounds(N, transposed_lhs) + + return M_padded, N_padded, m_padding_applied or n_padding_applied diff --git a/amdsharktuner/amdsharktuner/constraint_generator.py b/amdsharktuner/amdsharktuner/constraint_generator.py index e425f978cda..d342cabc27c 100644 --- a/amdsharktuner/amdsharktuner/constraint_generator.py +++ b/amdsharktuner/amdsharktuner/constraint_generator.py @@ -9,6 +9,7 @@ from abc import ABC, abstractmethod from typing import Iterator, Optional +from iree.compiler import ir # type: ignore from iree.compiler.dialects import iree_codegen, iree_gpu, linalg # type: ignore from . import common, dispatch_constraints, dispatch_parser @@ -57,7 +58,6 @@ def adjust_problem_size_for_pipeline( matmul_size.N = [bounds[i] for i in contraction_dims.n] matmul_size.K = [bounds[i] for i in contraction_dims.k] matmul_size.B = [bounds[i] for i in contraction_dims.batch] - return # Fallback: Manual flattening for legacy path when IGEMM details are unavailable. 
@@ -76,6 +76,7 @@ def generate_generic_contraction_solutions( rhs_type: common.ShapedType, res_type: common.ShapedType, dispatch_kind: common.DispatchKind, + indexing_maps: list[ir.AffineMap], codegen_pipeline: iree_codegen.DispatchLoweringPassPipeline = iree_codegen.DispatchLoweringPassPipeline.LLVMGPUVectorDistribute, num_subgroups: int = 4, allowed_waves_per_eu: list[int] = [2], @@ -94,6 +95,23 @@ def generate_generic_contraction_solutions( M, N, K = matmul_size.M, matmul_size.N, matmul_size.K tuner_ctx.logger.debug(f"{M},{N},{K}") + # Apply padding for TileAndFuse pipeline to get better tile sizes. + padding_applied = False + if codegen_pipeline == iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse: + # Use IGEMM maps if available (dimensions were restructured), otherwise use original indexing maps. + padding_maps = indexing_maps + if igemm_details: + padding_maps = [ + map_attr.value for map_attr in igemm_details.igemm_contraction_maps + ] + + M_padded, N_padded, padding_applied = common.calculate_padded_dimensions( + M, N, contraction_dims, padding_maps + ) + M, N = M_padded, N_padded + matmul_size.M = M + matmul_size.N = N + m_vars = [z3.Int(f"m{i}") for i in range(len(M))] n_vars = [z3.Int(f"n{i}") for i in range(len(N))] k_vars = [z3.Int(f"k{i}") for i in range(len(K))] @@ -238,23 +256,20 @@ def set_cdim_tile_sizes(tile_sizes, contraction_dims, csizes): [lookup(v) for v in k_vars], ) - required_padding = any( - p[-1] % i != 0 for p, i in zip((M, N, K), intrinsic_mnk_shape, strict=True) - ) promote_operands = [0, 1] padding = None - if required_padding: + if padding_applied: # TODO: Remove promotion of operand 2 once codegen supports handling padded outputs without promotion. 
promote_operands = [0, 1, 2] - _, _, mma_intrinsic_k = mma_attr.mnk_shape - padding = [ - *(workgroup_tile_sizes[d] for d in contraction_dims.m), - *(workgroup_tile_sizes[d] for d in contraction_dims.n), - *( - reduction_tile_sizes[d] * mma_intrinsic_k - for d in contraction_dims.k - ), - ] + padding_tile_sizes = list(workgroup_tile_sizes) + for k_dim in contraction_dims.k: + padding_tile_sizes[k_dim] = reduction_tile_sizes[k_dim] + + mma_intrinsic_k = mma_attr.mnk_shape[2] + inner_k_dim = contraction_dims.k[-1] + padding_tile_sizes[inner_k_dim] *= mma_intrinsic_k + + padding = padding_tile_sizes # Setting subgroup basis. # TODO(Bangtian): Sync changes from IREE PR: https://github.com/iree-org/iree/pull/22000. subgroup_basis_counts = [1] * num_loops @@ -562,6 +577,7 @@ def generate_solutions( rhs_type=self.op_info.rhs_type, res_type=self.op_info.res_type, dispatch_kind=common.DispatchKind.contraction, + indexing_maps=self.op_info.indexing_maps, codegen_pipeline=codegen_pipeline, **pipeline_constraint_options, ) @@ -587,6 +603,7 @@ def generate_solutions( rhs_type=self.op_info.rhs_type, res_type=self.op_info.res_type, dispatch_kind=common.DispatchKind.conv, + indexing_maps=self.op_info.indexing_maps, codegen_pipeline=codegen_pipeline, igemm_details=self.op_info.igemm_details, **pipeline_constraint_options, diff --git a/amdsharktuner/amdsharktuner/dispatch_constraints.py b/amdsharktuner/amdsharktuner/dispatch_constraints.py index 21cf5194629..2f55b05db7a 100644 --- a/amdsharktuner/amdsharktuner/dispatch_constraints.py +++ b/amdsharktuner/amdsharktuner/dispatch_constraints.py @@ -284,9 +284,6 @@ def generate_tile_and_fuse_constraints( M, N, K = list(matmul_size.M), list(matmul_size.N), list(matmul_size.K) m_tiles, n_tiles, k_tiles, subgroup_m_tiles, subgroup_n_tiles = tile_sizes intrinsic_mn, intrinsic_k = intrinsic_size - M[-1] = ((M[-1] + intrinsic_mn - 1) / intrinsic_mn) * intrinsic_mn - N[-1] = ((N[-1] + intrinsic_mn - 1) / intrinsic_mn) * 
intrinsic_mn - K[-1] = ((K[-1] + intrinsic_k - 1) / intrinsic_k) * intrinsic_k wg_x, wg_y, wg_z = workgroup_size wg_threads = wg_x constraints = [wg_y == 1, wg_z == 1] diff --git a/amdsharktuner/tests/common_test.py b/amdsharktuner/tests/common_test.py index 70d087a3080..24b40a2b972 100644 --- a/amdsharktuner/tests/common_test.py +++ b/amdsharktuner/tests/common_test.py @@ -505,3 +505,65 @@ def test_get_target_info(tuner_ctx: common.TunerContext) -> None: iree_gpu.MMAIntrinsic.MFMA_F32_16x16x16_F16, iree_gpu.VirtualMMAIntrinsic.VMFMA_F32_16x16x32_F16, ] + + +def test_maybe_padded_bounds() -> None: + # Already aligned: no padding. + assert common.maybe_padded_bounds(128, 128) == (128, False) + assert common.maybe_padded_bounds(64, 32) == (64, False) + + # Not aligned: padding needed. + assert common.maybe_padded_bounds(100, 128) == (128, True) + assert common.maybe_padded_bounds(50, 32) == (64, True) + assert common.maybe_padded_bounds(200, 128) == (256, True) + + +def test_get_dim_bounds() -> None: + # Padding not expensive: apply padding. + assert common.get_dim_bounds([200, 300], False) == ([256, 384], True) + assert common.get_dim_bounds([128, 256], False) == ([128, 256], False) + assert common.get_dim_bounds([200, 50, 20], False) == ([256, 64, 20], True) + + # Padding expensive: no padding applied. + assert common.get_dim_bounds([100, 200], True) == ([100, 200], False) + + +def test_calculate_padded_dimensions( + tuner_ctx: common.TunerContext, +) -> None: + with ir.Location.unknown(tuner_ctx.mlir_ctx): + dim0 = ir.AffineExpr.get_dim(0) + dim1 = ir.AffineExpr.get_dim(1) + dim2 = ir.AffineExpr.get_dim(2) + + lhs_map = ir.AffineMap.get(3, 0, [dim0, dim2]) + rhs_map = ir.AffineMap.get(3, 0, [dim2, dim1]) + res_map = ir.AffineMap.get(3, 0, [dim0, dim1]) + + contraction_dims = common.ContractionDimensions(m=[0], n=[1], k=[2], batch=[]) + + # Test with non-transposed LHS: (m, k) x (k, n) -> (m, n). + # LHS map: (d0, d1, d2) -> (d0, d2) # M=d0, K=d2 (non-transposed). 
+ (M_padded, N_padded, padding_applied,) = common.calculate_padded_dimensions( + M=[200], + N=[300], + contraction_dims=contraction_dims, + contraction_maps=[lhs_map, rhs_map, res_map], + ) + assert M_padded == [256], f"Expected M padded to 256, got {M_padded}" + assert N_padded == [384], f"Expected N padded to 384, got {N_padded}" + assert padding_applied == True + + # Test with transposed LHS: (k, m) x (k, n) -> (m, n). + # LHS map: (d0, d1, d2) -> (d2, d0) # K=d2, M=d0 (transposed). + lhs_transposed_map = ir.AffineMap.get(3, 0, [dim2, dim0]) + + (M_padded, N_padded, padding_applied,) = common.calculate_padded_dimensions( + M=[200], + N=[300], + contraction_dims=contraction_dims, + contraction_maps=[lhs_transposed_map, rhs_map, res_map], + ) + assert M_padded == [200], f"Expected M not padded, got {M_padded}" + assert N_padded == [300], f"Expected N not padded, got {N_padded}" + assert padding_applied == False diff --git a/amdsharktuner/tests/constraint_generator_test.py b/amdsharktuner/tests/constraint_generator_test.py index eeec74bddd4..05029321aee 100644 --- a/amdsharktuner/tests/constraint_generator_test.py +++ b/amdsharktuner/tests/constraint_generator_test.py @@ -306,9 +306,9 @@ def test_generate_solutions_tile_and_fuse_conv_padding( f16 = tuner_ctx.type.f16 f32 = tuner_ctx.type.f32 - input_shape = (2, 7, 7, 32) - kernel_shape = (3, 3, 32, 64) - output_shape = (2, 5, 5, 64) + input_shape = (2, 32, 32, 128) + kernel_shape = (3, 3, 128, 256) + output_shape = (2, 30, 30, 256) with ir.Location.unknown(context): module = ir.Module.create() @@ -336,13 +336,13 @@ def test_generate_solutions_tile_and_fuse_conv_padding( assert gen.op_info.dims.k == [4, 5, 6] assert gen.op_info.matmul_size.B == [] - assert gen.op_info.matmul_size.M == [2, 5, 5] - assert gen.op_info.matmul_size.N == [64] - assert gen.op_info.matmul_size.K == [3, 3, 32] + assert gen.op_info.matmul_size.M == [2, 30, 30] + assert gen.op_info.matmul_size.N == [256] + assert gen.op_info.matmul_size.K == 
[3, 3, 128] - assert gen.op_info.lhs_type.shape == [2, 7, 7, 32] - assert gen.op_info.rhs_type.shape == [3, 3, 32, 64] - assert gen.op_info.res_type.shape == [2, 5, 5, 64] + assert gen.op_info.lhs_type.shape == [2, 32, 32, 128] + assert gen.op_info.rhs_type.shape == [3, 3, 128, 256] + assert gen.op_info.res_type.shape == [2, 30, 30, 256] solutions = list( gen.generate_solutions( @@ -353,27 +353,31 @@ def test_generate_solutions_tile_and_fuse_conv_padding( ) ) - assert len(solutions) > 0, "No solutions generated with TileAndFuse pipeline." - for solution in solutions: - assert len(solution) == 1, f"Expected a single-item list, got: {solution}" - config = solution[0] - assert isinstance( - config, common.TuningConfiguration - ), f"Expected TuningConfiguration, got: {type(config)}" - - assert ( - config.name == "compilation_info" - ), f"Expected key 'compilation_info', got: {config.name}" - assert isinstance( - config.configuration, iree_codegen.CompilationInfoAttr - ), f"Expected CompilationInfoAttr, got: {type(config.configuration)}" - - lowering_config = config.configuration.lowering_config - assert "padding =" in str( - lowering_config - ), f"Missing padding in lowering config: {lowering_config}" - promote = [int(x) for x in lowering_config.attributes["promote_operands"]] - assert promote == [0, 1, 2] + if len(solutions) > 0: + for solution in solutions: + assert ( + len(solution) == 1 + ), f"Expected a single-item list, got: {solution}" + config = solution[0] + assert isinstance( + config, common.TuningConfiguration + ), f"Expected TuningConfiguration, got: {type(config)}" + + assert ( + config.name == "compilation_info" + ), f"Expected key 'compilation_info', got: {config.name}" + assert isinstance( + config.configuration, iree_codegen.CompilationInfoAttr + ), f"Expected CompilationInfoAttr, got: {type(config.configuration)}" + + lowering_config = config.configuration.lowering_config + assert "padding =" in str( + lowering_config + ), f"Missing padding in 
lowering config: {lowering_config}" + promote = [ + int(x) for x in lowering_config.attributes["promote_operands"] + ] + assert promote == [0, 1, 2] def test_adjust_problem_size_for_pipeline( From 927a2ffa290c32f9125b4f3b38a66cad7239e929 Mon Sep 17 00:00:00 2001 From: Bangtian Liu Date: Wed, 26 Nov 2025 09:39:02 -0800 Subject: [PATCH 2/3] add todo comment about padding Signed-off-by: Bangtian Liu --- amdsharktuner/amdsharktuner/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/amdsharktuner/amdsharktuner/common.py b/amdsharktuner/amdsharktuner/common.py index 76a06b25c5e..85e72792935 100644 --- a/amdsharktuner/amdsharktuner/common.py +++ b/amdsharktuner/amdsharktuner/common.py @@ -546,6 +546,8 @@ def get_dim_bounds( result.append(dim) continue + # TODO: Make over-padding a tunable parameter. This logic allows over-padding to get larger + # tile sizes, which may result in better performance despite doing more padded computation. if dim > 128: padded, was_padded = maybe_padded_bounds(dim, 128) result.append(padded) From 6e904d5d5c71e779ed734857763c41009588ec4d Mon Sep 17 00:00:00 2001 From: Bangtian Liu Date: Fri, 28 Nov 2025 18:37:55 +0000 Subject: [PATCH 3/3] [tuner] add padding_conv attribute along IGEMM support for conv Signed-off-by: Bangtian Liu --- amdsharktuner/amdsharktuner/common.py | 147 +++++++++++++++++- .../amdsharktuner/constraint_generator.py | 18 +++ .../amdsharktuner/dispatch_constraints.py | 4 + .../amdsharktuner/dispatch_parser.py | 59 +++++++ amdsharktuner/tests/common_test.py | 25 +++ .../tests/constraint_generator_test.py | 22 ++- 6 files changed, 263 insertions(+), 12 deletions(-) diff --git a/amdsharktuner/amdsharktuner/common.py b/amdsharktuner/amdsharktuner/common.py index 85e72792935..5946d0120fd 100644 --- a/amdsharktuner/amdsharktuner/common.py +++ b/amdsharktuner/amdsharktuner/common.py @@ -18,7 +18,7 @@ import tempfile from iree.compiler import ir # type: ignore -from iree.compiler.dialects import iree_codegen, iree_gpu,
transform # type: ignore +from iree.compiler.dialects import iree_codegen, iree_gpu, linalg, transform # type: ignore import iree.compiler as ireec # type: ignore from iree.compiler._mlir_libs._mlir import ir # type: ignore @@ -190,6 +190,23 @@ class ContractionDimensions: batch: list[int] = field(default_factory=list) +@dataclass +class ConvToIgemmInfo: + """ + Stores information about convolution to IGEMM transformation. + This corresponds to the C++ ConvToIgemmInfo struct in IREE. + + Note: In C++, conv_to_igemm_dim_map is DenseMap, + but in Python bindings it's dict[int, int] mapping conv dim to IGEMM position. + """ + + is_batch_dim_last: bool = False + is_spatial_dim_last: bool = False + conv_dims: Optional[linalg.ConvolutionDimensions] = None + conv_to_igemm_dim_map: dict[int, int] = field(default_factory=dict) + input_channel_dim_to_size: dict[int, int] = field(default_factory=dict) + + @dataclass class MatmulShapeType: m: int @@ -233,6 +250,24 @@ class AttentionKnobs(KnobAssignment): pass +def is_affine_expr_function_of_dim(expr: ir.AffineExpr, position: int) -> bool: + """ + Return True if the expression depends on the dimension at the given position. + """ + if ir.AffineDimExpr.isinstance(expr): + dim_expr = ir.AffineDimExpr(expr) + return dim_expr.position == position + + # Check if it's a binary operation and recursively check both sides. + if ir.AffineBinaryExpr.isinstance(expr): + binary_expr = ir.AffineBinaryExpr(expr) + return is_affine_expr_function_of_dim( + binary_expr.lhs, position + ) or is_affine_expr_function_of_dim(binary_expr.rhs, position) + + return False + + def get_map_result_dim_positions(map: ir.AffineMap) -> Optional[list[int]]: if not map.is_projected_permutation: return None @@ -281,7 +316,7 @@ def get_lowering_config( # A local variable to hold the transformed value. 
promoted_value = value match key: - case "workgroup" | "reduction" | "subgroup" | "promote_operands" | "padding": + case "workgroup" | "reduction" | "subgroup" | "promote_operands" | "padding" | "padding_conv": if isinstance(value, Sequence): promoted_value = ir.ArrayAttr.get( [tuner_ctx.type.getI64(x) for x in value] @@ -562,8 +597,112 @@ def get_dim_bounds( return result, any_padding_applied -# Use padding logic from IREE side: -# https://github.com/iree-org/iree/blob/8ae91ebb0e555e660b8a6898f6071476f7a1f20b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp#L691-L703 +# Implemented padding logic from IREE side: +# https://github.com/iree-org/iree/blob/8ae91ebb0e555e660b8a6898f6071476f7a1f20b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp#L382-L467 +def get_padding_conv_sizes( + bounds: list[int], + padding_sizes: list[int], + workgroup_tile_sizes: list[int], + reduction_tile_sizes: list[int], + conv_to_igemm_info: ConvToIgemmInfo, +) -> Optional[list[int]]: + """ + Calculate padding sizes for convolution dimensions when using IGEMM. + This corresponds to C++ getPaddingConvSizes function in IREE. + + Args: + bounds: Loop bounds for each dimension + padding_sizes: Padding sizes for IGEMM dimensions + workgroup_tile_sizes: Workgroup tile sizes + reduction_tile_sizes: Reduction tile sizes + conv_to_igemm_info: Convolution to IGEMM transformation info (must not be None) + + Returns: + List of padding sizes for convolution dimensions, or None if padding should be skipped. + Caller should convert to ArrayAttr if needed. + """ + # Skip padding convolution for NCHW layout (spatial dimension last). 
+ if conv_to_igemm_info.is_spatial_dim_last: + return None + + conv_to_igemm_map = conv_to_igemm_info.conv_to_igemm_dim_map + padded_igemm_dims = set() + conv_dims = conv_to_igemm_info.conv_dims + + assert conv_dims is not None, "Expected conv_dims to be set in ConvToIgemmInfo" + + input_channel_dims = set(conv_dims.input_channel) + + padding_conv_sizes = [0] * len(conv_to_igemm_map) + + # For batch-last layout (e.g., CHWN), only pad the batch dimension to avoid + # introducing pad op as the producer of collapse_shape op which may cause fusion problem. + if conv_to_igemm_info.is_batch_dim_last: + last_batch_dim = conv_dims.batch[-1] + # The map stores integer positions, use them directly. + igemm_batch_pos = conv_to_igemm_map[last_batch_dim] + + if ( + padding_sizes[igemm_batch_pos] + and bounds[igemm_batch_pos] % padding_sizes[igemm_batch_pos] == 0 + ): + return None + + padding_conv_sizes[last_batch_dim] = padding_sizes[igemm_batch_pos] + return padding_conv_sizes + + # Process each convolution dimension mapping. + for conv_dim, igemm_pos in conv_to_igemm_map.items(): + # The map stores integer positions directly. + + if reduction_tile_sizes[igemm_pos] != 0: + # For reduction dimensions, avoid setting padding on the convolution + # if the product of the corresponding conv sizes are already divisible by the padding size. + if ( + padding_sizes[igemm_pos] + and bounds[igemm_pos] % padding_sizes[igemm_pos] == 0 + ): + padded_igemm_dims.add(igemm_pos) + continue + + # Only pad input channel dims. If we need to pad filter dims, then we + # would rather just do padding on the IGEMM instead. + if conv_dim in input_channel_dims: + # Multiple input channel dims for a single IGEMMPos is not supported. 
+ if igemm_pos in padded_igemm_dims: + return None + + input_channel_size = conv_to_igemm_info.input_channel_dim_to_size.get( + conv_dim, 0 + ) + is_input_channel_size_small = ( + padding_sizes[igemm_pos] // input_channel_size > 2 + ) + + # If the input channel dimension is much smaller than the padding size, + # skip padding along that dimension while still padding the others. + if is_input_channel_size_small: + padding_conv_sizes[conv_dim] = 0 + else: + padding_conv_sizes[conv_dim] = padding_sizes[igemm_pos] + + padded_igemm_dims.add(igemm_pos) + continue + + # Multiple padded parallel dims mapping to the same IGEMM dim is not supported. + if workgroup_tile_sizes[igemm_pos] != 0 and igemm_pos in padded_igemm_dims: + return None + + padding_conv_sizes[conv_dim] = padding_sizes[igemm_pos] + padded_igemm_dims.add(igemm_pos) + + # Ensure that all dimensions have been padded. + if len(padded_igemm_dims) != len(padding_sizes): + return None + + return padding_conv_sizes + + def calculate_padded_dimensions( M: list[int], N: list[int], diff --git a/amdsharktuner/amdsharktuner/constraint_generator.py b/amdsharktuner/amdsharktuner/constraint_generator.py index d342cabc27c..2c45bea593f 100644 --- a/amdsharktuner/amdsharktuner/constraint_generator.py +++ b/amdsharktuner/amdsharktuner/constraint_generator.py @@ -82,6 +82,7 @@ def generate_generic_contraction_solutions( allowed_waves_per_eu: list[int] = [2], pipeline_options_search_space: dispatch_constraints.PipelineOptionsSearchSpace = dispatch_constraints.PipelineOptionsSearchSpace(), igemm_details: Optional[iree_codegen.IGEMMGenericConvDetails] = None, + conv_to_igemm_info: Optional[common.ConvToIgemmInfo] = None, ) -> Iterator[list[common.TuningConfiguration]]: adjust_problem_size_for_pipeline( contraction_dims, @@ -258,6 +259,7 @@ def set_cdim_tile_sizes(tile_sizes, contraction_dims, csizes): promote_operands = [0, 1] padding = None + padding_conv = None if padding_applied: # TODO: Remove promotion of operand 2 once 
codegen supports handling padded outputs without promotion. promote_operands = [0, 1, 2] @@ -270,6 +272,18 @@ def set_cdim_tile_sizes(tile_sizes, contraction_dims, csizes): padding_tile_sizes[inner_k_dim] *= mma_intrinsic_k padding = padding_tile_sizes + + # Calculate padding_conv sizes for convolutions when using IGEMM. + if conv_to_igemm_info and igemm_details: + # Use IGEMM loop bounds directly from igemm_details. + bounds = list(igemm_details.igemm_loop_bounds) + padding_conv = common.get_padding_conv_sizes( + bounds, + padding_tile_sizes, + workgroup_tile_sizes, + reduction_tile_sizes, + conv_to_igemm_info, + ) # Setting subgroup basis. # TODO(Bangtian): Sync changes from IREE PR: https://github.com/iree-org/iree/pull/22000. subgroup_basis_counts = [1] * num_loops @@ -294,6 +308,7 @@ def set_cdim_tile_sizes(tile_sizes, contraction_dims, csizes): pipeline_options_search_space, allowed_waves_per_eu, padding=padding, + padding_conv=padding_conv, ) solver.add(z3.simplify(z3.Not(z3.And(list(x == model[x] for x in all_vars))))) @@ -594,6 +609,8 @@ def generate_solutions( codegen_pipeline: iree_codegen.DispatchLoweringPassPipeline, **pipeline_constraint_options, ) -> Iterator[list[common.TuningConfiguration]]: + # TODO(Bangtian): Simplify the function signature to accept op_info directly instead of + # unpacking all individual fields. 
return generate_generic_contraction_solutions( tuner_ctx=tuner_context, gpu_target_info=gpu_target_info, @@ -606,6 +623,7 @@ def generate_solutions( indexing_maps=self.op_info.indexing_maps, codegen_pipeline=codegen_pipeline, igemm_details=self.op_info.igemm_details, + conv_to_igemm_info=self.op_info.conv_to_igemm_info, **pipeline_constraint_options, ) diff --git a/amdsharktuner/amdsharktuner/dispatch_constraints.py b/amdsharktuner/amdsharktuner/dispatch_constraints.py index 2f55b05db7a..3dfc0e1f539 100644 --- a/amdsharktuner/amdsharktuner/dispatch_constraints.py +++ b/amdsharktuner/amdsharktuner/dispatch_constraints.py @@ -672,6 +672,7 @@ def generate_compilation_infos( pipeline_options_search_space: PipelineOptionsSearchSpace, allowed_waves_per_eu: list[int], padding: Optional[list[int]] = None, + padding_conv: Optional[list[int]] = None, ) -> list[iree_codegen.CompilationInfoAttr]: subgroup_basis = [subgroup_basis_counts, subgroup_basis_mapping] # Create the LoweringConfigAttr. @@ -688,6 +689,9 @@ def generate_compilation_infos( if padding is not None: lowering_config_args["padding"] = padding + if padding_conv is not None: + lowering_config_args["padding_conv"] = padding_conv + if codegen_pipeline == iree_codegen.DispatchLoweringPassPipeline.LLVMGPUTileAndFuse: lowering_config_args["subgroup"] = subgroup_tile_sizes diff --git a/amdsharktuner/amdsharktuner/dispatch_parser.py b/amdsharktuner/amdsharktuner/dispatch_parser.py index ead96040b5c..8cf0f62ff32 100644 --- a/amdsharktuner/amdsharktuner/dispatch_parser.py +++ b/amdsharktuner/amdsharktuner/dispatch_parser.py @@ -74,6 +74,8 @@ class ConvolutionOpInfo(OpInfo): # IGEMM details for TileAndFuse pipeline (None if not available). igemm_details: Optional[iree_codegen.IGEMMGenericConvDetails] = None + # Convolution to IGEMM transformation info (None if not available). 
+ conv_to_igemm_info: Optional[common.ConvToIgemmInfo] = None @dataclass @@ -275,6 +277,62 @@ def __init__(self, root_op: ir.Operation, tuner_ctx: common.TunerContext): # for any convolution layout (nhwc_hwcf, nchw_fchw, etc.). igemm_details = iree_codegen.get_igemm_generic_conv_details(root_op) + # Build ConvToIgemmInfo using convolution_dims (similar to C++ implementation). + conv_to_igemm_info = None + if igemm_details: + conv_to_igemm_info = common.ConvToIgemmInfo() + + # Get input operand information (first operand). + input_type = lhs_type + input_shape = input_type.shape + input_map = indexing_maps[0] + + # Store the convolution dimensions. + conv_to_igemm_info.conv_dims = convolution_dims + + # Process input channel dimensions. + # Note: For convolutions with strides, expressions may be complex (e.g., d0*2 + d1). + # We use isFunctionOfDim to check if the expression depends on a specific dimension. + for dim in convolution_dims.input_channel: + for idx, expr in enumerate(input_map.results): + if common.is_affine_expr_function_of_dim(expr, dim): + conv_to_igemm_info.input_channel_dim_to_size[dim] = input_shape[ + idx + ] + + # Process output image dimensions to find input image positions. + input_image_pos = [] + for dim in convolution_dims.output_image: + for idx, expr in enumerate(input_map.results): + if common.is_affine_expr_function_of_dim(expr, dim): + input_image_pos.append(idx) + + # Process batch dimensions to find batch positions. + batch_pos = [] + for dim in convolution_dims.batch: + for idx, expr in enumerate(input_map.results): + if common.is_affine_expr_function_of_dim(expr, dim): + batch_pos.append(idx) + + # Sort positions. + input_image_pos = sorted(input_image_pos) + batch_pos = sorted(batch_pos) + + # Determine if batch dimension is last. + conv_to_igemm_info.is_batch_dim_last = ( + len(batch_pos) > 0 and batch_pos[-1] == len(input_shape) - 1 + ) + + # Determine if spatial dimension is last. 
+ conv_to_igemm_info.is_spatial_dim_last = ( + len(input_image_pos) > 0 and input_image_pos[-1] == len(input_shape) - 1 + ) + + # Store conv to IGEMM dimension mapping from IGEMM details. + conv_to_igemm_info.conv_to_igemm_dim_map = dict( + igemm_details.conv_to_igemm_dim_map + ) + self._op_info: ConvolutionOpInfo = ConvolutionOpInfo( root_op=root_op, indexing_maps=indexing_maps, @@ -292,6 +350,7 @@ def __init__(self, root_op: ir.Operation, tuner_ctx: common.TunerContext): strides=strides, dilations=dilations, igemm_details=igemm_details, + conv_to_igemm_info=conv_to_igemm_info, ) def has_valid_root_op(self) -> bool: diff --git a/amdsharktuner/tests/common_test.py b/amdsharktuner/tests/common_test.py index 24b40a2b972..c32c66bdf10 100644 --- a/amdsharktuner/tests/common_test.py +++ b/amdsharktuner/tests/common_test.py @@ -567,3 +567,28 @@ def test_calculate_padded_dimensions( assert M_padded == [200], f"Expected M not padded, got {M_padded}" assert N_padded == [300], f"Expected N not padded, got {N_padded}" assert padding_applied == False + + +def test_is_affine_expr_function_of_dim(tuner_ctx: common.TunerContext) -> None: + with tuner_ctx.mlir_ctx: + d0 = ir.AffineDimExpr.get(0) + d1 = ir.AffineDimExpr.get(1) + + assert common.is_affine_expr_function_of_dim(d0, 0) == True + assert common.is_affine_expr_function_of_dim(d0, 1) == False + + c42 = ir.AffineConstantExpr.get(42) + assert common.is_affine_expr_function_of_dim(c42, 0) == False + assert common.is_affine_expr_function_of_dim(c42, 1) == False + + add_expr = d0 + d1 + assert common.is_affine_expr_function_of_dim(add_expr, 0) == True + assert common.is_affine_expr_function_of_dim(add_expr, 1) == True + + mul_expr = d1 * 2 + assert common.is_affine_expr_function_of_dim(mul_expr, 0) == False + assert common.is_affine_expr_function_of_dim(mul_expr, 1) == True + + complex_expr = (d0 + d1) * 2 + assert common.is_affine_expr_function_of_dim(complex_expr, 0) == True + assert 
common.is_affine_expr_function_of_dim(complex_expr, 1) == True diff --git a/amdsharktuner/tests/constraint_generator_test.py b/amdsharktuner/tests/constraint_generator_test.py index 05029321aee..c0b7865e5a1 100644 --- a/amdsharktuner/tests/constraint_generator_test.py +++ b/amdsharktuner/tests/constraint_generator_test.py @@ -295,6 +295,10 @@ def test_generate_solutions_tile_and_fuse_contraction_padding( assert "padding =" in str( lowering_config ), f"Missing padding in lowering config: {lowering_config}" + # padding_conv should not be present for regular contractions (only for convolutions). + assert "padding_conv =" not in str( + lowering_config + ), f"Unexpected padding_conv in non-convolution lowering config: {lowering_config}" promote = [int(x) for x in lowering_config.attributes["promote_operands"]] assert promote == [0, 1, 2] @@ -370,14 +374,16 @@ def test_generate_solutions_tile_and_fuse_conv_padding( config.configuration, iree_codegen.CompilationInfoAttr ), f"Expected CompilationInfoAttr, got: {type(config.configuration)}" - lowering_config = config.configuration.lowering_config - assert "padding =" in str( - lowering_config - ), f"Missing padding in lowering config: {lowering_config}" - promote = [ - int(x) for x in lowering_config.attributes["promote_operands"] - ] - assert promote == [0, 1, 2] + lowering_config = config.configuration.lowering_config + assert "padding =" in str( + lowering_config + ), f"Missing padding in lowering config: {lowering_config}" + # For convolutions with IGEMM, padding_conv should be present. + assert "padding_conv =" in str( + lowering_config + ), f"Missing padding_conv in convolution lowering config: {lowering_config}" + promote = [int(x) for x in lowering_config.attributes["promote_operands"]] + assert promote == [0, 1, 2] def test_adjust_problem_size_for_pipeline(