diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp index 013ff0fea0d1..32117120e2fb 100644 --- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp @@ -830,42 +830,32 @@ struct GPUConvertToCoalescedDMAPass final FunctionOpInterface funcOp = getOperation(); MLIRContext *context = &getContext(); - // Pre-check: decide whether all linalg.copy ops should be DMA-converted. - // Only activate when at least one copy already has use_global_load_dma - // (indicating DMA intent from upstream config, e.g. --iree-llvmgpu-use- - // direct-load). Collect all promoted copies (use_global_load_dma or - // derived_thread_config). If ALL are DMA-convertible, upgrade them all to - // use_global_load_dma. If ANY fails, downgrade them all to - // derived_thread_config. + // Pre-check: verify that all copies marked with use_global_load_dma are + // actually DMA-convertible. If any DMA-marked copy fails the check, + // downgrade ALL DMA-marked copies to derived_thread_config. + // Copies already marked with derived_thread_config are left unchanged — + // they should not be upgraded to use_global_load_dma because they may + // have shapes (e.g. scale operands) that are too small for DMA after + // per-warp tiling, leading to incorrect thread distribution. // Note: GatherOps are excluded — they come from input IR (not from // GPUPromoteMatmulOperands) and are handled independently by // ConvertGatherToCoalescedDMA. - SmallVector promotedCopies; - bool hasDMAIntent = false; + SmallVector dmaCopies; funcOp->walk([&](linalg::CopyOp copyOp) { if (getLoweringConfig(copyOp)) { - hasDMAIntent = true; - promotedCopies.push_back(copyOp); - } else if (getLoweringConfig( - copyOp)) { - promotedCopies.push_back(copyOp); + dmaCopies.push_back(copyOp); } }); - if (hasDMAIntent) { - bool allConvertible = llvm::all_of(promotedCopies, isCopyDMAConvertible); - LLVM_DEBUG({ - if (!allConvertible) { - llvm::dbgs() << "DMA pre-check: not all copies convertible, " - << "downgrading " << promotedCopies.size() + if (!dmaCopies.empty()) { + bool allConvertible = llvm::all_of(dmaCopies, isCopyDMAConvertible); + if (!allConvertible) { + LLVM_DEBUG({ + llvm::dbgs() << "DMA pre-check: not all DMA copies convertible, " + << "downgrading " << dmaCopies.size() << " copies to derived_thread_config\n"; - } - }); - for (linalg::CopyOp copyOp : promotedCopies) { - if (allConvertible) { - setLoweringConfig(copyOp, - IREE::GPU::UseGlobalLoadDMAAttr::get(context)); - } else { + }); + for (linalg::CopyOp copyOp : dmaCopies) { setLoweringConfig(copyOp, IREE::GPU::DerivedThreadConfigAttr::get(context)); } diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp index 990233dce3d6..9bb674e03de3 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp @@ -784,15 +784,7 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize( lhsScaleType, rhsScaleType}; - // TODO(#22119): We don't use global load DMA for scaled matmuls, because - // compilation doesn't support it. Once this is fixed, we should use global - // load DMA here when possible. Location loc = operands[0].getLoc(); - if (scaled && useDirectLoad) { - mlir::emitWarning(loc) << "direct load (global load DMA) is not yet " - "supported for scaled matmuls, ignoring"; - useDirectLoad = false; - } // Accumulator needs shared memory if: // - Padding requires C promotion, OR @@ -910,18 +902,28 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize( if (scaled) { promotionList.append({2, 3}); auto defaultConfigAttr = IREE::GPU::DerivedThreadConfigAttr::get(context); - // TODO(#23329): Do not swizzle shapes that have no bank conflicts. - FailureOr lhsSwizzleAttr = - getXorShuffleAttr(context, defaultConfigAttr, target, kind, - schedule->kTileSizes, kMMAOperandLhs); - FailureOr rhsSwizzleAttr = - getXorShuffleAttr(context, defaultConfigAttr, target, kind, - schedule->kTileSizes, kMMAOperandRhs); - if (failed(lhsSwizzleAttr) || failed(rhsSwizzleAttr)) { - promotionArray = {}; - } else { - promotionArray = {*lhsSwizzleAttr, *rhsSwizzleAttr, defaultConfigAttr, + if (useDirectLoad) { + // Use DMA for LHS/RHS (operands 0,1) and thread-based copy for scale + // operands (2,3). Scale operands use a different mapping level than DMA + // copies, so mixing DMA for all operands would prevent loop fusion in + // GPUFuseAndHoistParallelLoops (see #22119). + Attribute useGlobalDma = IREE::GPU::UseGlobalLoadDMAAttr::get(context); + promotionArray = {useGlobalDma, useGlobalDma, defaultConfigAttr, defaultConfigAttr}; + } else { + // TODO(#23329): Do not swizzle shapes that have no bank conflicts. + FailureOr lhsSwizzleAttr = + getXorShuffleAttr(context, defaultConfigAttr, target, kind, + schedule->kTileSizes, kMMAOperandLhs); + FailureOr rhsSwizzleAttr = + getXorShuffleAttr(context, defaultConfigAttr, target, kind, + schedule->kTileSizes, kMMAOperandRhs); + if (failed(lhsSwizzleAttr) || failed(rhsSwizzleAttr)) { + promotionArray = {}; + } else { + promotionArray = {*lhsSwizzleAttr, *rhsSwizzleAttr, defaultConfigAttr, + defaultConfigAttr}; + } } } if ((!mustBeAligned || couldNeedPadding) && cPromoteIfPadding) { diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel index 9bacaa26e8b4..f0e63b12ae4e 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel @@ -39,6 +39,7 @@ iree_lit_test_suite( "pipeline_igemm_tile_and_fuse.mlir", "pipeline_igemm_tile_and_fuse_gfx950.mlir", "pipeline_lower_to_llvmgpu.mlir", + "pipeline_scaled_matmul_dma.mlir", "pipeline_scaled_truncation_gfx950.mlir", "pipeline_tile_and_fuse.mlir", "pipeline_tile_and_fuse_gfx950.mlir", diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt index 6a6ca51f5644..1636f7730deb 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt @@ -34,6 +34,7 @@ iree_lit_test_suite( "pipeline_igemm_tile_and_fuse.mlir" "pipeline_igemm_tile_and_fuse_gfx950.mlir" "pipeline_lower_to_llvmgpu.mlir" + "pipeline_scaled_matmul_dma.mlir" "pipeline_scaled_truncation_gfx950.mlir" "pipeline_tile_and_fuse.mlir" "pipeline_tile_and_fuse_gfx950.mlir" diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir index 5a1d9f74a9dc..87604a2e7415 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir @@ -9,6 +9,12 @@ // RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \ // RUN: --remarks-filter=".*" %s 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS +// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx950 \ +// RUN: --iree-codegen-llvmgpu-use-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \ +// RUN: --iree-codegen-llvmgpu-use-igemm=false --iree-llvmgpu-use-direct-load=true --iree-llvmgpu-prefetch-num-stages=2 \ +// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s \ +// RUN: | FileCheck %s --check-prefix=CHECK-DIRECT-LOAD + // RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx950 \ // RUN: --iree-codegen-llvmgpu-use-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \ // RUN: --iree-codegen-llvmgpu-use-igemm=false --iree-llvmgpu-use-direct-load=true --iree-llvmgpu-prefetch-num-stages=2 \ @@ -53,17 +59,25 @@ func.func @scaled_matmul( // CHECK-SAME: subgroup = [4, 8, 0, 0] // CHECK-SAME: workgroup = [256, 256, 0, 0] +// With --iree-llvmgpu-use-direct-load, LHS/RHS get use_global_load_dma while +// scales keep derived_thread_config. +// CHECK-DIRECT-LOAD-LABEL: func.func @scaled_matmul +// CHECK-DIRECT-LOAD: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config +// CHECK-DIRECT-LOAD-SAME: promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma, #iree_gpu.derived_thread_config, #iree_gpu.derived_thread_config] + // CHECK-REMARKS: [Analysis] SharedMemoryUsage // CHECK-REMARKS-SAME: Category:deduceMMASchedule // CHECK-REMARKS-SAME: Remark=34816 +// TODO(#22119): With direct-load, no cache swizzle on LHS/RHS so shared +// memory increases. This needs to be addressed. // CHECK-REMARKS-DIRECT-LOAD-2: [Analysis] SharedMemoryUsage // CHECK-REMARKS-DIRECT-LOAD-2-SAME: Category:deduceMMASchedule -// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=34816 +// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=69632 // CHECK-REMARKS-DIRECT-LOAD-3: [Analysis] SharedMemoryUsage // CHECK-REMARKS-DIRECT-LOAD-3-SAME: Category:deduceMMASchedule -// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=34816 +// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=104448 // ----- @@ -105,11 +119,11 @@ func.func @scaled_matmul_with_batch( // CHECK-REMARKS-DIRECT-LOAD-2: [Analysis] SharedMemoryUsage // CHECK-REMARKS-DIRECT-LOAD-2-SAME: Category:deduceMMASchedule -// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=34816 +// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=69632 // CHECK-REMARKS-DIRECT-LOAD-3: [Analysis] SharedMemoryUsage // CHECK-REMARKS-DIRECT-LOAD-3-SAME: Category:deduceMMASchedule -// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=34816 +// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=104448 // ----- @@ -179,11 +193,11 @@ func.func @scaled_matmul_with_dynamic_batch( // CHECK-REMARKS-DIRECT-LOAD-2: [Analysis] SharedMemoryUsage // CHECK-REMARKS-DIRECT-LOAD-2-SAME: Category:deduceMMASchedule -// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=26112 +// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=52224 // CHECK-REMARKS-DIRECT-LOAD-3: [Analysis] SharedMemoryUsage // CHECK-REMARKS-DIRECT-LOAD-3-SAME: Category:deduceMMASchedule -// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=26112 +// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=78336 // ----- @@ -225,11 +239,11 @@ func.func @small_scaled_matmul( // CHECK-REMARKS-DIRECT-LOAD-2: [Analysis] SharedMemoryUsage // CHECK-REMARKS-DIRECT-LOAD-2-SAME: Category:deduceMMASchedule -// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=2176 +// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=4352 // CHECK-REMARKS-DIRECT-LOAD-3: [Analysis] SharedMemoryUsage // CHECK-REMARKS-DIRECT-LOAD-3-SAME: Category:deduceMMASchedule -// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=2176 +// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=6528 // ----- @@ -346,11 +360,11 @@ func.func @scaled_matmul_accumulate( // CHECK-REMARKS-DIRECT-LOAD-2: [Analysis] SharedMemoryUsage // CHECK-REMARKS-DIRECT-LOAD-2-SAME: Category:deduceMMASchedule -// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=157184 +// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=109056 // CHECK-REMARKS-DIRECT-LOAD-3: [Analysis] SharedMemoryUsage // CHECK-REMARKS-DIRECT-LOAD-3-SAME: Category:deduceMMASchedule -// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=157184 +// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=130816 // ----- diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_scaled_matmul_dma.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_scaled_matmul_dma.mlir new file mode 100644 index 000000000000..cf5dfd68479c --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_scaled_matmul_dma.mlir @@ -0,0 +1,99 @@ +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx950 \ +// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target{for-rocdl=true})))))" %s | FileCheck %s + +// Test: Scaled matmul (f4E2M1FN * f4E2M1FN with f8E8M0FNU scales) compiles +// through the full pipeline with DMA config. This validates that the pipeline +// handles sub-byte types correctly, including the narrow type emulation for +// gather_to_lds ops. + +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +#translation_info = #iree_codegen.translation_info + } +> +#config = #iree_gpu.lowering_config<{ + mma_kind = #iree_gpu.scaled_mma_layout< + intrinsic = MFMA_SCALE_F32_16x16x128_B32, + lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN, + acc_elem_type = f32>, + promote_operands = [0, 1, 2, 3], + promotion_types = [ + #iree_gpu.use_global_load_dma, + #iree_gpu.use_global_load_dma, + #iree_gpu.derived_thread_config, + #iree_gpu.derived_thread_config], + reduction = [0, 0, 1, 1], + subgroup = [4, 8, 0, 0], + workgroup = [256, 256, 0, 0] +}> +#lhs_map = affine_map<(M, N, Ko, Kb) -> (M, Ko, Kb)> +#rhs_map = affine_map<(M, N, Ko, Kb) -> (N, Ko, Kb)> +#scale_m = affine_map<(M, N, Ko, Kb) -> (M, Ko)> +#scale_n = affine_map<(M, N, Ko, Kb) -> (N, Ko)> +#out_map = affine_map<(M, N, Ko, Kb) -> (M, N)> +hal.executable public @main { + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export public @scaled_matmul_dma ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) { + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice() + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @scaled_matmul_dma() + attributes {translation_info = #translation_info} { + %cst = arith.constant 0.000000e+00 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor> + %3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor> + %4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor> + %A = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 32], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor> -> tensor<1024x512x32xf4E2M1FN> + %B = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1024, 512, 32], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor> -> tensor<1024x512x32xf4E2M1FN> + %A_scales = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor> -> tensor<1024x512xf8E8M0FNU> + %B_scales = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor> -> tensor<1024x512xf8E8M0FNU> + %empty = tensor.empty() : tensor<1024x1024xf32> + %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + %result = linalg.generic { + indexing_maps = [#lhs_map, #rhs_map, #scale_m, #scale_n, #out_map], + iterator_types = ["parallel", "parallel", "reduction", "reduction"] + } ins(%A, %B, %A_scales, %B_scales : tensor<1024x512x32xf4E2M1FN>, tensor<1024x512x32xf4E2M1FN>, tensor<1024x512xf8E8M0FNU>, tensor<1024x512xf8E8M0FNU>) outs(%fill : tensor<1024x1024xf32>) attrs = {lowering_config = #config} { + ^bb0(%a: f4E2M1FN, %b: f4E2M1FN, %a_scale: f8E8M0FNU, %b_scale: f8E8M0FNU, %out: f32): + %s1 = arith.scaling_extf %a, %a_scale : f4E2M1FN, f8E8M0FNU to f32 + %s2 = arith.scaling_extf %b, %b_scale : f4E2M1FN, f8E8M0FNU to f32 + %m = arith.mulf %s1, %s2 : f32 + %r = arith.addf %out, %m : f32 + linalg.yield %r : f32 + } -> tensor<1024x1024xf32> + iree_tensor_ext.dispatch.tensor.store %result, %4, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !iree_tensor_ext.dispatch.tensor> + return + } + } + } +} + +// Verify pipeline completes and produces scaled MFMA compute ops. +// LHS/RHS are promoted to workgroup shared memory and scales use thread-based +// copies. The compute uses 16x16x128 scaled MFMA instructions. + +// CHECK-LABEL: func.func @scaled_matmul_dma +// CHECK-DAG: memref.alloc() : memref<{{.*}}xf8E8M0FNU, #gpu.address_space> +// CHECK-DAG: memref.alloc() : memref<{{.*}}xf4E2M1FN, #gpu.address_space> +// CHECK: scf.forall +// CHECK: scf.for +// TODO: The DMA config is set but the pipeline currently lowers LHS/RHS copies +// via vector.transfer_read/write instead of amdgpu.gather_to_lds. Once the DMA +// lowering path handles scaled matmul operands, add: +// COM: CHECK: amdgpu.gather_to_lds +// CHECK: amdgpu.scaled_mfma 16x16x128 diff --git a/tests/e2e/matmul/CMakeLists.txt b/tests/e2e/matmul/CMakeLists.txt index 97031f8f9a32..2f6dd6566d5b 100644 --- a/tests/e2e/matmul/CMakeLists.txt +++ b/tests/e2e/matmul/CMakeLists.txt @@ -2624,6 +2624,37 @@ iree_generated_e2e_runner_test( "requires-gpu-cdna4" ) +iree_generated_e2e_runner_test( + NAME + e2e_matmul_cdna4_mxfp4_dma + TEST_TYPE + matmul + GENERATOR + "generate_e2e_matmul_tests.py" + GENERATOR_ARGS + "--lhs_rhs_type=f4E2M1FN" + "--acc_type=f32" + "--mx_scale_type=f8E8M0FNU" + "--mx_block_size=32" + "--shapes=easy_large_static" + "--transpose_rhs" + TEST_RUNNER + iree_tools_testing_e2e_iree-e2e-matmul-test + TARGET_BACKENDS + "rocm" + DRIVERS + "hip" + COMPILER_FLAGS + ${IREE_HIP_TEST_COMPILER_FLAGS} + "--iree-llvmgpu-use-direct-load" + LABELS + "noasan" + "nomsan" + "notsan" + "noubsan" + "requires-gpu-cdna4" +) + endif()