Skip to content

Commit d3c3f1d

Browse files
committed
[Codegen] Use DMA for LHS/RHS only in scaled matmul
* For now, remove the blanket guard that disabled DMA for all scaled matmuls.
* Use DMA (UseGlobalLoadDMAAttr) for LHS/RHS operands.
* Fix lowering of DMA copy.
1 parent 5c5a70d commit d3c3f1d

6 files changed

Lines changed: 173 additions & 29 deletions

File tree

compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -784,15 +784,7 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
784784
lhsScaleType,
785785
rhsScaleType};
786786

787-
// TODO(#22119): We don't use global load DMA for scaled matmuls, because
788-
// compilation doesn't support it. Once this is fixed, we should use global
789-
// load DMA here when possible.
790787
Location loc = operands[0].getLoc();
791-
if (scaled && useDirectLoad) {
792-
mlir::emitWarning(loc) << "direct load (global load DMA) is not yet "
793-
"supported for scaled matmuls, ignoring";
794-
useDirectLoad = false;
795-
}
796788

797789
// Accumulator needs shared memory if:
798790
// - Padding requires C promotion, OR
@@ -910,18 +902,28 @@ getMatmulOrIGEMMLoweringConfigAndWorkgroupSize(
910902
if (scaled) {
911903
promotionList.append({2, 3});
912904
auto defaultConfigAttr = IREE::GPU::DerivedThreadConfigAttr::get(context);
913-
// TODO(#23329): Do not swizzle shapes that have no bank conflicts.
914-
FailureOr<Attribute> lhsSwizzleAttr =
915-
getXorShuffleAttr(context, defaultConfigAttr, target, kind,
916-
schedule->kTileSizes, kMMAOperandLhs);
917-
FailureOr<Attribute> rhsSwizzleAttr =
918-
getXorShuffleAttr(context, defaultConfigAttr, target, kind,
919-
schedule->kTileSizes, kMMAOperandRhs);
920-
if (failed(lhsSwizzleAttr) || failed(rhsSwizzleAttr)) {
921-
promotionArray = {};
922-
} else {
923-
promotionArray = {*lhsSwizzleAttr, *rhsSwizzleAttr, defaultConfigAttr,
905+
if (useDirectLoad) {
906+
// Use DMA for LHS/RHS (operands 0,1) and thread-based copy for scale
907+
// operands (2,3). Scale operands use a different mapping level than DMA
908+
// copies, so mixing DMA for all operands would prevent loop fusion in
909+
// GPUFuseAndHoistParallelLoops (see #22119).
910+
Attribute useGlobalDma = IREE::GPU::UseGlobalLoadDMAAttr::get(context);
911+
promotionArray = {useGlobalDma, useGlobalDma, defaultConfigAttr,
924912
defaultConfigAttr};
913+
} else {
914+
// TODO(#23329): Do not swizzle shapes that have no bank conflicts.
915+
FailureOr<Attribute> lhsSwizzleAttr =
916+
getXorShuffleAttr(context, defaultConfigAttr, target, kind,
917+
schedule->kTileSizes, kMMAOperandLhs);
918+
FailureOr<Attribute> rhsSwizzleAttr =
919+
getXorShuffleAttr(context, defaultConfigAttr, target, kind,
920+
schedule->kTileSizes, kMMAOperandRhs);
921+
if (failed(lhsSwizzleAttr) || failed(rhsSwizzleAttr)) {
922+
promotionArray = {};
923+
} else {
924+
promotionArray = {*lhsSwizzleAttr, *rhsSwizzleAttr, defaultConfigAttr,
925+
defaultConfigAttr};
926+
}
925927
}
926928
}
927929
if ((!mustBeAligned || couldNeedPadding) && cPromoteIfPadding) {

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ iree_lit_test_suite(
3939
"pipeline_igemm_tile_and_fuse.mlir",
4040
"pipeline_igemm_tile_and_fuse_gfx950.mlir",
4141
"pipeline_lower_to_llvmgpu.mlir",
42+
"pipeline_scaled_matmul_dma.mlir",
4243
"pipeline_scaled_truncation_gfx950.mlir",
4344
"pipeline_tile_and_fuse.mlir",
4445
"pipeline_tile_and_fuse_gfx950.mlir",

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ iree_lit_test_suite(
3434
"pipeline_igemm_tile_and_fuse.mlir"
3535
"pipeline_igemm_tile_and_fuse_gfx950.mlir"
3636
"pipeline_lower_to_llvmgpu.mlir"
37+
"pipeline_scaled_matmul_dma.mlir"
3738
"pipeline_scaled_truncation_gfx950.mlir"
3839
"pipeline_tile_and_fuse.mlir"
3940
"pipeline_tile_and_fuse_gfx950.mlir"

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse_gfx950.mlir

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@
99
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" \
1010
// RUN: --remarks-filter=".*" %s 2>&1 | FileCheck %s --check-prefix=CHECK-REMARKS
1111

12+
// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx950 \
13+
// RUN: --iree-codegen-llvmgpu-use-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
14+
// RUN: --iree-codegen-llvmgpu-use-igemm=false --iree-llvmgpu-use-direct-load=true --iree-llvmgpu-prefetch-num-stages=2 \
15+
// RUN: --pass-pipeline="builtin.module(iree-llvmgpu-select-lowering-strategy)" %s \
16+
// RUN: | FileCheck %s --check-prefix=CHECK-DIRECT-LOAD
17+
1218
// RUN: iree-opt --mlir-print-local-scope --split-input-file --iree-gpu-test-target=gfx950 \
1319
// RUN: --iree-codegen-llvmgpu-use-tile-and-fuse-matmul=true --iree-codegen-llvmgpu-test-tile-and-fuse-vectorize=true \
1420
// RUN: --iree-codegen-llvmgpu-use-igemm=false --iree-llvmgpu-use-direct-load=true --iree-llvmgpu-prefetch-num-stages=2 \
@@ -53,17 +59,25 @@ func.func @scaled_matmul(
5359
// CHECK-SAME: subgroup = [4, 8, 0, 0]
5460
// CHECK-SAME: workgroup = [256, 256, 0, 0]
5561

62+
// With --iree-llvmgpu-use-direct-load, LHS/RHS get use_global_load_dma while
63+
// scales keep derived_thread_config.
64+
// CHECK-DIRECT-LOAD-LABEL: func.func @scaled_matmul
65+
// CHECK-DIRECT-LOAD: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
66+
// CHECK-DIRECT-LOAD-SAME: promotion_types = [#iree_gpu.use_global_load_dma, #iree_gpu.use_global_load_dma, #iree_gpu.derived_thread_config, #iree_gpu.derived_thread_config]
67+
5668
// CHECK-REMARKS: [Analysis] SharedMemoryUsage
5769
// CHECK-REMARKS-SAME: Category:deduceMMASchedule
5870
// CHECK-REMARKS-SAME: Remark=34816
5971

72+
// TODO(#22119): With direct-load, no cache swizzle on LHS/RHS so shared
73+
// memory increases. This needs to be addressed.
6074
// CHECK-REMARKS-DIRECT-LOAD-2: [Analysis] SharedMemoryUsage
6175
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Category:deduceMMASchedule
62-
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=34816
76+
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=69632
6377

6478
// CHECK-REMARKS-DIRECT-LOAD-3: [Analysis] SharedMemoryUsage
6579
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Category:deduceMMASchedule
66-
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=34816
80+
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=104448
6781

6882
// -----
6983

@@ -105,11 +119,11 @@ func.func @scaled_matmul_with_batch(
105119

106120
// CHECK-REMARKS-DIRECT-LOAD-2: [Analysis] SharedMemoryUsage
107121
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Category:deduceMMASchedule
108-
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=34816
122+
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=69632
109123

110124
// CHECK-REMARKS-DIRECT-LOAD-3: [Analysis] SharedMemoryUsage
111125
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Category:deduceMMASchedule
112-
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=34816
126+
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=104448
113127

114128
// -----
115129

@@ -179,11 +193,11 @@ func.func @scaled_matmul_with_dynamic_batch(
179193

180194
// CHECK-REMARKS-DIRECT-LOAD-2: [Analysis] SharedMemoryUsage
181195
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Category:deduceMMASchedule
182-
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=26112
196+
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=52224
183197

184198
// CHECK-REMARKS-DIRECT-LOAD-3: [Analysis] SharedMemoryUsage
185199
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Category:deduceMMASchedule
186-
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=26112
200+
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=78336
187201

188202
// -----
189203

@@ -225,11 +239,11 @@ func.func @small_scaled_matmul(
225239

226240
// CHECK-REMARKS-DIRECT-LOAD-2: [Analysis] SharedMemoryUsage
227241
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Category:deduceMMASchedule
228-
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=2176
242+
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=4352
229243

230244
// CHECK-REMARKS-DIRECT-LOAD-3: [Analysis] SharedMemoryUsage
231245
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Category:deduceMMASchedule
232-
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=2176
246+
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=6528
233247

234248
// -----
235249

@@ -346,11 +360,11 @@ func.func @scaled_matmul_accumulate(
346360

347361
// CHECK-REMARKS-DIRECT-LOAD-2: [Analysis] SharedMemoryUsage
348362
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Category:deduceMMASchedule
349-
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=157184
363+
// CHECK-REMARKS-DIRECT-LOAD-2-SAME: Remark=109056
350364

351365
// CHECK-REMARKS-DIRECT-LOAD-3: [Analysis] SharedMemoryUsage
352366
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Category:deduceMMASchedule
353-
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=157184
367+
// CHECK-REMARKS-DIRECT-LOAD-3-SAME: Remark=130816
354368

355369
// -----
356370

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx950 \
2+
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target{for-rocdl=true})))))" %s | FileCheck %s
3+
4+
// Test: Scaled matmul (f4E2M1FN * f4E2M1FN with f8E8M0FNU scales) compiles
5+
// through the full pipeline with DMA config. This validates that the pipeline
6+
// handles sub-byte types correctly, including the narrow type emulation for
7+
// gather_to_lds ops.
8+
9+
#pipeline_layout = #hal.pipeline.layout<bindings = [
10+
#hal.pipeline.binding<storage_buffer, ReadOnly>,
11+
#hal.pipeline.binding<storage_buffer, ReadOnly>,
12+
#hal.pipeline.binding<storage_buffer, ReadOnly>,
13+
#hal.pipeline.binding<storage_buffer, ReadOnly>,
14+
#hal.pipeline.binding<storage_buffer>
15+
]>
16+
#translation_info = #iree_codegen.translation_info<pipeline =
17+
LLVMGPUTileAndFuse
18+
workgroup_size = [512, 1, 1]
19+
subgroup_size = 64,
20+
{
21+
gpu_pipeline_options = #iree_gpu.pipeline_options<
22+
prefetch_num_stages = 2,
23+
no_reduce_shared_memory_bank_conflicts = true>
24+
}
25+
>
26+
#config = #iree_gpu.lowering_config<{
27+
mma_kind = #iree_gpu.scaled_mma_layout<
28+
intrinsic = MFMA_SCALE_F32_16x16x128_B32,
29+
lhs_elem_type = f4E2M1FN, rhs_elem_type = f4E2M1FN,
30+
acc_elem_type = f32>,
31+
promote_operands = [0, 1, 2, 3],
32+
promotion_types = [
33+
#iree_gpu.use_global_load_dma,
34+
#iree_gpu.use_global_load_dma,
35+
#iree_gpu.derived_thread_config,
36+
#iree_gpu.derived_thread_config],
37+
reduction = [0, 0, 1, 1],
38+
subgroup = [4, 8, 0, 0],
39+
workgroup = [256, 256, 0, 0]
40+
}>
41+
#lhs_map = affine_map<(M, N, Ko, Kb) -> (M, Ko, Kb)>
42+
#rhs_map = affine_map<(M, N, Ko, Kb) -> (N, Ko, Kb)>
43+
#scale_m = affine_map<(M, N, Ko, Kb) -> (M, Ko)>
44+
#scale_n = affine_map<(M, N, Ko, Kb) -> (N, Ko)>
45+
#out_map = affine_map<(M, N, Ko, Kb) -> (M, N)>
46+
hal.executable public @main {
47+
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
48+
hal.executable.export public @scaled_matmul_dma ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
49+
%x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
50+
hal.return %x, %y, %z : index, index, index
51+
}
52+
builtin.module {
53+
func.func @scaled_matmul_dma()
54+
attributes {translation_info = #translation_info} {
55+
%cst = arith.constant 0.000000e+00 : f32
56+
%c0 = arith.constant 0 : index
57+
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512x32xf4E2M1FN>>
58+
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512x32xf4E2M1FN>>
59+
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xf8E8M0FNU>>
60+
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xf8E8M0FNU>>
61+
%4 = hal.interface.binding.subspan layout(#pipeline_layout) binding(4) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
62+
%A = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1024, 512, 32], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512x32xf4E2M1FN>> -> tensor<1024x512x32xf4E2M1FN>
63+
%B = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0], sizes = [1024, 512, 32], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512x32xf4E2M1FN>> -> tensor<1024x512x32xf4E2M1FN>
64+
%A_scales = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xf8E8M0FNU>> -> tensor<1024x512xf8E8M0FNU>
65+
%B_scales = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x512xf8E8M0FNU>> -> tensor<1024x512xf8E8M0FNU>
66+
%empty = tensor.empty() : tensor<1024x1024xf32>
67+
%fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
68+
%result = linalg.generic {
69+
indexing_maps = [#lhs_map, #rhs_map, #scale_m, #scale_n, #out_map],
70+
iterator_types = ["parallel", "parallel", "reduction", "reduction"]
71+
} ins(%A, %B, %A_scales, %B_scales : tensor<1024x512x32xf4E2M1FN>, tensor<1024x512x32xf4E2M1FN>, tensor<1024x512xf8E8M0FNU>, tensor<1024x512xf8E8M0FNU>) outs(%fill : tensor<1024x1024xf32>) attrs = {lowering_config = #config} {
72+
^bb0(%a: f4E2M1FN, %b: f4E2M1FN, %a_scale: f8E8M0FNU, %b_scale: f8E8M0FNU, %out: f32):
73+
%s1 = arith.scaling_extf %a, %a_scale : f4E2M1FN, f8E8M0FNU to f32
74+
%s2 = arith.scaling_extf %b, %b_scale : f4E2M1FN, f8E8M0FNU to f32
75+
%m = arith.mulf %s1, %s2 : f32
76+
%r = arith.addf %out, %m : f32
77+
linalg.yield %r : f32
78+
} -> tensor<1024x1024xf32>
79+
iree_tensor_ext.dispatch.tensor.store %result, %4, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
80+
return
81+
}
82+
}
83+
}
84+
}
85+
86+
// Verify pipeline completes and produces scaled MFMA compute ops.
87+
// LHS/RHS are promoted to workgroup shared memory and scales use thread-based
88+
// copies. The compute uses 16x16x128 scaled MFMA instructions.
89+
90+
// CHECK-LABEL: func.func @scaled_matmul_dma
91+
// CHECK-DAG: memref.alloc() : memref<{{.*}}xf8E8M0FNU, #gpu.address_space<workgroup>>
92+
// CHECK-DAG: memref.alloc() : memref<{{.*}}xf4E2M1FN, #gpu.address_space<workgroup>>
93+
// CHECK: scf.forall
94+
// CHECK: scf.for
95+
// CHECK: amdgpu.scaled_mfma 16x16x128

tests/e2e/matmul/CMakeLists.txt

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2624,6 +2624,37 @@ iree_generated_e2e_runner_test(
26242624
"requires-gpu-cdna4"
26252625
)
26262626

2627+
iree_generated_e2e_runner_test(
2628+
NAME
2629+
e2e_matmul_cdna4_mxfp4_dma
2630+
TEST_TYPE
2631+
matmul
2632+
GENERATOR
2633+
"generate_e2e_matmul_tests.py"
2634+
GENERATOR_ARGS
2635+
"--lhs_rhs_type=f4E2M1FN"
2636+
"--acc_type=f32"
2637+
"--mx_scale_type=f8E8M0FNU"
2638+
"--mx_block_size=32"
2639+
"--shapes=easy_large_static"
2640+
"--transpose_rhs"
2641+
TEST_RUNNER
2642+
iree_tools_testing_e2e_iree-e2e-matmul-test
2643+
TARGET_BACKENDS
2644+
"rocm"
2645+
DRIVERS
2646+
"hip"
2647+
COMPILER_FLAGS
2648+
${IREE_HIP_TEST_COMPILER_FLAGS}
2649+
"--iree-llvmgpu-use-direct-load"
2650+
LABELS
2651+
"noasan"
2652+
"nomsan"
2653+
"notsan"
2654+
"noubsan"
2655+
"requires-gpu-cdna4"
2656+
)
2657+
26272658
endif()
26282659

26292660

0 commit comments

Comments
 (0)