iree-org · hanhanW · Mar 14, 2026 · Mar 12, 2026 · Mar 12, 2026 · Mar 13, 2026
@@ -1219,6 +1219,111 @@ func.func @map_store_f4_mask_depends_on_inner_index(
 
 // -----
 
+// inner_tiled on static tensors with a 3-D (parallel, parallel, reduction)
+// outer iteration space; the expectations below look for vector operands.
+#mma_maps = [affine_map<(m, n, k) -> (m, k)>,
+             affine_map<(m, n, k) -> (k, n)>,
+             affine_map<(m, n, k) -> (m, n)>]
+func.func @tensor_multi_mma(%lhs: tensor<2x3x4xf16>, %rhs: tensor<3x5x4xf16>, %acc: tensor<2x5x4xf32>) -> tensor<2x5x4xf32> {
+  %mma = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) {
+    indexing_maps = #mma_maps,
+    iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
+    kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+    semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
+  } : tensor<2x3x4xf16>, tensor<3x5x4xf16> into tensor<2x5x4xf32>
+  return %mma : tensor<2x5x4xf32>
+}
+
+//      CHECK-LABEL: func @tensor_multi_mma
+// CHECK-MASK-LABEL: func @tensor_multi_mma
+// CHECK-FOLD-LABEL: func @tensor_multi_mma
+// CHECK-GATHER-LABEL: func @tensor_multi_mma
+// CHECK-MAP-STORE-LABEL: func @tensor_multi_mma
+
+//   CHECK-DAG:   %[[CST:.+]] = arith.constant 0.000000e+00 : f16
+//   CHECK-DAG:   %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32
+//   CHECK-DAG:   %[[LHS:.+]] = vector.transfer_read %arg0[%c0, %c0, %c0], %[[CST]] {{.*}} : tensor<2x3x4xf16>, vector<2x3x4xf16>
+//   CHECK-DAG:   %[[RHS:.+]] = vector.transfer_read %arg1[%c0, %c0, %c0], %[[CST]] {{.*}} : tensor<3x5x4xf16>, vector<3x5x4xf16>
+//   CHECK-DAG:   %[[ACC:.+]] = vector.transfer_read %arg2[%c0, %c0, %c0], %[[CSTF32]] {{.*}} : tensor<2x5x4xf32>, vector<2x5x4xf32>
+//       CHECK:   %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]]) outs(%[[ACC]])
+//  CHECK-SAME:     : vector<2x3x4xf16>, vector<3x5x4xf16> into vector<2x5x4xf32>
+//       CHECK:   vector.transfer_write %[[MMA]], %arg2[%c0, %c0, %c0] {{.*}} : vector<2x5x4xf32>, tensor<2x5x4xf32>
+
+// -----
+
+// No outer iteration dimensions (empty maps and iterator_types): each operand
+// is a single rank-1 tensor.
+#mma_maps = [affine_map<() -> ()>,
+             affine_map<() -> ()>,
+             affine_map<() -> ()>]
+func.func @tensor_single_multi_mma(%lhs: tensor<4xf16>, %rhs: tensor<4xf16>, %acc: tensor<4xf32>) -> tensor<4xf32> {
+  %mma = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) {
+    indexing_maps = #mma_maps,
+    iterator_types = [],
+    kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+    semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
+  } : tensor<4xf16>, tensor<4xf16> into tensor<4xf32>
+  return %mma : tensor<4xf32>
+}
+
+// CHECK-LABEL: func @tensor_single_multi_mma
+
+//   CHECK-DAG:   %[[CST:.+]] = arith.constant 0.000000e+00 : f16
+//   CHECK-DAG:   %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32
+//   CHECK-DAG:   %[[LHS:.+]] = vector.transfer_read %arg0[%c0], %[[CST]] {in_bounds = [true]} : tensor<4xf16>, vector<4xf16>
+//   CHECK-DAG:   %[[RHS:.+]] = vector.transfer_read %arg1[%c0], %[[CST]] {in_bounds = [true]} : tensor<4xf16>, vector<4xf16>
+//   CHECK-DAG:   %[[ACC:.+]] = vector.transfer_read %arg2[%c0], %[[CSTF32]] {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
+//       CHECK:   %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]]) outs(%[[ACC]])
+//  CHECK-SAME:     : vector<4xf16>, vector<4xf16> into vector<4xf32>
+//       CHECK:   vector.transfer_write %[[MMA]], %arg2[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
+
+// -----
+
+// Scaled MMA: the third and fourth operands are the f8E8M0FNU scales for the
+// LHS and RHS; the outer iteration space has two reduction dimensions (k, b).
+#scaled_mma_maps = [affine_map<(i, j, k, b) -> (i, k, b)>,
+                    affine_map<(i, j, k, b) -> (k, b, j)>,
+                    affine_map<(i, j, k, b) -> (i, k)>,
+                    affine_map<(i, j, k, b) -> (k, j)>,
+                    affine_map<(i, j, k, b) -> (i, j)>]
+
+#scaled_mma_iterators = [
+  #linalg.iterator_type<parallel>,
+  #linalg.iterator_type<parallel>,
+  #linalg.iterator_type<reduction>,
+  #linalg.iterator_type<reduction>
+]
+
+func.func @scaled_tensor_multi_mma(%arg0: tensor<3x5x1x32xf4E2M1FN>, %arg1: tensor<5x1x7x32xf8E4M3FN>, %arg2: tensor<3x5x1xf8E8M0FNU>, %arg3: tensor<5x7x1xf8E8M0FNU>,
+  %arg4: tensor<3x7x4xf32>) -> tensor<3x7x4xf32> {
+  %mma = iree_codegen.inner_tiled ins(%arg0, %arg1, %arg2, %arg3) outs(%arg4) {
+    indexing_maps = #scaled_mma_maps,
+    iterator_types = #scaled_mma_iterators,
+    kind = #iree_gpu.scaled_mma_layout<intrinsic = MFMA_SCALE_F32_16x16x128_B32,
+      lhs_elem_type = f4E2M1FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>,
+    semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
+  } : tensor<3x5x1x32xf4E2M1FN>, tensor<5x1x7x32xf8E4M3FN>, tensor<3x5x1xf8E8M0FNU>, tensor<5x7x1xf8E8M0FNU>
+    into tensor<3x7x4xf32>
+  return %mma : tensor<3x7x4xf32>
+}
+
+// CHECK-LABEL: func @scaled_tensor_multi_mma
+
+//   CHECK-DAG:   %[[CSTFP4:.+]] = arith.constant 0.000000e+00 : f4E2M1FN
+//   CHECK-DAG:   %[[CSTFP8:.+]] = arith.constant 0.000000e+00 : f8E4M3FN
+//   CHECK-DAG:   %[[CSTSCALE:.+]] = arith.constant 5.877470e-39 : f8E8M0FNU
+//   CHECK-DAG:   %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32
+//   CHECK-DAG:   %[[LHS:.+]] = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %[[CSTFP4]] {{.*}} : tensor<3x5x1x32xf4E2M1FN>, vector<3x5x1x32xf4E2M1FN>
+//   CHECK-DAG:   %[[RHS:.+]] = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %[[CSTFP8]] {{.*}} : tensor<5x1x7x32xf8E4M3FN>, vector<5x1x7x32xf8E4M3FN>
+//   CHECK-DAG:   %[[LHS_SCALE:.+]] = vector.transfer_read %arg2[%c0, %c0, %c0], %[[CSTSCALE]] {{.*}} : tensor<3x5x1xf8E8M0FNU>, vector<3x5x1xf8E8M0FNU>
+//   CHECK-DAG:   %[[RHS_SCALE:.+]] = vector.transfer_read %arg3[%c0, %c0, %c0], %[[CSTSCALE]] {{.*}} : tensor<5x7x1xf8E8M0FNU>, vector<5x7x1xf8E8M0FNU>
+//   CHECK-DAG:   %[[ACC:.+]] = vector.transfer_read %arg4[%c0, %c0, %c0], %[[CSTF32]] {{.*}} : tensor<3x7x4xf32>, vector<3x7x4xf32>
+//       CHECK:   %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]], %[[LHS_SCALE]], %[[RHS_SCALE]]) outs(%[[ACC]])
+//  CHECK-SAME: : vector<3x5x1x32xf4E2M1FN>, vector<5x1x7x32xf8E4M3FN>, vector<3x5x1xf8E8M0FNU>, vector<5x7x1xf8E8M0FNU> into vector<3x7x4xf32>
+//       CHECK:   vector.transfer_write %[[MMA]], %arg4[%c0, %c0, %c0] {{.*}} : vector<3x7x4xf32>, tensor<3x7x4xf32>
+
+// -----
+
 func.func @implicit_gather_like_generic_stride_2(%arg0: tensor<1x1x31xf32>, %arg1: tensor<1x1x1x1x16xf32>) -> tensor<1x1x1x1x16xf32> {
   %0 = linalg.generic {
     indexing_maps = [

@@ -47,6 +47,7 @@ iree_td_library(
         "@llvm-project//mlir:SMTTdFiles",
         "@llvm-project//mlir:SideEffectInterfacesTdFiles",
         "@llvm-project//mlir:TilingInterfaceTdFiles",
+        "@llvm-project//mlir:ValueBoundsOpInterfaceTdFiles",
         "@llvm-project//mlir:VectorInterfacesTdFiles",
         "@llvm-project//mlir:ViewLikeInterfaceTdFiles",
     ],
@@ -125,6 +126,7 @@ iree_compiler_cc_library(
         "@llvm-project//mlir:TilingInterface",
         "@llvm-project//mlir:TransformDialect",
         "@llvm-project//mlir:TransformDialectTransforms",
+        "@llvm-project//mlir:ValueBoundsOpInterface",
         "@llvm-project//mlir:VectorInterfaces",
         "@llvm-project//mlir:ViewLikeInterface",
     ],

@@ -75,6 +75,7 @@ iree_cc_library(
     MLIRTilingInterface
     MLIRTransformDialect
     MLIRTransformDialectTransforms
+    MLIRValueBoundsOpInterface
     MLIRVectorInterfaces
     MLIRViewLikeInterface
     iree::compiler::Codegen::Dialect::PCF::IR

@@ -23,6 +23,7 @@
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Support/LLVM.h"
 
 // Custom parse/print helper for the knobs dictionary in constraints op.
@@ -422,6 +423,13 @@ std::optional<SmallVector<int64_t, 4>> InnerTiledOp::getShapeForUnroll() {
   return shape;
 }
 
+void InnerTiledOp::populateBoundsForShapedValueDim(
+    Value value, int64_t dim, ValueBoundsConstraintSet &cstr) {
+  // Constrain each result dim to equal the same dim of the DPS init at the
+  // result's index.
+  Value tiedInit = getDpsInits()[cast<OpResult>(value).getResultNumber()];
+  cstr.bound(value)[dim] == cstr.getExpr(tiedInit, dim);
+}
+
 //===----------------------------------------------------------------------===//
 // WorkgroupCountHintOp
 //===----------------------------------------------------------------------===//

@@ -21,6 +21,7 @@
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Interfaces/TilingInterface.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Interfaces/VectorInterfaces.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
 

@@ -22,6 +22,7 @@ include "mlir/Interfaces/TilingInterface.td"
 include "mlir/Interfaces/VectorInterfaces.td"
 include "mlir/Interfaces/ViewLikeInterface.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/ValueBoundsOpInterface.td"
 include "mlir/Dialect/SMT/IR/SMTTypes.td"
 
 def TensorTypeAttr : TypeAttrBase<"::mlir::TensorType", "Tensor type attribute">;
@@ -256,6 +257,8 @@ def IREECodegen_InnerTiledOp : Op<IREECodegen_Dialect, "inner_tiled", [
     AttrSizedOperandSegments,
     InferTypeOpAdaptor,
     DestinationStyleOpInterface,
+    DeclareOpInterfaceMethods<ValueBoundsOpInterface,
+       ["populateBoundsForShapedValueDim"]>,
     DeclareOpInterfaceMethods<VectorUnrollOpInterface, ["getShapeForUnroll"]>,
     DeclareOpInterfaceMethods<TilingInterface,
        ["getIterationDomain",

@@ -71,15 +71,6 @@ void transform_dialect::ApplyUnrollMultiMmaOp::populatePatterns(
   GPU::populateIREEGPUVectorUnrollPatterns(patterns);
 }
 
-//===---------------------------------------------------------------------===//
-// ApplyVectorizeIREEGPUOp
-//===---------------------------------------------------------------------===//
-
-void transform_dialect::ApplyVectorizeIREEGPUOp::populatePatterns(
-    RewritePatternSet &patterns) {
-  IREE::GPU::populateIREEGPUVectorizationPatterns(patterns);
-}
-
 //===---------------------------------------------------------------------===//
 // ConvertToMultiMmaOp
 //===---------------------------------------------------------------------===//

@@ -80,19 +80,6 @@ def ApplyUnrollMultiMmaOp : Op<Transform_Dialect,
   let assemblyFormat = "attr-dict";
 }
 
-def ApplyVectorizeIREEGPUOp : Op<Transform_Dialect,
-    "apply_patterns.iree.vectorize_iree_gpu",
-    [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>,
-     ReportTrackingListenerFailuresOpTrait]> {
-  let description = [{
-    Populate patterns to vectorize various iree_gpu ops. Expected to run
-    before or as a part of a larger vectorization pass.
-  }];
-
-  let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
-  let assemblyFormat = "attr-dict";
-}
-
 def ConvertToMultiMmaOp : Op<Transform_Dialect, "iree.convert_to_multi_mma",
     [TransformEachOpTrait,
      DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,

@@ -30,7 +30,6 @@ iree_lit_test_suite(
             "transform_fuse_forall.mlir",
             "transform_lower_barrier_region.mlir",
             "unroll_multi_mma.mlir",
-            "vectorize_iree_gpu_ops.mlir",
         ],
         include = ["*.mlir"],
     ),

@@ -25,7 +25,6 @@ iree_lit_test_suite(
     "transform_fuse_forall.mlir"
     "transform_lower_barrier_region.mlir"
     "unroll_multi_mma.mlir"
-    "vectorize_iree_gpu_ops.mlir"
   TOOLS
     FileCheck
     iree-opt

@@ -65,7 +65,6 @@ iree_compiler_cc_library(
         "Passes.cpp",
         "Transforms.cpp",
         "UnrollToIntrinsics.cpp",
-        "VectorizeIREEGPUOps.cpp",
     ],
     hdrs = [
         "Passes.h",

@@ -54,7 +54,6 @@ iree_cc_library(
     "Passes.cpp"
     "Transforms.cpp"
     "UnrollToIntrinsics.cpp"
-    "VectorizeIREEGPUOps.cpp"
   DEPS
     ::PassesIncGen
     LLVMSupport

@@ -55,14 +55,4 @@ def UnrollToIntrinsicsPass :
   ];
 }
 
-def VectorizeIREEGPUOpsPass :
-    InterfacePass<"iree-gpu-vectorize-ops", "mlir::FunctionOpInterface"> {
-  let summary = "Vectorizes then lowers a few iree_gpu ops before vectorization.";
-  let dependentDialects = [
-    "::mlir::vector::VectorDialect",
-    "::mlir::arith::ArithDialect",
-    "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect"
-  ];
-}
-
 #endif // IREE_CODEGEN_DIALECT_GPU_TRANSFORMS_PASSES