diff --git a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
index 6a3344b12bf8..a16b929bbefe 100644
--- a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir
@@ -1219,6 +1219,111 @@ func.func @map_store_f4_mask_depends_on_inner_index(
 
 // -----
 
+#contraction_accesses = [
+ affine_map<(i, j, k) -> (i, k)>,
+ affine_map<(i, j, k) -> (k, j)>,
+ affine_map<(i, j, k) -> (i, j)>
+]
+func.func @tensor_multi_mma(%lhs: tensor<2x3x4xf16>, %rhs: tensor<3x5x4xf16>, %acc: tensor<2x5x4xf32>) -> tensor<2x5x4xf32> {
+  %0 = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) {
+    indexing_maps = #contraction_accesses,
+    iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
+    kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+    semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
+  } : tensor<2x3x4xf16>, tensor<3x5x4xf16> into tensor<2x5x4xf32>
+  return %0 : tensor<2x5x4xf32>
+}
+
+//      CHECK-LABEL: func @tensor_multi_mma
+// CHECK-MASK-LABEL: func @tensor_multi_mma
+// CHECK-FOLD-LABEL: func @tensor_multi_mma
+// CHECK-GATHER-LABEL: func @tensor_multi_mma
+// CHECK-MAP-STORE-LABEL: func @tensor_multi_mma
+
+//   CHECK-DAG:   %[[CST:.+]] = arith.constant 0.000000e+00 : f16
+//   CHECK-DAG:   %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32
+//   CHECK-DAG:   %[[LHS:.+]] = vector.transfer_read %arg0[%c0, %c0, %c0], %[[CST]] {{.*}} : tensor<2x3x4xf16>, vector<2x3x4xf16>
+//   CHECK-DAG:   %[[RHS:.+]] = vector.transfer_read %arg1[%c0, %c0, %c0], %[[CST]] {{.*}} : tensor<3x5x4xf16>, vector<3x5x4xf16>
+//   CHECK-DAG:   %[[ACC:.+]] = vector.transfer_read %arg2[%c0, %c0, %c0], %[[CSTF32]] {{.*}} : tensor<2x5x4xf32>, vector<2x5x4xf32>
+//       CHECK:   %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]]) outs(%[[ACC]])
+//  CHECK-SAME:     : vector<2x3x4xf16>, vector<3x5x4xf16> into vector<2x5x4xf32>
+//       CHECK:   vector.transfer_write %[[MMA]], %arg2[%c0, %c0, %c0] {{.*}} : vector<2x5x4xf32>, tensor<2x5x4xf32>
+
+// -----
+
+#contraction_accesses = [
+ affine_map<() -> ()>,
+ affine_map<() -> ()>,
+ affine_map<() -> ()>
+]
+func.func @tensor_single_multi_mma(%lhs: tensor<4xf16>, %rhs: tensor<4xf16>, %acc: tensor<4xf32>) -> tensor<4xf32> {
+  %0 = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) {
+    indexing_maps = #contraction_accesses,
+    iterator_types = [],
+    kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
+    semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
+  } : tensor<4xf16>, tensor<4xf16> into tensor<4xf32>
+  return %0 : tensor<4xf32>
+}
+
+// CHECK-LABEL: func @tensor_single_multi_mma
+
+//   CHECK-DAG:   %[[CST:.+]] = arith.constant 0.000000e+00 : f16
+//   CHECK-DAG:   %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32
+//   CHECK-DAG:   %[[LHS:.+]] = vector.transfer_read %arg0[%c0], %[[CST]] {in_bounds = [true]} : tensor<4xf16>, vector<4xf16>
+//   CHECK-DAG:   %[[RHS:.+]] = vector.transfer_read %arg1[%c0], %[[CST]] {in_bounds = [true]} : tensor<4xf16>, vector<4xf16>
+//   CHECK-DAG:   %[[ACC:.+]] = vector.transfer_read %arg2[%c0], %[[CSTF32]] {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
+//       CHECK:   %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]]) outs(%[[ACC]])
+//  CHECK-SAME:     : vector<4xf16>, vector<4xf16> into vector<4xf32>
+//       CHECK:   vector.transfer_write %[[MMA]], %arg2[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
+
+// -----
+
+#contraction_accesses = [
+ affine_map<(i, j, k, b) -> (i, k, b)>,
+ affine_map<(i, j, k, b) -> (k, b, j)>,
+ affine_map<(i, j, k, b) -> (i, k)>,
+ affine_map<(i, j, k, b) -> (k, j)>,
+ affine_map<(i, j, k, b) -> (i, j)>
+]
+
+#iterator_types = [
+  #linalg.iterator_type<parallel>,
+  #linalg.iterator_type<parallel>,
+  #linalg.iterator_type<reduction>,
+  #linalg.iterator_type<reduction>
+]
+
+func.func @scaled_tensor_multi_mma(%arg0: tensor<3x5x1x32xf4E2M1FN>, %arg1: tensor<5x1x7x32xf8E4M3FN>, %arg2: tensor<3x5x1xf8E8M0FNU>, %arg3: tensor<5x7x1xf8E8M0FNU>,
+  %arg4: tensor<3x7x4xf32>) -> tensor<3x7x4xf32> {
+  %0 = iree_codegen.inner_tiled ins(%arg0, %arg1, %arg2, %arg3) outs(%arg4) {
+    indexing_maps = #contraction_accesses,
+    iterator_types = #iterator_types,
+    kind = #iree_gpu.scaled_mma_layout<intrinsic = MFMA_SCALE_F32_16x16x128_B32,
+      lhs_elem_type = f4E2M1FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>,
+    semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
+    } : tensor<3x5x1x32xf4E2M1FN>, tensor<5x1x7x32xf8E4M3FN>, tensor<3x5x1xf8E8M0FNU>, tensor<5x7x1xf8E8M0FNU>
+      into tensor<3x7x4xf32>
+  return %0 : tensor<3x7x4xf32>
+}
+
+// CHECK-LABEL: func @scaled_tensor_multi_mma
+
+//   CHECK-DAG:   %[[CSTFP4:.+]] = arith.constant 0.000000e+00 : f4E2M1FN
+//   CHECK-DAG:   %[[CSTFP8:.+]] = arith.constant 0.000000e+00 : f8E4M3FN
+//   CHECK-DAG:   %[[CSTSCALE:.+]] = arith.constant 5.877470e-39 : f8E8M0FNU
+//   CHECK-DAG:   %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32
+//   CHECK-DAG:   %[[LHS:.+]] = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %[[CSTFP4]] {{.*}} : tensor<3x5x1x32xf4E2M1FN>, vector<3x5x1x32xf4E2M1FN>
+//   CHECK-DAG:   %[[RHS:.+]] = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %[[CSTFP8]] {{.*}} : tensor<5x1x7x32xf8E4M3FN>, vector<5x1x7x32xf8E4M3FN>
+//   CHECK-DAG:   %[[LHS_SCALE:.+]] = vector.transfer_read %arg2[%c0, %c0, %c0], %[[CSTSCALE]] {{.*}} : tensor<3x5x1xf8E8M0FNU>, vector<3x5x1xf8E8M0FNU>
+//   CHECK-DAG:   %[[RHS_SCALE:.+]] = vector.transfer_read %arg3[%c0, %c0, %c0], %[[CSTSCALE]] {{.*}} : tensor<5x7x1xf8E8M0FNU>, vector<5x7x1xf8E8M0FNU>
+//   CHECK-DAG:   %[[ACC:.+]] = vector.transfer_read %arg4[%c0, %c0, %c0], %[[CSTF32]] {{.*}} : tensor<3x7x4xf32>, vector<3x7x4xf32>
+//       CHECK:   %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]], %[[LHS_SCALE]], %[[RHS_SCALE]]) outs(%[[ACC]])
+//  CHECK-SAME: : vector<3x5x1x32xf4E2M1FN>, vector<5x1x7x32xf8E4M3FN>, vector<3x5x1xf8E8M0FNU>, vector<5x7x1xf8E8M0FNU> into vector<3x7x4xf32>
+//       CHECK:   vector.transfer_write %[[MMA]], %arg4[%c0, %c0, %c0] {{.*}} : vector<3x7x4xf32>, tensor<3x7x4xf32>
+
+// -----
+
 func.func @implicit_gather_like_generic_stride_2(%arg0: tensor<1x1x31xf32>, %arg1: tensor<1x1x1x1x16xf32>) -> tensor<1x1x1x1x16xf32> {
   %0 = linalg.generic {
     indexing_maps = [
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel
index a502ee974f6c..936a1ca8622a 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel
@@ -47,6 +47,7 @@ iree_td_library(
         "@llvm-project//mlir:SMTTdFiles",
         "@llvm-project//mlir:SideEffectInterfacesTdFiles",
         "@llvm-project//mlir:TilingInterfaceTdFiles",
+        "@llvm-project//mlir:ValueBoundsOpInterfaceTdFiles",
         "@llvm-project//mlir:VectorInterfacesTdFiles",
         "@llvm-project//mlir:ViewLikeInterfaceTdFiles",
     ],
@@ -125,6 +126,7 @@ iree_compiler_cc_library(
         "@llvm-project//mlir:TilingInterface",
         "@llvm-project//mlir:TransformDialect",
         "@llvm-project//mlir:TransformDialectTransforms",
+        "@llvm-project//mlir:ValueBoundsOpInterface",
         "@llvm-project//mlir:VectorInterfaces",
         "@llvm-project//mlir:ViewLikeInterface",
     ],
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt
index ce709de10b83..a44c26f20f70 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt
@@ -75,6 +75,7 @@ iree_cc_library(
     MLIRTilingInterface
     MLIRTransformDialect
     MLIRTransformDialectTransforms
+    MLIRValueBoundsOpInterface
     MLIRVectorInterfaces
     MLIRViewLikeInterface
     iree::compiler::Codegen::Dialect::PCF::IR
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.cpp
index 316dbe496aba..24459f67aff5 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.cpp
@@ -23,6 +23,7 @@
 #include "mlir/IR/DialectImplementation.h"
 #include "mlir/IR/OpImplementation.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Support/LLVM.h"
 
 // Custom parse/print helper for the knobs dictionary in constraints op.
@@ -422,6 +423,13 @@ std::optional<SmallVector<int64_t, 4>> InnerTiledOp::getShapeForUnroll() {
   return shape;
 }
 
+void InnerTiledOp::populateBoundsForShapedValueDim(
+    Value value, int64_t dim, ValueBoundsConstraintSet &cstr) {
+  // Constrain each result dim to equal the same dim of its matching DPS init.
+  Value tiedInit = getDpsInits()[cast<OpResult>(value).getResultNumber()];
+  cstr.bound(value)[dim] == cstr.getExpr(tiedInit, dim);
+}
+
 //===----------------------------------------------------------------------===//
 // WorkgroupCountHintOp
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h
index 8ca29fbeddb8..d5c6b54d6124 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h
@@ -21,6 +21,7 @@
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Interfaces/TilingInterface.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Interfaces/VectorInterfaces.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.td b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.td
index 73b3a88f4b01..2b0d0a0c92aa 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.td
@@ -22,6 +22,7 @@ include "mlir/Interfaces/TilingInterface.td"
 include "mlir/Interfaces/VectorInterfaces.td"
 include "mlir/Interfaces/ViewLikeInterface.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
+include "mlir/Interfaces/ValueBoundsOpInterface.td"
 include "mlir/Dialect/SMT/IR/SMTTypes.td"
 
 def TensorTypeAttr : TypeAttrBase<"::mlir::TensorType", "Tensor type attribute">;
@@ -256,6 +257,8 @@ def IREECodegen_InnerTiledOp : Op<IREECodegen_Dialect, "inner_tiled", [
     AttrSizedOperandSegments,
     InferTypeOpAdaptor,
     DestinationStyleOpInterface,
+    DeclareOpInterfaceMethods<ValueBoundsOpInterface,
+       ["populateBoundsForShapedValueDim"]>,
     DeclareOpInterfaceMethods<VectorUnrollOpInterface, ["getShapeForUnroll"]>,
     DeclareOpInterfaceMethods<TilingInterface,
        ["getIterationDomain",
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensions.cpp
index 25e62addb191..3aa018f46022 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensions.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensions.cpp
@@ -71,15 +71,6 @@ void transform_dialect::ApplyUnrollMultiMmaOp::populatePatterns(
   GPU::populateIREEGPUVectorUnrollPatterns(patterns);
 }
 
-//===---------------------------------------------------------------------===//
-// ApplyVectorizeIREEGPUOp
-//===---------------------------------------------------------------------===//
-
-void transform_dialect::ApplyVectorizeIREEGPUOp::populatePatterns(
-    RewritePatternSet &patterns) {
-  IREE::GPU::populateIREEGPUVectorizationPatterns(patterns);
-}
-
 //===---------------------------------------------------------------------===//
 // ConvertToMultiMmaOp
 //===---------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensionsOps.td
index a3b9132d43d2..09f39e41aba0 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensionsOps.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/IREEGPUExtensionsOps.td
@@ -80,19 +80,6 @@ def ApplyUnrollMultiMmaOp : Op<Transform_Dialect,
   let assemblyFormat = "attr-dict";
 }
 
-def ApplyVectorizeIREEGPUOp : Op<Transform_Dialect,
-    "apply_patterns.iree.vectorize_iree_gpu",
-    [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>,
-     ReportTrackingListenerFailuresOpTrait]> {
-  let description = [{
-    Populate patterns to vectorize various iree_gpu ops. Expected to run
-    before or as a part of a larger vectorization pass.
-  }];
-
-  let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
-  let assemblyFormat = "attr-dict";
-}
-
 def ConvertToMultiMmaOp : Op<Transform_Dialect, "iree.convert_to_multi_mma",
     [TransformEachOpTrait,
      DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/BUILD.bazel
index 5cc241ac631f..2b9ca722ac70 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/BUILD.bazel
@@ -30,7 +30,6 @@ iree_lit_test_suite(
             "transform_fuse_forall.mlir",
             "transform_lower_barrier_region.mlir",
             "unroll_multi_mma.mlir",
-            "vectorize_iree_gpu_ops.mlir",
         ],
         include = ["*.mlir"],
     ),
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/CMakeLists.txt
index 5ee9c6b01f24..d061ee43202a 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/CMakeLists.txt
@@ -25,7 +25,6 @@ iree_lit_test_suite(
     "transform_fuse_forall.mlir"
     "transform_lower_barrier_region.mlir"
     "unroll_multi_mma.mlir"
-    "vectorize_iree_gpu_ops.mlir"
   TOOLS
     FileCheck
     iree-opt
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir
deleted file mode 100644
index f2b76fb480a8..000000000000
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir
+++ /dev/null
@@ -1,130 +0,0 @@
-// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s
-
-#contraction_accesses = [
- affine_map<(i, j, k) -> (i, k)>,
- affine_map<(i, j, k) -> (k, j)>,
- affine_map<(i, j, k) -> (i, j)>
-]
-func.func @tensor_multi_mma(%lhs: tensor<2x3x4xf16>, %rhs: tensor<3x5x4xf16>, %acc: tensor<2x5x4xf32>) -> tensor<2x5x4xf32> {
-  %0 = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) {
-    indexing_maps = #contraction_accesses,
-    iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
-    kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-    semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
-  } : tensor<2x3x4xf16>, tensor<3x5x4xf16> into tensor<2x5x4xf32>
-  return %0 : tensor<2x5x4xf32>
-}
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
-    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.vectorize_iree_gpu
-    } : !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: func @tensor_multi_mma
-
-//   CHECK-DAG:   %[[CST:.+]] = arith.constant 0.000000e+00 : f16
-//   CHECK-DAG:   %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32
-//   CHECK-DAG:   %[[LHS:.+]] = vector.transfer_read %arg0[%c0, %c0, %c0], %[[CST]] {{.*}} : tensor<2x3x4xf16>, vector<2x3x4xf16>
-//   CHECK-DAG:   %[[RHS:.+]] = vector.transfer_read %arg1[%c0, %c0, %c0], %[[CST]] {{.*}} : tensor<3x5x4xf16>, vector<3x5x4xf16>
-//   CHECK-DAG:   %[[ACC:.+]] = vector.transfer_read %arg2[%c0, %c0, %c0], %[[CSTF32]] {{.*}} : tensor<2x5x4xf32>, vector<2x5x4xf32>
-//       CHECK:   %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]]) outs(%[[ACC]])
-//  CHECK-SAME:     : vector<2x3x4xf16>, vector<3x5x4xf16> into vector<2x5x4xf32>
-//       CHECK:   vector.transfer_write %[[MMA]], %arg2[%c0, %c0, %c0] {{.*}} : vector<2x5x4xf32>, tensor<2x5x4xf32>
-
-// -----
-
-#contraction_accesses = [
- affine_map<() -> ()>,
- affine_map<() -> ()>,
- affine_map<() -> ()>
-]
-func.func @tensor_single_multi_mma(%lhs: tensor<4xf16>, %rhs: tensor<4xf16>, %acc: tensor<4xf32>) -> tensor<4xf32> {
-  %0 = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) {
-    indexing_maps = #contraction_accesses,
-    iterator_types = [],
-    kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
-    semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
-  } : tensor<4xf16>, tensor<4xf16> into tensor<4xf32>
-  return %0 : tensor<4xf32>
-}
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
-    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.vectorize_iree_gpu
-    } : !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: func @tensor_single_multi_mma
-
-//   CHECK-DAG:   %[[CST:.+]] = arith.constant 0.000000e+00 : f16
-//   CHECK-DAG:   %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32
-//   CHECK-DAG:   %[[LHS:.+]] = vector.transfer_read %arg0[%c0], %[[CST]] {in_bounds = [true]} : tensor<4xf16>, vector<4xf16>
-//   CHECK-DAG:   %[[RHS:.+]] = vector.transfer_read %arg1[%c0], %[[CST]] {in_bounds = [true]} : tensor<4xf16>, vector<4xf16>
-//   CHECK-DAG:   %[[ACC:.+]] = vector.transfer_read %arg2[%c0], %[[CSTF32]] {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
-//       CHECK:   %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]]) outs(%[[ACC]])
-//  CHECK-SAME:     : vector<4xf16>, vector<4xf16> into vector<4xf32>
-//       CHECK:   vector.transfer_write %[[MMA]], %arg2[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>
-
-// -----
-
-#contraction_accesses = [
- affine_map<(i, j, k, b) -> (i, k, b)>,
- affine_map<(i, j, k, b) -> (k, b, j)>,
- affine_map<(i, j, k, b) -> (i, k)>,
- affine_map<(i, j, k, b) -> (k, j)>,
- affine_map<(i, j, k, b) -> (i, j)>
-]
-
-#iterator_types = [
-  #linalg.iterator_type<parallel>,
-  #linalg.iterator_type<parallel>,
-  #linalg.iterator_type<reduction>,
-  #linalg.iterator_type<reduction>
-]
-
-func.func @scaled_tensor_multi_mma(%arg0: tensor<3x5x1x32xf4E2M1FN>, %arg1: tensor<5x1x7x32xf8E4M3FN>, %arg2: tensor<3x5x1xf8E8M0FNU>, %arg3: tensor<5x7x1xf8E8M0FNU>,
-  %arg4: tensor<3x7x4xf32>) -> tensor<3x7x4xf32> {
-  %0 = iree_codegen.inner_tiled ins(%arg0, %arg1, %arg2, %arg3) outs(%arg4) {
-    indexing_maps = #contraction_accesses,
-    iterator_types = #iterator_types,
-    kind = #iree_gpu.scaled_mma_layout<intrinsic = MFMA_SCALE_F32_16x16x128_B32,
-      lhs_elem_type = f4E2M1FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>,
-    semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
-    } : tensor<3x5x1x32xf4E2M1FN>, tensor<5x1x7x32xf8E4M3FN>, tensor<3x5x1xf8E8M0FNU>, tensor<5x7x1xf8E8M0FNU>
-      into tensor<3x7x4xf32>
-  return %0 : tensor<3x7x4xf32>
-}
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
-    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %func {
-      transform.apply_patterns.iree.vectorize_iree_gpu
-    } : !transform.any_op
-    transform.yield
-  }
-}
-
-// CHECK-LABEL: func @scaled_tensor_multi_mma
-
-//   CHECK-DAG:   %[[CSTFP4:.+]] = arith.constant 0.000000e+00 : f4E2M1FN
-//   CHECK-DAG:   %[[CSTFP8:.+]] = arith.constant 0.000000e+00 : f8E4M3FN
-//   CHECK-DAG:   %[[CSTSCALE:.+]] = arith.constant 5.877470e-39 : f8E8M0FNU
-//   CHECK-DAG:   %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32
-//   CHECK-DAG:   %[[LHS:.+]] = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %[[CSTFP4]] {{.*}} : tensor<3x5x1x32xf4E2M1FN>, vector<3x5x1x32xf4E2M1FN>
-//   CHECK-DAG:   %[[RHS:.+]] = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %[[CSTFP8]] {{.*}} : tensor<5x1x7x32xf8E4M3FN>, vector<5x1x7x32xf8E4M3FN>
-//   CHECK-DAG:   %[[LHS_SCALE:.+]] = vector.transfer_read %arg2[%c0, %c0, %c0], %[[CSTSCALE]] {{.*}} : tensor<3x5x1xf8E8M0FNU>, vector<3x5x1xf8E8M0FNU>
-//   CHECK-DAG:   %[[RHS_SCALE:.+]] = vector.transfer_read %arg3[%c0, %c0, %c0], %[[CSTSCALE]] {{.*}} : tensor<5x7x1xf8E8M0FNU>, vector<5x7x1xf8E8M0FNU>
-//   CHECK-DAG:   %[[ACC:.+]] = vector.transfer_read %arg4[%c0, %c0, %c0], %[[CSTF32]] {{.*}} : tensor<3x7x4xf32>, vector<3x7x4xf32>
-//       CHECK:   %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]], %[[LHS_SCALE]], %[[RHS_SCALE]]) outs(%[[ACC]])
-//  CHECK-SAME: : vector<3x5x1x32xf4E2M1FN>, vector<5x1x7x32xf8E4M3FN>, vector<3x5x1xf8E8M0FNU>, vector<5x7x1xf8E8M0FNU> into vector<3x7x4xf32>
-//       CHECK:   vector.transfer_write %[[MMA]], %arg4[%c0, %c0, %c0] {{.*}} : vector<3x7x4xf32>, tensor<3x7x4xf32>
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BUILD.bazel
index 45cad1a92e2e..f2a079799c70 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BUILD.bazel
@@ -65,7 +65,6 @@ iree_compiler_cc_library(
         "Passes.cpp",
         "Transforms.cpp",
         "UnrollToIntrinsics.cpp",
-        "VectorizeIREEGPUOps.cpp",
     ],
     hdrs = [
         "Passes.h",
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/CMakeLists.txt
index 58fa6fa875d3..6ec757579b21 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/CMakeLists.txt
@@ -54,7 +54,6 @@ iree_cc_library(
     "Passes.cpp"
     "Transforms.cpp"
     "UnrollToIntrinsics.cpp"
-    "VectorizeIREEGPUOps.cpp"
   DEPS
     ::PassesIncGen
     LLVMSupport
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td
index 1cc05023865c..e97e0b1bed67 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td
@@ -55,14 +55,4 @@ def UnrollToIntrinsicsPass :
   ];
 }
 
-def VectorizeIREEGPUOpsPass :
-    InterfacePass<"iree-gpu-vectorize-ops", "mlir::FunctionOpInterface"> {
-  let summary = "Vectorizes then lowers a few iree_gpu ops before vectorization.";
-  let dependentDialects = [
-    "::mlir::vector::VectorDialect",
-    "::mlir::arith::ArithDialect",
-    "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect"
-  ];
-}
-
 #endif // IREE_CODEGEN_DIALECT_GPU_TRANSFORMS_PASSES
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp
index ce293325abf8..9fa3eaf336f2 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp
@@ -1983,80 +1983,6 @@ void populateIREEGPULowerBarrierRegionPatterns(RewritePatternSet &patterns) {
   patterns.add<LowerBarrierRegion>(patterns.getContext());
 }
 
-//===---------------------------------------------------------------------===//
-// InnerTiledOp Vectorization
-//===---------------------------------------------------------------------===//
-
-static LogicalResult
-vectorizeStaticInnerTiledOp(RewriterBase &rewriter,
-                            IREE::Codegen::InnerTiledOp tiledOp) {
-  if (!tiledOp.hasTensorSemantics()) {
-    return failure();
-  }
-  SmallVector<ShapedType> argTypes = tiledOp.getOperandShapedTypes();
-  if (!llvm::all_of(argTypes, [](auto st) { return st.hasStaticShape(); })) {
-    return rewriter.notifyMatchFailure(tiledOp,
-                                       "non-static shape for vectorization");
-  }
-
-  OpBuilder::InsertionGuard g(rewriter);
-  rewriter.setInsertionPoint(tiledOp);
-
-  Location loc = tiledOp.getLoc();
-
-  // Construct the (never used) zero padding value for each operand.
-  SmallVector<Value> padValues =
-      llvm::map_to_vector(argTypes, [&](ShapedType argType) -> Value {
-        return arith::ConstantOp::create(
-            rewriter, loc, rewriter.getZeroAttr(argType.getElementType()));
-      });
-
-  SmallVector<Value> newOperands = tiledOp.getOperands();
-  for (auto [operand, type, padValue] :
-       llvm::zip_equal(newOperands, argTypes, padValues)) {
-    operand = vector::createReadOrMaskedRead(
-        rewriter, loc, operand, type.getShape(), padValue,
-        /*useInBoundsInsteadOfMasking=*/true);
-  }
-  auto newTiledOp = IREE::Codegen::InnerTiledOp::create(
-      rewriter, loc, ValueRange{newOperands}.take_front(tiledOp.getNumInputs()),
-      ValueRange{newOperands}.take_back(tiledOp.getNumOutputs()),
-      tiledOp.getIndexingMaps(), tiledOp.getIteratorTypes(), tiledOp.getKind(),
-      tiledOp.getSemantics());
-
-  auto zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
-  SmallVector<Value> transferWrites;
-  for (auto [result, tensorAcc] :
-       llvm::zip_equal(newTiledOp.getResults(), tiledOp.getOutputs())) {
-    // Create the write back to a tensor.
-    int64_t rank = cast<RankedTensorType>(tensorAcc.getType()).getRank();
-    auto write = vector::TransferWriteOp::create(
-        rewriter, loc,
-        /*vector=*/result,
-        /*source=*/tensorAcc,
-        /*indices=*/SmallVector<Value>(rank, zero),
-        /*inBounds=*/SmallVector<bool>(rank, true));
-    transferWrites.push_back(write.getResults().front());
-  }
-  rewriter.replaceOp(tiledOp, transferWrites);
-  return success();
-}
-
-namespace {
-struct VectorizeStaticInnerTiledOpPattern final
-    : OpRewritePattern<IREE::Codegen::InnerTiledOp> {
-  using Base::Base;
-  LogicalResult matchAndRewrite(IREE::Codegen::InnerTiledOp tiledOp,
-                                PatternRewriter &rewriter) const override {
-    return vectorizeStaticInnerTiledOp(rewriter, tiledOp);
-  }
-};
-} // namespace
-
-void populateIREEGPUVectorizationPatterns(RewritePatternSet &patterns) {
-  patterns.add<VectorizeStaticInnerTiledOpPattern>(patterns.getContext());
-}
-
 //===----------------------------------------------------------------------===//
 // VectorBarrierOp Lowering
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h
index dcdd11f4232a..bfd8fa163d66 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h
@@ -193,8 +193,6 @@ void populateIREEGPUVectorUnrollPatterns(
     RewritePatternSet &patterns, const vector::UnrollVectorOptions &options);
 // Version of unrolling with a preset configuration.
 void populateIREEGPUVectorUnrollPatterns(RewritePatternSet &patterns);
-void populateIREEGPUVectorizationPatterns(RewritePatternSet &patterns);
-
 // Populate patterns to fold tensor.empty ops through swizzle hint ops.
 void populateFoldSwizzleHintOpPatterns(RewritePatternSet &patterns);
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/VectorizeIREEGPUOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/VectorizeIREEGPUOps.cpp
deleted file mode 100644
index dc201a184222..000000000000
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/VectorizeIREEGPUOps.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2024 The IREE Authors
-//
-// Licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-
-#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
-#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.h"
-#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/Interfaces/FunctionInterfaces.h"
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-
-namespace mlir::iree_compiler::IREE::GPU {
-
-#define GEN_PASS_DEF_VECTORIZEIREEGPUOPSPASS
-#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.h.inc"
-
-namespace {
-struct VectorizeIREEGPUOpsPass final
-    : impl::VectorizeIREEGPUOpsPassBase<VectorizeIREEGPUOpsPass> {
-  void runOnOperation() override;
-};
-} // namespace
-
-void VectorizeIREEGPUOpsPass::runOnOperation() {
-  MLIRContext *context = &getContext();
-  RewritePatternSet patterns(context);
-  populateIREEGPUVectorizationPatterns(patterns);
-  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
-    return signalPassFailure();
-  }
-}
-
-} // namespace mlir::iree_compiler::IREE::GPU
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Interfaces/BUILD.bazel
index 27d1d818b132..2a3beac600be 100644
--- a/compiler/src/iree/compiler/Codegen/Interfaces/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/Interfaces/BUILD.bazel
@@ -232,6 +232,7 @@ iree_compiler_cc_library(
     ],
     deps = [
         ":VectorizableOpInterfaceGen",
+        "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect",
         "//compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR:IREEVectorExtDialect",
         "//compiler/src/iree/compiler/Dialect/LinalgExt/IR",
         "//compiler/src/iree/compiler/Utils",
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt
index 439bbb6c5ccb..ecf117dfa663 100644
--- a/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt
@@ -179,6 +179,7 @@ iree_cc_library(
     MLIRTensorDialect
     MLIRUBDialect
     MLIRVectorDialect
+    iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
     iree::compiler::Codegen::Dialect::VectorExt::IR::IREEVectorExtDialect
     iree::compiler::Dialect::LinalgExt::IR
     iree::compiler::Utils
diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/VectorizableOpInterface.cpp b/compiler/src/iree/compiler/Codegen/Interfaces/VectorizableOpInterface.cpp
index d0b457398be3..28ec62ad622f 100644
--- a/compiler/src/iree/compiler/Codegen/Interfaces/VectorizableOpInterface.cpp
+++ b/compiler/src/iree/compiler/Codegen/Interfaces/VectorizableOpInterface.cpp
@@ -6,6 +6,8 @@
 
 #include "iree/compiler/Codegen/Interfaces/VectorizableOpInterface.h"
 
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h"
+#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h"
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
 #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h"
 #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
@@ -1055,6 +1057,74 @@ struct PadOpVectorizationModel
   }
 };
 
+/// External model for IREE::Codegen::InnerTiledOp. Reads tensor operands into
+/// vectors, creates a vector-semantic InnerTiledOp, and writes results back.
+struct InnerTiledOpVectorizationModel
+    : VectorizableOpInterface::ExternalModel<InnerTiledOpVectorizationModel,
+                                             IREE::Codegen::InnerTiledOp> {
+  /// Accepts only tensor-semantic ops whose operand shapes are all static.
+  bool isVectorizable(Operation *op, ArrayRef<int64_t> vectorSizes,
+                      ArrayRef<bool> scalableDims,
+                      DictionaryAttr options) const {
+    auto tiledOp = cast<IREE::Codegen::InnerTiledOp>(op);
+    if (!tiledOp.hasTensorSemantics()) {
+      return false;
+    }
+    SmallVector<ShapedType> argTypes = tiledOp.getOperandShapedTypes();
+    return llvm::all_of(argTypes,
+                        [](ShapedType st) { return st.hasStaticShape(); });
+  }
+
+  FailureOr<SmallVector<Value>> vectorize(Operation *op, RewriterBase &rewriter,
+                                          ArrayRef<int64_t> vectorSizes,
+                                          ArrayRef<bool> scalableDims,
+                                          DictionaryAttr options) const {
+    auto tiledOp = cast<IREE::Codegen::InnerTiledOp>(op);
+    OpBuilder::InsertionGuard g(rewriter);
+    rewriter.setInsertionPoint(tiledOp);
+    Location loc = tiledOp.getLoc();
+
+    SmallVector<ShapedType> argTypes = tiledOp.getOperandShapedTypes();
+
+    // Build a zero constant per operand to serve as that read's padding value.
+    // Ideally the InnerTile interface would supply the padding value, with
+    // ub::Poison as the fallback when it does not. Zero is used here because
+    // the op was designed for matmul, where zero padding is the common case.
+    SmallVector<Value> padValues =
+        llvm::map_to_vector(argTypes, [&](ShapedType argType) -> Value {
+          return arith::ConstantOp::create(
+              rewriter, loc, rewriter.getZeroAttr(argType.getElementType()));
+        });
+    // zip yields references, so assigning `operand` updates newOperands[i].
+    SmallVector<Value> newOperands = tiledOp.getOperands();
+    for (auto [operand, type, padValue] :
+         llvm::zip_equal(newOperands, argTypes, padValues)) {
+      operand = vector::createReadOrMaskedRead(
+          rewriter, loc, operand, type.getShape(), padValue,
+          /*useInBoundsInsteadOfMasking=*/true);
+    }
+    auto newTiledOp = IREE::Codegen::InnerTiledOp::create(
+        rewriter, loc,
+        ValueRange{newOperands}.take_front(tiledOp.getNumInputs()),
+        ValueRange{newOperands}.take_back(tiledOp.getNumOutputs()),
+        tiledOp.getIndexingMaps(), tiledOp.getIteratorTypes(),
+        tiledOp.getKind(), tiledOp.getSemantics());
+    // Write every result vector back into its original tensor output.
+    auto zero = arith::ConstantIndexOp::create(rewriter, loc, 0);
+    SmallVector<Value> results;
+    for (auto [result, tensorAcc] :
+         llvm::zip_equal(newTiledOp.getResults(), tiledOp.getOutputs())) {
+      int64_t rank = cast<RankedTensorType>(tensorAcc.getType()).getRank();
+      auto write = vector::TransferWriteOp::create(
+          rewriter, loc, result, tensorAcc,
+          /*indices=*/SmallVector<Value>(rank, zero),
+          /*inBounds=*/SmallVector<bool>(rank, true));
+      results.push_back(write.getResults().front());
+    }
+    return results;
+  }
+};
+
 /// Registers the LinalgStructuredOpVectorizationModel for a single op type.
 template <typename OpTy>
 static void registerInterfaceForLinalgOps(MLIRContext *ctx) {
@@ -1087,6 +1157,12 @@ void registerVectorizableOpInterfaceExternalModels(DialectRegistry &registry) {
         *ctx);
   });
 
+  registry.addExtension(
+      +[](MLIRContext *ctx, IREE::Codegen::IREECodegenDialect *dialect) {
+        IREE::Codegen::InnerTiledOp::attachInterface<
+            InnerTiledOpVectorizationModel>(*ctx);
+      });
+
   // Upstream linalg ops.
 #define GET_OP_LIST
   registry.addExtension(+[](MLIRContext *ctx, linalg::LinalgDialect *dialect) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 8f54ed19faef..8584d2775761 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -273,7 +273,6 @@ static void addGPUVectorizationPasses(OpPassManager &funcPassManager,
   funcPassManager.addPass(IREE::LinalgExt::createDecomposeIm2colPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
-  funcPassManager.addPass(IREE::GPU::createVectorizeIREEGPUOpsPass());
   // Vectorize.
   GenericVectorizationPassOptions options;
   options.vectorizeCopies = vectorizeCopies;
@@ -559,8 +558,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
   funcPassManager.addPass(createGPUCombineValueSemanticBarriersPass());
 
-  // Step 6. Lower special ops and vectorize.
-  funcPassManager.addPass(IREE::GPU::createVectorizeIREEGPUOpsPass());
+  // Step 6. Vectorize.
   addGPUVectorizationPasses(funcPassManager, /*vectorizeCopies=*/false,
                             /*enableMasking=*/true,
                             /*foldIdentitySlices=*/true,