diff --git a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir index 6a3344b12bf8..a16b929bbefe 100644 --- a/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/test/generic_vectorization.mlir @@ -1219,6 +1219,111 @@ func.func @map_store_f4_mask_depends_on_inner_index( // ----- +#contraction_accesses = [ + affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)> +] +func.func @tensor_multi_mma(%lhs: tensor<2x3x4xf16>, %rhs: tensor<3x5x4xf16>, %acc: tensor<2x5x4xf32>) -> tensor<2x5x4xf32> { + %0 = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) { + indexing_maps = #contraction_accesses, + iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type], + kind = #iree_gpu.mma_layout, + semantics = #iree_gpu.mma_semantics + } : tensor<2x3x4xf16>, tensor<3x5x4xf16> into tensor<2x5x4xf32> + return %0 : tensor<2x5x4xf32> +} + +// CHECK-LABEL: func @tensor_multi_mma +// CHECK-MASK-LABEL: func @tensor_multi_mma +// CHECK-FOLD-LABEL: func @tensor_multi_mma +// CHECK-GATHER-LABEL: func @tensor_multi_mma +// CHECK-MAP-STORE-LABEL: func @tensor_multi_mma + +// CHECK-DAG: %[[CST:.+]] = arith.constant 0.000000e+00 : f16 +// CHECK-DAG: %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[LHS:.+]] = vector.transfer_read %arg0[%c0, %c0, %c0], %[[CST]] {{.*}} : tensor<2x3x4xf16>, vector<2x3x4xf16> +// CHECK-DAG: %[[RHS:.+]] = vector.transfer_read %arg1[%c0, %c0, %c0], %[[CST]] {{.*}} : tensor<3x5x4xf16>, vector<3x5x4xf16> +// CHECK-DAG: %[[ACC:.+]] = vector.transfer_read %arg2[%c0, %c0, %c0], %[[CSTF32]] {{.*}} : tensor<2x5x4xf32>, vector<2x5x4xf32> +// CHECK: %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]]) outs(%[[ACC]]) +// CHECK-SAME: : vector<2x3x4xf16>, vector<3x5x4xf16> into vector<2x5x4xf32> +// CHECK: vector.transfer_write %[[MMA]], %arg2[%c0, %c0, %c0] {{.*}} : vector<2x5x4xf32>, tensor<2x5x4xf32> + +// ----- + +#contraction_accesses = [ + affine_map<() -> ()>, + affine_map<() -> ()>, + affine_map<() -> ()> +] +func.func @tensor_single_multi_mma(%lhs: tensor<4xf16>, %rhs: tensor<4xf16>, %acc: tensor<4xf32>) -> tensor<4xf32> { + %0 = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) { + indexing_maps = #contraction_accesses, + iterator_types = [], + kind = #iree_gpu.mma_layout, + semantics = #iree_gpu.mma_semantics + } : tensor<4xf16>, tensor<4xf16> into tensor<4xf32> + return %0 : tensor<4xf32> +} + +// CHECK-LABEL: func @tensor_single_multi_mma + +// CHECK-DAG: %[[CST:.+]] = arith.constant 0.000000e+00 : f16 +// CHECK-DAG: %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[LHS:.+]] = vector.transfer_read %arg0[%c0], %[[CST]] {in_bounds = [true]} : tensor<4xf16>, vector<4xf16> +// CHECK-DAG: %[[RHS:.+]] = vector.transfer_read %arg1[%c0], %[[CST]] {in_bounds = [true]} : tensor<4xf16>, vector<4xf16> +// CHECK-DAG: %[[ACC:.+]] = vector.transfer_read %arg2[%c0], %[[CSTF32]] {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> +// CHECK: %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]]) outs(%[[ACC]]) +// CHECK-SAME: : vector<4xf16>, vector<4xf16> into vector<4xf32> +// CHECK: vector.transfer_write %[[MMA]], %arg2[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32> + +// ----- + +#contraction_accesses = [ + affine_map<(i, j, k, b) -> (i, k, b)>, + affine_map<(i, j, k, b) -> (k, b, j)>, + affine_map<(i, j, k, b) -> (i, k)>, + affine_map<(i, j, k, b) -> (k, j)>, + affine_map<(i, j, k, b) -> (i, j)> +] + +#iterator_types = [ + #linalg.iterator_type, + #linalg.iterator_type, + #linalg.iterator_type, + #linalg.iterator_type +] + +func.func @scaled_tensor_multi_mma(%arg0: tensor<3x5x1x32xf4E2M1FN>, %arg1: tensor<5x1x7x32xf8E4M3FN>, %arg2: tensor<3x5x1xf8E8M0FNU>, %arg3: tensor<5x7x1xf8E8M0FNU>, + %arg4: tensor<3x7x4xf32>) -> tensor<3x7x4xf32> { + %0 = iree_codegen.inner_tiled ins(%arg0, %arg1, %arg2, %arg3) outs(%arg4) { + indexing_maps = #contraction_accesses, + iterator_types = #iterator_types, + kind = #iree_gpu.scaled_mma_layout, + semantics = #iree_gpu.mma_semantics + } : tensor<3x5x1x32xf4E2M1FN>, tensor<5x1x7x32xf8E4M3FN>, tensor<3x5x1xf8E8M0FNU>, tensor<5x7x1xf8E8M0FNU> + into tensor<3x7x4xf32> + return %0 : tensor<3x7x4xf32> +} + +// CHECK-LABEL: func @scaled_tensor_multi_mma + +// CHECK-DAG: %[[CSTFP4:.+]] = arith.constant 0.000000e+00 : f4E2M1FN +// CHECK-DAG: %[[CSTFP8:.+]] = arith.constant 0.000000e+00 : f8E4M3FN +// CHECK-DAG: %[[CSTSCALE:.+]] = arith.constant 5.877470e-39 : f8E8M0FNU +// CHECK-DAG: %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[LHS:.+]] = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %[[CSTFP4]] {{.*}} : tensor<3x5x1x32xf4E2M1FN>, vector<3x5x1x32xf4E2M1FN> +// CHECK-DAG: %[[RHS:.+]] = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %[[CSTFP8]] {{.*}} : tensor<5x1x7x32xf8E4M3FN>, vector<5x1x7x32xf8E4M3FN> +// CHECK-DAG: %[[LHS_SCALE:.+]] = vector.transfer_read %arg2[%c0, %c0, %c0], %[[CSTSCALE]] {{.*}} : tensor<3x5x1xf8E8M0FNU>, vector<3x5x1xf8E8M0FNU> +// CHECK-DAG: %[[RHS_SCALE:.+]] = vector.transfer_read %arg3[%c0, %c0, %c0], %[[CSTSCALE]] {{.*}} : tensor<5x7x1xf8E8M0FNU>, vector<5x7x1xf8E8M0FNU> +// CHECK-DAG: %[[ACC:.+]] = vector.transfer_read %arg4[%c0, %c0, %c0], %[[CSTF32]] {{.*}} : tensor<3x7x4xf32>, vector<3x7x4xf32> +// CHECK: %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]], %[[LHS_SCALE]], %[[RHS_SCALE]]) outs(%[[ACC]]) +// CHECK-SAME: : vector<3x5x1x32xf4E2M1FN>, vector<5x1x7x32xf8E4M3FN>, vector<3x5x1xf8E8M0FNU>, vector<5x7x1xf8E8M0FNU> into vector<3x7x4xf32> +// CHECK: vector.transfer_write %[[MMA]], %arg4[%c0, %c0, %c0] {{.*}} : vector<3x7x4xf32>, tensor<3x7x4xf32> + +// ----- + func.func @implicit_gather_like_generic_stride_2(%arg0: tensor<1x1x31xf32>, %arg1: tensor<1x1x1x1x16xf32>) -> tensor<1x1x1x1x16xf32> { %0 = linalg.generic { indexing_maps = [ diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel index a502ee974f6c..936a1ca8622a 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/BUILD.bazel @@ -47,6 +47,7 @@ iree_td_library( "@llvm-project//mlir:SMTTdFiles", "@llvm-project//mlir:SideEffectInterfacesTdFiles", "@llvm-project//mlir:TilingInterfaceTdFiles", + "@llvm-project//mlir:ValueBoundsOpInterfaceTdFiles", "@llvm-project//mlir:VectorInterfacesTdFiles", "@llvm-project//mlir:ViewLikeInterfaceTdFiles", ], @@ -125,6 +126,7 @@ iree_compiler_cc_library( "@llvm-project//mlir:TilingInterface", "@llvm-project//mlir:TransformDialect", "@llvm-project//mlir:TransformDialectTransforms", + "@llvm-project//mlir:ValueBoundsOpInterface", "@llvm-project//mlir:VectorInterfaces", "@llvm-project//mlir:ViewLikeInterface", ], diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt index ce709de10b83..a44c26f20f70 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/CMakeLists.txt @@ -75,6 +75,7 @@ iree_cc_library( MLIRTilingInterface MLIRTransformDialect MLIRTransformDialectTransforms + MLIRValueBoundsOpInterface MLIRVectorInterfaces MLIRViewLikeInterface iree::compiler::Codegen::Dialect::PCF::IR diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.cpp index 316dbe496aba..24459f67aff5 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.cpp @@ -23,6 +23,7 @@ #include "mlir/IR/DialectImplementation.h" #include "mlir/IR/OpImplementation.h" #include "mlir/Interfaces/FunctionInterfaces.h" +#include "mlir/Interfaces/ValueBoundsOpInterface.h" #include "mlir/Support/LLVM.h" // Custom parse/print helper for the knobs dictionary in constraints op. @@ -422,6 +423,13 @@ std::optional> InnerTiledOp::getShapeForUnroll() { return shape; } +void InnerTiledOp::populateBoundsForShapedValueDim( + Value value, int64_t dim, ValueBoundsConstraintSet &cstr) { + // Result shapes equal the corresponding DPS init shapes. + auto resultIdx = cast(value).getResultNumber(); + cstr.bound(value)[dim] == cstr.getExpr(getDpsInits()[resultIdx], dim); +} + //===----------------------------------------------------------------------===// // WorkgroupCountHintOp //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h index 8ca29fbeddb8..d5c6b54d6124 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h @@ -21,6 +21,7 @@ #include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Interfaces/TilingInterface.h" +#include "mlir/Interfaces/ValueBoundsOpInterface.h" #include "mlir/Interfaces/VectorInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.td b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.td index 73b3a88f4b01..2b0d0a0c92aa 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.td @@ -22,6 +22,7 @@ include "mlir/Interfaces/TilingInterface.td" include "mlir/Interfaces/VectorInterfaces.td" include "mlir/Interfaces/ViewLikeInterface.td" include "mlir/Interfaces/InferTypeOpInterface.td" +include "mlir/Interfaces/ValueBoundsOpInterface.td" include "mlir/Dialect/SMT/IR/SMTTypes.td" def TensorTypeAttr : TypeAttrBase<"::mlir::TensorType", "Tensor type attribute">; @@ -256,6 +257,8 @@ def IREECodegen_InnerTiledOp : Op, DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, - ReportTrackingListenerFailuresOpTrait]> { - let description = [{ - Populate patterns to vectorize various iree_gpu ops. Expected to run - before or as a part of a larger vectorization pass. - }]; - - let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect"; - let assemblyFormat = "attr-dict"; -} - def ConvertToMultiMmaOp : Op, diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/BUILD.bazel index 5cc241ac631f..2b9ca722ac70 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/BUILD.bazel @@ -30,7 +30,6 @@ iree_lit_test_suite( "transform_fuse_forall.mlir", "transform_lower_barrier_region.mlir", "unroll_multi_mma.mlir", - "vectorize_iree_gpu_ops.mlir", ], include = ["*.mlir"], ), diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/CMakeLists.txt index 5ee9c6b01f24..d061ee43202a 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/CMakeLists.txt @@ -25,7 +25,6 @@ iree_lit_test_suite( "transform_fuse_forall.mlir" "transform_lower_barrier_region.mlir" "unroll_multi_mma.mlir" - "vectorize_iree_gpu_ops.mlir" TOOLS FileCheck iree-opt diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir deleted file mode 100644 index f2b76fb480a8..000000000000 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/vectorize_iree_gpu_ops.mlir +++ /dev/null @@ -1,130 +0,0 @@ -// RUN: iree-opt %s -iree-transform-dialect-interpreter -transform-dialect-drop-schedule --split-input-file | FileCheck %s - -#contraction_accesses = [ - affine_map<(i, j, k) -> (i, k)>, - affine_map<(i, j, k) -> (k, j)>, - affine_map<(i, j, k) -> (i, j)> -] -func.func @tensor_multi_mma(%lhs: tensor<2x3x4xf16>, %rhs: tensor<3x5x4xf16>, %acc: tensor<2x5x4xf32>) -> tensor<2x5x4xf32> { - %0 = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) { - indexing_maps = #contraction_accesses, - iterator_types = [#linalg.iterator_type, #linalg.iterator_type, #linalg.iterator_type], - kind = #iree_gpu.mma_layout, - semantics = #iree_gpu.mma_semantics - } : tensor<2x3x4xf16>, tensor<3x5x4xf16> into tensor<2x5x4xf32> - return %0 : tensor<2x5x4xf32> -} - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) { - %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.vectorize_iree_gpu - } : !transform.any_op - transform.yield - } -} - -// CHECK-LABEL: func @tensor_multi_mma - -// CHECK-DAG: %[[CST:.+]] = arith.constant 0.000000e+00 : f16 -// CHECK-DAG: %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32 -// CHECK-DAG: %[[LHS:.+]] = vector.transfer_read %arg0[%c0, %c0, %c0], %[[CST]] {{.*}} : tensor<2x3x4xf16>, vector<2x3x4xf16> -// CHECK-DAG: %[[RHS:.+]] = vector.transfer_read %arg1[%c0, %c0, %c0], %[[CST]] {{.*}} : tensor<3x5x4xf16>, vector<3x5x4xf16> -// CHECK-DAG: %[[ACC:.+]] = vector.transfer_read %arg2[%c0, %c0, %c0], %[[CSTF32]] {{.*}} : tensor<2x5x4xf32>, vector<2x5x4xf32> -// CHECK: %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]]) outs(%[[ACC]]) -// CHECK-SAME: : vector<2x3x4xf16>, vector<3x5x4xf16> into vector<2x5x4xf32> -// CHECK: vector.transfer_write %[[MMA]], %arg2[%c0, %c0, %c0] {{.*}} : vector<2x5x4xf32>, tensor<2x5x4xf32> - -// ----- - -#contraction_accesses = [ - affine_map<() -> ()>, - affine_map<() -> ()>, - affine_map<() -> ()> -] -func.func @tensor_single_multi_mma(%lhs: tensor<4xf16>, %rhs: tensor<4xf16>, %acc: tensor<4xf32>) -> tensor<4xf32> { - %0 = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) { - indexing_maps = #contraction_accesses, - iterator_types = [], - kind = #iree_gpu.mma_layout, - semantics = #iree_gpu.mma_semantics - } : tensor<4xf16>, tensor<4xf16> into tensor<4xf32> - return %0 : tensor<4xf32> -} - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) { - %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.vectorize_iree_gpu - } : !transform.any_op - transform.yield - } -} - -// CHECK-LABEL: func @tensor_single_multi_mma - -// CHECK-DAG: %[[CST:.+]] = arith.constant 0.000000e+00 : f16 -// CHECK-DAG: %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32 -// CHECK-DAG: %[[LHS:.+]] = vector.transfer_read %arg0[%c0], %[[CST]] {in_bounds = [true]} : tensor<4xf16>, vector<4xf16> -// CHECK-DAG: %[[RHS:.+]] = vector.transfer_read %arg1[%c0], %[[CST]] {in_bounds = [true]} : tensor<4xf16>, vector<4xf16> -// CHECK-DAG: %[[ACC:.+]] = vector.transfer_read %arg2[%c0], %[[CSTF32]] {in_bounds = [true]} : tensor<4xf32>, vector<4xf32> -// CHECK: %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]]) outs(%[[ACC]]) -// CHECK-SAME: : vector<4xf16>, vector<4xf16> into vector<4xf32> -// CHECK: vector.transfer_write %[[MMA]], %arg2[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32> - -// ----- - -#contraction_accesses = [ - affine_map<(i, j, k, b) -> (i, k, b)>, - affine_map<(i, j, k, b) -> (k, b, j)>, - affine_map<(i, j, k, b) -> (i, k)>, - affine_map<(i, j, k, b) -> (k, j)>, - affine_map<(i, j, k, b) -> (i, j)> -] - -#iterator_types = [ - #linalg.iterator_type, - #linalg.iterator_type, - #linalg.iterator_type, - #linalg.iterator_type -] - -func.func @scaled_tensor_multi_mma(%arg0: tensor<3x5x1x32xf4E2M1FN>, %arg1: tensor<5x1x7x32xf8E4M3FN>, %arg2: tensor<3x5x1xf8E8M0FNU>, %arg3: tensor<5x7x1xf8E8M0FNU>, - %arg4: tensor<3x7x4xf32>) -> tensor<3x7x4xf32> { - %0 = iree_codegen.inner_tiled ins(%arg0, %arg1, %arg2, %arg3) outs(%arg4) { - indexing_maps = #contraction_accesses, - iterator_types = #iterator_types, - kind = #iree_gpu.scaled_mma_layout, - semantics = #iree_gpu.mma_semantics - } : tensor<3x5x1x32xf4E2M1FN>, tensor<5x1x7x32xf8E4M3FN>, tensor<3x5x1xf8E8M0FNU>, tensor<5x7x1xf8E8M0FNU> - into tensor<3x7x4xf32> - return %0 : tensor<3x7x4xf32> -} - -module attributes { transform.with_named_sequence } { - transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) { - %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op - transform.apply_patterns to %func { - transform.apply_patterns.iree.vectorize_iree_gpu - } : !transform.any_op - transform.yield - } -} - -// CHECK-LABEL: func @scaled_tensor_multi_mma - -// CHECK-DAG: %[[CSTFP4:.+]] = arith.constant 0.000000e+00 : f4E2M1FN -// CHECK-DAG: %[[CSTFP8:.+]] = arith.constant 0.000000e+00 : f8E4M3FN -// CHECK-DAG: %[[CSTSCALE:.+]] = arith.constant 5.877470e-39 : f8E8M0FNU -// CHECK-DAG: %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32 -// CHECK-DAG: %[[LHS:.+]] = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %[[CSTFP4]] {{.*}} : tensor<3x5x1x32xf4E2M1FN>, vector<3x5x1x32xf4E2M1FN> -// CHECK-DAG: %[[RHS:.+]] = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %[[CSTFP8]] {{.*}} : tensor<5x1x7x32xf8E4M3FN>, vector<5x1x7x32xf8E4M3FN> -// CHECK-DAG: %[[LHS_SCALE:.+]] = vector.transfer_read %arg2[%c0, %c0, %c0], %[[CSTSCALE]] {{.*}} : tensor<3x5x1xf8E8M0FNU>, vector<3x5x1xf8E8M0FNU> -// CHECK-DAG: %[[RHS_SCALE:.+]] = vector.transfer_read %arg3[%c0, %c0, %c0], %[[CSTSCALE]] {{.*}} : tensor<5x7x1xf8E8M0FNU>, vector<5x7x1xf8E8M0FNU> -// CHECK-DAG: %[[ACC:.+]] = vector.transfer_read %arg4[%c0, %c0, %c0], %[[CSTF32]] {{.*}} : tensor<3x7x4xf32>, vector<3x7x4xf32> -// CHECK: %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]], %[[LHS_SCALE]], %[[RHS_SCALE]]) outs(%[[ACC]]) -// CHECK-SAME: : vector<3x5x1x32xf4E2M1FN>, vector<5x1x7x32xf8E4M3FN>, vector<3x5x1xf8E8M0FNU>, vector<5x7x1xf8E8M0FNU> into vector<3x7x4xf32> -// CHECK: vector.transfer_write %[[MMA]], %arg4[%c0, %c0, %c0] {{.*}} : vector<3x7x4xf32>, tensor<3x7x4xf32> diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BUILD.bazel index 45cad1a92e2e..f2a079799c70 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/BUILD.bazel @@ -65,7 +65,6 @@ iree_compiler_cc_library( "Passes.cpp", "Transforms.cpp", "UnrollToIntrinsics.cpp", - "VectorizeIREEGPUOps.cpp", ], hdrs = [ "Passes.h", diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/CMakeLists.txt index 58fa6fa875d3..6ec757579b21 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/CMakeLists.txt @@ -54,7 +54,6 @@ iree_cc_library( "Passes.cpp" "Transforms.cpp" "UnrollToIntrinsics.cpp" - "VectorizeIREEGPUOps.cpp" DEPS ::PassesIncGen LLVMSupport diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td index 1cc05023865c..e97e0b1bed67 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.td @@ -55,14 +55,4 @@ def UnrollToIntrinsicsPass : ]; } -def VectorizeIREEGPUOpsPass : - InterfacePass<"iree-gpu-vectorize-ops", "mlir::FunctionOpInterface"> { - let summary = "Vectorizes then lowers a few iree_gpu ops before vectorization."; - let dependentDialects = [ - "::mlir::vector::VectorDialect", - "::mlir::arith::ArithDialect", - "::mlir::iree_compiler::IREE::GPU::IREEGPUDialect" - ]; -} - #endif // IREE_CODEGEN_DIALECT_GPU_TRANSFORMS_PASSES diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp index ce293325abf8..9fa3eaf336f2 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.cpp @@ -1983,80 +1983,6 @@ void populateIREEGPULowerBarrierRegionPatterns(RewritePatternSet &patterns) { patterns.add(patterns.getContext()); } -//===---------------------------------------------------------------------===// -// InnerTiledOp Vectorization -//===---------------------------------------------------------------------===// - -static LogicalResult -vectorizeStaticInnerTiledOp(RewriterBase &rewriter, - IREE::Codegen::InnerTiledOp tiledOp) { - if (!tiledOp.hasTensorSemantics()) { - return failure(); - } - SmallVector argTypes = tiledOp.getOperandShapedTypes(); - if (!llvm::all_of(argTypes, [](auto st) { return st.hasStaticShape(); })) { - return rewriter.notifyMatchFailure(tiledOp, - "non-static shape for vectorization"); - } - - OpBuilder::InsertionGuard g(rewriter); - rewriter.setInsertionPoint(tiledOp); - - Location loc = tiledOp.getLoc(); - - // Construct the (never used) zero padding value for each operand. - SmallVector padValues = - llvm::map_to_vector(argTypes, [&](ShapedType argType) -> Value { - return arith::ConstantOp::create( - rewriter, loc, rewriter.getZeroAttr(argType.getElementType())); - }); - - SmallVector newOperands = tiledOp.getOperands(); - for (auto [operand, type, padValue] : - llvm::zip_equal(newOperands, argTypes, padValues)) { - operand = vector::createReadOrMaskedRead( - rewriter, loc, operand, type.getShape(), padValue, - /*useInBoundsInsteadOfMasking=*/true); - } - auto newTiledOp = IREE::Codegen::InnerTiledOp::create( - rewriter, loc, ValueRange{newOperands}.take_front(tiledOp.getNumInputs()), - ValueRange{newOperands}.take_back(tiledOp.getNumOutputs()), - tiledOp.getIndexingMaps(), tiledOp.getIteratorTypes(), tiledOp.getKind(), - tiledOp.getSemantics()); - - auto zero = arith::ConstantIndexOp::create(rewriter, loc, 0); - SmallVector transferWrites; - for (auto [result, tensorAcc] : - llvm::zip_equal(newTiledOp.getResults(), tiledOp.getOutputs())) { - // Create the write back to a tensor. - int64_t rank = cast(tensorAcc.getType()).getRank(); - auto write = vector::TransferWriteOp::create( - rewriter, loc, - /*vector=*/result, - /*source=*/tensorAcc, - /*indices=*/SmallVector(rank, zero), - /*inBounds=*/SmallVector(rank, true)); - transferWrites.push_back(write.getResults().front()); - } - rewriter.replaceOp(tiledOp, transferWrites); - return success(); -} - -namespace { -struct VectorizeStaticInnerTiledOpPattern final - : OpRewritePattern { - using Base::Base; - LogicalResult matchAndRewrite(IREE::Codegen::InnerTiledOp tiledOp, - PatternRewriter &rewriter) const override { - return vectorizeStaticInnerTiledOp(rewriter, tiledOp); - } -}; -} // namespace - -void populateIREEGPUVectorizationPatterns(RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); -} - //===----------------------------------------------------------------------===// // VectorBarrierOp Lowering //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h index dcdd11f4232a..bfd8fa163d66 100644 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h +++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h @@ -193,8 +193,6 @@ void populateIREEGPUVectorUnrollPatterns( RewritePatternSet &patterns, const vector::UnrollVectorOptions &options); // Version of unrolling with a preset configuration. void populateIREEGPUVectorUnrollPatterns(RewritePatternSet &patterns); -void populateIREEGPUVectorizationPatterns(RewritePatternSet &patterns); - // Populate patterns to fold tensor.empty ops through swizzle hint ops. void populateFoldSwizzleHintOpPatterns(RewritePatternSet &patterns); diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/VectorizeIREEGPUOps.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/VectorizeIREEGPUOps.cpp deleted file mode 100644 index dc201a184222..000000000000 --- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/VectorizeIREEGPUOps.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h" -#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.h" -#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Transforms.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" -#include "mlir/Interfaces/FunctionInterfaces.h" -#include "mlir/Transforms/GreedyPatternRewriteDriver.h" - -namespace mlir::iree_compiler::IREE::GPU { - -#define GEN_PASS_DEF_VECTORIZEIREEGPUOPSPASS -#include "iree/compiler/Codegen/Dialect/GPU/Transforms/Passes.h.inc" - -namespace { -struct VectorizeIREEGPUOpsPass final - : impl::VectorizeIREEGPUOpsPassBase { - void runOnOperation() override; -}; -} // namespace - -void VectorizeIREEGPUOpsPass::runOnOperation() { - MLIRContext *context = &getContext(); - RewritePatternSet patterns(context); - populateIREEGPUVectorizationPatterns(patterns); - if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { - return signalPassFailure(); - } -} - -} // namespace mlir::iree_compiler::IREE::GPU diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/BUILD.bazel b/compiler/src/iree/compiler/Codegen/Interfaces/BUILD.bazel index 27d1d818b132..2a3beac600be 100644 --- a/compiler/src/iree/compiler/Codegen/Interfaces/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/Interfaces/BUILD.bazel @@ -232,6 +232,7 @@ iree_compiler_cc_library( ], deps = [ ":VectorizableOpInterfaceGen", + "//compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR:IREECodegenDialect", "//compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR:IREEVectorExtDialect", "//compiler/src/iree/compiler/Dialect/LinalgExt/IR", "//compiler/src/iree/compiler/Utils", diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt index 439bbb6c5ccb..ecf117dfa663 100644 --- a/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/Interfaces/CMakeLists.txt @@ -179,6 +179,7 @@ iree_cc_library( MLIRTensorDialect MLIRUBDialect MLIRVectorDialect + iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect iree::compiler::Codegen::Dialect::VectorExt::IR::IREEVectorExtDialect iree::compiler::Dialect::LinalgExt::IR iree::compiler::Utils diff --git a/compiler/src/iree/compiler/Codegen/Interfaces/VectorizableOpInterface.cpp b/compiler/src/iree/compiler/Codegen/Interfaces/VectorizableOpInterface.cpp index d0b457398be3..28ec62ad622f 100644 --- a/compiler/src/iree/compiler/Codegen/Interfaces/VectorizableOpInterface.cpp +++ b/compiler/src/iree/compiler/Codegen/Interfaces/VectorizableOpInterface.cpp @@ -6,6 +6,8 @@ #include "iree/compiler/Codegen/Interfaces/VectorizableOpInterface.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" +#include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenOps.h" #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h" #include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtOps.h" #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" @@ -1055,6 +1057,74 @@ struct PadOpVectorizationModel } }; +/// External model for IREE::Codegen::InnerTiledOp. Reads tensor operands into +/// vectors, creates a vector-semantic InnerTiledOp, and writes results back. +struct InnerTiledOpVectorizationModel + : VectorizableOpInterface::ExternalModel { + + bool isVectorizable(Operation *op, ArrayRef vectorSizes, + ArrayRef scalableDims, + DictionaryAttr options) const { + auto tiledOp = cast(op); + if (!tiledOp.hasTensorSemantics()) { + return false; + } + SmallVector argTypes = tiledOp.getOperandShapedTypes(); + return llvm::all_of(argTypes, + [](ShapedType st) { return st.hasStaticShape(); }); + } + + FailureOr> vectorize(Operation *op, RewriterBase &rewriter, + ArrayRef vectorSizes, + ArrayRef scalableDims, + DictionaryAttr options) const { + auto tiledOp = cast(op); + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(tiledOp); + Location loc = tiledOp.getLoc(); + + SmallVector argTypes = tiledOp.getOperandShapedTypes(); + + // Construct the zero padding value for each operand. Ideally, we'd need the + // InnerTile interface to return the padding value to use. If it is not + // provided, ub::Poison is a better choice. Zero was chosen because the op + // was designed for matmul, and zero padding is the most common case. + SmallVector padValues = + llvm::map_to_vector(argTypes, [&](ShapedType argType) -> Value { + return arith::ConstantOp::create( + rewriter, loc, rewriter.getZeroAttr(argType.getElementType())); + }); + + SmallVector newOperands = tiledOp.getOperands(); + for (auto [operand, type, padValue] : + llvm::zip_equal(newOperands, argTypes, padValues)) { + operand = vector::createReadOrMaskedRead( + rewriter, loc, operand, type.getShape(), padValue, + /*useInBoundsInsteadOfMasking=*/true); + } + auto newTiledOp = IREE::Codegen::InnerTiledOp::create( + rewriter, loc, + ValueRange{newOperands}.take_front(tiledOp.getNumInputs()), + ValueRange{newOperands}.take_back(tiledOp.getNumOutputs()), + tiledOp.getIndexingMaps(), tiledOp.getIteratorTypes(), + tiledOp.getKind(), tiledOp.getSemantics()); + + auto zero = arith::ConstantIndexOp::create(rewriter, loc, 0); + SmallVector results; + for (auto [result, tensorAcc] : + llvm::zip_equal(newTiledOp.getResults(), tiledOp.getOutputs())) { + int64_t rank = cast(tensorAcc.getType()).getRank(); + auto write = vector::TransferWriteOp::create( + rewriter, loc, result, tensorAcc, + /*indices=*/SmallVector(rank, zero), + /*inBounds=*/SmallVector(rank, true)); + results.push_back(write.getResults().front()); + } + return results; + } +}; + /// Registers the LinalgStructuredOpVectorizationModel for a single op type. template static void registerInterfaceForLinalgOps(MLIRContext *ctx) { @@ -1087,6 +1157,12 @@ void registerVectorizableOpInterfaceExternalModels(DialectRegistry ®istry) { *ctx); }); + registry.addExtension( + +[](MLIRContext *ctx, IREE::Codegen::IREECodegenDialect *dialect) { + IREE::Codegen::InnerTiledOp::attachInterface< + InnerTiledOpVectorizationModel>(*ctx); + }); + // Upstream linalg ops. #define GET_OP_LIST registry.addExtension(+[](MLIRContext *ctx, linalg::LinalgDialect *dialect) { diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp index 8f54ed19faef..8584d2775761 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp @@ -273,7 +273,6 @@ static void addGPUVectorizationPasses(OpPassManager &funcPassManager, funcPassManager.addPass(IREE::LinalgExt::createDecomposeIm2colPass()); funcPassManager.addPass(createCanonicalizerPass()); funcPassManager.addPass(createCSEPass()); - funcPassManager.addPass(IREE::GPU::createVectorizeIREEGPUOpsPass()); // Vectorize. GenericVectorizationPassOptions options; options.vectorizeCopies = vectorizeCopies; @@ -559,8 +558,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager, funcPassManager.addPass(createIREELoopInvariantCodeMotionPass()); funcPassManager.addPass(createGPUCombineValueSemanticBarriersPass()); - // Step 6. Lower special ops and vectorize. - funcPassManager.addPass(IREE::GPU::createVectorizeIREEGPUOpsPass()); + // Step 6. Vectorize. addGPUVectorizationPasses(funcPassManager, /*vectorizeCopies=*/false, /*enableMasking=*/true, /*foldIdentitySlices=*/true,