Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1219,6 +1219,111 @@ func.func @map_store_f4_mask_depends_on_inner_index(

// -----

// Test input: `iree_codegen.inner_tiled` on tensor operands using the
// MFMA_F32_16x16x16_F16 layout. One indexing map is given per operand
// (lhs, rhs, acc); together they describe a matmul-style contraction in which
// i and j are parallel and k is the reduction dimension.
#contraction_accesses = [
affine_map<(i, j, k) -> (i, k)>,
affine_map<(i, j, k) -> (k, j)>,
affine_map<(i, j, k) -> (i, j)>
]
func.func @tensor_multi_mma(%lhs: tensor<2x3x4xf16>, %rhs: tensor<3x5x4xf16>, %acc: tensor<2x5x4xf32>) -> tensor<2x5x4xf32> {
// Each map has two results while every operand is rank 3; presumably the
// trailing dimension (4) is the inner tile that the maps do not index.
// TODO confirm against the inner_tiled op definition.
%0 = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) {
indexing_maps = #contraction_accesses,
iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>],
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
} : tensor<2x3x4xf16>, tensor<3x5x4xf16> into tensor<2x5x4xf32>
return %0 : tensor<2x5x4xf32>
}

// CHECK-LABEL: func @tensor_multi_mma
// CHECK-MASK-LABEL: func @tensor_multi_mma
// CHECK-FOLD-LABEL: func @tensor_multi_mma
// CHECK-GATHER-LABEL: func @tensor_multi_mma
// CHECK-MAP-STORE-LABEL: func @tensor_multi_mma
Comment on lines +1238 to +1241
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the purpose of this? Avoid false matches for check lines above?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes; I'm going to split the test file in a follow-up. I'm waiting for a few in-flight PRs to land, then I'll start the work.


// CHECK-DAG: %[[CST:.+]] = arith.constant 0.000000e+00 : f16
// CHECK-DAG: %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[LHS:.+]] = vector.transfer_read %arg0[%c0, %c0, %c0], %[[CST]] {{.*}} : tensor<2x3x4xf16>, vector<2x3x4xf16>
// CHECK-DAG: %[[RHS:.+]] = vector.transfer_read %arg1[%c0, %c0, %c0], %[[CST]] {{.*}} : tensor<3x5x4xf16>, vector<3x5x4xf16>
// CHECK-DAG: %[[ACC:.+]] = vector.transfer_read %arg2[%c0, %c0, %c0], %[[CSTF32]] {{.*}} : tensor<2x5x4xf32>, vector<2x5x4xf32>
// CHECK: %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]]) outs(%[[ACC]])
// CHECK-SAME: : vector<2x3x4xf16>, vector<3x5x4xf16> into vector<2x5x4xf32>
// CHECK: vector.transfer_write %[[MMA]], %arg2[%c0, %c0, %c0] {{.*}} : vector<2x5x4xf32>, tensor<2x5x4xf32>

// -----

// Test input: `iree_codegen.inner_tiled` with no outer iteration space. All
// three indexing maps are `() -> ()` and `iterator_types` is empty, so the
// rank-1 operands are used as a whole by the op.
#contraction_accesses = [
affine_map<() -> ()>,
affine_map<() -> ()>,
affine_map<() -> ()>
]
func.func @tensor_single_multi_mma(%lhs: tensor<4xf16>, %rhs: tensor<4xf16>, %acc: tensor<4xf32>) -> tensor<4xf32> {
// Same MFMA_F32_16x16x16_F16 layout and semantics as the multi-dimensional
// case; only the operand ranks and the (empty) iteration space differ.
%0 = iree_codegen.inner_tiled ins(%lhs, %rhs) outs(%acc) {
indexing_maps = #contraction_accesses,
iterator_types = [],
kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
} : tensor<4xf16>, tensor<4xf16> into tensor<4xf32>
return %0 : tensor<4xf32>
}

// CHECK-LABEL: func @tensor_single_multi_mma

// CHECK-DAG: %[[CST:.+]] = arith.constant 0.000000e+00 : f16
// CHECK-DAG: %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[LHS:.+]] = vector.transfer_read %arg0[%c0], %[[CST]] {in_bounds = [true]} : tensor<4xf16>, vector<4xf16>
// CHECK-DAG: %[[RHS:.+]] = vector.transfer_read %arg1[%c0], %[[CST]] {in_bounds = [true]} : tensor<4xf16>, vector<4xf16>
// CHECK-DAG: %[[ACC:.+]] = vector.transfer_read %arg2[%c0], %[[CSTF32]] {in_bounds = [true]} : tensor<4xf32>, vector<4xf32>
// CHECK: %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]]) outs(%[[ACC]])
// CHECK-SAME: : vector<4xf16>, vector<4xf16> into vector<4xf32>
// CHECK: vector.transfer_write %[[MMA]], %arg2[%c0] {in_bounds = [true]} : vector<4xf32>, tensor<4xf32>

// -----

// Test input: scaled `iree_codegen.inner_tiled` with four inputs and one
// accumulator. One indexing map is given per operand, in operand order:
// lhs, rhs, lhs scale, rhs scale, acc.
#contraction_accesses = [
affine_map<(i, j, k, b) -> (i, k, b)>,
affine_map<(i, j, k, b) -> (k, b, j)>,
affine_map<(i, j, k, b) -> (i, k)>,
affine_map<(i, j, k, b) -> (k, j)>,
affine_map<(i, j, k, b) -> (i, j)>
]

// Iterator kinds for (i, j, k, b): i and j are parallel, k and b are
// reductions.
#iterator_types = [
#linalg.iterator_type<parallel>,
#linalg.iterator_type<parallel>,
#linalg.iterator_type<reduction>,
#linalg.iterator_type<reduction>
]

func.func @scaled_tensor_multi_mma(%arg0: tensor<3x5x1x32xf4E2M1FN>, %arg1: tensor<5x1x7x32xf8E4M3FN>, %arg2: tensor<3x5x1xf8E8M0FNU>, %arg3: tensor<5x7x1xf8E8M0FNU>,
%arg4: tensor<3x7x4xf32>) -> tensor<3x7x4xf32> {
// The operands mix element types: f4E2M1FN lhs, f8E4M3FN rhs, f8E8M0FNU
// scales and an f32 accumulator, matching the element types named in the
// scaled_mma_layout attribute below.
%0 = iree_codegen.inner_tiled ins(%arg0, %arg1, %arg2, %arg3) outs(%arg4) {
indexing_maps = #contraction_accesses,
iterator_types = #iterator_types,
kind = #iree_gpu.scaled_mma_layout<intrinsic = MFMA_SCALE_F32_16x16x128_B32,
lhs_elem_type = f4E2M1FN, rhs_elem_type = f8E4M3FN, acc_elem_type = f32>,
semantics = #iree_gpu.mma_semantics<distributed = true, opaque = false>
} : tensor<3x5x1x32xf4E2M1FN>, tensor<5x1x7x32xf8E4M3FN>, tensor<3x5x1xf8E8M0FNU>, tensor<5x7x1xf8E8M0FNU>
into tensor<3x7x4xf32>
return %0 : tensor<3x7x4xf32>
}

// CHECK-LABEL: func @scaled_tensor_multi_mma

// CHECK-DAG: %[[CSTFP4:.+]] = arith.constant 0.000000e+00 : f4E2M1FN
// CHECK-DAG: %[[CSTFP8:.+]] = arith.constant 0.000000e+00 : f8E4M3FN
// CHECK-DAG: %[[CSTSCALE:.+]] = arith.constant 5.877470e-39 : f8E8M0FNU
// CHECK-DAG: %[[CSTF32:.+]] = arith.constant 0.000000e+00 : f32
// CHECK-DAG: %[[LHS:.+]] = vector.transfer_read %arg0[%c0, %c0, %c0, %c0], %[[CSTFP4]] {{.*}} : tensor<3x5x1x32xf4E2M1FN>, vector<3x5x1x32xf4E2M1FN>
// CHECK-DAG: %[[RHS:.+]] = vector.transfer_read %arg1[%c0, %c0, %c0, %c0], %[[CSTFP8]] {{.*}} : tensor<5x1x7x32xf8E4M3FN>, vector<5x1x7x32xf8E4M3FN>
// CHECK-DAG: %[[LHS_SCALE:.+]] = vector.transfer_read %arg2[%c0, %c0, %c0], %[[CSTSCALE]] {{.*}} : tensor<3x5x1xf8E8M0FNU>, vector<3x5x1xf8E8M0FNU>
// CHECK-DAG: %[[RHS_SCALE:.+]] = vector.transfer_read %arg3[%c0, %c0, %c0], %[[CSTSCALE]] {{.*}} : tensor<5x7x1xf8E8M0FNU>, vector<5x7x1xf8E8M0FNU>
// CHECK-DAG: %[[ACC:.+]] = vector.transfer_read %arg4[%c0, %c0, %c0], %[[CSTF32]] {{.*}} : tensor<3x7x4xf32>, vector<3x7x4xf32>
// CHECK: %[[MMA:.+]] = iree_codegen.inner_tiled ins(%[[LHS]], %[[RHS]], %[[LHS_SCALE]], %[[RHS_SCALE]]) outs(%[[ACC]])
// CHECK-SAME: : vector<3x5x1x32xf4E2M1FN>, vector<5x1x7x32xf8E4M3FN>, vector<3x5x1xf8E8M0FNU>, vector<5x7x1xf8E8M0FNU> into vector<3x7x4xf32>
// CHECK: vector.transfer_write %[[MMA]], %arg4[%c0, %c0, %c0] {{.*}} : vector<3x7x4xf32>, tensor<3x7x4xf32>

// -----

func.func @implicit_gather_like_generic_stride_2(%arg0: tensor<1x1x31xf32>, %arg1: tensor<1x1x1x1x16xf32>) -> tensor<1x1x1x1x16xf32> {
%0 = linalg.generic {
indexing_maps = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ iree_td_library(
"@llvm-project//mlir:SMTTdFiles",
"@llvm-project//mlir:SideEffectInterfacesTdFiles",
"@llvm-project//mlir:TilingInterfaceTdFiles",
"@llvm-project//mlir:ValueBoundsOpInterfaceTdFiles",
"@llvm-project//mlir:VectorInterfacesTdFiles",
"@llvm-project//mlir:ViewLikeInterfaceTdFiles",
],
Expand Down Expand Up @@ -125,6 +126,7 @@ iree_compiler_cc_library(
"@llvm-project//mlir:TilingInterface",
"@llvm-project//mlir:TransformDialect",
"@llvm-project//mlir:TransformDialectTransforms",
"@llvm-project//mlir:ValueBoundsOpInterface",
"@llvm-project//mlir:VectorInterfaces",
"@llvm-project//mlir:ViewLikeInterface",
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ iree_cc_library(
MLIRTilingInterface
MLIRTransformDialect
MLIRTransformDialectTransforms
MLIRValueBoundsOpInterface
MLIRVectorInterfaces
MLIRViewLikeInterface
iree::compiler::Codegen::Dialect::PCF::IR
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Interfaces/ValueBoundsOpInterface.h"
#include "mlir/Support/LLVM.h"

// Custom parse/print helper for the knobs dictionary in constraints op.
Expand Down Expand Up @@ -422,6 +423,13 @@ std::optional<SmallVector<int64_t, 4>> InnerTiledOp::getShapeForUnroll() {
return shape;
}

/// ValueBoundsOpInterface hook: constrains dimension `dim` of the result
/// `value` to be equal to the same dimension of the DPS init at the matching
/// position.
void InnerTiledOp::populateBoundsForShapedValueDim(
    Value value, int64_t dim, ValueBoundsConstraintSet &cstr) {
  // The i-th result has the same shape as the i-th DPS init, so the bound is
  // expressed directly in terms of that init's dimension.
  unsigned initPos = cast<OpResult>(value).getResultNumber();
  Value tiedInit = getDpsInits()[initPos];
  cstr.bound(value)[dim] == cstr.getExpr(tiedInit, dim);
}

//===----------------------------------------------------------------------===//
// WorkgroupCountHintOp
//===----------------------------------------------------------------------===//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "mlir/Interfaces/InferTypeOpInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Interfaces/TilingInterface.h"
#include "mlir/Interfaces/ValueBoundsOpInterface.h"
#include "mlir/Interfaces/VectorInterfaces.h"
#include "mlir/Interfaces/ViewLikeInterface.h"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ include "mlir/Interfaces/TilingInterface.td"
include "mlir/Interfaces/VectorInterfaces.td"
include "mlir/Interfaces/ViewLikeInterface.td"
include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/ValueBoundsOpInterface.td"
include "mlir/Dialect/SMT/IR/SMTTypes.td"

def TensorTypeAttr : TypeAttrBase<"::mlir::TensorType", "Tensor type attribute">;
Expand Down Expand Up @@ -256,6 +257,8 @@ def IREECodegen_InnerTiledOp : Op<IREECodegen_Dialect, "inner_tiled", [
AttrSizedOperandSegments,
InferTypeOpAdaptor,
DestinationStyleOpInterface,
DeclareOpInterfaceMethods<ValueBoundsOpInterface,
["populateBoundsForShapedValueDim"]>,
DeclareOpInterfaceMethods<VectorUnrollOpInterface, ["getShapeForUnroll"]>,
DeclareOpInterfaceMethods<TilingInterface,
["getIterationDomain",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,15 +71,6 @@ void transform_dialect::ApplyUnrollMultiMmaOp::populatePatterns(
GPU::populateIREEGPUVectorUnrollPatterns(patterns);
}

//===---------------------------------------------------------------------===//
// ApplyVectorizeIREEGPUOp
//===---------------------------------------------------------------------===//

/// Adds the IREE GPU vectorization patterns to `patterns` by forwarding to
/// IREE::GPU::populateIREEGPUVectorizationPatterns.
void transform_dialect::ApplyVectorizeIREEGPUOp::populatePatterns(
RewritePatternSet &patterns) {
IREE::GPU::populateIREEGPUVectorizationPatterns(patterns);
}

//===---------------------------------------------------------------------===//
// ConvertToMultiMmaOp
//===---------------------------------------------------------------------===//
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,19 +80,6 @@ def ApplyUnrollMultiMmaOp : Op<Transform_Dialect,
let assemblyFormat = "attr-dict";
}

// Transform-dialect pattern descriptor op, spelled
// `apply_patterns.iree.vectorize_iree_gpu` in IR. Its custom assembly is just
// the attribute dictionary. The PatternDescriptorOpInterface methods are
// declared here and must be defined in C++.
def ApplyVectorizeIREEGPUOp : Op<Transform_Dialect,
"apply_patterns.iree.vectorize_iree_gpu",
[DeclareOpInterfaceMethods<PatternDescriptorOpInterface>,
ReportTrackingListenerFailuresOpTrait]> {
let description = [{
Populate patterns to vectorize various iree_gpu ops. Expected to run
before or as a part of a larger vectorization pass.
}];

// Generated C++ class lives in the IREE transform_dialect namespace.
let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect";
let assemblyFormat = "attr-dict";
}

def ConvertToMultiMmaOp : Op<Transform_Dialect, "iree.convert_to_multi_mma",
[TransformEachOpTrait,
DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ iree_lit_test_suite(
"transform_fuse_forall.mlir",
"transform_lower_barrier_region.mlir",
"unroll_multi_mma.mlir",
"vectorize_iree_gpu_ops.mlir",
],
include = ["*.mlir"],
),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ iree_lit_test_suite(
"transform_fuse_forall.mlir"
"transform_lower_barrier_region.mlir"
"unroll_multi_mma.mlir"
"vectorize_iree_gpu_ops.mlir"
TOOLS
FileCheck
iree-opt
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ iree_compiler_cc_library(
"Passes.cpp",
"Transforms.cpp",
"UnrollToIntrinsics.cpp",
"VectorizeIREEGPUOps.cpp",
],
hdrs = [
"Passes.h",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ iree_cc_library(
"Passes.cpp"
"Transforms.cpp"
"UnrollToIntrinsics.cpp"
"VectorizeIREEGPUOps.cpp"
DEPS
::PassesIncGen
LLVMSupport
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,4 @@ def UnrollToIntrinsicsPass :
];
}

// Pass declaration anchored on ops implementing mlir::FunctionOpInterface and
// registered under the `iree-gpu-vectorize-ops` argument.
def VectorizeIREEGPUOpsPass :
InterfacePass<"iree-gpu-vectorize-ops", "mlir::FunctionOpInterface"> {
let summary = "Vectorizes then lowers a few iree_gpu ops before vectorization.";
// Dialects declared as dependencies of this pass: vector, arith and IREE GPU.
let dependentDialects = [
"::mlir::vector::VectorDialect",
"::mlir::arith::ArithDialect",
"::mlir::iree_compiler::IREE::GPU::IREEGPUDialect"
];
}

#endif // IREE_CODEGEN_DIALECT_GPU_TRANSFORMS_PASSES
Loading
Loading