Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
#include "iree/compiler/Codegen/Utils/Utils.h"
#include "iree/compiler/Dialect/HAL/IR/HALOps.h"
#include "llvm/Support/MathExtras.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Transforms/Passes.h"
Expand All @@ -25,7 +26,7 @@ static void foldConstantBounds(
FunctionOpInterface funcOp,
const std::optional<SmallVector<int64_t>> &staticWorkgroupSizes,
ArrayRef<int64_t> staticWorkgroupCounts,
std::optional<uint64_t> subgroupSize) {
std::optional<int64_t> subgroupSize) {
IRRewriter rewriter(funcOp->getContext());
auto rewriteToConstant = [&](Operation *op, int64_t constant) {
rewriter.setInsertionPoint(op);
Expand Down Expand Up @@ -70,13 +71,24 @@ static void foldConstantBounds(
static void applyBounds(FunctionOpInterface funcOp,
ArrayRef<std::optional<int64_t>> workgroupSizes,
ArrayRef<std::optional<int64_t>> workgroupCounts,
std::optional<uint64_t> subgroupSize) {
std::optional<int64_t> maxSubgroupSize,
std::optional<int64_t> subgroupIdBound) {
Builder b(funcOp->getContext());
funcOp->walk([&](Operation *op) {
TypeSwitch<Operation *>(op)
.Case([&](gpu::LaneIdOp laneIdOp) {
if (subgroupSize) {
laneIdOp.setUpperBoundAttr(b.getIndexAttr(*subgroupSize));
if (maxSubgroupSize) {
laneIdOp.setUpperBoundAttr(b.getIndexAttr(*maxSubgroupSize));
}
})
.Case([&](gpu::SubgroupSizeOp subgroupSizeOp) {
if (maxSubgroupSize) {
subgroupSizeOp.setUpperBoundAttr(b.getIndexAttr(*maxSubgroupSize));
}
})
.Case([&](gpu::SubgroupIdOp subgroupIdOp) {
if (subgroupIdBound) {
subgroupIdOp.setUpperBoundAttr(b.getIndexAttr(*subgroupIdBound));
}
})
.Case([&](gpu::ThreadIdOp tidOp) {
Expand Down Expand Up @@ -143,7 +155,9 @@ struct PropagateDispatchSizeBoundsPass final
std::optional<SmallVector<int64_t>> staticWorkgroupSize =
getWorkgroupSize(funcOp);

std::optional<uint64_t> subgroupSize = getGPUSubgroupSize(funcOp);
// Check if a specific subgroup size has been explicitly chosen via the
// codegen pipeline configuration.
std::optional<int64_t> staticSubgroupSize = getSubgroupSize(funcOp);

// Late in codegen, we've reconciled the workgroup size onto the export op.
if (std::optional<IREE::HAL::ExecutableExportOp> exportOp =
Expand All @@ -157,10 +171,24 @@ struct PropagateDispatchSizeBoundsPass final

if (std::optional<uint64_t> exportSubgroupSize =
exportOp->getSubgroupSizeAsUInt()) {
subgroupSize = exportSubgroupSize;
staticSubgroupSize = *exportSubgroupSize;
}
}

// Determine min and max subgroup size bounds. When a specific subgroup
// size has been picked, min == max == that size. Otherwise, use the
// range from the GPU target's WGP info.
std::optional<int64_t> minSubgroupSize;
std::optional<int64_t> maxSubgroupSize;
if (staticSubgroupSize) {
minSubgroupSize = maxSubgroupSize = *staticSubgroupSize;
} else if (target) {
assert(!target.getWgp().getSubgroupSizeChoices().empty() &&
"GPU target must have at least one subgroup size choice");
minSubgroupSize = target.getMinSubgroupSize();
maxSubgroupSize = target.getMaxSubgroupSize();
}

if (staticWorkgroupSize) {
// Target info with no workgroup sizes gives a 0-length array, hence no
// zip_equal.
Expand All @@ -179,9 +207,33 @@ struct PropagateDispatchSizeBoundsPass final
}
}

// Compute the subgroup ID bound: max total threads / min subgroup size.
std::optional<int64_t> maxFlatWorkgroupSize;
std::optional<int64_t> subgroupIdBound;
if (staticWorkgroupSize) {
maxFlatWorkgroupSize = llvm::product_of(*staticWorkgroupSize);
}
if (target) {
maxFlatWorkgroupSize = std::min(
maxFlatWorkgroupSize.value_or(std::numeric_limits<int64_t>::max()),
static_cast<int64_t>(
target.getWgp().getMaxThreadCountPerWorkgroup()));
}
if (maxFlatWorkgroupSize && minSubgroupSize) {
subgroupIdBound =
llvm::divideCeil(*maxFlatWorkgroupSize, *minSubgroupSize);
}

std::optional<int64_t> constantSubgroupSize;
if (minSubgroupSize && maxSubgroupSize &&
*minSubgroupSize == *maxSubgroupSize) {
constantSubgroupSize = *minSubgroupSize;
}

foldConstantBounds(funcOp, staticWorkgroupSize, staticWorkgroupCounts,
subgroupSize);
applyBounds(funcOp, workgroupSizes, workgroupCounts, subgroupSize);
constantSubgroupSize);
applyBounds(funcOp, workgroupSizes, workgroupCounts, maxSubgroupSize,
subgroupIdBound);
}
};
} // namespace
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

// Note: not the real target definition, missing types
#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<arch = "gfx1100", features = "",
wgp = <compute = fp32,
storage = b32,
subgroup = arithmetic,
wgp = <compute = fp32,
storage = b32,
subgroup = arithmetic,
subgroup_size_choices = [32, 64],
max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024,
Expand All @@ -25,9 +25,12 @@ hal.executable private @static {
builtin.module {
// CHECK-LABEL: func.func @static()
func.func @static() {
// CHECK-NEXT: gpu.lane_id upper_bound 32
// CHECK-NEXT: gpu.lane_id upper_bound 64
%lane_id = gpu.lane_id

// CHECK-NEXT: gpu.subgroup_id upper_bound 4 : index
%subgroup_id = gpu.subgroup_id : index

// CHECK-NEXT: gpu.thread_id x upper_bound 64
// CHECK-NEXT: gpu.thread_id y upper_bound 2
// CHECK-NEXT: gpu.thread_id z upper_bound 1
Expand Down Expand Up @@ -73,9 +76,9 @@ hal.executable private @static {

// Note: not the real target definition, missing types
#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<arch = "gfx1100", features = "",
wgp = <compute = fp32,
storage = b32,
subgroup = arithmetic,
wgp = <compute = fp32,
storage = b32,
subgroup = arithmetic,
subgroup_size_choices = [32, 64],
max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024,
Expand All @@ -90,14 +93,59 @@ hal.executable private @manual_subgroup_size {
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
hal.return %c32, %c8, %c1 : index, index, index
} attributes {subgroup_size = 64 : index}
} attributes {subgroup_size = 32 : index}
builtin.module {
// CHECK-LABEL: func.func @manual_subgroup_size()
func.func @manual_subgroup_size() {
// CHECK-NEXT: gpu.lane_id upper_bound 32
%lane_id = gpu.lane_id

// No workgroup_size; bound from max_thread_count_per_workgroup (1024) / subgroup_size (32) = 32.
// CHECK-NEXT: gpu.subgroup_id upper_bound 32 : index
%subgroup_id = gpu.subgroup_id : index

// CHECK-NEXT: arith.constant 32 : index
%subgroup_size = gpu.subgroup_size : index

return
}
}
}
}

// -----

// Test variable subgroup sizes on gfx1100 (subgroup_size_choices = [32, 64])
// with static workgroup sizes but no explicit subgroup_size selection.
#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb",
{iree_codegen.target_info = #iree_gpu.target<arch = "gfx1100", features = "",
wgp = <compute = fp32,
storage = b32,
subgroup = arithmetic,
subgroup_size_choices = [32, 64],
max_workgroup_sizes = [1024, 1024, 1024],
max_thread_count_per_workgroup = 1024,
max_workgroup_memory_bytes = 65536,
max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>}>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>]>

hal.executable private @gfx1100_variable_subgroup {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target) {
hal.executable.export public @gfx1100_variable_subgroup ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index
hal.return %c128, %c1, %c1 : index, index, index
} attributes {workgroup_size = [128 : index, 1 : index, 1 : index]}
builtin.module {
// CHECK-LABEL: func.func @gfx1100_variable_subgroup()
func.func @gfx1100_variable_subgroup() {
// CHECK-NEXT: gpu.lane_id upper_bound 64
%lane_id = gpu.lane_id

// CHECK-NEXT: arith.constant 64 : index
// CHECK-NEXT: gpu.subgroup_id upper_bound 4 : index
%subgroup_id = gpu.subgroup_id : index

// CHECK-NEXT: gpu.subgroup_size upper_bound 64 : index
%subgroup_size = gpu.subgroup_size : index

return
Expand All @@ -110,8 +158,8 @@ hal.executable private @manual_subgroup_size {

#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb",
{iree_codegen.target_info = #iree_gpu.target<arch = "gfx1100", features = "",
wgp = <compute = fp32,
storage = b32,
wgp = <compute = fp32,
storage = b32,
subgroup = arithmetic,
subgroup_size_choices = [32, 64],
max_workgroup_sizes = [1024, 1024, 1024],
Expand All @@ -131,6 +179,15 @@ hal.executable private @dynamic {
builtin.module {
// CHECK-LABEL: func.func @dynamic()
func.func @dynamic() {
// CHECK-NEXT: gpu.lane_id upper_bound 64
%lane_id = gpu.lane_id

// CHECK-NEXT: gpu.subgroup_id upper_bound 32 : index
%subgroup_id = gpu.subgroup_id : index

// CHECK-NEXT: gpu.subgroup_size upper_bound 64 : index
%subgroup_size = gpu.subgroup_size : index

// CHECK-NEXT: gpu.thread_id x upper_bound 1024
// CHECK-NEXT: gpu.thread_id y upper_bound 1024
// CHECK-NEXT: gpu.thread_id z upper_bound 1024
Expand Down
Loading