iree-org · krzysz00 · Mar 12, 2026 · Mar 13, 2026
@@ -8,6 +8,7 @@
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "iree/compiler/Dialect/HAL/IR/HALOps.h"
+#include "llvm/Support/MathExtras.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
 #include "mlir/Transforms/Passes.h"
@@ -25,7 +26,7 @@ static void foldConstantBounds(
     FunctionOpInterface funcOp,
     const std::optional<SmallVector<int64_t>> &staticWorkgroupSizes,
     ArrayRef<int64_t> staticWorkgroupCounts,
-    std::optional<uint64_t> subgroupSize) {
+    std::optional<int64_t> subgroupSize) {
   IRRewriter rewriter(funcOp->getContext());
   auto rewriteToConstant = [&](Operation *op, int64_t constant) {
     rewriter.setInsertionPoint(op);
@@ -70,13 +71,24 @@ static void foldConstantBounds(
 static void applyBounds(FunctionOpInterface funcOp,
                         ArrayRef<std::optional<int64_t>> workgroupSizes,
                         ArrayRef<std::optional<int64_t>> workgroupCounts,
-                        std::optional<uint64_t> subgroupSize) {
+                        std::optional<int64_t> maxSubgroupSize,
+                        std::optional<int64_t> subgroupIdBound) {
   Builder b(funcOp->getContext());
   funcOp->walk([&](Operation *op) {
     TypeSwitch<Operation *>(op)
         .Case([&](gpu::LaneIdOp laneIdOp) {
-          if (subgroupSize) {
-            laneIdOp.setUpperBoundAttr(b.getIndexAttr(*subgroupSize));
+          if (maxSubgroupSize) {
+            laneIdOp.setUpperBoundAttr(b.getIndexAttr(*maxSubgroupSize));
+          }
+        })
+        .Case([&](gpu::SubgroupSizeOp subgroupSizeOp) {
+          if (maxSubgroupSize) {
+            subgroupSizeOp.setUpperBoundAttr(b.getIndexAttr(*maxSubgroupSize));
+          }
+        })
+        .Case([&](gpu::SubgroupIdOp subgroupIdOp) {
+          if (subgroupIdBound) {
+            subgroupIdOp.setUpperBoundAttr(b.getIndexAttr(*subgroupIdBound));
           }
         })
         .Case([&](gpu::ThreadIdOp tidOp) {
@@ -143,7 +155,9 @@ struct PropagateDispatchSizeBoundsPass final
     std::optional<SmallVector<int64_t>> staticWorkgroupSize =
         getWorkgroupSize(funcOp);
 
-    std::optional<uint64_t> subgroupSize = getGPUSubgroupSize(funcOp);
+    // Check if a specific subgroup size has been explicitly chosen via the
+    // codegen pipeline configuration.
+    std::optional<int64_t> staticSubgroupSize = getSubgroupSize(funcOp);
 
     // Late in codegen, we've reconciled the workgroup size onto the export op.
     if (std::optional<IREE::HAL::ExecutableExportOp> exportOp =
@@ -157,10 +171,24 @@ struct PropagateDispatchSizeBoundsPass final
 
       if (std::optional<uint64_t> exportSubgroupSize =
               exportOp->getSubgroupSizeAsUInt()) {
-        subgroupSize = exportSubgroupSize;
+        staticSubgroupSize = *exportSubgroupSize;
       }
     }
 
+    // Determine min and max subgroup size bounds. When a specific subgroup
+    // size has been picked, min == max == that size. Otherwise, use the
+    // range from the GPU target's WGP info.
+    std::optional<int64_t> minSubgroupSize;
+    std::optional<int64_t> maxSubgroupSize;
+    if (staticSubgroupSize) {
+      minSubgroupSize = maxSubgroupSize = *staticSubgroupSize;
+    } else if (target) {
+      assert(!target.getWgp().getSubgroupSizeChoices().empty() &&
+             "GPU target must have at least one subgroup size choice");
+      minSubgroupSize = target.getMinSubgroupSize();
+      maxSubgroupSize = target.getMaxSubgroupSize();
+    }
+
     if (staticWorkgroupSize) {
       // Target info with no workgroup sizes gives a 0-length array, hence no
       // zip_equal.
@@ -179,9 +207,33 @@ struct PropagateDispatchSizeBoundsPass final
       }
     }
 
+    // Compute the subgroup ID bound: max total threads / min subgroup size.
+    std::optional<int64_t> maxFlatWorkgroupSize;
+    std::optional<int64_t> subgroupIdBound;
+    if (staticWorkgroupSize) {
+      maxFlatWorkgroupSize = llvm::product_of(*staticWorkgroupSize);
+    }
+    if (target) {
+      maxFlatWorkgroupSize = std::min(
+          maxFlatWorkgroupSize.value_or(std::numeric_limits<int64_t>::max()),
+          static_cast<int64_t>(
+              target.getWgp().getMaxThreadCountPerWorkgroup()));
+    }
+    if (maxFlatWorkgroupSize && minSubgroupSize) {
+      subgroupIdBound =
+          llvm::divideCeil(*maxFlatWorkgroupSize, *minSubgroupSize);
+    }
+
+    std::optional<int64_t> constantSubgroupSize;
+    if (minSubgroupSize && maxSubgroupSize &&
+        *minSubgroupSize == *maxSubgroupSize) {
+      constantSubgroupSize = *minSubgroupSize;
+    }
+
     foldConstantBounds(funcOp, staticWorkgroupSize, staticWorkgroupCounts,
-                       subgroupSize);
-    applyBounds(funcOp, workgroupSizes, workgroupCounts, subgroupSize);
+                       constantSubgroupSize);
+    applyBounds(funcOp, workgroupSizes, workgroupCounts, maxSubgroupSize,
+                subgroupIdBound);
   }
 };
 } // namespace

@@ -4,9 +4,9 @@
 
 // Note: not the real target definition, missing types
 #executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<arch = "gfx1100", features = "",
-  wgp = <compute =  fp32,
-    storage =  b32,
-    subgroup =  arithmetic,
+  wgp = <compute = fp32,
+    storage = b32,
+    subgroup = arithmetic,
     subgroup_size_choices = [32, 64],
     max_workgroup_sizes = [1024, 1024, 1024],
     max_thread_count_per_workgroup = 1024,
@@ -25,9 +25,12 @@ hal.executable private @static {
     builtin.module {
 // CHECK-LABEL: func.func @static()
       func.func @static() {
-// CHECK-NEXT: gpu.lane_id upper_bound 32
+// CHECK-NEXT: gpu.lane_id upper_bound 64
         %lane_id = gpu.lane_id
 
+// CHECK-NEXT: gpu.subgroup_id upper_bound 4 : index
+        %subgroup_id = gpu.subgroup_id : index
+
 // CHECK-NEXT: gpu.thread_id x upper_bound 64
 // CHECK-NEXT: gpu.thread_id y upper_bound 2
 // CHECK-NEXT: gpu.thread_id z upper_bound 1
@@ -73,9 +76,9 @@ hal.executable private @static {
 
 // Note: not the real target definition, missing types
 #executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.target_info = #iree_gpu.target<arch = "gfx1100", features = "",
-  wgp = <compute =  fp32,
-    storage =  b32,
-    subgroup =  arithmetic,
+  wgp = <compute = fp32,
+    storage = b32,
+    subgroup = arithmetic,
     subgroup_size_choices = [32, 64],
     max_workgroup_sizes = [1024, 1024, 1024],
     max_thread_count_per_workgroup = 1024,
@@ -90,14 +93,59 @@ hal.executable private @manual_subgroup_size {
       %c8 = arith.constant 8 : index
       %c1 = arith.constant 1 : index
       hal.return %c32, %c8, %c1 : index, index, index
-    } attributes {subgroup_size = 64 : index}
+    } attributes {subgroup_size = 32 : index}
     builtin.module {
 // CHECK-LABEL: func.func @manual_subgroup_size()
       func.func @manual_subgroup_size() {
+// CHECK-NEXT: gpu.lane_id upper_bound 32
+        %lane_id = gpu.lane_id
+
+// No workgroup_size; bound from max_thread_count_per_workgroup (1024) / subgroup_size (32) = 32.
+// CHECK-NEXT: gpu.subgroup_id upper_bound 32 : index
+        %subgroup_id = gpu.subgroup_id : index
+
+// CHECK-NEXT: arith.constant 32 : index
+        %subgroup_size = gpu.subgroup_size : index
+
+        return
+      }
+    }
+  }
+}
+
+// -----
+
+// Test variable subgroup sizes on gfx1100 (subgroup_size_choices = [32, 64])
+// with static workgroup sizes but no explicit subgroup_size selection.
+#executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb",
+  {iree_codegen.target_info = #iree_gpu.target<arch = "gfx1100", features = "",
+  wgp = <compute = fp32,
+    storage = b32,
+    subgroup = arithmetic,
+    subgroup_size_choices = [32, 64],
+    max_workgroup_sizes = [1024, 1024, 1024],
+    max_thread_count_per_workgroup = 1024,
+    max_workgroup_memory_bytes = 65536,
+    max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>}>
+#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>]>
+
+hal.executable private @gfx1100_variable_subgroup {
+  hal.executable.variant public @rocm_hsaco_fb target(#executable_target) {
+    hal.executable.export public @gfx1100_variable_subgroup ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+      %c128 = arith.constant 128 : index
+      %c1 = arith.constant 1 : index
+      hal.return %c128, %c1, %c1 : index, index, index
+    } attributes {workgroup_size = [128 : index, 1 : index, 1 : index]}
+    builtin.module {
+// CHECK-LABEL: func.func @gfx1100_variable_subgroup()
+      func.func @gfx1100_variable_subgroup() {
 // CHECK-NEXT: gpu.lane_id upper_bound 64
         %lane_id = gpu.lane_id
 
-// CHECK-NEXT: arith.constant 64 : index
+// CHECK-NEXT: gpu.subgroup_id upper_bound 4 : index
+        %subgroup_id = gpu.subgroup_id : index
+
+// CHECK-NEXT: gpu.subgroup_size upper_bound 64 : index
         %subgroup_size = gpu.subgroup_size : index
 
         return
@@ -110,8 +158,8 @@ hal.executable private @manual_subgroup_size {
 
 #executable_target = #hal.executable.target<"rocm", "rocm-hsaco-fb",
   {iree_codegen.target_info = #iree_gpu.target<arch = "gfx1100", features = "",
-  wgp = <compute =  fp32,
-    storage =  b32,
+  wgp = <compute = fp32,
+    storage = b32,
     subgroup = arithmetic,
     subgroup_size_choices = [32, 64],
     max_workgroup_sizes = [1024, 1024, 1024],
@@ -131,6 +179,15 @@ hal.executable private @dynamic {
     builtin.module {
 // CHECK-LABEL: func.func @dynamic()
       func.func @dynamic() {
+// CHECK-NEXT: gpu.lane_id upper_bound 64
+        %lane_id = gpu.lane_id
+
+// CHECK-NEXT: gpu.subgroup_id upper_bound 32 : index
+        %subgroup_id = gpu.subgroup_id : index
+
+// CHECK-NEXT: gpu.subgroup_size upper_bound 64 : index
+        %subgroup_size = gpu.subgroup_size : index
+
 // CHECK-NEXT: gpu.thread_id x upper_bound 1024
 // CHECK-NEXT: gpu.thread_id y upper_bound 1024
 // CHECK-NEXT: gpu.thread_id z upper_bound 1024