Skip to content

Commit 729dcdf

Browse files
[XLA:GPU] Rename warp to shmem_group in PackedTranspose (#434)
Also calculate the shmem_group count as kNumThreadsPerBlock / kNumShmemBanks to avoid inconsistency when it is specified manually. This change is NFC (no functional change) for non-AMD GPUs. For AMD GPUs, it fixes a performance regression caused by an inconsistency between the shmem_group size, kNumThreadsPerBlock, and kNumShmemBanks, which led to a situation downstream where half of the launched threads per block were not utilized at all. Packed transpose tests are updated to verify correct thread utilization.
1 parent b3f5970 commit 729dcdf

File tree

6 files changed

+72
-49
lines changed

6 files changed

+72
-49
lines changed

xla/backends/gpu/codegen/emitters/tests/transpose/packed_transpose_bf16.hlo

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,6 @@ fusion {
66
p0 = bf16[30,16,30] parameter(0)
77
ROOT transpose = bf16[30,16,30] transpose(p0), dimensions={2,1,0}
88
}
9-
// CHECK: xla_gpu.allocate_shared : tensor<64x64xbf16>
9+
// CHECK: #indexing_map{{.*}}domain:{{.*}}th_x in [0, [[N_THREADS:[0-9]+]]]
10+
// CHECK: %thread_id_x = gpu.thread_id x {xla.range = [0 : index, [[N_THREADS]] : index]}
11+
// CHECK: xla_gpu.allocate_shared : tensor<64x64xbf16>

xla/backends/gpu/codegen/emitters/tests/transpose/packed_transpose_f16.hlo

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,6 @@ fusion {
66
p0 = f16[28,2,6,32] parameter(0)
77
ROOT transpose = f16[2,32,6,28] transpose(p0), dimensions={1,3,2,0}
88
}
9+
// CHECK: #indexing_map{{.*}}domain:{{.*}}th_x in [0, [[N_THREADS:[0-9]+]]]
10+
// CHECK: %thread_id_x = gpu.thread_id x {xla.range = [0 : index, [[N_THREADS]] : index]}
911
// CHECK: xla_gpu.allocate_shared : tensor<64x64xf16>

xla/backends/gpu/codegen/emitters/tests/transpose/packed_transpose_s4.hlo

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,6 @@ fusion {
77
ROOT %transpose= s4[128, 32, 8, 256] transpose(%param_0),
88
dimensions={0,3,2,1}
99
}
10+
// CHECK: #indexing_map{{.*}}domain:{{.*}}th_x in [0, [[N_THREADS:[0-9]+]]]
11+
// CHECK: %thread_id_x = gpu.thread_id x {xla.range = [0 : index, [[N_THREADS]] : index]}
1012
// CHECK: xla_gpu.allocate_shared : tensor<256x256xi4>

xla/backends/gpu/codegen/emitters/tests/transpose/packed_transpose_s8.hlo

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,6 @@ fusion {
66
p0 = s8[8,64,68] parameter(0)
77
ROOT transpose = s8[8,68,64] transpose(p0), dimensions={0, 2, 1}
88
}
9-
// CHECK: xla_gpu.allocate_shared : tensor<128x128xi8>
9+
// CHECK: #indexing_map{{.*}}domain:{{.*}}th_x in [0, [[N_THREADS:[0-9]+]]]
10+
// CHECK: %thread_id_x = gpu.thread_id x {xla.range = [0 : index, [[N_THREADS]] : index]}
11+
// CHECK: xla_gpu.allocate_shared : tensor<128x128xi8>

xla/backends/gpu/codegen/emitters/transpose.cc

Lines changed: 43 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@ using mlir::VectorType;
9494
using mlir::func::FuncOp;
9595
using mlir::func::ReturnOp;
9696

97-
9897
namespace mt = ::mlir::tensor;
9998
namespace mv = ::mlir::vector;
10099

@@ -532,21 +531,21 @@ std::vector<int64_t> GetBlockCounts(absl::Span<const int64_t> shape,
532531
PackedTranspose::PackedTranspose(const HloFusionAnalysis& analysis,
533532
const TransposeSpec& spec,
534533
absl::Span<const int64_t> output_block_tile,
535-
int64_t num_warps)
534+
int64_t num_shmem_groups)
536535
: TransposeFusionBase(analysis),
537536
spec_(spec),
538537
output_tile_(output_block_tile.begin(), output_block_tile.end()),
539538
input_tile_(Permute(output_tile_, spec_.canonical_inv_permutation)),
540539
block_counts_(GetBlockCounts(spec_.canonical_output_shape, output_tile_)),
541-
num_warps_per_block_(num_warps),
540+
num_shmem_groups_per_block_(num_shmem_groups),
542541
tile_size_t1_(input_tile_[spec_.dim_T1_input_id()]),
543542
tile_size_a_(input_tile_[spec_.dim_A_id()]),
544543
tile_size_t2_(input_tile_[spec_.dim_T2_input_id()]),
545544
populated_shmem_cols_(tile_size_a_ * tile_size_t1_),
546545
populated_shmem_rows_(tile_size_t2_) {
547546
VLOG(5) << "Transpose spec: " << spec.ToString()
548547
<< "Output block tile: " << absl::StrJoin(output_block_tile, ", ")
549-
<< "\nNumber of warps: " << num_warps << "\n";
548+
<< "\nNumber of shmem groups: " << num_shmem_groups << "\n";
550549
auto bits_per_element = GetBitwidth(spec_.elem_type());
551550
vector_size_ = kBankBitwidth / bits_per_element;
552551
CHECK_GE(vector_size_, 1);
@@ -779,25 +778,27 @@ IndexingMap PackedTranspose::GetInputIndexing(MLIRContext* ctx) const {
779778
KernelFusionInterface::kIndexingMapThreadIdxDims[0], ctx);
780779
auto block_id =
781780
getAffineDimExpr(KernelFusionInterface::kIndexingMapBlockIdxDims[0], ctx);
782-
auto warp_size = kNumShmemBanks;
783-
auto lane_id = thread_id % warp_size;
784-
auto warp_id = thread_id.floorDiv(warp_size);
785-
std::vector<IndexingMap::Variable> dim_vars = DimVarsFromGPUGrid(
786-
{num_warps_per_block_ * warp_size, 1, 1, Product(block_counts_), 1, 1});
781+
auto shmem_group_size = kNumShmemBanks;
782+
auto lane_id = thread_id % shmem_group_size;
783+
auto shmem_group_id = thread_id.floorDiv(shmem_group_size);
784+
std::vector<IndexingMap::Variable> dim_vars =
785+
DimVarsFromGPUGrid({num_shmem_groups_per_block_ * shmem_group_size, 1, 1,
786+
Product(block_counts_), 1, 1});
787787

788788
// Range variables.
789789
auto loop = getAffineSymbolExpr(0, ctx);
790790
auto vector_element_id = getAffineSymbolExpr(1, ctx);
791791
std::vector<IndexingMap::Variable> range_vars = RangeVarsFromTensorSizes(
792-
{{CeilOfRatio(tile_size_t2_, num_warps_per_block_), vector_size_}});
792+
{{CeilOfRatio(tile_size_t2_, num_shmem_groups_per_block_),
793+
vector_size_}});
793794

794795
// Block offsets.
795796
auto block_ids = DelinearizeInBoundsIndex(block_id, block_counts_);
796797
absl::c_copy(Permute(block_ids, spec_.canonical_inv_permutation),
797798
block_ids.begin());
798799

799800
// Shmem expressions.
800-
auto shmem_row = loop * num_warps_per_block_ + warp_id;
801+
auto shmem_row = loop * num_shmem_groups_per_block_ + shmem_group_id;
801802
auto shmem_col = lane_id * vector_size_ + vector_element_id;
802803

803804
// Offsets within the block.
@@ -840,20 +841,21 @@ IndexingMap PackedTranspose::GetShmemWriteIndexing(
840841
// Dimensions variables.
841842
auto thread_id = getAffineDimExpr(
842843
KernelFusionInterface::kIndexingMapThreadIdxDims[0], ctx);
843-
auto warp_size = kNumShmemBanks;
844-
auto lane_id = thread_id % warp_size;
845-
auto warp_id = thread_id.floorDiv(warp_size);
846-
std::vector<IndexingMap::Variable> dim_vars = DimVarsFromGPUGrid(
847-
{num_warps_per_block_ * warp_size, 1, 1, Product(block_counts_), 1, 1});
844+
auto shmem_group_size = kNumShmemBanks;
845+
auto lane_id = thread_id % shmem_group_size;
846+
auto shmem_group_id = thread_id.floorDiv(shmem_group_size);
847+
std::vector<IndexingMap::Variable> dim_vars =
848+
DimVarsFromGPUGrid({num_shmem_groups_per_block_ * shmem_group_size, 1, 1,
849+
Product(block_counts_), 1, 1});
848850

849851
// Range variables.
850852
auto loop = getAffineSymbolExpr(0, ctx);
851853
auto vector_element_id = getAffineSymbolExpr(1, ctx);
852854
std::vector<IndexingMap::Variable> range_vars = RangeVarsFromTensorSizes(
853-
{CeilOfRatio(tile_size_t2_, num_warps_per_block_), vector_size_});
855+
{CeilOfRatio(tile_size_t2_, num_shmem_groups_per_block_), vector_size_});
854856

855857
// Shmem expressions.
856-
auto shmem_row = loop * num_warps_per_block_ + warp_id;
858+
auto shmem_row = loop * num_shmem_groups_per_block_ + shmem_group_id;
857859
auto shmem_col = lane_id * vector_size_ + vector_element_id;
858860
llvm::SmallVector<std::pair<AffineExpr, Interval>> constraints{
859861
{shmem_col, Interval{0, populated_shmem_cols_ - 1}},
@@ -872,25 +874,27 @@ IndexingMap PackedTranspose::GetShmemReadIndexing(
872874
// Dimensions variables.
873875
auto thread_id = getAffineDimExpr(
874876
KernelFusionInterface::kIndexingMapThreadIdxDims[0], ctx);
875-
auto warp_size = kNumShmemBanks;
876-
auto lane_id = thread_id % warp_size;
877-
auto warp_id = thread_id.floorDiv(warp_size);
878-
std::vector<IndexingMap::Variable> dim_vars = DimVarsFromGPUGrid(
879-
{num_warps_per_block_ * warp_size, 1, 1, Product(block_counts_), 1, 1});
877+
auto shmem_group_size = kNumShmemBanks;
878+
auto lane_id = thread_id % shmem_group_size;
879+
auto shmem_group_id = thread_id.floorDiv(shmem_group_size);
880+
std::vector<IndexingMap::Variable> dim_vars =
881+
DimVarsFromGPUGrid({num_shmem_groups_per_block_ * shmem_group_size, 1, 1,
882+
Product(block_counts_), 1, 1});
880883

881884
// Range variables.
882885
auto loop = getAffineSymbolExpr(0, ctx);
883886
auto vector_horizontal = getAffineSymbolExpr(1, ctx);
884887
auto vector_vertical = getAffineSymbolExpr(2, ctx);
885888
std::vector<IndexingMap::Variable> range_vars = RangeVarsFromTensorSizes(
886889
{CeilOfRatio(populated_shmem_cols_,
887-
(vector_size_ * num_warps_per_block_)),
890+
(vector_size_ * num_shmem_groups_per_block_)),
888891
vector_size_, vector_size_});
889892

890893
// Shmem expressions.
891894
auto shmem_row = lane_id * vector_size_ + vector_vertical;
892-
auto shmem_col = (loop * num_warps_per_block_ + warp_id) * vector_size_ +
893-
vector_horizontal;
895+
auto shmem_col =
896+
(loop * num_shmem_groups_per_block_ + shmem_group_id) * vector_size_ +
897+
vector_horizontal;
894898
llvm::SmallVector<std::pair<AffineExpr, Interval>> constraints{
895899
{shmem_col, Interval{0, populated_shmem_cols_ - 1}},
896900
{shmem_row, Interval{0, populated_shmem_rows_ - 1}}};
@@ -909,26 +913,29 @@ IndexingMap PackedTranspose::GetOutputIndexing(mlir::MLIRContext* ctx) const {
909913
KernelFusionInterface::kIndexingMapThreadIdxDims[0], ctx);
910914
auto block_id =
911915
getAffineDimExpr(KernelFusionInterface::kIndexingMapBlockIdxDims[0], ctx);
912-
auto warp_size = kNumShmemBanks;
913-
auto lane_id = thread_id % warp_size;
914-
auto warp_id = thread_id.floorDiv(warp_size);
915-
std::vector<IndexingMap::Variable> dim_vars = DimVarsFromGPUGrid(
916-
{num_warps_per_block_ * warp_size, 1, 1, Product(block_counts_), 1, 1});
916+
auto shmem_group_size = kNumShmemBanks;
917+
auto lane_id = thread_id % shmem_group_size;
918+
auto shmem_group_id = thread_id.floorDiv(shmem_group_size);
919+
std::vector<IndexingMap::Variable> dim_vars =
920+
DimVarsFromGPUGrid({num_shmem_groups_per_block_ * shmem_group_size, 1, 1,
921+
Product(block_counts_), 1, 1});
917922

918923
// Range variables.
919924
auto loop = getAffineSymbolExpr(0, ctx);
920925
auto vector_horizontal = getAffineSymbolExpr(1, ctx);
921926
auto vector_vertical = getAffineSymbolExpr(2, ctx);
922927
std::vector<IndexingMap::Variable> range_vars = RangeVarsFromTensorSizes(
923-
{CeilOfRatio(populated_shmem_cols_, vector_size_ * num_warps_per_block_),
928+
{CeilOfRatio(populated_shmem_cols_,
929+
vector_size_ * num_shmem_groups_per_block_),
924930
vector_size_, vector_size_});
925931

926932
// Block offsets.
927933
auto block_ids = DelinearizeInBoundsIndex(block_id, block_counts_);
928934

929935
// Shmem expressions.
930-
auto shmem_col = (loop * num_warps_per_block_ + warp_id) * vector_size_ +
931-
vector_horizontal;
936+
auto shmem_col =
937+
(loop * num_shmem_groups_per_block_ + shmem_group_id) * vector_size_ +
938+
vector_horizontal;
932939
auto shmem_row = lane_id * vector_size_ + vector_vertical;
933940

934941
// Offsets within the block.
@@ -972,7 +979,8 @@ std::unique_ptr<EmitterBase> CreateTransposeFusion(
972979
auto packed_transpose_tile = GetPackedTransposeTileSizes(spec);
973980
if (packed_transpose_tile.ok()) {
974981
return std::make_unique<PackedTranspose>(
975-
analysis, spec, *packed_transpose_tile, /* num_warps= */ 4);
982+
analysis, spec, *packed_transpose_tile,
983+
kNumThreadsPerBlock / kNumShmemBanks);
976984
}
977985
return std::make_unique<TransposeFusion>(analysis);
978986
}

xla/backends/gpu/codegen/emitters/transpose.h

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -196,19 +196,24 @@ class TransposeFusion : public TransposeFusionBase {
196196
// slice of shared memory.
197197
//
198198
// 5. Every GPU block gets a single 64 x 10 x 6 x bf16 tile.
199-
// The tile is read by `num_warps_per_block` warps.
200-
// Let's assume that there are 4 warps per block. In this case, on every
201-
// iteration each warp will read 10 x 6 x bf16 elements, i.e. every thread
202-
// (30 out of 32) performs a vector load of 2 x bf16 and stores it to the
203-
// shared memory. In total, there will be 16 iterations performed by each
204-
// block.
199+
// The tile is read by `num_shmem_groups_per_block` shmem groups.
200+
// Let's assume that there are 4 shmem groups per block. In this case, on
201+
// every iteration each shmem group will read 10 x 6 x bf16 elements, i.e.
202+
// every thread (30 out of 32) performs a vector load of 2 x bf16 and stores
203+
// it to the shared memory. In total, there will be 16 iterations performed
204+
// by each block.
205+
//
206+
// Note: When the hardware warp size equals kNumShmemBanks (32), then
207+
// num_shmem_groups_per_block equals the number of warps per block. This is
208+
// the case for NVIDIA GPUs, but not always for AMD GPUs where warp size
209+
// can differ (64).
205210
//
206211
// The following code snippet shows how the data is read from the input
207212
// tensor into the shared memory:
208213
//
209-
// for I = 0 to CEIL(shmem_rows, num_warps_per_block):
214+
// for I = 0 to CEIL(shmem_rows, num_shmem_groups_per_block):
210215
// for J = 0 to VECTOR_SIZE:
211-
// ROW = WARP_ID + NUM_WARPS * I
216+
// ROW = SHMEM_GROUP_ID + NUM_SHMEM_GROUPS * I
212217
// COL = LANE_ID * VECTOR_SIZE + J
213218
// SHMEM[ROW, COL] = INPUT[ROW, COL / 10, COL % 10]
214219
//
@@ -217,7 +222,7 @@ class TransposeFusion : public TransposeFusionBase {
217222
// 6. Each thread reads a VECTOR_SIZE x VECTOR_SIZE x bf16 tile from the shared
218223
// memory and performs the write of each of the columns of the tile.
219224
//
220-
// for I = 0 to CEIL(shmem_cols, VECTOR_SIZE * num_warps_per_block):
225+
// for I = 0 to CEIL(shmem_cols, VECTOR_SIZE * num_shmem_groups_per_block):
221226
// VECTOR_2D = arith.constant dense<0>
222227
// : vector<VECTOR_SIZE x VECTOR_SIZE x bf16>
223228
// for J = 0 to VECTOR_SIZE:
@@ -231,7 +236,7 @@ class PackedTranspose : public TransposeFusionBase {
231236
explicit PackedTranspose(const HloFusionAnalysis& analysis,
232237
const TransposeSpec& spec,
233238
absl::Span<const int64_t> output_block_tile,
234-
int64_t num_warps);
239+
int64_t num_shmem_groups);
235240

236241
LaunchDimensions launch_dimensions() const override;
237242

@@ -279,8 +284,10 @@ class PackedTranspose : public TransposeFusionBase {
279284
// Vector size in elements.
280285
int64_t vector_size_;
281286

282-
// Number of warps per block.
283-
int64_t num_warps_per_block_;
287+
// Number of shmem groups per block. Each shmem group consists of 32 threads
288+
// (kNumShmemBanks), chosen to match the number of shared memory banks for
289+
// optimal memory access patterns. This is independent of hardware warp size.
290+
int64_t num_shmem_groups_per_block_;
284291

285292
// Tile sizes for the canonical dimensions
286293
// [T2, A, T1, 1] -> [T1, A, T2, 1].

0 commit comments

Comments
 (0)