
Commit 24285b9

Add padded shared layout to test_convert2d
Parent: cf09a13

5 files changed: 41 additions & 18 deletions

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 1 addition & 1 deletion
@@ -369,7 +369,7 @@ When vec=2, elements are swizzled in pairs of 2. In other words, the element at
   let genVerifyDecl = 1;
 }
 
-def PaddeddSharedEncodingAttr
+def PaddedSharedEncodingAttr
     : TritonGPU_Attr<"PaddedSharedEncoding", "padded_shared_encoding",
                      [SharedEncodingTrait, LayoutEncodingTrait]> {
   let mnemonic = "padded_shared";

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 3 additions & 1 deletion
@@ -424,6 +424,7 @@ Value emitPadding(Location loc, RewriterBase &rewriter,
                   unsigned bitwidth, Value smemOffset, bool offsetInBytes) {
   TritonLLVMOpBuilder b(loc, rewriter);
 
+  assert((bitwidth >= 8) && "Invalid bitwidth for padded shared layout");
   Value padOffset = b.i32_val(0);
   unsigned offScale = offsetInBytes ? bitwidth / 8 : 1;
   for (auto [interval, padding] :
@@ -712,7 +713,8 @@ bool emitTransferBetweenRegistersAndShared(
     smemOffset = b.xor_(smemOffset, offset);
   if (paddedLayout) {
     // Apply the offset needed for padding.
-    Value padOffset = emitPadding(loc, rewriter, paddedLayout, /*bitwidth=*/0,
+    auto bitwidth = elemLlvmTy.getIntOrFloatBitWidth();
+    Value padOffset = emitPadding(loc, rewriter, paddedLayout, bitwidth,
                                   smemOffset, /*offsetInBytes=*/false);
     smemOffset = b.add(smemOffset, padOffset);
   }
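
The second hunk stops passing a placeholder bitwidth of 0: when emitPadding works with byte offsets, it scales the element-granularity intervals and paddings by bitwidth / 8 (the offScale above), so the element type's real bitwidth is needed and sub-byte types are rejected by the new assert. A minimal scalar sketch of the padding-offset arithmetic, assuming the per-interval shift mirrors getPaddedSize below (pad_offset is an illustrative helper, not code from this patch):

def pad_offset(smem_offset, interval_padding_pairs, bitwidth, offset_in_bytes):
    # Scalar model of the padding offset: for each (interval, padding) pair,
    # count how many full intervals precede `smem_offset` and multiply by the
    # padding inserted after each of them. Scaling interval/padding by the
    # element size in the byte-offset case is an assumption of this sketch.
    assert bitwidth >= 8, "Invalid bitwidth for padded shared layout"
    scale = bitwidth // 8 if offset_in_bytes else 1
    total = 0
    for interval, padding in interval_padding_pairs:
        log2_interval = (interval * scale).bit_length() - 1
        log2_padding = (padding * scale).bit_length() - 1
        total += (smem_offset >> log2_interval) << log2_padding
    return total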

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 2 additions & 0 deletions
@@ -1780,6 +1780,8 @@ int64_t PaddedSharedEncodingAttr::getPaddedSize(ArrayRef<int64_t> shape) const {
        llvm::zip_equal(getIntervals(), getPaddings())) {
     paddingSize += (unpaddedSize >> llvm::Log2_32(interval))
                    << llvm::Log2_32(padding);
+    // There is no need for padding after the last element
+    paddingSize -= padding;
   }
   return unpaddedSize + paddingSize;
 }
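
The adjustment above drops the padding that would land after the final interval, so only interior gaps are counted. A small Python sketch of the same arithmetic with a worked example (padded_size is illustrative, not part of the patch):

def padded_size(unpadded_size, interval_padding_pairs):
    # Mirrors PaddedSharedEncodingAttr::getPaddedSize: after every `interval`
    # elements, `padding` extra elements are inserted, except after the last one.
    padding_size = 0
    for interval, padding in interval_padding_pairs:
        padding_size += (unpadded_size >> (interval.bit_length() - 1)) << (padding.bit_length() - 1)
        padding_size -= padding
    return unpadded_size + padding_size

# A 64x64 tile with the [[32, 8]] layout used in the test below:
# 4096 + (4096 / 32) * 8 - 8 = 5112 elements of shared memory.
assert padded_size(64 * 64, [(32, 8)]) == 5112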

python/test/unit/language/test_core.py

Lines changed: 31 additions & 15 deletions
@@ -37,6 +37,7 @@
     is_hip_cdna3,
     is_hip_cdna4,
     is_hip_gfx12,
+    get_lds_size,
     is_xpu,
     get_arch,
     torch_float8_dtypes,
@@ -216,7 +217,7 @@ def __str__(self):
         return f"#{GPU_DIALECT}.blocked<{{sizePerThread={self.sz_per_thread}, threadsPerWarp={self.threads_per_warp}, warpsPerCTA={self.warps_per_cta}, order={self.order}, CTAsPerCGA={self.ctas_per_cga}, CTASplitNum={self.cta_split_num}, CTAOrder={self.cta_order}}}>"
 
 
-class SharedLayout:
+class SwizzledSharedLayout:
 
     def __init__(self, vec, per_phase, max_phase, order, ctas_per_cga, cta_split_num, cta_order):
         self.vec = vec
@@ -231,6 +232,19 @@ def __str__(self):
         return f"#{GPU_DIALECT}.swizzled_shared<{{vec={self.vec}, perPhase={self.per_phase}, maxPhase={self.max_phase}, order={self.order}, CTAsPerCGA={self.ctas_per_cga}, CTASplitNum={self.cta_split_num}, CTAOrder={self.cta_order}}}>"
 
 
+class PaddedSharedLayout:
+
+    def __init__(self, interval_padding_pairs, order, ctas_per_cga, cta_split_num, cta_order):
+        self.interval_padding_pairs = "[" + ", ".join(f"{v[0]}:{v[1]:+d}" for v in interval_padding_pairs) + "]"
+        self.order = order
+        self.ctas_per_cga = ctas_per_cga
+        self.cta_split_num = cta_split_num
+        self.cta_order = cta_order
+
+    def __str__(self):
+        return f"#{GPU_DIALECT}.padded_shared<{self.interval_padding_pairs} {{order={self.order}, CTAsPerCGA={self.ctas_per_cga}, CTASplitNum={self.cta_split_num}, CTAOrder={self.cta_order}}}>"
+
+
 class NVMMASharedLayout:
 
     def __init__(self, swizzle, transpose, element_bit_width, ctas_per_cga, cta_split_num, cta_order):
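
The new helper only has to render the attribute's textual form for the generated MLIR. For instance, assuming GPU_DIALECT resolves to "ttg" (the prefix comes from the test harness, not this class):

layout = PaddedSharedLayout([[32, 8]], [1, 0], [1, 1], [1, 1], [0, 1])
print(layout)
# -> #ttg.padded_shared<[32:+8] {order=[1, 0], CTAsPerCGA=[1, 1], CTASplitNum=[1, 1], CTAOrder=[0, 1]}>
#    i.e. insert 8 padding elements after every 32 contiguous elements.
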
@@ -293,7 +307,7 @@ def warps_per_cta(layout, shape):
 
 
 def is_layout_applicable(layout) -> bool:
-    if isinstance(layout, (BlockedLayout, SharedLayout, LinearLayout)):
+    if isinstance(layout, (BlockedLayout, SwizzledSharedLayout, PaddedSharedLayout, LinearLayout)):
         return True
     elif isinstance(layout, SliceLayout):
         return is_layout_applicable(layout.parent)
@@ -6145,10 +6159,12 @@ def kernel(Out):
 
 intermediate_layouts = [
     None,
-    SharedLayout(1, 1, 1, [0, 1], [1, 1], [1, 1], [0, 1]),
-    SharedLayout(1, 1, 1, [1, 0], [1, 1], [1, 1], [0, 1]),
-    SharedLayout(4, 2, 4, [1, 0], [1, 1], [1, 1], [0, 1]),
-    SharedLayout(2, 2, 4, [1, 0], [1, 1], [1, 1], [0, 1]),
+    SwizzledSharedLayout(1, 1, 1, [0, 1], [1, 1], [1, 1], [0, 1]),
+    SwizzledSharedLayout(1, 1, 1, [1, 0], [1, 1], [1, 1], [0, 1]),
+    SwizzledSharedLayout(4, 2, 4, [1, 0], [1, 1], [1, 1], [0, 1]),
+    SwizzledSharedLayout(2, 2, 4, [1, 0], [1, 1], [1, 1], [0, 1]),
+    PaddedSharedLayout([[32, 8]], [1, 0], [1, 1], [1, 1], [0, 1]),
+    PaddedSharedLayout([[64, 4], [128, 8]], [1, 0], [1, 1], [1, 1], [0, 1])
 ]
 
 
@@ -6182,7 +6198,7 @@ def test_convert2d(M, N, src_layout, interm_layout, dst_layout, dtype, device, t
         scratch_shape = compute_scratch_buffer_shape(src_layout, dst_layout, (M, N))
     except AssertionError:
         pytest.skip("Can't compute scratch buffer size")
-    lds_size = 65536
+    lds_size = get_lds_size()
     # consider int32 dtype in scratch buffer size,
     # because it is the largest dtype used in convert_layout in this test
     int32_size = 4
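
Querying the LDS size per target keeps the skip logic meaningful now that CDNA4 reports 163840 bytes (see get_lds_size in python/triton/_internal_testing.py below). A self-contained sketch of the budget the comments describe, using an illustrative helper and shape rather than the test's verbatim code:

def scratch_fits_in_lds(scratch_shape, lds_size, int32_size=4):
    # The scratch buffer is budgeted for int32, the largest dtype that
    # convert_layout moves through shared memory in this test.
    rows, cols = scratch_shape
    return rows * cols * int32_size <= lds_size

# A 128x128 scratch buffer needs 65536 bytes: it exactly fills a 64 KiB LDS
# and fits comfortably within CDNA4's 163840-byte LDS.
assert scratch_fits_in_lds((128, 128), 65536)
assert scratch_fits_in_lds((128, 128), 163840)
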
@@ -6258,10 +6274,10 @@ def test_convert2d(M, N, src_layout, interm_layout, dst_layout, dtype, device, t
 ]
 
 shared_layouts_3d = [
-    SharedLayout(1, 1, 1, [2, 1, 0], [1, 1, 1], [1, 1, 1], [0, 1, 2]),
-    SharedLayout(4, 2, 4, [1, 2, 0], [1, 1, 1], [1, 1, 1], [0, 1, 2]),
-    SharedLayout(8, 2, 4, [0, 2, 1], [1, 1, 1], [1, 1, 1], [0, 1, 2]),
-    SharedLayout(4, 2, 1, [2, 0, 1], [1, 1, 1], [1, 1, 1], [0, 1, 2]),
+    SwizzledSharedLayout(1, 1, 1, [2, 1, 0], [1, 1, 1], [1, 1, 1], [0, 1, 2]),
+    SwizzledSharedLayout(4, 2, 4, [1, 2, 0], [1, 1, 1], [1, 1, 1], [0, 1, 2]),
+    SwizzledSharedLayout(8, 2, 4, [0, 2, 1], [1, 1, 1], [1, 1, 1], [0, 1, 2]),
+    SwizzledSharedLayout(4, 2, 1, [2, 0, 1], [1, 1, 1], [1, 1, 1], [0, 1, 2]),
 ]
 
 
@@ -6349,9 +6365,9 @@ def test_local_load_store(M, N, K, dist_layout, shared_layout, device, tmp_path:
 ]
 
 shared_layouts = [
-    SharedLayout(4, 2, 4, [0, 1], [1, 1], [1, 1], [0, 1]),
-    SharedLayout(8, 1, 8, [1, 0], [1, 1], [1, 1], [0, 1]),
-    SharedLayout(16, 1, 16, [1, 0], [1, 1], [1, 1], [0, 1]),
+    SwizzledSharedLayout(4, 2, 4, [0, 1], [1, 1], [1, 1], [0, 1]),
+    SwizzledSharedLayout(8, 1, 8, [1, 0], [1, 1], [1, 1], [0, 1]),
+    SwizzledSharedLayout(16, 1, 16, [1, 0], [1, 1], [1, 1], [0, 1]),
 ]
 
 
@@ -6502,7 +6518,7 @@ def test_local_load_store_dot(M, N, dtype, dist_layout, shared_layout, device, t
 ]
 
 shared_layouts = [
-    SharedLayout(8, 1, 1, [1, 0], [1, 1], [1, 1], [0, 1]),
+    SwizzledSharedLayout(8, 1, 1, [1, 0], [1, 1], [1, 1], [0, 1]),
     NVMMASharedLayout(64, False, 16, [1, 1], [1, 1], [0, 1]),
     NVMMASharedLayout(128, False, 16, [1, 1], [1, 1], [0, 1]),
 ]

python/triton/_internal_testing.py

Lines changed: 4 additions & 1 deletion
@@ -76,14 +76,17 @@ def is_hip_cdna4():
 
 def is_hip_gfx12():
     target = get_current_target()
-    print(target.arch)
     return target is not None and target.backend == 'hip' and 'gfx12' in target.arch
 
 
 def is_hip_cdna():
     return is_hip_cdna2() or is_hip_cdna3() or is_hip_cdna4()
 
 
+def get_lds_size():
+    return 163840 if is_hip_cdna4() else 65536
+
+
 def is_xpu():
     target = get_current_target()
     return False if target is None else target.backend == "xpu"
