Commit 3febc29

Authored by zahiqbal, cj401-amd, zoranjovanovic-ns, and alekstheod
Triton fixes porting from v0.6.0 (#389)
* rocprof-sdk addition, upstream PR: openxla/pull/29769. Squashes the following commits:
  - Update rocprofiler-sdk (v3) along with roctracer (v1) for rocm-jaxlib-v0.6.0 (#302): integrate rocprofiler-sdk, with roctracer as a backup selected via bazel_options from the CLI (cherry picked from commit 7775dd0)
  - Use VLOG(2) to replace LOG(INFO), so PGLE has no verbose info (#357) (cherry picked from commit 5950125)
  - Update with kernel details for rocm-7.x (#364) (cherry picked from commit 5597c0d)
  - Remove the previously hard-coded rocprofiler-sdk path and add skip_rocprofiler_sdk to avoid loading `rocprofiler-sdk` (#369) (cherry picked from commit ff74b5f)
  - Fixed buffer comparator test
  - Misc fixes ported from rocm-jaxlib-v0.6.0
  Co-authored-by: Pavel Emeliyanenko <[email protected]> (cherry picked from commits f013645 and b03cd94)
* Added support for waves_per_eu function attribute (#181) (cherry picked from commits bc1d816 and d3f94e9)
* Removed a two-line change (revert of half of the openxla#25959 commit) (cherry picked from commit 109e138)
* Fixes for jax 0.6.0 (#207): fix the jax plugin 0.6.0 by dropping NEEDED linking to unnecessary libs (these are loaded by amdhipruntime, not by us) and adding the missing NEEDED entry on the MIOpen shared object; minor rocblas-related changes for ROCm 7.0. Co-authored-by: Zoran Jovanovic <[email protected]> (cherry picked from commits 0de7d49 and 28f10a0)
* Add hipBLASLt support for gfx11 (#301) (cherry picked from commit f814bff)
* Add bf16 starting from gfx11; bugfix & optimize RocmComputeCapability (#303): fix and improve device_description.h::RocmComputeCapability and enable ALG_DOT_BF16* on ROCm with HW support (cherry picked from commit 510ea06)
* [ROCm] Use bundled bitcode files (#196); also trim the bitcode file list to ockl.bc and ocml.bc only (cherry picked from commit fc9e3c3)
* Add MIOPEN_FIND_ENFORCE for ROCm 7 convolution gemms (#312); exclude failing CollectiveOpsE2E tests (cherry picked from commit fb6ddfb)
* Restore the RocmComputeCapability::gfx11_rx7900() and gfx12_rx8900() methods (#333); at least gfx11_rx7900() is still needed for the TF build (cherry picked from commit 13c3de1)
* Make device_count_ atomic (#343), using relaxed memory order; fix a build error (cherry picked from commit 8513f2d)
* Fix hardcoded max registers (#345) (cherry picked from commit f3e170a)
* Fix hardcoded ECC enabled (#348) (cherry picked from commit 9cfa74a)
* Remove reserved memory (#349) (cherry picked from commit 0015d0e)
* Add rocm_dev config for remote caching (#353) (cherry picked from commit c815420)
* Added ROCm 7 support to EnablePeerAccess (#347): use the wrap namespace, clang-format, and add comments (cherry picked from commit 85548a7)
* [ROCm] Disable Cudnn fusions (#358) (cherry picked from commit edab8b2)
* Ported all Triton-related changes from v0.6.0 to v0.7.1 (cherry picked from commit 1851bcc)
* Disable softmax Triton fusion if Triton GEMM is off (#281): disable the softmax rewriter when Triton GEMM is disabled and add a specific flag to enable Triton softmax fusion (cherry picked from commit 51a7f4b)
* [ROCm][Triton] Disable transposed load in certain conditions (cherry picked from commit 50860e9)
* Enable unit tests that pass after fixing some Triton-related issues (#285); fusion_emitter_device_legacy_test still fails on MI200 (cherry picked from commit 97dd565)
* Rocm jaxlib v0.6.0 triton support ut (#279): fixed triton/support_test (no fmfa), fixed a rounding-mode issue in AccelerateAMDMatmul, and fixed issues with the usage of mfma in support_test (cherry picked from commit 44f7d87)
* Restore gpu_triton_custom_call_test (#262) (cherry picked from commit 32eafa4)
* Skipped the CanNotEmitTritonCustomCallOnPreAmpereGpu test for ROCm (cherry picked from commits 56ec7ec and b1f3e9f)
* Fixed createTritonAMDGPULowerInstructionSchedHintsPass (#179) (cherry picked from commits 8517a3a and c62e47d)
* Fixed a bazel build issue

Co-authored-by: Chunyu Jin <[email protected]>
Co-authored-by: zoranjovanovic-ns <[email protected]>
Co-authored-by: Alex <[email protected]>
1 parent 30c0943 commit 3febc29
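Of the ported changes, the device_count_ fix (#343) follows the standard pattern for a counter that must be updated safely from several threads but is never used to order other memory accesses. A minimal sketch of that pattern, with illustrative names rather than the actual XLA code:

#include <atomic>

class DeviceRegistry {  // hypothetical stand-in for the real class
 public:
  // Atomic increment; relaxed ordering suffices because the counter does
  // not synchronize any other data between threads.
  void AddDevice() { device_count_.fetch_add(1, std::memory_order_relaxed); }

  int device_count() const {
    return device_count_.load(std::memory_order_relaxed);
  }

 private:
  std::atomic<int> device_count_{0};
};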

File tree

13 files changed: +265 −29 lines


build_tools/rocm/run_xla.sh

Lines changed: 3 additions & 0 deletions
@@ -45,6 +45,9 @@ GPU_NAME=(`rocminfo | grep -m 1 gfx`)
 GPU_NAME=${GPU_NAME[1]}
 
 EXCLUDED_TESTS=(
+    # //xla/service/gpu/tests:gpu_kernel_tiling_test_gpu_amd_any
+    GpuKernelTilingTest.ColumnReductionWithLayoutChangeTiled
+    GpuKernelTilingTest.ReductionInputTooLarge
     # //xla/pjrt/c:pjrt_c_api_gpu_test_gpu_amd_any
     PjrtCAPIGpuExtensionTest.TritonCompile
     # //xla/backends/gpu/codegen/triton:fusion_emitter_device_test_gpu_amd_any
third_party/triton/temporary/0001-AMD-Quick-fix-disabling-transposed-load-used-as-diff.patch

Lines changed: 46 additions & 0 deletions

From d539916e4d49cca93f54a5f99f7822050205432c Mon Sep 17 00:00:00 2001
From: Jungwook Park <[email protected]>
Date: Thu, 7 Aug 2025 06:34:49 -0500
Subject: [PATCH] [AMD] Quick fix disabling transposed load used as different
 type.

Disabling transposedLoad if dot is using it as a different element type.
Otherwise it's picking the wrong vector size when lowering.
---
 .../lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp
index 661a17678..6bda3a818 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp
@@ -214,6 +214,26 @@ private:
       return false;
     }
 
+    // The transposed load can be used only when it is consumed by a dot op
+    // with the loaded data type.
+    int opIdx = 0;
+    triton::gpu::LocalLoadOp lLoad = cast<triton::gpu::LocalLoadOp>(localLoad);
+    if (auto dotEnc = lLoad.getSrc().getType().getEncoding())
+      opIdx = cast<triton::gpu::DotOperandEncodingAttr>(dotEnc).getOpIdx();
+    else
+      return false;
+
+    SetVector<Operation *> slice;
+    getForwardSlice(localLoad, &slice);
+    for (auto op : slice) {
+      if (auto dotOp = dyn_cast<triton::DotOp>(op)) {
+        auto inputMat = (opIdx == 0) ? dotOp.getA() : dotOp.getB();
+        auto bitwidthMat = inputMat.getType().getElementTypeBitWidth();
+        if (bitwidth != bitwidthMat)
+          return false;
+      }
+    }
+
     return true;
   }
 
-- 
2.34.1
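The rule this guard enforces can be stated separately from the IR walk: every tt.dot reached from the local_load must consume its operand at the bitwidth the data was loaded with, otherwise the transposed-load path is rejected. A standalone sketch of that predicate (a hypothetical helper, not part of the patch):

#include <vector>

// True only if every dot operand bitwidth matches the loaded bitwidth;
// a mismatch would make the lowering pick the wrong vector size.
bool TransposedLoadBitwidthsMatch(unsigned loadBitwidth,
                                  const std::vector<unsigned>& dotBitwidths) {
  for (unsigned dotBitwidth : dotBitwidths) {
    if (dotBitwidth != loadBitwidth) return false;
  }
  return true;
}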
third_party/triton/temporary/accelerateamdmatmul.patch

Lines changed: 14 additions & 0 deletions

--- a/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp
@@ -1068,7 +1068,10 @@ public:
     if (isFloat(srcElTy) && isFloat(dstElTy)) {
       auto rmode =
           RoundingModeAttr::get(rewriter.getContext(), RoundingMode::RTNE);
-      return rewriter.create<FpToFpOp>(loc, dstTy, v, rmode);
+      if (dstElTy.getIntOrFloatBitWidth() < srcElTy.getIntOrFloatBitWidth()) {
+        return rewriter.create<FpToFpOp>(loc, dstTy, v, rmode);
+      }
+      return rewriter.create<FpToFpOp>(loc, dstTy, v);
     }
     if (!isFloat(srcElTy) && isFloat(dstElTy))
       return rewriter.create<arith::SIToFPOp>(loc, dstTy, v);
third_party/triton/temporary/accelerateamdmatmul2.patch

Lines changed: 24 additions & 0 deletions

--- a/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp
+++ b/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp
@@ -380,9 +380,17 @@ Value convertAndCastTensor(PatternRewriter &rewriter, Value value,
   else if (oldElemType.isF32() && newElemType.isF16())
     castedTensor =
         rewriter.create<arith::TruncFOp>(loc, castedType, convertedTensor);
-  else
-    castedTensor =
-        rewriter.create<tt::FpToFpOp>(loc, castedType, convertedTensor);
+  else {
+    if (oldElemType.getIntOrFloatBitWidth() > newElemType.getIntOrFloatBitWidth()) {
+      auto rmode =
+          RoundingModeAttr::get(rewriter.getContext(), RoundingMode::RTNE);
+      castedTensor =
+          rewriter.create<tt::FpToFpOp>(loc, castedType, convertedTensor, rmode);
+    } else {
+      castedTensor =
+          rewriter.create<tt::FpToFpOp>(loc, castedType, convertedTensor);
+    }
+  }
   }
   return castedTensor;
 }
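Both AccelerateAMDMatmul patches encode the same rule: attach an explicit RTNE rounding mode to FpToFpOp only when the float-to-float conversion narrows the element type; widening casts are exact and get no rounding mode. Distilled as a hypothetical predicate:

// Only narrowing casts (e.g. f32 -> bf16) can lose precision and need a
// rounding mode; widening casts (e.g. bf16 -> f32) are exact.
bool NeedsRoundingMode(unsigned srcBitWidth, unsigned dstBitWidth) {
  return dstBitWidth < srcBitWidth;
}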

third_party/triton/temporary/series.bzl

Lines changed: 3 additions & 0 deletions
@@ -19,5 +19,8 @@ temporary_patch_list = [
     "//third_party/triton:temporary/tutorial_fixes.patch",
     "//third_party/triton:temporary/ws_fix.patch",
     "//third_party/triton:temporary/ws_ub_fix.patch",
+    "//third_party/triton:temporary/0001-AMD-Quick-fix-disabling-transposed-load-used-as-diff.patch",
+    "//third_party/triton:temporary/accelerateamdmatmul.patch",
+    "//third_party/triton:temporary/accelerateamdmatmul2.patch",
     # Add new patches just above this line
 ]

xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc

Lines changed: 2 additions & 4 deletions
@@ -146,10 +146,8 @@ absl::Status CreateTritonPipeline(
   pm->addPass(mlir::createCanonicalizerPass());
   pm->addPass(mlir::createCSEPass());
   pm->addPass(mlir::createSymbolDCEPass());
-  if (/*(instruction_sched_variant=="none") == */ false) {
-    pm->addPass(mt::createTritonAMDGPULowerInstructionSchedHintsPass(
-        cc.gfx_version(), num_stages));
-  }
+  pm->addPass(mt::createTritonAMDGPULowerInstructionSchedHintsPass(
+      cc.gfx_version(), num_stages));
   pm->addPass(mt::createConvertBuiltinFuncToLLVMPass(/*ftz=*/true));
   // There is no clusters in ROCm for now.
   out_cluster_info.clusterDimX = 1;

xla/backends/gpu/codegen/triton/emitter_helpers.cc

Lines changed: 9 additions & 0 deletions
@@ -106,6 +106,12 @@ absl::StatusOr<Type> TritonType(EmitterLocOpBuilder& b, PrimitiveType t) {
       return b.getType<mlir::Float8E5M2Type>();
     case F8E4M3FN:
       return b.getType<mlir::Float8E4M3FNType>();
+    case F8E4M3B11FNUZ:
+      return b.getType<mlir::Float8E4M3B11FNUZType>();
+    case F8E5M2FNUZ:
+      return b.getType<mlir::Float8E5M2FNUZType>();
+    case F8E4M3FNUZ:
+      return b.getType<mlir::Float8E4M3FNUZType>();
     default:
       return absl::UnimplementedError(
           absl::StrCat("This type is not supported yet: ",
@@ -126,6 +132,9 @@ absl::StatusOr<PrimitiveType> GetPrimitiveType(Type t) {
   if (t.isInteger(1)) return PRED;
   if (mlir::isa<mlir::Float8E5M2Type>(t)) return F8E5M2;
   if (mlir::isa<mlir::Float8E4M3FNType>(t)) return F8E4M3FN;
+  if (mlir::isa<mlir::Float8E4M3B11FNUZType>(t)) return F8E4M3B11FNUZ;
+  if (mlir::isa<mlir::Float8E5M2FNUZType>(t)) return F8E5M2FNUZ;
+  if (mlir::isa<mlir::Float8E4M3FNUZType>(t)) return F8E4M3FNUZ;
   return absl::UnimplementedError("Unsupported type in getPrimitiveType.\n");
 }
 
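TritonType and GetPrimitiveType are intended to be mutual inverses, which is why each FNUZ type is added to both functions. A test-style sketch of that invariant (an assumed check, not part of the commit), using XLA's TF_ASSIGN_OR_RETURN and CHECK_EQ:

for (PrimitiveType t : {F8E4M3B11FNUZ, F8E5M2FNUZ, F8E4M3FNUZ}) {
  TF_ASSIGN_OR_RETURN(mlir::Type mlir_type, TritonType(b, t));
  TF_ASSIGN_OR_RETURN(PrimitiveType round_tripped, GetPrimitiveType(mlir_type));
  CHECK_EQ(round_tripped, t);  // each mapping must invert the other
}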

xla/backends/gpu/codegen/triton/support.cc

Lines changed: 47 additions & 10 deletions
@@ -62,8 +62,12 @@ bool IsTritonSupportedDataType(PrimitiveType type,
     case F64:
       return true;
     case F8E5M2:
-    case F8E4M3FN:
-      return std::holds_alternative<se::CudaComputeCapability>(gpu_version);
+    case F8E4M3FN:
+      return std::holds_alternative<se::CudaComputeCapability>(gpu_version) ||
+             std::holds_alternative<se::RocmComputeCapability>(gpu_version);
+    case F8E5M2FNUZ:
+    case F8E4M3FNUZ:
+      return std::holds_alternative<se::RocmComputeCapability>(gpu_version);
     case BF16:
       return std::holds_alternative<se::CudaComputeCapability>(gpu_version) ||
              (std::holds_alternative<se::RocmComputeCapability>(gpu_version) &&
@@ -92,7 +96,10 @@ absl::flat_hash_set<HloOpcode> TritonSupportedUnaryElementwiseOps(
   absl::flat_hash_set<HloOpcode> ret{HloOpcode::kAbs, HloOpcode::kCopy};
 
   if (element_type != PrimitiveType::F8E5M2 &&
-      element_type != PrimitiveType::F8E4M3FN) {
+      element_type != PrimitiveType::F8E4M3FN &&
+      element_type != PrimitiveType::F8E4M3B11FNUZ &&
+      element_type != PrimitiveType::F8E5M2FNUZ &&
+      element_type != PrimitiveType::F8E4M3FNUZ) {
     ret.insert(HloOpcode::kNegate);
   }
 
@@ -171,7 +178,10 @@ absl::flat_hash_set<HloOpcode> TritonSupportedBinaryElementwiseOps(
     PrimitiveType element_type, const se::GpuComputeCapability& gpu_version) {
   if (element_type == PrimitiveType::S4 || element_type == PrimitiveType::U16 ||
       element_type == PrimitiveType::F8E5M2 ||
-      element_type == PrimitiveType::F8E4M3FN) {
+      element_type == PrimitiveType::F8E4M3FN ||
+      element_type == PrimitiveType::F8E4M3B11FNUZ ||
+      element_type == PrimitiveType::F8E5M2FNUZ ||
+      element_type == PrimitiveType::F8E4M3FNUZ) {
     return {};
   }
 
@@ -220,7 +230,10 @@ absl::flat_hash_set<HloOpcode> TritonSupportedTernaryElementwiseOps(
   }
 
   if (element_type == PrimitiveType::F8E5M2 ||
-      element_type == PrimitiveType::F8E4M3FN) {
+      element_type == PrimitiveType::F8E4M3FN ||
+      element_type == PrimitiveType::F8E4M3B11FNUZ ||
+      element_type == PrimitiveType::F8E5M2FNUZ ||
+      element_type == PrimitiveType::F8E4M3FNUZ) {
     return {HloOpcode::kSelect};
   }
 
@@ -248,7 +261,10 @@ CodegenDecision CanTritonHandleReduce(
     const HloReduceInstruction& reduce,
     const se::GpuComputeCapability& gpu_version) {
   if (reduce.shape().element_type() == PrimitiveType::F8E4M3FN ||
-      reduce.shape().element_type() == PrimitiveType::F8E5M2) {
+      reduce.shape().element_type() == PrimitiveType::F8E5M2 ||
+      reduce.shape().element_type() == PrimitiveType::F8E5M2FNUZ ||
+      reduce.shape().element_type() == PrimitiveType::F8E4M3FNUZ ||
+      reduce.shape().element_type() == PrimitiveType::F8E4M3B11FNUZ) {
     return CodegenDecision::Forbid(
         "F8E4M3FN and F8E5M2 are not supported for reductions.");
   }
@@ -296,7 +312,8 @@ absl::Status CheckSupportedCheckDotDimensions(const HloDotInstruction& dot) {
   return absl::OkStatus();
 }
 
-bool IsSupportedDotAlgorithm(PrecisionConfig::Algorithm algorithm) {
+bool IsSupportedDotAlgorithm(PrecisionConfig::Algorithm algorithm,
+                             const se::GpuComputeCapability& gpu_version) {
   switch (algorithm) {
     case PrecisionConfig::ALG_UNSET:
     case PrecisionConfig::ALG_DOT_F16_F16_F16:
@@ -309,8 +326,13 @@ bool IsSupportedDotAlgorithm(PrecisionConfig::Algorithm algorithm) {
     case PrecisionConfig::ALG_DOT_TF32_TF32_F32:
     case PrecisionConfig::ALG_DOT_TF32_TF32_F32_X3:
     case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X9:
-      return true;
+      if (!std::holds_alternative<se::RocmComputeCapability>(gpu_version)) {
+        return true;
+      }
     case PrecisionConfig::ALG_DOT_BF16_BF16_BF16:
+      if (std::holds_alternative<se::RocmComputeCapability>(gpu_version)) {
+        return true;
+      }
     case PrecisionConfig::ALG_DOT_ANY_F8_ANY_F8_F32:
     case PrecisionConfig::ALG_DOT_ANY_F8_ANY_F8_F32_FAST_ACCUM:
     default:
@@ -336,7 +358,15 @@ CodegenDecision AreTypesSupportedByAlgUnsetDot(
   }
 }
 
-  auto supported_float_types = {BF16, F16, F32, F64, F8E5M2, F8E4M3FN};
+  if (input_type == F8E4M3B11FNUZ || result_type == F8E4M3B11FNUZ ||
+      input_type == F64) {
+    if (std::holds_alternative<se::RocmComputeCapability>(gpu_version)) {
+      return CodegenDecision::Forbid(
+          "Dot operation for F8E4M3B11FNUZ is not supported on ROCM.");
+    }
+  }
+
+  auto supported_float_types = {BF16, F16, F32, F64, F8E5M2};
   if (absl::c_linear_search(supported_float_types, input_type)) {
     return CodegenDecision::Allow();
   }
@@ -405,6 +435,11 @@ CodegenDecision AreDotAlgorithmInputAndOutputConversionsSupported(
     return forbid("Unsupported BF16 on GPUs before Blackwell");
   }
 
+  if (allowed_operands_types_or->front() == PrimitiveType::F64 &&
+      std::holds_alternative<se::RocmComputeCapability>(gpu_version)) {
+    return forbid("Unsupported result conversion");
+  }
+
   if (allowed_operands_types_or->size() != 1) {
     if (lhs_type == rhs_type &&
         absl::c_linear_search(*allowed_operands_types_or, lhs_type)) {
@@ -467,7 +502,7 @@ CodegenDecision IsTritonSupportedDot(
   const PrecisionConfig& precision_config = dot.precision_config();
   const PrecisionConfig::Algorithm algorithm = precision_config.algorithm();
 
-  if (!IsSupportedDotAlgorithm(algorithm)) {
+  if (!IsSupportedDotAlgorithm(algorithm, gpu_version)) {
     return CodegenDecision::Forbid(
         absl::StrCat("Unsupported dot algorithm: ",
                      PrecisionConfig::Algorithm_Name(algorithm)));
@@ -625,6 +660,8 @@ CodegenDecision IsTritonSupportedInstructionImpl(
     return CodegenDecision(
         element_type != PrimitiveType::F8E4M3FN &&
         element_type != PrimitiveType::F8E5M2 &&
+        element_type != PrimitiveType::F8E4M3FNUZ &&
+        element_type != PrimitiveType::F8E5M2FNUZ &&
         element_type != PrimitiveType::S4,
         "F8E4M3FN, F8E5M2 and S4 are not supported for iota.");
   }
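All of the ROCm gating above hinges on se::GpuComputeCapability being a std::variant over the CUDA and ROCm capability types, so a backend check is just std::holds_alternative. A self-contained sketch of the mechanism (struct bodies simplified):

#include <variant>

struct CudaComputeCapability { int major = 0, minor = 0; };
struct RocmComputeCapability { /* gfx version, feature queries, ... */ };
using GpuComputeCapability =
    std::variant<CudaComputeCapability, RocmComputeCapability>;

// FNUZ FP8 formats are AMD-specific, so support is decided purely by
// which alternative the variant holds.
bool IsFnuzFp8Supported(const GpuComputeCapability& gpu_version) {
  return std::holds_alternative<RocmComputeCapability>(gpu_version);
}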
