PR #36046: [ROCm] Fix failing unit tests on ROCm platform

hsharsha · Google-ML-Automation · commit fbce546978a2 · 2026-01-13T05:02:14.000-08:00
Imported from GitHub PR #36046 📝 Summary of Changes - layout_assignment tests are marked cuda-only. - sample_file_test needs higher autotuner level for MIOpen to return conv algorithm. Earlier this was coming from GetDebugOptionsForTest. - buffer_debug_log test is made gpu agnostic by using canonical gpu name. - cublas_gemm_rewriter_test_amdgpu_any fix unit test to remove padding for ROCm as introduced in #33854 - gpu_kernel_tiling_test_amdgpu_any is updated to respect higher launch dimensions now supported by hipruntime - Mark dynamic_shared_memory_test as cuda-only - Add arch specific checks for barriers to sorting.hlo 🎯 Justification Fixes failing unit tests on ROCm platform 🚀 Kind of Contribution 🐛 Bug Fix, 🧪 Tests Copybara import of the project: -- 472cd54 by Harsha HS <Harsha.HavanurShamsundara@amd.com>: [ROCm] Fix failing unit tests on ROCm platform - layout_assignment tests are marked cuda-only. - sample_file_test needs higher autotuner level for MIOpen to return conv algorithm. Earlier this was coming from GetDebugOptionsForTest. - buffer_debug_log test is made gpu agnostic by using canonical gpu name. -- 3bb9422 by Harsha HS <Harsha.HavanurShamsundara@amd.com>: Fix tests which started to fail due to #33854 -- 850d955 by Harsha HS <Harsha.HavanurShamsundara@amd.com>: HIP now respects higher launch dimension similar to CUDA -- b504a7e by Harsha HS <Harsha.HavanurShamsundara@amd.com>: Make dynamic_shared_memory_test cuda only -- 1e4e57a by Harsha HS <Harsha.HavanurShamsundara@amd.com>: Add arch specific checks to sorting.hlo -- ce1241c by Harsha HS <Harsha.HavanurShamsundara@amd.com>: Address review comments Merging this change closes #36046 FUTURE_COPYBARA_INTEGRATE_REVIEW=#36046 from ROCm:ci_fix_upstream_ut_20260107 ce1241c PiperOrigin-RevId: 855607651
diff --git a/xla/service/gpu/tests/BUILD b/xla/service/gpu/tests/BUILD
@@ -836,6 +836,7 @@ xla_test(
     name = "dynamic_shared_memory_test",
     srcs = if_cuda_is_configured(["dynamic_shared_memory_test.cc"]),
     backends = ["gpu"],
+    tags = ["cuda-only"],
     deps = [
         "//xla:shape_util",
         "//xla:types",
diff --git a/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -420,7 +420,7 @@ TEST_F(GpuKernelTilingTest, ReductionInputTooLarge) {
   if (xla::PlatformUtil::CanonicalPlatformName("gpu").value() == "rocm") {
     EXPECT_THAT(status.message(),
                 ::testing::ContainsRegex(
-                    "Kernel '.*' launch needs more blocks [(]2147483648, 1[)] "
+                    "Kernel '.*' launch needs more blocks [(]4294967296, 1[)] "
                     "than allowed by hardware [(]2147483647, 65536[)]"));
   } else {
     EXPECT_THAT(status.message(),
diff --git a/xla/service/gpu/tests/sorting.hlo b/xla/service/gpu/tests/sorting.hlo
@@ -609,22 +609,26 @@ compare {
 // CHECK:         %[[VAL_405:.*]] = icmp slt i64 %[[VAL_404]], 3
 // CHECK:         br i1 %[[VAL_405]], label %[[VAL_406:.*]], label %[[VAL_407:.*]]
 // CHECK:       smaller_keys_index-after29:                       ; preds = %[[VAL_406]], %[[VAL_403]]
-// CHECK:         call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+// CHECK-PTX:     call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+// CHECK-GCN:     call void @llvm.amdgcn.s.barrier
 // CHECK:         %[[VAL_408:.*]] = mul i64 %[[VAL_363]], 4
 // CHECK:         %[[VAL_409:.*]] = icmp uge i64 %[[VAL_408]], 0
 // CHECK:         br i1 %[[VAL_409]], label %[[VAL_410:.*]], label %[[VAL_411:.*]]
 // CHECK:       is_last_tile-after:                               ; preds = %[[VAL_412:.*]], %[[VAL_413:.*]]
-// CHECK:         call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+// CHECK-PTX:     call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+// CHECK-GCN:     call void @llvm.amdgcn.s.barrier
 // CHECK:         %[[VAL_414:.*]] = mul i64 %[[VAL_363]], 4
 // CHECK:         %[[VAL_415:.*]] = icmp uge i64 %[[VAL_414]], 0
 // CHECK:         br i1 %[[VAL_415]], label %[[VAL_416:.*]], label %[[VAL_417:.*]]
 // CHECK:       is_last_tile-after56:                             ; preds = %[[VAL_418:.*]], %[[VAL_419:.*]]
-// CHECK:         call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+// CHECK-PTX:     call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+// CHECK-GCN:     call void @llvm.amdgcn.s.barrier
 // CHECK:         %[[VAL_420:.*]] = mul i64 %[[VAL_363]], 4
 // CHECK:         %[[VAL_421:.*]] = icmp uge i64 %[[VAL_420]], 0
 // CHECK:         br i1 %[[VAL_421]], label %[[VAL_422:.*]], label %[[VAL_423:.*]]
 // CHECK:       is_last_tile-after89:                             ; preds = %[[VAL_424:.*]], %[[VAL_425:.*]]
-// CHECK:         call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+// CHECK-PTX:     call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
+// CHECK-GCN:     call void @llvm.amdgcn.s.barrier
 // CHECK:         %[[VAL_426:.*]] = mul nuw nsw i64 %[[VAL_363]], 4
 // CHECK:         %[[VAL_427:.*]] = mul nuw nsw i64 %[[VAL_371]], 4
 // CHECK:         %[[VAL_428:.*]] = add nuw nsw i64 %[[VAL_426]], 0
@@ -1251,10 +1255,10 @@ compare {
 // CHECK:         %[[VAL_843:.*]] = load float, ptr %[[VAL_844:.*]], align 4
 // CHECK:         %[[VAL_845:.*]] = fcmp olt float %[[VAL_841]], %[[VAL_843]]
 // CHECK:         %[[VAL_846:.*]] = zext i1 %[[VAL_845]] to i8
-// CHECK:         store i8 %[[VAL_846]], ptr %[[VAL_840]], align 1
-// CHECK:         %[[VAL_847:.*]] = load i8, ptr %[[VAL_840]], align 1
-// CHECK:         store i8 %[[VAL_847]], ptr %[[VAL_848:.*]], align 1
-// CHECK:         ret void
+// CHECK-PTX:     store i8 %[[VAL_846]], ptr %[[VAL_840]], align 1
+// CHECK-PTX:     %[[VAL_847:.*]] = load i8, ptr %[[VAL_840]], align 1
+// CHECK-PTX:     store i8 %[[VAL_847]], ptr %[[VAL_848:.*]], align 1
+// CHECK-PTX:     ret void
 
 ENTRY main {
   x = s32[2, 3] parameter(0)
@@ -1286,7 +1290,7 @@ ENTRY main {
   ROOT sort = (f64[2, 2048], f64[2, 2048], f64[2, 2048], f64[2, 2048]) sort(param0, param1, param2, param3), dimensions={1}, to_apply=compare
 }
 // Check that we have a tile size of 1024.
-// CHECK:         getelementptr [1024 x double], ptr addrspace(3) @sort_tile_param_0
+// CHECK-PTX:         getelementptr [1024 x double], ptr addrspace(3) @sort_tile_param_0
 
 // -----
 
@@ -1304,4 +1308,4 @@ ENTRY main {
 }
 
 // CHECK-COUNT-334: xor i64
-// CHECK-NOT: xor i64
+// CHECK-GCN: xor i64
diff --git a/xla/service/gpu/transforms/BUILD b/xla/service/gpu/transforms/BUILD
@@ -9,7 +9,7 @@ load("//xla/tests:build_defs.bzl", "xla_test")
 load("//xla/tsl:tsl.bzl", "if_oss")
 load(
     "//xla/tsl/platform:build_config_root.bzl",
-    "tf_gpu_tests_tags",
+    "tf_cuda_tests_tags",
 )
 load(
     "//xla/tsl/platform/default:cuda_build_defs.bzl",
@@ -1846,7 +1846,7 @@ lit_test_suite(
     ),
     cfg = "//xla:lit.cfg.py",
     data = ["//xla/backends/gpu/target_config:all_gpu_specs"],
-    default_tags = tf_gpu_tests_tags(),
+    default_tags = ["cuda-only"] + tf_cuda_tests_tags(),
     tools = [
         "//xla/tools:hlo-opt",
         "@llvm-project//llvm:FileCheck",
diff --git a/xla/service/gpu/transforms/cublas_gemm_rewriter_test.cc b/xla/service/gpu/transforms/cublas_gemm_rewriter_test.cc
@@ -1639,11 +1639,19 @@ ENTRY test {
 })";
 
   EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-3, 1e-3}));
-  MatchOptimizedHlo(hlo_text, R"(
-; CHECK-DAG: ENTRY %test ({{.*}}: bf16[2,3], {{.*}}: bf16[3,4], {{.*}}: bf16[4]) -> bf16[2,4] {
-; CHECK-DAG:    bf16[8,8]{1,0} pad({{.*}}), padding=0_6x0_5
-; CHECK-DAG:    bf16[8,8]{1,0} pad({{.*}}), padding=0_5x0_4
+  if (IsCuda()) {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: bf16[2,3], {{.*}}: bf16[3,4], {{.*}}: bf16[4]) -> bf16[2,4] {
+    ; CHECK-DAG:    bf16[8,8]{1,0} pad({{.*}}), padding=0_6x0_5
+    ; CHECK-DAG:    bf16[8,8]{1,0} pad({{.*}}), padding=0_5x0_4
       )");
+  } else {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: bf16[2,3], {{.*}}: bf16[3,4], {{.*}}: bf16[4]) -> bf16[2,4] {
+    ; CHECK-DAG:    bf16[2,3]{1,0}
+    ; CHECK-DAG:    bf16[3,4]{1,0}
+    )");
+  }
 }
 
 TEST_F(CublasLtGemmRewriteTest, ReluActivation) {
@@ -2428,11 +2436,19 @@ ENTRY test {
 })";
 
   EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{5e-5, 1e-5}));
-  MatchOptimizedHlo(hlo_text, R"(
-; CHECK-DAG: ENTRY %test ({{.*}}: bf16[2,3], {{.*}}: bf16[3,4]) -> bf16[2,4] {
-; CHECK-DAG:    bf16[8,8]{1,0} pad({{.*}}), padding=0_6x0_5
-; CHECK-DAG:    bf16[8,8]{1,0} pad({{.*}}), padding=0_5x0_4
+  if (IsCuda()) {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: bf16[2,3], {{.*}}: bf16[3,4]) -> bf16[2,4] {
+    ; CHECK-DAG:    bf16[8,8]{1,0} pad({{.*}}), padding=0_6x0_5
+    ; CHECK-DAG:    bf16[8,8]{1,0} pad({{.*}}), padding=0_5x0_4
       )");
+  } else {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: bf16[2,3], {{.*}}: bf16[3,4]) -> bf16[2,4] {
+    ; CHECK-DAG:    bf16[2,3]{1,0}
+    ; CHECK-DAG:    bf16[3,4]{1,0}
+    )");
+  }
 }
 
 TEST_F(CublasLtGemmRewriteTest, ApproxGeluActivationBitcast) {
@@ -2606,13 +2622,19 @@ ENTRY test {
 })";
 
   EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-3, 1e-3}));
-  MatchOptimizedHlo(hlo_text,
-                    R"(
-
-; CHECK-DAG: ENTRY %test ({{.*}}: f16[6,12], {{.*}}: f16[12,6], {{.*}}: f16[6]) -> f16[6,6] {
-; CHECK-DAG:    f16[8,16]{1,0} pad({{.*}}), padding=0_2x0_4
-; CHECK-DAG:    f16[16,8]{1,0} pad({{.*}}), padding=0_4x0_2
+  if (IsCuda()) {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: f16[6,12], {{.*}}: f16[12,6], {{.*}}: f16[6]) -> f16[6,6] {
+    ; CHECK-DAG:    f16[8,16]{1,0} pad({{.*}}), padding=0_2x0_4
+    ; CHECK-DAG:    f16[16,8]{1,0} pad({{.*}}), padding=0_4x0_2
       )");
+  } else {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: f16[6,12], {{.*}}: f16[12,6], {{.*}}: f16[6]) -> f16[6,6] {
+    ; CHECK-DAG:    f16[6,12]{1,0}
+    ; CHECK-DAG:    f16[12,6]{1,0}
+    )");
+  }
 }
 
 // For F16, the operands are padded on GPUs with Tensor Cores (i.e. Volta and
@@ -2657,11 +2679,19 @@ ENTRY test {
 })";
 
   EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
-  MatchOptimizedHlo(hlo_text, R"(
-; CHECK-DAG: ENTRY %test ({{.*}}: f16[6,12], {{.*}}: f16[12,6]) -> f16[6,6] {
-; CHECK-DAG:    f16[8,16]{1,0} pad({{.*}}), padding=0_2x0_4
-; CHECK-DAG:    f16[16,8]{1,0} pad({{.*}}), padding=0_4x0_2
+  if (IsCuda()) {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: f16[6,12], {{.*}}: f16[12,6]) -> f16[6,6] {
+    ; CHECK-DAG:    f16[8,16]{1,0} pad({{.*}}), padding=0_2x0_4
+    ; CHECK-DAG:    f16[16,8]{1,0} pad({{.*}}), padding=0_4x0_2
       )");
+  } else {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: f16[6,12], {{.*}}: f16[12,6]) -> f16[6,6] {
+    ; CHECK-DAG:    f16[6,12]{1,0}
+    ; CHECK-DAG:    f16[12,6]{1,0}
+    )");
+  }
 }
 
 TEST_F(CublasLtGemmRewriteTest, MatrixBiasReluActivationF16) {
@@ -2757,11 +2787,19 @@ ENTRY test {
 })";
 
   EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-3, 1e-3}));
-  MatchOptimizedHlo(hlo_text, R"(
-; CHECK-DAG: ENTRY %test ({{.*}}: f16[6,12], {{.*}}: f16[12,6], {{.*}}: f16[6]) -> f16[6,6] {
-; CHECK-DAG:   f16[8,16]{1,0} pad({{.*}}), padding=0_2x0_4
-; CHECK-DAG:   f16[16,8]{1,0} pad({{.*}}), padding=0_4x0_2
+  if (IsCuda()) {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: f16[6,12], {{.*}}: f16[12,6], {{.*}}: f16[6]) -> f16[6,6] {
+    ; CHECK-DAG:   f16[8,16]{1,0} pad({{.*}}), padding=0_2x0_4
+    ; CHECK-DAG:   f16[16,8]{1,0} pad({{.*}}), padding=0_4x0_2
       )");
+  } else {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: f16[6,12], {{.*}}: f16[12,6], {{.*}}: f16[6]) -> f16[6,6] {
+    ; CHECK-DAG:   f16[6,12]{1,0}
+    ; CHECK-DAG:   f16[12,6]{1,0}
+    )");
+  }
 }
 
 // For bfloat16, the sizes of all dimensions of the operands are required to be
@@ -2899,11 +2937,19 @@ ENTRY test {
 })";
 
   EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-3, 1e-3}));
-  MatchOptimizedHlo(hlo_text, R"(
-; CHECK-DAG:  ENTRY %test ({{.*}}: bf16[6,12], {{.*}}: bf16[12,6], {{.*}}: bf16[6]) -> bf16[6,6] {
-; CHECK-DAG:    bf16[8,16]{1,0} pad({{.*}}), padding=0_2x0_4
-; CHECK-DAG:    bf16[16,8]{1,0} pad({{.*}}), padding=0_4x0_2
+  if (IsCuda()) {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG:  ENTRY %test ({{.*}}: bf16[6,12], {{.*}}: bf16[12,6], {{.*}}: bf16[6]) -> bf16[6,6] {
+    ; CHECK-DAG:    bf16[8,16]{1,0} pad({{.*}}), padding=0_2x0_4
+    ; CHECK-DAG:    bf16[16,8]{1,0} pad({{.*}}), padding=0_4x0_2
       )");
+  } else {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG:  ENTRY %test ({{.*}}: bf16[6,12], {{.*}}: bf16[12,6], {{.*}}: bf16[6]) -> bf16[6,6] {
+    ; CHECK-DAG:    bf16[6,12]{1,0}
+    ; CHECK-DAG:    bf16[12,6]{1,0}
+    )");
+  }
 }
 
 // For bfloat16, the operands are padded if necessary on Ampere and newer
@@ -2955,11 +3001,19 @@ ENTRY test {
 })";
 
   EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5}));
-  MatchOptimizedHlo(hlo_text, R"(
-; CHECK-DAG: ENTRY %test ({{.*}}: bf16[6,12], {{.*}}: bf16[12,6]) -> bf16[6,6] {
-; CHECK-DAG:     bf16[8,16]{1,0} pad({{.*}}), padding=0_2x0_4
-; CHECK-DAG:     bf16[16,8]{1,0} pad({{.*}}), padding=0_4x0_2
+  if (IsCuda()) {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: bf16[6,12], {{.*}}: bf16[12,6]) -> bf16[6,6] {
+    ; CHECK-DAG:     bf16[8,16]{1,0} pad({{.*}}), padding=0_2x0_4
+    ; CHECK-DAG:     bf16[16,8]{1,0} pad({{.*}}), padding=0_4x0_2
       )");
+  } else {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: bf16[6,12], {{.*}}: bf16[12,6]) -> bf16[6,6] {
+    ; CHECK-DAG:     bf16[6,12]{1,0}
+    ; CHECK-DAG:     bf16[12,6]{1,0}
+    )");
+  }
 }
 
 // For bfloat16, the operands are padded if necessary on Ampere and newer
@@ -3018,11 +3072,19 @@ ENTRY test {
 
 )";
   EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-3, 1e-3}));
-  MatchOptimizedHlo(hlo_text, R"(
-; CHECK-DAG: ENTRY %test ({{.*}}: bf16[6,12], {{.*}}: bf16[12,6], {{.*}}: bf16[6]) -> bf16[6,6] {
-; CHECK-DAG:     bf16[8,16]{1,0} pad({{.*}}), padding=0_2x0_4
-; CHECK-DAG:     bf16[16,8]{1,0} pad({{.*}}), padding=0_4x0_2
+  if (IsCuda()) {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: bf16[6,12], {{.*}}: bf16[12,6], {{.*}}: bf16[6]) -> bf16[6,6] {
+    ; CHECK-DAG:     bf16[8,16]{1,0} pad({{.*}}), padding=0_2x0_4
+    ; CHECK-DAG:     bf16[16,8]{1,0} pad({{.*}}), padding=0_4x0_2
       )");
+  } else {
+    MatchOptimizedHlo(hlo_text, R"(
+    ; CHECK-DAG: ENTRY %test ({{.*}}: bf16[6,12], {{.*}}: bf16[12,6], {{.*}}: bf16[6]) -> bf16[6,6] {
+    ; CHECK-DAG:     bf16[6,12]{1,0}
+    ; CHECK-DAG:     bf16[12,6]{1,0}
+    )");
+  }
 }
 
 TEST_F(CublasLtGemmRewriteTest, VectorBiasReluActivationF64) {
diff --git a/xla/stream_executor/gpu/BUILD b/xla/stream_executor/gpu/BUILD
@@ -917,6 +917,7 @@ xla_test(
         "//xla/backends/gpu/runtime:buffer_debug_log_proto_cc",
         "//xla/backends/gpu/runtime:buffer_debug_log_structs",
         "//xla/backends/gpu/runtime:thunk_id",
+        "//xla/service:platform_util",
         "//xla/stream_executor:device_address",
         "//xla/stream_executor:platform",
         "//xla/stream_executor:platform_manager",
diff --git a/xla/stream_executor/gpu/buffer_debug_log_test.cc b/xla/stream_executor/gpu/buffer_debug_log_test.cc
@@ -26,9 +26,11 @@ limitations under the License.
 #include <gtest/gtest.h>
 #include "absl/status/status.h"
 #include "absl/status/status_matchers.h"
+#include "absl/strings/ascii.h"
 #include "xla/backends/gpu/runtime/buffer_debug_log.pb.h"
 #include "xla/backends/gpu/runtime/buffer_debug_log_structs.h"
 #include "xla/backends/gpu/runtime/thunk_id.h"
+#include "xla/service/platform_util.h"
 #include "xla/stream_executor/device_address.h"
 #include "xla/stream_executor/platform.h"
 #include "xla/stream_executor/platform_manager.h"
@@ -47,8 +49,9 @@ using ::xla::gpu::ThunkId;
 class BufferDebugLogTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    TF_ASSERT_OK_AND_ASSIGN(platform_,
-                            PlatformManager::PlatformWithName("CUDA"));
+    auto name = absl::AsciiStrToUpper(
+        xla::PlatformUtil::CanonicalPlatformName("gpu").value());
+    TF_ASSERT_OK_AND_ASSIGN(platform_, PlatformManager::PlatformWithName(name));
     TF_ASSERT_OK_AND_ASSIGN(executor_, platform_->ExecutorForDevice(0));
     TF_ASSERT_OK_AND_ASSIGN(stream_, executor_->CreateStream(std::nullopt));
     allocator_ =
diff --git a/xla/tests/sample_file_test.cc b/xla/tests/sample_file_test.cc
@@ -63,6 +63,8 @@ TEST_F(SampleFileTest, Convolution) {
       .mutable_debug_options()
       .set_xla_cpu_parallel_codegen_split_count(1);
 
+  module->mutable_config().mutable_debug_options().set_xla_gpu_autotune_level(
+      4);
   EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{0.01}));
 }
 
diff --git a/xla/tsl/platform/default/build_config_root.bzl b/xla/tsl/platform/default/build_config_root.bzl
@@ -44,7 +44,10 @@ def tf_gpu_tests_tags():
 
 # terminology changes: saving tf_cuda_* for compatibility
 def tf_cuda_tests_tags():
-    return tf_gpu_tests_tags()
+    if is_cuda_configured():
+        return ["requires-gpu-cuda", "gpu"] + gpu_test_tags()
+    else:
+        return []
 
 def tf_has_tag(kwargs, tag):
     return ("tags" in kwargs and kwargs["tags"] != None and tag in kwargs["tags"])

Original file line number	Diff line number	Diff line change
`@@ -63,6 +63,8 @@ TEST_F(SampleFileTest, Convolution) {`
`63`	`63`	`.mutable_debug_options()`
`64`	`64`	`.set_xla_cpu_parallel_codegen_split_count(1);`
`65`	`65`
	`66`	`+ module->mutable_config().mutable_debug_options().set_xla_gpu_autotune_level(`
	`67`	`+ 4);`
`66`	`68`	`EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{0.01}));`
`67`	`69`	`}`
`68`	`70`