ROCm · amd-songpiao · Sep 29, 2025 · Sep 30, 2025 · Oct 1, 2025 · Oct 2, 2025
diff --git a/.github/workflows/presubmit_benchmark.yml b/.github/workflows/presubmit_benchmark.yml
@@ -31,7 +31,7 @@ on:
         - 'no'
   pull_request:
     branches:
-      - main
+      - rocm-jaxlib-v0.7.1
 
 concurrency:
   # Group all jobs for a given PR together, and cancel presubmit if a new commit is pushed.
@@ -47,7 +47,7 @@ jobs:
     uses: ./.github/workflows/generate_benchmark_matrix.yml
     with:
       workflow_type: 'PRESUBMIT'
-      registry_file: 'xla/tools/benchmarks/registries/default_registry.yml'
+      registry_file: 'xla/tools/benchmarks/registries/amd_default_registry.yml'
       checkout_ref: ${{ github.event.pull_request.head.sha || github.sha }}
 
   run_benchmarks:

diff --git a/xla/tools/benchmarks/proto/benchmark_config.proto b/xla/tools/benchmarks/proto/benchmark_config.proto
@@ -24,6 +24,8 @@ enum HardwareCategory {
   CPU_ARM64 = 2;  // ARM64 CPU
   GPU_L4 = 3;     // L4 GPU
   GPU_B200 = 4;   // B200 GPU
+  GPU_MI300 = 5; // AMD MI300 GPU
+  GPU_MI250 = 6;  // AMD MI250 GPU
 }
 
 // Enum defining the workflow type.

diff --git a/xla/tools/benchmarks/registries/amd_default_registry.yml b/xla/tools/benchmarks/registries/amd_default_registry.yml
@@ -0,0 +1,79 @@
+# Copyright 2025 The OpenXLA Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Default registry for XLA benchmarks run by the AMD (ROCm) CI.
+
+benchmarks: [
+  {
+    name: "gemma3_1b_flax_call"
+    description: "Gemma3 1b in Flax."
+    owner: "amd"
+    input_artifact: {
+      input_format: HLO_TEXT
+      artifact_gcs_bucket_path: "https://storage.googleapis.com/xla-benchmarking-temp/gemma3_1b_flax_call.hlo"
+    }
+    model_source_info: ["Gemma3 1B"]
+    hardware_execution_configs: [{
+      hardware_category: GPU_MI250
+      topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
+      target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
+      workflow_type: [PRESUBMIT, POSTSUBMIT, SCHEDULED]
+      runtime_flags: ["--num_repeats=5"]
+    },
+    {
+      hardware_category: CPU_X86
+      topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
+      target_metrics: [CPU_TIME]
+      workflow_type: [PRESUBMIT, POSTSUBMIT, SCHEDULED]
+      runtime_flags: ["--num_repeats=5"]
+    }
+    ]
+    update_frequency_policy: QUARTERLY
+  },
+  {
+    name: "gemma2_2b_keras_jax"
+    description: "Gemma2 2B in Keras."
+    owner: "amd"
+    input_artifact: {
+      input_format: HLO_TEXT
+      artifact_gcs_bucket_path: "https://storage.googleapis.com/xla-benchmarking-temp/gemma2_2b_keras_jax.hlo"
+    }
+    model_source_info: ["Gemma2 2B"]
+    hardware_execution_configs: [{
+      hardware_category: GPU_MI250
+      topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
+      target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
+      workflow_type: [PRESUBMIT, POSTSUBMIT]
+      runtime_flags: ["--num_repeats=5"]
+    },
+    {
+      hardware_category: GPU_MI300
+      topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
+      target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME]
+      workflow_type: [POSTSUBMIT]
+      runtime_flags: ["--num_repeats=5"]
+    },
+    {
+      hardware_category: CPU_X86
+      topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false }
+      target_metrics: [CPU_TIME]
+      workflow_type: [PRESUBMIT, POSTSUBMIT]
+      runtime_flags: ["--num_repeats=5"]
+    }]
+    update_frequency_policy: QUARTERLY
+    # TODO(amd): remove this label once the benchmark is stable.
+    github_labels: ["blocking_presubmit_test"]
+  }
+]
diff --git a/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc b/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc
@@ -95,10 +95,15 @@ GetHardwareToRunnerLabelMap() {
   // - CPU_X86: linux-x86-n2-128
   // - GPU_L4: linux-x86-g2-16-l4-1gpu
   // - GPU_B200: linux-x86-a4-224-b200-1gpu
+  // - GPU_MI300: ixt-rack-04 (AMD Instinct MI300; placeholder label)
+  // - GPU_MI250: ixt-rack-04 (AMD Instinct MI250; placeholder label)
   static const auto* kMap = new absl::flat_hash_map<std::string, std::string>{
       {"CPU_X86", "linux-x86-n2-128"},
       {"GPU_L4", "linux-x86-g2-16-l4-1gpu"},
       {"GPU_B200", "linux-x86-a4-224-b200-1gpu"},
+      // AMD GPU runners (placeholders; adjust to match actual CI runner labels)
+      {"GPU_MI300", "ixt-rack-04"},
+      {"GPU_MI250", "ixt-rack-04"},
       // Add more mappings
   };
   return *kMap;
@@ -123,6 +128,10 @@ GetHardwareToContainerImage() {
           {"GPU_L4_1H_4D",
            "us-docker.pkg.dev/ml-oss-artifacts-published/ml-public-container/"
            "ml-build-cuda12.8-cudnn9.8:latest"},
+          // AMD GPU container images (ROCm). Replace with canonical published
+          // image if different.
+          {"GPU_MI300", "rocm/tensorflow:rocm7.0-py3.12-tf2.19-dev"},
+          {"GPU_MI250", "rocm/tensorflow:rocm7.0-py3.12-tf2.19-dev"},
       };
   return *kHardwareToContainerImage;
 }
@@ -141,18 +150,20 @@ Json::Value RepeatedStringFieldToJsonArray(
 // Note: TargetMetric can be overridden in the BenchmarkConfig proto.
 const absl::flat_hash_map<std::string, std::vector<TargetMetric>>&
 GetHardwareToDefaultTargetMetrics() {
-  static const auto* kHardwareToDefaultTargetMetrics =
-      new absl::flat_hash_map<std::string, std::vector<TargetMetric>>{
-          // Key is just the hardware category name for this map
-          {"CPU_X86", {TargetMetric::CPU_TIME}},
-          {"CPU_ARM64", {TargetMetric::CPU_TIME}},
-          {"GPU_L4",
-           {TargetMetric::GPU_DEVICE_TIME,
-            TargetMetric::GPU_DEVICE_MEMCPY_TIME}},
-          {"GPU_B200",
-           {TargetMetric::GPU_DEVICE_TIME,
-            TargetMetric::GPU_DEVICE_MEMCPY_TIME}},
-      };
+  static const auto* kHardwareToDefaultTargetMetrics = new absl::flat_hash_map<
+      std::string, std::vector<TargetMetric>>{
+      // Key is just the hardware category name for this map
+      {"CPU_X86", {TargetMetric::CPU_TIME}},
+      {"CPU_ARM64", {TargetMetric::CPU_TIME}},
+      {"GPU_L4",
+       {TargetMetric::GPU_DEVICE_TIME, TargetMetric::GPU_DEVICE_MEMCPY_TIME}},
+      {"GPU_B200",
+       {TargetMetric::GPU_DEVICE_TIME, TargetMetric::GPU_DEVICE_MEMCPY_TIME}},
+      {"GPU_MI300",
+       {TargetMetric::GPU_DEVICE_TIME, TargetMetric::GPU_DEVICE_MEMCPY_TIME}},
+      {"GPU_MI250",
+       {TargetMetric::GPU_DEVICE_TIME, TargetMetric::GPU_DEVICE_MEMCPY_TIME}},
+  };
   return *kHardwareToDefaultTargetMetrics;
 }
 

diff --git a/xla/tools/multihost_hlo_runner/BUILD b/xla/tools/multihost_hlo_runner/BUILD
@@ -71,10 +71,10 @@ cc_library(
         "@tsl//tsl/platform:statusor",
     ] + if_cuda_or_rocm([
         "//xla/service:gpu_plugin",
-        "//xla/backends/profiler/gpu:cupti_tracer",
         "//xla/backends/profiler/gpu:device_tracer",
     ]) + if_cuda([
         "//xla/stream_executor:cuda_platform",
+        "//xla/backends/profiler/gpu:cupti_tracer",
     ] + if_google(
         [
             "//third_party/py/jax/jaxlib/cuda:cuda_gpu_kernels",  # fixdeps: keep